PlanOpticon

planopticon / video_processor / providers / gemini_provider.py
Source Blame History 235 lines
a94205b… leo 1 """Google Gemini provider implementation using the google-genai SDK."""
a94205b… leo 2
a94205b… leo 3 import logging
a94205b… leo 4 import os
a94205b… leo 5 from pathlib import Path
a94205b… leo 6 from typing import Optional
a94205b… leo 7
a94205b… leo 8 from dotenv import load_dotenv
a94205b… leo 9
0981a08… noreply 10 from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry
a94205b… leo 11
a94205b… leo 12 load_dotenv()
a94205b… leo 13 logger = logging.getLogger(__name__)
a94205b… leo 14
# Capabilities inferred from model id patterns.
# Each keyword is tested as a *substring* of the lowercased model id in
# list_models(); a match grants the corresponding capability tag.
_VISION_KEYWORDS = {"gemini-2", "gemini-3", "gemini-pro", "gemini-flash", "gemini-ultra"}
_AUDIO_KEYWORDS = {"gemini-2", "gemini-3", "gemini-flash"}
a94205b… leo 18
a94205b… leo 19
class GeminiProvider(BaseProvider):
    """Google Gemini API provider via the google-genai SDK.

    Authenticates with an API key (``GEMINI_API_KEY``) when available;
    otherwise falls back to Vertex AI mode driven by a service-account file
    (``GOOGLE_APPLICATION_CREDENTIALS``).
    """

    provider_name = "gemini"

    # Fallback model used whenever a caller does not specify one.
    _DEFAULT_MODEL = "gemini-2.5-flash"

    def __init__(
        self,
        api_key: Optional[str] = None,
        credentials_path: Optional[str] = None,
    ):
        """Create a Gemini client.

        Args:
            api_key: Explicit API key; falls back to ``GEMINI_API_KEY``.
            credentials_path: Path to a service-account JSON file; falls back
                to ``GOOGLE_APPLICATION_CREDENTIALS``.

        Raises:
            ValueError: If neither credential source is available.
            ImportError: If the ``google-genai`` package is not installed.
        """
        self.api_key = api_key or os.getenv("GEMINI_API_KEY")
        self.credentials_path = credentials_path or os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

        if not self.api_key and not self.credentials_path:
            raise ValueError("Neither GEMINI_API_KEY nor GOOGLE_APPLICATION_CREDENTIALS is set")

        # Keep the try narrow: only the import can raise ImportError here,
        # and chain the cause so the real failure stays visible.
        try:
            from google import genai
        except ImportError as exc:
            raise ImportError(
                "google-genai package not installed. Install with: pip install google-genai"
            ) from exc

        self._genai = genai

        if self.api_key:
            self.client = genai.Client(api_key=self.api_key)
        else:
            # Service account → use Vertex AI mode; the GCP project id is
            # read from the credentials file itself.
            import json

            with open(self.credentials_path) as f:
                sa_info = json.load(f)
            project = sa_info.get("project_id", "")
            location = os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1")

            self.client = genai.Client(
                vertexai=True,
                project=project,
                location=location,
            )

    def _record_usage(self, response) -> None:
        """Stash token counts from *response* into ``self._last_usage``.

        The SDK omits ``usage_metadata`` on some responses, so both counts
        default to 0 when it is absent.
        """
        um = getattr(response, "usage_metadata", None)
        self._last_usage = {
            "input_tokens": getattr(um, "prompt_token_count", 0) if um else 0,
            "output_tokens": getattr(um, "candidates_token_count", 0) if um else 0,
        }

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Run a text-only chat completion and return the response text.

        Args:
            messages: OpenAI-style ``{"role", "content"}`` dicts; any role
                other than ``"user"`` is mapped to Gemini's ``"model"`` role.
            max_tokens: Output token cap.
            temperature: Sampling temperature.
            model: Model id; defaults to ``_DEFAULT_MODEL``.
        """
        from google.genai import types

        model = model or self._DEFAULT_MODEL
        # Convert OpenAI-style messages to Gemini contents
        contents = [
            types.Content(
                role="user" if msg["role"] == "user" else "model",
                parts=[types.Part.from_text(text=msg["content"])],
            )
            for msg in messages
        ]

        response = self.client.models.generate_content(
            model=model,
            contents=contents,
            config=types.GenerateContentConfig(
                max_output_tokens=max_tokens,
                temperature=temperature,
            ),
        )
        self._record_usage(response)
        return response.text or ""

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
        """Describe/analyze a single image with a text prompt.

        Args:
            image_bytes: Raw image data (sent as ``image/jpeg``).
            prompt: Instruction for the model.
            max_tokens: Output token cap.
            model: Model id; defaults to ``_DEFAULT_MODEL``.
        """
        from google.genai import types

        model = model or self._DEFAULT_MODEL
        response = self.client.models.generate_content(
            model=model,
            contents=[
                types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
                prompt,
            ],
            config=types.GenerateContentConfig(
                max_output_tokens=max_tokens,
            ),
        )
        self._record_usage(response)
        return response.text or ""

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Transcribe an audio file via Gemini's multimodal input.

        Args:
            audio_path: Path to the audio file; mime type inferred from the
                suffix (defaults to ``audio/wav``).
            language: Optional language hint added to the prompt.
            model: Model id; defaults to ``_DEFAULT_MODEL``.

        Returns:
            Dict with ``text``, ``segments``, ``language``, ``duration``
            (always ``None`` — Gemini does not report it), ``provider``,
            and ``model`` keys.
        """
        from google.genai import types

        model = model or self._DEFAULT_MODEL
        audio_path = Path(audio_path)

        # Determine mime type from the file suffix.
        suffix = audio_path.suffix.lower()
        mime_map = {
            ".wav": "audio/wav",
            ".mp3": "audio/mpeg",
            ".m4a": "audio/mp4",
            ".flac": "audio/flac",
            ".ogg": "audio/ogg",
            ".webm": "audio/webm",
        }
        mime_type = mime_map.get(suffix, "audio/wav")

        # Read audio bytes
        audio_bytes = audio_path.read_bytes()

        # BUG FIX: lang_hint was computed but never interpolated (the prompt
        # contained stray garbage text "accurately.python" instead).
        lang_hint = f" The audio is in {language}." if language else ""
        prompt = (
            f"Transcribe this audio accurately.{lang_hint} "
            "Return a JSON object with keys: "
            '"text" (full transcript), '
            '"segments" (array of {start, end, text} objects with timestamps in seconds).'
        )

        response = self.client.models.generate_content(
            model=model,
            contents=[
                types.Part.from_bytes(data=audio_bytes, mime_type=mime_type),
                prompt,
            ],
            config=types.GenerateContentConfig(
                max_output_tokens=8192,
                response_mime_type="application/json",
            ),
        )

        # Parse JSON response; fall back to treating the raw text as the
        # full transcript when the model did not return valid JSON.
        import json

        try:
            data = json.loads(response.text)
        except (json.JSONDecodeError, TypeError):
            data = {"text": response.text or "", "segments": []}

        return {
            "text": data.get("text", ""),
            "segments": data.get("segments", []),
            "language": language,
            "duration": None,
            "provider": "gemini",
            "model": model,
        }

    def list_models(self) -> list[ModelInfo]:
        """List available Gemini models with inferred capability tags.

        Capabilities are derived from id substrings (see module-level
        keyword sets); models with no recognized capability are skipped.
        Returns an id-sorted list; API failures are logged and yield [].
        """
        models = []
        try:
            for m in self.client.models.list():
                mid = m.name or ""
                # Strip prefix variants from different API modes
                for prefix in ("models/", "publishers/google/models/"):
                    if mid.startswith(prefix):
                        mid = mid[len(prefix) :]
                        break
                display = getattr(m, "display_name", mid) or mid

                caps = []
                mid_lower = mid.lower()
                if "gemini" in mid_lower:
                    caps.append("chat")
                    if any(kw in mid_lower for kw in _VISION_KEYWORDS):
                        caps.append("vision")
                    if any(kw in mid_lower for kw in _AUDIO_KEYWORDS):
                        caps.append("audio")
                if "embedding" in mid_lower:
                    caps.append("embedding")

                if caps:
                    models.append(
                        ModelInfo(
                            id=mid,
                            provider="gemini",
                            display_name=display,
                            capabilities=caps,
                        )
                    )
        except Exception as e:
            # Best-effort listing: degrade to an empty result on API errors.
            logger.warning("Failed to list Gemini models: %s", e)
        return sorted(models, key=lambda m: m.id)
0981a08… noreply 223
0981a08… noreply 224
# Register this provider at import time so the framework can resolve
# "gemini-*" model ids to GeminiProvider and pick sensible defaults
# per capability.
ProviderRegistry.register(
    name="gemini",
    provider_class=GeminiProvider,
    env_var="GEMINI_API_KEY",
    model_prefixes=["gemini-"],
    default_models={
        "chat": "gemini-2.5-flash",
        "vision": "gemini-2.5-flash",
        "audio": "gemini-2.5-flash",
    },
)

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button