PlanOpticon

planopticon / video_processor / providers / gemini_provider.py
Source Blame History 235 lines
a94205b… leo 1 """Google Gemini provider implementation using the google-genai SDK."""
a94205b… leo 2
a94205b… leo 3 import logging
a94205b… leo 4 import os
a94205b… leo 5 from pathlib import Path
a94205b… leo 6 from typing import Optional
a94205b… leo 7
a94205b… leo 8 from dotenv import load_dotenv
a94205b… leo 9
0981a08… noreply 10 from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry
a94205b… leo 11
a94205b… leo 12 load_dotenv()
a94205b… leo 13 logger = logging.getLogger(__name__)
a94205b… leo 14
# Capabilities inferred from model id patterns.
# Each keyword is tested as a *substring* of the lowercased model id in
# list_models(); a match grants the corresponding capability tag.
_VISION_KEYWORDS = {"gemini-2", "gemini-3", "gemini-pro", "gemini-flash", "gemini-ultra"}
_AUDIO_KEYWORDS = {"gemini-2", "gemini-3", "gemini-flash"}
a94205b… leo 18
a94205b… leo 19
class GeminiProvider(BaseProvider):
    """Google Gemini API provider via the google-genai SDK.

    Authenticates with an API key (``GEMINI_API_KEY``) when available;
    otherwise falls back to Vertex AI mode driven by a service-account file
    (``GOOGLE_APPLICATION_CREDENTIALS``).
    """

    provider_name = "gemini"

    # Fallback model used whenever a caller does not specify one.
    _DEFAULT_MODEL = "gemini-2.5-flash"

    def __init__(
        self,
        api_key: Optional[str] = None,
        credentials_path: Optional[str] = None,
    ):
        """Create a Gemini client.

        Args:
            api_key: Explicit API key; falls back to ``GEMINI_API_KEY``.
            credentials_path: Path to a service-account JSON file; falls back
                to ``GOOGLE_APPLICATION_CREDENTIALS``.

        Raises:
            ValueError: If neither credential source is available.
            ImportError: If the ``google-genai`` package is not installed.
        """
        self.api_key = api_key or os.getenv("GEMINI_API_KEY")
        self.credentials_path = credentials_path or os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

        if not self.api_key and not self.credentials_path:
            raise ValueError("Neither GEMINI_API_KEY nor GOOGLE_APPLICATION_CREDENTIALS is set")

        # Keep the try narrow: only the import can raise ImportError here,
        # and chain the cause so the real failure stays visible.
        try:
            from google import genai
        except ImportError as exc:
            raise ImportError(
                "google-genai package not installed. Install with: pip install google-genai"
            ) from exc

        self._genai = genai

        if self.api_key:
            self.client = genai.Client(api_key=self.api_key)
        else:
            # Service account → use Vertex AI mode; the GCP project id is
            # read from the credentials file itself.
            import json

            with open(self.credentials_path) as f:
                sa_info = json.load(f)
            project = sa_info.get("project_id", "")
            location = os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1")

            self.client = genai.Client(
                vertexai=True,
                project=project,
                location=location,
            )

    def _record_usage(self, response) -> None:
        """Stash token counts from *response* into ``self._last_usage``.

        The SDK omits ``usage_metadata`` on some responses, so both counts
        default to 0 when it is absent.
        """
        um = getattr(response, "usage_metadata", None)
        self._last_usage = {
            "input_tokens": getattr(um, "prompt_token_count", 0) if um else 0,
            "output_tokens": getattr(um, "candidates_token_count", 0) if um else 0,
        }

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Run a text-only chat completion and return the response text.

        Args:
            messages: OpenAI-style ``{"role", "content"}`` dicts; any role
                other than ``"user"`` is mapped to Gemini's ``"model"`` role.
            max_tokens: Output token cap.
            temperature: Sampling temperature.
            model: Model id; defaults to ``_DEFAULT_MODEL``.
        """
        from google.genai import types

        model = model or self._DEFAULT_MODEL
        # Convert OpenAI-style messages to Gemini contents
        contents = [
            types.Content(
                role="user" if msg["role"] == "user" else "model",
                parts=[types.Part.from_text(text=msg["content"])],
            )
            for msg in messages
        ]

        response = self.client.models.generate_content(
            model=model,
            contents=contents,
            config=types.GenerateContentConfig(
                max_output_tokens=max_tokens,
                temperature=temperature,
            ),
        )
        self._record_usage(response)
        return response.text or ""

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
        """Describe/analyze a single image with a text prompt.

        Args:
            image_bytes: Raw image data (sent as ``image/jpeg``).
            prompt: Instruction for the model.
            max_tokens: Output token cap.
            model: Model id; defaults to ``_DEFAULT_MODEL``.
        """
        from google.genai import types

        model = model or self._DEFAULT_MODEL
        response = self.client.models.generate_content(
            model=model,
            contents=[
                types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
                prompt,
            ],
            config=types.GenerateContentConfig(
                max_output_tokens=max_tokens,
            ),
        )
        self._record_usage(response)
        return response.text or ""

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Transcribe an audio file via Gemini's multimodal input.

        Args:
            audio_path: Path to the audio file; mime type inferred from the
                suffix (defaults to ``audio/wav``).
            language: Optional language hint added to the prompt.
            model: Model id; defaults to ``_DEFAULT_MODEL``.

        Returns:
            Dict with ``text``, ``segments``, ``language``, ``duration``
            (always ``None`` — Gemini does not report it), ``provider``,
            and ``model`` keys.
        """
        from google.genai import types

        model = model or self._DEFAULT_MODEL
        audio_path = Path(audio_path)

        # Determine mime type from the file suffix.
        suffix = audio_path.suffix.lower()
        mime_map = {
            ".wav": "audio/wav",
            ".mp3": "audio/mpeg",
            ".m4a": "audio/mp4",
            ".flac": "audio/flac",
            ".ogg": "audio/ogg",
            ".webm": "audio/webm",
        }
        mime_type = mime_map.get(suffix, "audio/wav")

        # Read audio bytes
        audio_bytes = audio_path.read_bytes()

        # BUG FIX: lang_hint was computed but never interpolated (the prompt
        # contained stray garbage text "accurately.python" instead).
        lang_hint = f" The audio is in {language}." if language else ""
        prompt = (
            f"Transcribe this audio accurately.{lang_hint} "
            "Return a JSON object with keys: "
            '"text" (full transcript), '
            '"segments" (array of {start, end, text} objects with timestamps in seconds).'
        )

        response = self.client.models.generate_content(
            model=model,
            contents=[
                types.Part.from_bytes(data=audio_bytes, mime_type=mime_type),
                prompt,
            ],
            config=types.GenerateContentConfig(
                max_output_tokens=8192,
                response_mime_type="application/json",
            ),
        )

        # Parse JSON response; fall back to treating the raw text as the
        # full transcript when the model did not return valid JSON.
        import json

        try:
            data = json.loads(response.text)
        except (json.JSONDecodeError, TypeError):
            data = {"text": response.text or "", "segments": []}

        return {
            "text": data.get("text", ""),
            "segments": data.get("segments", []),
            "language": language,
            "duration": None,
            "provider": "gemini",
            "model": model,
        }

    def list_models(self) -> list[ModelInfo]:
        """List available Gemini models with inferred capability tags.

        Capabilities are derived from id substrings (see module-level
        keyword sets); models with no recognized capability are skipped.
        Returns an id-sorted list; API failures are logged and yield [].
        """
        models = []
        try:
            for m in self.client.models.list():
                mid = m.name or ""
                # Strip prefix variants from different API modes
                for prefix in ("models/", "publishers/google/models/"):
                    if mid.startswith(prefix):
                        mid = mid[len(prefix) :]
                        break
                display = getattr(m, "display_name", mid) or mid

                caps = []
                mid_lower = mid.lower()
                if "gemini" in mid_lower:
                    caps.append("chat")
                    if any(kw in mid_lower for kw in _VISION_KEYWORDS):
                        caps.append("vision")
                    if any(kw in mid_lower for kw in _AUDIO_KEYWORDS):
                        caps.append("audio")
                if "embedding" in mid_lower:
                    caps.append("embedding")

                if caps:
                    models.append(
                        ModelInfo(
                            id=mid,
                            provider="gemini",
                            display_name=display,
                            capabilities=caps,
                        )
                    )
        except Exception as e:
            # Best-effort listing: degrade to an empty result on API errors.
            logger.warning("Failed to list Gemini models: %s", e)
        return sorted(models, key=lambda m: m.id)
0981a08… noreply 223
0981a08… noreply 224
# Register this provider at import time so the framework can resolve
# "gemini-*" model ids to GeminiProvider and pick sensible defaults
# per capability.
ProviderRegistry.register(
    name="gemini",
    provider_class=GeminiProvider,
    env_var="GEMINI_API_KEY",
    model_prefixes=["gemini-"],
    default_models={
        "chat": "gemini-2.5-flash",
        "vision": "gemini-2.5-flash",
        "audio": "gemini-2.5-flash",
    },
)

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button