PlanOpticon

planopticon / video_processor / providers / gemini_provider.py
Blame History Raw 236 lines
1
"""Google Gemini provider implementation using the google-genai SDK."""
2
3
import logging
4
import os
5
from pathlib import Path
6
from typing import Optional
7
8
from dotenv import load_dotenv
9
10
from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry
11
12
load_dotenv()
13
logger = logging.getLogger(__name__)
14
15
# Capabilities inferred from model id patterns
16
_VISION_KEYWORDS = {"gemini-2", "gemini-3", "gemini-pro", "gemini-flash", "gemini-ultra"}
17
_AUDIO_KEYWORDS = {"gemini-2", "gemini-3", "gemini-flash"}
18
19
20
class GeminiProvider(BaseProvider):
    """Google Gemini API provider via google-genai SDK.

    Authentication is resolved in this order:

    * API key (``api_key`` argument or ``GEMINI_API_KEY``) — Gemini
      Developer API mode.
    * Service-account JSON (``credentials_path`` argument or
      ``GOOGLE_APPLICATION_CREDENTIALS``) — Vertex AI mode, with the
      project id read from the credentials file.
    """

    provider_name = "gemini"

    def __init__(
        self,
        api_key: Optional[str] = None,
        credentials_path: Optional[str] = None,
    ):
        """Initialise the underlying ``genai.Client``.

        Args:
            api_key: Explicit Gemini API key; defaults to ``GEMINI_API_KEY``.
            credentials_path: Path to a service-account JSON file; defaults
                to ``GOOGLE_APPLICATION_CREDENTIALS``.

        Raises:
            ValueError: If neither credential source is available.
            ImportError: If the ``google-genai`` package is not installed.
        """
        self.api_key = api_key or os.getenv("GEMINI_API_KEY")
        self.credentials_path = credentials_path or os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

        if not self.api_key and not self.credentials_path:
            raise ValueError("Neither GEMINI_API_KEY nor GOOGLE_APPLICATION_CREDENTIALS is set")

        # Keep the try body minimal: only the import itself should be able to
        # raise ImportError; client-construction failures must surface as-is.
        try:
            from google import genai
        except ImportError as e:
            raise ImportError(
                "google-genai package not installed. Install with: pip install google-genai"
            ) from e

        self._genai = genai

        if self.api_key:
            self.client = genai.Client(api_key=self.api_key)
        else:
            # Service account → use Vertex AI mode
            import json

            with open(self.credentials_path) as f:
                sa_info = json.load(f)
            project = sa_info.get("project_id", "")
            location = os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1")

            self.client = genai.Client(
                vertexai=True,
                project=project,
                location=location,
            )

    def _record_usage(self, response) -> None:
        """Store token counts from a generate_content *response*.

        Sets ``self._last_usage`` to input/output token counts, falling back
        to zeros when the response carries no ``usage_metadata``.
        # NOTE(review): ``_last_usage`` is presumably read by BaseProvider or
        # callers for cost tracking — confirm against the base class.
        """
        um = getattr(response, "usage_metadata", None)
        self._last_usage = {
            "input_tokens": getattr(um, "prompt_token_count", 0) if um else 0,
            "output_tokens": getattr(um, "candidates_token_count", 0) if um else 0,
        }

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Run a text chat completion.

        Args:
            messages: OpenAI-style messages (``{"role": ..., "content": ...}``).
                Any role other than ``"user"`` maps to Gemini's ``"model"`` role.
            max_tokens: Cap on generated tokens.
            temperature: Sampling temperature.
            model: Model id; defaults to ``gemini-2.5-flash``.

        Returns:
            The generated text, or ``""`` when the response has none.
        """
        from google.genai import types

        model = model or "gemini-2.5-flash"
        # Convert OpenAI-style messages to Gemini contents
        contents = [
            types.Content(
                role="user" if msg["role"] == "user" else "model",
                parts=[types.Part.from_text(text=msg["content"])],
            )
            for msg in messages
        ]

        response = self.client.models.generate_content(
            model=model,
            contents=contents,
            config=types.GenerateContentConfig(
                max_output_tokens=max_tokens,
                temperature=temperature,
            ),
        )
        self._record_usage(response)
        return response.text or ""

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
        mime_type: str = "image/jpeg",
    ) -> str:
        """Analyze an image according to *prompt*.

        Args:
            image_bytes: Raw image data.
            prompt: Instruction for the model.
            max_tokens: Cap on generated tokens.
            model: Model id; defaults to ``gemini-2.5-flash``.
            mime_type: MIME type of *image_bytes*. Defaults to JPEG, which
                preserves the previously hard-coded behaviour.

        Returns:
            The generated text, or ``""`` when the response has none.
        """
        from google.genai import types

        model = model or "gemini-2.5-flash"
        response = self.client.models.generate_content(
            model=model,
            contents=[
                types.Part.from_bytes(data=image_bytes, mime_type=mime_type),
                prompt,
            ],
            config=types.GenerateContentConfig(
                max_output_tokens=max_tokens,
            ),
        )
        self._record_usage(response)
        return response.text or ""

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Transcribe an audio file via Gemini's multimodal input.

        Args:
            audio_path: Path to the audio file (.wav/.mp3/.m4a/.flac/.ogg/
                .webm; unknown extensions are sent as ``audio/wav``).
            language: Optional language hint added to the prompt.
            model: Model id; defaults to ``gemini-2.5-flash``.

        Returns:
            Dict with keys ``text``, ``segments``, ``language``, ``duration``
            (always ``None`` — not reported by the API), ``provider``,
            ``model``.
        """
        import json

        from google.genai import types

        model = model or "gemini-2.5-flash"
        audio_path = Path(audio_path)

        # Determine mime type from the file extension.
        mime_map = {
            ".wav": "audio/wav",
            ".mp3": "audio/mpeg",
            ".m4a": "audio/mp4",
            ".flac": "audio/flac",
            ".ogg": "audio/ogg",
            ".webm": "audio/webm",
        }
        mime_type = mime_map.get(audio_path.suffix.lower(), "audio/wav")

        # Read audio bytes
        audio_bytes = audio_path.read_bytes()

        # BUG FIX: ``lang_hint`` was previously computed but never inserted
        # into the prompt (so ``language`` had no effect), and a stray
        # "python" token had leaked into the instruction text.
        lang_hint = f" The audio is in {language}." if language else ""
        prompt = (
            f"Transcribe this audio accurately.{lang_hint} "
            "Return a JSON object with keys: "
            '"text" (full transcript), '
            '"segments" (array of {start, end, text} objects with timestamps in seconds).'
        )

        response = self.client.models.generate_content(
            model=model,
            contents=[
                types.Part.from_bytes(data=audio_bytes, mime_type=mime_type),
                prompt,
            ],
            config=types.GenerateContentConfig(
                max_output_tokens=8192,
                response_mime_type="application/json",
            ),
        )

        # Parse the JSON response. Fall back to treating the raw text as the
        # transcript when it is invalid JSON OR valid JSON that is not an
        # object (the old code would crash on e.g. a top-level array).
        try:
            data = json.loads(response.text)
        except (json.JSONDecodeError, TypeError):
            data = None
        if not isinstance(data, dict):
            data = {"text": response.text or "", "segments": []}

        return {
            "text": data.get("text", ""),
            "segments": data.get("segments", []),
            "language": language,
            "duration": None,
            "provider": "gemini",
            "model": model,
        }

    def list_models(self) -> list[ModelInfo]:
        """List available models with capabilities inferred from their ids.

        Returns:
            Models sorted by id; only models with at least one recognised
            capability are included. API errors are logged and yield an
            empty list rather than raising.
        """
        models = []
        try:
            for m in self.client.models.list():
                mid = m.name or ""
                # Strip prefix variants from different API modes
                for prefix in ("models/", "publishers/google/models/"):
                    if mid.startswith(prefix):
                        mid = mid[len(prefix):]
                        break
                display = getattr(m, "display_name", mid) or mid

                caps = []
                mid_lower = mid.lower()
                if "gemini" in mid_lower:
                    caps.append("chat")
                if any(kw in mid_lower for kw in _VISION_KEYWORDS):
                    caps.append("vision")
                if any(kw in mid_lower for kw in _AUDIO_KEYWORDS):
                    caps.append("audio")
                if "embedding" in mid_lower:
                    caps.append("embedding")

                if caps:
                    models.append(
                        ModelInfo(
                            id=mid,
                            provider="gemini",
                            display_name=display,
                            capabilities=caps,
                        )
                    )
        except Exception as e:
            logger.warning(f"Failed to list Gemini models: {e}")
        return sorted(models, key=lambda m: m.id)
223
224
225
# Make the Gemini provider discoverable through the shared registry so that
# "gemini-..." model ids resolve to this implementation. One model serves as
# the default for every capability.
_GEMINI_DEFAULT_MODEL = "gemini-2.5-flash"

ProviderRegistry.register(
    name="gemini",
    provider_class=GeminiProvider,
    env_var="GEMINI_API_KEY",
    model_prefixes=["gemini-"],
    default_models={
        capability: _GEMINI_DEFAULT_MODEL
        for capability in ("chat", "vision", "audio")
    },
)
236

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button