PlanOpticon

planopticon / video_processor / providers / vertex_provider.py
Blame History Raw 227 lines
1
"""Google Vertex AI provider implementation."""
2
3
import logging
4
import os
5
from pathlib import Path
6
from typing import Optional
7
8
from dotenv import load_dotenv
9
10
from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry
11
12
load_dotenv()
13
logger = logging.getLogger(__name__)
14
15
# Curated list of models available on Vertex AI.
# All of these are multimodal Gemini models, so they share one capability set.
_VERTEX_MODELS = [
    ModelInfo(
        id=model_id,
        provider="vertex",
        display_name=display_name,
        capabilities=["chat", "vision", "audio"],
    )
    for model_id, display_name in [
        ("gemini-2.0-flash", "Gemini 2.0 Flash"),
        ("gemini-2.0-pro", "Gemini 2.0 Pro"),
        ("gemini-1.5-pro", "Gemini 1.5 Pro"),
        ("gemini-1.5-flash", "Gemini 1.5 Flash"),
    ]
]
42
43
44
class VertexProvider(BaseProvider):
    """Google Vertex AI provider using the google-genai SDK with Vertex config.

    Project and location come from the constructor arguments or the
    GOOGLE_CLOUD_PROJECT / GOOGLE_CLOUD_REGION environment variables
    (credentials presumably come from Application Default Credentials —
    the SDK handles auth; not shown here).
    """

    provider_name = "vertex"

    def __init__(
        self,
        project: Optional[str] = None,
        location: Optional[str] = None,
    ):
        """Create a Vertex-backed genai client.

        Args:
            project: GCP project id; falls back to GOOGLE_CLOUD_PROJECT.
            location: Vertex region; falls back to GOOGLE_CLOUD_REGION,
                then "us-central1".

        Raises:
            ImportError: if the google-genai SDK is not installed.
            ValueError: if no project id can be resolved.
        """
        try:
            from google import genai
            from google.genai import types  # noqa: F401  (probe that the SDK is complete)
        except ImportError:
            raise ImportError(
                "google-cloud-aiplatform or google-genai package not installed. "
                "Install with: pip install google-cloud-aiplatform"
            )

        self._genai = genai
        self._project = project or os.getenv("GOOGLE_CLOUD_PROJECT")
        self._location = location or os.getenv("GOOGLE_CLOUD_REGION", "us-central1")

        if not self._project:
            raise ValueError("GOOGLE_CLOUD_PROJECT not set")

        self.client = genai.Client(
            vertexai=True,
            project=self._project,
            location=self._location,
        )
        # Token counts for the most recent request; updated by _record_usage().
        self._last_usage = {}

    @staticmethod
    def _resolve_model(model: Optional[str]) -> str:
        """Return a bare Vertex model id, stripping any "vertex/" routing prefix."""
        model = model or "gemini-2.0-flash"
        if model.startswith("vertex/"):
            model = model[len("vertex/") :]
        return model

    def _record_usage(self, response) -> None:
        """Stash input/output token counts from *response* into ``_last_usage``.

        Defaults to 0 when the response carries no usage metadata.
        """
        um = getattr(response, "usage_metadata", None)
        self._last_usage = {
            "input_tokens": getattr(um, "prompt_token_count", 0) if um else 0,
            "output_tokens": getattr(um, "candidates_token_count", 0) if um else 0,
        }

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Run a chat completion and return the generated text.

        Args:
            messages: list of {"role": ..., "content": ...} dicts; any role
                other than "user" is sent as "model".
            max_tokens: cap on generated tokens.
            temperature: sampling temperature.
            model: model id, with or without the "vertex/" prefix.

        Returns:
            The response text, or "" if the response had none.
        """
        from google.genai import types

        model = self._resolve_model(model)

        # Convert OpenAI-style role/content messages to genai Content objects.
        contents = [
            types.Content(
                role="user" if msg["role"] == "user" else "model",
                parts=[types.Part.from_text(text=msg["content"])],
            )
            for msg in messages
        ]

        response = self.client.models.generate_content(
            model=model,
            contents=contents,
            config=types.GenerateContentConfig(
                max_output_tokens=max_tokens,
                temperature=temperature,
            ),
        )
        self._record_usage(response)
        return response.text or ""

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
        """Answer *prompt* about a single image.

        Args:
            image_bytes: raw image data (sent as image/jpeg).
            prompt: instruction/question about the image.
            max_tokens: cap on generated tokens.
            model: model id, with or without the "vertex/" prefix.

        Returns:
            The response text, or "" if the response had none.
        """
        from google.genai import types

        model = self._resolve_model(model)

        response = self.client.models.generate_content(
            model=model,
            contents=[
                types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
                prompt,
            ],
            config=types.GenerateContentConfig(
                max_output_tokens=max_tokens,
            ),
        )
        self._record_usage(response)
        return response.text or ""

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Transcribe an audio file via a Gemini model.

        Args:
            audio_path: path to the audio file; MIME type is inferred from
                the extension (defaults to audio/wav).
            language: optional language hint embedded in the prompt.
            model: model id, with or without the "vertex/" prefix.

        Returns:
            dict with keys: text, segments, language, duration (always None —
            the model does not report it), provider, model.
        """
        import json

        from google.genai import types

        model = self._resolve_model(model)

        audio_path = Path(audio_path)
        suffix = audio_path.suffix.lower()
        mime_map = {
            ".wav": "audio/wav",
            ".mp3": "audio/mpeg",
            ".m4a": "audio/mp4",
            ".flac": "audio/flac",
            ".ogg": "audio/ogg",
            ".webm": "audio/webm",
        }
        mime_type = mime_map.get(suffix, "audio/wav")
        audio_bytes = audio_path.read_bytes()

        # Bug fix: the hint was built but never interpolated (the f-string
        # contained a stray "python" token instead of {lang_hint}).
        lang_hint = f" The audio is in {language}." if language else ""
        prompt = (
            f"Transcribe this audio accurately.{lang_hint} "
            "Return a JSON object with keys: "
            '"text" (full transcript), '
            '"segments" (array of {start, end, text} objects with timestamps in seconds).'
        )

        response = self.client.models.generate_content(
            model=model,
            contents=[
                types.Part.from_bytes(data=audio_bytes, mime_type=mime_type),
                prompt,
            ],
            config=types.GenerateContentConfig(
                max_output_tokens=8192,
                response_mime_type="application/json",
            ),
        )

        # The model is asked for JSON, but fall back to raw text if it
        # returns something unparseable (or no text at all).
        try:
            data = json.loads(response.text)
        except (json.JSONDecodeError, TypeError):
            data = {"text": response.text or "", "segments": []}

        self._record_usage(response)

        return {
            "text": data.get("text", ""),
            "segments": data.get("segments", []),
            "language": language,
            "duration": None,
            "provider": "vertex",
            "model": model,
        }

    def list_models(self) -> list[ModelInfo]:
        """Return a copy of the curated Vertex model list."""
        return list(_VERTEX_MODELS)
214
215
216
# Register this provider so "vertex/..." model ids route here.
# Every capability defaults to the same flash model.
ProviderRegistry.register(
    name="vertex",
    provider_class=VertexProvider,
    env_var="GOOGLE_CLOUD_PROJECT",
    model_prefixes=["vertex/"],
    default_models=dict.fromkeys(("chat", "vision", "audio"), "gemini-2.0-flash"),
)
227

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button