"""OpenAI provider implementation."""

import base64
import logging
import os
from pathlib import Path
from typing import Optional

from dotenv import load_dotenv
from openai import OpenAI

from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry

load_dotenv()
logger = logging.getLogger(__name__)

# Models known to have vision capability
_VISION_MODELS = {
    "gpt-4o",
    "gpt-4o-mini",
    "gpt-4-turbo",
    "gpt-4.1",
    "gpt-4.1-mini",
    "gpt-4.1-nano",
    "o1",
    "o3",
    "o3-mini",
    "o4-mini",
}
_AUDIO_MODELS = {"whisper-1"}


class OpenAIProvider(BaseProvider):
    """OpenAI API provider for chat, vision, and audio transcription."""

    provider_name = "openai"

    def __init__(self, api_key: Optional[str] = None):
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("OPENAI_API_KEY is not set and no api_key was provided")
        self.client = OpenAI(api_key=self.api_key)

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
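        """Run a chat completion and return the assistant's text.

        Defaults to gpt-4o-mini when no model is given; token counts for
        the call are recorded in self._last_usage.
        """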
        model = model or "gpt-4o-mini"
        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        self._last_usage = {
            "input_tokens": getattr(response.usage, "prompt_tokens", 0) if response.usage else 0,
            "output_tokens": getattr(response.usage, "completion_tokens", 0)
            if response.usage
            else 0,
        }
        return response.choices[0].message.content or ""

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
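        """Analyze an image with a vision-capable chat model.

        The raw bytes are sent inline as a base64 data URL and are assumed
        to be JPEG-encoded.
        """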
        model = model or "gpt-4o-mini"
        b64 = base64.b64encode(image_bytes).decode()
        response = self.client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                        },
                    ],
                }
            ],
            max_tokens=max_tokens,
        )
        self._last_usage = {
            "input_tokens": getattr(response.usage, "prompt_tokens", 0) if response.usage else 0,
            "output_tokens": getattr(response.usage, "completion_tokens", 0)
            if response.usage
            else 0,
        }
        return response.choices[0].message.content or ""

    # Whisper API limit is 25MB
    _MAX_FILE_SIZE = 25 * 1024 * 1024

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
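        """Transcribe an audio file with the Whisper API.

        Files over the 25MB upload limit are split into chunks and the
        per-chunk transcripts are stitched back together with adjusted
        timestamps.
        """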
        model = model or "whisper-1"
        audio_path = Path(audio_path)
        file_size = audio_path.stat().st_size

        if file_size <= self._MAX_FILE_SIZE:
            return self._transcribe_single(audio_path, language, model)

        # File too large: split into chunks and transcribe each
        logger.info(
            f"Audio file {file_size / 1024 / 1024:.1f}MB exceeds Whisper 25MB limit, chunking..."
        )
        return self._transcribe_chunked(audio_path, language, model)

    def _transcribe_single(self, audio_path: Path, language: Optional[str], model: str) -> dict:
        """Transcribe a single audio file."""
        with open(audio_path, "rb") as f:
            kwargs = {"model": model, "file": f}
            if language:
                kwargs["language"] = language
            response = self.client.audio.transcriptions.create(
                **kwargs, response_format="verbose_json"
            )
        return {
            "text": response.text,
            "segments": [
                {
                    "start": seg.start,
                    "end": seg.end,
                    "text": seg.text,
                }
                for seg in (response.segments or [])
            ],
            "language": getattr(response, "language", language),
            "duration": getattr(response, "duration", None),
            "provider": "openai",
            "model": model,
        }

    def _transcribe_chunked(self, audio_path: Path, language: Optional[str], model: str) -> dict:
        """Split audio into chunks under 25MB and transcribe each."""
        import tempfile

        from video_processor.extractors.audio_extractor import AudioExtractor

        extractor = AudioExtractor()
        audio_data, sr = extractor.load_audio(audio_path)
        total_duration = len(audio_data) / sr

        # Calculate chunk duration to stay under 25MB
        # WAV: 16-bit mono = 2 bytes/sample, plus header overhead
        bytes_per_second = sr * 2
        max_seconds = self._MAX_FILE_SIZE // bytes_per_second
        # Use 80% of max to leave headroom
        chunk_ms = int(max_seconds * 0.8 * 1000)
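        # Worked example (assuming a 16 kHz mono track, which is hypothetical here):
        # bytes_per_second = 32000, max_seconds = 26214400 // 32000 = 819,
        # chunk_ms = int(819 * 0.8 * 1000) = 655200, i.e. ~10.9 minutes per chunk.
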
        segments_data = extractor.segment_audio(audio_data, sr, segment_length_ms=chunk_ms)
        logger.info(f"Split into {len(segments_data)} chunks of ~{chunk_ms / 1000:.0f}s each")

        all_text = []
        all_segments = []
        time_offset = 0.0
        detected_language = language

        with tempfile.TemporaryDirectory() as tmpdir:
            for i, chunk in enumerate(segments_data):
                chunk_path = Path(tmpdir) / f"chunk_{i:03d}.wav"
                extractor.save_segment(chunk, chunk_path, sr)

                logger.info(f"Transcribing chunk {i + 1}/{len(segments_data)}...")
                result = self._transcribe_single(chunk_path, language, model)

                all_text.append(result["text"])
                # Shift each chunk's segment timestamps into the timeline of the full file
                for seg in result.get("segments", []):
                    all_segments.append(
                        {
                            "start": seg["start"] + time_offset,
                            "end": seg["end"] + time_offset,
                            "text": seg["text"],
                        }
                    )

                if not detected_language and result.get("language"):
                    detected_language = result["language"]

                time_offset += len(chunk) / sr

        return {
            "text": " ".join(all_text),
            "segments": all_segments,
            "language": detected_language,
            "duration": total_duration,
            "provider": "openai",
            "model": model,
        }

    def list_models(self) -> list[ModelInfo]:
        """List available models, inferring capabilities from model ids."""
        models = []
        try:
            for m in self.client.models.list():
                mid = m.id
                caps = []
                # Infer capabilities from model id
                if any(mid.startswith(p) for p in ("gpt-", "o1", "o3", "o4")):
                    caps.append("chat")
                # Substring match so dated variants (e.g. gpt-4o-2024-...) also count
                if any(v in mid for v in _VISION_MODELS):
                    caps.append("vision")
                if mid in _AUDIO_MODELS or mid.startswith("whisper"):
                    caps.append("audio")
                if "embedding" in mid:
                    caps.append("embedding")
                if caps:
                    models.append(
                        ModelInfo(
                            id=mid,
                            provider="openai",
                            display_name=mid,
                            capabilities=caps,
                        )
                    )
        except Exception as e:
            logger.warning(f"Failed to list OpenAI models: {e}")
        return sorted(models, key=lambda m: m.id)


ProviderRegistry.register(
    name="openai",
    provider_class=OpenAIProvider,
    env_var="OPENAI_API_KEY",
    model_prefixes=["gpt-", "o1", "o3", "o4", "whisper"],
    default_models={"chat": "gpt-4o-mini", "vision": "gpt-4o-mini", "audio": "whisper-1"},
)
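

# Usage sketch (illustrative only; the file names below are hypothetical):
#
#     provider = OpenAIProvider()  # reads OPENAI_API_KEY via dotenv/environment
#     reply = provider.chat([{"role": "user", "content": "Summarize this video."}])
#     caption = provider.analyze_image(Path("frame.jpg").read_bytes(), "Describe this frame.")
#     transcript = provider.transcribe_audio("narration.wav", language="en")
#     print(transcript["text"], transcript["duration"])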