PlanOpticon

planopticon / video_processor / providers / openai_provider.py
Source Blame History 238 lines
a94205b… leo 1 """OpenAI provider implementation."""
a94205b… leo 2
a94205b… leo 3 import base64
a94205b… leo 4 import logging
a94205b… leo 5 import os
a94205b… leo 6 from pathlib import Path
a94205b… leo 7 from typing import Optional
a94205b… leo 8
a94205b… leo 9 from dotenv import load_dotenv
a94205b… leo 10 from openai import OpenAI
a94205b… leo 11
0981a08… noreply 12 from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry
a94205b… leo 13
# Load variables from a local .env file so OPENAI_API_KEY can be supplied
# without exporting it in the shell (no-op if no .env file exists).
load_dotenv()
logger = logging.getLogger(__name__)
a94205b… leo 16
a94205b… leo 17 # Models known to have vision capability
829e24a… leo 18 _VISION_MODELS = {
829e24a… leo 19 "gpt-4o",
829e24a… leo 20 "gpt-4o-mini",
829e24a… leo 21 "gpt-4-turbo",
829e24a… leo 22 "gpt-4.1",
829e24a… leo 23 "gpt-4.1-mini",
829e24a… leo 24 "gpt-4.1-nano",
829e24a… leo 25 "o1",
829e24a… leo 26 "o3",
829e24a… leo 27 "o3-mini",
829e24a… leo 28 "o4-mini",
829e24a… leo 29 }
a94205b… leo 30 _AUDIO_MODELS = {"whisper-1"}
a94205b… leo 31
a94205b… leo 32
class OpenAIProvider(BaseProvider):
    """OpenAI API provider.

    Wraps the OpenAI SDK to expose chat, image analysis (vision), and audio
    transcription behind the BaseProvider interface. Token usage for the most
    recent chat/vision call is recorded in ``self._last_usage``.
    """

    provider_name = "openai"

    # Whisper API rejects uploads larger than 25MB.
    _MAX_FILE_SIZE = 25 * 1024 * 1024

    def __init__(self, api_key: Optional[str] = None):
        """Initialize the OpenAI client.

        Args:
            api_key: Explicit API key. Falls back to the OPENAI_API_KEY
                environment variable (optionally loaded from a .env file).

        Raises:
            ValueError: If no API key can be found.
        """
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("OPENAI_API_KEY not set")
        self.client = OpenAI(api_key=self.api_key)

    @staticmethod
    def _extract_usage(response) -> dict:
        """Build the {input_tokens, output_tokens} dict from a chat response.

        Shared by chat() and analyze_image(); both counts default to 0 when
        the API response carries no usage data.
        """
        usage = response.usage
        return {
            "input_tokens": getattr(usage, "prompt_tokens", 0) if usage else 0,
            "output_tokens": getattr(usage, "completion_tokens", 0) if usage else 0,
        }

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Run a chat completion and return the assistant's text.

        Args:
            messages: Chat messages in OpenAI message-dict format.
            max_tokens: Maximum tokens to generate.
            temperature: Sampling temperature.
            model: Model id; defaults to "gpt-4o-mini".

        Returns:
            The assistant message content ("" if the API returned None).
        """
        model = model or "gpt-4o-mini"
        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        self._last_usage = self._extract_usage(response)
        return response.choices[0].message.content or ""

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
        """Analyze an image with a vision-capable chat model.

        Args:
            image_bytes: Raw image data (sent inline as a base64 JPEG data URL).
            prompt: Instruction describing what to analyze in the image.
            max_tokens: Maximum tokens to generate.
            model: Model id; defaults to "gpt-4o-mini".

        Returns:
            The assistant message content ("" if the API returned None).
        """
        model = model or "gpt-4o-mini"
        b64 = base64.b64encode(image_bytes).decode()
        response = self.client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                        },
                    ],
                }
            ],
            max_tokens=max_tokens,
        )
        self._last_usage = self._extract_usage(response)
        return response.choices[0].message.content or ""

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Transcribe an audio file with Whisper.

        Files over the 25MB API limit are split into chunks; the chunk
        transcripts are stitched back together with adjusted timestamps.

        Args:
            audio_path: Path to the audio file.
            language: Optional language hint; auto-detected when omitted.
            model: Transcription model; defaults to "whisper-1".

        Returns:
            Dict with keys: text, segments, language, duration, provider, model.
        """
        model = model or "whisper-1"
        audio_path = Path(audio_path)
        file_size = audio_path.stat().st_size

        if file_size <= self._MAX_FILE_SIZE:
            return self._transcribe_single(audio_path, language, model)

        # File too large — split into chunks and transcribe each.
        logger.info(
            "Audio file %.1fMB exceeds Whisper 25MB limit, chunking...",
            file_size / 1024 / 1024,
        )
        return self._transcribe_chunked(audio_path, language, model)

    def _transcribe_single(self, audio_path: Path, language: Optional[str], model: str) -> dict:
        """Transcribe a single audio file that fits under the API size limit."""
        with open(audio_path, "rb") as f:
            kwargs = {"model": model, "file": f}
            if language:
                kwargs["language"] = language
            # verbose_json is required to get per-segment timestamps back.
            response = self.client.audio.transcriptions.create(
                **kwargs, response_format="verbose_json"
            )
        return {
            "text": response.text,
            "segments": [
                {
                    "start": seg.start,
                    "end": seg.end,
                    "text": seg.text,
                }
                for seg in (response.segments or [])
            ],
            "language": getattr(response, "language", language),
            "duration": getattr(response, "duration", None),
            "provider": "openai",
            "model": model,
        }

    def _transcribe_chunked(self, audio_path: Path, language: Optional[str], model: str) -> dict:
        """Split audio into chunks under 25MB and transcribe each.

        Segment timestamps are shifted by the cumulative duration of the
        preceding chunks so they remain relative to the full recording.
        """
        import tempfile

        from video_processor.extractors.audio_extractor import AudioExtractor

        extractor = AudioExtractor()
        audio_data, sr = extractor.load_audio(audio_path)
        total_duration = len(audio_data) / sr

        # Calculate chunk duration to stay under 25MB.
        # WAV: 16-bit mono = 2 bytes/sample, plus header overhead.
        bytes_per_second = sr * 2
        max_seconds = self._MAX_FILE_SIZE // bytes_per_second
        # Use 80% of max to leave headroom.
        chunk_ms = int(max_seconds * 0.8 * 1000)

        segments_data = extractor.segment_audio(audio_data, sr, segment_length_ms=chunk_ms)
        logger.info("Split into %d chunks of ~%.0fs each", len(segments_data), chunk_ms / 1000)

        all_text = []
        all_segments = []
        time_offset = 0.0
        detected_language = language

        with tempfile.TemporaryDirectory() as tmpdir:
            for i, chunk in enumerate(segments_data):
                chunk_path = Path(tmpdir) / f"chunk_{i:03d}.wav"
                extractor.save_segment(chunk, chunk_path, sr)

                logger.info("Transcribing chunk %d/%d...", i + 1, len(segments_data))
                result = self._transcribe_single(chunk_path, language, model)

                all_text.append(result["text"])
                for seg in result.get("segments", []):
                    all_segments.append(
                        {
                            "start": seg["start"] + time_offset,
                            "end": seg["end"] + time_offset,
                            "text": seg["text"],
                        }
                    )

                # Keep the first auto-detected language for the combined result.
                if not detected_language and result.get("language"):
                    detected_language = result["language"]

                time_offset += len(chunk) / sr

        return {
            "text": " ".join(all_text),
            "segments": all_segments,
            "language": detected_language,
            "duration": total_duration,
            "provider": "openai",
            "model": model,
        }

    def list_models(self) -> list[ModelInfo]:
        """List available OpenAI models with inferred capabilities.

        Capabilities are inferred from the model id (the models endpoint does
        not report them); models with no recognized capability are skipped.
        Returns an empty list if the API call fails.
        """
        models = []
        try:
            for m in self.client.models.list():
                mid = m.id
                caps = []
                # Infer capabilities from model id.
                if any(mid.startswith(p) for p in ("gpt-", "o1", "o3", "o4")):
                    caps.append("chat")
                # Substring match covers dated variants like "gpt-4o-2024-08-06";
                # "gpt-4o"/"gpt-4.1" are already members, so no extra checks needed.
                if any(v in mid for v in _VISION_MODELS):
                    caps.append("vision")
                if mid in _AUDIO_MODELS or mid.startswith("whisper"):
                    caps.append("audio")
                if "embedding" in mid:
                    caps.append("embedding")
                if caps:
                    models.append(
                        ModelInfo(
                            id=mid,
                            provider="openai",
                            display_name=mid,
                            capabilities=caps,
                        )
                    )
        except Exception as e:
            logger.warning(f"Failed to list OpenAI models: {e}")
        return sorted(models, key=lambda m: m.id)
0981a08… noreply 230
0981a08… noreply 231
# Import-time side effect: register this provider with the shared registry —
# presumably so it can be looked up by name, routed by model-id prefix, and
# supply per-capability default models (verify against ProviderRegistry).
ProviderRegistry.register(
    name="openai",
    provider_class=OpenAIProvider,
    env_var="OPENAI_API_KEY",
    model_prefixes=["gpt-", "o1", "o3", "o4", "whisper"],
    default_models={"chat": "gpt-4o-mini", "vision": "gpt-4o-mini", "audio": "whisper-1"},
)

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button