"""OpenAI provider implementation."""

import base64
import logging
import os
from pathlib import Path
from typing import Optional

from dotenv import load_dotenv
from openai import OpenAI

from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry

load_dotenv()
logger = logging.getLogger(__name__)

# Models known to have vision capability
_VISION_MODELS = {
    "gpt-4o",
    "gpt-4o-mini",
    "gpt-4-turbo",
    "gpt-4.1",
    "gpt-4.1-mini",
    "gpt-4.1-nano",
    "o1",
    "o3",
    "o3-mini",
    "o4-mini",
}
_AUDIO_MODELS = {"whisper-1"}


class OpenAIProvider(BaseProvider):
    """OpenAI API provider for chat, vision, and audio transcription."""

    provider_name = "openai"

    def __init__(self, api_key: Optional[str] = None):
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("OPENAI_API_KEY is not set and no api_key was provided")
        self.client = OpenAI(api_key=self.api_key)

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
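        """Run a chat completion and return the assistant's text.

        Defaults to gpt-4o-mini when no model is given; token counts for
        the call are recorded in self._last_usage.
        """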
        model = model or "gpt-4o-mini"
        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        self._last_usage = {
            "input_tokens": getattr(response.usage, "prompt_tokens", 0) if response.usage else 0,
            "output_tokens": getattr(response.usage, "completion_tokens", 0)
            if response.usage
            else 0,
        }
        return response.choices[0].message.content or ""

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
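        """Analyze an image with a vision-capable chat model.

        The raw bytes are sent inline as a base64 data URL and are assumed
        to be JPEG-encoded.
        """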
        model = model or "gpt-4o-mini"
        b64 = base64.b64encode(image_bytes).decode()
        response = self.client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                        },
                    ],
                }
            ],
            max_tokens=max_tokens,
        )
        self._last_usage = {
            "input_tokens": getattr(response.usage, "prompt_tokens", 0) if response.usage else 0,
            "output_tokens": getattr(response.usage, "completion_tokens", 0)
            if response.usage
            else 0,
        }
        return response.choices[0].message.content or ""

    # Whisper API limit is 25MB
    _MAX_FILE_SIZE = 25 * 1024 * 1024

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
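        """Transcribe an audio file with the Whisper API.

        Files over the 25MB upload limit are split into chunks and the
        per-chunk transcripts are stitched back together with adjusted
        timestamps.
        """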
        model = model or "whisper-1"
        audio_path = Path(audio_path)
        file_size = audio_path.stat().st_size

        if file_size <= self._MAX_FILE_SIZE:
            return self._transcribe_single(audio_path, language, model)

        # File too large: split into chunks and transcribe each
        logger.info(
            f"Audio file {file_size / 1024 / 1024:.1f}MB exceeds Whisper 25MB limit, chunking..."
        )
        return self._transcribe_chunked(audio_path, language, model)

    def _transcribe_single(self, audio_path: Path, language: Optional[str], model: str) -> dict:
        """Transcribe a single audio file."""
        with open(audio_path, "rb") as f:
            kwargs = {"model": model, "file": f}
            if language:
                kwargs["language"] = language
            response = self.client.audio.transcriptions.create(
                **kwargs, response_format="verbose_json"
            )
        return {
            "text": response.text,
            "segments": [
                {
                    "start": seg.start,
                    "end": seg.end,
                    "text": seg.text,
                }
                for seg in (response.segments or [])
            ],
            "language": getattr(response, "language", language),
            "duration": getattr(response, "duration", None),
            "provider": "openai",
            "model": model,
        }

    def _transcribe_chunked(self, audio_path: Path, language: Optional[str], model: str) -> dict:
        """Split audio into chunks under 25MB and transcribe each."""
        import tempfile

        from video_processor.extractors.audio_extractor import AudioExtractor

        extractor = AudioExtractor()
        audio_data, sr = extractor.load_audio(audio_path)
        total_duration = len(audio_data) / sr

        # Calculate chunk duration to stay under 25MB
        # WAV: 16-bit mono = 2 bytes/sample, plus header overhead
        bytes_per_second = sr * 2
        max_seconds = self._MAX_FILE_SIZE // bytes_per_second
        # Use 80% of max to leave headroom
        chunk_ms = int(max_seconds * 0.8 * 1000)
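        # Worked example (assuming a 16 kHz mono track, which is hypothetical here):
        # bytes_per_second = 32000, max_seconds = 26214400 // 32000 = 819,
        # chunk_ms = int(819 * 0.8 * 1000) = 655200, i.e. ~10.9 minutes per chunk.
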
        segments_data = extractor.segment_audio(audio_data, sr, segment_length_ms=chunk_ms)
        logger.info(f"Split into {len(segments_data)} chunks of ~{chunk_ms / 1000:.0f}s each")

        all_text = []
        all_segments = []
        time_offset = 0.0
        detected_language = language

        with tempfile.TemporaryDirectory() as tmpdir:
            for i, chunk in enumerate(segments_data):
                chunk_path = Path(tmpdir) / f"chunk_{i:03d}.wav"
                extractor.save_segment(chunk, chunk_path, sr)

                logger.info(f"Transcribing chunk {i + 1}/{len(segments_data)}...")
                result = self._transcribe_single(chunk_path, language, model)

                all_text.append(result["text"])
                # Shift each chunk's segment timestamps into the timeline of the full file
                for seg in result.get("segments", []):
                    all_segments.append(
                        {
                            "start": seg["start"] + time_offset,
                            "end": seg["end"] + time_offset,
                            "text": seg["text"],
                        }
                    )

                if not detected_language and result.get("language"):
                    detected_language = result["language"]

                time_offset += len(chunk) / sr

        return {
            "text": " ".join(all_text),
            "segments": all_segments,
            "language": detected_language,
            "duration": total_duration,
            "provider": "openai",
            "model": model,
        }

    def list_models(self) -> list[ModelInfo]:
        """List available models, inferring capabilities from model ids."""
        models = []
        try:
            for m in self.client.models.list():
                mid = m.id
                caps = []
                # Infer capabilities from model id
                if any(mid.startswith(p) for p in ("gpt-", "o1", "o3", "o4")):
                    caps.append("chat")
                # Substring match so dated variants (e.g. gpt-4o-2024-...) also count
                if any(v in mid for v in _VISION_MODELS):
                    caps.append("vision")
                if mid in _AUDIO_MODELS or mid.startswith("whisper"):
                    caps.append("audio")
                if "embedding" in mid:
                    caps.append("embedding")
                if caps:
                    models.append(
                        ModelInfo(
                            id=mid,
                            provider="openai",
                            display_name=mid,
                            capabilities=caps,
                        )
                    )
        except Exception as e:
            logger.warning(f"Failed to list OpenAI models: {e}")
        return sorted(models, key=lambda m: m.id)


ProviderRegistry.register(
    name="openai",
    provider_class=OpenAIProvider,
    env_var="OPENAI_API_KEY",
    model_prefixes=["gpt-", "o1", "o3", "o4", "whisper"],
    default_models={"chat": "gpt-4o-mini", "vision": "gpt-4o-mini", "audio": "whisper-1"},
)
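

# Usage sketch (illustrative only; the file names below are hypothetical):
#
#     provider = OpenAIProvider()  # reads OPENAI_API_KEY via dotenv/environment
#     reply = provider.chat([{"role": "user", "content": "Summarize this video."}])
#     caption = provider.analyze_image(Path("frame.jpg").read_bytes(), "Describe this frame.")
#     transcript = provider.transcribe_audio("narration.wav", language="en")
#     print(transcript["text"], transcript["duration"])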