PlanOpticon
API Providers
Providers API Reference
video_processor.providers.base
Abstract base class, registry, and shared types for provider implementations.
BaseProvider
Bases: ABC
Abstract base for all provider implementations.
Source code in video_processor/providers/base.py
class BaseProvider(ABC):
    """Abstract base for all provider implementations.

    Defines the four capabilities every provider must implement:
    chat, image analysis (vision), audio transcription, and model listing.
    """

    # Identifier for this provider (e.g. "openai"); set by each subclass.
    provider_name: str = ""

    @abstractmethod
    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Send a chat completion request. Returns the assistant text."""

    @abstractmethod
    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
        """Analyze an image with a prompt. Returns the assistant text."""

    @abstractmethod
    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Transcribe an audio file. Returns dict with 'text', 'segments', etc."""

    @abstractmethod
    def list_models(self) -> list[ModelInfo]:
        """Discover available models from this provider's API."""
analyze_image(image_bytes, prompt, max_tokens=4096, model=None)
abstractmethod
Analyze an image with a prompt. Returns the assistant text.
Source code in video_processor/providers/base.py
@abstractmethod
def analyze_image(
    self,
    image_bytes: bytes,
    prompt: str,
    max_tokens: int = 4096,
    model: Optional[str] = None,
) -> str:
    """Analyze an image with a prompt. Returns the assistant text.

    image_bytes is raw image data; model optionally overrides the default.
    """
chat(messages, max_tokens=4096, temperature=0.7, model=None)
abstractmethod
Send a chat completion request. Returns the assistant text.
Source code in video_processor/providers/base.py
@abstractmethod
def chat(
    self,
    messages: list[dict],
    max_tokens: int = 4096,
    temperature: float = 0.7,
    model: Optional[str] = None,
) -> str:
    """Send a chat completion request. Returns the assistant text.

    messages is an OpenAI-style list of {"role": ..., "content": ...} dicts.
    """
list_models()
abstractmethod
Discover available models from this provider's API.
Source code in video_processor/providers/base.py
@abstractmethod
def list_models(self) -> list[ModelInfo]:
    """Discover available models from this provider's API."""
transcribe_audio(audio_path, language=None, model=None)
abstractmethod
Transcribe an audio file. Returns dict with 'text', 'segments', etc.
Source code in video_processor/providers/base.py
@abstractmethod
def transcribe_audio(
    self,
    audio_path: str | Path,
    language: Optional[str] = None,
    model: Optional[str] = None,
) -> dict:
    """Transcribe an audio file. Returns dict with 'text', 'segments', etc.

    language is an optional hint passed through to the backend.
    """
ModelInfo
Bases: BaseModel
Information about an available model.
Source code in video_processor/providers/base.py
class ModelInfo(BaseModel):
    """Information about an available model."""

    # Model identifier as reported by the provider's API.
    id: str = Field(description="Model identifier (e.g. gpt-4o)")
    # Which provider exposes this model.
    provider: str = Field(description="Provider name (openai, anthropic, gemini)")
    # Optional friendly name; empty string when the API supplies none.
    display_name: str = Field(default="", description="Human-readable name")
    # Capability tags used for routing; empty list when unknown.
    capabilities: List[str] = Field(
        default_factory=list, description="Model capabilities: chat, vision, audio, embedding"
    )
OpenAICompatibleProvider
Bases: BaseProvider
Base for providers using OpenAI-compatible APIs.
Suitable for Together, Fireworks, Cerebras, xAI, Azure, and similar services.
Source code in video_processor/providers/base.py
class OpenAICompatibleProvider(BaseProvider):
    """Base for providers using OpenAI-compatible APIs.

    Suitable for Together, Fireworks, Cerebras, xAI, Azure, and similar services.
    Subclasses override provider_name, base_url, and env_var.
    """

    provider_name: str = ""
    # Default API endpoint; overridable per-instance via the constructor.
    base_url: str = ""
    # Environment variable consulted when no api_key is passed.
    env_var: str = ""

    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None):
        """Create a client; falls back to env var / class attribute defaults."""
        # Imported lazily so the openai package is only required when used.
        from openai import OpenAI

        self._api_key = api_key or os.getenv(self.env_var, "")
        self._base_url = base_url or self.base_url
        self._client = OpenAI(api_key=self._api_key, base_url=self._base_url)
        # Token counts of the most recent call; consumed by ProviderManager._track().
        self._last_usage = None

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Send a chat completion request. Returns the assistant text."""
        # NOTE(review): falls back to "gpt-4o" even on non-OpenAI hosts —
        # callers should pass an explicit model; confirm this default is intended.
        model = model or "gpt-4o"
        response = self._client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        # usage may be None on some compatible backends; default counts to 0.
        self._last_usage = {
            "input_tokens": getattr(response.usage, "prompt_tokens", 0) if response.usage else 0,
            "output_tokens": getattr(response.usage, "completion_tokens", 0)
            if response.usage
            else 0,
        }
        return response.choices[0].message.content or ""

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
        """Analyze an image with a prompt. Returns the assistant text."""
        model = model or "gpt-4o"
        # Image is sent inline as a base64 data URL (labeled JPEG regardless
        # of actual format — NOTE(review): confirm backends accept this).
        b64 = base64.b64encode(image_bytes).decode()
        response = self._client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                        },
                    ],
                }
            ],
            max_tokens=max_tokens,
        )
        self._last_usage = {
            "input_tokens": getattr(response.usage, "prompt_tokens", 0) if response.usage else 0,
            "output_tokens": getattr(response.usage, "completion_tokens", 0)
            if response.usage
            else 0,
        }
        return response.choices[0].message.content or ""

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Not supported by generic OpenAI-compatible backends."""
        raise NotImplementedError(f"{self.provider_name} does not support audio transcription")

    def list_models(self) -> list[ModelInfo]:
        """List models from the backend; returns [] (with a warning) on failure."""
        models = []
        try:
            for m in self._client.models.list():
                mid = m.id
                # The generic /models endpoint carries no capability metadata,
                # so everything is tagged "chat".
                caps = ["chat"]
                models.append(
                    ModelInfo(
                        id=mid,
                        provider=self.provider_name,
                        display_name=mid,
                        capabilities=caps,
                    )
                )
        except Exception as e:
            logger.warning(f"Failed to list {self.provider_name} models: {e}")
        return sorted(models, key=lambda m: m.id)
ProviderRegistry
Registry for provider classes. Providers register themselves with metadata.
Source code in video_processor/providers/base.py
class ProviderRegistry:
    """Registry for provider classes. Providers register themselves with metadata."""

    # Maps provider name -> {"class", "env_var", "model_prefixes", "default_models"}.
    _providers: Dict[str, Dict] = {}

    @classmethod
    def register(
        cls,
        name: str,
        provider_class: type,
        env_var: str = "",
        model_prefixes: Optional[List[str]] = None,
        default_models: Optional[Dict[str, str]] = None,
    ) -> None:
        """Register a provider class with its metadata.

        Re-registering the same name replaces the previous entry.
        """
        cls._providers[name] = {
            "class": provider_class,
            "env_var": env_var,
            "model_prefixes": model_prefixes or [],
            "default_models": default_models or {},
        }

    @classmethod
    def get(cls, name: str) -> type:
        """Return the provider class for a given name.

        Raises ValueError for unknown names.
        """
        if name not in cls._providers:
            raise ValueError(f"Unknown provider: {name}")
        return cls._providers[name]["class"]

    @classmethod
    def get_by_model(cls, model_id: str) -> Optional[str]:
        """Return provider name for a model ID based on prefix matching.

        Returns None when no registered prefix matches.
        """
        for name, info in cls._providers.items():
            for prefix in info["model_prefixes"]:
                if model_id.startswith(prefix):
                    return name
        return None

    @classmethod
    def get_default_models(cls, name: str) -> Dict[str, str]:
        """Return the default models dict for a provider ({} if unregistered)."""
        if name not in cls._providers:
            return {}
        return cls._providers[name].get("default_models", {})

    @classmethod
    def available(cls) -> List[str]:
        """Return names of providers whose env var is set (or have no env var requirement)."""
        result = []
        for name, info in cls._providers.items():
            env_var = info.get("env_var", "")
            if not env_var:
                # Providers without an env var (e.g. ollama) need special availability checks
                result.append(name)
            elif os.getenv(env_var, ""):
                result.append(name)
        return result

    @classmethod
    def all_registered(cls) -> Dict[str, Dict]:
        """Return all registered providers and their metadata (shallow copy)."""
        return dict(cls._providers)
all_registered()
classmethod
Return all registered providers and their metadata.
Source code in video_processor/providers/base.py
@classmethod
def all_registered(cls) -> Dict[str, Dict]:
    """Return all registered providers and their metadata (shallow copy)."""
    return dict(cls._providers)
available()
classmethod
Return names of providers whose env var is set (or have no env var requirement).
Source code in video_processor/providers/base.py
@classmethod
def available(cls) -> List[str]:
    """Return names of providers whose env var is set (or have no env var requirement)."""
    result = []
    for name, info in cls._providers.items():
        env_var = info.get("env_var", "")
        if not env_var:
            # Providers without an env var (e.g. ollama) need special availability checks
            result.append(name)
        elif os.getenv(env_var, ""):
            result.append(name)
    return result
get(name)
classmethod
Return the provider class for a given name.
Source code in video_processor/providers/base.py
@classmethod
def get(cls, name: str) -> type:
    """Return the provider class for a given name.

    Raises ValueError for unknown names.
    """
    if name not in cls._providers:
        raise ValueError(f"Unknown provider: {name}")
    return cls._providers[name]["class"]
get_by_model(model_id)
classmethod
Return provider name for a model ID based on prefix matching.
Source code in video_processor/providers/base.py
@classmethod
def get_by_model(cls, model_id: str) -> Optional[str]:
    """Return provider name for a model ID based on prefix matching.

    Returns None when no registered prefix matches.
    """
    for name, info in cls._providers.items():
        for prefix in info["model_prefixes"]:
            if model_id.startswith(prefix):
                return name
    return None
get_default_models(name)
classmethod
Return the default models dict for a provider.
Source code in video_processor/providers/base.py
@classmethod
def get_default_models(cls, name: str) -> Dict[str, str]:
    """Return the default models dict for a provider ({} if unregistered)."""
    if name not in cls._providers:
        return {}
    return cls._providers[name].get("default_models", {})
register(name, provider_class, env_var='', model_prefixes=None, default_models=None)
classmethod
Register a provider class with its metadata.
Source code in video_processor/providers/base.py
@classmethod
def register(
    cls,
    name: str,
    provider_class: type,
    env_var: str = "",
    model_prefixes: Optional[List[str]] = None,
    default_models: Optional[Dict[str, str]] = None,
) -> None:
    """Register a provider class with its metadata.

    Re-registering the same name replaces the previous entry.
    """
    cls._providers[name] = {
        "class": provider_class,
        "env_var": env_var,
        "model_prefixes": model_prefixes or [],
        "default_models": default_models or {},
    }
video_processor.providers.manager
ProviderManager - unified interface for routing API calls to the best available provider.
ProviderManager
Routes API calls to the best available provider/model.
Supports explicit model selection or auto-routing based on discovered available models.
Source code in video_processor/providers/manager.py
class ProviderManager:
    """
    Routes API calls to the best available provider/model.
    Supports explicit model selection or auto-routing based on
    discovered available models.
    """

    def __init__(
        self,
        vision_model: Optional[str] = None,
        chat_model: Optional[str] = None,
        transcription_model: Optional[str] = None,
        provider: Optional[str] = None,
        auto: bool = True,
    ):
        """
        Initialize the ProviderManager.

        Parameters
        ----------
        vision_model : override model for vision tasks (e.g. 'gpt-4o')
        chat_model : override model for chat/LLM tasks
        transcription_model : override model for transcription
        provider : force all tasks to a single provider ('openai', 'anthropic', 'gemini')
        auto : if True and no model specified, pick the best available
        """
        _ensure_providers_registered()
        self.auto = auto
        # Lazily constructed provider instances, keyed by provider name.
        self._providers: dict[str, BaseProvider] = {}
        # Discovery results cache; populated on first _get_available_models().
        self._available_models: Optional[list[ModelInfo]] = None
        self.usage = UsageTracker()
        # If a single provider is forced, apply it
        if provider:
            self.vision_model = vision_model or self._default_for_provider(provider, "vision")
            self.chat_model = chat_model or self._default_for_provider(provider, "chat")
            self.transcription_model = transcription_model or self._default_for_provider(
                provider, "audio"
            )
        else:
            self.vision_model = vision_model
            self.chat_model = chat_model
            self.transcription_model = transcription_model
        self._forced_provider = provider

    @staticmethod
    def _default_for_provider(provider: str, capability: str) -> str:
        """Return the default model for a provider/capability combo."""
        defaults = ProviderRegistry.get_default_models(provider)
        if defaults:
            return defaults.get(capability, "")
        # Fallback for unregistered providers
        return ""

    def _get_provider(self, provider_name: str) -> BaseProvider:
        """Lazily initialize and cache a provider instance."""
        if provider_name not in self._providers:
            _ensure_providers_registered()
            provider_class = ProviderRegistry.get(provider_name)
            self._providers[provider_name] = provider_class()
        return self._providers[provider_name]

    def _provider_for_model(self, model_id: str) -> str:
        """Infer the provider from a model id.

        Raises ValueError when neither prefix matching nor discovery resolves it.
        """
        _ensure_providers_registered()
        # Check registry prefix matching first
        provider_name = ProviderRegistry.get_by_model(model_id)
        if provider_name:
            return provider_name
        # Try discovery (exact match, then prefix match for ollama name:tag format)
        models = self._get_available_models()
        for m in models:
            if m.id == model_id:
                return m.provider
        for m in models:
            if m.id.startswith(model_id + ":"):
                return m.provider
        raise ValueError(f"Cannot determine provider for model: {model_id}")

    def _get_available_models(self) -> list[ModelInfo]:
        # Cached for the lifetime of this manager instance.
        if self._available_models is None:
            self._available_models = discover_available_models()
        return self._available_models

    def _resolve_model(
        self, explicit: Optional[str], capability: str, preferences: list[tuple[str, str]]
    ) -> tuple[str, str]:
        """
        Resolve which (provider, model) to use for a capability.
        Returns (provider_name, model_id).

        Raises RuntimeError when no provider is usable for the capability.
        """
        if explicit:
            prov = self._provider_for_model(explicit)
            return prov, explicit
        if self.auto:
            # Try preference order, picking the first provider that has an API key
            for prov, model in preferences:
                try:
                    # Constructing the provider is the availability probe.
                    self._get_provider(prov)
                    return prov, model
                except (ValueError, ImportError):
                    continue
            # Fallback: try Ollama if available (no API key needed)
            try:
                from video_processor.providers.ollama_provider import OllamaProvider

                if OllamaProvider.is_available():
                    provider = self._get_provider("ollama")
                    models = provider.list_models()
                    for m in models:
                        if capability in m.capabilities:
                            return "ollama", m.id
            except Exception:
                pass
        raise RuntimeError(
            f"No provider available for capability '{capability}'. "
            "Set an API key for at least one provider, or start Ollama."
        )

    def _track(self, provider: BaseProvider, prov_name: str, model: str) -> None:
        """Record usage from the last API call on a provider."""
        last = getattr(provider, "_last_usage", None)
        if last:
            self.usage.record(
                provider=prov_name,
                model=model,
                input_tokens=last.get("input_tokens", 0),
                output_tokens=last.get("output_tokens", 0),
            )
            # Clear so the same call is not double-counted.
            provider._last_usage = None

    # --- Public API ---

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
    ) -> str:
        """Send a chat completion to the best available provider."""
        prov_name, model = self._resolve_model(self.chat_model, "chat", _CHAT_PREFERENCES)
        logger.info(f"Chat: using {prov_name}/{model}")
        provider = self._get_provider(prov_name)
        result = provider.chat(
            messages, max_tokens=max_tokens, temperature=temperature, model=model
        )
        self._track(provider, prov_name, model)
        return result

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
    ) -> str:
        """Analyze an image using the best available vision provider."""
        prov_name, model = self._resolve_model(self.vision_model, "vision", _VISION_PREFERENCES)
        logger.info(f"Vision: using {prov_name}/{model}")
        provider = self._get_provider(prov_name)
        result = provider.analyze_image(image_bytes, prompt, max_tokens=max_tokens, model=model)
        self._track(provider, prov_name, model)
        return result

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        speaker_hints: Optional[list[str]] = None,
    ) -> dict:
        """Transcribe audio using local Whisper if available, otherwise API."""
        # Prefer local Whisper — no file size limits, no API costs
        if not self.transcription_model or self.transcription_model.startswith("whisper-local"):
            try:
                from video_processor.providers.whisper_local import WhisperLocal

                if WhisperLocal.is_available():
                    # Parse model size from "whisper-local:large" or default to "large"
                    size = "large"
                    if self.transcription_model and ":" in self.transcription_model:
                        size = self.transcription_model.split(":", 1)[1]
                    # Cache the loaded model across calls (loading is expensive).
                    if not hasattr(self, "_whisper_local"):
                        self._whisper_local = WhisperLocal(model_size=size)
                    logger.info(f"Transcription: using local whisper-{size}")
                    # Pass speaker names as initial prompt hint for Whisper
                    whisper_kwargs = {"language": language}
                    if speaker_hints:
                        whisper_kwargs["initial_prompt"] = (
                            "Speakers: " + ", ".join(speaker_hints) + "."
                        )
                    result = self._whisper_local.transcribe(audio_path, **whisper_kwargs)
                    duration = result.get("duration") or 0
                    self.usage.record(
                        provider="local",
                        model=f"whisper-{size}",
                        audio_minutes=duration / 60 if duration else 0,
                    )
                    return result
            except ImportError:
                # whisper not installed; fall through to API transcription.
                pass
        # Fall back to API-based transcription
        prov_name, model = self._resolve_model(
            self.transcription_model, "audio", _TRANSCRIPTION_PREFERENCES
        )
        logger.info(f"Transcription: using {prov_name}/{model}")
        provider = self._get_provider(prov_name)
        # Build transcription kwargs, passing speaker hints where supported
        transcribe_kwargs: dict = {"language": language, "model": model}
        if speaker_hints:
            if prov_name == "openai":
                # OpenAI Whisper supports a 'prompt' parameter for hints
                transcribe_kwargs["prompt"] = "Speakers: " + ", ".join(speaker_hints) + "."
            else:
                transcribe_kwargs["speaker_hints"] = speaker_hints
        result = provider.transcribe_audio(audio_path, **transcribe_kwargs)
        duration = result.get("duration") or 0
        self.usage.record(
            provider=prov_name,
            model=model,
            audio_minutes=duration / 60 if duration else 0,
        )
        return result

    def get_models_used(self) -> dict[str, str]:
        """Return a dict mapping capability to 'provider/model' for tracking."""
        result = {}
        for cap, explicit, prefs in [
            ("vision", self.vision_model, _VISION_PREFERENCES),
            ("chat", self.chat_model, _CHAT_PREFERENCES),
            ("transcription", self.transcription_model, _TRANSCRIPTION_PREFERENCES),
        ]:
            try:
                prov, model = self._resolve_model(explicit, cap, prefs)
                result[cap] = f"{prov}/{model}"
            except RuntimeError:
                # Capability unavailable — omit it rather than fail.
                pass
        return result
__init__(vision_model=None, chat_model=None, transcription_model=None, provider=None, auto=True)
Initialize the ProviderManager.
Parameters
vision_model : override model for vision tasks (e.g. 'gpt-4o')
chat_model : override model for chat/LLM tasks
transcription_model : override model for transcription
provider : force all tasks to a single provider ('openai', 'anthropic', 'gemini')
auto : if True and no model specified, pick the best available
Source code in video_processor/providers/manager.py
def __init__(
    self,
    vision_model: Optional[str] = None,
    chat_model: Optional[str] = None,
    transcription_model: Optional[str] = None,
    provider: Optional[str] = None,
    auto: bool = True,
):
    """
    Initialize the ProviderManager.

    Parameters
    ----------
    vision_model : override model for vision tasks (e.g. 'gpt-4o')
    chat_model : override model for chat/LLM tasks
    transcription_model : override model for transcription
    provider : force all tasks to a single provider ('openai', 'anthropic', 'gemini')
    auto : if True and no model specified, pick the best available
    """
    _ensure_providers_registered()
    self.auto = auto
    # Lazily constructed provider instances, keyed by provider name.
    self._providers: dict[str, BaseProvider] = {}
    # Discovery results cache; populated on first use.
    self._available_models: Optional[list[ModelInfo]] = None
    self.usage = UsageTracker()
    # If a single provider is forced, apply it
    if provider:
        self.vision_model = vision_model or self._default_for_provider(provider, "vision")
        self.chat_model = chat_model or self._default_for_provider(provider, "chat")
        self.transcription_model = transcription_model or self._default_for_provider(
            provider, "audio"
        )
    else:
        self.vision_model = vision_model
        self.chat_model = chat_model
        self.transcription_model = transcription_model
    self._forced_provider = provider
analyze_image(image_bytes, prompt, max_tokens=4096)
Analyze an image using the best available vision provider.
Source code in video_processor/providers/manager.py
def analyze_image(
    self,
    image_bytes: bytes,
    prompt: str,
    max_tokens: int = 4096,
) -> str:
    """Analyze an image using the best available vision provider."""
    prov_name, model = self._resolve_model(self.vision_model, "vision", _VISION_PREFERENCES)
    logger.info(f"Vision: using {prov_name}/{model}")
    provider = self._get_provider(prov_name)
    result = provider.analyze_image(image_bytes, prompt, max_tokens=max_tokens, model=model)
    # Record token usage from the call just made.
    self._track(provider, prov_name, model)
    return result
chat(messages, max_tokens=4096, temperature=0.7)
Send a chat completion to the best available provider.
Source code in video_processor/providers/manager.py
def chat(
    self,
    messages: list[dict],
    max_tokens: int = 4096,
    temperature: float = 0.7,
) -> str:
    """Send a chat completion to the best available provider."""
    prov_name, model = self._resolve_model(self.chat_model, "chat", _CHAT_PREFERENCES)
    logger.info(f"Chat: using {prov_name}/{model}")
    provider = self._get_provider(prov_name)
    result = provider.chat(
        messages, max_tokens=max_tokens, temperature=temperature, model=model
    )
    # Record token usage from the call just made.
    self._track(provider, prov_name, model)
    return result
get_models_used()
Return a dict mapping capability to 'provider/model' for tracking.
Source code in video_processor/providers/manager.py
def get_models_used(self) -> dict[str, str]:
    """Return a dict mapping capability to 'provider/model' for tracking."""
    result = {}
    for cap, explicit, prefs in [
        ("vision", self.vision_model, _VISION_PREFERENCES),
        ("chat", self.chat_model, _CHAT_PREFERENCES),
        ("transcription", self.transcription_model, _TRANSCRIPTION_PREFERENCES),
    ]:
        try:
            prov, model = self._resolve_model(explicit, cap, prefs)
            result[cap] = f"{prov}/{model}"
        except RuntimeError:
            # Capability unavailable — omit it rather than fail.
            pass
    return result
transcribe_audio(audio_path, language=None, speaker_hints=None)
Transcribe audio using local Whisper if available, otherwise API.
Source code in video_processor/providers/manager.py
def transcribe_audio(
    self,
    audio_path: str | Path,
    language: Optional[str] = None,
    speaker_hints: Optional[list[str]] = None,
) -> dict:
    """Transcribe audio using local Whisper if available, otherwise API."""
    # Prefer local Whisper — no file size limits, no API costs
    if not self.transcription_model or self.transcription_model.startswith("whisper-local"):
        try:
            from video_processor.providers.whisper_local import WhisperLocal

            if WhisperLocal.is_available():
                # Parse model size from "whisper-local:large" or default to "large"
                size = "large"
                if self.transcription_model and ":" in self.transcription_model:
                    size = self.transcription_model.split(":", 1)[1]
                # Cache the loaded model across calls (loading is expensive).
                if not hasattr(self, "_whisper_local"):
                    self._whisper_local = WhisperLocal(model_size=size)
                logger.info(f"Transcription: using local whisper-{size}")
                # Pass speaker names as initial prompt hint for Whisper
                whisper_kwargs = {"language": language}
                if speaker_hints:
                    whisper_kwargs["initial_prompt"] = (
                        "Speakers: " + ", ".join(speaker_hints) + "."
                    )
                result = self._whisper_local.transcribe(audio_path, **whisper_kwargs)
                duration = result.get("duration") or 0
                self.usage.record(
                    provider="local",
                    model=f"whisper-{size}",
                    audio_minutes=duration / 60 if duration else 0,
                )
                return result
        except ImportError:
            # whisper not installed; fall through to API transcription.
            pass
    # Fall back to API-based transcription
    prov_name, model = self._resolve_model(
        self.transcription_model, "audio", _TRANSCRIPTION_PREFERENCES
    )
    logger.info(f"Transcription: using {prov_name}/{model}")
    provider = self._get_provider(prov_name)
    # Build transcription kwargs, passing speaker hints where supported
    transcribe_kwargs: dict = {"language": language, "model": model}
    if speaker_hints:
        if prov_name == "openai":
            # OpenAI Whisper supports a 'prompt' parameter for hints
            transcribe_kwargs["prompt"] = "Speakers: " + ", ".join(speaker_hints) + "."
        else:
            transcribe_kwargs["speaker_hints"] = speaker_hints
    result = provider.transcribe_audio(audio_path, **transcribe_kwargs)
    duration = result.get("duration") or 0
    self.usage.record(
        provider=prov_name,
        model=model,
        audio_minutes=duration / 60 if duration else 0,
    )
    return result
video_processor.providers.discovery
Auto-discover available models across providers.
clear_discovery_cache()
Clear the cached model list.
Source code in video_processor/providers/discovery.py
def clear_discovery_cache() -> None:
    """Clear the cached model list so the next discovery re-queries providers."""
    global _cached_models
    _cached_models = None
discover_available_models(api_keys=None, force_refresh=False)
Discover available models from all configured providers.
For each provider with a valid API key, calls list_models() and returns a unified list. Results are cached for the session.
Source code in video_processor/providers/discovery.py
def discover_available_models(
    api_keys: Optional[dict[str, str]] = None,
    force_refresh: bool = False,
) -> list[ModelInfo]:
    """
    Discover available models from all configured providers.

    For each provider with a valid API key, calls list_models() and returns
    a unified list. Results are cached for the session; pass
    force_refresh=True to bypass the cache.
    """
    global _cached_models
    if _cached_models is not None and not force_refresh:
        return _cached_models
    _ensure_providers_registered()
    keys = api_keys or {
        "openai": os.getenv("OPENAI_API_KEY", ""),
        "anthropic": os.getenv("ANTHROPIC_API_KEY", ""),
        "gemini": os.getenv("GEMINI_API_KEY", ""),
    }
    all_models: list[ModelInfo] = []
    for name, info in ProviderRegistry.all_registered().items():
        env_var = info.get("env_var", "")
        provider_class = info["class"]
        if name == "ollama":
            # Ollama: no API key, check server availability
            try:
                if provider_class.is_available():
                    provider = provider_class()
                    models = provider.list_models()
                    logger.info(f"Discovered {len(models)} Ollama models")
                    all_models.extend(models)
            except Exception as e:
                logger.info(f"Ollama discovery skipped: {e}")
            continue
        # For key-based providers, check the api_keys dict first, then env var
        key = keys.get(name, "")
        if not key and env_var:
            key = os.getenv(env_var, "")
        # Special case: Gemini also supports service account credentials
        gemini_creds = ""
        if name == "gemini":
            gemini_creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS", "")
        if not key and not gemini_creds:
            # No credentials at all — skip this provider entirely.
            continue
        try:
            # Handle provider-specific constructor args
            if name == "gemini":
                provider = provider_class(
                    api_key=key or None,
                    credentials_path=gemini_creds or None,
                )
            else:
                provider = provider_class(api_key=key)
            models = provider.list_models()
            logger.info(f"Discovered {len(models)} {name.capitalize()} models")
            all_models.extend(models)
        except Exception as e:
            # Discovery is best-effort; a failing provider is logged and skipped.
            logger.info(f"{name.capitalize()} discovery skipped: {e}")
    # Sort by provider then id
    all_models.sort(key=lambda m: (m.provider, m.id))
    _cached_models = all_models
    logger.info(f"Total discovered models: {len(all_models)}")
    return all_models
Overview
The provider system abstracts LLM API calls behind a unified interface. It supports multiple providers (OpenAI, Anthropic, Gemini, Ollama, and OpenAI-compatible services), automatic model discovery, capability-based routing, and usage tracking.
Key components:
- BaseProvider -- abstract interface that all providers implement
- ProviderRegistry -- global registry mapping provider names to classes
- ProviderManager -- high-level router that picks the best provider for each task
- discover_available_models() -- scans all configured providers for available models
BaseProvider (ABC)
from video_processor.providers.base import BaseProvider
Abstract base class that all provider implementations must subclass. Defines the four core capabilities: chat, vision, audio transcription, and model listing.
Class attribute:
| Attribute | Type | Description |
|---|---|---|
| provider_name | str | Identifier for this provider (e.g., "openai", "anthropic") |
chat()
def chat(
self,
messages: list[dict],
max_tokens: int = 4096,
temperature: float = 0.7,
model: Optional[str] = None,
) -> str
Send a chat completion request.
Parameters:
| Parameter | Type | Default | Description |
|---|---|---|---|
| messages | list[dict] | required | OpenAI-format message list (role, content) |
| max_tokens | int | 4096 | Maximum tokens in the response |
| temperature | float | 0.7 | Sampling temperature |
| model | Optional[str] | None | Override model ID |
Returns: str -- the assistant's text response.
analyze_image()
def analyze_image(
self,
image_bytes: bytes,
prompt: str,
max_tokens: int = 4096,
model: Optional[str] = None,
) -> str
Analyze an image with a text prompt using a vision-capable model.
Parameters:
| Parameter | Type | Default | Description |
|---|---|---|---|
| image_bytes | bytes | required | Raw image data (JPEG, PNG, etc.) |
| prompt | str | required | Analysis instructions |
| max_tokens | int | 4096 | Maximum tokens in the response |
| model | Optional[str] | None | Override model ID |
Returns: str -- the assistant's analysis text.
transcribe_audio()
def transcribe_audio(
self,
audio_path: str | Path,
language: Optional[str] = None,
model: Optional[str] = None,
) -> dict
Transcribe an audio file.
Parameters:
| Parameter | Type | Default | Description |
|---|---|---|---|
| audio_path | str \| Path | required | Path to the audio file to transcribe |
| language | Optional[str] | None | Language hint (ISO 639-1 code) |
| model | Optional[str] | None | Override model ID |
Returns: dict -- transcription result with keys text, segments, duration, etc.
list_models()
def list_models(self) -> list[ModelInfo]
Discover available models from this provider's API.
Returns: list[ModelInfo] -- available models with capability metadata.
ModelInfo
from video_processor.providers.base import ModelInfo
Pydantic model describing an available model from a provider.
| Field | Type | Default | Description |
|---|---|---|---|
| id | str | required | Model identifier (e.g., "gpt-4o", "claude-haiku-4-5-20251001") |
| provider | str | required | Provider name (e.g., "openai", "anthropic", "gemini") |
| display_name | str | "" | Human-readable display name |
| capabilities | List[str] | [] | Model capabilities: "chat", "vision", "audio", "embedding" |
{
"id": "gpt-4o",
"provider": "openai",
"display_name": "GPT-4o",
"capabilities": ["chat", "vision"]
}
ProviderRegistry
from video_processor.providers.base import ProviderRegistry
Class-level registry for provider classes. Providers register themselves with metadata on import. This registry is used internally by ProviderManager but can also be used directly for introspection.
register()
@classmethod
def register(
cls,
name: str,
provider_class: type,
env_var: str = "",
model_prefixes: Optional[List[str]] = None,
default_models: Optional[Dict[str, str]] = None,
) -> None
Register a provider class with its metadata. Called by each provider module at import time.
Parameters:
| Parameter | Type | Default | Description |
|---|---|---|---|
| name | str | required | Provider name (e.g., "openai") |
| provider_class | type | required | The provider class |
| env_var | str | "" | Environment variable for API key |
| model_prefixes | Optional[List[str]] | None | Model ID prefixes for auto-detection (e.g., ["gpt-", "o1-"]) |
| default_models | Optional[Dict[str, str]] | None | Default models per capability (e.g., {"chat": "gpt-4o", "vision": "gpt-4o"}) |
get()
@classmethod
def get(cls, name: str) -> type
Return the provider class for a given name. Raises ValueError if the provider is not registered.
get_by_model()
@classmethod
def get_by_model(cls, model_id: str) -> Optional[str]
Return the provider name for a model ID based on prefix matching. Returns None if no match is found.
get_default_models()
@classmethod
def get_default_models(cls, name: str) -> Dict[str, str]
Return the default models dict for a provider, mapping capability names to model IDs.
available()
@classmethod
def available(cls) -> List[str]
Return names of providers whose required environment variable is set (or providers with no env var requirement, like Ollama).
all_registered()
@classmethod
def all_registered(cls) -> Dict[str, Dict]
Return all registered providers and their metadata dictionaries.
OpenAICompatibleProvider
from video_processor.providers.base import OpenAICompatibleProvider
Base class for providers using OpenAI-compatible APIs (Together, Fireworks, Cerebras, xAI, Azure). Implements chat(), analyze_image(), and list_models() using the OpenAI client library. transcribe_audio() raises NotImplementedError by default.
Constructor:
def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None)
| Parameter | Type | Default | Description |
|---|---|---|---|
| api_key | Optional[str] | None | API key (falls back to self.env_var environment variable) |
| base_url | Optional[str] | None | API base URL (falls back to self.base_url class attribute) |
Subclass attributes to override:
| Attribute | Description |
|---|---|
| provider_name | Provider identifier string |
| base_url | Default API base URL |
| env_var | Environment variable name for the API key |
Usage tracking: After each chat() or analyze_image() call, the provider stores token counts in self._last_usage as {"input_tokens": int, "output_tokens": int}. This is consumed by ProviderManager._track().
ProviderManager
from video_processor.providers.manager import ProviderManager
High-level router that selects the best available provider and model for each API call. Supports explicit model selection, forced provider, or automatic selection based on discovered capabilities.
Constructor
def __init__(
self,
vision_model: Optional[str] = None,
chat_model: Optional[str] = None,
transcription_model: Optional[str] = None,
provider: Optional[str] = None,
auto: bool = True,
)
| Parameter | Type | Default | Description |
|---|---|---|---|
| vision_model | Optional[str] | None | Override model for vision tasks (e.g., "gpt-4o") |
| chat_model | Optional[str] | None | Override model for chat/LLM tasks |
| transcription_model | Optional[str] | None | Override model for transcription |
| provider | Optional[str] | None | Force all tasks to a single provider |
| auto | bool | True | If True and no model specified, pick the best available |
Attributes:
| Attribute | Type | Description |
|---|---|---|
| usage | UsageTracker | Tracks token counts and API costs across all calls |
Auto-selection preferences
When auto=True and no explicit model is set, providers are tried in this order:
Vision: Gemini (gemini-2.5-flash) > OpenAI (gpt-4o-mini) > Anthropic (claude-haiku-4-5-20251001)
Chat: Anthropic (claude-haiku-4-5-20251001) > OpenAI (gpt-4o-mini) > Gemini (gemini-2.5-flash)
Transcription: OpenAI (whisper-1) > Gemini (gemini-2.5-flash)
If no API-key-based provider is available, Ollama is tried as a fallback.
chat()
def chat(
self,
messages: list[dict],
max_tokens: int = 4096,
temperature: float = 0.7,
) -> str
Send a chat completion to the best available provider. Automatically resolves which provider and model to use.
Parameters:
| Parameter | Type | Default | Description |
|---|---|---|---|
| messages | list[dict] | required | OpenAI-format messages |
| max_tokens | int | 4096 | Maximum response tokens |
| temperature | float | 0.7 | Sampling temperature |
Returns: str -- assistant response text.
Raises: RuntimeError if no provider is available for the chat capability.
analyze_image()
def analyze_image(
self,
image_bytes: bytes,
prompt: str,
max_tokens: int = 4096,
) -> str
Analyze an image using the best available vision provider.
Returns: str -- analysis text.
Raises: RuntimeError if no provider is available for the vision capability.
transcribe_audio()
def transcribe_audio(
self,
audio_path: str | Path,
language: Optional[str] = None,
speaker_hints: Optional[list[str]] = None,
) -> dict
Transcribe audio. Prefers local Whisper (no file size limits, no API costs) when available, falling back to API-based transcription.
Parameters:
| Parameter | Type | Default | Description |
|---|---|---|---|
| audio_path | str \| Path | required | Path to the audio file to transcribe |
| language | Optional[str] | None | Language hint |
| speaker_hints | Optional[list[str]] | None | Speaker names for better recognition |
Returns: dict -- transcription result with text, segments, duration.
Local Whisper: If transcription_model is unset or starts with "whisper-local", the manager tries local Whisper first. Use "whisper-local:large" to specify a model size.
get_models_used()
def get_models_used(self) -> dict[str, str]
Return a dict mapping capability to "provider/model" string for tracking purposes.
pm = ProviderManager()
print(pm.get_models_used())
# {"vision": "gemini/gemini-2.5-flash", "chat": "anthropic/claude-haiku-4-5-20251001", ...}
Usage examples
from video_processor.providers.manager import ProviderManager
# Auto-select best providers
pm = ProviderManager()
# Force everything through one provider
pm = ProviderManager(provider="openai")
# Explicit model selection
pm = ProviderManager(
vision_model="gpt-4o",
chat_model="claude-haiku-4-5-20251001",
transcription_model="whisper-local:large",
)
# Chat completion
response = pm.chat([
{"role": "user", "content": "Summarize this meeting transcript..."}
])
# Image analysis
with open("diagram.png", "rb") as f:
analysis = pm.analyze_image(f.read(), "Describe this architecture diagram")
# Transcription with speaker hints
result = pm.transcribe_audio(
"meeting.mp3",
language="en",
speaker_hints=["Alice", "Bob", "Charlie"],
)
# Check usage
print(pm.usage.summary())
discover_available_models()
from video_processor.providers.discovery import discover_available_models
def discover_available_models(
api_keys: Optional[dict[str, str]] = None,
force_refresh: bool = False,
) -> list[ModelInfo]
Discover available models from all configured providers. For each provider with a valid API key, calls list_models() and returns a unified, sorted list.
Parameters:
| Parameter | Type | Default | Description |
|---|---|---|---|
| api_keys | Optional[dict[str, str]] | None | Override API keys (defaults to environment variables) |
| force_refresh | bool | False | Force re-discovery, ignoring the session cache |
Returns: list[ModelInfo] -- all discovered models, sorted by provider then model ID.
Caching: Results are cached for the session. Use force_refresh=True or clear_discovery_cache() to refresh.
from video_processor.providers.discovery import (
discover_available_models,
clear_discovery_cache,
)
# Discover models using environment variables
models = discover_available_models()
for m in models:
print(f"{m.provider}/{m.id} - {m.capabilities}")
# Force refresh
models = discover_available_models(force_refresh=True)
# Override API keys
models = discover_available_models(api_keys={
"openai": "sk-...",
"anthropic": "sk-ant-...",
})
# Clear cache
clear_discovery_cache()
clear_discovery_cache()
def clear_discovery_cache() -> None
Clear the cached model list, forcing the next discover_available_models() call to re-query providers.
Built-in Providers
The following providers are registered automatically when the provider system initializes:
| Provider | Environment Variable | Capabilities | Default Chat Model |
|---|---|---|---|
| openai | OPENAI_API_KEY | chat, vision, audio | gpt-4o-mini |
| anthropic | ANTHROPIC_API_KEY | chat, vision | claude-haiku-4-5-20251001 |
| gemini | GEMINI_API_KEY | chat, vision, audio | gemini-2.5-flash |
| ollama | (none -- checks server) | chat, vision | (depends on installed models) |
| together | TOGETHER_API_KEY | chat | (varies) |
| fireworks | FIREWORKS_API_KEY | chat | (varies) |
| cerebras | CEREBRAS_API_KEY | chat | (varies) |
| xai | XAI_API_KEY | chat | (varies) |
| azure | AZURE_OPENAI_API_KEY | chat, vision | (varies) |