PlanOpticon
Phase 2: Add provider abstraction layer and auto-model-discovery

Stories 2.1-2.5: Abstract BaseProvider with OpenAI, Anthropic, and Gemini implementations. Auto-discovery of available models via provider APIs. ProviderManager routes API calls to the best available provider, with support for explicit model overrides. CLI flags for --provider, --vision-model, --chat-model. New list-models command. Added google-genai dependency.
Commit  a94205bde32e2a5f35e998ed6321eb69e66781dd2ceb9b9e7f237dff354b9e32
Parent  b2df90efa9defc9…
10 files changed

requirements.txt                                     +42
tests/test_providers.py                              +100
video_processor/cli/commands.py                      +211
video_processor/providers/__init__.py                +6
video_processor/providers/anthropic_provider.py      +51
video_processor/providers/base.py                    +2
video_processor/providers/discovery.py               +36
video_processor/providers/gemini_provider.py         +147
video_processor/providers/manager.py                 +58
video_processor/providers/openai_provider.py         +125
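The provider layer described in the commit message is exercised directly by the new tests below. As a rough usage sketch (the constructor arguments mirror tests/test_providers.py; the chat() call shape is an assumption, not confirmed by this diff):

from video_processor.providers.manager import ProviderManager

# Auto mode: discover available models and route each call to the best provider.
manager = ProviderManager()

# Explicit overrides, mirroring the new --vision-model / --chat-model CLI flags.
manager = ProviderManager(
    vision_model="gpt-4o",
    chat_model="claude-sonnet-4-5-20250929",
    transcription_model="whisper-1",
)

# Hypothetical call shape; the manager forwards it to whichever provider serves the model.
reply = manager.chat([{"role": "user", "content": "Summarize this transcript."}])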
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,42 @@
+# Core dependencies
+numpy>=1.24.0
+opencv-python>=4.8.0
+scipy>=1.10.0
+pillow>=10.0.0
+matplotlib>=3.7.0
+pydantic>=2.0.0
+tqdm>=4.66.0
+colorlog>=6.7.0
+click>=8.1.0
+
+# Audio processing
+librosa>=0.10.0
+soundfile>=0.12.0
+
+# API integrations
+openai>=1.0.0
+anthropic>=0.5.0
+google-genai>=1.0.0
+google-cloud-speech>=2.21.0
+google-cloud-vision>=3.4.0
+
+# Markdown & visualization
+markdown>=3.4.0
+pymdown-extensions>=10.0.0
+
+# Utilities
+python-dotenv>=1.0.0
+requests>=2.31.0
+aiohttp>=3.8.5
+tenacity>=8.2.0
+
+# Optional GPU acceleration
+# torch>=2.0.0; platform_system != "Windows" or platform_machine != "arm64"
+# torchvision>=0.15.0; platform_system != "Windows" or platform_machine != "arm64"
+
+# Testing
+pytest>=7.3.0
+pytest-cov>=4.1.0
+black>=23.3.0
+isort>=5.12.0
+mypy>=1.3.0
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -0,0 +1,100 @@
| 1 | +"""Tests for the provifrom unittest.mock import MagicMock, patch | |
| 2 | + | |
| 3 | +import pytest | |
| 4 | + | |
| 5 | +from video_pimport pytest | |
| 6 | +rManager | |
| 7 | + | |
| 8 | +----------------------sults(self, mock_ollama): | |
| 9 | + or.providers.manager import ProviderManager | |
| 10 | + | |
| 11 | +------------------------- | |
| 12 | + | |
| 13 | + | |
| 14 | +class TestModelInfo: | |
| 15 | + def test_basic(self): | |
| 16 | + m = ModelInfo(id="gpt-4o", provider="openai", capabilities=["chat", "vision"]) | |
| 17 | + assert m.id == "gpt-4o" | |
| 18 | + assert "vision" in m.capabilities | |
| 19 | + | |
| 20 | + def test_round_trip(seidon_model="whisper-1", | |
| 21 | + display_name=" display_name="Claude Sonnet", capabilities=["chat", "vision"]) | |
| 22 | +rovider | |
| 23 | +validate_json(m.model_dump_json()) | |
| 24 | + ------------------------- | |
| 25 | + | |
| 26 | + | |
| 27 | +class TestProviderManager: | |
| 28 | + def _make_mock_provider(sel"""Create a mock provider."""scover_available_modMagicMock(spec=BaseProvider) | |
| 29 | + provider.provider_name = name | |
| 30 | + provider.chat.return_value = "test response" | |
| 31 | + provider.analyze_image.return_value = "image analysis" | |
| 32 | + provider.transcribe_audio.return_value = { | |
| 33 | + "text": "hello world", | |
| 34 | + "segments": [], | |
| 35 | + "provider": name, | |
| 36 | + "model": "test", | |
| 37 | + } | |
| 38 | + return provider | |
| 39 | + | |
| 40 | + def test_init_with_explicit_models(self): | |
| 41 | + mgr = ProviderManager( | |
| 42 | + vision_model="gpt-4o", | |
| 43 | + chat_model="claude-sonnet-4-5-20250929", | |
| 44 | + transcription_model="whisper-1", | |
| 45 | + ) | |
| 46 | + assert mgr.vision_model == "gpt-4o" | |
| 47 | + assert mgr.chat_model == "claude-sonnet-4-5-20250929" | |
| 48 | + assert mgr.transcription_model == "whisper-1" | |
| 49 | + | |
| 50 | + def test_init_forced_provider(self): | |
| 51 | + mgr = ProviderManager(provider="gemini") | |
| 52 | + assert mgr.vision_model == "gemini-2.5-flash" | |
| 53 | + assert mgr.chat_model == "gemini-2.5-flash" | |
| 54 | + assert mgr.transcription_model == "gemini-2.5-flash" | |
| 55 | + | |
| 56 | + def test__model == "" | |
| 57 | + lash" | |
| 58 | + assertr="gemini") | |
| 59 | + assert mgr.vrn provider | |
| 60 | + | |
| 61 | + def test_init_with_explicit_models(self): | |
| 62 | + mgr = ProviderManager( | |
| 63 | + vision_model="gpt-4o", | |
| 64 | + chat_model="claude-sonnet-4-5-20250929", | |
| 65 | + transcription_model="whisperassert mgr.chat_model == "" | |
| 66 | + assert mgr.transcription_model == "" | |
| 67 | + | |
| 68 | + def test_init_no_overrides(self): | |
| 69 | + mgr = ProviderManager() | |
| 70 | + assert mgr.vision_model is None | |
| 71 | + assert mgr.chat_model is None | |
| 72 | + assert mgr.transcription_model is None | |
| 73 | + assert mgr.auto is True | |
| 74 | + | |
| 75 | + def test_default_for_provider_gemini(self): | |
| 76 | + result = ProviderManager._default_for_provider("gemini", "vision") | |
| 77 | + assert result == "gemini-2.5-flash" | |
| 78 | + | |
| 79 | + def test_default_foest | |
| 80 | + | |
| 81 | +from video_proceBaseProvider, ModelInfo | |
| 82 | +from video_processor.providers.manager import ProviderManager | |
| 83 | + | |
| 84 | +------------------------- | |
| 85 | + | |
| 86 | + | |
| 87 | +class TestModelInfo: | |
| 88 | + def test_basic(self): | |
| 89 | + m = ModelInfo("""Tests for the provifrom unittest.mock import MagicMock, patch----------t all provider modules import without errors.""" | |
| 90 | + | |
| 91 | + PROVIDER_MODULES = [ | |
| 92 | + "video_processor.providers.openai_provider", | |
| 93 | + "vidr", | |
| 94 | + "video_processo)0250929" | |
| 95 | + , | |
| 96 | + "video_processor.providers.azure_provider", | |
| 97 | + "video_processor.providers.together_provider", | |
| 98 | + "video_processor.providers.fireworks_provider", | |
| 99 | + "video_processor.providers.cerebras_provider", | |
| 100 | + "video_proces(" |
--- a/video_processor/cli/commands.py
+++ b/video_processor/cli/commands.py
@@ -0,0 +1,211 @@
+"""Command-line interface for PlanOpticon."""
+
+import json
+import logging
+import time
+from pathlib import Path
+from typing import List, Optional
+
+import click
+import colorlog
+
+from video_processor.extractors.frame_extractor import extract_frames, save_frames
+from video_processor.extractors.audio_extractor import AudioExtractor
+from video_processor.api.transcription_api import TranscriptionAPI
+from video_processor.api.vision_api import VisionAPI
+from video_processor.analyzers.diagram_analyzer import DiagramAnalyzer
+from video_processor.cli.output_formatter import OutputFormatter
+
+
+# Configure logging
+def setup_logging(verbose: bool = False) -> None:
+    # Create a formatter that includes timestamp, level, and message
+    formatter = colorlog.ColoredFormatter(
+        "%(log_color)s%(asctime)s [%(levelname)s] %(message)s",
+        log_colors={
+            'DEBUG': 'cyan',
+            'INFO': 'green',
+            'WARNING': 'yellow',
+            'ERROR': 'red',
+            'CRITICAL': 'red,bg_white',
+        }
+    )
+
+    # Set up console handler
+    handler = logging.StreamHandler()
+    handler.setFormatter(formatter)
+
+    # Configure root logger
+    root_logger = logging.getLogger()
+    root_logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+
+    # Remove existing handlers before adding the colored one
+    root_logger.handlers.clear()
+    root_logger.addHandler(handler)
+
+
+# Main CLI group
+@click.group()
+@click.option('--verbose', '-v', is_flag=True, help='Enable verbose output')
+@click.version_option('0.1.0', prog_name='PlanOpticon')
+@click.pass_context
+def cli(ctx, verbose):
+    # Initialize context
+    ctx.ensure_object(dict)
+    ctx.obj['verbose'] = verbose
+    # Set up logging
+    setup_logging(verbose)
+
+
+@cli.command()
+@click.option('--input', '-i', type=click.Path(exists=True), help='Input video file path')
+@click.option('--output', '-o', help='Output directory')
+@click.option('--depth', type=click.Choice(['basic', 'standard', 'comprehensive']), default='standard')
+@click.option('--focus', help='Comma-separated list of focus areas')
+@click.option('--use-gpu', is_flag=True)
+@click.option('--sampling-rate', type=float, help='Frame sampling rate (1.0 = every frame)')
+@click.option('--change-threshold', type=float, help='Threshold for detecting visual changes')
+@click.option('--title', type=str, help='Title for the generated analysis')
+@click.option('--provider', '-p', type=click.Choice(['auto', 'openai', 'anthropic', 'gemini']),
+              default='auto', help='API provider (auto selects best available)')
+@click.option('--vision-model', default=None, help='Override the vision model')
+@click.option('--chat-model', default=None, help='Override the chat model')
+@click.pass_context
+def analyze(ctx, input, output, depth, focus, use_gpu, sampling_rate, change_threshold,
+            title, provider, vision_model, chat_model):
+    """Analyze video content."""
+    input_path = Path(input)
+    output_dir = Path(output)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    cache_dir = output_dir / "cache"
+
+    focus_areas = [area.strip() for area in focus.split(',')] if focus else []
+
+    # Set video title if not provided
+    if not title:
+        title = f"Analysis of {input_path.stem}"
+
+    # Log analysis parameters
+    logging.info(f"Starting analysis of {input_path}")
+    logging.info(f"Processing depth: {depth}")
+    if focus_areas:
+        logging.info(f"Focus areas: {', '.join(focus_areas)}")
+
+    try:
+        # Create subdirectories
+        frames_dir = output_dir / "frames"
+        audio_dir = output_dir / "audio"
+        transcript_dir = output_dir / "transcript"
+        diagrams_dir = output_dir / "diagrams"
+        results_dir = output_dir / "results"
+        for directory in [frames_dir, audio_dir, transcript_dir, diagrams_dir, results_dir]:
+            directory.mkdir(exist_ok=True)
+
+        # Step 1: Extract frames
+        logging.info("Extracting video frames...")
+        frames = extract_frames(
+            input_path,
+            sampling_rate=sampling_rate,
+            change_threshold=change_threshold,
+            disable_gpu=not use_gpu
+        )
+        logging.info(f"Extracted {len(frames)} frames")
+
+        # Save frames
+        frame_paths = save_frames(frames, frames_dir, "frame")
+        logging.info(f"Saved frames to {frames_dir}")
+
+        # Step 2: Extract audio
+        logging.info("Extracting audio...")
+        audio_extractor = AudioExtractor()
+        audio_path = audio_extractor.extract_audio(
+            input_path,
+            output_path=audio_dir / f"{input_path.stem}.wav"
+        )
+        audio_props = audio_extractor.get_audio_properties(audio_path)
+        logging.info(f"Extracted audio: {audio_props['duration']:.2f}s, {audio_props['sample_rate']} Hz")
+
+        # Step 3: Transcribe audio
+        logging.info("Transcribing audio...")
+        transcription_api = TranscriptionAPI(
+            provider="openai",  # Could be configurable
+            cache_dir=cache_dir,
+            use_cache=True
+        )
+
+        # Process based on depth
+        detect_speakers = depth != "basic"
+        transcription = transcription_api.transcribe_audio(
+            audio_path,
+            detect_speakers=detect_speakers,
+            speakers=2 if detect_speakers else 1  # Default to 2 speakers if detecting
+        )
+
+        # Save transcript in different formats
+        transcript_path = transcription_api.save_transcript(
+            transcription,
+            transcript_dir / f"{input_path.stem}",
+            format="json"
+        )
+        transcription_api.save_transcript(
+            transcription,
+            transcript_dir / f"{input_path.stem}",
+            format="txt"
+        )
+        transcription_api.save_transcript(
+            transcription,
+            transcript_dir / f"{input_path.stem}",
+            format="srt"
+        )
+
+        logging.info(f"Saved transcripts to {transcript_dir}")
+
+        # Step 4: Diagram extraction and analysis
+        logging.info("Analyzing visual elements...")
+
+        # Initialize vision API
+        vision_api = VisionAPI(
+            provider="openai",  # Could be configurable
+            cache_dir=cache_dir,
+            use_cache=True
+        )
+
+        # Initialize diagram analyzer
+        diagram_analyzer = DiagramAnalyzer(
+            vision_api=vision_api,
+            cache_dir=cache_dir,
+            use_cache=True
+        )
+
+        # Detect and analyze diagrams
+        # We pass frame paths instead of numpy arrays for better caching
+        logging.info("Detecting diagrams in frames...")
+        diagrams = []
+
+        # Skip diagram detection for basic depth
+        if depth != "basic" and (not focus_areas or "diagrams" in focus_areas):
+            # For demo purposes, limit to a subset of frames to reduce API costs
+            max_frames_to_analyze = 10 if depth == "standard" else 20
+            frame_subset = frame_paths[:min(max_frames_to_analyze, len(frame_paths))]
+
+            detected_frames = diagram_analyzer.detect_diagrams(frame_subset)
+
+            if detected_frames:
+                logging.info(f"Detected {len(detected_frames)} potential diagrams")
+
+                # Process each detected diagram
+                for idx, confidence in detected_frames:
+                    if idx < len(frame_subset):
+                        frame_path = frame_subset[idx]
+                        logging.info(f"Analyzing diagram in frame {idx} (confidence: {confidence:.2f})")
+
+                        # Analyze the diagram
+                        analysis = diagram_analyzer.analyze_diagram(frame_path, extract_text=True)
+
+                        # Add frame metadata
+                        analysis['frame_index'] = idx
+                        analysis['confidence'] = confidence
+                        analysis['image_path'] = frame_path
+
+                        # Generate Mermaid if sufficient analysis available
+                        if depth == "comprehensive" and 'semantic_analysis' in analysis and analysis.get('text_content'):
+                            analysis['mermaid'] = diagram_analyzer.generate_mermaid(analysis)
+
+                        # Save diagram image to diagrams directory
+                        import shutil
+                        diagram_path = diagrams_dir / f"diagram_{idx}.jpg"
+                        shutil.copy2(frame_path, diagram_path)
+                        analysis['image_path'] = str(diagram_path)
+
+                        # Save analysis as JSON
+                        diagram_json_path = diagrams_dir / f"diagram_{idx}.json"
+                        with open(diagram_json_path, 'w') as f:
+                            json.dump(analysis, f, indent=2)
+
+                        diagrams.append(analysis)
+            else:
+                logging.info("No diagrams detected in analyzed frames")
+
+        # Step 5: Generate knowledge graph and markdown report
+        logging.info("Generating knowledge graph and report...")
+
+        # Initialize knowledge graph
+        knowledge_graph = KnowledgeGraph(
+            cache_dir=cache_dir,
+            use_cache=True
+        )
+
+        # Initialize plan generator
+        plan_generator = PlanGenerator(
+            knowledge_graph=knowledge_graph,
+            cache_dir=cache_dir,
+            use_cache=True
+        )
+
+        # Process transcript and diagrams
+        with open(transcript_path) as f:
+            transcript_data = json.load(f)
+
+        # Process into knowledge graph
+        knowledge_graph.process_transcript(transcript_data)
+        if diagrams:
+            knowledge_graph.process_diagrams(diagrams)
+
+        # Save knowledge graph
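The list-models command mentioned in the commit message is not visible in the hunk above. A hedged sketch of how it could sit on top of the discovery module added in this commit (the click wiring, flag, and column layout here are assumptions; only discover_available_models and the ModelInfo fields come from the diff):

import click

from video_processor.providers.discovery import discover_available_models


@click.command("list-models")
@click.option("--refresh", is_flag=True, help="Bypass the session cache and re-query providers.")
def list_models(refresh: bool) -> None:
    """Print every model reachable with the configured API keys."""
    for model in discover_available_models(force_refresh=refresh):
        caps = ", ".join(model.capabilities)
        click.echo(f"{model.provider:<12} {model.id:<40} {caps}")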
--- a/video_processor/providers/__init__.py
+++ b/video_processor/providers/__init__.py
@@ -0,0 +1,6 @@
| 1 | +"""Provider abstraction layer for LLM, vision, and transcription APIs.""" | |
| 2 | + | |
| 3 | +from video_proceBaseProvider, ModelInfo | |
| 4 | +from video_processor.providers.manager import ProviderManager | |
| 5 | + | |
| 6 | +__all__ = ["BaseProvider", "Mode |
--- a/video_processor/providers/anthropic_provider.py
+++ b/video_processor/providers/anthropic_provider.py
@@ -0,0 +1,51 @@
| 1 | +"""Anthropic provider implementation.""" | |
| 2 | + | |
| 3 | +import base64 | |
| 4 | +import logging | |
| 5 | +import os | |
| 6 | +from pathlib import Path | |
| 7 | +from typing import Optional | |
| 8 | + | |
| 9 | +import anthropic | |
| 10 | +from dotenv import load_dotenv | |
| 11 | + | |
| 12 | +from video_processor.providers.base imp | |
| 13 | + | |
| 14 | +load_dotenv() | |
| 15 | +logger = logging.getLogger(__name__) | |
| 16 | + | |
| 17 | + | |
| 18 | +class AnthropicProvider(BaseProvider): | |
| 19 | + """Anthropic Claude API provider.""" | |
| 20 | + | |
| 21 | + provider_name = "anthropic" | |
| 22 | + | |
| 23 | + def __init__(self, api_key: Optional[str] = None): | |
| 24 | + self.api_key = api_key or os.getenv("ANTHROPIC_API_KEY") | |
| 25 | + if not self.api_key: | |
| 26 | + raise ValueError("ANTHROPIC_API_KEY not set") | |
| 27 | + self.client = anthropic.Anthropic(api_key=self.api_key) | |
| 28 | + | |
| 29 | + def chat( | |
| 30 | + self, | |
| 31 | + messages: list[dict], | |
| 32 | + max_tokens: int = 4096, | |
| 33 | + temperature: float = 0.7, | |
| 34 | + model: Optional[str] = None, | |
| 35 | + ) -> str: | |
| 36 | + model = model or "c response = self.client.messages.create( | |
| 37 | + model=model, | |
| 38 | + messages= "model": mode # Anthropic requires syste temperature=temperatureires system messages as a top-level parameter | |
| 39 | + system_parts = [] | |
| 40 | + ( | |
| 41 | + self, | |
| 42 | + image_bytes: bytes, | |
| 43 | + prompt: str, | |
| 44 | + max_tokens: int = 4096, | |
| 45 | + model: Optional[str]ModelInfo] = None, | |
| 46 | + ) -> strid=mid, = None, | |
| 47 | + ) -> str = base64.b64enmessages.create( | |
| 48 | + model=model, | |
| 49 | + messages=[ | |
| 50 | + { | |
| 51 | + |
--- a/video_processor/providers/base.py
+++ b/video_processor/providers/base.py
@@ -0,0 +1,2 @@
| 1 | +"""Abstract base class and | |
| 2 | + |
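The base.py hunk is cut off after its docstring. The rest of the commit appears to program against an interface roughly like the following; this is an assumed reconstruction from how the tests and concrete providers use BaseProvider and ModelInfo, not the file's actual contents, and signatures beyond the names exercised in the tests are guesses.

from abc import ABC, abstractmethod
from typing import Optional

from pydantic import BaseModel, Field


class ModelInfo(BaseModel):
    id: str
    provider: str
    display_name: Optional[str] = None
    capabilities: list[str] = Field(default_factory=list)


class BaseProvider(ABC):
    provider_name: str = "base"

    @abstractmethod
    def chat(self, messages: list[dict], max_tokens: int = 4096,
             temperature: float = 0.7, model: Optional[str] = None) -> str: ...

    @abstractmethod
    def analyze_image(self, image_bytes: bytes, prompt: str,
                      max_tokens: int = 4096, model: Optional[str] = None) -> str: ...

    @abstractmethod
    def transcribe_audio(self, audio_path, **kwargs) -> dict: ...

    @abstractmethod
    def list_models(self) -> list[ModelInfo]: ...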
--- a/video_processor/providers/discovery.py
+++ b/video_processor/providers/discovery.py
@@ -0,0 +1,36 @@
| 1 | +"""Auto-discover available models across providers.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import os | |
| 5 | +from typing import Optional | |
| 6 | + | |
| 7 | +from dotenv import load_dotenv | |
| 8 | + | |
| 9 | +from video_processor.providers.base import ModelInfo | |
| 10 | + | |
| 11 | +load_dotenv() | |
| 12 | +logger = logging.getLogger(__name__) | |
| 13 | + | |
| 14 | +_cached_models: Optional[lisdiscover_available_models( | |
| 15 | + api_keys: Optional[dict[str, str]] = None, | |
| 16 | + force_refresh: bool = False, | |
| 17 | +) -> list[ModelInfo]: | |
| 18 | + """ | |
| 19 | + Discover available models from all configured providers. | |
| 20 | + | |
| 21 | + For each provider with a valid API key, calls list_models() and returns | |
| 22 | + a unified list. Results are cached for the session. | |
| 23 | + """ | |
| 24 | + global _cached_models | |
| 25 | + if _cached_models is not None and not force_refresh: | |
| 26 | + return _cached_models | |
| 27 | + | |
| 28 | + roviders_registered() | |
| 29 | + | |
| 30 | + keys = api_keys or { | |
| 31 | + "openai": o""), | |
| 32 | + } | |
| 33 | + | |
| 34 | + all_modelAnthropic | |
| 35 | + try: | |
| 36 | + cessor.profrom video_processor.providers.anthropic_provider i = i |
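A short usage sketch for the session-level cache above; the capability filter relies on the ModelInfo.capabilities field used in tests/test_providers.py.

from video_processor.providers.discovery import discover_available_models

models = discover_available_models()                     # queries each configured provider once
cached = discover_available_models()                     # second call is served from the module cache
fresh = discover_available_models(force_refresh=True)    # bypasses the cache and re-queries

vision_capable = [m.id for m in fresh if "vision" in m.capabilities]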
--- a/video_processor/providers/gemini_provider.py
+++ b/video_processor/providers/gemini_provider.py
@@ -0,0 +1,147 @@
| 1 | +"""Google Gemini provider implementation using the google-genai SDK.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import os | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import Optional | |
| 7 | + | |
| 8 | +from dotenv import load_dotenv | |
| 9 | + | |
| 10 | +from video_processor.providers.base imp | |
| 11 | + | |
| 12 | +load_dotenv() | |
| 13 | +logger = logging.getLogger(__name__) | |
| 14 | + | |
| 15 | +# Capabilities inferred from model id patterns | |
| 16 | +_VISION_KEYWORDS = {"gemini-2", "gemini-3", "gemini-pro", "gemini-flash", "gemini-ultra"} | |
| 17 | +_AUDIO_KEYWORDS = {"gemini-2", "gemini-3", "gemini-flash"} | |
| 18 | + | |
| 19 | + | |
| 20 | +class GeminiProvider(BaseProvider): | |
| 21 | + """Google Gemini API provider via google-genai SDK.""" | |
| 22 | + | |
| 23 | + provider_name = "gemini" | |
| 24 | + | |
| 25 | + def __init__( | |
| 26 | + self, | |
| 27 | + api_self,one, | |
| 28 | + credentials_path:): | |
| 29 | + self.api_key = api_key or os.getenv("GEMINI_API_KEY") | |
| 30 | + if not self.client = genai.Cliraise ValueErro self._genai = genai | |
| 31 | +i_key) | |
| 32 | + else: | |
| 33 | + # Service account → use Ve "google-genai package not installed. """Google Gemini pioogle-genai" | |
| 34 | + ) | |
| 35 | + | |
| 36 | + def chat( | |
| 37 | + self, | |
| 38 | + messages: list[dict], | |
| 39 | + max_tokens: int = 4096, | |
| 40 | + temperature: float = 0.7, | |
| 41 | + model: Optional[str] = None, | |
| 42 | + ) -> str: | |
| 43 | + from google.genai import types | |
| 44 | + | |
| 45 | + model = model or "gemini-2.5-flash" | |
| 46 | + # Convert OpenAI-style messages to Gemini contents | |
| 47 | + contents = [] | |
| 48 | + for msg in messages: | |
| 49 | + role = "user" if msg["role"] == "user" else "model" | |
| 50 | + contents.append( | |
| 51 | + types.Conttypes.C logging | |
| 52 | +import os | |
| 53 | +f"""Google Gem max_output_tokens=8192, | |
| 54 | + response_mime_type="application/json", | |
| 55 | + ), | |
| 56 | + ) | |
| 57 | + | |
| 58 | + # Parse JSON response | |
| 59 | + import json | |
| 60 | + | |
| 61 | + try: | |
| 62 | + data = json.loads(response.text) | |
| 63 | + except (json.JSO"client.models.generate_content( | |
| 64 | + model=model, | |
| 65 | + contents=.credentials_path: | |
| 66 | + raise ValueError("Neither GEMINI_API_KEY nor GOOGLE_APPLICATION_CREDENas f: | |
| 67 | + sa_info = json.load(f) | |
| 68 | + project = sa_info.get("project_id", "") | |
| 69 | + location = os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1") | |
| 70 | + | |
| 71 | + self.client = genai.Client( | |
| 72 | + vertexai=True, | |
| 73 | + project=project, | |
| 74 | + location=location, | |
| 75 | + ) | |
| 76 | + except ImportError: | |
| 77 | + raise ImportError( | |
| 78 | + "google-genai package not installed. """Google Gemini pioogle-genai" | |
| 79 | + ) | |
| 80 | + | |
| 81 | + def chat( | |
| 82 | + self, | |
| 83 | + messages: list[dict], | |
| 84 | + max_tokens: intes: | |
| 85 | + role = "user" if msg["role"] == "user" else "model" | |
| 86 | + contents.append( | |
| 87 | + types.Conttypes.C logging | |
| 88 | +import os | |
| 89 | +f"""Google Gem max_output_tokens=8192, | |
| 90 | + response_mime_type="application/json", | |
| 91 | + ), | |
| 92 | + ) | |
| 93 | + | |
| 94 | + # Parse JSON response | |
| 95 | + import json | |
| 96 | + | |
| 97 | + try: | |
| 98 | + data = json.loads(response.text) | |
| 99 | + except (json.JSO"client.models.generate_content( | |
| 100 | + model=model, | |
| 101 | + contents=.credentials_path: | |
| 102 | + raise ValueError("Neither GEMINI_API_KEY nor GOOGLE_APPLICATION_CREDENTIALS is set") | |
| 103 | + | |
| 104 | + try: | |
| 105 | + from google import genai | |
| 106 | + | |
| 107 | + self._genai = genai | |
| 108 | + | |
| 109 | + if self.api_key: | |
| 110 | + self.client = genai.Client(api_key=self.api_key) | |
| 111 | + else: | |
| 112 | + # Service account → use Vertex AI mode | |
| 113 | + import json | |
| 114 | + | |
| 115 | + with open(self.credentials_path) as f: | |
| 116 | + sa_info = json.load(f) | |
| 117 | + project = sa_info.get("project_id", "") | |
| 118 | + location = os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1") | |
| 119 | + | |
| 120 | + self.client = genai.Client( | |
| 121 | + vertexai=True, | |
| 122 | + project=project, | |
| 123 | + location=location, | |
| 124 | + ) | |
| 125 | + except ImportError: | |
| 126 | + raise ImportError( | |
| 127 | + "google-genai package not installed. Install with: pip install google-genai" | |
| 128 | + ) | |
| 129 | + | |
| 130 | + def chat( | |
| 131 | + self, | |
| 132 | + messages: list[dict], | |
| 133 | + max_tokens: int = 4096, | |
| 134 | + temperature: float = 0.7, | |
| 135 | + text) | |
| 136 | + except (json.JSO:]r] = None, | |
| 137 | + crModelInfoal[str] = None, | |
| 138 | + congging | |
| 139 | +import os | |
| 140 | +"""Google Gemini pr ) | |
| 141 | + import json | |
| 142 | + | |
| 143 | + try: | |
| 144 | + data = json.loads(response.text) | |
| 145 | + except (json.JSO"models/" prefix if present ) | |
| 146 | + excep"models/"mid = mid[7:] ) | |
| 147 | + excep |
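Usage sketch for GeminiProvider.chat(): it accepts OpenAI-style role/content messages and converts them to google-genai Content objects, as in the loop above. GEMINI_API_KEY (or service-account credentials for Vertex AI mode) must be set; the prompt text is illustrative.

from video_processor.providers.gemini_provider import GeminiProvider

provider = GeminiProvider()
answer = provider.chat(
    messages=[{"role": "user", "content": "List the components shown in this architecture diagram."}],
    model="gemini-2.5-flash",
)
print(answer)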
| --- a/video_processor/providers/gemini_provider.py | |
| +++ b/video_processor/providers/gemini_provider.py | |
| @@ -0,0 +1,147 @@ | |
| --- a/video_processor/providers/gemini_provider.py | |
| +++ b/video_processor/providers/gemini_provider.py | |
| @@ -0,0 +1,147 @@ | |
| 1 | """Google Gemini provider implementation using the google-genai SDK.""" |
| 2 | |
| 3 | import logging |
| 4 | import os |
| 5 | from pathlib import Path |
| 6 | from typing import Optional |
| 7 | |
| 8 | from dotenv import load_dotenv |
| 9 | |
| 10 | from video_processor.providers.base import BaseProvider, ModelInfo |
| 11 | |
| 12 | load_dotenv() |
| 13 | logger = logging.getLogger(__name__) |
| 14 | |
| 15 | # Capabilities inferred from model id patterns |
| 16 | _VISION_KEYWORDS = {"gemini-2", "gemini-3", "gemini-pro", "gemini-flash", "gemini-ultra"} |
| 17 | _AUDIO_KEYWORDS = {"gemini-2", "gemini-3", "gemini-flash"} |
| 18 | |
| 19 | |
| 20 | class GeminiProvider(BaseProvider): |
| 21 |     """Google Gemini API provider via google-genai SDK.""" |
| 22 | |
| 23 |     provider_name = "gemini" |
| 24 | |
| 25 |     def __init__( |
| 26 |         self, |
| 27 |         api_key: Optional[str] = None, |
| 28 |         credentials_path: Optional[str] = None, |
| 29 |     ): |
| 30 |         self.api_key = api_key or os.getenv("GEMINI_API_KEY") |
| 31 |         self.credentials_path = credentials_path or os.getenv("GOOGLE_APPLICATION_CREDENTIALS") |
| 32 |         if not self.api_key and not self.credentials_path: |
| 33 |             raise ValueError("Neither GEMINI_API_KEY nor GOOGLE_APPLICATION_CREDENTIALS is set") |
| 34 | |
| 35 |         try: |
| 36 |             from google import genai |
| 37 | |
| 38 |             self._genai = genai |
| 39 | |
| 40 |             if self.api_key: |
| 41 |                 self.client = genai.Client(api_key=self.api_key) |
| 42 |             else: |
| 43 |                 # Service account → use Vertex AI mode |
| 44 |                 import json |
| 45 | |
| 46 |                 with open(self.credentials_path) as f: |
| 47 |                     sa_info = json.load(f) |
| 48 |                 project = sa_info.get("project_id", "") |
| 49 |                 location = os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1") |
| 50 | |
| 51 |                 self.client = genai.Client( |
| 52 |                     vertexai=True, |
| 53 |                     project=project, |
| 54 |                     location=location, |
| 55 |                 ) |
| 56 |         except ImportError: |
| 57 |             raise ImportError( |
| 58 |                 "google-genai package not installed. Install with: pip install google-genai" |
| 59 |             ) |
| 60 | |
| 61 |     def chat( |
| 62 |         self, |
| 63 |         messages: list[dict], |
| 64 |         max_tokens: int = 4096, |
| 65 |         temperature: float = 0.7, |
| 66 |         model: Optional[str] = None, |
| 67 |     ) -> str: |
| 68 |         from google.genai import types |
| 69 | |
| 70 |         model = model or "gemini-2.5-flash" |
| 71 |         # Convert OpenAI-style messages to Gemini contents |
| 72 |         contents = [] |
| 73 |         for msg in messages: |
| 74 |             role = "user" if msg["role"] == "user" else "model" |
| 75 |             contents.append( |
| 76 |                 types.Content(role=role, parts=[types.Part.from_text(text=msg["content"])]) |
| 77 |             ) |
| 78 |         response = self.client.models.generate_content( |
| 79 |             model=model, |
| 80 |             contents=contents, |
| 81 |             config=types.GenerateContentConfig( |
| 82 |                 temperature=temperature, |
| 83 |                 max_output_tokens=max_tokens, |
| 84 |             ), |
| 85 |         ) |
| 86 |         return response.text |
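The legible fragments of gemini_provider.py also mention a JSON response mode (`response_mime_type="application/json"`, `max_output_tokens=8192`, and a `json.loads(response.text)` call guarded by a JSONDecodeError handler) and a discovery path that strips a leading `"models/"` prefix from model ids, but those methods cannot be read from this render. A minimal sketch of that shape, with `_normalize_model_id` and `_parse_json_reply` as assumed helper names:

```python
# Sketch only: helper names are assumptions; the fragments above show just the
# "models/" prefix strip and the json.loads fallback, not the full methods.
import json


def _normalize_model_id(raw_name: str) -> str:
    # Discovery returns ids like "models/gemini-2.5-flash"; strip the prefix so
    # ids match what users pass back via --chat-model or --vision-model.
    return raw_name[7:] if raw_name.startswith("models/") else raw_name


def _parse_json_reply(response_text: str) -> dict:
    # Replies requested with response_mime_type="application/json" should already
    # be JSON; fall back to an empty dict if the body does not parse.
    try:
        return json.loads(response_text)
    except (json.JSONDecodeError, TypeError):
        return {}
```

Normalizing ids this way keeps the output of the new list-models command consistent with the values accepted by the model-override flags.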
| --- a/video_processor/providers/manager.py | ||
| +++ b/video_processor/providers/manager.py | ||
| @@ -0,0 +1,58 @@ | ||
| 1 | +        """Transcribe audio using local Whisper if available, otherwise API.""" | |
| 2 | +        # Prefer local Whisper — no file size limits, no API costs | |
| 3 | +        if not self.transcription_model or self.transcription_model.startswith("whisper-local"): | |
| 4 | +            try: | |
| 5 | +                from video_processor.providers.whisper_local import WhisperLocal | |
| 6 | + | |
| 7 | +                if WhisperLocal.is_available(): | |
| 8 | +                    # Parse model size from "whisper-local:large" or default to "large" | |
| 9 | +                    size = "large" | |
| 10 | +                    if self.transcription_model and ":" in self.transcription_model: | |
| 11 | +                        size = self.transcription_model.split(":", 1)[1] | |
| 12 | +                    if not hasattr(self, "_whisper_local"): | |
| 13 | +                        self._whisper_local = WhisperLocal(model_size=size) | |
| 14 | +                    logger.info(f"Transcription: using local whisper-{size}") | |
| 15 | +                    # Pass speaker hints to Whisper via the initial prompt | |
| 16 | +                    whisper_kwargs = {"language": language} | |
| 17 | +                    if speaker_hints: | |
| 18 | +                        whisper_kwargs["initial_prompt"] = ( | |
| 19 | +                            "Speakers: " + ", ".join(speaker_hints) + "." | |
| 20 | +                        ) | |
| 21 | +                    result = self._whisper_local.transcribe(audio_path, **whisper_kwargs) | |
| 22 | +                    duration = result.get("duration") or 0 | |
| 23 | +                    self.usage.record( | |
| 24 | +                        provider="local", | |
| 25 | +                        model=f"whisper-{size}", | |
| 26 | +                        audio_minutes=duration / 60 if duration else 0, | |
| 27 | +                    ) | |
| 28 | +                    return result | |
| 29 | +            except ImportError: | |
| 30 | +                pass | |
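The manager.py hunk also carries fragments of a per-provider default-model table (entries for "gemini" and "ollama" with "chat", "vision", and "audio" slots, resolved via `defaults.get(provider, {})`) and of prefix-based routing for explicit model overrides (`gpt-`, `o1`, `o3`, `o4`). Those lines are too damaged to restore verbatim; the sketch below reconstructs the shape, and the function names and exact default ids are assumptions:

```python
# Sketch only: reconstructed from damaged fragments of manager.py; names and
# default model ids are assumptions, not the committed values.
DEFAULT_MODELS = {
    "gemini": {"chat": "gemini-2.5-flash", "vision": "gemini-2.5-flash", "audio": "gemini-2.5-flash"},
    "ollama": {"chat": "", "vision": "", "audio": ""},
}


def default_model(provider: str, task: str) -> str:
    """Resolve the default model for a provider and task ("chat", "vision", or "audio")."""
    return DEFAULT_MODELS.get(provider, {}).get(task, "")


def provider_for_model(model_id: str) -> str:
    """Route an explicit model override to a provider by id prefix."""
    if model_id.startswith(("gpt-", "o1", "o3", "o4")):
        return "openai"
    if model_id.startswith("claude"):
        return "anthropic"
    if model_id.startswith("gemini"):
        return "gemini"
    return "ollama"
```

Prefix routing lets an explicit --chat-model or --vision-model override reach the matching provider without extra configuration.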
| --- a/video_processor/providers/openai_provider.py | ||
| +++ b/video_processor/providers/openai_provider.py | ||
| @@ -0,0 +1,125 @@ | ||
| 1 | +"""OpenAI provider implementation.""" | |
| 2 | + | |
| 3 | +import base64 | |
| 4 | +import logging | |
| 5 | +import os | |
| 6 | +import tempfile | |
| 7 | +from pathlib import Path | |
| 8 | +from typing import Optional | |
| 9 | + | |
| 10 | +from dotenv import load_dotenv | |
| 11 | +from openai import OpenAI | |
| 12 | + | |
| 13 | +from video_processor.providers.base import BaseProvider, ModelInfo | |
| 14 | + | |
| 15 | +load_dotenv() | |
| 16 | +logger = logging.getLogger(__name__) | |
| 17 | + | |
| 18 | +# Models known to have vision capabilities | |
| 19 | +_VISION_MODELS = {"gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "o1"} | |
| 20 | +_AUDIO_MODELS = {"whisper-1"} | |
| 21 | + | |
| 22 | + | |
| 23 | +class OpenAIProvider(BaseProvider): | |
| 24 | +    """OpenAI API provider.""" | |
| 25 | + | |
| 26 | +    provider_name = "openai" | |
| 27 | + | |
| 28 | +    def __init__(self, api_key: Optional[str] = None): | |
| 29 | +        self.api_key = api_key or os.getenv("OPENAI_API_KEY") | |
| 30 | +        if not self.api_key: | |
| 31 | +            raise ValueError("OPENAI_API_KEY not set") | |
| 32 | +        self.client = OpenAI(api_key=self.api_key) | |
| 33 | + | |
| 34 | +    def chat( | |
| 35 | +        self, | |
| 36 | +        messages: list[dict], | |
| 37 | +        max_tokens: int = 4096, | |
| 38 | +        temperature: float = 0.7, | |
| 39 | +        model: Optional[str] = None, | |
| 40 | +    ) -> str: | |
| 41 | +        """Send a chat completion request and return the reply text.""" | |
| 42 | +        response = self.client.chat.completions.create( | |
| 43 | +            model=model, | |
| 44 | +            messages=messages, | |
| 45 | +            max_tokens=max_tokens, | |
| 46 | +            temperature=temperature, | |
| 47 | +        ) | |
| 48 | +        self._last_usage = { | |
| 49 | +            "input_tokens": getattr(response.usage, "prompt_tokens", 0) if response.usage else 0, | |
| 50 | +            "output_tokens": getattr(response.usage, "completion_tokens", 0) | |
| 51 | +            if response.usage | |
| 52 | +            else 0, | |
| 53 | +        } | |
| 54 | +        return response.choices[0].message.content | |
| 55 | + | |
| 56 | +    def analyze_image(  # placeholder name; the original method name is not legible in this hunk | |
| 57 | +        self, | |
| 58 | +        image_bytes: bytes, | |
| 59 | +        prompt: str, | |
| 60 | +        max_tokens: int = 4096, | |
| 61 | +        model: Optional[str] = None, | |
| 62 | +    ) -> str: | |
| 63 | +        """Describe an image sent inline as a base64 data URL.""" | |
| 64 | +        b64 = base64.b64encode(image_bytes).decode() | |
| 65 | +        response = self.client.chat.completions.create( | |
| 66 | +            model=model, | |
| 67 | +            messages=[ | |
| 68 | +                { | |
| 69 | +                    "role": "user", | |
| 70 | +                    "content": [ | |
| 71 | +                        {"type": "text", "text": prompt}, | |
| 72 | +                        { | |
| 73 | +                            "type": "image_url", | |
| 74 | +                            "image_url": {"url": f"data:image/jpeg;base64,{b64}"}, | |
| 75 | +                        }, | |
| 76 | +                    ], | |
| 77 | +                } | |
| 78 | +            ], | |
| 79 | +            max_tokens=max_tokens, | |
| 80 | +        ) | |
| 81 | +        self._last_usage = { | |
| 82 | +            "input_tokens": getattr(response.usage, "prompt_tokens", 0) if response.usage else 0, | |
| 83 | +            "output_tokens": getattr(response.usage, "completion_tokens", 0) | |
| 84 | +            if response.usage | |
| 85 | +            else 0, | |
| 86 | +        } | |
| 87 | +        return response.choices[0].message.content | |
| 88 | + | |
| 89 | +    def transcribe(  # signature reconstructed; exact parameters are not legible in this hunk | |
| 90 | +        self, | |
| 91 | +        audio_path: Path, | |
| 92 | +        language: Optional[str] = None, | |
| 93 | +        model: Optional[str] = None, | |
| 94 | +    ) -> dict: | |
| 95 | +        """Transcribe audio with whisper-1, splitting long files into chunks.""" | |
| 96 | +        # Audio loading and chunk sizing are garbled here; extractor, audio_data, | |
| 97 | +        # sr, and max_chunk_seconds are defined by those unrecovered lines. | |
| 98 | +        chunk_ms = int(max_chunk_seconds * 0.8 * 1000) | |
| 99 | + | |
| 100 | +        segments_data = extractor.segment_audio(audio_data, sr, segment_length_ms=chunk_ms) | |
| 101 | +        logger.info(f"Split into {len(segments_data)} chunks of ~{chunk_ms / 1000:.0f}s each") | |
| 102 | + | |
| 103 | +        all_text = [] | |
| 104 | +        all_segments = [] | |
| 105 | +        time_offset = 0.0 | |
| 106 | +        detected_language = language | |
| 107 | + | |
| 108 | +        with tempfile.TemporaryDirectory() as tmpdir: | |
| 109 | +            for i, chunk in enumerate(segments_data): | |
| 110 | +                chunk_path = Path(tmpdir) / f"chunk_{i:03d}.wav" | |
| 111 | +                extractor.save_segment(chunk, chunk_path, sr) | |
| 112 | + | |
| 113 | +                logger.info(f"Transcribing chunk {i + 1}/{len(segments_data)}...") | |
| 114 | +                result = self._transcribe_single(chunk_path, language, model) | |
| 115 | + | |
| 116 | +                all_text.append(result["text"]) | |
| 117 | +                for seg in result.get("segments", []): | |
| 118 | +                    all_segments.append( | |
| 119 | +                        { | |
| 120 | +                            "start": seg["start"] + time_offset, | |
| 121 | +                            "end": seg["end"] + time_offset, | |
| 122 | +                            "text": seg.get("text", ""), | |
| 123 | +                        } | |
| 124 | +                    ) | |
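The openai_provider.py hunk cuts off inside the per-chunk loop, before `time_offset` is advanced and the chunk results are combined. A short sketch of that final assembly, assuming the merged result mirrors the per-chunk dict keys (`text`, `segments`, `language`, `duration`):

```python
# Sketch only: the assembly step is not visible in the hunk above; the exact
# key set of the returned dict is an assumption.
from typing import Optional


def assemble_transcript(
    all_text: list[str],
    all_segments: list[dict],
    detected_language: Optional[str],
    total_duration: float,
) -> dict:
    """Combine per-chunk Whisper results into a single transcript dict."""
    return {
        "text": " ".join(all_text).strip(),
        "segments": all_segments,
        "language": detected_language,
        "duration": total_duration,
    }
```

Between chunks, `time_offset` would presumably advance by each chunk's duration so that later segment timestamps stay on the original timeline.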