PlanOpticon

Phase 2: Add provider abstraction layer and auto model discovery

Stories 2.1-2.5: Abstract BaseProvider with OpenAI, Anthropic, and Gemini implementations. Auto-discovery of available models via provider APIs. ProviderManager routes API calls to the best available provider, with support for explicit model overrides. CLI flags for --provider, --vision-model, and --chat-model. New list-models command. Added the google-genai dependency.

leo 2026-02-14 22:14 trunk
Commit a94205bde32e2a5f35e998ed6321eb69e66781dd2ceb9b9e7f237dff354b9e32
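
At a glance, the routing surface the stories add (a sketch; the model ids are illustrative, the class and parameter names come from the diffs below):

    from video_processor.providers.manager import ProviderManager

    mgr = ProviderManager()                        # auto: pick best available provider
    mgr = ProviderManager(provider="gemini")       # pin every capability to one provider
    mgr = ProviderManager(chat_model="gpt-4o")     # or override a single capability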
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,42 @@
+# Core dependencies
+numpy>=1.24.0
+opencv-python>=4.8.0
+scipy>=1.10.0
+pillow>=10.0.0
+matplotlib>=3.7.0
+pydantic>=2.0.0
+tqdm>=4.66.0
+colorlog>=6.7.0
+click>=8.1.0
+
+# Audio processing
+librosa>=0.10.0
+soundfile>=0.12.0
+
+# API integrations
+openai>=1.0.0
+anthropic>=0.5.0
+google-genai>=1.0.0
+google-cloud-speech>=2.21.0
+google-cloud-vision>=3.4.0
+
+# Markdown & visualization
+markdown>=3.4.0
+pymdown-extensions>=10.0.0
+
+# Utilities
+python-dotenv>=1.0.0
+requests>=2.31.0
+aiohttp>=3.8.5
+tenacity>=8.2.0
+
+# Optional GPU acceleration
+# torch>=2.0.0; platform_system != "Windows" or platform_machine != "arm64"
+# torchvision>=0.15.0; platform_system != "Windows" or platform_machine != "arm64"
+
+# Testing
+pytest>=7.3.0
+pytest-cov>=4.1.0
+black>=23.3.0
+isort>=5.12.0
+mypy>=1.3.0
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -0,0 +1,100 @@
+"""Tests for the provider abstraction layer."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from video_processor.providers.base import BaseProvider, ModelInfo
+from video_processor.providers.manager import ProviderManager
+
+# -------------------------
+
+
+class TestModelInfo:
+    def test_basic(self):
+        m = ModelInfo(id="gpt-4o", provider="openai", capabilities=["chat", "vision"])
+        assert m.id == "gpt-4o"
+        assert "vision" in m.capabilities
+
+    def test_round_trip(self):
+        m = ModelInfo(
+            id="claude-sonnet-4-5-20250929",
+            provider="anthropic",
+            display_name="Claude Sonnet",
+            capabilities=["chat", "vision"],
+        )
+        assert m == ModelInfo.model_validate_json(m.model_dump_json())
+
+# -------------------------
+
+
+class TestProviderManager:
+    def _make_mock_provider(self, name):
+        """Create a mock provider."""
+        provider = MagicMock(spec=BaseProvider)
+        provider.provider_name = name
+        provider.chat.return_value = "test response"
+        provider.analyze_image.return_value = "image analysis"
+        provider.transcribe_audio.return_value = {
+            "text": "hello world",
+            "segments": [],
+            "provider": name,
+            "model": "test",
+        }
+        return provider
+
+    def test_init_with_explicit_models(self):
+        mgr = ProviderManager(
+            vision_model="gpt-4o",
+            chat_model="claude-sonnet-4-5-20250929",
+            transcription_model="whisper-1",
+        )
+        assert mgr.vision_model == "gpt-4o"
+        assert mgr.chat_model == "claude-sonnet-4-5-20250929"
+        assert mgr.transcription_model == "whisper-1"
+
+    def test_init_forced_provider(self):
+        mgr = ProviderManager(provider="gemini")
+        assert mgr.vision_model == "gemini-2.5-flash"
+        assert mgr.chat_model == "gemini-2.5-flash"
+        assert mgr.transcription_model == "gemini-2.5-flash"
+
+    def test_init_forced_provider_ollama(self):
+        mgr = ProviderManager(provider="ollama")
+        assert mgr.vision_model == ""
+        assert mgr.chat_model == ""
+        assert mgr.transcription_model == ""
+
+    def test_init_no_overrides(self):
+        mgr = ProviderManager()
+        assert mgr.vision_model is None
+        assert mgr.chat_model is None
+        assert mgr.transcription_model is None
+        assert mgr.auto is True
+
+    def test_default_for_provider_gemini(self):
+        result = ProviderManager._default_for_provider("gemini", "vision")
+        assert result == "gemini-2.5-flash"
+
+# -------------------------
+
+
+class TestProviderImports:
+    """Test all provider modules import without errors."""
+
+    PROVIDER_MODULES = [
+        "video_processor.providers.openai_provider",
+        "video_processor.providers.anthropic_provider",
+        "video_processor.providers.gemini_provider",
+        "video_processor.providers.azure_provider",
+        "video_processor.providers.together_provider",
+        "video_processor.providers.fireworks_provider",
+        "video_processor.providers.cerebras_provider",
+        "video_processor.providers.ollama_provider",
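
The round-trip test leans on pydantic v2's serialization pair. The same pattern standalone (a sketch; the stand-in model mirrors only the fields the tests exercise):

    from pydantic import BaseModel

    class ModelInfo(BaseModel):          # stand-in matching the fields used above
        id: str
        provider: str
        display_name: str = ""
        capabilities: list[str] = []

    m = ModelInfo(id="gpt-4o", provider="openai", capabilities=["chat", "vision"])
    assert m == ModelInfo.model_validate_json(m.model_dump_json())   # lossless round trip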
--- a/video_processor/cli/commands.py
+++ b/video_processor/cli/commands.py
@@ -0,0 +1,211 @@
+"""Command-line interface for PlanOpticon."""
+
+import json
+import logging
+import time
+from pathlib import Path
+from typing import List, Optional
+
+import click
+import colorlog
+
+from video_processor.extractors.frame_extractor import extract_frames, save_frames
+from video_processor.extractors.audio_extractor import AudioExtractor
+from video_processor.api.transcription_api import TranscriptionAPI
+from video_processor.api.vision_api import VisionAPI
+from video_processor.analyzers.diagram_analyzer import DiagramAnalyzer
+from video_processor.knowledge.knowledge_graph import KnowledgeGraph
+from video_processor.generators.plan_generator import PlanGenerator
+from video_processor.cli.output_formatter import OutputFormatter
+
+
+# Configure logging
+def setup_logging(verbose: bool = False):
+    # Create a formatter that includes timestamp, level, and message
+    formatter = colorlog.ColoredFormatter(
+        "%(log_color)s%(asctime)s [%(levelname)s] %(message)s",
+        log_colors={
+            'DEBUG': 'cyan',
+            'INFO': 'green',
+            'WARNING': 'yellow',
+            'ERROR': 'red',
+            'CRITICAL': 'red,bg_white',
+        }
+    )
+
+    # Set up console handler
+    handler = logging.StreamHandler()
+    handler.setFormatter(formatter)
+
+    # Configure root logger
+    root = logging.getLogger()
+    root.setLevel(logging.DEBUG if verbose else logging.INFO)
+
+    # Remove existing handlers before adding ours
+    root.handlers.clear()
+    root.addHandler(handler)
+
+
+# Main CLI group
+@click.group()
+@click.option('--verbose', '-v', is_flag=True, help='Enable verbose output')
+@click.version_option('0.1.0', prog_name='PlanOpticon')
+@click.pass_context
+def cli(ctx, verbose):
+    # Initialize context
+    ctx.ensure_object(dict)
+    ctx.obj['verbose'] = verbose
+    # Set up logging
+    setup_logging(verbose)
+
+
+@cli.command()
+@click.option('--input', '-i', type=click.Path(exists=True), required=True, help='Input video file path')
+@click.option('--output', '-o', type=click.Path(), help='Output directory')
+@click.option('--depth', type=click.Choice(['basic', 'standard', 'comprehensive']), default='standard')
+@click.option('--focus', type=str, help='Comma-separated list of focus areas')
+@click.option('--use-gpu', is_flag=True, help='Enable GPU acceleration')
+@click.option('--sampling-rate', type=float, help='Frame sampling rate (1.0 = every frame)')
+@click.option('--change-threshold', type=float, default=15.0, help='Threshold for detecting visual changes')
+@click.option('--title', type=str, help='Title for the analyzed video')
+@click.option('--provider', '-p', type=click.Choice(['auto', 'openai', 'anthropic', 'gemini']), default='auto', help='API provider (auto selects best available)')
+@click.option('--vision-model', type=str, default=None, help='Explicit vision model override')
+@click.option('--chat-model', type=str, default=None, help='Explicit chat model override')
+@click.pass_context
+def analyze(ctx, input, output, depth, focus, use_gpu, sampling_rate, change_threshold, title, provider, vision_model, chat_model):
+    """Analyze video content."""
+    input_path = Path(input)
+    output_dir = Path(output) if output else input_path.parent / f"{input_path.stem}_analysis"
+    output_dir.mkdir(parents=True, exist_ok=True)
+    cache_dir = output_dir / ".cache"
+    focus_areas = [area.strip() for area in focus.split(',')] if focus else []
+
+    # Set video title if not provided
+    if not title:
+        title = f"Analysis of {input_path.stem}"
+
+    # Log analysis parameters
+    logging.info(f"Starting analysis of {input_path}")
+    logging.info(f"Processing depth: {depth}")
+    if focus_areas:
+        logging.info(f"Focus areas: {', '.join(focus_areas)}")
+
+    try:
+        # Create subdirectories
+        frames_dir = output_dir / "frames"
+        audio_dir = output_dir / "audio"
+        transcript_dir = output_dir / "transcript"
+        diagrams_dir = output_dir / "diagrams"
+        results_dir = output_dir / "results"
+        for directory in [frames_dir, audio_dir, transcript_dir, diagrams_dir, results_dir]:
+            directory.mkdir(exist_ok=True)
+
+        # Step 1: Extract frames
+        logging.info("Extracting video frames...")
+        frames = extract_frames(
+            input_path,
+            sampling_rate=sampling_rate,
+            change_threshold=change_threshold,
+            disable_gpu=not use_gpu
+        )
+        logging.info(f"Extracted {len(frames)} frames")
+
+        # Save frames
+        frame_paths = save_frames(frames, frames_dir, "frame")
+        logging.info(f"Saved frames to {frames_dir}")
+
+        # Step 2: Extract audio
+        logging.info("Extracting audio...")
+        audio_extractor = AudioExtractor()
+        audio_path = audio_extractor.extract_audio(
+            input_path,
+            output_path=audio_dir / f"{input_path.stem}.wav"
+        )
+        audio_props = audio_extractor.get_audio_properties(audio_path)
+        logging.info(f"Extracted audio: {audio_props['duration']:.2f}s, {audio_props['sample_rate']} Hz")
+
+        # Step 3: Transcribe audio
+        logging.info("Transcribing audio...")
+        transcription_api = TranscriptionAPI(
+            provider="openai",  # Could be configurable
+            cache_dir=cache_dir,
+            use_cache=True
+        )
+
+        # Process based on depth
+        detect_speakers = depth != "basic"
+        transcription = transcription_api.transcribe_audio(
+            audio_path,
+            detect_speakers=detect_speakers,
+            speakers=2 if detect_speakers else 1  # Default to 2 speakers if detecting
+        )
+
+        # Save transcript in different formats
+        transcript_path = transcription_api.save_transcript(
+            transcription,
+            transcript_dir / f"{input_path.stem}",
+            format="json"
+        )
+        transcription_api.save_transcript(
+            transcription,
+            transcript_dir / f"{input_path.stem}",
+            format="txt"
+        )
+        transcription_api.save_transcript(
+            transcription,
+            transcript_dir / f"{input_path.stem}",
+            format="srt"
+        )
+
+        logging.info(f"Saved transcripts to {transcript_dir}")
+
+        # Step 4: Diagram extraction and analysis
+        logging.info("Analyzing visual elements...")
+
+        # Initialize vision API
+        vision_api = VisionAPI(
+            provider="openai",  # Could be configurable
+            cache_dir=cache_dir,
+            use_cache=True
+        )
+
+        # Initialize diagram analyzer
+        diagram_analyzer = DiagramAnalyzer(
+            vision_api=vision_api,
+            cache_dir=cache_dir,
+            use_cache=True
+        )
+
+        # Detect and analyze diagrams
+        # We pass frame paths instead of numpy arrays for better caching
+        logging.info("Detecting diagrams in frames...")
+        diagrams = []
+
+        # Skip diagram detection for basic depth
+        if depth != "basic" and (not focus_areas or "diagrams" in focus_areas):
+            # For demo purposes, limit to a subset of frames to reduce API costs
+            max_frames_to_analyze = 10 if depth == "standard" else 20
+            frame_subset = frame_paths[:min(max_frames_to_analyze, len(frame_paths))]
+            detected_frames = diagram_analyzer.detect_diagrams(frame_subset)
+
+            if detected_frames:
+                logging.info(f"Detected {len(detected_frames)} potential diagrams")
+                # Process each detected diagram
+                for idx, confidence in detected_frames:
+                    if idx < len(frame_subset):
+                        frame_path = frame_subset[idx]
+                        logging.info(f"Analyzing diagram in frame {idx} (confidence: {confidence:.2f})")
+
+                        # Analyze the diagram
+                        analysis = diagram_analyzer.analyze_diagram(frame_path, extract_text=True)
+
+                        # Add frame metadata
+                        analysis['frame_index'] = idx
+                        analysis['confidence'] = confidence
+                        analysis['image_path'] = frame_path
+
+                        # Generate Mermaid if sufficient analysis available
+                        if depth == "comprehensive" and 'semantic_analysis' in analysis and analysis.get('text_content'):
+                            analysis['mermaid'] = diagram_analyzer.generate_mermaid(analysis)
+
+                        # Save diagram image to diagrams directory
+                        import shutil
+                        diagram_path = diagrams_dir / f"diagram_{idx}.jpg"
+                        shutil.copy2(frame_path, diagram_path)
+                        analysis['image_path'] = str(diagram_path)
+
+                        # Save analysis as JSON
+                        diagram_json_path = diagrams_dir / f"diagram_{idx}.json"
+                        with open(diagram_json_path, 'w') as f:
+                            json.dump(analysis, f, indent=2)
+
+                        diagrams.append(analysis)
+            else:
+                logging.info("No diagrams detected in analyzed frames")
+
+        # Step 5: Generate knowledge graph and markdown report
+        logging.info("Generating knowledge graph and report...")
+
+        # Initialize knowledge graph
+        knowledge_graph = KnowledgeGraph(
+            cache_dir=cache_dir,
+            use_cache=True
+        )
+
+        # Initialize plan generator
+        plan_generator = PlanGenerator(
+            knowledge_graph=knowledge_graph,
+            cache_dir=cache_dir,
+            use_cache=True
+        )
+
+        # Process transcript and diagrams
+        with open(transcript_path) as f:
+            transcript_data = json.load(f)
+
+        # Process into knowledge graph
+        knowledge_graph.process_transcript(transcript_data)
+        if diagrams:
+            knowledge_graph.process_diagrams(diagrams)
+
+        # Save knowledge graph
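
A smoke run of the new flags via click's test runner (a hypothetical invocation; `demo.mp4` stands in for a real file):

    from click.testing import CliRunner
    from video_processor.cli.commands import cli

    runner = CliRunner()
    result = runner.invoke(cli, [
        "analyze", "-i", "demo.mp4",
        "--provider", "gemini", "--vision-model", "gemini-2.5-flash",
    ])
    print(result.exit_code, result.output)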
--- a/video_processor/providers/__init__.py
+++ b/video_processor/providers/__init__.py
@@ -0,0 +1,6 @@
+"""Provider abstraction layer for LLM, vision, and transcription APIs."""
+
+from video_processor.providers.base import BaseProvider, ModelInfo
+from video_processor.providers.manager import ProviderManager
+
+__all__ = ["BaseProvider", "ModelInfo", "ProviderManager"]
--- a/video_processor/providers/anthropic_provider.py
+++ b/video_processor/providers/anthropic_provider.py
@@ -0,0 +1,51 @@
+"""Anthropic provider implementation."""
+
+import base64
+import logging
+import os
+from pathlib import Path
+from typing import Optional
+
+import anthropic
+from dotenv import load_dotenv
+
+from video_processor.providers.base import BaseProvider, ModelInfo
+
+load_dotenv()
+logger = logging.getLogger(__name__)
+
+
+class AnthropicProvider(BaseProvider):
+    """Anthropic Claude API provider."""
+
+    provider_name = "anthropic"
+
+    def __init__(self, api_key: Optional[str] = None):
+        self.api_key = api_key or os.getenv("ANTHROPIC_API_KEY")
+        if not self.api_key:
+            raise ValueError("ANTHROPIC_API_KEY not set")
+        self.client = anthropic.Anthropic(api_key=self.api_key)
+
+    def chat(
+        self,
+        messages: list[dict],
+        max_tokens: int = 4096,
+        temperature: float = 0.7,
+        model: Optional[str] = None,
+    ) -> str:
+        model = model or "claude-sonnet-4-5-20250929"
+        # Anthropic requires system messages as a top-level parameter
+        system_parts = []
+        chat_messages = []
+        for msg in messages:
+            if msg["role"] == "system":
+                system_parts.append(msg["content"])
+            else:
+                chat_messages.append(msg)
+        kwargs = {}
+        if system_parts:
+            kwargs["system"] = "\n".join(system_parts)
+        response = self.client.messages.create(
+            model=model,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            messages=chat_messages,
+            **kwargs,
+        )
+        return response.content[0].text
+
+    def analyze_image(
+        self,
+        image_bytes: bytes,
+        prompt: str,
+        max_tokens: int = 4096,
+        model: Optional[str] = None,
+    ) -> str:
+        model = model or "claude-sonnet-4-5-20250929"
+        b64 = base64.b64encode(image_bytes).decode()
+        response = self.client.messages.create(
+            model=model,
+            max_tokens=max_tokens,
+            messages=[
+                {
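
The comment about Anthropic's top-level system parameter is the key convention in chat(). The split, shown standalone on a hypothetical message list:

    messages = [
        {"role": "system", "content": "You are terse."},
        {"role": "user", "content": "Summarize the video."},
    ]
    # OpenAI-style system turns become Anthropic's top-level `system=` string;
    # everything else stays in `messages=`
    system_parts = [m["content"] for m in messages if m["role"] == "system"]
    chat_messages = [m for m in messages if m["role"] != "system"]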
--- a/video_processor/providers/base.py
+++ b/video_processor/providers/base.py
@@ -0,0 +1,2 @@
+"""Abstract base class and shared model metadata for provider implementations."""
+
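
The base.py hunk is truncated after its docstring. Reconstructed from the call sites elsewhere in this commit, the contract looks roughly like this (a sketch, not the committed code):

    from abc import ABC, abstractmethod
    from pathlib import Path
    from typing import Optional


    class BaseProvider(ABC):
        provider_name: str = ""

        @abstractmethod
        def chat(self, messages: list[dict], max_tokens: int = 4096,
                 temperature: float = 0.7, model: Optional[str] = None) -> str: ...

        @abstractmethod
        def analyze_image(self, image_bytes: bytes, prompt: str,
                          max_tokens: int = 4096, model: Optional[str] = None) -> str: ...

        @abstractmethod
        def transcribe_audio(self, audio_path: Path,
                             model: Optional[str] = None, **kwargs) -> dict: ...

        def list_models(self) -> list:   # ModelInfo entries, per the discovery module
            return []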
--- a/video_processor/providers/discovery.py
+++ b/video_processor/providers/discovery.py
@@ -0,0 +1,36 @@
+"""Auto-discover available models across providers."""
+
+import logging
+import os
+from typing import Optional
+
+from dotenv import load_dotenv
+
+from video_processor.providers.base import ModelInfo
+
+load_dotenv()
+logger = logging.getLogger(__name__)
+
+_cached_models: Optional[list[ModelInfo]] = None
+
+
+def discover_available_models(
+    api_keys: Optional[dict[str, str]] = None,
+    force_refresh: bool = False,
+) -> list[ModelInfo]:
+    """
+    Discover available models from all configured providers.
+
+    For each provider with a valid API key, calls list_models() and returns
+    a unified list. Results are cached for the session.
+    """
+    global _cached_models
+    if _cached_models is not None and not force_refresh:
+        return _cached_models
+
+    keys = api_keys or {
+        "openai": os.getenv("OPENAI_API_KEY", ""),
+        "anthropic": os.getenv("ANTHROPIC_API_KEY", ""),
+        "gemini": os.getenv("GEMINI_API_KEY", ""),
+    }
+
+    all_models: list[ModelInfo] = []
+
+    # Anthropic
+    try:
+        from video_processor.providers.anthropic_provider import AnthropicProvider
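
The docstring above states the contract; in use it looks like this (a sketch, with the capability strings as exercised in the tests):

    from video_processor.providers.discovery import discover_available_models

    models = discover_available_models()                     # hits provider APIs once
    vision_ready = [m.id for m in models if "vision" in m.capabilities]
    models = discover_available_models(force_refresh=True)   # bypass the session cache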
--- a/video_processor/providers/gemini_provider.py
+++ b/video_processor/providers/gemini_provider.py
@@ -0,0 +1,147 @@
+"""Google Gemini provider implementation using the google-genai SDK."""
+
+import logging
+import os
+from pathlib import Path
+from typing import Optional
+
+from dotenv import load_dotenv
+
+from video_processor.providers.base import BaseProvider, ModelInfo
+
+load_dotenv()
+logger = logging.getLogger(__name__)
+
+# Capabilities inferred from model id patterns
+_VISION_KEYWORDS = {"gemini-2", "gemini-3", "gemini-pro", "gemini-flash", "gemini-ultra"}
+_AUDIO_KEYWORDS = {"gemini-2", "gemini-3", "gemini-flash"}
+
+
+class GeminiProvider(BaseProvider):
+    """Google Gemini API provider via google-genai SDK."""
+
+    provider_name = "gemini"
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        credentials_path: Optional[str] = None,
+    ):
+        self.api_key = api_key or os.getenv("GEMINI_API_KEY")
+        self.credentials_path = credentials_path or os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+        if not self.api_key and not self.credentials_path:
+            raise ValueError("Neither GEMINI_API_KEY nor GOOGLE_APPLICATION_CREDENTIALS is set")
+
+        try:
+            from google import genai
+
+            self._genai = genai
+
+            if self.api_key:
+                self.client = genai.Client(api_key=self.api_key)
+            else:
+                # Service account → use Vertex AI mode
+                import json
+
+                with open(self.credentials_path) as f:
+                    sa_info = json.load(f)
+                project = sa_info.get("project_id", "")
+                location = os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1")
+
+                self.client = genai.Client(
+                    vertexai=True,
+                    project=project,
+                    location=location,
+                )
+        except ImportError:
+            raise ImportError(
+                "google-genai package not installed. Install with: pip install google-genai"
+            )
+
+    def chat(
+        self,
+        messages: list[dict],
+        max_tokens: int = 4096,
+        temperature: float = 0.7,
+        model: Optional[str] = None,
+    ) -> str:
+        from google.genai import types
+
+        model = model or "gemini-2.5-flash"
+        # Convert OpenAI-style messages to Gemini contents
+        contents = []
+        for msg in messages:
+            role = "user" if msg["role"] == "user" else "model"
+            contents.append(
+                types.Content(role=role, parts=[types.Part.from_text(text=msg["content"])])
+            )
+        response = self.client.models.generate_content(
+            model=model,
+            contents=contents,
+            config=types.GenerateContentConfig(
+                temperature=temperature,
+                max_output_tokens=max_tokens,
+            ),
+        )
+        return response.text or ""
+
+    def transcribe_audio(
+        self,
+        audio_path: Path,
+        model: Optional[str] = None,
+        **kwargs,
+    ) -> dict:
+        from google.genai import types
+
+        model = model or "gemini-2.5-flash"
+        audio_bytes = Path(audio_path).read_bytes()
+        response = self.client.models.generate_content(
+            model=model,
+            contents=[
+                types.Part.from_bytes(data=audio_bytes, mime_type="audio/wav"),
+                "Transcribe this audio. Return JSON with 'text' and 'segments'.",
+            ],
+            config=types.GenerateContentConfig(
+                max_output_tokens=8192,
+                response_mime_type="application/json",
+            ),
+        )
+
+        # Parse JSON response
+        import json
+
+        try:
+            data = json.loads(response.text)
+        except (json.JSONDecodeError, TypeError):
+            data = {"text": response.text or "", "segments": []}
+        data["provider"] = self.provider_name
+        data["model"] = model
+        return data
+
+    def list_models(self) -> list[ModelInfo]:
+        models = []
+        for m in self.client.models.list():
+            mid = getattr(m, "name", "") or ""
+            # Strip "models/" prefix if present
+            if mid.startswith("models/"):
+                mid = mid[7:]
+            capabilities = ["chat"]
+            if any(k in mid for k in _VISION_KEYWORDS):
+                capabilities.append("vision")
+            if any(k in mid for k in _AUDIO_KEYWORDS):
+                capabilities.append("audio")
+            models.append(
+                ModelInfo(
+                    id=mid,
+                    provider=self.provider_name,
+                    display_name=getattr(m, "display_name", mid) or mid,
+                    capabilities=capabilities,
+                )
+            )
+        return models
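
Minimal usage sketch, assuming GEMINI_API_KEY is set (Vertex AI mode kicks in when only GOOGLE_APPLICATION_CREDENTIALS is present):

    from video_processor.providers.gemini_provider import GeminiProvider

    provider = GeminiProvider()
    reply = provider.chat(
        [{"role": "user", "content": "One sentence: what is a provider abstraction?"}],
        model="gemini-2.5-flash",
    )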
--- a/video_processor/providers/manager.py
+++ b/video_processor/providers/manager.py
@@ -0,0 +1,58 @@
+    def transcribe_audio(self, audio_path, language=None, speaker_hints=None, **kwargs):
+        """Transcribe audio using local Whisper if available, otherwise API."""
+        # Prefer local Whisper — no file size limits, no API costs
+        if not self.transcription_model or self.transcription_model.startswith("whisper-local"):
+            try:
+                from video_processor.providers.whisper_local import WhisperLocal
+
+                if WhisperLocal.is_available():
+                    # Parse model size from "whisper-local:large" or default to "large"
+                    size = "large"
+                    if self.transcription_model and ":" in self.transcription_model:
+                        size = self.transcription_model.split(":", 1)[1]
+                    if not hasattr(self, "_whisper_local"):
+                        self._whisper_local = WhisperLocal(model_size=size)
+                    logger.info(f"Transcription: using local whisper-{size}")
+                    # Pass speaker hints through as an initial prompt
+                    whisper_kwargs = {"language": language}
+                    if speaker_hints:
+                        whisper_kwargs["initial_prompt"] = (
+                            "Speakers: " + ", ".join(speaker_hints) + "."
+                        )
+                    result = self._whisper_local.transcribe(audio_path, **whisper_kwargs)
+                    duration = result.get("duration") or 0
+                    self.usage.record(
+                        provider="local",
+                        model=f"whisper-{size}",
+                        audio_minutes=duration / 60 if duration else 0,
+                    )
+                    return result
+            except ImportError:
+                pass
+
+    @staticmethod
+    def _default_for_provider(provider: str, capability: str) -> str:
+        defaults = {
+            "gemini": {
+                "chat": "gemini-2.5-flash",
+                "vision": "gemini-2.5-flash",
+                "audio": "gemini-2.5-flash",
+            },
+            "ollama": {"chat": "", "vision": "", "audio": ""},
+        }
+        return defaults.get(provider, {}).get(capability, "")
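
The transcription path prefers local Whisper whenever the override is unset or spelled `whisper-local[:size]`. The override grammar in one place (values illustrative, mirroring the parser above):

    from video_processor.providers.manager import ProviderManager

    mgr = ProviderManager(transcription_model="whisper-local:large")  # local, sized
    mgr = ProviderManager(transcription_model="whisper-1")            # API model instead
    mgr = ProviderManager()                           # unset: local Whisper if installed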
--- a/video_processor/providers/manager.py
+++ b/video_processor/providers/manager.py
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/providers/manager.py
+++ b/video_processor/providers/manager.py
@@ -0,0 +1,58 @@
+    def transcribe_audio(self, audio_path, language=None, speaker_hints=None):
+        """Transcribe audio using local Whisper if available, otherwise API."""
+        # Prefer local Whisper — no file size limits, no API costs
+        if not self.transcription_model or self.transcription_model.startswith("whisper-local"):
+            try:
+                from video_processor.providers.whisper_local import WhisperLocal
+
+                if WhisperLocal.is_available():
+                    # Parse model size from "whisper-local:large" or default to "large"
+                    size = "large"
+                    if self.transcription_model and ":" in self.transcription_model:
+                        size = self.transcription_model.split(":", 1)[1]
+                    if not hasattr(self, "_whisper_local"):
+                        self._whisper_local = WhisperLocal(model_size=size)
+                    logger.info(f"Transcription: using local whisper-{size}")
+                    # Pass speaker hints to Whisper via the initial prompt
+                    whisper_kwargs = {"language": language}
+                    if speaker_hints:
+                        whisper_kwargs["initial_prompt"] = (
+                            "Speakers: " + ", ".join(speaker_hints) + "."
+                        )
+                    result = self._whisper_local.transcribe(audio_path, **whisper_kwargs)
+                    duration = result.get("duration") or 0
+                    self.usage.record(
+                        provider="local",
+                        model=f"whisper-{size}",
+                        audio_minutes=duration / 60 if duration else 0,
+                    )
+                    return result
+            except ImportError:
+                pass
+
+        # No usable local Whisper; fall through to the provider audio API.
+
+    def _default_models(self, provider: str) -> dict:
+        """Built-in default model ids per provider and capability."""
+        defaults = {
+            "gemini": {"chat": "gemini-2.5-flash", "vision": "gemini-2.5-flash", "audio": "gemini-2.5-flash"},
+            "ollama": {"chat": "", "vision": "", "audio": ""},
+        }
+        return defaults.get(provider, {})
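
For orientation, a sketch of how this local-first fallback is exercised, assuming ProviderManager accepts the transcription model as a constructor argument (the constructor is not visible in this hunk) and that transcribe_audio is the public entry point:

from video_processor.providers.manager import ProviderManager

# Hypothetical wiring; the constructor argument is an assumption based on
# the --provider/--vision-model/--chat-model CLI flags added in this phase.
manager = ProviderManager(transcription_model="whisper-local:medium")

# With a local Whisper backend importable and available, this transcribes
# offline and books usage as provider="local", model="whisper-medium".
result = manager.transcribe_audio(
    "talk.wav",
    language="en",
    speaker_hints=["Ada", "Grace"],  # forwarded as Whisper's initial_prompt
)
print(result["text"])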
--- a/video_processor/providers/openai_provider.py
+++ b/video_processor/providers/openai_provider.py
@@ -0,0 +1,125 @@
+"""OpenAI provider implementation."""
+
+import base64
+import logging
+import os
+import tempfile
+from pathlib import Path
+from typing import Optional
+
+from dotenv import load_dotenv
+from openai import OpenAI
+
+from video_processor.providers.base import BaseProvider, ModelInfo
+
+load_dotenv()
+logger = logging.getLogger(__name__)
+
+# Models known to have vision capabilities
+_VISION_MODELS = {"gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "o1"}
+_AUDIO_MODELS = {"whisper-1"}
+
+
+class OpenAIProvider(BaseProvider):
+    """OpenAI API provider."""
+
+    provider_name = "openai"
+
+    def __init__(self, api_key: Optional[str] = None):
+        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+        if not self.api_key:
+            raise ValueError("OPENAI_API_KEY not set")
+        self.client = OpenAI(api_key=self.api_key)
+
+    def chat(
+        self,
+        messages: list[dict],
+        max_tokens: int = 4096,
+        temperature: float = 0.7,
+        model: Optional[str] = None,
+    ) -> str:
+        """Send a chat completion request and record token usage."""
+        response = self.client.chat.completions.create(
+            model=model,
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=temperature,
+        )
+        self._last_usage = {
+            "input_tokens": getattr(response.usage, "prompt_tokens", 0) if response.usage else 0,
+            "output_tokens": getattr(response.usage, "completion_tokens", 0)
+            if response.usage
+            else 0,
+        }
+        return response.choices[0].message.content or ""
+
+    def analyze_image(
+        self,
+        image_bytes: bytes,
+        prompt: str,
+        max_tokens: int = 4096,
+        model: Optional[str] = None,
+    ) -> str:
+        """Describe a single image with a vision-capable chat model."""
+        b64 = base64.b64encode(image_bytes).decode()
+        response = self.client.chat.completions.create(
+            model=model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
+                        },
+                    ],
+                }
+            ],
+            max_tokens=max_tokens,
+        )
+        self._last_usage = {
+            "input_tokens": getattr(response.usage, "prompt_tokens", 0) if response.usage else 0,
+            "output_tokens": getattr(response.usage, "completion_tokens", 0)
+            if response.usage
+            else 0,
+        }
+        return response.choices[0].message.content or ""
+
+    def list_models(self) -> list[ModelInfo]:
+        """Discover models available to this API key."""
+        models = []
+        for m in self.client.models.list():
+            mid = m.id
+            if not mid.startswith(("gpt-", "o1", "o3", "o4")):
+                continue
+            capabilities = ["chat"]
+            if mid in _VISION_MODELS:
+                capabilities.append("vision")
+            models.append(ModelInfo(id=mid, provider=self.provider_name, capabilities=capabilities))
+        return models
+
+    def transcribe_chunked(
+        self, audio_data, sr, extractor, max_seconds, language=None, model=None
+    ) -> dict:
+        """Transcribe long audio with whisper-1 by splitting it into chunks."""
+        # Stay safely under the API limit: chunks are 80% of the maximum length.
+        chunk_ms = int(max_seconds * 0.8 * 1000)
+
+        segments_data = extractor.segment_audio(audio_data, sr, segment_length_ms=chunk_ms)
+        logger.info(f"Split into {len(segments_data)} chunks of ~{chunk_ms / 1000:.0f}s each")
+
+        all_text = []
+        all_segments = []
+        time_offset = 0.0
+        detected_language = language
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            for i, chunk in enumerate(segments_data):
+                chunk_path = Path(tmpdir) / f"chunk_{i:03d}.wav"
+                extractor.save_segment(chunk, chunk_path, sr)
+
+                logger.info(f"Transcribing chunk {i + 1}/{len(segments_data)}...")
+                result = self._transcribe_single(chunk_path, language, model)
+
+                all_text.append(result["text"])
+                for seg in result.get("segments", []):
+                    all_segments.append(
+                        {
+                            "start": seg["start"] + time_offset,
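
The hunk is cut off mid-append, before the per-chunk results are stitched back together. A minimal sketch of the bookkeeping it is building toward, assuming the offset simply advances by one chunk length per iteration (the update itself is not shown in the source):

def merge_chunk_segments(chunk_results, chunk_ms):
    """Shift each chunk's Whisper segments onto one whole-file timeline."""
    all_segments = []
    time_offset = 0.0
    for result in chunk_results:
        for seg in result.get("segments", []):
            all_segments.append(
                {
                    "start": seg["start"] + time_offset,
                    "end": seg.get("end", seg["start"]) + time_offset,
                    "text": seg.get("text", ""),
                }
            )
        # Assumption: chunks are back-to-back, so the next chunk's clock
        # starts chunk_ms / 1000 seconds later.
        time_offset += chunk_ms / 1000.0
    return all_segments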

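Putting the provider together, a usage sketch (the file path and prompts are illustrative; the class, methods, and _last_usage attribute come from the diff above):

from pathlib import Path

from video_processor.providers.openai_provider import OpenAIProvider

provider = OpenAIProvider()  # reads OPENAI_API_KEY from the environment / .env

# Model discovery, as used by the new list-models command.
for m in provider.list_models():
    print(m.id, m.capabilities)

# Chat with an explicit model override.
print(provider.chat(
    [{"role": "user", "content": "Summarize the plan in one sentence."}],
    model="gpt-4o-mini",
))

# Vision call on a single extracted frame.
frame_bytes = Path("frames/scene_001.jpg").read_bytes()
print(provider.analyze_image(frame_bytes, "Describe what is on screen.", model="gpt-4o"))

# Token accounting for the most recent call.
print(provider._last_usage)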