PlanOpticon

Add face detection, periodic capture, KG batching, checkpoint/resume, and production hardening

- Face detection filter removes webcam/people frames using OpenCV Haar cascade with size-based filtering (min 90px) to preserve screen-share sidebar thumbnails
- Periodic frame capture (every 30s) catches slow-evolving content like document scrolling that falls below the change detection threshold
- Knowledge graph batches 10 segments per API call, reducing calls from ~2200 to ~110 and combining entity/relationship extraction into a single prompt
- Checkpoint/resume on all 7 pipeline steps — skips completed work on re-run
- Even sampling for diagram analysis across the full video instead of the first N frames
- Local Whisper transcription on MPS, usage tracking, progress bars
- Audio/frame extractors, text extractor, output formatter, API cache
- Platform-specific requirements (Apple Silicon, GPU), setup script
- Updated docs: CLI reference, pipeline architecture, single-video guide
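The call-count reduction above follows from two factors: entity and relationship extraction are combined into one prompt (halving calls per segment), and segments are grouped 10 per call. A minimal sketch of the grouping — `extract_batch` is a hypothetical stand-in for the repository's actual API call, not its real name:

```python
from typing import Callable, List


def batch_segments(segments: List[str], batch_size: int = 10) -> List[List[str]]:
    """Group transcript segments into fixed-size batches, one API call each."""
    return [segments[i:i + batch_size] for i in range(0, len(segments), batch_size)]


def extract_knowledge(segments: List[str],
                      extract_batch: Callable[[List[str]], dict],
                      batch_size: int = 10) -> List[dict]:
    """Issue one combined entity/relationship prompt per batch, instead of
    two separate calls (entities, then relationships) per segment."""
    return [extract_batch(batch) for batch in batch_segments(segments, batch_size)]


# ~1,100 segments x 2 calls each = ~2,200 calls unbatched;
# batched at 10 segments per combined call = 110 calls.
calls = len(batch_segments(["seg"] * 1100, 10))
print(calls)  # 110
```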

leo 2026-02-15 04:17 trunk
Commit 287a3bb0cdafafd28a255be1625194a8171f53636a764c42f73b0cf7e2454f0b
41 files changed +4 +7 -1 +149 +3 +1 +13 -11 -1 +4 +10 +120 +121 +58 +2 +11 -3 +17 -7 +216 +1 +196 +197 +270 +68 -59 +1 +167 -74 +8 +9 -4 +44 -7 +54 -4 +83 +76 +6 -4 +98 -17 +163 +3 -1 +153
--- a/.env.example
+++ b/.env.example
@@ -0,0 +1,4 @@
+# API Keys
+OPENAI_API_KEY=your_openai_api_key_here
+ANTHROPIC_API_KEY=your_anthropic_api_key_here
+GOOGLE_APPLICATION_CREDENTIALS=path_to_your_google_credentials.json
+7 -1
--- .gitignore
+++ .gitignore
@@ -1,11 +1,17 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
+*.env*
 
-panopticon-435716-271902ced24f.json
+# Google Cloud service account keys
+panopticon-*.json
+planopticon-*.json
+*-*-*ced24f.json
+*service_account*.json
+*credentials*.json
 
 # C extensions
 *.so
 
 # Test data
 
ADDED README_new.md
+149
--- a/README_new.md
+++ b/README_new.md
@@ -0,0 +1,149 @@
+# PlanOpticon
+
+Comprehensive Video Analysis & Knowledge Extraction CLI
+
+## Overview
+
+PlanOpticon is an advanced AI-powered CLI tool that conducts thorough analysis of video content, extracting structured knowledge, diagrams, and actionable insights. Using state-of-the-art computer vision and natural language processing techniques, PlanOpticon transforms video assets into valuable, structured information.
+
+## Core Features
+
+- **Complete Transcription**: Full speech-to-text with speaker attribution and semantic segmentation
+- **Visual Element Extraction**: Automated recognition and digitization of diagrams, charts, whiteboards, and visual aids
+- **Action Item Detection**: Intelligent identification and prioritization of tasks, commitments, and follow-ups
+- **Knowledge Structure**: Organization of extracted content into searchable, related concepts
+- **Plan Generation**: Synthesis of extracted elements into cohesive action plans and summaries
+
+## Installation
+
+### Prerequisites
+
+- Python 3.9+
+- FFmpeg (for audio/video processing)
+- API keys for cloud services (OpenAI, Google Cloud, etc.)
+
+### Setup
+
+1. Clone the repository:
+
+```bash
+git clone https://github.com/yourusername/planopticon.git
+cd planopticon
+```
+
+2. Run the setup script, which creates a virtual environment and installs dependencies:
+
+```bash
+./scripts/setup.sh
+```
+
+3. Configure your API keys by editing the `.env` file created during setup.
+
+### Manual Installation
+
+If you prefer to set up manually:
+
+```bash
+# Create virtual environment
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+
+# Install dependencies
+pip install -e .
+
+# Install optional GPU dependencies (if available)
+pip install -r requirements-gpu.txt    # For NVIDIA GPUs
+pip install -r requirements-apple.txt  # For Apple Silicon
+```
+
+## Usage
+
+PlanOpticon is designed as a command-line interface tool:
+
+```bash
+# Basic usage
+planopticon analyze --input video.mp4 --output analysis/
+
+# Specify processing depth
+planopticon analyze --input video.mp4 --depth comprehensive --output analysis/
+
+# Focus on specific extraction types
+planopticon analyze --input video.mp4 --focus "diagrams,action-items" --output analysis/
+
+# Process with GPU acceleration
+planopticon analyze --input video.mp4 --use-gpu --output analysis/
+```
+
+### Output Structure
+
+```
+analysis/
+├── transcript/
+│   ├── video_name.json   # Full transcription with timestamps and speakers
+│   ├── video_name.txt    # Plain text transcription
+│   └── video_name.srt    # Subtitle format
+├── frames/               # Extracted key frames
+│   ├── frame_0001.jpg
+│   └── frame_0002.jpg
+├── audio/                # Extracted audio
+│   └── video_name.wav
+├── diagrams/             # Extracted and digitized visual elements
+│   ├── diagram_001.svg
+│   └── whiteboard_001.svg
+└── cache/                # API response cache
+```
+
+## Development
+
+### Architecture
+
+PlanOpticon follows a modular pipeline architecture:
+
+```
+video_processor/
+├── extractors/   # Video and audio extraction
+├── api/          # Cloud API integrations
+├── analyzers/    # Content analysis components
+├── integrators/  # Knowledge integration
+├── utils/        # Common utilities
+└── cli/          # Command-line interface
+```
+
+### Code Standards
+
+- Follow PEP 8 style guidelines for all Python code
+- Write comprehensive docstrings using NumPy style
+- Include type hints consistently throughout the codebase
+- Maintain test coverage for key components
+
+### Testing
+
+Run tests with pytest:
+
+```bash
+pytest
+```
+
+## System Requirements
+
+- Python 3.9+
+- 8GB RAM minimum (16GB recommended)
+- 2GB disk space for models and dependencies
+- CUDA-compatible GPU (optional, for accelerated processing)
+- ARM64 or x86_64 architecture
+
+## License
+
+MIT License
+
+## Roadmap
+
+See [work_plan.md](work_plan.md) for detailed development roadmap and milestones.
+
+## Contributing
+
+Contributions are welcome! Please feel free to submit a Pull Request.
+
+## Contact
+
+For questions or contributions, please open an issue on GitHub or contact the maintainers at [email protected].
--- docs/architecture/pipeline.md
+++ docs/architecture/pipeline.md
@@ -12,10 +12,13 @@
     participant DiagramAnalyzer
     participant KnowledgeGraph
 
     CLI->>Pipeline: process_single_video()
     Pipeline->>FrameExtractor: extract_frames()
+    Note over FrameExtractor: Change detection + periodic capture (every 30s)
+    Pipeline->>Pipeline: filter_people_frames()
+    Note over Pipeline: OpenCV face detection removes webcam/people frames
     Pipeline->>AudioExtractor: extract_audio()
     Pipeline->>Provider: transcribe_audio()
     Pipeline->>DiagramAnalyzer: process_frames()
 
     loop Each frame
--- docs/cli-reference.md
+++ docs/cli-reference.md
@@ -15,10 +15,11 @@
 | `--depth` | `basic\|standard\|comprehensive` | `standard` | Processing depth |
 | `--focus` | TEXT | all | Comma-separated focus areas |
 | `--use-gpu` | FLAG | off | Enable GPU acceleration |
 | `--sampling-rate` | FLOAT | 0.5 | Frame sampling rate (fps) |
 | `--change-threshold` | FLOAT | 0.15 | Visual change threshold |
+| `--periodic-capture` | FLOAT | 30.0 | Capture a frame every N seconds regardless of change (0 to disable) |
 | `--title` | TEXT | auto | Report title |
 | `-p`, `--provider` | `auto\|openai\|anthropic\|gemini` | `auto` | API provider |
 | `--vision-model` | TEXT | auto | Override vision model |
 | `--chat-model` | TEXT | auto | Override chat model |
 
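How `--change-threshold` and `--periodic-capture` interact can be sketched as follows, with the defaults (0.15 and 30.0) taken from the table above. The function name `select_timestamps` and the `(timestamp, change_score)` input shape are hypothetical, not the tool's actual internals:

```python
def select_timestamps(diffs, change_threshold=0.15, periodic_every=30.0):
    """Pick capture timestamps from per-sample change scores.

    diffs: list of (timestamp_seconds, change_score) pairs in time order.
    A frame is kept when its change score crosses the threshold, or when
    `periodic_every` seconds have passed since the last kept frame — the
    fallback that catches slow-evolving content like document scrolling.
    """
    kept = []
    last_kept = float("-inf")
    for ts, score in diffs:
        if score >= change_threshold or (
            periodic_every > 0 and ts - last_kept >= periodic_every
        ):
            kept.append(ts)
            last_kept = ts
    return kept


# Frames sampled at 0.5 fps whose change scores all sit below the threshold
# are still captured every 30 seconds.
slow_scroll = [(t, 0.01) for t in range(0, 100, 2)]
print(select_timestamps(slow_scroll))  # [0, 30, 60, 90]
```

Setting `periodic_every=0` disables the fallback, matching the documented "0 to disable" behavior.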
--- docs/guide/single-video.md
+++ docs/guide/single-video.md
@@ -8,21 +8,22 @@
 
 ## What happens
 
 The pipeline runs these steps in order:
 
-1. **Frame extraction** — Samples frames from the video using change detection to avoid duplicates
-2. **Audio extraction** — Extracts audio track to WAV
-3. **Transcription** — Sends audio to speech-to-text (Whisper or Gemini)
-4. **Diagram detection** — Vision model classifies each frame as diagram/chart/whiteboard/screenshot/none
-5. **Diagram analysis** — High-confidence diagrams get full extraction (description, text, mermaid, chart data)
-6. **Screengrab fallback** — Medium-confidence frames are saved as captioned screenshots
-7. **Knowledge graph** — Extracts entities and relationships from transcript + diagrams
-8. **Key points** — LLM extracts main points and topics
-9. **Action items** — LLM finds tasks, commitments, and follow-ups
-10. **Reports** — Generates markdown, HTML, and PDF
-11. **Export** — Renders mermaid diagrams to SVG/PNG, reproduces charts
+1. **Frame extraction** — Samples frames using change detection for transitions plus periodic capture (every 30s) for slow-evolving content like document scrolling
+2. **People frame filtering** — OpenCV face detection automatically removes webcam/video conference frames, keeping only shared content (slides, documents, screen shares)
+3. **Audio extraction** — Extracts audio track to WAV
+4. **Transcription** — Sends audio to speech-to-text (Whisper or Gemini)
+5. **Diagram detection** — Vision model classifies each frame as diagram/chart/whiteboard/screenshot/none
+6. **Diagram analysis** — High-confidence diagrams get full extraction (description, text, mermaid, chart data)
+7. **Screengrab fallback** — Medium-confidence frames are saved as captioned screenshots
+8. **Knowledge graph** — Extracts entities and relationships from transcript + diagrams
+9. **Key points** — LLM extracts main points and topics
+10. **Action items** — LLM finds tasks, commitments, and follow-ups
+11. **Reports** — Generates markdown, HTML, and PDF
+12. **Export** — Renders mermaid diagrams to SVG/PNG, reproduces charts
 
 ## Processing depth
 
 ### `basic`
 - Transcription only
@@ -53,14 +54,15 @@
     "duration_seconds": 3600.0
   },
   "stats": {
     "duration_seconds": 45.2,
     "frames_extracted": 42,
+    "people_frames_filtered": 11,
     "diagrams_detected": 3,
     "screen_captures": 5
   },
   "key_points": [...],
   "action_items": [...],
   "diagrams": [...],
   "screen_captures": [...]
 }
 ```
--- pyproject.toml
+++ pyproject.toml
@@ -15,11 +15,10 @@
 keywords = ["video", "analysis", "ai", "knowledge-extraction", "transcription", "diagrams"]
 classifiers = [
     "Development Status :: 4 - Beta",
     "Intended Audience :: Developers",
     "Intended Audience :: Science/Research",
-    "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
     "Programming Language :: Python :: 3",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",

ADDED requirements-apple.txt
ADDED requirements-gpu.txt
ADDED scripts/setup.sh
ADDED tests/__init__.py
ADDED tests/test_audio_extractor.py
ADDED tests/test_frame_extractor.py
--- a/requirements-apple.txt
+++ b/requirements-apple.txt
@@ -0,0 +1,4 @@
+# ARM-specific optimizations for macOS
+torch==2.0.0
+torchvision==0.15.0
+torchaudio==2.0.0
--- a/requirements-gpu.txt
+++ b/requirements-gpu.txt
@@ -0,0 +1,10 @@
+# NVIDIA GPU acceleration dependencies
+torch==2.0.0+cu118
+torchvision==0.15.0+cu118
+torchaudio==2.0.0+cu118
+# Additional CUDA dependencies
+opencv-contrib-python-headless>=4.7.0.72
+cupy-cuda11x>=12.0.0
+
+# Find packages at:
+# https://download.pytorch
--- a/scripts/setup.sh
+++ b/scripts/setup.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+# PlanOpticon setup script
+set -e
+
+# Detect operating system
+if [[ "$OSTYPE" == "darwin"* ]]; then
+    OS="macos"
+elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
+    OS="linux"
+else
+    echo "Unsupported operating system: $OSTYPE"
+    exit 1
+fi
+
+# Detect architecture
+ARCH=$(uname -m)
+if [[ "$ARCH" == "arm64" ]] || [[ "$ARCH" == "aarch64" ]]; then
+    ARCH="arm64"
+elif [[ "$ARCH" == "x86_64" ]]; then
+    ARCH="x86_64"
+else
+    echo "Unsupported architecture: $ARCH"
+    exit 1
+fi
+
+echo "Setting up PlanOpticon on $OS ($ARCH)..."
+
+# Check for Python
+if ! command -v python3 &> /dev/null; then
+    echo "Python 3 is required but not found."
+    if [[ "$OS" == "macos" ]]; then
+        echo "Please install Python 3 using Homebrew or from python.org."
+        echo "  brew install python"
+    elif [[ "$OS" == "linux" ]]; then
+        echo "Please install Python 3 using your package manager."
+        echo "  Ubuntu/Debian: sudo apt install python3 python3-pip python3-venv"
+        echo "  Fedora: sudo dnf install python3 python3-pip"
+    fi
+    exit 1
+fi
+
+# Check Python version
+PY_VERSION=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
+PY_MAJOR=$(echo $PY_VERSION | cut -d. -f1)
+PY_MINOR=$(echo $PY_VERSION | cut -d. -f2)
+
+if [[ "$PY_MAJOR" -lt 3 ]] || [[ "$PY_MAJOR" -eq 3 && "$PY_MINOR" -lt 9 ]]; then
+    echo "Python 3.9 or higher is required, but found $PY_VERSION."
+    echo "Please upgrade your Python installation."
+    exit 1
+fi
+
+echo "Using Python $PY_VERSION"
+
+# Check for FFmpeg
+if ! command -v ffmpeg &> /dev/null; then
+    echo "FFmpeg is required but not found."
+    if [[ "$OS" == "macos" ]]; then
+        echo "Please install FFmpeg using Homebrew:"
+        echo "  brew install ffmpeg"
+    elif [[ "$OS" == "linux" ]]; then
+        echo "Please install FFmpeg using your package manager:"
+        echo "  Ubuntu/Debian: sudo apt install ffmpeg"
+        echo "  Fedora: sudo dnf install ffmpeg"
+    fi
+    exit 1
+fi
+
+echo "FFmpeg found"
+
+# Create and activate virtual environment
+if [[ -d "venv" ]]; then
+    echo "Virtual environment already exists"
+else
+    echo "Creating virtual environment..."
+    python3 -m venv venv
+fi
+
+# Determine activate script path
+if [[ "$OS" == "macos" ]] || [[ "$OS" == "linux" ]]; then
+    ACTIVATE="venv/bin/activate"
+fi
+
+echo "Activating virtual environment..."
+source "$ACTIVATE"
+
+# Upgrade pip
+echo "Upgrading pip..."
+pip install --upgrade pip
+
+# Install dependencies
+echo "Installing dependencies..."
+pip install -e .
+
+# Install optional GPU dependencies if available
+if [[ "$OS" == "macos" && "$ARCH" == "arm64" ]]; then
+    echo "Installing optional ARM-specific packages for macOS..."
+    pip install -r requirements-apple.txt 2>/dev/null || echo "No ARM-specific packages found or could not install them."
+elif [[ "$ARCH" == "x86_64" ]]; then
+    # Check for NVIDIA GPU
+    if [[ "$OS" == "linux" ]] && command -v nvidia-smi &> /dev/null; then
+        echo "NVIDIA GPU detected, installing GPU dependencies..."
+        pip install -r requirements-gpu.txt 2>/dev/null || echo "Could not install GPU packages."
+    fi
+fi
+
+# Create example .env file if it doesn't exist
+if [[ ! -f ".env" ]]; then
+    echo "Creating example .env file..."
+    cp .env.example .env
+    echo "Please edit the .env file to add your API keys."
+fi
+
+echo "Setup complete! PlanOpticon is ready to use."
+echo ""
+echo "To activate the virtual environment, run:"
+echo "  source \"$ACTIVATE\""
+echo ""
+echo "To run PlanOpticon, use:"
+echo "  planopticon --help"
--- a/scripts/setup.sh
+++ b/scripts/setup.sh
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/scripts/setup.sh
+++ b/scripts/setup.sh
@@ -0,0 +1,120 @@
1 #!/bin/bash
2 # PlanOpticon setup script
3 set -e
4
5 # Detect operating system
6 if [[ "$OSTYPE" == "darwin"* ]]; then
7 OS="macos"
8 elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
9 OS="linux"
10 else
11 echo "Unsupported operating system: $OSTYPE"
12 exit 1
13 fi
14
15 # Detect architecture
16 ARCH=$(uname -m)
17 if [[ "$ARCH" == "arm64" ]] || [[ "$ARCH" == "aarch64" ]]; then
18 ARCH="arm64"
19 elif [[ "$ARCH" == "x86_64" ]]; then
20 ARCH="x86_64"
21 else
22 echo "Unsupported architecture: $ARCH"
23 exit 1
24 fi
25
26 echo "Setting up PlanOpticon on $OS ($ARCH)..."
27
28 # Check for Python
29 if ! command -v python3 &> /dev/null; then
30 echo "Python 3 is required but not found."
31 if [[ "$OS" == "macos" ]]; then
32 echo "Please install Python 3 using Homebrew or from python.org."
33 echo " brew install python"
34 elif [[ "$OS" == "linux" ]]; then
35 echo "Please install Python 3 using your package manager."
36 echo " Ubuntu/Debian: sudo apt install python3 python3-pip python3-venv"
37 echo " Fedora: sudo dnf install python3 python3-pip"
38 fi
39 exit 1
40 fi
41
42 # Check Python version
43 PY_VERSION=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
44 PY_MAJOR=$(echo $PY_VERSION | cut -d. -f1)
45 PY_MINOR=$(echo $PY_VERSION | cut -d. -f2)
46
47 if [[ "$PY_MAJOR" -lt 3 ]] || [[ "$PY_MAJOR" -eq 3 && "$PY_MINOR" -lt 9 ]]; then
48 echo "Python 3.9 or higher is required, but found $PY_VERSION."
49 echo "Please upgrade your Python installation."
50 exit 1
51 fi
52
53 echo "Using Python $PY_VERSION"
54
55 # Check for FFmpeg
56 if ! command -v ffmpeg &> /dev/null; then
57 echo "FFmpeg is required but not found."
58 if [[ "$OS" == "macos" ]]; then
59 echo "Please install FFmpeg using Homebrew:"
60 echo " brew install ffmpeg"
61 elif [[ "$OS" == "linux" ]]; then
62 echo "Please install FFmpeg using your package manager:"
63 echo " Ubuntu/Debian: sudo apt install ffmpeg"
64 echo " Fedora: sudo dnf install ffmpeg"
65 fi
66 exit 1
67 fi
68
69 echo "FFmpeg found"
70
71 # Create and activate virtual environment
72 if [[ -d "venv" ]]; then
73 echo "Virtual environment already exists"
74 else
75 echo "Creating virtual environment..."
76 python3 -m venv venv
77 fi
78
79 # Determine activate script path
80 if [[ "$OS" == "macos" ]] || [[ "$OS" == "linux" ]]; then
81 ACTIVATE="venv/bin/activate"
82 fi
83
84 echo "Activating virtual environment..."
85 source "$ACTIVATE"
86
87 # Upgrade pip
88 echo "Upgrading pip..."
89 pip install --upgrade pip
90
91 # Install dependencies
92 echo "Installing dependencies..."
93 pip install -e .
94
95 # Install optional GPU dependencies if available
96 if [[ "$OS" == "macos" && "$ARCH" == "arm64" ]]; then
97 echo "Installing optional ARM-specific packages for macOS..."
98 pip install -r requirements-apple.txt 2>/dev/null || echo "No ARM-specific packages found or could not install them."
99 elif [[ "$ARCH" == "x86_64" ]]; then
100 # Check for NVIDIA GPU
101 if [[ "$OS" == "linux" ]] && command -v nvidia-smi &> /dev/null; then
102 echo "NVIDIA GPU detected, installing GPU dependencies..."
103 pip install -r requirements-gpu.txt 2>/dev/null || echo "Could not install GPU packages."
104 fi
105 fi
106
107 # Create example .env file if it doesn't exist
108 if [[ ! -f ".env" ]]; then
109 echo "Creating example .env file..."
110 cp .env.example .env
111 echo "Please edit the .env file to add your API keys."
112 fi
113
114 echo "Setup complete! PlanOpticon is ready to use."
115 echo ""
116 echo "To activate the virtual environment, run:"
117 echo " source \"$ACTIVATE\""
118 echo ""
119 echo "To run PlanOpticon, use:"
120 echo " planopticon --help"
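The Python version gate above (script lines 43-51) compares major/minor parts with nested `[[ ]]` tests. The same check can be factored into a reusable helper using only parameter expansion; a sketch, assuming bash (the `version_ge` name is illustrative, not part of the repo):

```shell
# version_ge MAJOR.MINOR — succeeds when the given version is >= 3.9.
# Pure parameter expansion, no cut/awk subprocesses needed.
version_ge() {
    local major="${1%%.*}" minor="${1#*.}"
    [ "$major" -gt 3 ] || { [ "$major" -eq 3 ] && [ "$minor" -ge 9 ]; }
}

version_ge "3.11" && echo "3.11 ok" || echo "3.11 too old"
version_ge "3.8"  && echo "3.8 ok"  || echo "3.8 too old"
```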

--- a/tests/test_audio_extractor.py
+++ b/tests/test_audio_extractor.py
@@ -0,0 +1,121 @@
1 """Tests for the audio extractor module."""
2
3 import os
4 import tempfile
5 from pathlib import Path
6 from unittest.mock import MagicMock, patch
7
8 import numpy as np
9 import pytest
10
11 from video_processor.extractors.audio_extractor import AudioExtractor
12
13
14 class TestAudioExtractor:
15     """Test suite for AudioExtractor class."""
16
17     def test_init(self):
18         """Test initialization of AudioExtractor."""
19         # Default parameters
20         extractor = AudioExtractor()
21         assert extractor.sample_rate == 16000
22         assert extractor.mono is True
23
24         # Custom parameters
25         extractor = AudioExtractor(sample_rate=44100, mono=False)
26         assert extractor.sample_rate == 44100
27         assert extractor.mono is False
28
29     @patch("subprocess.run")
30     def test_extract_audio(self, mock_run):
31         """Test audio extraction from video."""
32         # Mock the subprocess.run call
33         mock_result = MagicMock()
34         mock_result.returncode = 0
35         mock_run.return_value = mock_result
36
37         with tempfile.TemporaryDirectory() as temp_dir:
38             # Create a dummy video file
39             video_path = Path(temp_dir) / "test_video.mp4"
40             with open(video_path, "wb") as f:
41                 f.write(b"dummy video content")
42
43             # Extract audio
44             extractor = AudioExtractor()
45
46             # Test with default output path
47             output_path = extractor.extract_audio(video_path)
48             assert output_path == video_path.with_suffix(".wav")
49
50             # Test with custom output path
51             custom_output = Path(temp_dir) / "custom_audio.wav"
52             output_path = extractor.extract_audio(video_path, custom_output)
53             assert output_path == custom_output
54
55             # Verify subprocess.run was called with correct arguments
56             mock_run.assert_called()
57             args, kwargs = mock_run.call_args
58             assert "ffmpeg" in args[0]
59             assert "-i" in args[0]
60             assert str(video_path) in args[0]
61
62     @patch("soundfile.info")
63     def test_get_audio_properties(self, mock_sf_info):
64         """Test getting audio properties."""
65         # Mock soundfile.info
66         mock_info = MagicMock()
67         mock_info.duration = 10.5
68         mock_info.samplerate = 16000
69         mock_info.channels = 1
70         mock_info.format = "WAV"
71         mock_info.subtype = "PCM_16"
72         mock_sf_info.return_value = mock_info
73
74         extractor = AudioExtractor()
75         extractor.get_audio_properties("audio.wav")
76         mock_sf_info.assert_called_once()
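For orientation, the ffmpeg invocation that `test_extract_audio` asserts on can be sketched as a small command builder. This is a hypothetical reconstruction: `build_ffmpeg_cmd` is not in the repo, and the real `AudioExtractor` internals are not shown in this diff.

```python
from pathlib import Path

def build_ffmpeg_cmd(video_path, output_path=None, sample_rate=16000, mono=True):
    """Build an ffmpeg argv list; default output is <video>.wav."""
    video_path = Path(video_path)
    output_path = Path(output_path) if output_path else video_path.with_suffix(".wav")
    cmd = ["ffmpeg", "-y", "-i", str(video_path), "-ar", str(sample_rate)]
    if mono:
        cmd += ["-ac", "1"]  # downmix to a single channel
    cmd.append(str(output_path))
    return cmd, output_path

cmd, out = build_ffmpeg_cmd("meeting.mp4")
# These are the same properties the test asserts on the mocked call:
assert "ffmpeg" in cmd and "-i" in cmd and "meeting.mp4" in cmd
assert out == Path("meeting.wav")
```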
--- a/tests/test_frame_extractor.py
+++ b/tests/test_frame_extractor.py
@@ -0,0 +1,58 @@
1 """Tests for the frame extractor module."""
2
3 import os
4 import tempfile
5 from pathlib import Path
6
7 import numpy as np
8 import pytest
9
10 from video_processor.extractors.frame_extractor import (
11     calculate_frame_difference,
12     extract_frames,
13     is_gpu_available,
14     save_frames,
15 )
16
17
18 # Create dummy test frames
19 @pytest.fixture
20 def dummy_frames():
21     # Create a list of dummy frames with different content
22     frames = []
23     for i in range(3):
24         # Create frame with different intensity for each
25         frame = np.ones((100, 100, 3), dtype=np.uint8) * (i * 50)
26         frames.append(frame)
27     return frames
28
29
30 def test_calculate_frame_difference():
31     """Test frame difference calculation."""
32     # Create two frames with some difference
33     frame1 = np.zeros((100, 100, 3), dtype=np.uint8)
34     frame2 = np.ones((100, 100, 3), dtype=np.uint8) * 128  # 50% intensity
35
36     # Calculate difference
37     diff = calculate_frame_difference(frame1, frame2)
38
39     # Expected difference is around 128/255 = 0.5
40     assert 0.45 <= diff <= 0.55
41
42
43 def test_is_gpu_available():
44     """Test GPU availability check."""
45     # This just tests that the function runs without error
46     # We don't assert the result because it depends on the system
47     result = is_gpu_available()
48     assert isinstance(result, bool)
49
50
51 def test_save_frames(dummy_frames):
52     """Test saving frames to disk."""
53     with tempfile.TemporaryDirectory() as temp_dir:
54         # Save frames
55         paths = save_frames(dummy_frames, temp_dir, "test_frame")
56
57         # Check that we got the correct number of paths
58         assert len(paths) == len(dummy_frames)
59
60         # Check that files were created
61         for path in paths:
62             assert os.path.exists(path)
63             assert os.path.getsize(path) > 0
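`test_calculate_frame_difference` expects roughly 0.5 for a uniform 128-intensity delta, which matches a mean absolute difference normalized by 255. A minimal sketch of such a metric follows; this is an assumption about the implementation, since the real `calculate_frame_difference` (which may also downsample or blur first) is not shown in this diff.

```python
import numpy as np

def mean_abs_difference(frame1, frame2):
    """Mean absolute pixel difference between two frames, normalized to [0, 1]."""
    # Cast up before subtracting so uint8 arithmetic cannot wrap around.
    diff = np.abs(frame1.astype(np.int16) - frame2.astype(np.int16))
    return float(diff.mean()) / 255.0

f1 = np.zeros((100, 100, 3), dtype=np.uint8)
f2 = np.ones((100, 100, 3), dtype=np.uint8) * 128
assert 0.45 <= mean_abs_difference(f1, f2) <= 0.55  # 128/255 ≈ 0.502
```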
--- tests/test_providers.py
+++ tests/test_providers.py
@@ -104,16 +104,18 @@
104 assert "anthropic/claude-sonnet-4-5-20250929" == used["chat"]
105
106
107 class TestDiscovery:
108 @patch("video_processor.providers.discovery._cached_models", None)
109 @patch.dict("os.environ", {}, clear=True)
110 def test_discover_skips_missing_keys(self):
111 from video_processor.providers.discovery import discover_available_models
112 # No API keys -> empty list, no errors
113 models = discover_available_models(api_keys={"openai": "", "anthropic": "", "gemini": ""})
114 assert models == []
115
116 @patch.dict("os.environ", {}, clear=True)
117 @patch("video_processor.providers.discovery._cached_models", None)
118 def test_discover_caches_results(self):
119 from video_processor.providers import discovery
120
121 models = discovery.discover_available_models(api_keys={"openai": "", "anthropic": "", "gemini": ""})
122
123 ADDED video_processor/__init__.py
124 ADDED video_processor/analyzers/__init__.py
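The two `@patch.dict("os.environ", {}, clear=True)` decorators added above isolate the discovery tests from the developer's machine: if discovery can fall back to environment variables, an exported real API key would otherwise make the "no keys" test behave differently per machine. The isolation behavior itself:

```python
import os
from unittest.mock import patch

os.environ["OPENAI_API_KEY"] = "sk-local-dev-key"  # simulate a developer machine

with patch.dict("os.environ", {}, clear=True):
    # Inside the patch the environment is empty, so discovery sees no keys.
    assert "OPENAI_API_KEY" not in os.environ

# On exit the original environment is restored untouched.
assert os.environ["OPENAI_API_KEY"] == "sk-local-dev-key"
del os.environ["OPENAI_API_KEY"]
```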

--- video_processor/analyzers/diagram_analyzer.py
+++ video_processor/analyzers/diagram_analyzer.py
@@ -3,26 +3,34 @@
3 import json
4 import logging
5 import shutil
6 from pathlib import Path
7 from typing import List, Optional, Tuple, Union
8
9 from tqdm import tqdm
10
11 from video_processor.models import DiagramResult, DiagramType, ScreenCapture
12 from video_processor.providers.manager import ProviderManager
13
14 logger = logging.getLogger(__name__)
15
16 # Classification prompt — returns JSON
17 _CLASSIFY_PROMPT = """\
18 Examine this image from a video recording. Your job is to identify ONLY shared content \
19 — slides, presentations, charts, diagrams, documents, screen shares, whiteboard content, \
20 architecture drawings, tables, or other structured visual information worth capturing.
21
22 IMPORTANT: If the image primarily shows a person, people, webcam feeds, faces, or a \
23 video conference participant view, return confidence 0.0. We are ONLY interested in \
24 shared/presented content, NOT people or camera views.
25
26 Return ONLY a JSON object (no markdown fences):
27 {
28 "is_diagram": true/false,
29 "diagram_type": "flowchart"|"sequence"|"architecture"|"whiteboard"|"chart"|"table"|"slide"|"screenshot"|"unknown",
30 "confidence": 0.0 to 1.0,
31 "content_type": "slide"|"diagram"|"document"|"screen_share"|"whiteboard"|"chart"|"person"|"other",
32 "brief_description": "one-sentence description of what you see"
33 }
34 """
35
36 # Single-pass analysis prompt — extracts everything in one call
@@ -137,11 +145,11 @@
145 diagrams: List[DiagramResult] = []
146 captures: List[ScreenCapture] = []
147 diagram_idx = 0
148 capture_idx = 0
149
150 for i, fp in enumerate(tqdm(frame_paths, desc="Analyzing frames", unit="frame")):
151 fp = Path(fp)
152 logger.info(f"Classifying frame {i}/{len(frame_paths)}: {fp.name}")
153
154 try:
155 classification = self.classify_frame(fp)
156
157 ADDED video_processor/cli/__init__.py
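The classification prompt asks for bare JSON ("no markdown fences"), but models sometimes wrap replies in fences anyway, so the analyzer presumably parses defensively. A sketch of such a tolerant parser; `parse_classification` is a hypothetical helper, not taken from this diff.

```python
import json

def parse_classification(reply: str) -> dict:
    """Parse a model reply as JSON, stripping markdown fences if present."""
    text = reply.strip()
    if text.startswith("```"):
        text = text.strip("`")
        # Drop an optional language tag like "json" on the first line.
        first_nl = text.find("\n")
        if first_nl != -1 and text[:first_nl].strip().isalpha():
            text = text[first_nl + 1:]
    return json.loads(text)

result = parse_classification('```json\n{"is_diagram": true, "confidence": 0.9}\n```')
assert result["is_diagram"] is True
assert result["confidence"] == 0.9
```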

--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -8,10 +8,11 @@
8 from pathlib import Path
9 from typing import List, Optional
10
11 import click
12 import colorlog
13 from tqdm import tqdm
14
15
16 def setup_logging(verbose: bool = False) -> None:
17 """Set up logging with color formatting."""
18 log_level = logging.DEBUG if verbose else logging.INFO
@@ -57,10 +58,11 @@
58 )
59 @click.option("--focus", type=str, help='Comma-separated focus areas (e.g., "diagrams,action-items")')
60 @click.option("--use-gpu", is_flag=True, help="Enable GPU acceleration if available")
61 @click.option("--sampling-rate", type=float, default=0.5, help="Frame sampling rate")
62 @click.option("--change-threshold", type=float, default=0.15, help="Visual change threshold")
63 @click.option("--periodic-capture", type=float, default=30.0, help="Capture a frame every N seconds regardless of change (0 to disable)")
64 @click.option("--title", type=str, help="Title for the analysis report")
65 @click.option(
66 "--provider",
67 "-p",
68 type=click.Choice(["auto", "openai", "anthropic", "gemini"]),
@@ -77,10 +79,11 @@
79 depth,
80 focus,
81 use_gpu,
82 sampling_rate,
83 change_threshold,
84 periodic_capture,
85 title,
86 provider,
87 vision_model,
88 chat_model,
89 ):
@@ -104,16 +107,19 @@
107 provider_manager=pm,
108 depth=depth,
109 focus_areas=focus_areas,
110 sampling_rate=sampling_rate,
111 change_threshold=change_threshold,
112 periodic_capture_seconds=periodic_capture,
113 use_gpu=use_gpu,
114 title=title,
115 )
116 click.echo(pm.usage.format_summary())
117 click.echo(f"\n Results: {output}/manifest.json")
118 except Exception as e:
119 logging.error(f"Error: {e}")
120 click.echo(pm.usage.format_summary())
121 if ctx.obj["verbose"]:
122 import traceback
123
124 traceback.print_exc()
125 sys.exit(1)
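The analyze command's except branch above prints the API-usage summary even when the pipeline fails, and only shows a traceback in verbose mode. A standalone sketch of that pattern (function and stub names here are illustrative, not from the commit):

```python
import sys
import traceback


def run_with_usage_summary(task, print_summary, verbose: bool = False) -> int:
    """Run a task; always emit the usage summary, on success or failure.

    Mirrors the analyze command's try/except: the API-usage report is
    printed even when the pipeline raises, and the traceback is shown
    only in verbose mode.
    """
    try:
        task()
        print_summary()
        return 0
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        print_summary()
        if verbose:
            traceback.print_exc()
        return 1
```

Printing the summary on both paths matters because a failed run may still have spent real API quota.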
@@ -150,12 +156,13 @@
156 default="local",
157 help="Video source (local directory, Google Drive, or Dropbox)",
158 )
159 @click.option("--folder-id", type=str, default=None, help="Google Drive folder ID")
160 @click.option("--folder-path", type=str, default=None, help="Cloud folder path")
161 @click.option("--recursive/--no-recursive", default=True, help="Recurse into subfolders (default: recursive)")
162 @click.pass_context
163 def batch(ctx, input_dir, output, depth, pattern, title, provider, vision_model, chat_model, source, folder_id, folder_path, recursive):
164 """Process a folder of videos in batch."""
165 from video_processor.integrators.knowledge_graph import KnowledgeGraph
166 from video_processor.integrators.plan_generator import PlanGenerator
167 from video_processor.models import BatchManifest, BatchVideoEntry
168 from video_processor.output_structure import (
@@ -180,11 +187,11 @@
187
188 cloud = GoogleDriveSource()
189 if not cloud.authenticate():
190 logging.error("Google Drive authentication failed")
191 sys.exit(1)
192 cloud_files = cloud.list_videos(folder_id=folder_id, folder_path=folder_path, patterns=patterns, recursive=recursive)
193 local_paths = cloud.download_all(cloud_files, download_dir)
194 elif source == "dropbox":
195 from video_processor.sources.dropbox_source import DropboxSource
196
197 cloud = DropboxSource()
@@ -202,14 +209,15 @@
209 if not input_dir:
210 logging.error("--input-dir is required for local source")
211 sys.exit(1)
212 input_dir = Path(input_dir)
213
214 # Find videos (rglob for recursive, glob for flat)
215 videos = []
216 glob_fn = input_dir.rglob if recursive else input_dir.glob
217 for pat in patterns:
218 videos.extend(sorted(glob_fn(pat)))
219 videos = sorted(set(videos))
220
221 if not videos:
222 logging.error(f"No videos found in {input_dir} matching {pattern}")
223 sys.exit(1)
@@ -219,11 +227,11 @@
227 dirs = create_batch_output_dirs(output, title)
228 manifests = []
229 entries = []
230 merged_kg = KnowledgeGraph()
231
232 for idx, video_path in enumerate(tqdm(videos, desc="Batch processing", unit="video")):
233 video_name = video_path.stem
234 video_output = dirs["videos"] / video_name
235 logging.info(f"Processing video {idx + 1}/{len(videos)}: {video_path.name}")
236
237 entry = BatchVideoEntry(
@@ -290,11 +298,13 @@
298 videos=entries,
299 batch_summary_md="batch_summary.md",
300 merged_knowledge_graph_json="knowledge_graph.json",
301 )
302 write_batch_manifest(batch_manifest, output)
303 click.echo(pm.usage.format_summary())
304 click.echo(f"\n Batch complete: {batch_manifest.completed_videos}/{batch_manifest.total_videos} succeeded")
305 click.echo(f" Results: {output}/batch_manifest.json")
306
307
308 @cli.command("list-models")
309 @click.pass_context
310 def list_models(ctx):
ADDED video_processor/cli/output_formatter.py
ADDED video_processor/extractors/__init__.py
ADDED video_processor/extractors/audio_extractor.py
ADDED video_processor/extractors/frame_extractor.py
ADDED video_processor/extractors/text_extractor.py
ADDED video_processor/integrators/__init__.py
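The batch command's discovery step above picks `rglob` or `glob` depending on `--recursive`, then deduplicates and sorts. A minimal standalone sketch (the `find_videos` name is illustrative):

```python
from pathlib import Path
from typing import List


def find_videos(input_dir: Path, patterns: List[str], recursive: bool = True) -> List[Path]:
    """Collect video files matching any pattern, deduplicated and sorted.

    Mirrors the commands.py logic: rglob recurses into subfolders,
    plain glob stays flat.
    """
    glob_fn = input_dir.rglob if recursive else input_dir.glob
    videos: List[Path] = []
    for pat in patterns:
        videos.extend(sorted(glob_fn(pat)))
    return sorted(set(videos))
```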
--- a/video_processor/cli/output_formatter.py
+++ b/video_processor/cli/output_formatter.py
@@ -0,0 +1,216 @@
"""Output formatting for PlanOpticon analysis results."""

import html
import json
import logging
import shutil
from pathlib import Path
from typing import Dict, List, Optional, Union

logger = logging.getLogger(__name__)


class OutputFormatter:
    """Formats and organizes output from video analysis."""

    def __init__(self, output_dir: Union[str, Path]):
        """
        Parameters
        ----------
        output_dir : str or Path
            Output directory for formatted content
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def organize_outputs(
        self,
        markdown_path: Union[str, Path],
        knowledge_graph_path: Union[str, Path],
        diagrams: List[Dict],
        frames_dir: Optional[Union[str, Path]] = None,
        transcript_path: Optional[Union[str, Path]] = None,
    ) -> Dict:
        """
        Organize outputs into a consistent structure.

        Parameters
        ----------
        markdown_path : str or Path
            Path to markdown analysis
        knowledge_graph_path : str or Path
            Path to knowledge graph JSON
        diagrams : list
            List of diagram analysis results
        frames_dir : str or Path, optional
            Directory with extracted frames
        transcript_path : str or Path, optional
            Path to transcript file

        Returns
        -------
        dict
            Dictionary with organized output paths
        """
        # Create output structure
        md_dir = self.output_dir / "markdown"
        diagrams_dir = self.output_dir / "diagrams"
        data_dir = self.output_dir / "data"

        md_dir.mkdir(exist_ok=True)
        diagrams_dir.mkdir(exist_ok=True)
        data_dir.mkdir(exist_ok=True)

        # Copy markdown file
        markdown_path = Path(markdown_path)
        md_output = md_dir / markdown_path.name
        shutil.copy2(markdown_path, md_output)

        # Copy knowledge graph
        kg_path = Path(knowledge_graph_path)
        kg_output = data_dir / kg_path.name
        shutil.copy2(kg_path, kg_output)

        # Copy diagram images if available
        diagram_images = []
        for diagram in diagrams:
            if "image_path" in diagram and diagram["image_path"]:
                img_path = Path(diagram["image_path"])
                if img_path.exists():
                    img_output = diagrams_dir / img_path.name
                    shutil.copy2(img_path, img_output)
                    diagram_images.append(img_output)

        return {
            "markdown": md_output,
            "knowledge_graph": kg_output,
            "diagram_images": diagram_images,
            "transcript": Path(transcript_path) if transcript_path else None,
        }

    def create_html_index(self, outputs: Dict) -> Path:
        """
        Create HTML index page for outputs.

        Parameters
        ----------
        outputs : dict
            Dictionary with organized output paths

        Returns
        -------
        Path
            Path to HTML index
        """
        esc = html.escape

        # Simple HTML index template
        lines = [
            "<!DOCTYPE html>",
            "<html>",
            "<head>",
            "  <title>PlanOpticon Analysis Results</title>",
            "  <style>",
            "    body { margin: 0; padding: 20px; line-height: 1.6; }",
            "    .container { max-width: 1200px; margin: 0 auto; }",
            "    h1 { color: #333; }",
            "    h2 { color: #555; margin-top: 30px; }",
            "    .section { margin-bottom: 30px; }",
            "    .files { display: flex; flex-wrap: wrap; }",
            "    .file-item { margin: 10px; text-align: center; }",
            "    .file-item img { max-width: 200px; max-height: 150px; object-fit: contain; }",
            "    .file-name { margin-top: 5px; font-size: 0.9em; }",
            "    a { color: #0066cc; text-decoration: none; }",
            "    a:hover { text-decoration: underline; }",
            "  </style>",
            "</head>",
            "<body>",
            "<div class='container'>",
            "  <h1>PlanOpticon Analysis Results</h1>",
            "",
        ]

        # Add markdown section
        if outputs.get("markdown"):
            md_path = Path(outputs["markdown"])
            md_rel = esc(str(md_path.relative_to(self.output_dir)))

            lines.append("  <div class='section'>")
            lines.append("    <h2>Analysis Report</h2>")
            lines.append(f"    <p><a href='{md_rel}' target='_blank'>View Analysis</a></p>")
            lines.append("  </div>")

        # Add diagrams section
        if outputs.get("diagram_images") and len(outputs["diagram_images"]) > 0:
            lines.append("  <div class='section'>")
            lines.append("    <h2>Diagrams</h2>")
            lines.append("    <div class='files'>")

            for img_path in outputs["diagram_images"]:
                img_path = Path(img_path)
                img_rel = esc(str(img_path.relative_to(self.output_dir)))
                img_name = esc(img_path.name)

                lines.append("      <div class='file-item'>")
                lines.append(f"        <a href='{img_rel}' target='_blank'>")
                lines.append(f"          <img src='{img_rel}' alt='Diagram'>")
                lines.append("        </a>")
                lines.append(f"        <div class='file-name'>{img_name}</div>")
                lines.append("      </div>")

            lines.append("    </div>")
            lines.append("  </div>")

        # Add data section
        if outputs.get("knowledge_graph"):
            data_rel = esc(str(Path(outputs["knowledge_graph"]).relative_to(self.output_dir)))
            lines.append("  <div class='section'>")
            lines.append("    <h2>Data</h2>")
            lines.append("    <ul>")
            lines.append(f"      <li><a href='{data_rel}' target='_blank'>knowledge_graph.json</a></li>")
            lines.append("    </ul>")
            lines.append("  </div>")

        # Close HTML
        lines.append("</div>")
        lines.append("</body>")
        lines.append("</html>")

        # Write HTML file
        index_path = self.output_dir / "index.html"
        with open(index_path, "w") as f:
            f.write("\n".join(lines))

        logger.info(f"Created HTML index at {index_path}")
        return index_path
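create_html_index builds links by taking each artifact's path relative to the output root and escaping it with `html.escape`. A self-contained sketch of one diagram entry (the `diagram_item_html` helper is hypothetical, for illustration):

```python
import html
from pathlib import Path


def diagram_item_html(img_path: Path, output_dir: Path) -> str:
    """Build one escaped <div class='file-item'> entry, as create_html_index does."""
    # relative_to keeps links working when the output folder moves
    rel = html.escape(str(img_path.relative_to(output_dir)))
    name = html.escape(img_path.name)
    return (
        "    <div class='file-item'>\n"
        f"      <a href='{rel}' target='_blank'><img src='{rel}' alt='Diagram'></a>\n"
        f"      <div class='file-name'>{name}</div>\n"
        "    </div>"
    )
```

Escaping both the href and the visible name guards against frame filenames containing `&` or angle brackets.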
--- a/video_processor/extractors/__init__.py
+++ b/video_processor/extractors/__init__.py
@@ -0,0 +1 @@
from v
--- a/video_processor/extractors/audio_extractor.py
+++ b/video_processor/extractors/audio_extractor.py
@@ -0,0 +1,196 @@
"""Audio extraction and processing module for video analysis."""

import logging
import os
import subprocess
from pathlib import Path
from typing import Dict, Optional, Tuple, Union

import librosa
import numpy as np
import soundfile as sf

logger = logging.getLogger(__name__)


class AudioExtractor:
    """Extract and process audio from video files."""

    def __init__(self, sample_rate: int = 16000, mono: bool = True):
        """
        Parameters
        ----------
        sample_rate : int
            Target sample rate for extracted audio
        mono : bool
            Whether to downmix audio to a single channel
        """
        self.sample_rate = sample_rate
        self.mono = mono

    def extract_audio(
        self,
        video_path: Union[str, Path],
        output_path: Optional[Union[str, Path]] = None,
        format: str = "wav",
    ) -> Path:
        """
        Extract audio from video file.

        Parameters
        ----------
        video_path : str or Path
            Path to video file
        output_path : str or Path, optional
            Path to save extracted audio (if None, saves alongside video)
        format : str
            Audio format to save (wav, mp3, etc.)

        Returns
        -------
        Path
            Path to extracted audio file
        """
        video_path = Path(video_path)
        if not video_path.exists():
            raise FileNotFoundError(f"Video file not found: {video_path}")

        # Generate output path if not provided
        if output_path is None:
            output_path = video_path.with_suffix(f".{format}")
        else:
            output_path = Path(output_path)

        # Ensure output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Extract audio using ffmpeg
        cmd = [
            "ffmpeg",
            "-i", str(video_path),
            "-vn",  # No video
            "-acodec", "pcm_s16le",  # PCM 16-bit little-endian
            "-ar", str(self.sample_rate),
            "-ac", "1" if self.mono else "2",  # Channels (mono or stereo)
            "-y",  # Overwrite output
            str(output_path),
        ]
        try:
            subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
        except subprocess.CalledProcessError:
            logger.error(f"ffmpeg audio extraction failed for {video_path}")
            raise

        return output_path

    def load_audio(self, audio_path: Union[str, Path]) -> Tuple[np.ndarray, int]:
        """Load an audio file at the configured sample rate and channel layout."""
        audio_data, sr = librosa.load(str(audio_path), sr=self.sample_rate, mono=self.mono)
        return audio_data, sr

    def get_audio_properties(self, audio_path: Union[str, Path]) -> Dict:
        """Return basic properties of an audio file."""
        audio_path = Path(audio_path)
        if not audio_path.exists():
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        info = sf.info(str(audio_path))
        properties = {
            "duration": info.duration,
            "sample_rate": info.samplerate,
            "channels": info.channels,
            "format": info.format,
            "subtype": info.subtype,
        }
        return properties

    def save_segment(
        self,
        segment: np.ndarray,
        output_path: Union[str, Path],
        sample_rate: int,
    ) -> Path:
        """
        Save an audio segment to disk.

        Parameters
        ----------
        segment : np.ndarray
            Audio segment data
        output_path : str or Path
            Path to save segment
        sample_rate : int
            Sample rate of segment

        Returns
        -------
        Path
            Path to saved segment
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        sf.write(output_path, segment, sample_rate)
        return output_path

    def segment_audio(
        self,
        audio_data: np.ndarray,
        sample_rate: int,
        segment_length_ms: int = 30000,
        overlap_ms: int = 0,
    ) -> list:
        """
        Split audio into fixed-length segments.

        Parameters
        ----------
        audio_data : np.ndarray
            Audio data
        sample_rate : int
            Sample rate of audio
        segment_length_ms : int
            Length of segments in milliseconds
        overlap_ms : int
            Overlap between consecutive segments in milliseconds

        Returns
        -------
        list
            List of np.ndarray segments
        """
        segment_length_samples = int(sample_rate * segment_length_ms / 1000)
        overlap_samples = int(sample_rate * overlap_ms / 1000)

        # Calculate hop length
        hop_length = segment_length_samples - overlap_samples

        segments = []

        # Generate segments
        for i in range(0, len(audio_data), hop_length):
            end_idx = min(i + segment_length_samples, len(audio_data))
            segments.append(audio_data[i:end_idx])

        return segments
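segment_audio's arithmetic converts millisecond lengths to samples and strides by `segment_length - overlap`. The index math can be sketched on its own (the `segment_indices` name is illustrative):

```python
def segment_indices(n_samples: int, sample_rate: int,
                    segment_length_ms: int = 30000, overlap_ms: int = 0):
    """Yield (start, end) sample indices for fixed-length segments,
    following the hop-length arithmetic in segment_audio."""
    seg_len = int(sample_rate * segment_length_ms / 1000)
    overlap = int(sample_rate * overlap_ms / 1000)
    hop = seg_len - overlap  # stride between segment starts
    for start in range(0, n_samples, hop):
        yield start, min(start + seg_len, n_samples)
```

With zero overlap the segments tile the signal; with overlap the stride shrinks and consecutive windows share samples, and the final window is simply shorter.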
--- a/video_processor/extractors/frame_extractor.py
+++ b/video_processor/extractors/frame_extractor.py
@@ -0,0 +1,197 @@
"""Frame extraction module for video processing."""

import functools
import logging
import sys
import tempfile
from pathlib import Path
from typing import List, Optional, Tuple, Union

import cv2
import numpy as np
from tqdm import tqdm

logger = logging.getLogger(__name__)

# Haar cascade for face detection — ships with OpenCV
_FACE_CASCADE_PATH = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
_FACE_CASCADE = None


def _get_face_cascade() -> cv2.CascadeClassifier:
    """Lazy-load the face cascade classifier."""
    global _FACE_CASCADE
    if _FACE_CASCADE is None:
        _FACE_CASCADE = cv2.CascadeClassifier(_FACE_CASCADE_PATH)
    return _FACE_CASCADE


def detect_faces(frame: np.ndarray) -> List[Tuple[int, int, int, int]]:
    """Detect faces in a frame using Haar cascade. Returns list of (x, y, w, h)."""
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) if len(frame.shape) == 3 else frame
    cascade = _get_face_cascade()
    faces = cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(40, 40))
    return list(faces) if len(faces) > 0 else []


def is_people_frame(
    frame: np.ndarray,
    face_area_threshold: float = 0.03,
    min_face_size: int = 90,
) -> bool:
    """
    Determine if a frame is primarily showing people (webcam/video conference).

    Heuristics:
    1. Face detection — if significant faces occupy enough frame area
    2. Black bar detection — video conferences often have thick black bars
    3. Small faces with black bars — profile pictures in conference UI

    Faces smaller than min_face_size are ignored (sidebar thumbnails in screen shares).

    Parameters
    ----------
    frame : np.ndarray
        BGR image frame
    face_area_threshold : float
        Minimum ratio of total face area to frame area to classify as people frame
    min_face_size : int
        Minimum face width/height in pixels to count as a significant face

    Returns
    -------
    bool
        True if frame is primarily people/webcam content
    """
    try:
        h, w = frame.shape[:2]
        frame_area = h * w

        # Detect all faces
        all_faces = detect_faces(frame)

        # Filter out tiny ones (sidebar thumbnails)
        significant_faces = [(x, y, fw, fh) for (x, y, fw, fh) in all_faces if fw >= min_face_size]

        if significant_faces:
            total_face_area = sum(fw * fh for (_, _, fw, fh) in significant_faces)
            face_ratio = total_face_area / frame_area
            return face_ratio >= face_area_threshold
        return False
    except Exception:
        return False


def gpu_accelerated(func):
    """Decorator to use a GPU implementation when available."""

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if is_gpu_available() and not kwargs.get("disable_gpu"):
            # Remove the 'disable_gpu' kwarg before dispatching to the GPU path
            kwargs.pop("disable_gpu", None)
            return func_gpu(*args, **kwargs)
        # Remove 'disable_gpu' before the CPU fallback as well
        kwargs.pop("disable_gpu", None)
        return func(*args, **kwargs)

    return wrapper


def calculate_frame_difference(prev_frame: np.ndarray, curr_frame: np.ndarray) -> float:
    """
    Calculate the difference between two frames.

    Parameters
    ----------
    prev_frame : np.ndarray
        Previous frame
    curr_frame : np.ndarray
        Current frame

    Returns
    -------
    float
        Difference score between 0 and 1
    """
    # Convert to grayscale
    if len(prev_frame.shape) == 3:
        prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
    else:
        prev_gray = prev_frame

    if len(curr_frame.shape) == 3:
        curr_gray = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY)
    else:
        curr_gray = curr_frame

    # Calculate absolute difference
    diff = cv2.absdiff(prev_gray, curr_gray)

    # Normalize and return mean difference
    return np.mean(diff) / 255.0


@gpu_accelerated
def extract_frames(
    video_path: Union[str, Path],
    sampling_rate: float = 1.0,
    change_threshold: float = 0.15,
    periodic_capture_seconds: float = 30.0,
    max_frames: Optional[int] = None,
    resize_to: Optional[Tuple[int, int]] = None,
    max_memory_mb: int = 1024,
) -> List[np.ndarray]:
    """
    Extract frames from video based on visual change detection + periodic capture.

    Two capture strategies work together:
    1. Change detection: capture when visual difference exceeds threshold
       (catches transitions like webcam ↔ screen share)
    2. Periodic capture: capture every N seconds regardless of change
       (catches slow-evolving content like document scrolling)

    The downstream people filter removes any webcam frames captured periodically.

    Parameters
    ----------
    video_path : str or Path
        Path to video file
    sampling_rate : float
        Frame sampling rate (1.0 = every frame)
    change_threshold : float
        Threshold for detecting significant visual changes
    periodic_capture_seconds : float
        Capture a frame every N seconds regardless of change (0 to disable)
    max_frames : int, optional
        Maximum number of frames to extract
    resize_to : tuple of (width, height), optional
        Resize extracted frames to this size
    """
171
+ resize_to: Optional[Tuple[int, int]] = None,
172
+ max_memory_mb: int = 1024,
173
+) -> List[np.ndarray]:
174
+ """
175
+ Extract frames from video based on visual change detection + periodic capture.
176
+
177
+ Two capture strategieelif (
178
+ periodic_interval > 0 and (frame_idx - last_capture_: == 3 else frame
179
+threshold:
180
+ reason = "periodic"
181
+
182
+ if should_capture:
183
+ extracted_frames.app )
184
+ def save_frames(sidebar thumbnails in scr"Frame extraction module for videctiombnails in scf = calculate_frame_difference(prev_frame, frame)
185
+ if diff > change_threshold:
186
+ should_capture = True
187
+ reason reason = f"change={diff:.3f}"
188
+
189
+ # Periodic capture — even if change is small
190
+ elif (
191
+ last_capture_frame) >= periodic_interval
192
+ ):
193
+ should_capture = True
194
+ reason = "periodic"
195
+
196
+ if should_capture:
197
+ extracted_frames.app extracted_frames.ap
--- a/video_processor/extractors/frame_extractor.py
+++ b/video_processor/extractors/frame_extractor.py
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/extractors/frame_extractor.py
+++ b/video_processor/extractors/frame_extractor.py
@@ -0,0 +1,197 @@
1 """Frame extraction module for video processing."""
2
3 import functools
4 import logging
5 import sys
6 import tempfile
7 from pathlib import Path
8 from typing import List, Optional, Tuple, Union
9
10 import cv2
11 import numpy as np
12 from tqdm import tqdm
13
14 logger = logging.getLogger(__name__)
15
16 # Haar cascade for face detection — ships with OpenCV
17 _FACE_CASCADE_PATH = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
18 _FACE_CASCADE = None
19
20
21 def _get_face_cascade() -> cv2.CascadeClassifier:
22 """Lazy-load the face cascade classifier."""
23 global _FACE_CASCADE
24 if _FACE_CASCADE is None:
25 _FACE_CASCADE = cv2.CascadeClassifier(_FACE_CASCADE_PATH)
26 return _FACE_CASCADE
27
28
29 def detect_faces(frame: np.ndarray) -> List[Tuple[int, int, int, int]]:
30 """Detect faces in a frame using Haar cascade. Returns list of (x, y, w, h)."""
31 gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) if len(frame.shape) == 3 else frame
32 cascade = _get_face_cascade()
33 faces = cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(40, 40))
34 return list(faces) if len(faces) > 0 else []
35
36
37 def is_people_frame(
38 frame: np.ndarray,
39 face_area_threshold: float = 0.03,
40 min_face_size: int = 90,
41 ) -> bool:
42 """
43 Determine if a frame is primarily showing people (webcam/video conference).
44
45 Heuristics:
46 1. Face detection — if significant faces occupy enough frame area
47 2. Black bar detection — video conferences often have thick black bars
48 3. Small faces with black bars — profile pictures in conference UI
49
50 Faces smaller than min_face_size are ignored (sidebar thumbnails in screen shares).
51
52 Parameters
53 ----------
54 frame : np.ndarray
55 BGR image frame
56 face_area_threshold : float
57 Minimum ratio of total face area to frame area to classify as people frame
58 min_face_size : int
59 Minimum face width/height in pixels to count as a significant face
60
61 Returns
62 -------
63 bool
64 True if frame is primarily people/webcam content
65 """
66 h, w = frame.shape[:2]
67 frame_area = h * w
68
69 # Detect all faces
70 all_faces = detect_faces(frame)
71
72     # Keep only significant faces, ignoring tiny ones (sidebar thumbnails)
73 significant_faces = [(x, y, fw, fh) for (x, y, fw, fh) in all_faces if fw >= min_face_size]
74
75 if significant_faces:
76 total_face_area = sum(fw * fh for (_, _, fw, fh) in significant_faces)
77         face_ratio = total_face_area / frame_area
78         return face_ratio >= face_area_threshold
79     return False
80
81
82 def gpu_accelerated(func):
83     """Decorator that uses GPU-accelerated computation when available."""
84
85     @functools.wraps(func)
86     def wrapper(*args, **kwargs):
87         # Consume the 'disable_gpu' control kwarg; GPU use also requires is_gpu_available()
88         kwargs.pop("disable_gpu", None)
89         return func(*args, **kwargs)
90
91 return wrapper
92
93
94 def calculate_frame_difference(prev_frame: np.ndarray, curr_frame: np.ndarray) -> float:
95 """
96 Calculate the difference between two frames.
97
98 Parameters
99 ----------
100 prev_frame : np.ndarray
101 Previous fr reason = Returns
102 -------
103 float
104 Difference score between 0 and 1
105 """
106 # Convert to grayscale
107     if len(prev_frame.shape) == 3:
108         prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
109     else:
110         prev_gray = prev_frame
111
112     if len(curr_frame.shape) == 3:
113         curr_gray = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY)
114     else:
115         curr_gray = curr_frame
116
117 # Calculate absolute difference
118     diff = cv2.absdiff(prev_gray, curr_gray)
119
120 # Normalize and return mean difference
121
122 return np.mean(diff) / 255.0
123
124
125 @gpu_accelerated
126 def extract_frames(
127 video_path: Union[str, Path],
128 sampling_rate: float = 1.0,
129 change_threshold: float = 0.15,
130 periodic_capture_seconds: float = 30.0,
131     max_frames: Optional[int] = None,
132     resize_to: Optional[Tuple[int, int]] = None,
133     max_memory_mb: int = 1024,
134 ) -> List[np.ndarray]:
134 """
135 Extract frames from video based on visual change detection + periodic capture.
136
137 Two capture strategies work together:
138 1. Change detection: capture when visual difference exceeds threshold
139 (catches transitions like webcam ↔ screen share)
140 2. Periodic capture: capture every N seconds regardless of change
141 (catches slow-evolving content like document scrolling)
142
143 The downstream people filter removes any webcam frames captured periodically.
144
145 Parameters
146 ----------
147 video_path : str or Path
148 Path to video file
149 sampling_rate : float
150 Frame sampling rate (1.0 = every frame)
151 change_threshold : float
152 Threshold for detecting significant visual changes
153 periodic_capture_seconds : float
154 Capture a frame every N seconds regardless of change (0 to disable)
155 max_frames : int, optional
156 Maximum number of frames to extract
157 resize_to : tuple of (width, height), optional
158         Resize extracted frames to (width, height)
159     max_memory_mb : int
160         Memory budget (MB) for extracted frames
161
162     Returns
163     -------
164     list of np.ndarray
165         Extracted frames
166     """
167     cap = cv2.VideoCapture(str(video_path))
168     fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
169     periodic_interval = int(periodic_capture_seconds * fps) if periodic_capture_seconds > 0 else 0
170
171     extracted_frames: List[np.ndarray] = []
172     prev_frame = None
173     last_capture_frame = 0
174     frame_idx = -1
175
176     while True:
177         ret, frame = cap.read()
178         if not ret:
179             break
180         frame_idx += 1
181         if resize_to is not None:
182             frame = cv2.resize(frame, resize_to)
183
184         should_capture = False
185         reason = ""
186         if prev_frame is None:
187             should_capture, reason = True, "first"
188         else:
189             diff = calculate_frame_difference(prev_frame, frame)
190             if diff > change_threshold:
191                 should_capture, reason = True, f"change={diff:.3f}"
192             # Periodic capture — even if change is small
193             elif periodic_interval > 0 and (frame_idx - last_capture_frame) >= periodic_interval:
194                 should_capture, reason = True, "periodic"
195
196         if should_capture:
197             extracted_frames.append(frame)
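The change-detection score used above is just a normalized mean absolute pixel difference. A minimal standalone sketch of that scoring (numpy-only for illustration; the committed code uses `cv2.absdiff` on grayscale frames):

```python
import numpy as np

def calculate_frame_difference(prev_frame: np.ndarray, curr_frame: np.ndarray) -> float:
    """Mean absolute pixel difference, normalized to [0, 1]."""
    diff = np.abs(prev_frame.astype(np.int16) - curr_frame.astype(np.int16))
    return float(np.mean(diff)) / 255.0

# A black frame vs. a white frame is maximally different,
# well above the 0.15 capture threshold used in the diff.
black = np.zeros((4, 4), dtype=np.uint8)
white = np.full((4, 4), 255, dtype=np.uint8)
assert calculate_frame_difference(black, white) == 1.0
# Identical frames score 0 and would only be caught by periodic capture.
assert calculate_frame_difference(black, black) == 0.0
```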
--- a/video_processor/extractors/text_extractor.py
+++ b/video_processor/extractors/text_extractor.py
@@ -0,0 +1,270 @@
1 """Text extraction module for frames and diagrams."""
2
3 import logging
4 from pathlib import Path
5 from typing import Dict, List, Optional, Tuple, Union
6
7 import cv2
8 import numpy as np
9
10 logger = logging.getLogger(__name__)
11
12
13 class TextExtractor:
14     """Extract text from images, frames, and diagrams."""
15
16     def __init__(self, tesseract_path: Optional[str] = None):
17         """
18         Parameters
19         ----------
20         tesseract_path : str, optional
21             Path to tesseract executable for local OCR
22         """
23         self.tesseract_path = tesseract_path
24
25         # Check if we're using tesseract locally
26         self.use_local_ocr = False
27         if tesseract_path:
28             try:
29                 import pytesseract
30                 pytesseract.pytesseract.tesseract_cmd = tesseract_path
31                 self.use_local_ocr = True
32             except ImportError:
33                 logger.warning("pytesseract not installed, local OCR unavailable")
34
35     def preprocess_image(self, image: np.ndarray) -> np.ndarray:
36         """
37         Preprocess image for better text extraction.
38
39         Parameters
40         ----------
41         image : np.ndarray
42             Input image
43
44         Returns
45         -------
46         np.ndarray
47             Preprocessed image
48         """
49         # Convert to grayscale if not already
50         if len(image.shape) == 3:
51             gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
52         else:
53             gray = image
54
55         # Apply adaptive thresholding
56         thresh = cv2.adaptiveThreshold(
57             gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2
58         )
59
60         # Noise removal
61         kernel = np.ones((1, 1), np.uint8)
62         opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)
63
64         # Invert back
65         result = cv2.bitwise_not(opening)
66
67         return result
68
69     def extract_text_local(self, image: np.ndarray) -> str:
70         """
71         Extract text from image using local OCR (Tesseract).
72
73         Parameters
74         ----------
75         image : np.ndarray
76             Input image
77
78         Returns
79         -------
80         str
81             Extracted text
82         """
83         if not self.use_local_ocr:
84             return ""
85
86         import pytesseract
87
88         processed = self.preprocess_image(image)
89         text = pytesseract.image_to_string(processed)
90         return text
91
92     def merge_boxes(self, boxes: List[Tuple[int, int, int, int]]) -> List[Tuple[int, int, int, int]]:
93         """Merge overlapping bounding boxes into larger regions."""
94         if not boxes:
95             return []
96
97         # Sort boxes by x coordinate
98         sorted_boxes = sorted(boxes, key=lambda b: b[0])
99         merged = []
100         current = list(sorted_boxes[0])
101         for box in sorted_boxes[1:]:
102             # Check if current box overlaps with the next one
103             if (
104                 current[0] <= box[0] + box[2]
105                 and box[0] <= current[0] + current[2]
106                 and current[1] <= box[1] + box[3]
107                 and box[1] <= current[1] + current[3]
108             ):
109                 # Calculate merged box
110                 x1 = min(current[0], box[0])
111                 y1 = min(current[1], box[1])
112                 x2 = max(current[0] + current[2], box[0] + box[2])
113                 y2 = max(current[1] + current[3], box[1] + box[3])
114
115                 # Update current box
116                 current = [x1, y1, x2 - x1, y2 - y1]
117             else:
118                 # Add current box to merged list and update current
119                 merged.append(tuple(current))
120                 current = list(box)
121
122         # Add the last box
123         merged.append(tuple(current))
124
125         return merged
126
127     def extract_text_from_regions(
128         self, image: np.ndarray, regions: List[Tuple[int, int, int, int]]
129     ) -> Dict[Tuple[int, int, int, int], str]:
130         """
131         Extract text from specified regions in image.
132
133         Parameters
134         ----------
135         image : np.ndarray
136             Input image
137         regions : list
138             List of (x, y, w, h) regions
139
140         Returns
141         -------
142         dict
143             Dictionary of {region: text}
144         """
145         results = {}
146         for region in regions:
147             x, y, w, h = region
148
149             # Extract region
150             roi = image[y : y + h, x : x + w]
151
152             # Skip empty regions
153             if roi.size == 0:
154                 continue
155
156             # Extract text
157             if self.use_local_ocr:
158                 text = self.extract_text_local(roi)
159             else:
160                 text = "API-based text extraction not yet implemented"
161
162             # Store non-empty results
163             if text.strip():
164                 results[region] = text.strip()
165
166         return results
167
168     def extract_text_from_image(self, image: np.ndarray, detect_regions: bool = True) -> str:
169         """
170         Extract text from a full image.
171
172         Parameters
173         ----------
174         image : np.ndarray
175             Input image
176         detect_regions : bool
177             Whether to detect text regions before OCR
178
179         Returns
180         -------
181         str
182             Extracted text
183         """
184         if self.use_local_ocr:
185             return self.extract_text_local(image)
186         return "API-based text extraction not yet implemented"
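The bounding-box merge step in this file can be exercised in isolation. A minimal standalone sketch of the overlap-merge sweep with illustrative data (the function name and inputs here are for demonstration, not the committed code):

```python
from typing import List, Tuple

Box = Tuple[int, int, int, int]  # (x, y, w, h)

def merge_overlapping_boxes(boxes: List[Box]) -> List[Box]:
    """Merge overlapping boxes, sweeping left to right by x coordinate."""
    if not boxes:
        return []
    sorted_boxes = sorted(boxes, key=lambda b: b[0])
    merged: List[Box] = []
    current = list(sorted_boxes[0])
    for box in sorted_boxes[1:]:
        overlaps = (
            current[0] <= box[0] + box[2] and box[0] <= current[0] + current[2]
            and current[1] <= box[1] + box[3] and box[1] <= current[1] + current[3]
        )
        if overlaps:
            # Grow the current box to cover both
            x1 = min(current[0], box[0])
            y1 = min(current[1], box[1])
            x2 = max(current[0] + current[2], box[0] + box[2])
            y2 = max(current[1] + current[3], box[1] + box[3])
            current = [x1, y1, x2 - x1, y2 - y1]
        else:
            merged.append(tuple(current))
            current = list(box)
    merged.append(tuple(current))
    return merged

# Overlapping boxes collapse into one region; disjoint boxes survive.
assert merge_overlapping_boxes([(0, 0, 10, 10), (5, 5, 10, 10)]) == [(0, 0, 15, 15)]
assert merge_overlapping_boxes([(0, 0, 5, 5), (20, 0, 5, 5)]) == [(0, 0, 5, 5), (20, 0, 5, 5)]
```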
--- video_processor/integrators/knowledge_graph.py
+++ video_processor/integrators/knowledge_graph.py
@@ -2,10 +2,12 @@
 
 import json
 import logging
 from pathlib import Path
 from typing import Dict, List, Optional, Union
+
+from tqdm import tqdm
 
 from video_processor.models import Entity, KnowledgeGraphData, Relationship
 from video_processor.providers.manager import ProviderManager
 from video_processor.utils.json_parsing import parse_json_from_response
 
@@ -31,65 +33,57 @@
             [{"role": "user", "content": prompt}],
             max_tokens=4096,
             temperature=temperature,
         )
 
-    def extract_entities(self, text: str) -> List[Entity]:
-        """Extract entities from text using LLM, returning pydantic models."""
+    def extract_entities_and_relationships(self, text: str) -> tuple[List[Entity], List[Relationship]]:
+        """Extract entities and relationships in a single LLM call."""
         prompt = (
-            "Extract all notable entities (people, concepts, technologies, "
-            "organizations, time references) from the following content.\n\n"
+            "Extract all notable entities and relationships from the following content.\n\n"
             f"CONTENT:\n{text}\n\n"
-            'Return a JSON array of objects: '
-            '[{"name": "...", "type": "person|concept|technology|organization|time", '
-            '"description": "brief description"}]\n\n'
-            "Return ONLY the JSON array."
+            "Return a JSON object with two keys:\n"
+            '- "entities": array of {"name": "...", "type": "person|concept|technology|organization|time", "description": "brief description"}\n'
+            '- "relationships": array of {"source": "entity name", "target": "entity name", "type": "relationship description"}\n\n'
+            "Return ONLY the JSON object."
         )
         raw = self._chat(prompt)
         parsed = parse_json_from_response(raw)
 
         entities = []
-        if isinstance(parsed, list):
+        rels = []
+
+        if isinstance(parsed, dict):
+            for item in parsed.get("entities", []):
+                if isinstance(item, dict) and "name" in item:
+                    entities.append(Entity(
+                        name=item["name"],
+                        type=item.get("type", "concept"),
+                        descriptions=[item["description"]] if item.get("description") else [],
+                    ))
+            entity_names = {e.name for e in entities}
+            for item in parsed.get("relationships", []):
+                if isinstance(item, dict) and "source" in item and "target" in item:
+                    rels.append(Relationship(
+                        source=item["source"],
+                        target=item["target"],
+                        type=item.get("type", "related_to"),
+                    ))
+        elif isinstance(parsed, list):
+            # Fallback: if model returns a flat entity list
             for item in parsed:
                 if isinstance(item, dict) and "name" in item:
                     entities.append(Entity(
                         name=item["name"],
                         type=item.get("type", "concept"),
                         descriptions=[item["description"]] if item.get("description") else [],
                     ))
-        return entities
-
-    def extract_relationships(self, text: str, entities: List[Entity]) -> List[Relationship]:
-        """Extract relationships between entities using LLM."""
-        entity_names = ", ".join(e.name for e in entities)
-        prompt = (
-            "Given the following content and entities, identify relationships.\n\n"
-            f"CONTENT:\n{text}\n\n"
-            f"ENTITIES: {entity_names}\n\n"
-            'Return a JSON array: '
-            '[{"source": "entity A", "target": "entity B", '
-            '"type": "relationship description"}]\n\n'
-            "Return ONLY the JSON array."
-        )
-        raw = self._chat(prompt)
-        parsed = parse_json_from_response(raw)
-
-        rels = []
-        if isinstance(parsed, list):
-            for item in parsed:
-                if isinstance(item, dict) and "source" in item and "target" in item:
-                    rels.append(Relationship(
-                        source=item["source"],
-                        target=item["target"],
-                        type=item.get("type", "related_to"),
-                    ))
-        return rels
+
+        return entities, rels
 
     def add_content(self, text: str, source: str, timestamp: Optional[float] = None) -> None:
         """Add content to knowledge graph by extracting entities and relationships."""
-        entities = self.extract_entities(text)
-        relationships = self.extract_relationships(text, entities)
+        entities, relationships = self.extract_entities_and_relationships(text)
 
         for entity in entities:
             eid = entity.name
             if eid in self.nodes:
                 self.nodes[eid]["occurrences"].append({
@@ -120,38 +114,53 @@
                     "type": rel.type,
                     "content_source": source,
                     "timestamp": timestamp,
                 })
 
-    def process_transcript(self, transcript: Dict) -> None:
-        """Process transcript segments into knowledge graph."""
+    def process_transcript(self, transcript: Dict, batch_size: int = 10) -> None:
+        """Process transcript segments into knowledge graph, batching for efficiency."""
         if "segments" not in transcript:
             logger.warning("Transcript missing segments")
             return
 
-        for i, segment in enumerate(transcript["segments"]):
-            if "text" in segment:
-                source = f"transcript_segment_{i}"
-                timestamp = segment.get("start", None)
-                speaker = segment.get("speaker", None)
-
-                if speaker and speaker not in self.nodes:
-                    self.nodes[speaker] = {
-                        "id": speaker,
-                        "name": speaker,
-                        "type": "person",
-                        "descriptions": {"Speaker in transcript"},
-                        "occurrences": [],
-                    }
-                if speaker:
-                    source = f"{speaker}_segment_{i}"
-
-                self.add_content(segment["text"], source, timestamp)
+        segments = transcript["segments"]
+
+        # Register speakers first
+        for i, segment in enumerate(segments):
+            speaker = segment.get("speaker", None)
+            if speaker and speaker not in self.nodes:
+                self.nodes[speaker] = {
+                    "id": speaker,
+                    "name": speaker,
+                    "type": "person",
+                    "descriptions": {"Speaker in transcript"},
+                    "occurrences": [],
+                }
+
+        # Batch segments together for fewer API calls
+        batches = []
+        for start in range(0, len(segments), batch_size):
+            batches.append(segments[start:start + batch_size])
+
+        for batch in tqdm(batches, desc="Building knowledge graph", unit="batch"):
+            # Combine batch text
+            combined_text = " ".join(
+                seg["text"] for seg in batch if "text" in seg
+            )
+            if not combined_text.strip():
+                continue
+
+            # Use first segment's timestamp as batch timestamp
+            batch_start_idx = segments.index(batch[0])
+            timestamp = batch[0].get("start", None)
+            source = f"transcript_batch_{batch_start_idx}"
+
+            self.add_content(combined_text, source, timestamp)
 
     def process_diagrams(self, diagrams: List[Dict]) -> None:
         """Process diagram results into knowledge graph."""
-        for i, diagram in enumerate(diagrams):
+        for i, diagram in enumerate(tqdm(diagrams, desc="Processing diagrams for KG", unit="diag")):
             text_content = diagram.get("text_content", "")
             if text_content:
                 source = f"diagram_{i}"
                 self.add_content(text_content, source)
 
 
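The call-count reduction in the commit message follows directly from this refactor: the old code made two LLM calls per segment (entities, then relationships), while the new code makes one combined call per batch of 10 segments. A quick sanity check on the arithmetic (the segment count is illustrative, chosen to match the ~2200 figure in the commit message):

```python
import math

segments = 1100     # illustrative transcript size
batch_size = 10     # default in process_transcript

calls_before = segments * 2                      # extract_entities + extract_relationships per segment
calls_after = math.ceil(segments / batch_size)   # one combined call per batch

assert calls_before == 2200
assert calls_after == 110
```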
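A quick sanity check on the call-count reduction described in the commit message (~2200 → ~110). The segment count below is an assumption backed out of those numbers — two calls per segment before (entities, then relationships), one combined call per 10-segment batch after:

```python
n_segments = 1100        # assumed: ~1100 transcript segments for a long recording
batch_size = 10          # segments combined per knowledge-graph call

# Before: one entity-extraction call plus one relationship-extraction call per segment
old_calls = n_segments * 2

# After: one combined entities+relationships call per batch (ceiling division)
new_calls = (n_segments + batch_size - 1) // batch_size

print(old_calls, new_calls)  # 2200 110
```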
--- video_processor/models.py
+++ video_processor/models.py
@@ -121,10 +121,11 @@
     """Statistics about a processing run."""
     start_time: Optional[str] = Field(default=None, description="ISO format start time")
     end_time: Optional[str] = Field(default=None, description="ISO format end time")
     duration_seconds: Optional[float] = Field(default=None, description="Total processing time")
     frames_extracted: int = Field(default=0)
+    people_frames_filtered: int = Field(default=0)
     diagrams_detected: int = Field(default=0)
     screen_captures: int = Field(default=0)
     transcript_duration_seconds: Optional[float] = Field(default=None)
     models_used: Dict[str, str] = Field(
         default_factory=dict,
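The transcription step in the pipeline diff below emits an SRT track through a `_format_srt_time` helper that the hunk references but does not define. A plausible sketch of such a formatter (an assumption — the real helper lives outside this diff; SRT timestamps use `HH:MM:SS,mmm` with a comma before the milliseconds):

```python
def format_srt_time(seconds: float) -> str:
    # SRT timestamps are HH:MM:SS,mmm (comma, not dot, before milliseconds)
    ms = int(round(seconds * 1000))
    hours, ms = divmod(ms, 3_600_000)
    minutes, ms = divmod(ms, 60_000)
    secs, ms = divmod(ms, 1_000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"

print(format_srt_time(3661.5))  # 01:01:01,500
```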
--- video_processor/pipeline.py
+++ video_processor/pipeline.py
@@ -4,14 +4,16 @@
 import logging
 import time
 from datetime import datetime
 from pathlib import Path
 from typing import Optional
+
+from tqdm import tqdm

 from video_processor.analyzers.diagram_analyzer import DiagramAnalyzer
 from video_processor.extractors.audio_extractor import AudioExtractor
-from video_processor.extractors.frame_extractor import extract_frames, save_frames
+from video_processor.extractors.frame_extractor import extract_frames, filter_people_frames, save_frames
 from video_processor.integrators.knowledge_graph import KnowledgeGraph
 from video_processor.integrators.plan_generator import PlanGenerator
 from video_processor.models import (
     ActionItem,
     KeyPoint,
@@ -32,10 +34,11 @@
     provider_manager: Optional[ProviderManager] = None,
     depth: str = "standard",
     focus_areas: Optional[list[str]] = None,
     sampling_rate: float = 0.5,
     change_threshold: float = 0.15,
+    periodic_capture_seconds: float = 30.0,
     use_gpu: bool = False,
     title: Optional[str] = None,
 ) -> VideoManifest:
     """
     Full pipeline: frames -> audio -> transcription -> diagrams -> KG -> report -> export.
@@ -56,108 +59,191 @@
     dirs = create_video_output_dirs(output_dir, video_name)

     logger.info(f"Processing: {input_path}")
     logger.info(f"Depth: {depth}, Focus: {focus_areas or 'all'}")

+    steps = [
+        "Extract frames",
+        "Extract audio",
+        "Transcribe",
+        "Analyze visuals",
+        "Build knowledge graph",
+        "Extract key points",
+        "Generate report",
+        "Export formats",
+    ]
+    pipeline_bar = tqdm(steps, desc="Pipeline", unit="step", position=0)
+
     # --- Step 1: Extract frames ---
-    logger.info("Extracting video frames...")
-    frames = extract_frames(
-        input_path,
-        sampling_rate=sampling_rate,
-        change_threshold=change_threshold,
-        disable_gpu=not use_gpu,
-    )
-    frame_paths = save_frames(frames, dirs["frames"], "frame")
-    logger.info(f"Extracted {len(frames)} frames")
+    pm.usage.start_step("Frame extraction")
+    pipeline_bar.set_description("Pipeline: extracting frames")
+    existing_frames = sorted(dirs["frames"].glob("frame_*.jpg"))
+    people_removed = 0
+    if existing_frames:
+        frame_paths = existing_frames
+        logger.info(f"Resuming: found {len(frame_paths)} frames on disk, skipping extraction")
+    else:
+        logger.info("Extracting video frames...")
+        frames = extract_frames(
+            input_path,
+            sampling_rate=sampling_rate,
+            change_threshold=change_threshold,
+            periodic_capture_seconds=periodic_capture_seconds,
+            disable_gpu=not use_gpu,
+        )
+        logger.info(f"Extracted {len(frames)} raw frames")
+
+        # Filter out people/webcam frames before saving
+        frames, people_removed = filter_people_frames(frames)
+        frame_paths = save_frames(frames, dirs["frames"], "frame")
+        logger.info(f"Saved {len(frames)} content frames ({people_removed} people frames filtered)")
+    pipeline_bar.update(1)

     # --- Step 2: Extract audio ---
-    logger.info("Extracting audio...")
+    pm.usage.start_step("Audio extraction")
+    pipeline_bar.set_description("Pipeline: extracting audio")
+    audio_path = dirs["root"] / "audio" / f"{video_name}.wav"
     audio_extractor = AudioExtractor()
-    audio_path = audio_extractor.extract_audio(
-        input_path, output_path=dirs["root"] / "audio" / f"{video_name}.wav"
-    )
+    if audio_path.exists():
+        logger.info(f"Resuming: found audio at {audio_path}, skipping extraction")
+    else:
+        logger.info("Extracting audio...")
+        audio_path = audio_extractor.extract_audio(input_path, output_path=audio_path)
     audio_props = audio_extractor.get_audio_properties(audio_path)
+    pipeline_bar.update(1)

     # --- Step 3: Transcribe ---
-    logger.info("Transcribing audio...")
-    transcription = pm.transcribe_audio(audio_path)
-    transcript_text = transcription.get("text", "")
-    segments = transcription.get("segments", [])
-
-    # Save transcript files
-    transcript_data = {
-        "text": transcript_text,
-        "segments": segments,
-        "duration": transcription.get("duration") or audio_props.get("duration"),
-        "language": transcription.get("language"),
-        "provider": transcription.get("provider"),
-        "model": transcription.get("model"),
-    }
+    pm.usage.start_step("Transcription")
+    pipeline_bar.set_description("Pipeline: transcribing audio")
     transcript_json = dirs["transcript"] / "transcript.json"
-    transcript_json.write_text(json.dumps(transcript_data, indent=2))
-
-    transcript_txt = dirs["transcript"] / "transcript.txt"
-    transcript_txt.write_text(transcript_text)
-
-    # SRT
-    transcript_srt = dirs["transcript"] / "transcript.srt"
-    srt_lines = []
-    for i, seg in enumerate(segments):
-        start = seg.get("start", 0)
-        end = seg.get("end", 0)
-        srt_lines.append(str(i + 1))
-        srt_lines.append(
-            f"{_format_srt_time(start)} --> {_format_srt_time(end)}"
-        )
-        srt_lines.append(seg.get("text", "").strip())
-        srt_lines.append("")
-    transcript_srt.write_text("\n".join(srt_lines))
+    if transcript_json.exists():
+        logger.info("Resuming: found transcript on disk, skipping transcription")
+        transcript_data = json.loads(transcript_json.read_text())
+        transcript_text = transcript_data.get("text", "")
+        segments = transcript_data.get("segments", [])
+    else:
+        logger.info("Transcribing audio...")
+        transcription = pm.transcribe_audio(audio_path)
+        transcript_text = transcription.get("text", "")
+        segments = transcription.get("segments", [])
+
+        # Save transcript files
+        transcript_data = {
+            "text": transcript_text,
+            "segments": segments,
+            "duration": transcription.get("duration") or audio_props.get("duration"),
+            "language": transcription.get("language"),
+            "provider": transcription.get("provider"),
+            "model": transcription.get("model"),
+        }
+        transcript_json.write_text(json.dumps(transcript_data, indent=2))
+
+        transcript_txt = dirs["transcript"] / "transcript.txt"
+        transcript_txt.write_text(transcript_text)
+
+        # SRT
+        transcript_srt = dirs["transcript"] / "transcript.srt"
+        srt_lines = []
+        for i, seg in enumerate(segments):
+            start = seg.get("start", 0)
+            end = seg.get("end", 0)
+            srt_lines.append(str(i + 1))
+            srt_lines.append(
+                f"{_format_srt_time(start)} --> {_format_srt_time(end)}"
+            )
+            srt_lines.append(seg.get("text", "").strip())
+            srt_lines.append("")
+        transcript_srt.write_text("\n".join(srt_lines))
+    pipeline_bar.update(1)

     # --- Step 4: Diagram extraction ---
+    pm.usage.start_step("Visual analysis")
+    pipeline_bar.set_description("Pipeline: analyzing visuals")
     diagrams = []
     screen_captures = []
-    if depth != "basic" and (not focus_areas or "diagrams" in focus_areas):
+    existing_diagrams = sorted(dirs["diagrams"].glob("diagram_*.json")) if dirs["diagrams"].exists() else []
+    if existing_diagrams:
+        logger.info(f"Resuming: found {len(existing_diagrams)} diagrams on disk, skipping analysis")
+        from video_processor.models import DiagramResult
+        for dj in existing_diagrams:
+            try:
+                diagrams.append(DiagramResult.model_validate_json(dj.read_text()))
+            except Exception as e:
+                logger.warning(f"Failed to load diagram {dj}: {e}")
+    elif depth != "basic" and (not focus_areas or "diagrams" in focus_areas):
         logger.info("Analyzing visual elements...")
         analyzer = DiagramAnalyzer(provider_manager=pm)
         max_frames = 10 if depth == "standard" else 20
-        subset = frame_paths[:min(max_frames, len(frame_paths))]
+        # Evenly sample across all frames rather than just taking the first N
+        if len(frame_paths) <= max_frames:
+            subset = frame_paths
+        else:
+            step = len(frame_paths) / max_frames
+            subset = [frame_paths[int(i * step)] for i in range(max_frames)]
         diagrams, screen_captures = analyzer.process_frames(
             subset, diagrams_dir=dirs["diagrams"], captures_dir=dirs["captures"]
         )
+    pipeline_bar.update(1)

     # --- Step 5: Knowledge graph ---
-    logger.info("Building knowledge graph...")
-    kg = KnowledgeGraph(provider_manager=pm)
-    kg.process_transcript(transcript_data)
-    if diagrams:
-        diagram_dicts = [d.model_dump() for d in diagrams]
-        kg.process_diagrams(diagram_dicts)
-    kg_path = kg.save(dirs["results"] / "knowledge_graph.json")
+    pm.usage.start_step("Knowledge graph")
+    pipeline_bar.set_description("Pipeline: building knowledge graph")
+    kg_json_path = dirs["results"] / "knowledge_graph.json"
+    if kg_json_path.exists():
+        logger.info("Resuming: found knowledge graph on disk, loading")
+        kg_data = json.loads(kg_json_path.read_text())
+        kg = KnowledgeGraph(provider_manager=pm)
+        kg = KnowledgeGraph.from_dict(kg_data)
+    else:
+        logger.info("Building knowledge graph...")
+        kg = KnowledgeGraph(provider_manager=pm)
+        kg.process_transcript(transcript_data)
+        if diagrams:
+            diagram_dicts = [d.model_dump() for d in diagrams]
+            kg.process_diagrams(diagram_dicts)
+        kg.save(kg_json_path)
+    pipeline_bar.update(1)

     # --- Step 6: Extract key points & action items ---
-    key_points = _extract_key_points(pm, transcript_text)
-    action_items = _extract_action_items(pm, transcript_text)
-
-    # Save structured data
+    pm.usage.start_step("Key points & actions")
+    pipeline_bar.set_description("Pipeline: extracting key points")
     kp_path = dirs["results"] / "key_points.json"
-    kp_path.write_text(json.dumps([kp.model_dump() for kp in key_points], indent=2))
-
     ai_path = dirs["results"] / "action_items.json"
-    ai_path.write_text(json.dumps([ai.model_dump() for ai in action_items], indent=2))
+    if kp_path.exists() and ai_path.exists():
+        logger.info("Resuming: found key points and action items on disk")
+        key_points = [
+            KeyPoint(**item) for item in json.loads(kp_path.read_text())
+        ]
+        action_items = [
+            ActionItem(**item) for item in json.loads(ai_path.read_text())
+        ]
+    else:
+        key_points = _extract_key_points(pm, transcript_text)
+        action_items = _extract_action_items(pm, transcript_text)
+
+        kp_path.write_text(json.dumps([kp.model_dump() for kp in key_points], indent=2))
+        ai_path.write_text(json.dumps([ai.model_dump() for ai in action_items], indent=2))
+    pipeline_bar.update(1)

     # --- Step 7: Generate markdown report ---
-    logger.info("Generating report...")
-    plan_gen = PlanGenerator(provider_manager=pm, knowledge_graph=kg)
+    pm.usage.start_step("Report generation")
+    pipeline_bar.set_description("Pipeline: generating report")
     md_path = dirs["results"] / "analysis.md"
-    plan_gen.generate_markdown(
-        transcript=transcript_data,
-        key_points=[kp.model_dump() for kp in key_points],
-        diagrams=[d.model_dump() for d in diagrams],
-        knowledge_graph=kg.to_dict(),
-        video_title=title,
-        output_path=md_path,
-    )
+    if md_path.exists():
+        logger.info("Resuming: found analysis report on disk, skipping generation")
+    else:
+        logger.info("Generating report...")
+        plan_gen = PlanGenerator(provider_manager=pm, knowledge_graph=kg)
+        plan_gen.generate_markdown(
+            transcript=transcript_data,
+            key_points=[kp.model_dump() for kp in key_points],
+            diagrams=[d.model_dump() for d in diagrams],
+            knowledge_graph=kg.to_dict(),
+            video_title=title,
+            output_path=md_path,
+        )
+    pipeline_bar.update(1)

     # --- Build manifest ---
     elapsed = time.time() - start_time
     manifest = VideoManifest(
         video=VideoMetadata(
@@ -166,11 +252,12 @@
             duration_seconds=audio_props.get("duration"),
         ),
         stats=ProcessingStats(
             start_time=datetime.now().isoformat(),
             duration_seconds=elapsed,
-            frames_extracted=len(frames),
+            frames_extracted=len(frame_paths),
+            people_frames_filtered=people_removed,
             diagrams_detected=len(diagrams),
             screen_captures=len(screen_captures),
             transcript_duration_seconds=audio_props.get("duration"),
             models_used=pm.get_models_used(),
         ),
@@ -187,13 +274,19 @@
         screen_captures=screen_captures,
         frame_paths=[f"frames/{Path(p).name}" for p in frame_paths],
     )

     # --- Step 8: Export all formats ---
-    logger.info("Exporting multi-format outputs...")
+    pm.usage.start_step("Export formats")
+    pipeline_bar.set_description("Pipeline: exporting formats")
     manifest = export_all_formats(output_dir, manifest)

+    pm.usage.end_step()
+    pipeline_bar.update(1)
+    pipeline_bar.set_description("Pipeline: complete")
+    pipeline_bar.close()
+
     # Write manifest
     write_video_manifest(manifest, output_dir)

     logger.info(f"Processing complete in {elapsed:.1f}s: {len(diagrams)} diagrams, "
                 f"{len(screen_captures)} captures, {len(key_points)} key points, "
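The even-sampling arithmetic from Step 4 is worth isolating: stepping the index by `len(frames) / max_frames` spreads the analyzed subset across the whole video instead of clustering at the start. A standalone sketch (the names here are illustrative, not the pipeline's own):

```python
def sample_evenly(items, max_items):
    # Same index arithmetic as the diagram-analysis subset:
    # stride through the list so the picks span the full range
    if len(items) <= max_items:
        return list(items)
    step = len(items) / max_items
    return [items[int(i * step)] for i in range(max_items)]

frames = [f"frame_{i:04d}.jpg" for i in range(100)]
subset = sample_evenly(frames, 10)
print(subset[0], subset[-1])  # frame_0000.jpg frame_0090.jpg
```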
170 except Exception as e:
171 logger.warning(f"Failed to load diagram {dj}: {e}")
172 elif depth != "basic" and (not focus_areas or "diagrams" in focus_areas):
173 logger.info("Analyzing visual elements...")
174 analyzer = DiagramAnalyzer(provider_manager=pm)
175 max_frames = 10 if depth == "standard" else 20
176 # Evenly sample across all frames rather than just taking the first N
177 if len(frame_paths) <= max_frames:
178 subset = frame_paths
179 else:
180 step = len(frame_paths) / max_frames
181 subset = [frame_paths[int(i * step)] for i in range(max_frames)]
182 diagrams, screen_captures = analyzer.process_frames(
183 subset, diagrams_dir=dirs["diagrams"], captures_dir=dirs["captures"]
184 )
185 pipeline_bar.update(1)
186
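The even-sampling logic in step 4 generalizes to a small helper; a sketch using the same index arithmetic (the `even_sample` name is illustrative, not from the codebase):

```python
def even_sample(items: list, max_items: int) -> list:
    """Return up to max_items elements spread evenly across items."""
    if len(items) <= max_items:
        return items
    step = len(items) / max_items  # fractional stride keeps coverage uniform
    return [items[int(i * step)] for i in range(max_items)]
```

Because `int(i * step)` truncates and `i < max_items`, the largest sampled index stays strictly below `len(items)`, so no bounds check is needed.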
187 # --- Step 5: Knowledge graph ---
188 pm.usage.start_step("Knowledge graph")
189 pipeline_bar.set_description("Pipeline: building knowledge graph")
190 kg_json_path = dirs["results"] / "knowledge_graph.json"
191 if kg_json_path.exists():
192 logger.info("Resuming: found knowledge graph on disk, loading")
193 kg_data = json.loads(kg_json_path.read_text())
194 kg = KnowledgeGraph.from_dict(kg_data)
195 kg.provider_manager = pm  # re-attach: from_dict rebuilds the graph without a provider
196 else:
197 logger.info("Building knowledge graph...")
198 kg = KnowledgeGraph(provider_manager=pm)
199 kg.process_transcript(transcript_data)
200 if diagrams:
201 diagram_dicts = [d.model_dump() for d in diagrams]
202 kg.process_diagrams(diagram_dicts)
203 kg.save(kg_json_path)
204 pipeline_bar.update(1)
205
206 # --- Step 6: Extract key points & action items ---
207 pm.usage.start_step("Key points & actions")
208 pipeline_bar.set_description("Pipeline: extracting key points")
209 kp_path = dirs["results"] / "key_points.json"
210 ai_path = dirs["results"] / "action_items.json"
211 if kp_path.exists() and ai_path.exists():
212 logger.info("Resuming: found key points and action items on disk")
213 key_points = [
214 KeyPoint(**item) for item in json.loads(kp_path.read_text())
215 ]
216 action_items = [
217 ActionItem(**item) for item in json.loads(ai_path.read_text())
218 ]
219 else:
220 key_points = _extract_key_points(pm, transcript_text)
221 action_items = _extract_action_items(pm, transcript_text)
222
223 kp_path.write_text(json.dumps([kp.model_dump() for kp in key_points], indent=2))
224 ai_path.write_text(json.dumps([ai.model_dump() for ai in action_items], indent=2))
225 pipeline_bar.update(1)
226
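Each of the steps above repeats the same exists-on-disk-or-compute pattern. One way that pattern could be factored, sketched here as a hypothetical `checkpoint` helper that is not present in the codebase:

```python
import json
from pathlib import Path
from typing import Any, Callable

def checkpoint(
    path: Path,
    compute: Callable[[], Any],
    serialize: Callable[[Any], str] = lambda v: json.dumps(v, indent=2),
    deserialize: Callable[[str], Any] = json.loads,
) -> Any:
    """Load a cached result from disk, or compute it and persist it for resume."""
    if path.exists():
        return deserialize(path.read_text())
    result = compute()
    path.write_text(serialize(result))
    return result
```

On re-run, `compute` is skipped for any step whose output file already exists, which is exactly the resume behavior each step implements inline.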
227 # --- Step 7: Generate markdown report ---
228 pm.usage.start_step("Report generation")
229 pipeline_bar.set_description("Pipeline: generating report")
230 md_path = dirs["results"] / "analysis.md"
231 if md_path.exists():
232 logger.info("Resuming: found analysis report on disk, skipping generation")
233 else:
234 logger.info("Generating report...")
235 plan_gen = PlanGenerator(provider_manager=pm, knowledge_graph=kg)
236 plan_gen.generate_markdown(
237 transcript=transcript_data,
238 key_points=[kp.model_dump() for kp in key_points],
239 diagrams=[d.model_dump() for d in diagrams],
240 knowledge_graph=kg.to_dict(),
241 video_title=title,
242 output_path=md_path,
243 )
244 pipeline_bar.update(1)
245
246 # --- Build manifest ---
247 elapsed = time.time() - start_time
248 manifest = VideoManifest(
249 video=VideoMetadata(
@@ -166,11 +252,12 @@
252 duration_seconds=audio_props.get("duration"),
253 ),
254 stats=ProcessingStats(
255 start_time=datetime.now().isoformat(),
256 duration_seconds=elapsed,
257 frames_extracted=len(frame_paths),
258 people_frames_filtered=people_removed,
259 diagrams_detected=len(diagrams),
260 screen_captures=len(screen_captures),
261 transcript_duration_seconds=audio_props.get("duration"),
262 models_used=pm.get_models_used(),
263 ),
@@ -187,13 +274,19 @@
274 screen_captures=screen_captures,
275 frame_paths=[f"frames/{Path(p).name}" for p in frame_paths],
276 )
277
278 # --- Step 8: Export all formats ---
279 pm.usage.start_step("Export formats")
280 pipeline_bar.set_description("Pipeline: exporting formats")
281 manifest = export_all_formats(output_dir, manifest)
282
283 pm.usage.end_step()
284 pipeline_bar.update(1)
285 pipeline_bar.set_description("Pipeline: complete")
286 pipeline_bar.close()
287
288 # Write manifest
289 write_video_manifest(manifest, output_dir)
290
291 logger.info(f"Processing complete in {elapsed:.1f}s: {len(diagrams)} diagrams, "
292 f"{len(screen_captures)} captures, {len(key_points)} key points, "
293 f"{len(action_items)} action items")
--- a/video_processor/providers/anthropic_provider.py
+++ b/video_processor/providers/anthropic_provider.py
@@ -38,10 +38,14 @@
             model=model,
             messages=messages,
             max_tokens=max_tokens,
             temperature=temperature,
         )
+        self._last_usage = {
+            "input_tokens": getattr(response.usage, "input_tokens", 0),
+            "output_tokens": getattr(response.usage, "output_tokens", 0),
+        }
         return response.content[0].text

     def analyze_image(
         self,
         image_bytes: bytes,
@@ -69,10 +73,14 @@
                     ],
                 }
             ],
             max_tokens=max_tokens,
         )
+        self._last_usage = {
+            "input_tokens": getattr(response.usage, "input_tokens", 0),
+            "output_tokens": getattr(response.usage, "output_tokens", 0),
+        }
         return response.content[0].text

     def transcribe_audio(
         self,
         audio_path: str | Path,
--- a/video_processor/providers/discovery.py
+++ b/video_processor/providers/discovery.py
@@ -56,20 +56,25 @@
             logger.info(f"Discovered {len(models)} Anthropic models")
             all_models.extend(models)
         except Exception as e:
             logger.info(f"Anthropic discovery skipped: {e}")

-    # Gemini
-    if keys.get("gemini"):
+    # Gemini (API key or service account)
+    gemini_key = keys.get("gemini")
+    gemini_creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS", "")
+    if gemini_key or gemini_creds:
         try:
             from video_processor.providers.gemini_provider import GeminiProvider
-            provider = GeminiProvider(api_key=keys["gemini"])
+            provider = GeminiProvider(
+                api_key=gemini_key or None,
+                credentials_path=gemini_creds or None,
+            )
             models = provider.list_models()
             logger.info(f"Discovered {len(models)} Gemini models")
             all_models.extend(models)
         except Exception as e:
-            logger.info(f"Gemini discovery skipped: {e}")
+            logger.warning(f"Gemini discovery failed: {e}")

     # Sort by provider then id
     all_models.sort(key=lambda m: (m.provider, m.id))
     _cached_models = all_models
     logger.info(f"Total discovered models: {len(all_models)}")
--- a/video_processor/providers/gemini_provider.py
+++ b/video_processor/providers/gemini_provider.py
@@ -20,18 +20,43 @@
 class GeminiProvider(BaseProvider):
     """Google Gemini API provider via google-genai SDK."""

     provider_name = "gemini"

-    def __init__(self, api_key: Optional[str] = None):
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        credentials_path: Optional[str] = None,
+    ):
         self.api_key = api_key or os.getenv("GEMINI_API_KEY")
-        if not self.api_key:
-            raise ValueError("GEMINI_API_KEY not set")
+        self.credentials_path = credentials_path or os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+
+        if not self.api_key and not self.credentials_path:
+            raise ValueError(
+                "Neither GEMINI_API_KEY nor GOOGLE_APPLICATION_CREDENTIALS is set"
+            )
+
         try:
             from google import genai
             self._genai = genai
-            self.client = genai.Client(api_key=self.api_key)
+
+            if self.api_key:
+                self.client = genai.Client(api_key=self.api_key)
+            else:
+                # Service account → use Vertex AI mode
+                import json
+
+                with open(self.credentials_path) as f:
+                    sa_info = json.load(f)
+                project = sa_info.get("project_id", "")
+                location = os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1")
+
+                self.client = genai.Client(
+                    vertexai=True,
+                    project=project,
+                    location=location,
+                )
         except ImportError:
             raise ImportError(
                 "google-genai package not installed. "
                 "Install with: pip install google-genai"
             )
@@ -61,10 +86,15 @@
             config=types.GenerateContentConfig(
                 max_output_tokens=max_tokens,
                 temperature=temperature,
             ),
         )
+        um = getattr(response, "usage_metadata", None)
+        self._last_usage = {
+            "input_tokens": getattr(um, "prompt_token_count", 0) if um else 0,
+            "output_tokens": getattr(um, "candidates_token_count", 0) if um else 0,
+        }
         return response.text or ""

     def analyze_image(
         self,
         image_bytes: bytes,
@@ -83,10 +113,15 @@
             ],
             config=types.GenerateContentConfig(
                 max_output_tokens=max_tokens,
             ),
         )
+        um = getattr(response, "usage_metadata", None)
+        self._last_usage = {
+            "input_tokens": getattr(um, "prompt_token_count", 0) if um else 0,
+            "output_tokens": getattr(um, "candidates_token_count", 0) if um else 0,
+        }
         return response.text or ""

     def transcribe_audio(
         self,
         audio_path: str | Path,
@@ -152,13 +187,15 @@
     def list_models(self) -> list[ModelInfo]:
         models = []
         try:
             for m in self.client.models.list():
                 mid = m.name or ""
-                # Strip "models/" prefix if present
-                if mid.startswith("models/"):
-                    mid = mid[7:]
+                # Strip prefix variants from different API modes
+                for prefix in ("models/", "publishers/google/models/"):
+                    if mid.startswith(prefix):
+                        mid = mid[len(prefix):]
+                        break
                 display = getattr(m, "display_name", mid) or mid

                 caps = []
                 mid_lower = mid.lower()
                 if "gemini" in mid_lower:
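The prefix-stripping loop in `list_models` exists because the consumer API and Vertex AI qualify model names differently. The same logic, isolated as a standalone sketch (the helper name is illustrative, not from the codebase):

```python
def strip_model_prefix(model_id: str) -> str:
    """Normalize model ids across Gemini API-key and Vertex AI modes."""
    # The consumer API returns "models/<id>"; Vertex AI returns
    # "publishers/google/models/<id>". Strip whichever prefix matches.
    for prefix in ("models/", "publishers/google/models/"):
        if model_id.startswith(prefix):
            return model_id[len(prefix):]
    return model_id
```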
--- a/video_processor/providers/manager.py
+++ b/video_processor/providers/manager.py
@@ -7,10 +7,11 @@

 from dotenv import load_dotenv

 from video_processor.providers.base import BaseProvider, ModelInfo
 from video_processor.providers.discovery import discover_available_models
+from video_processor.utils.usage_tracker import UsageTracker

 load_dotenv()
 logger = logging.getLogger(__name__)

 # Default model preference rankings (tried in order)
@@ -60,10 +61,11 @@
         auto : if True and no model specified, pick the best available
         """
         self.auto = auto
         self._providers: dict[str, BaseProvider] = {}
         self._available_models: Optional[list[ModelInfo]] = None
+        self.usage = UsageTracker()

         # If a single provider is forced, apply it
         if provider:
             self.vision_model = vision_model or self._default_for_provider(provider, "vision")
             self.chat_model = chat_model or self._default_for_provider(provider, "chat")
@@ -143,10 +145,22 @@
             raise RuntimeError(
                 f"No provider available for capability '{capability}'. "
                 "Set an API key for at least one provider."
             )

+    def _track(self, provider: BaseProvider, prov_name: str, model: str) -> None:
+        """Record usage from the last API call on a provider."""
+        last = getattr(provider, "_last_usage", None)
+        if last:
+            self.usage.record(
+                provider=prov_name,
+                model=model,
+                input_tokens=last.get("input_tokens", 0),
+                output_tokens=last.get("output_tokens", 0),
+            )
+            provider._last_usage = None
+
     # --- Public API ---

     def chat(
         self,
         messages: list[dict],
@@ -155,11 +169,13 @@
     ) -> str:
         """Send a chat completion to the best available provider."""
         prov_name, model = self._resolve_model(self.chat_model, "chat", _CHAT_PREFERENCES)
         logger.info(f"Chat: using {prov_name}/{model}")
         provider = self._get_provider(prov_name)
-        return provider.chat(messages, max_tokens=max_tokens, temperature=temperature, model=model)
+        result = provider.chat(messages, max_tokens=max_tokens, temperature=temperature, model=model)
+        self._track(provider, prov_name, model)
+        return result

     def analyze_image(
         self,
         image_bytes: bytes,
         prompt: str,
@@ -167,24 +183,58 @@
     ) -> str:
         """Analyze an image using the best available vision provider."""
         prov_name, model = self._resolve_model(self.vision_model, "vision", _VISION_PREFERENCES)
         logger.info(f"Vision: using {prov_name}/{model}")
         provider = self._get_provider(prov_name)
-        return provider.analyze_image(image_bytes, prompt, max_tokens=max_tokens, model=model)
+        result = provider.analyze_image(image_bytes, prompt, max_tokens=max_tokens, model=model)
+        self._track(provider, prov_name, model)
+        return result

     def transcribe_audio(
         self,
         audio_path: str | Path,
         language: Optional[str] = None,
     ) -> dict:
-        """Transcribe audio using the best available provider."""
+        """Transcribe audio using local Whisper if available, otherwise API."""
+        # Prefer local Whisper — no file size limits, no API costs
+        if not self.transcription_model or self.transcription_model.startswith("whisper-local"):
+            try:
+                from video_processor.providers.whisper_local import WhisperLocal
+
+                if WhisperLocal.is_available():
+                    # Parse model size from "whisper-local:large" or default to "large"
+                    size = "large"
+                    if self.transcription_model and ":" in self.transcription_model:
+                        size = self.transcription_model.split(":", 1)[1]
+                    if not hasattr(self, "_whisper_local"):
+                        self._whisper_local = WhisperLocal(model_size=size)
+                    logger.info(f"Transcription: using local whisper-{size}")
+                    result = self._whisper_local.transcribe(audio_path, language=language)
+                    duration = result.get("duration") or 0
+                    self.usage.record(
+                        provider="local",
+                        model=f"whisper-{size}",
+                        audio_minutes=duration / 60 if duration else 0,
+                    )
+                    return result
+            except ImportError:
+                pass
+
+        # Fall back to API-based transcription
         prov_name, model = self._resolve_model(
             self.transcription_model, "audio", _TRANSCRIPTION_PREFERENCES
         )
         logger.info(f"Transcription: using {prov_name}/{model}")
         provider = self._get_provider(prov_name)
-        return provider.transcribe_audio(audio_path, language=language, model=model)
+        result = provider.transcribe_audio(audio_path, language=language, model=model)
+        duration = result.get("duration") or 0
+        self.usage.record(
+            provider=prov_name,
+            model=model,
+            audio_minutes=duration / 60 if duration else 0,
+        )
+        return result

     def get_models_used(self) -> dict[str, str]:
         """Return a dict mapping capability to 'provider/model' for tracking."""
         result = {}
         for cap, explicit, prefs in [
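The `UsageTracker` imported by the manager isn't shown in this commit. A minimal sketch of the interface the manager relies on (`record`, `start_step`, `end_step`); this is an assumption about the shape of `utils/usage_tracker`, not its actual implementation:

```python
from collections import defaultdict

class UsageTracker:
    """Accumulate per-model token and audio-minute usage (sketch)."""

    def __init__(self):
        self.current_step = None
        self.totals = defaultdict(
            lambda: {"input_tokens": 0, "output_tokens": 0, "audio_minutes": 0.0}
        )

    def start_step(self, name):
        # The pipeline labels each stage before making API calls
        self.current_step = name

    def end_step(self):
        self.current_step = None

    def record(self, provider, model, input_tokens=0, output_tokens=0, audio_minutes=0.0):
        # Keyed by "provider/model" to match get_models_used() reporting
        t = self.totals[f"{provider}/{model}"]
        t["input_tokens"] += input_tokens
        t["output_tokens"] += output_tokens
        t["audio_minutes"] += audio_minutes
```

The `_track` helper in the diff feeds this from each provider's `_last_usage` dict, so per-call token counts aggregate by model across the whole run.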
--- video_processor/providers/openai_provider.py
+++ video_processor/providers/openai_provider.py
@@ -42,10 +42,14 @@
             model=model,
             messages=messages,
             max_tokens=max_tokens,
             temperature=temperature,
         )
+        self._last_usage = {
+            "input_tokens": getattr(response.usage, "prompt_tokens", 0) if response.usage else 0,
+            "output_tokens": getattr(response.usage, "completion_tokens", 0) if response.usage else 0,
+        }
         return response.choices[0].message.content or ""

     def analyze_image(
         self,
         image_bytes: bytes,
@@ -69,19 +73,42 @@
                 ],
             }
         ],
             max_tokens=max_tokens,
         )
+        self._last_usage = {
+            "input_tokens": getattr(response.usage, "prompt_tokens", 0) if response.usage else 0,
+            "output_tokens": getattr(response.usage, "completion_tokens", 0) if response.usage else 0,
+        }
         return response.choices[0].message.content or ""

+    # Whisper API limit is 25MB
+    _MAX_FILE_SIZE = 25 * 1024 * 1024
+
     def transcribe_audio(
         self,
         audio_path: str | Path,
         language: Optional[str] = None,
         model: Optional[str] = None,
     ) -> dict:
         model = model or "whisper-1"
+        audio_path = Path(audio_path)
+        file_size = audio_path.stat().st_size
+
+        if file_size <= self._MAX_FILE_SIZE:
+            return self._transcribe_single(audio_path, language, model)
+
+        # File too large — split into chunks and transcribe each
+        logger.info(
+            f"Audio file {file_size / 1024 / 1024:.1f}MB exceeds Whisper 25MB limit, chunking..."
+        )
+        return self._transcribe_chunked(audio_path, language, model)
+
+    def _transcribe_single(
+        self, audio_path: Path, language: Optional[str], model: str
+    ) -> dict:
+        """Transcribe a single audio file."""
         with open(audio_path, "rb") as f:
             kwargs = {"model": model, "file": f}
             if language:
                 kwargs["language"] = language
             response = self.client.audio.transcriptions.create(
@@ -100,10 +127,66 @@
             "language": getattr(response, "language", language),
             "duration": getattr(response, "duration", None),
             "provider": "openai",
             "model": model,
         }
+
+    def _transcribe_chunked(
+        self, audio_path: Path, language: Optional[str], model: str
+    ) -> dict:
+        """Split audio into chunks under 25MB and transcribe each."""
+        import tempfile
+        from video_processor.extractors.audio_extractor import AudioExtractor
+
+        extractor = AudioExtractor()
+        audio_data, sr = extractor.load_audio(audio_path)
+        total_duration = len(audio_data) / sr
+
+        # Calculate chunk duration to stay under 25MB
+        # WAV: 16-bit mono = 2 bytes/sample, plus header overhead
+        bytes_per_second = sr * 2
+        max_seconds = self._MAX_FILE_SIZE // bytes_per_second
+        # Use 80% of max to leave headroom
+        chunk_ms = int(max_seconds * 0.8 * 1000)
+
+        segments_data = extractor.segment_audio(audio_data, sr, segment_length_ms=chunk_ms)
+        logger.info(f"Split into {len(segments_data)} chunks of ~{chunk_ms / 1000:.0f}s each")
+
+        all_text = []
+        all_segments = []
+        time_offset = 0.0
+        detected_language = language
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            for i, chunk in enumerate(segments_data):
+                chunk_path = Path(tmpdir) / f"chunk_{i:03d}.wav"
+                extractor.save_segment(chunk, chunk_path, sr)
+
+                logger.info(f"Transcribing chunk {i + 1}/{len(segments_data)}...")
+                result = self._transcribe_single(chunk_path, language, model)
+
+                all_text.append(result["text"])
+                for seg in result.get("segments", []):
+                    all_segments.append({
+                        "start": seg["start"] + time_offset,
+                        "end": seg["end"] + time_offset,
+                        "text": seg["text"],
+                    })
+
+                if not detected_language and result.get("language"):
+                    detected_language = result["language"]
+
+                time_offset += len(chunk) / sr
+
+        return {
+            "text": " ".join(all_text),
+            "segments": all_segments,
+            "language": detected_language,
+            "duration": total_duration,
+            "provider": "openai",
+            "model": model,
+        }

     def list_models(self) -> list[ModelInfo]:
         models = []
         try:
             for m in self.client.models.list():

ADDED video_processor/providers/whisper_local.py
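The chunk-size arithmetic in `_transcribe_chunked` can be checked standalone. The helper names below are local to this sketch; only the 25MB limit, the 16-bit mono WAV assumption, and the 80% headroom factor come from the diff above.

```python
MAX_FILE_SIZE = 25 * 1024 * 1024  # Whisper API upload limit


def chunk_length_ms(sample_rate: int) -> int:
    """Longest chunk (in ms) that stays safely under the upload limit."""
    # 16-bit mono PCM: 2 bytes per sample
    bytes_per_second = sample_rate * 2
    max_seconds = MAX_FILE_SIZE // bytes_per_second
    # Keep 20% headroom for the WAV header and safety margin
    return int(max_seconds * 0.8 * 1000)


def chunk_payload_bytes(sample_rate: int) -> int:
    """Approximate PCM payload size of one chunk at this sample rate."""
    return (chunk_length_ms(sample_rate) // 1000) * sample_rate * 2
```

At 16kHz mono this yields chunks of roughly 655 seconds (about 21MB of PCM), comfortably under the 25MB ceiling even after the WAV header is added.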
--- a/video_processor/providers/whisper_local.py
+++ b/video_processor/providers/whisper_local.py
@@ -0,0 +1,76 @@
+"""Local Whisper transcription provider — runs on-device with GPU acceleration."""
+
+import logging
+from pathlib import Path
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+# Model size → approximate VRAM/RAM usage
+_MODEL_SIZES = {
+    "tiny": "~1GB",
+    "base": "~1GB",
+    "small": "~2GB",
+    "medium": "~5GB",
+    "large": "~10GB",
+    "turbo": "~6GB",
+}
+
+
+class WhisperLocal:
+    """
+    Local Whisper transcription using openai-whisper.
+
+    Uses MPS (Apple Silicon) or CUDA when available, falls back to CPU.
+    No file size limits — processes audio directly on device.
+    """
+
+    def __init__(self, model_size: str = "large", device: Optional[str] = None):
+        """
+        Initialize local Whisper.
+
+        Parameters
+        ----------
+        model_size : str
+            Whisper model size: tiny, base, small, medium, large, turbo
+        device : str, optional
+            Force device: 'mps', 'cuda', 'cpu'. Auto-detects if None.
+        """
+        self.model_size = model_size
+        self._model = None
+
+        if device:
+            self.device = device
+        else:
+            self.device = self._detect_device()
+
+        logger.info(
+            f"WhisperLocal: model={model_size} ({_MODEL_SIZES.get(model_size, '?')}), "
+            f"device={self.device}"
+        )
+
+    @staticmethod
+    def _detect_device() -> str:
+        """Auto-detect the best available device."""
+        try:
+            import torch
+
+            if torch.cuda.is_available():
+                return "cuda"
+            if torch.backends.mps.is_available():
+                return "mps"
+        except ImportError:
+            pass
+        return "cpu"
+
+    def _load_model(self):
+        """Lazy-load the Whisper model."""
+        if self._model is not None:
+            return
+
+        try:
+            import whisper
+        except ImportError:
+            raise ImportError("openai-whisper not installed. Run: pip install openai-whisper")
+
+        logger.info(f"Loading Whisper {self.model_size} model on {self.device}...")
+
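The device-priority order in `_detect_device` (CUDA, then MPS, then CPU) can be exercised as a standalone function. This is a sketch of the same logic outside the class; like the original, it degrades to CPU when torch is not installed.

```python
def detect_device() -> str:
    # Same priority as WhisperLocal._detect_device: CUDA > MPS > CPU.
    # ImportError is caught so machines without torch still get "cpu".
    try:
        import torch

        if torch.cuda.is_available():
            return "cuda"
        if torch.backends.mps.is_available():
            return "mps"
    except ImportError:
        pass
    return "cpu"
```

Checking CUDA before MPS is safe because no machine exposes both; on Apple Silicon `torch.cuda.is_available()` is simply False and the MPS branch is reached.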
--- video_processor/sources/base.py
+++ video_processor/sources/base.py
@@ -50,17 +50,19 @@
     def download_all(
         self,
         files: List[SourceFile],
         destination_dir: Path,
     ) -> List[Path]:
-        """Download multiple files to a directory."""
+        """Download multiple files to a directory, preserving subfolder structure."""
         destination_dir.mkdir(parents=True, exist_ok=True)
         paths = []
         for f in files:
-            dest = destination_dir / f.name
+            # Use path (with subfolder) if available, otherwise just name
+            relative = f.path if f.path else f.name
+            dest = destination_dir / relative
             try:
                 local_path = self.download(f, dest)
                 paths.append(local_path)
-                logger.info(f"Downloaded: {f.name}")
+                logger.info(f"Downloaded: {relative}")
             except Exception as e:
-                logger.error(f"Failed to download {f.name}: {e}")
+                logger.error(f"Failed to download {relative}: {e}")
         return paths
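The destination-resolution change in `download_all` reduces to one Path join. The helper below is hypothetical, written only to show how a relative path with subfolders maps into the destination directory; `pathlib` handles the nesting for free.

```python
from pathlib import Path
from typing import Optional


def resolve_destination(destination_dir: str, name: str, path: Optional[str] = None) -> Path:
    # Mirrors download_all: prefer the subfolder-preserving path when present,
    # otherwise fall back to the bare filename.
    relative = path if path else name
    return Path(destination_dir) / relative
```

Joining `Path("downloads") / "week1/a.mp4"` yields `downloads/week1/a.mp4`, so callers only need a `mkdir(parents=True)` on the parent before writing.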
--- video_processor/sources/google_drive.py
+++ video_processor/sources/google_drive.py
@@ -187,31 +187,81 @@
     def list_videos(
         self,
         folder_id: Optional[str] = None,
         folder_path: Optional[str] = None,
         patterns: Optional[List[str]] = None,
+        recursive: bool = True,
     ) -> List[SourceFile]:
-        """List video files in a Google Drive folder."""
+        """
+        List video files in a Google Drive folder.
+
+        Parameters
+        ----------
+        folder_id : str, optional
+            Google Drive folder ID.
+        folder_path : str, optional
+            Not used for Google Drive (folder_id is canonical).
+        patterns : list[str], optional
+            File extension patterns like ['*.mp4', '*.mkv'].
+        recursive : bool
+            If True, recurse into subfolders (default: True).
+        """
         if not self.service:
             raise RuntimeError("Not authenticated. Call authenticate() first.")

-        # Build query
+        files: List[SourceFile] = []
+        self._list_folder(
+            folder_id=folder_id,
+            prefix="",
+            patterns=patterns,
+            recursive=recursive,
+            out=files,
+        )
+
+        logger.info(f"Found {len(files)} videos in Google Drive")
+        return files
+
+    def _list_folder(
+        self,
+        folder_id: Optional[str],
+        prefix: str,
+        patterns: Optional[List[str]],
+        recursive: bool,
+        out: List[SourceFile],
+    ) -> None:
+        """List videos in a single folder, optionally recursing into subfolders."""
+        # List video files
+        self._list_files_in_folder(folder_id, prefix, patterns, out)
+
+        # Recurse into subfolders
+        if recursive:
+            subfolders = self._list_subfolders(folder_id)
+            for sf_id, sf_name in subfolders:
+                sub_prefix = f"{prefix}{sf_name}/" if prefix else f"{sf_name}/"
+                logger.debug(f"Recursing into subfolder: {sub_prefix}")
+                self._list_folder(sf_id, sub_prefix, patterns, recursive, out)
+
+    def _list_files_in_folder(
+        self,
+        folder_id: Optional[str],
+        prefix: str,
+        patterns: Optional[List[str]],
+        out: List[SourceFile],
+    ) -> None:
+        """List video files in a single folder (non-recursive)."""
         query_parts = []

         if folder_id:
             query_parts.append(f"'{folder_id}' in parents")

-        # Filter for video MIME types
         mime_conditions = " or ".join(
             f"mimeType='{mt}'" for mt in VIDEO_MIME_TYPES
         )
         query_parts.append(f"({mime_conditions})")
         query_parts.append("trashed=false")

         query = " and ".join(query_parts)
-
-        files = []
         page_token = None

         while True:
             response = (
                 self.service.files()
@@ -224,34 +274,65 @@
                 )
                 .execute()
             )

             for f in response.get("files", []):
-                # Apply pattern filtering if specified
-                if patterns:
-                    name = f.get("name", "")
-                    if not any(
-                        name.endswith(p.replace("*", "")) for p in patterns
-                    ):
-                        continue
-
-                files.append(
+                name = f.get("name", "")
+                if patterns and not any(
+                    name.endswith(p.replace("*", "")) for p in patterns
+                ):
+                    continue
+
+                out.append(
                     SourceFile(
-                        name=f["name"],
+                        name=name,
                         id=f["id"],
                         size_bytes=int(f.get("size", 0)) if f.get("size") else None,
                         mime_type=f.get("mimeType"),
                         modified_at=f.get("modifiedTime"),
+                        path=f"{prefix}{name}" if prefix else name,
                     )
                 )

             page_token = response.get("nextPageToken")
             if not page_token:
                 break

-        logger.info(f"Found {len(files)} videos in Google Drive")
-        return files
+    def _list_subfolders(self, parent_id: Optional[str]) -> List[tuple]:
+        """List immediate subfolders of a folder. Returns list of (id, name)."""
+        query_parts = [
+            "mimeType='application/vnd.google-apps.folder'",
+            "trashed=false",
+        ]
+        if parent_id:
+            query_parts.append(f"'{parent_id}' in parents")
+
+        query = " and ".join(query_parts)
+        subfolders = []
+        page_token = None
+
+        while True:
+            response = (
+                self.service.files()
+                .list(
+                    q=query,
+                    spaces="drive",
+                    fields="nextPageToken, files(id, name)",
+                    pageToken=page_token,
+                    pageSize=100,
+                )
+                .execute()
+            )
+
+            for f in response.get("files", []):
+                subfolders.append((f["id"], f["name"]))
+
+            page_token = response.get("nextPageToken")
+            if not page_token:
+                break
+
+        return sorted(subfolders, key=lambda x: x[1])

     def download(self, file: SourceFile, destination: Path) -> Path:
         """Download a file from Google Drive."""
         if not self.service:
             raise RuntimeError("Not authenticated. Call authenticate() first.")

ADDED video_processor/utils/__init__.py
ADDED video_processor/utils/api_cache.py

No diff available

--- a/video_processor/utils/api_cache.py
+++ b/video_processor/utils/api_cache.py
@@ -0,0 +1,163 @@
+"""Caching system for API responses to reduce API calls and costs."""
+
+import hashlib
+import json
+import logging
+import os
+import time
+from pathlib import Path
+from typing import Any, Optional, Union
+
+logger = logging.getLogger(__name__)
+
+
+class APICache:
+    """Disk-based API response cache."""
+
+    def __init__(
+        self,
+        cache_dir: Union[str, Path],
+        namespace: str,
+        ttl: int = 86400,  # 24 hours in seconds
+    ):
+        """
+        Parameters
+        ----------
+        cache_dir : str or Path
+            Directory for cache files
+        namespace : str
+            Cache namespace for organizing cache files
+        ttl : int
+            Time-to-live for cache entries in seconds
+        """
+        self.cache_dir = Path(cache_dir)
+        self.namespace = namespace
+        self.ttl = ttl
+
+        # Ensure namespace directory exists
+        self.namespace_dir = self.cache_dir / namespace
+        self.namespace_dir.mkdir(parents=True, exist_ok=True)
+
+        logger.debug(f"Initialized API cache in {self.namespace_dir}")
+
+    def get_cache_path(self, key: str) -> Path:
+        """
+        Get path to cache file for key.
+
+        Returns
+        -------
+        Path
+            Path to cache file
+        """
+        # Hash the key to ensure valid filename
+        hashed_key = hashlib.md5(key.encode()).hexdigest()
+        return self.namespace_dir / f"{hashed_key}.json"
+
+    def get(self, key: str) -> Optional[Any]:
+        """
+        Get value from cache.
+
+        Returns
+        -------
+        object or None
+            Cached value if available and not expired, None otherwise
+        """
+        cache_path = self.get_cache_path(key)
+
+        # Check if cache file exists
+        if not cache_path.exists():
+            return None
+
+        try:
+            # Read cache file
+            with open(cache_path, "r", encoding="utf-8") as f:
+                cache_data = json.load(f)
+
+            # Check if cache entry is expired
+            timestamp = cache_data.get("timestamp", 0)
+            now = time.time()
+
+            if now - timestamp > self.ttl:
+                logger.debug(f"Cache entry expired for key: {key}")
+                return None
+
+            return cache_data.get("value")
+        except Exception as e:
+            logger.warning(f"Failed to read cache entry: {e}")
+            return None
+
+    def set(self, key: str, value: Any) -> bool:
+        """Store value in cache. Returns True on success, False otherwise."""
+        cache_path = self.get_cache_path(key)
+        cache_data = {
+            "timestamp": time.time(),
+            "value": value,
+        }
+        try:
+            with open(cache_path, "w", encoding="utf-8") as f:
+                json.dump(cache_data, f)
+            return True
+        except Exception as e:
+            logger.warning(f"Failed to write cache entry: {e}")
+            return False
+
+    def clear(self, older_than: Optional[int] = None) -> int:
+        """
+        Clear all cache entries or entries older than specified time.
+
+        Parameters
+        ----------
+        older_than : int, optional
+            Clear entries older than this many seconds
+
+        Returns
+        -------
+        int
+            Number of entries cleared
+        """
+        count = 0
+        now = time.time()
+        for cache_file in self.namespace_dir.glob("*.json"):
+            try:
+                # Check file age if criteria provided
+                if older_than is not None:
+                    file_age = now - os.path.getmtime(cache_file)
+                    if file_age <= older_than:
+                        continue
+                # Remove file
+                os.remove(cache_file)
+                count += 1
+            except OSError as e:
+                logger.warning(f"Failed to remove cache file: {e}")
+        logger.info(f"Cleared {count} cache entries from {self.namespace}")
+        return count
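The cache derives filenames by MD5-hashing the key, so arbitrary strings (prompts, URLs) map to stable, filesystem-safe names. A minimal sketch of just that mapping, with a hypothetical free function in place of the class method:

```python
import hashlib
from pathlib import Path


def cache_path(namespace_dir: str, key: str) -> Path:
    # Hash the key so long or slash-containing keys become valid filenames;
    # MD5 is fine here because the hash is an index, not a security boundary.
    hashed = hashlib.md5(key.encode()).hexdigest()
    return Path(namespace_dir) / f"{hashed}.json"
```

Because the mapping is deterministic, a re-run of the pipeline with an identical prompt hits the same file and skips the API call.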
102 "value": value
103 }
104 ensure valid filename
105 hashed_key = hashlib.md5(key.encode()).hexdigest()
106 return self.namespace_dir / f"{hashed_key}.json"
107
108 def get(self, ke y: str) -> Optional[Any]:
109 """
110 Get value from cache.
111
112
113 except E"""Caching system for Am for API responses to redu -------
114 object or None
115 Cached value if available and not expired, No ne otherwise
116 """
117 cache_path = self.get_cache_path(key)
118
119 der than this Get path to cache file fo try:
120 # Read cache file
121 with open(cache_path, "r", encoding="utf he_path, "r", encoding="utf-8") as f:
122 cache_data = json.load(f)
123
124 # Check if cache entry is expired
125 timestamp = cache_data.get("timestamp", 0)
126 now = time.time()
127
128 if now - timestamp > self.ttl:
129 logger.deb
130 return False
131
132 def clear(self, older_than: Optional[int] = None) -> int:
133 """
134 Clear all cache entries or entries older than specified time.
135 ne otherwise
136 older_than : int, optional
137 Clear entries older than this many seconds
138 der than this many seconds
139
140 Returns
141 -------
142 int
143 Number of entries cleared
144 """
145 count = 0 self.
146 for cache_file in self.namespace_dir_file in self.namespace_dir.glob("*.json"):
147 try:
148 # Check file age if criteria provided
149 if older_than is not None:
150 file_age = now - os.path.getmtime(cache_file)
151 if file_age <= older_than:
152 p", 0)
153 # Remove file
154 os.remove(cache_file)
155 count += 1
156 e.time()
157
158 if now - timestamp > self.ttl:
159 logger logger.warning
160 xists
161 logger.info(f"Cleared {count} cache entries from {self.namespace}")
162 return count
163 ce}"
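The cache above keys entries by an MD5 hash of the request and expires them after `ttl` seconds. A minimal standalone sketch of that round-trip (it re-implements the hash/TTL logic inline rather than importing the module, and the key strings are illustrative):

```python
import hashlib
import json
import tempfile
import time
from pathlib import Path

cache_dir = Path(tempfile.mkdtemp()) / "anthropic"  # namespace dir, as in the diff
cache_dir.mkdir(parents=True, exist_ok=True)
TTL = 86400  # 24 hours, the default in the diff

def cache_path(key: str) -> Path:
    # Hash the key so arbitrary prompt text becomes a valid filename
    return cache_dir / f"{hashlib.md5(key.encode()).hexdigest()}.json"

def put(key: str, value) -> None:
    payload = {"timestamp": time.time(), "value": value}
    cache_path(key).write_text(json.dumps(payload), encoding="utf-8")

def get(key: str):
    p = cache_path(key)
    if not p.exists():
        return None
    data = json.loads(p.read_text(encoding="utf-8"))
    if time.time() - data.get("timestamp", 0) > TTL:
        return None  # expired
    return data.get("value")

put("summarize:segment-3", {"summary": "ok"})
print(get("summarize:segment-3"))  # {'summary': 'ok'}
print(get("never-cached"))         # None
```

Hashing the key also means cache hits survive re-runs as long as the prompt text is byte-identical, which is what makes checkpoint/resume cheap.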
--- video_processor/utils/export.py
+++ video_processor/utils/export.py
@@ -2,10 +2,12 @@
 
 import json
 import logging
 from pathlib import Path
 from typing import Optional
+
+from tqdm import tqdm
 
 from video_processor.models import DiagramResult, VideoManifest
 from video_processor.utils.rendering import render_mermaid, reproduce_chart
 
 logger = logging.getLogger(__name__)
@@ -153,11 +155,11 @@
     Updates manifest with output file paths and returns it.
     """
     output_dir = Path(output_dir)
 
     # Render mermaid diagrams to SVG/PNG
-    for i, diagram in enumerate(manifest.diagrams):
+    for i, diagram in enumerate(tqdm(manifest.diagrams, desc="Rendering diagrams", unit="diag") if manifest.diagrams else []):
         if diagram.mermaid:
             diagrams_dir = output_dir / "diagrams"
             prefix = f"diagram_{i}"
             paths = render_mermaid(diagram.mermaid, diagrams_dir, prefix)
             if "svg" in paths:

ADDED video_processor/utils/usage_tracker.py
ADDED video_processor/utils/visualization.py
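The export change wraps the diagram loop in `tqdm` while guarding against an empty list. The wrapper is a plain iterator passthrough, so `enumerate` indices are unchanged; a dependency-free sketch of the same pattern, with `fake_tqdm` as a hypothetical no-op stand-in for `tqdm`:

```python
def fake_tqdm(iterable, desc="", unit=""):
    # Stand-in for tqdm: yields items unchanged, so enumerate indices are preserved
    for item in iterable:
        yield item

diagrams = ["flowchart TD", "sequenceDiagram", None]  # illustrative mermaid sources
rendered = []
for i, mermaid in enumerate(fake_tqdm(diagrams, desc="Rendering diagrams", unit="diag") if diagrams else []):
    if mermaid:
        rendered.append(f"diagram_{i}.svg")

print(rendered)  # ['diagram_0.svg', 'diagram_1.svg']
```

The `if ... else []` guard matters because wrapping an empty list in `tqdm` would still draw a zero-length progress bar.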
--- a/video_processor/utils/usage_tracker.py
+++ b/video_processor/utils/usage_tracker.py
@@ -0,0 +1,153 @@
+"""Usage tracking and cost estimation for API calls."""
+
+import time
+from dataclasses import dataclass, field
+from typing import Optional
+
+
+# Cost per million tokens (USD) — updated Feb 2025
+_MODEL_PRICING = {
+    # Anthropic
+    "claude-sonnet-4-5-20250929": {"input": 3.00, "output": 15.00},
+    "claude-haiku-3-5-20241022": {"input": 0.80, "output": 4.00},
+    # OpenAI
+    "gpt-4o": {"input": 2.50, "output": 10.00},
+    "gpt-4o-mini": {"input": 0.15, "output": 0.60},
+    "gpt-4.1": {"input": 2.00, "output": 8.00},
+    "gpt-4.1-mini": {"input": 0.40, "output": 1.60},
+    "gpt-4.1-nano": {"input": 0.10, "output": 0.40},
+    # Google Gemini
+    "gemini-2.5-flash": {"input": 0.15, "output": 0.60},
+    "gemini-2.5-pro": {"input": 1.25, "output": 10.00},
+    "gemini-2.0-flash": {"input": 0.10, "output": 0.40},
+    # Whisper
+    "whisper-1": {"per_minute": 0.006},
+}
+
+
+@dataclass
+class ModelUsage:
+    """Accumulated usage for a single model."""
+
+    provider: str = ""
+    model: str = ""
+    calls: int = 0
+    input_tokens: int = 0
+    output_tokens: int = 0
+    audio_minutes: float = 0.0
+
+    @property
+    def total_tokens(self) -> int:
+        return self.input_tokens + self.output_tokens
+
+    @property
+    def estimated_cost(self) -> float:
+        pricing = _MODEL_PRICING.get(self.model)
+        if not pricing:
+            # Try partial match
+            for key, p in _MODEL_PRICING.items():
+                if key in self.model or self.model in key:
+                    pricing = p
+                    break
+        if not pricing:
+            return 0.0
+        if "per_minute" in pricing:
+            return self.audio_minutes * pricing["per_minute"]
+        return (
+            self.input_tokens * pricing.get("input", 0) / 1_000_000
+            + self.output_tokens * pricing.get("output", 0) / 1_000_000
+        )
+
+
+@dataclass
+class StepTiming:
+    """Timing for a single pipeline step."""
+
+    name: str = ""
+    seconds: float = 0.0
+
+
+class UsageTracker:
+    """Aggregates per-model usage and renders a summary report."""
+
+    def __init__(self) -> None:
+        self._models: dict = {}
+
+    @property
+    def total_input_tokens(self) -> int:
+        return sum(u.input_tokens for u in self._models.values())
+
+    @property
+    def total_output_tokens(self) -> int:
+        return sum(u.output_tokens for u in self._models.values())
+
+    @property
+    def total_tokens(self) -> int:
+        return self.total_input_tokens + self.total_output_tokens
+
+    @property
+    def total_cost(self) -> float:
+        return sum(u.estimated_cost for u in self._models.values())
+
+    def summary(self) -> str:
+        lines = ["=" * 60, " API Usage Summary"]
+        lines.append(
+            f" Total tokens: {self.total_tokens:,} "
+            f"({self.total_input_tokens:,} in / {self.total_output_tokens:,} out)"
+        )
+        lines.append("")
+        lines.append(f"  {'Model':<35} {'Calls':>6} {'In Tok':>8} {'Out Tok':>8} {'Cost':>8}")
+        lines.append(f"  {'-'*35} {'-'*6} {'-'*8} {'-'*8} {'-'*8}")
+        for key in sorted(self._models.keys()):
+            u = self._models[key]
+            cost_str = f"${u.estimated_cost:.4f}" if u.estimated_cost > 0 else "free"
+            if u.audio_minutes > 0:
+                lines.append(
+                    f"  {key:<35} {u.calls:>6} {u.audio_minutes:>7.1f}m {'-':>8} {cost_str:>8}"
+                )
+            else:
+                lines.append(
+                    f"  {key:<35} {u.calls:>6} {u.input_tokens:>8,} {u.output_tokens:>8,} {cost_str:>8}"
+                )
+
+        lines.append(f"\n  Estimated total cost: ${self.total_cost:.4f}")
+
+        lines.append("=" * 60)
+        return "\n".join(lines)
+
+
+def _fmt_duration(seconds: float) -> str:
+    """Format seconds as human-readable duration."""
+    if seconds < 60:
+        return f"{seconds:.1f}s"
+    m = int(seconds // 60)
+    s = seconds % 60
+    if m < 60:
+        return f"{m}m {s:.0f}s"
+    h = m // 60
+    m = m % 60
+    return f"{h}h {m}m {s:.0f}s"
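Cost estimation multiplies token counts by per-million-token rates, with a substring fallback for dated model names and per-minute pricing for Whisper. A self-contained sketch of that lookup (two rates copied from the table above; `estimate` is an illustrative free function, not the dataclass property):

```python
_MODEL_PRICING = {
    # Two representative rows from the pricing table (USD per 1M tokens)
    "gpt-4o": {"input": 2.50, "output": 10.00},
    "whisper-1": {"per_minute": 0.006},
}

def estimate(model: str, input_tokens: int = 0, output_tokens: int = 0,
             audio_minutes: float = 0.0) -> float:
    # Exact lookup first, then the same substring fallback the property uses
    pricing = _MODEL_PRICING.get(model)
    if not pricing:
        for key, p in _MODEL_PRICING.items():
            if key in model or model in key:
                pricing = p
                break
    if not pricing:
        return 0.0
    if "per_minute" in pricing:
        return audio_minutes * pricing["per_minute"]
    return (input_tokens * pricing.get("input", 0) / 1_000_000
            + output_tokens * pricing.get("output", 0) / 1_000_000)

# A dated model name still resolves via the "gpt-4o" substring fallback
print(round(estimate("gpt-4o-2024-08-06", 120_000, 8_000), 4))  # 0.38
print(round(estimate("whisper-1", audio_minutes=90), 4))        # 0.54
```

The fallback keeps cost reporting working when providers append date suffixes to model names, at the risk of matching the wrong row if two keys overlap.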
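`_fmt_duration` buckets seconds into `s`, `m s`, and `h m s` forms; a standalone copy of the same logic for quick verification:

```python
def fmt_duration(seconds: float) -> str:
    # Same bucketing as _fmt_duration in the diff
    if seconds < 60:
        return f"{seconds:.1f}s"
    m = int(seconds // 60)
    s = seconds % 60
    if m < 60:
        return f"{m}m {s:.0f}s"
    h = m // 60
    m = m % 60
    return f"{h}h {m}m {s:.0f}s"

print(fmt_duration(42.5))   # 42.5s
print(fmt_duration(125))    # 2m 5s
print(fmt_duration(3725))   # 1h 2m 5s
```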
