PlanOpticon
Merge pull request #91 from ConflictHQ/feat/sqlite-migration feat: PlanOpticon v0.4.0 — planning agent, companion REPL, 20+ sources, SQLite
Commit
0981a082d9bd09a808b2fa13ef2ab23b8b463b0ddc0631c05429fb5c904577f7
Parent
f118ba0ec2eacc7…
113 files changed
+71
+1
-1
+105
-33
+83
-6
+41
-12
+561
-11
+47
-4
+80
+174
-1
+24
-4
+406
+27
-3
+413
-11
+627
+114
+309
+114
+252
+162
+224
+10
-25
+56
-42
+422
+275
+433
-2
+359
+286
-33
+1572
+286
+198
+339
+159
+98
+1
+33
+94
+65
+99
+76
+93
+420
+70
+74
+95
+68
+76
+315
+183
+485
+1480
-47
+414
+209
+1
+500
+49
+2
-2
+31
-4
+381
-201
+155
-13
+193
+70
-1
+4
-4
+54
-6
+23
+56
+88
+133
+77
+36
-2
+98
+36
-9
+38
+171
-2
+193
+20
+123
+56
-53
+24
+14
-1
+187
+171
+53
-56
+167
+10
-1
+12
-3
+138
+20
+226
+20
+59
-2
+178
+117
+156
+170
+268
+112
+200
+310
+280
+380
+178
+222
+119
+103
+114
+76
+375
+129
+90
+118
+399
+57
+200
~
CHANGELOG.md
~
CLAUDE.md
~
README.md
~
docs/architecture/overview.md
~
docs/architecture/providers.md
~
docs/cli-reference.md
~
docs/getting-started/configuration.md
~
docs/getting-started/quickstart.md
~
docs/guide/cloud-sources.md
~
docs/index.md
~
knowledge-base/viewer.html
~
pyproject.toml
~
tests/test_agent.py
~
tests/test_agent_skills.py
~
tests/test_api_spec.py
~
tests/test_auth.py
~
tests/test_callbacks.py
~
tests/test_cli.py
~
tests/test_companion.py
~
tests/test_exchange.py
~
tests/test_graph_query.py
~
tests/test_graph_store.py
~
tests/test_knowledge_graph.py
~
tests/test_output_formatter.py
~
tests/test_pipeline.py
~
tests/test_processors.py
~
tests/test_providers.py
~
tests/test_sources.py
~
tests/test_taxonomy.py
~
tests/test_usage_tracker.py
~
tests/test_visualization.py
~
video_processor/agent/agent_loop.py
~
video_processor/agent/kb_context.py
~
video_processor/agent/orchestrator.py
~
video_processor/agent/skills/__init__.py
~
video_processor/agent/skills/artifact_export.py
~
video_processor/agent/skills/base.py
~
video_processor/agent/skills/cli_adapter.py
~
video_processor/agent/skills/doc_generator.py
~
video_processor/agent/skills/github_integration.py
~
video_processor/agent/skills/notes_export.py
~
video_processor/agent/skills/prd.py
~
video_processor/agent/skills/project_plan.py
~
video_processor/agent/skills/requirements_chat.py
~
video_processor/agent/skills/roadmap.py
~
video_processor/agent/skills/task_breakdown.py
~
video_processor/agent/skills/wiki_generator.py
~
video_processor/api/openapi_spec.py
~
video_processor/auth.py
~
video_processor/cli/commands.py
~
video_processor/cli/companion.py
~
video_processor/exchange.py
~
video_processor/exporters/__init__.py
~
video_processor/exporters/markdown.py
~
video_processor/extractors/frame_extractor.py
~
video_processor/integrators/graph_discovery.py
~
video_processor/integrators/graph_query.py
~
video_processor/integrators/graph_store.py
~
video_processor/integrators/knowledge_graph.py
~
video_processor/integrators/taxonomy.py
~
video_processor/models.py
~
video_processor/output_structure.py
~
video_processor/pipeline.py
~
video_processor/processors/__init__.py
~
video_processor/processors/base.py
~
video_processor/processors/ingest.py
~
video_processor/processors/markdown_processor.py
~
video_processor/processors/pdf_processor.py
~
video_processor/providers/__init__.py
~
video_processor/providers/ai21_provider.py
~
video_processor/providers/anthropic_provider.py
~
video_processor/providers/azure_provider.py
~
video_processor/providers/base.py
~
video_processor/providers/bedrock_provider.py
~
video_processor/providers/cerebras_provider.py
~
video_processor/providers/cohere_provider.py
~
video_processor/providers/discovery.py
~
video_processor/providers/fireworks_provider.py
~
video_processor/providers/gemini_provider.py
~
video_processor/providers/huggingface_provider.py
~
video_processor/providers/litellm_provider.py
~
video_processor/providers/manager.py
~
video_processor/providers/mistral_provider.py
~
video_processor/providers/ollama_provider.py
~
video_processor/providers/openai_provider.py
~
video_processor/providers/qianfan_provider.py
~
video_processor/providers/together_provider.py
~
video_processor/providers/vertex_provider.py
~
video_processor/providers/xai_provider.py
~
video_processor/sources/__init__.py
~
video_processor/sources/apple_notes_source.py
~
video_processor/sources/arxiv_source.py
~
video_processor/sources/github_source.py
~
video_processor/sources/google_keep_source.py
~
video_processor/sources/gws_source.py
~
video_processor/sources/hackernews_source.py
~
video_processor/sources/logseq_source.py
~
video_processor/sources/m365_source.py
~
video_processor/sources/meet_recording_source.py
~
video_processor/sources/notion_source.py
~
video_processor/sources/obsidian_source.py
~
video_processor/sources/onenote_source.py
~
video_processor/sources/podcast_source.py
~
video_processor/sources/reddit_source.py
~
video_processor/sources/rss_source.py
~
video_processor/sources/s3_source.py
~
video_processor/sources/teams_recording_source.py
~
video_processor/sources/twitter_source.py
~
video_processor/sources/web_source.py
~
video_processor/sources/youtube_source.py
~
video_processor/sources/zoom_source.py
~
video_processor/utils/callbacks.py
~
video_processor/utils/visualization.py
+71
| --- a/CHANGELOG.md | ||
| +++ b/CHANGELOG.md | ||
| @@ -0,0 +1,71 @@ | ||
| 1 | +# Changelog | |
| 2 | + | |
| 3 | +All notable changes to PlanOpticon are documented in this file. | |
| 4 | + | |
| 5 | +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), | |
| 6 | +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). | |
| 7 | + | |
| 8 | +## [0.4.0] - 2026-03-07 | |
| 9 | + | |
| 10 | +### Added | |
| 11 | + | |
| 12 | +- **Planning agent framework** with 11 skills: project_plan, prd, roadmap, task_breakdown, github_integration, requirements_chat, doc_generator, artifact_export, cli_adapter, notes_export, wiki_generator. Invoke via `planopticon agent`. | |
| 13 | +- **Interactive companion REPL** (`planopticon companion` / `planopticon --chat`) with auto-discovery of knowledge graphs, videos, and documents in the workspace. 16 slash commands for graph exploration, ingestion, export, auth, and runtime provider/model switching. | |
| 14 | +- **20+ source connectors**: YouTube, Web, GitHub, Reddit, HackerNews, RSS, Podcast, Twitter/X, arXiv, S3, Google Workspace (Docs, Sheets, Slides), Microsoft 365 (SharePoint, OneDrive), Obsidian, Notion, Apple Notes, OneNote, Google Keep, Logseq, Zoom (OAuth), Teams, Google Meet. | |
| 15 | +- **Pluggable provider registry** supporting 15+ AI providers: OpenAI, Anthropic, Gemini, Ollama, Azure, Together, Fireworks, Cerebras, xAI, Bedrock, Vertex, Mistral, Cohere, AI21, HuggingFace, Qianfan, and LiteLLM. | |
| 16 | +- **Planning taxonomy classifier** for entity types: goal, requirement, risk, task, milestone, and other planning-specific categories. | |
| 17 | +- **Unified OAuth manager** (`planopticon auth`) with pre-built configs for Google, Dropbox, Zoom, Notion, GitHub, and Microsoft. Auth chain: saved token, OAuth PKCE, API key fallback. | |
| 18 | +- **Markdown document generator** producing 7 document types without an LLM: summary, meeting-notes, glossary, relationship-map, status-report, entity-index, csv. | |
| 19 | +- **Notes export** to Obsidian vaults (YAML frontmatter + wiki-links) and Notion-compatible markdown. | |
| 20 | +- **GitHub wiki generator** with direct push support. | |
| 21 | +- **PlanOpticonExchange** canonical JSON interchange format with merge and dedup. | |
| 22 | +- **Document ingestion pipeline** for PDF, Markdown, and plaintext sources. | |
| 23 | +- **Knowledge graph viewer** -- self-contained HTML file with inlined D3.js for browser-based graph exploration. | |
| 24 | +- **Graph query engine** with direct mode (stats, entities, neighbors, relationships) and agentic mode (natural language queries via LLM). | |
| 25 | +- **Progress callback system** for pipeline status reporting. | |
| 26 | + | |
| 27 | +### Changed | |
| 28 | + | |
| 29 | +- **SQLite replaces FalkorDB** for knowledge graph storage. Zero external dependencies -- no database server or additional packages required. | |
| 30 | +- **Default models** now target cheap/fast options: Claude Haiku, GPT-4o-mini, Gemini Flash. | |
| 31 | +- Output structure updated: `knowledge_graph.db` (SQLite) is now the primary graph file alongside the existing `knowledge_graph.json` export. | |
| 32 | + | |
| 33 | +### Fixed | |
| 34 | + | |
| 35 | +- 821+ tests passing across the full test suite. | |
| 36 | + | |
| 37 | +## [0.3.0] - 2025-12-20 | |
| 38 | + | |
| 39 | +### Added | |
| 40 | + | |
| 41 | +- FalkorDB integration for knowledge graph storage. | |
| 42 | +- Typed relationships and entity properties in graph data model. | |
| 43 | +- Relationship existence checks. | |
| 44 | + | |
| 45 | +## [0.2.0] - 2025-10-15 | |
| 46 | + | |
| 47 | +### Added | |
| 48 | + | |
| 49 | +- Batch video processing with merged knowledge graphs. | |
| 50 | +- Cloud sources: Google Drive and Dropbox shared folder fetching. | |
| 51 | +- Checkpoint/resume for interrupted pipelines. | |
| 52 | +- PDF report generation. | |
| 53 | + | |
| 54 | +## [0.1.0] - 2025-08-01 | |
| 55 | + | |
| 56 | +### Added | |
| 57 | + | |
| 58 | +- Initial release. | |
| 59 | +- Video analysis with multi-provider AI (OpenAI, Anthropic, Gemini, Ollama). | |
| 60 | +- Smart frame extraction with change detection. | |
| 61 | +- People frame filtering via OpenCV face detection. | |
| 62 | +- Diagram extraction and classification. | |
| 63 | +- Knowledge graph extraction (entities and relationships). | |
| 64 | +- Action item detection with assignees and deadlines. | |
| 65 | +- Markdown and HTML report output. | |
| 66 | +- Mermaid diagram generation. | |
| 67 | + | |
| 68 | +[0.4.0]: https://github.com/ConflictHQ/PlanOpticon/compare/v0.3.0...v0.4.0 | |
| 69 | +[0.3.0]: https://github.com/ConflictHQ/PlanOpticon/compare/v0.2.0...v0.3.0 | |
| 70 | +[0.2.0]: https://github.com/ConflictHQ/PlanOpticon/compare/v0.1.0...v0.2.0 | |
| 71 | +[0.1.0]: https://github.com/ConflictHQ/PlanOpticon/releases/tag/v0.1.0 |
| --- a/CHANGELOG.md | |
| +++ b/CHANGELOG.md | |
| @@ -0,0 +1,71 @@ | |
| --- a/CHANGELOG.md | |
| +++ b/CHANGELOG.md | |
| @@ -0,0 +1,71 @@ | |
| 1 | # Changelog |
| 2 | |
| 3 | All notable changes to PlanOpticon are documented in this file. |
| 4 | |
| 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), |
| 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). |
| 7 | |
| 8 | ## [0.4.0] - 2026-03-07 |
| 9 | |
| 10 | ### Added |
| 11 | |
| 12 | - **Planning agent framework** with 11 skills: project_plan, prd, roadmap, task_breakdown, github_integration, requirements_chat, doc_generator, artifact_export, cli_adapter, notes_export, wiki_generator. Invoke via `planopticon agent`. |
| 13 | - **Interactive companion REPL** (`planopticon companion` / `planopticon --chat`) with auto-discovery of knowledge graphs, videos, and documents in the workspace. 16 slash commands for graph exploration, ingestion, export, auth, and runtime provider/model switching. |
| 14 | - **20+ source connectors**: YouTube, Web, GitHub, Reddit, HackerNews, RSS, Podcast, Twitter/X, arXiv, S3, Google Workspace (Docs, Sheets, Slides), Microsoft 365 (SharePoint, OneDrive), Obsidian, Notion, Apple Notes, OneNote, Google Keep, Logseq, Zoom (OAuth), Teams, Google Meet. |
| 15 | - **Pluggable provider registry** supporting 15+ AI providers: OpenAI, Anthropic, Gemini, Ollama, Azure, Together, Fireworks, Cerebras, xAI, Bedrock, Vertex, Mistral, Cohere, AI21, HuggingFace, Qianfan, and LiteLLM. |
| 16 | - **Planning taxonomy classifier** for entity types: goal, requirement, risk, task, milestone, and other planning-specific categories. |
| 17 | - **Unified OAuth manager** (`planopticon auth`) with pre-built configs for Google, Dropbox, Zoom, Notion, GitHub, and Microsoft. Auth chain: saved token, OAuth PKCE, API key fallback. |
| 18 | - **Markdown document generator** producing 7 document types without an LLM: summary, meeting-notes, glossary, relationship-map, status-report, entity-index, csv. |
| 19 | - **Notes export** to Obsidian vaults (YAML frontmatter + wiki-links) and Notion-compatible markdown. |
| 20 | - **GitHub wiki generator** with direct push support. |
| 21 | - **PlanOpticonExchange** canonical JSON interchange format with merge and dedup. |
| 22 | - **Document ingestion pipeline** for PDF, Markdown, and plaintext sources. |
| 23 | - **Knowledge graph viewer** -- self-contained HTML file with inlined D3.js for browser-based graph exploration. |
| 24 | - **Graph query engine** with direct mode (stats, entities, neighbors, relationships) and agentic mode (natural language queries via LLM). |
| 25 | - **Progress callback system** for pipeline status reporting. |
| 26 | |
| 27 | ### Changed |
| 28 | |
| 29 | - **SQLite replaces FalkorDB** for knowledge graph storage. Zero external dependencies -- no database server or additional packages required. |
| 30 | - **Default models** now target cheap/fast options: Claude Haiku, GPT-4o-mini, Gemini Flash. |
| 31 | - Output structure updated: `knowledge_graph.db` (SQLite) is now the primary graph file alongside the existing `knowledge_graph.json` export. |
| 32 | |
| 33 | ### Fixed |
| 34 | |
| 35 | - 821+ tests passing across the full test suite. |
| 36 | |
| 37 | ## [0.3.0] - 2025-12-20 |
| 38 | |
| 39 | ### Added |
| 40 | |
| 41 | - FalkorDB integration for knowledge graph storage. |
| 42 | - Typed relationships and entity properties in graph data model. |
| 43 | - Relationship existence checks. |
| 44 | |
| 45 | ## [0.2.0] - 2025-10-15 |
| 46 | |
| 47 | ### Added |
| 48 | |
| 49 | - Batch video processing with merged knowledge graphs. |
| 50 | - Cloud sources: Google Drive and Dropbox shared folder fetching. |
| 51 | - Checkpoint/resume for interrupted pipelines. |
| 52 | - PDF report generation. |
| 53 | |
| 54 | ## [0.1.0] - 2025-08-01 |
| 55 | |
| 56 | ### Added |
| 57 | |
| 58 | - Initial release. |
| 59 | - Video analysis with multi-provider AI (OpenAI, Anthropic, Gemini, Ollama). |
| 60 | - Smart frame extraction with change detection. |
| 61 | - People frame filtering via OpenCV face detection. |
| 62 | - Diagram extraction and classification. |
| 63 | - Knowledge graph extraction (entities and relationships). |
| 64 | - Action item detection with assignees and deadlines. |
| 65 | - Markdown and HTML report output. |
| 66 | - Mermaid diagram generation. |
| 67 | |
| 68 | [0.4.0]: https://github.com/ConflictHQ/PlanOpticon/compare/v0.3.0...v0.4.0 |
| 69 | [0.3.0]: https://github.com/ConflictHQ/PlanOpticon/compare/v0.2.0...v0.3.0 |
| 70 | [0.2.0]: https://github.com/ConflictHQ/PlanOpticon/compare/v0.1.0...v0.2.0 |
| 71 | [0.1.0]: https://github.com/ConflictHQ/PlanOpticon/releases/tag/v0.1.0 |
+1
-1
| --- CLAUDE.md | ||
| +++ CLAUDE.md | ||
| @@ -7,11 +7,11 @@ | ||
| 7 | 7 | PlanOpticon can build and query knowledge graphs from video content. If you see `knowledge_graph.db` or `knowledge_graph.json` files in the workspace, you can query them to understand what was discussed. |
| 8 | 8 | |
| 9 | 9 | ### Auto-detection |
| 10 | 10 | |
| 11 | 11 | Look for these files (checked automatically): |
| 12 | -- `knowledge_graph.db` — FalkorDB binary graph (preferred) | |
| 12 | +- `knowledge_graph.db` — SQLite graph database (preferred) | |
| 13 | 13 | - `knowledge_graph.json` — JSON export (fallback) |
| 14 | 14 | |
| 15 | 15 | Common locations: project root, `results/`, `output/`, `knowledge-base/`. |
| 16 | 16 | |
| 17 | 17 | ### Quick commands |
| 18 | 18 |
| --- CLAUDE.md | |
| +++ CLAUDE.md | |
| @@ -7,11 +7,11 @@ | |
| 7 | PlanOpticon can build and query knowledge graphs from video content. If you see `knowledge_graph.db` or `knowledge_graph.json` files in the workspace, you can query them to understand what was discussed. |
| 8 | |
| 9 | ### Auto-detection |
| 10 | |
| 11 | Look for these files (checked automatically): |
| 12 | - `knowledge_graph.db` — FalkorDB binary graph (preferred) |
| 13 | - `knowledge_graph.json` — JSON export (fallback) |
| 14 | |
| 15 | Common locations: project root, `results/`, `output/`, `knowledge-base/`. |
| 16 | |
| 17 | ### Quick commands |
| 18 |
| --- CLAUDE.md | |
| +++ CLAUDE.md | |
| @@ -7,11 +7,11 @@ | |
| 7 | PlanOpticon can build and query knowledge graphs from video content. If you see `knowledge_graph.db` or `knowledge_graph.json` files in the workspace, you can query them to understand what was discussed. |
| 8 | |
| 9 | ### Auto-detection |
| 10 | |
| 11 | Look for these files (checked automatically): |
| 12 | - `knowledge_graph.db` — SQLite graph database (preferred) |
| 13 | - `knowledge_graph.json` — JSON export (fallback) |
| 14 | |
| 15 | Common locations: project root, `results/`, `output/`, `knowledge-base/`. |
| 16 | |
| 17 | ### Quick commands |
| 18 |
+105
-33
| --- README.md | ||
| +++ README.md | ||
| @@ -4,31 +4,118 @@ | ||
| 4 | 4 | [](https://pypi.org/project/planopticon/) |
| 5 | 5 | [](https://pypi.org/project/planopticon/) |
| 6 | 6 | [](LICENSE) |
| 7 | 7 | [](https://planopticon.dev) |
| 8 | 8 | |
| 9 | -**AI-powered video analysis and knowledge extraction.** | |
| 9 | +**AI-powered video analysis, knowledge extraction, and planning.** | |
| 10 | 10 | |
| 11 | -PlanOpticon processes video recordings into structured knowledge — transcripts, diagrams, action items, key points, and knowledge graphs. It auto-discovers available models across OpenAI, Anthropic, Gemini, and Ollama, and produces rich multi-format output. | |
| 11 | +PlanOpticon processes video recordings, documents, and 20+ online sources into structured knowledge graphs, then helps you plan with an AI agent and interactive companion. It auto-discovers models across 15+ AI providers, runs fully offline with Ollama, and produces rich multi-format output. | |
| 12 | 12 | |
| 13 | 13 | ## Features |
| 14 | 14 | |
| 15 | -- **Multi-provider AI** — Auto-discovers and routes to the best available model across OpenAI, Anthropic, Google Gemini, and Ollama (fully offline) | |
| 16 | -- **Smart frame extraction** — Change detection for transitions + periodic capture for slow-evolving content (document scrolling, screen shares) | |
| 17 | -- **People frame filtering** — OpenCV face detection automatically removes webcam/video conference frames, keeping only shared content | |
| 18 | -- **Diagram extraction** — Vision model classification detects flowcharts, architecture diagrams, charts, and whiteboards | |
| 19 | -- **Knowledge graphs** — Extracts entities and relationships, builds and merges knowledge graphs across videos | |
| 20 | -- **Action item detection** — Finds commitments, tasks, and follow-ups with assignees and deadlines | |
| 21 | -- **Batch processing** — Process entire folders of videos with merged knowledge graphs and cross-referencing | |
| 22 | -- **Rich output** — Markdown, HTML, PDF reports. Mermaid diagrams, SVG/PNG renderings, JSON manifests | |
| 23 | -- **Cloud sources** — Fetch videos from Google Drive and Dropbox shared folders | |
| 24 | -- **Checkpoint/resume** — Pipeline resumes from where it left off if interrupted | |
| 25 | -- **Screengrab fallback** — When extraction isn't perfect, captures frames with captions — something is always better than nothing | |
| 15 | +- **15+ AI providers** -- OpenAI, Anthropic, Gemini, Ollama, Azure, Together, Fireworks, Cerebras, xAI, Bedrock, Vertex, Mistral, Cohere, AI21, HuggingFace, Qianfan, and LiteLLM. Defaults to cheap models (Haiku, GPT-4o-mini, Gemini Flash). | |
| 16 | +- **20+ source connectors** -- YouTube, web pages, GitHub, Reddit, HackerNews, RSS, podcasts, arXiv, S3, Google Workspace, Microsoft 365, Obsidian, Notion, Apple Notes, Zoom, Teams, Google Meet, and more. | |
| 17 | +- **Planning agent** -- 11 skills including project plans, PRDs, roadmaps, task breakdowns, and GitHub integration. | |
| 18 | +- **Interactive companion** -- Chat REPL with 16 slash commands, auto-discovery of workspace knowledge, and runtime provider/model switching. | |
| 19 | +- **Knowledge graphs** -- SQLite-backed (zero external deps), entity extraction with planning taxonomy (goals, requirements, risks, tasks, milestones), merge and dedup across sources. | |
| 20 | +- **Smart video analysis** -- Change-detection frame extraction, face filtering, diagram classification, action item detection, checkpoint/resume. | |
| 21 | +- **Document ingestion** -- PDF, Markdown, and plaintext pipelines feed the same knowledge graph. | |
| 22 | +- **Export everywhere** -- Markdown docs (7 types, no LLM required), Obsidian vaults, Notion markdown, GitHub wiki with push, PlanOpticonExchange JSON interchange, HTML/PDF reports, Mermaid diagrams. | |
| 23 | +- **OAuth-first auth** -- Unified OAuth manager for Google, Dropbox, Zoom, Notion, GitHub, and Microsoft with saved-token / PKCE / API-key fallback chain. | |
| 24 | +- **Batch processing** -- Process entire folders with merged knowledge graphs and cross-referencing. | |
| 25 | + | |
| 26 | +## Quick Start | |
| 27 | + | |
| 28 | +```bash | |
| 29 | +# Install | |
| 30 | +pip install planopticon | |
| 31 | + | |
| 32 | +# Analyze a video | |
| 33 | +planopticon analyze -i meeting.mp4 -o ./output | |
| 34 | + | |
| 35 | +# Ingest a document | |
| 36 | +planopticon ingest -i spec.pdf -o ./output | |
| 37 | + | |
| 38 | +# Fetch from a source | |
| 39 | +planopticon fetch youtube "https://youtube.com/watch?v=..." -o ./output | |
| 40 | + | |
| 41 | +# Process a folder of videos | |
| 42 | +planopticon batch -i ./recordings -o ./output --title "Weekly Meetings" | |
| 43 | + | |
| 44 | +# Query the knowledge graph | |
| 45 | +planopticon query | |
| 46 | +planopticon query "entities --type technology" | |
| 47 | + | |
| 48 | +# See available AI models | |
| 49 | +planopticon list-models | |
| 50 | +``` | |
| 51 | + | |
| 52 | +## Planning Agent | |
| 53 | + | |
| 54 | +Run AI-powered planning skills against your knowledge base: | |
| 55 | + | |
| 56 | +```bash | |
| 57 | +# Generate a project plan from extracted knowledge | |
| 58 | +planopticon agent "Create a project plan" --kb ./results | |
| 59 | + | |
| 60 | +# Build a PRD | |
| 61 | +planopticon agent "Write a PRD for the authentication system" --kb ./results | |
| 62 | + | |
| 63 | +# Break down tasks | |
| 64 | +planopticon agent "Break this into tasks and estimate effort" --kb ./results | |
| 65 | +``` | |
| 66 | + | |
| 67 | +11 skills: `project_plan`, `prd`, `roadmap`, `task_breakdown`, `github_integration`, `requirements_chat`, `doc_generator`, `artifact_export`, `cli_adapter`, `notes_export`, `wiki_generator`. | |
| 68 | + | |
| 69 | +## Interactive Companion | |
| 70 | + | |
| 71 | +A chat REPL that auto-discovers knowledge graphs, videos, and docs in your workspace: | |
| 72 | + | |
| 73 | +```bash | |
| 74 | +# Launch the companion | |
| 75 | +planopticon companion | |
| 76 | +# or | |
| 77 | +planopticon --chat | |
| 78 | +``` | |
| 79 | + | |
| 80 | +16 slash commands: `/help`, `/status`, `/skills`, `/entities`, `/search`, `/neighbors`, `/export`, `/analyze`, `/ingest`, `/auth`, `/provider`, `/model`, `/run`, `/plan`, `/prd`, `/tasks`. | |
| 81 | + | |
| 82 | +Switch providers and models at runtime, explore your knowledge graph interactively, or chat with any configured LLM. | |
| 83 | + | |
| 84 | +## Source Connectors | |
| 85 | + | |
| 86 | +| Category | Sources | | |
| 87 | +|----------|---------| | |
| 88 | +| Media | YouTube, Web, Podcasts, RSS | | |
| 89 | +| Code & Community | GitHub, Reddit, HackerNews, arXiv | | |
| 90 | +| Cloud Storage | S3, Google Drive, Dropbox | | |
| 91 | +| Google Workspace | Docs, Sheets, Slides (via gws CLI) | | |
| 92 | +| Microsoft 365 | SharePoint, OneDrive (via m365 CLI) | | |
| 93 | +| Notes | Obsidian, Notion, Apple Notes, OneNote, Google Keep, Logseq | | |
| 94 | +| Meetings | Zoom (OAuth), Teams, Google Meet | | |
| 95 | + | |
| 96 | +## Export & Documents | |
| 97 | + | |
| 98 | +Generate documents from your knowledge graph without an LLM: | |
| 99 | + | |
| 100 | +```bash | |
| 101 | +planopticon export summary -o ./docs | |
| 102 | +planopticon export meeting-notes -o ./docs | |
| 103 | +planopticon export glossary -o ./docs | |
| 104 | +``` | |
| 105 | + | |
| 106 | +7 document types: `summary`, `meeting-notes`, `glossary`, `relationship-map`, `status-report`, `entity-index`, `csv`. | |
| 107 | + | |
| 108 | +Additional export targets: | |
| 109 | +- **Obsidian** -- YAML frontmatter + wiki-links vault | |
| 110 | +- **Notion** -- Compatible markdown | |
| 111 | +- **GitHub Wiki** -- Generate and push directly | |
| 112 | +- **PlanOpticonExchange** -- Canonical JSON interchange with merge/dedup | |
| 26 | 113 | |
| 27 | 114 | ## Local Run |
| 28 | 115 | |
| 29 | -PlanOpticon runs entirely offline with Ollama — no API keys, no cloud, no cost. | |
| 116 | +PlanOpticon runs entirely offline with Ollama -- no API keys, no cloud, no cost. | |
| 30 | 117 | |
| 31 | 118 | > **13.2 hours of video content analyzed, knowledge-graphed, and summarized in ~25 hours of processing time, entirely on local hardware, for free.** |
| 32 | 119 | |
| 33 | 120 | 18 meeting recordings processed on a single machine using `llava` (vision), `qwen3:30b` (chat), and `whisper-large` (transcription via Apple Silicon GPU): |
| 34 | 121 | |
| @@ -41,33 +128,17 @@ | ||
| 41 | 128 | | API calls (local) | 1,841 | |
| 42 | 129 | | Tokens processed | 4.87M | |
| 43 | 130 | | Total cost | **$0.00** | |
| 44 | 131 | |
| 45 | 132 | ```bash |
| 46 | -# Fully local analysis — no API keys needed, just Ollama running | |
| 133 | +# Fully local analysis -- no API keys needed, just Ollama running | |
| 47 | 134 | planopticon analyze -i meeting.mp4 -o ./output \ |
| 48 | 135 | --provider ollama \ |
| 49 | 136 | --vision-model llava:latest \ |
| 50 | 137 | --chat-model qwen3:30b |
| 51 | 138 | ``` |
| 52 | 139 | |
| 53 | -## Quick Start | |
| 54 | - | |
| 55 | -```bash | |
| 56 | -# Install | |
| 57 | -pip install planopticon | |
| 58 | - | |
| 59 | -# Analyze a single video | |
| 60 | -planopticon analyze -i meeting.mp4 -o ./output | |
| 61 | - | |
| 62 | -# Process a folder of videos | |
| 63 | -planopticon batch -i ./recordings -o ./output --title "Weekly Meetings" | |
| 64 | - | |
| 65 | -# See available AI models | |
| 66 | -planopticon list-models | |
| 67 | -``` | |
| 68 | - | |
| 69 | 140 | ## Installation |
| 70 | 141 | |
| 71 | 142 | ### From PyPI |
| 72 | 143 | |
| 73 | 144 | ```bash |
| @@ -109,11 +180,12 @@ | ||
| 109 | 180 | ├── captures/ # Screengrab fallbacks |
| 110 | 181 | └── results/ |
| 111 | 182 | ├── analysis.md # Markdown report |
| 112 | 183 | ├── analysis.html # HTML report |
| 113 | 184 | ├── analysis.pdf # PDF report |
| 114 | - ├── knowledge_graph.json # Entities and relationships | |
| 185 | + ├── knowledge_graph.db # SQLite knowledge graph | |
| 186 | + ├── knowledge_graph.json # JSON export | |
| 115 | 187 | ├── key_points.json # Extracted key points |
| 116 | 188 | └── action_items.json # Tasks and follow-ups |
| 117 | 189 | ``` |
| 118 | 190 | |
| 119 | 191 | ## Processing Depth |
| @@ -128,6 +200,6 @@ | ||
| 128 | 200 | |
| 129 | 201 | Full documentation at [planopticon.dev](https://planopticon.dev) |
| 130 | 202 | |
| 131 | 203 | ## License |
| 132 | 204 | |
| 133 | -MIT License — Copyright (c) 2026 CONFLICT LLC | |
| 205 | +MIT License -- Copyright (c) 2026 CONFLICT LLC | |
| 134 | 206 |
| --- README.md | |
| +++ README.md | |
| @@ -4,31 +4,118 @@ | |
| 4 | [](https://pypi.org/project/planopticon/) |
| 5 | [](https://pypi.org/project/planopticon/) |
| 6 | [](LICENSE) |
| 7 | [](https://planopticon.dev) |
| 8 | |
| 9 | **AI-powered video analysis and knowledge extraction.** |
| 10 | |
| 11 | PlanOpticon processes video recordings into structured knowledge — transcripts, diagrams, action items, key points, and knowledge graphs. It auto-discovers available models across OpenAI, Anthropic, Gemini, and Ollama, and produces rich multi-format output. |
| 12 | |
| 13 | ## Features |
| 14 | |
| 15 | - **Multi-provider AI** — Auto-discovers and routes to the best available model across OpenAI, Anthropic, Google Gemini, and Ollama (fully offline) |
| 16 | - **Smart frame extraction** — Change detection for transitions + periodic capture for slow-evolving content (document scrolling, screen shares) |
| 17 | - **People frame filtering** — OpenCV face detection automatically removes webcam/video conference frames, keeping only shared content |
| 18 | - **Diagram extraction** — Vision model classification detects flowcharts, architecture diagrams, charts, and whiteboards |
| 19 | - **Knowledge graphs** — Extracts entities and relationships, builds and merges knowledge graphs across videos |
| 20 | - **Action item detection** — Finds commitments, tasks, and follow-ups with assignees and deadlines |
| 21 | - **Batch processing** — Process entire folders of videos with merged knowledge graphs and cross-referencing |
| 22 | - **Rich output** — Markdown, HTML, PDF reports. Mermaid diagrams, SVG/PNG renderings, JSON manifests |
| 23 | - **Cloud sources** — Fetch videos from Google Drive and Dropbox shared folders |
| 24 | - **Checkpoint/resume** — Pipeline resumes from where it left off if interrupted |
| 25 | - **Screengrab fallback** — When extraction isn't perfect, captures frames with captions — something is always better than nothing |
| 26 | |
| 27 | ## Local Run |
| 28 | |
| 29 | PlanOpticon runs entirely offline with Ollama — no API keys, no cloud, no cost. |
| 30 | |
| 31 | > **13.2 hours of video content analyzed, knowledge-graphed, and summarized in ~25 hours of processing time, entirely on local hardware, for free.** |
| 32 | |
| 33 | 18 meeting recordings processed on a single machine using `llava` (vision), `qwen3:30b` (chat), and `whisper-large` (transcription via Apple Silicon GPU): |
| 34 | |
| @@ -41,33 +128,17 @@ | |
| 41 | | API calls (local) | 1,841 | |
| 42 | | Tokens processed | 4.87M | |
| 43 | | Total cost | **$0.00** | |
| 44 | |
| 45 | ```bash |
| 46 | # Fully local analysis — no API keys needed, just Ollama running |
| 47 | planopticon analyze -i meeting.mp4 -o ./output \ |
| 48 | --provider ollama \ |
| 49 | --vision-model llava:latest \ |
| 50 | --chat-model qwen3:30b |
| 51 | ``` |
| 52 | |
| 53 | ## Quick Start |
| 54 | |
| 55 | ```bash |
| 56 | # Install |
| 57 | pip install planopticon |
| 58 | |
| 59 | # Analyze a single video |
| 60 | planopticon analyze -i meeting.mp4 -o ./output |
| 61 | |
| 62 | # Process a folder of videos |
| 63 | planopticon batch -i ./recordings -o ./output --title "Weekly Meetings" |
| 64 | |
| 65 | # See available AI models |
| 66 | planopticon list-models |
| 67 | ``` |
| 68 | |
| 69 | ## Installation |
| 70 | |
| 71 | ### From PyPI |
| 72 | |
| 73 | ```bash |
| @@ -109,11 +180,12 @@ | |
| 109 | ├── captures/ # Screengrab fallbacks |
| 110 | └── results/ |
| 111 | ├── analysis.md # Markdown report |
| 112 | ├── analysis.html # HTML report |
| 113 | ├── analysis.pdf # PDF report |
| 114 | ├── knowledge_graph.json # Entities and relationships |
| 115 | ├── key_points.json # Extracted key points |
| 116 | └── action_items.json # Tasks and follow-ups |
| 117 | ``` |
| 118 | |
| 119 | ## Processing Depth |
| @@ -128,6 +200,6 @@ | |
| 128 | |
| 129 | Full documentation at [planopticon.dev](https://planopticon.dev) |
| 130 | |
| 131 | ## License |
| 132 | |
| 133 | MIT License — Copyright (c) 2026 CONFLICT LLC |
| 134 |
| --- README.md | |
| +++ README.md | |
| @@ -4,31 +4,118 @@ | |
| 4 | [](https://pypi.org/project/planopticon/) |
| 5 | [](https://pypi.org/project/planopticon/) |
| 6 | [](LICENSE) |
| 7 | [](https://planopticon.dev) |
| 8 | |
| 9 | **AI-powered video analysis, knowledge extraction, and planning.** |
| 10 | |
| 11 | PlanOpticon processes video recordings, documents, and 20+ online sources into structured knowledge graphs, then helps you plan with an AI agent and interactive companion. It auto-discovers models across 15+ AI providers, runs fully offline with Ollama, and produces rich multi-format output. |
| 12 | |
| 13 | ## Features |
| 14 | |
| 15 | - **15+ AI providers** -- OpenAI, Anthropic, Gemini, Ollama, Azure, Together, Fireworks, Cerebras, xAI, Bedrock, Vertex, Mistral, Cohere, AI21, HuggingFace, Qianfan, and LiteLLM. Defaults to cheap models (Haiku, GPT-4o-mini, Gemini Flash). |
| 16 | - **20+ source connectors** -- YouTube, web pages, GitHub, Reddit, HackerNews, RSS, podcasts, arXiv, S3, Google Workspace, Microsoft 365, Obsidian, Notion, Apple Notes, Zoom, Teams, Google Meet, and more. |
| 17 | - **Planning agent** -- 11 skills including project plans, PRDs, roadmaps, task breakdowns, and GitHub integration. |
| 18 | - **Interactive companion** -- Chat REPL with 16 slash commands, auto-discovery of workspace knowledge, and runtime provider/model switching. |
| 19 | - **Knowledge graphs** -- SQLite-backed (zero external deps), entity extraction with planning taxonomy (goals, requirements, risks, tasks, milestones), merge and dedup across sources. |
| 20 | - **Smart video analysis** -- Change-detection frame extraction, face filtering, diagram classification, action item detection, checkpoint/resume. |
| 21 | - **Document ingestion** -- PDF, Markdown, and plaintext pipelines feed the same knowledge graph. |
| 22 | - **Export everywhere** -- Markdown docs (7 types, no LLM required), Obsidian vaults, Notion markdown, GitHub wiki with push, PlanOpticonExchange JSON interchange, HTML/PDF reports, Mermaid diagrams. |
| 23 | - **OAuth-first auth** -- Unified OAuth manager for Google, Dropbox, Zoom, Notion, GitHub, and Microsoft with saved-token / PKCE / API-key fallback chain. |
| 24 | - **Batch processing** -- Process entire folders with merged knowledge graphs and cross-referencing. |
| 25 | |
| 26 | ## Quick Start |
| 27 | |
| 28 | ```bash |
| 29 | # Install |
| 30 | pip install planopticon |
| 31 | |
| 32 | # Analyze a video |
| 33 | planopticon analyze -i meeting.mp4 -o ./output |
| 34 | |
| 35 | # Ingest a document |
| 36 | planopticon ingest -i spec.pdf -o ./output |
| 37 | |
| 38 | # Fetch from a source |
| 39 | planopticon fetch youtube "https://youtube.com/watch?v=..." -o ./output |
| 40 | |
| 41 | # Process a folder of videos |
| 42 | planopticon batch -i ./recordings -o ./output --title "Weekly Meetings" |
| 43 | |
| 44 | # Query the knowledge graph |
| 45 | planopticon query |
| 46 | planopticon query "entities --type technology" |
| 47 | |
| 48 | # See available AI models |
| 49 | planopticon list-models |
| 50 | ``` |
| 51 | |
| 52 | ## Planning Agent |
| 53 | |
| 54 | Run AI-powered planning skills against your knowledge base: |
| 55 | |
| 56 | ```bash |
| 57 | # Generate a project plan from extracted knowledge |
| 58 | planopticon agent "Create a project plan" --kb ./results |
| 59 | |
| 60 | # Build a PRD |
| 61 | planopticon agent "Write a PRD for the authentication system" --kb ./results |
| 62 | |
| 63 | # Break down tasks |
| 64 | planopticon agent "Break this into tasks and estimate effort" --kb ./results |
| 65 | ``` |
| 66 | |
| 67 | 11 skills: `project_plan`, `prd`, `roadmap`, `task_breakdown`, `github_integration`, `requirements_chat`, `doc_generator`, `artifact_export`, `cli_adapter`, `notes_export`, `wiki_generator`. |
| 68 | |
| 69 | ## Interactive Companion |
| 70 | |
| 71 | A chat REPL that auto-discovers knowledge graphs, videos, and docs in your workspace: |
| 72 | |
| 73 | ```bash |
| 74 | # Launch the companion |
| 75 | planopticon companion |
| 76 | # or |
| 77 | planopticon --chat |
| 78 | ``` |
| 79 | |
| 80 | 16 slash commands: `/help`, `/status`, `/skills`, `/entities`, `/search`, `/neighbors`, `/export`, `/analyze`, `/ingest`, `/auth`, `/provider`, `/model`, `/run`, `/plan`, `/prd`, `/tasks`. |
| 81 | |
| 82 | Switch providers and models at runtime, explore your knowledge graph interactively, or chat with any configured LLM. |
| 83 | |
| 84 | ## Source Connectors |
| 85 | |
| 86 | | Category | Sources | |
| 87 | |----------|---------| |
| 88 | | Media | YouTube, Web, Podcasts, RSS | |
| 89 | | Code & Community | GitHub, Reddit, HackerNews, arXiv | |
| 90 | | Cloud Storage | S3, Google Drive, Dropbox | |
| 91 | | Google Workspace | Docs, Sheets, Slides (via gws CLI) | |
| 92 | | Microsoft 365 | SharePoint, OneDrive (via m365 CLI) | |
| 93 | | Notes | Obsidian, Notion, Apple Notes, OneNote, Google Keep, Logseq | |
| 94 | | Meetings | Zoom (OAuth), Teams, Google Meet | |
| 95 | |
| 96 | ## Export & Documents |
| 97 | |
| 98 | Generate documents from your knowledge graph without an LLM: |
| 99 | |
| 100 | ```bash |
| 101 | planopticon export summary -o ./docs |
| 102 | planopticon export meeting-notes -o ./docs |
| 103 | planopticon export glossary -o ./docs |
| 104 | ``` |
| 105 | |
| 106 | 7 document types: `summary`, `meeting-notes`, `glossary`, `relationship-map`, `status-report`, `entity-index`, `csv`. |
| 107 | |
| 108 | Additional export targets: |
| 109 | - **Obsidian** -- YAML frontmatter + wiki-links vault |
| 110 | - **Notion** -- Compatible markdown |
| 111 | - **GitHub Wiki** -- Generate and push directly |
| 112 | - **PlanOpticonExchange** -- Canonical JSON interchange with merge/dedup |
| 113 | |
| 114 | ## Local Run |
| 115 | |
| 116 | PlanOpticon runs entirely offline with Ollama -- no API keys, no cloud, no cost. |
| 117 | |
| 118 | > **13.2 hours of video content analyzed, knowledge-graphed, and summarized in ~25 hours of processing time, entirely on local hardware, for free.** |
| 119 | |
| 120 | 18 meeting recordings processed on a single machine using `llava` (vision), `qwen3:30b` (chat), and `whisper-large` (transcription via Apple Silicon GPU): |
| 121 | |
| @@ -41,33 +128,17 @@ | |
| 128 | | API calls (local) | 1,841 | |
| 129 | | Tokens processed | 4.87M | |
| 130 | | Total cost | **$0.00** | |
| 131 | |
| 132 | ```bash |
| 133 | # Fully local analysis -- no API keys needed, just Ollama running |
| 134 | planopticon analyze -i meeting.mp4 -o ./output \ |
| 135 | --provider ollama \ |
| 136 | --vision-model llava:latest \ |
| 137 | --chat-model qwen3:30b |
| 138 | ``` |
| 139 | |
| 140 | ## Installation |
| 141 | |
| 142 | ### From PyPI |
| 143 | |
| 144 | ```bash |
| @@ -109,11 +180,12 @@ | |
| 180 | ├── captures/ # Screengrab fallbacks |
| 181 | └── results/ |
| 182 | ├── analysis.md # Markdown report |
| 183 | ├── analysis.html # HTML report |
| 184 | ├── analysis.pdf # PDF report |
| 185 | ├── knowledge_graph.db # SQLite knowledge graph |
| 186 | ├── knowledge_graph.json # JSON export |
| 187 | ├── key_points.json # Extracted key points |
| 188 | └── action_items.json # Tasks and follow-ups |
| 189 | ``` |
| 190 | |
| 191 | ## Processing Depth |
| @@ -128,6 +200,6 @@ | |
| 200 | |
| 201 | Full documentation at [planopticon.dev](https://planopticon.dev) |
| 202 | |
| 203 | ## License |
| 204 | |
| 205 | MIT License -- Copyright (c) 2026 CONFLICT LLC |
| 206 |
+83
-6
| --- docs/architecture/overview.md | ||
| +++ docs/architecture/overview.md | ||
| @@ -2,67 +2,144 @@ | ||
| 2 | 2 | |
| 3 | 3 | ## System diagram |
| 4 | 4 | |
| 5 | 5 | ```mermaid |
| 6 | 6 | graph TD |
| 7 | - A[Video Input] --> B[Frame Extractor] | |
| 7 | + subgraph Sources | |
| 8 | + S1[Video Files] | |
| 9 | + S2[Google Workspace] | |
| 10 | + S3[Microsoft 365] | |
| 11 | + S4[Zoom / Teams / Meet] | |
| 12 | + S5[YouTube] | |
| 13 | + S6[Notes — Obsidian / Notion / Apple Notes] | |
| 14 | + S7[GitHub] | |
| 15 | + end | |
| 16 | + | |
| 17 | + subgraph Source Connectors | |
| 18 | + SC[Source Connectors + OAuth] | |
| 19 | + end | |
| 20 | + | |
| 21 | + S1 --> SC | |
| 22 | + S2 --> SC | |
| 23 | + S3 --> SC | |
| 24 | + S4 --> SC | |
| 25 | + S5 --> SC | |
| 26 | + S6 --> SC | |
| 27 | + S7 --> SC | |
| 28 | + | |
| 29 | + SC --> A[Ingest / Analyze Pipeline] | |
| 30 | + | |
| 31 | + A --> B[Frame Extractor] | |
| 8 | 32 | A --> C[Audio Extractor] |
| 9 | 33 | B --> D[Diagram Analyzer] |
| 10 | 34 | C --> E[Transcription] |
| 11 | 35 | D --> F[Knowledge Graph] |
| 12 | 36 | E --> F |
| 13 | 37 | E --> G[Key Point Extractor] |
| 14 | 38 | E --> H[Action Item Detector] |
| 15 | 39 | D --> I[Content Analyzer] |
| 16 | 40 | E --> I |
| 41 | + | |
| 42 | + subgraph Agent & Skills | |
| 43 | + AG[Planning Agent] | |
| 44 | + SK[Skill Registry] | |
| 45 | + CO[Companion REPL] | |
| 46 | + end | |
| 47 | + | |
| 48 | + F --> AG | |
| 49 | + G --> AG | |
| 50 | + H --> AG | |
| 51 | + I --> AG | |
| 52 | + AG --> SK | |
| 53 | + F --> CO | |
| 54 | + | |
| 17 | 55 | F --> J[Plan Generator] |
| 18 | 56 | G --> J |
| 19 | 57 | H --> J |
| 20 | 58 | I --> J |
| 21 | - J --> K[Markdown Report] | |
| 22 | - J --> L[HTML Report] | |
| 23 | - J --> M[PDF Report] | |
| 24 | - D --> N[Mermaid/SVG/PNG Export] | |
| 59 | + | |
| 60 | + subgraph Output & Export | |
| 61 | + J --> K[Markdown Report] | |
| 62 | + J --> L[HTML Report] | |
| 63 | + J --> M[PDF Report] | |
| 64 | + D --> N[Mermaid/SVG/PNG Export] | |
| 65 | + EX[Exporters — Obsidian / Notion / Exchange / Wiki] | |
| 66 | + end | |
| 67 | + | |
| 68 | + AG --> EX | |
| 69 | + F --> EX | |
| 25 | 70 | ``` |
| 26 | 71 | |
| 27 | 72 | ## Module structure |
| 28 | 73 | |
| 29 | 74 | ``` |
| 30 | 75 | video_processor/ |
| 31 | 76 | ├── cli/ # CLI commands (Click) |
| 32 | 77 | │ └── commands.py |
| 78 | +├── sources/ # Source connectors | |
| 79 | +│ ├── gdrive.py # Google Drive | |
| 80 | +│ ├── gws.py # Google Workspace (Docs, Sheets, Slides, Meet) | |
| 81 | +│ ├── m365.py # Microsoft 365 (OneDrive, SharePoint, Teams) | |
| 82 | +│ ├── dropbox.py # Dropbox | |
| 83 | +│ ├── zoom.py # Zoom recordings | |
| 84 | +│ ├── youtube.py # YouTube videos | |
| 85 | +│ ├── notion.py # Notion pages | |
| 86 | +│ ├── github.py # GitHub repos / wikis | |
| 87 | +│ ├── obsidian.py # Obsidian vaults | |
| 88 | +│ └── apple_notes.py # Apple Notes (macOS) | |
| 33 | 89 | ├── extractors/ # Media extraction |
| 34 | 90 | │ ├── frame_extractor.py # Video → frames |
| 35 | 91 | │ └── audio_extractor.py # Video → WAV |
| 36 | 92 | ├── analyzers/ # AI-powered analysis |
| 37 | 93 | │ ├── diagram_analyzer.py # Frame classification + extraction |
| 38 | 94 | │ ├── content_analyzer.py # Cross-referencing |
| 39 | 95 | │ └── action_detector.py # Action item detection |
| 40 | 96 | ├── integrators/ # Knowledge assembly |
| 41 | 97 | │ ├── knowledge_graph.py # Entity/relationship graph |
| 98 | +│ ├── graph_query.py # Query engine | |
| 42 | 99 | │ └── plan_generator.py # Report generation |
| 100 | +├── agent/ # Planning agent | |
| 101 | +│ ├── agent_loop.py # Agent loop | |
| 102 | +│ ├── skills/ # Skill registry | |
| 103 | +│ └── companion.py # Companion REPL | |
| 104 | +├── exporters/ # Export formats | |
| 105 | +│ ├── markdown.py # Markdown export | |
| 106 | +│ ├── obsidian.py # Obsidian vault export | |
| 107 | +│ ├── notion.py # Notion export | |
| 108 | +│ ├── wiki.py # Wiki generation + push | |
| 109 | +│ └── exchange.py # PlanOpticon Exchange Format | |
| 43 | 110 | ├── providers/ # AI provider abstraction |
| 44 | 111 | │ ├── base.py # BaseProvider ABC |
| 45 | 112 | │ ├── openai_provider.py |
| 46 | 113 | │ ├── anthropic_provider.py |
| 47 | 114 | │ ├── gemini_provider.py |
| 48 | 115 | │ ├── ollama_provider.py # Local Ollama (offline) |
| 116 | +│ ├── azure_provider.py # Azure OpenAI | |
| 117 | +│ ├── together_provider.py | |
| 118 | +│ ├── fireworks_provider.py | |
| 119 | +│ ├── cerebras_provider.py | |
| 120 | +│ ├── xai_provider.py # xAI / Grok | |
| 49 | 121 | │ ├── discovery.py # Auto-model-discovery |
| 50 | 122 | │ └── manager.py # ProviderManager routing |
| 51 | 123 | ├── utils/ |
| 52 | 124 | │ ├── json_parsing.py # Robust LLM JSON parsing |
| 53 | 125 | │ ├── rendering.py # Mermaid + chart rendering |
| 54 | 126 | │ ├── export.py # HTML/PDF export |
| 55 | 127 | │ ├── api_cache.py # Disk-based response cache |
| 56 | 128 | │ └── prompt_templates.py # LLM prompt management |
| 129 | +├── auth.py # OAuth flow management | |
| 130 | +├── exchange.py # Exchange format schema | |
| 57 | 131 | ├── models.py # Pydantic data models |
| 58 | 132 | ├── output_structure.py # Directory layout + manifest I/O |
| 59 | 133 | └── pipeline.py # Core processing pipeline |
| 60 | 134 | ``` |
| 61 | 135 | |
| 62 | 136 | ## Key design decisions |
| 63 | 137 | |
| 64 | 138 | - **Pydantic everywhere** — All structured data uses pydantic models for validation and serialization |
| 65 | 139 | - **Manifest-driven** — Every run produces `manifest.json` as the single source of truth |
| 66 | -- **Provider abstraction** — Single `ProviderManager` wraps OpenAI, Anthropic, Gemini, and Ollama behind a common interface | |
| 140 | +- **Provider abstraction** — Single `ProviderManager` wraps OpenAI, Anthropic, Gemini, Ollama, and additional providers behind a common interface | |
| 67 | 141 | - **No hardcoded models** — Model lists come from API discovery |
| 68 | 142 | - **Screengrab fallback** — When extraction fails, save the frame as a captioned screenshot |
| 143 | +- **OAuth-first auth** — All cloud service integrations use OAuth via `planopticon auth`, with credentials stored locally. Service account keys are supported as a fallback for server-side automation | |
| 144 | +- **Skill registry** — The planning agent discovers and invokes skills dynamically. Skills are self-describing and can be composed by the agent to accomplish complex tasks | |
| 145 | +- **Exchange format** — A portable JSON format (`exchange.py`) for importing and exporting knowledge graphs between PlanOpticon instances and external tools | |
| 69 | 146 |
| --- docs/architecture/overview.md | |
| +++ docs/architecture/overview.md | |
| @@ -2,67 +2,144 @@ | |
| 2 | |
| 3 | ## System diagram |
| 4 | |
| 5 | ```mermaid |
| 6 | graph TD |
| 7 | A[Video Input] --> B[Frame Extractor] |
| 8 | A --> C[Audio Extractor] |
| 9 | B --> D[Diagram Analyzer] |
| 10 | C --> E[Transcription] |
| 11 | D --> F[Knowledge Graph] |
| 12 | E --> F |
| 13 | E --> G[Key Point Extractor] |
| 14 | E --> H[Action Item Detector] |
| 15 | D --> I[Content Analyzer] |
| 16 | E --> I |
| 17 | F --> J[Plan Generator] |
| 18 | G --> J |
| 19 | H --> J |
| 20 | I --> J |
| 21 | J --> K[Markdown Report] |
| 22 | J --> L[HTML Report] |
| 23 | J --> M[PDF Report] |
| 24 | D --> N[Mermaid/SVG/PNG Export] |
| 25 | ``` |
| 26 | |
| 27 | ## Module structure |
| 28 | |
| 29 | ``` |
| 30 | video_processor/ |
| 31 | ├── cli/ # CLI commands (Click) |
| 32 | │ └── commands.py |
| 33 | ├── extractors/ # Media extraction |
| 34 | │ ├── frame_extractor.py # Video → frames |
| 35 | │ └── audio_extractor.py # Video → WAV |
| 36 | ├── analyzers/ # AI-powered analysis |
| 37 | │ ├── diagram_analyzer.py # Frame classification + extraction |
| 38 | │ ├── content_analyzer.py # Cross-referencing |
| 39 | │ └── action_detector.py # Action item detection |
| 40 | ├── integrators/ # Knowledge assembly |
| 41 | │ ├── knowledge_graph.py # Entity/relationship graph |
| 42 | │ └── plan_generator.py # Report generation |
| 43 | ├── providers/ # AI provider abstraction |
| 44 | │ ├── base.py # BaseProvider ABC |
| 45 | │ ├── openai_provider.py |
| 46 | │ ├── anthropic_provider.py |
| 47 | │ ├── gemini_provider.py |
| 48 | │ ├── ollama_provider.py # Local Ollama (offline) |
| 49 | │ ├── discovery.py # Auto-model-discovery |
| 50 | │ └── manager.py # ProviderManager routing |
| 51 | ├── utils/ |
| 52 | │ ├── json_parsing.py # Robust LLM JSON parsing |
| 53 | │ ├── rendering.py # Mermaid + chart rendering |
| 54 | │ ├── export.py # HTML/PDF export |
| 55 | │ ├── api_cache.py # Disk-based response cache |
| 56 | │ └── prompt_templates.py # LLM prompt management |
| 57 | ├── models.py # Pydantic data models |
| 58 | ├── output_structure.py # Directory layout + manifest I/O |
| 59 | └── pipeline.py # Core processing pipeline |
| 60 | ``` |
| 61 | |
| 62 | ## Key design decisions |
| 63 | |
| 64 | - **Pydantic everywhere** — All structured data uses pydantic models for validation and serialization |
| 65 | - **Manifest-driven** — Every run produces `manifest.json` as the single source of truth |
| 66 | - **Provider abstraction** — Single `ProviderManager` wraps OpenAI, Anthropic, Gemini, and Ollama behind a common interface |
| 67 | - **No hardcoded models** — Model lists come from API discovery |
| 68 | - **Screengrab fallback** — When extraction fails, save the frame as a captioned screenshot |
| 69 |
| --- docs/architecture/overview.md | |
| +++ docs/architecture/overview.md | |
| @@ -2,67 +2,144 @@ | |
| 2 | |
| 3 | ## System diagram |
| 4 | |
| 5 | ```mermaid |
| 6 | graph TD |
| 7 | subgraph Sources |
| 8 | S1[Video Files] |
| 9 | S2[Google Workspace] |
| 10 | S3[Microsoft 365] |
| 11 | S4[Zoom / Teams / Meet] |
| 12 | S5[YouTube] |
| 13 | S6[Notes — Obsidian / Notion / Apple Notes] |
| 14 | S7[GitHub] |
| 15 | end |
| 16 | |
| 17 | subgraph Source Connectors |
| 18 | SC[Source Connectors + OAuth] |
| 19 | end |
| 20 | |
| 21 | S1 --> SC |
| 22 | S2 --> SC |
| 23 | S3 --> SC |
| 24 | S4 --> SC |
| 25 | S5 --> SC |
| 26 | S6 --> SC |
| 27 | S7 --> SC |
| 28 | |
| 29 | SC --> A[Ingest / Analyze Pipeline] |
| 30 | |
| 31 | A --> B[Frame Extractor] |
| 32 | A --> C[Audio Extractor] |
| 33 | B --> D[Diagram Analyzer] |
| 34 | C --> E[Transcription] |
| 35 | D --> F[Knowledge Graph] |
| 36 | E --> F |
| 37 | E --> G[Key Point Extractor] |
| 38 | E --> H[Action Item Detector] |
| 39 | D --> I[Content Analyzer] |
| 40 | E --> I |
| 41 | |
| 42 | subgraph Agent & Skills |
| 43 | AG[Planning Agent] |
| 44 | SK[Skill Registry] |
| 45 | CO[Companion REPL] |
| 46 | end |
| 47 | |
| 48 | F --> AG |
| 49 | G --> AG |
| 50 | H --> AG |
| 51 | I --> AG |
| 52 | AG --> SK |
| 53 | F --> CO |
| 54 | |
| 55 | F --> J[Plan Generator] |
| 56 | G --> J |
| 57 | H --> J |
| 58 | I --> J |
| 59 | |
| 60 | subgraph Output & Export |
| 61 | J --> K[Markdown Report] |
| 62 | J --> L[HTML Report] |
| 63 | J --> M[PDF Report] |
| 64 | D --> N[Mermaid/SVG/PNG Export] |
| 65 | EX[Exporters — Obsidian / Notion / Exchange / Wiki] |
| 66 | end |
| 67 | |
| 68 | AG --> EX |
| 69 | F --> EX |
| 70 | ``` |
| 71 | |
| 72 | ## Module structure |
| 73 | |
| 74 | ``` |
| 75 | video_processor/ |
| 76 | ├── cli/ # CLI commands (Click) |
| 77 | │ └── commands.py |
| 78 | ├── sources/ # Source connectors |
| 79 | │ ├── gdrive.py # Google Drive |
| 80 | │ ├── gws.py # Google Workspace (Docs, Sheets, Slides, Meet) |
| 81 | │ ├── m365.py # Microsoft 365 (OneDrive, SharePoint, Teams) |
| 82 | │ ├── dropbox.py # Dropbox |
| 83 | │ ├── zoom.py # Zoom recordings |
| 84 | │ ├── youtube.py # YouTube videos |
| 85 | │ ├── notion.py # Notion pages |
| 86 | │ ├── github.py # GitHub repos / wikis |
| 87 | │ ├── obsidian.py # Obsidian vaults |
| 88 | │ └── apple_notes.py # Apple Notes (macOS) |
| 89 | ├── extractors/ # Media extraction |
| 90 | │ ├── frame_extractor.py # Video → frames |
| 91 | │ └── audio_extractor.py # Video → WAV |
| 92 | ├── analyzers/ # AI-powered analysis |
| 93 | │ ├── diagram_analyzer.py # Frame classification + extraction |
| 94 | │ ├── content_analyzer.py # Cross-referencing |
| 95 | │ └── action_detector.py # Action item detection |
| 96 | ├── integrators/ # Knowledge assembly |
| 97 | │ ├── knowledge_graph.py # Entity/relationship graph |
| 98 | │ ├── graph_query.py # Query engine |
| 99 | │ └── plan_generator.py # Report generation |
| 100 | ├── agent/ # Planning agent |
| 101 | │ ├── agent_loop.py # Agent loop |
| 102 | │ ├── skills/ # Skill registry |
| 103 | │ └── companion.py # Companion REPL |
| 104 | ├── exporters/ # Export formats |
| 105 | │ ├── markdown.py # Markdown export |
| 106 | │ ├── obsidian.py # Obsidian vault export |
| 107 | │ ├── notion.py # Notion export |
| 108 | │ ├── wiki.py # Wiki generation + push |
| 109 | │ └── exchange.py # PlanOpticon Exchange Format |
| 110 | ├── providers/ # AI provider abstraction |
| 111 | │ ├── base.py # BaseProvider ABC |
| 112 | │ ├── openai_provider.py |
| 113 | │ ├── anthropic_provider.py |
| 114 | │ ├── gemini_provider.py |
| 115 | │ ├── ollama_provider.py # Local Ollama (offline) |
| 116 | │ ├── azure_provider.py # Azure OpenAI |
| 117 | │ ├── together_provider.py |
| 118 | │ ├── fireworks_provider.py |
| 119 | │ ├── cerebras_provider.py |
| 120 | │ ├── xai_provider.py # xAI / Grok |
| 121 | │ ├── discovery.py # Auto-model-discovery |
| 122 | │ └── manager.py # ProviderManager routing |
| 123 | ├── utils/ |
| 124 | │ ├── json_parsing.py # Robust LLM JSON parsing |
| 125 | │ ├── rendering.py # Mermaid + chart rendering |
| 126 | │ ├── export.py # HTML/PDF export |
| 127 | │ ├── api_cache.py # Disk-based response cache |
| 128 | │ └── prompt_templates.py # LLM prompt management |
| 129 | ├── auth.py # OAuth flow management |
| 130 | ├── exchange.py # Exchange format schema |
| 131 | ├── models.py # Pydantic data models |
| 132 | ├── output_structure.py # Directory layout + manifest I/O |
| 133 | └── pipeline.py # Core processing pipeline |
| 134 | ``` |
| 135 | |
| 136 | ## Key design decisions |
| 137 | |
| 138 | - **Pydantic everywhere** — All structured data uses pydantic models for validation and serialization |
| 139 | - **Manifest-driven** — Every run produces `manifest.json` as the single source of truth |
| 140 | - **Provider abstraction** — Single `ProviderManager` wraps OpenAI, Anthropic, Gemini, Ollama, and additional providers behind a common interface |
| 141 | - **No hardcoded models** — Model lists come from API discovery |
| 142 | - **Screengrab fallback** — When extraction fails, save the frame as a captioned screenshot |
| 143 | - **OAuth-first auth** — All cloud service integrations use OAuth via `planopticon auth`, with credentials stored locally. Service account keys are supported as a fallback for server-side automation |
| 144 | - **Skill registry** — The planning agent discovers and invokes skills dynamically. Skills are self-describing and can be composed by the agent to accomplish complex tasks |
| 145 | - **Exchange format** — A portable JSON format (`exchange.py`) for importing and exporting knowledge graphs between PlanOpticon instances and external tools |
| 146 |
+41
-12
| --- docs/architecture/providers.md | ||
| +++ docs/architecture/providers.md | ||
| @@ -1,19 +1,36 @@ | ||
| 1 | 1 | # Provider System |
| 2 | 2 | |
| 3 | 3 | ## Overview |
| 4 | 4 | |
| 5 | -PlanOpticon supports multiple AI providers through a unified abstraction layer. | |
| 5 | +PlanOpticon supports multiple AI providers through a unified abstraction layer. Default models favor cost-effective options (Haiku, GPT-4o-mini, Gemini Flash) for routine tasks, with more capable models available when needed. | |
| 6 | 6 | |
| 7 | 7 | ## Supported providers |
| 8 | 8 | |
| 9 | -| Provider | Chat | Vision | Transcription | | |
| 10 | -|----------|------|--------|--------------| | |
| 11 | -| OpenAI | GPT-4o, GPT-4 | GPT-4o | Whisper-1 | | |
| 12 | -| Anthropic | Claude Sonnet/Opus | Claude Sonnet/Opus | — | | |
| 13 | -| Google Gemini | Gemini Flash/Pro | Gemini Flash/Pro | Gemini Flash | | |
| 14 | -| Ollama (local) | Any installed model | llava, moondream, etc. | — (use local Whisper) | | |
| 9 | +| Provider | Chat | Vision | Transcription | Env Variable | | |
| 10 | +|----------|------|--------|--------------|--------------| | |
| 11 | +| OpenAI | GPT-4o-mini, GPT-4o | GPT-4o-mini, GPT-4o | Whisper-1 | `OPENAI_API_KEY` | | |
| 12 | +| Anthropic | Claude Haiku, Sonnet, Opus | Claude Haiku, Sonnet, Opus | — | `ANTHROPIC_API_KEY` | | |
| 13 | +| Google Gemini | Gemini Flash, Pro | Gemini Flash, Pro | Gemini Flash | `GEMINI_API_KEY` | | |
| 14 | +| Azure OpenAI | GPT-4o-mini, GPT-4o | GPT-4o-mini, GPT-4o | Whisper-1 | `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_ENDPOINT` | | |
| 15 | +| Together AI | Llama, Mixtral, etc. | Llava | — | `TOGETHER_API_KEY` | | |
| 16 | +| Fireworks AI | Llama, Mixtral, etc. | Llava | — | `FIREWORKS_API_KEY` | | |
| 17 | +| Cerebras | Llama (fast inference) | — | — | `CEREBRAS_API_KEY` | | |
| 18 | +| xAI | Grok | Grok | — | `XAI_API_KEY` | | |
| 19 | +| Ollama (local) | Any installed model | llava, moondream, etc. | — (use local Whisper) | `OLLAMA_HOST` | | |
| 20 | + | |
| 21 | +## Default models | |
| 22 | + | |
| 23 | +PlanOpticon defaults to cheap, fast models for cost efficiency: | |
| 24 | + | |
| 25 | +| Task | Default model | | |
| 26 | +|------|--------------| | |
| 27 | +| Vision (diagrams) | Gemini Flash | | |
| 28 | +| Chat (analysis) | Claude Haiku | | |
| 29 | +| Transcription | Local Whisper (fallback: Whisper-1) | | |
| 30 | + | |
| 31 | +Use `--vision-model` and `--chat-model` to override with more capable models when needed (e.g., `--chat-model claude-sonnet-4-20250514` for complex analysis). | |
| 15 | 32 | |
| 16 | 33 | ## Ollama (offline mode) |
| 17 | 34 | |
| 18 | 35 | [Ollama](https://ollama.com) enables fully offline operation with no API keys required. PlanOpticon connects via Ollama's OpenAI-compatible API. |
| 19 | 36 | |
| @@ -51,31 +68,43 @@ | ||
| 51 | 68 | # Automatically discovers models from all configured providers + Ollama |
| 52 | 69 | ``` |
| 53 | 70 | |
| 54 | 71 | ## Routing preferences |
| 55 | 72 | |
| 56 | -Each task type has a default preference order: | |
| 73 | +Each task type has a default preference order (cheapest first): | |
| 57 | 74 | |
| 58 | 75 | | Task | Preference | |
| 59 | 76 | |------|-----------| |
| 60 | -| Vision | Gemini Flash → GPT-4o → Claude Sonnet → Ollama | | |
| 61 | -| Chat | Claude Sonnet → GPT-4o → Gemini Flash → Ollama | | |
| 77 | +| Vision | Gemini Flash → GPT-4o-mini → Claude Haiku → Ollama | | |
| 78 | +| Chat | Claude Haiku → GPT-4o-mini → Gemini Flash → Ollama | | |
| 62 | 79 | | Transcription | Local Whisper → Whisper-1 → Gemini Flash | |
| 63 | 80 | |
| 64 | -Ollama acts as the last-resort fallback — if no cloud API keys are set but Ollama is running, it is used automatically. | |
| 81 | +Ollama acts as the last-resort fallback -- if no cloud API keys are set but Ollama is running, it is used automatically. | |
| 65 | 82 | |
| 66 | 83 | ## Manual override |
| 67 | 84 | |
| 68 | 85 | ```python |
| 69 | 86 | pm = ProviderManager( |
| 70 | 87 | vision_model="gpt-4o", |
| 71 | - chat_model="claude-sonnet-4-5-20250929", | |
| 88 | + chat_model="claude-sonnet-4-20250514", | |
| 72 | 89 | provider="openai", # Force a specific provider |
| 73 | 90 | ) |
| 91 | + | |
| 92 | +# Use a cheap model for bulk processing | |
| 93 | +pm = ProviderManager( | |
| 94 | + chat_model="claude-3-5-haiku-20241022", | |
| 95 | + vision_model="gemini-2.0-flash", | |
| 96 | +) | |
| 74 | 97 | |
| 75 | 98 | # Or use Ollama for fully offline processing |
| 76 | 99 | pm = ProviderManager(provider="ollama") |
| 100 | + | |
| 101 | +# Use Azure OpenAI | |
| 102 | +pm = ProviderManager(provider="azure") | |
| 103 | + | |
| 104 | +# Use Together AI for open-source models | |
| 105 | +pm = ProviderManager(provider="together", chat_model="meta-llama/Llama-3.3-70B-Instruct-Turbo") | |
| 77 | 106 | ``` |
| 78 | 107 | |
| 79 | 108 | ## BaseProvider interface |
| 80 | 109 | |
| 81 | 110 | All providers implement: |
| 82 | 111 |
| --- docs/architecture/providers.md | |
| +++ docs/architecture/providers.md | |
| @@ -1,19 +1,36 @@ | |
| 1 | # Provider System |
| 2 | |
| 3 | ## Overview |
| 4 | |
| 5 | PlanOpticon supports multiple AI providers through a unified abstraction layer. |
| 6 | |
| 7 | ## Supported providers |
| 8 | |
| 9 | | Provider | Chat | Vision | Transcription | |
| 10 | |----------|------|--------|--------------| |
| 11 | | OpenAI | GPT-4o, GPT-4 | GPT-4o | Whisper-1 | |
| 12 | | Anthropic | Claude Sonnet/Opus | Claude Sonnet/Opus | — | |
| 13 | | Google Gemini | Gemini Flash/Pro | Gemini Flash/Pro | Gemini Flash | |
| 14 | | Ollama (local) | Any installed model | llava, moondream, etc. | — (use local Whisper) | |
| 15 | |
| 16 | ## Ollama (offline mode) |
| 17 | |
| 18 | [Ollama](https://ollama.com) enables fully offline operation with no API keys required. PlanOpticon connects via Ollama's OpenAI-compatible API. |
| 19 | |
| @@ -51,31 +68,43 @@ | |
| 51 | # Automatically discovers models from all configured providers + Ollama |
| 52 | ``` |
| 53 | |
| 54 | ## Routing preferences |
| 55 | |
| 56 | Each task type has a default preference order: |
| 57 | |
| 58 | | Task | Preference | |
| 59 | |------|-----------| |
| 60 | | Vision | Gemini Flash → GPT-4o → Claude Sonnet → Ollama | |
| 61 | | Chat | Claude Sonnet → GPT-4o → Gemini Flash → Ollama | |
| 62 | | Transcription | Local Whisper → Whisper-1 → Gemini Flash | |
| 63 | |
| 64 | Ollama acts as the last-resort fallback — if no cloud API keys are set but Ollama is running, it is used automatically. |
| 65 | |
| 66 | ## Manual override |
| 67 | |
| 68 | ```python |
| 69 | pm = ProviderManager( |
| 70 | vision_model="gpt-4o", |
| 71 | chat_model="claude-sonnet-4-5-20250929", |
| 72 | provider="openai", # Force a specific provider |
| 73 | ) |
| 74 | |
| 75 | # Or use Ollama for fully offline processing |
| 76 | pm = ProviderManager(provider="ollama") |
| 77 | ``` |
| 78 | |
| 79 | ## BaseProvider interface |
| 80 | |
| 81 | All providers implement: |
| 82 |
| --- docs/architecture/providers.md | |
| +++ docs/architecture/providers.md | |
| @@ -1,19 +1,36 @@ | |
| 1 | # Provider System |
| 2 | |
| 3 | ## Overview |
| 4 | |
| 5 | PlanOpticon supports multiple AI providers through a unified abstraction layer. Default models favor cost-effective options (Haiku, GPT-4o-mini, Gemini Flash) for routine tasks, with more capable models available when needed. |
| 6 | |
| 7 | ## Supported providers |
| 8 | |
| 9 | | Provider | Chat | Vision | Transcription | Env Variable | |
| 10 | |----------|------|--------|--------------|--------------| |
| 11 | | OpenAI | GPT-4o-mini, GPT-4o | GPT-4o-mini, GPT-4o | Whisper-1 | `OPENAI_API_KEY` | |
| 12 | | Anthropic | Claude Haiku, Sonnet, Opus | Claude Haiku, Sonnet, Opus | — | `ANTHROPIC_API_KEY` | |
| 13 | | Google Gemini | Gemini Flash, Pro | Gemini Flash, Pro | Gemini Flash | `GEMINI_API_KEY` | |
| 14 | | Azure OpenAI | GPT-4o-mini, GPT-4o | GPT-4o-mini, GPT-4o | Whisper-1 | `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_ENDPOINT` | |
| 15 | | Together AI | Llama, Mixtral, etc. | Llava | — | `TOGETHER_API_KEY` | |
| 16 | | Fireworks AI | Llama, Mixtral, etc. | Llava | — | `FIREWORKS_API_KEY` | |
| 17 | | Cerebras | Llama (fast inference) | — | — | `CEREBRAS_API_KEY` | |
| 18 | | xAI | Grok | Grok | — | `XAI_API_KEY` | |
| 19 | | Ollama (local) | Any installed model | llava, moondream, etc. | — (use local Whisper) | `OLLAMA_HOST` | |
| 20 | |
| 21 | ## Default models |
| 22 | |
| 23 | PlanOpticon defaults to cheap, fast models for cost efficiency: |
| 24 | |
| 25 | | Task | Default model | |
| 26 | |------|--------------| |
| 27 | | Vision (diagrams) | Gemini Flash | |
| 28 | | Chat (analysis) | Claude Haiku | |
| 29 | | Transcription | Local Whisper (fallback: Whisper-1) | |
| 30 | |
| 31 | Use `--vision-model` and `--chat-model` to override with more capable models when needed (e.g., `--chat-model claude-sonnet-4-20250514` for complex analysis). |
| 32 | |
| 33 | ## Ollama (offline mode) |
| 34 | |
| 35 | [Ollama](https://ollama.com) enables fully offline operation with no API keys required. PlanOpticon connects via Ollama's OpenAI-compatible API. |
| 36 | |
| @@ -51,31 +68,43 @@ | |
| 68 | # Automatically discovers models from all configured providers + Ollama |
| 69 | ``` |
| 70 | |
| 71 | ## Routing preferences |
| 72 | |
| 73 | Each task type has a default preference order (cheapest first): |
| 74 | |
| 75 | | Task | Preference | |
| 76 | |------|-----------| |
| 77 | | Vision | Gemini Flash → GPT-4o-mini → Claude Haiku → Ollama | |
| 78 | | Chat | Claude Haiku → GPT-4o-mini → Gemini Flash → Ollama | |
| 79 | | Transcription | Local Whisper → Whisper-1 → Gemini Flash | |
| 80 | |
| 81 | Ollama acts as the last-resort fallback -- if no cloud API keys are set but Ollama is running, it is used automatically. |
| 82 | |
| 83 | ## Manual override |
| 84 | |
| 85 | ```python |
| 86 | pm = ProviderManager( |
| 87 | vision_model="gpt-4o", |
| 88 | chat_model="claude-sonnet-4-20250514", |
| 89 | provider="openai", # Force a specific provider |
| 90 | ) |
| 91 | |
| 92 | # Use a cheap model for bulk processing |
| 93 | pm = ProviderManager( |
| 94 | chat_model="claude-3-5-haiku-20241022", |
| 95 | vision_model="gemini-2.0-flash", |
| 96 | ) |
| 97 | |
| 98 | # Or use Ollama for fully offline processing |
| 99 | pm = ProviderManager(provider="ollama") |
| 100 | |
| 101 | # Use Azure OpenAI |
| 102 | pm = ProviderManager(provider="azure") |
| 103 | |
| 104 | # Use Together AI for open-source models |
| 105 | pm = ProviderManager(provider="together", chat_model="meta-llama/Llama-3.3-70B-Instruct-Turbo") |
| 106 | ``` |
| 107 | |
| 108 | ## BaseProvider interface |
| 109 | |
| 110 | All providers implement: |
| 111 |
+561
-11
| --- docs/cli-reference.md | ||
| +++ docs/cli-reference.md | ||
| @@ -1,6 +1,20 @@ | ||
| 1 | 1 | # CLI Reference |
| 2 | + | |
| 3 | +## Global options | |
| 4 | + | |
| 5 | +These options are available on all commands. | |
| 6 | + | |
| 7 | +| Option | Description | | |
| 8 | +|--------|-------------| | |
| 9 | +| `-v`, `--verbose` | Enable debug-level logging | | |
| 10 | +| `-C`, `--chat` | Enable chat mode (interactive follow-up after command completes) | | |
| 11 | +| `-I`, `--interactive` | Enable interactive REPL mode | | |
| 12 | +| `--version` | Show version and exit | | |
| 13 | +| `--help` | Show help and exit | | |
| 14 | + | |
| 15 | +--- | |
| 2 | 16 | |
| 3 | 17 | ## `planopticon analyze` |
| 4 | 18 | |
| 5 | 19 | Analyze a single video and extract structured knowledge. |
| 6 | 20 | |
| @@ -95,39 +109,575 @@ | ||
| 95 | 109 | | `-p`, `--provider` | `auto\|openai\|anthropic\|gemini\|ollama` | `auto` | API provider | |
| 96 | 110 | | `--vision-model` | TEXT | auto | Override vision model | |
| 97 | 111 | | `--chat-model` | TEXT | auto | Override chat model | |
| 98 | 112 | |
| 99 | 113 | --- |
| 114 | + | |
| 115 | +## `planopticon companion` | |
| 116 | + | |
| 117 | +Interactive knowledge base companion. Opens a REPL for conversational exploration of your knowledge base. | |
| 118 | + | |
| 119 | +```bash | |
| 120 | +planopticon companion [OPTIONS] | |
| 121 | +``` | |
| 122 | + | |
| 123 | +| Option | Type | Default | Description | | |
| 124 | +|--------|------|---------|-------------| | |
| 125 | +| `--kb` | PATH | auto-detect | Path to knowledge base directory | | |
| 126 | +| `-p`, `--provider` | TEXT | `auto` | AI provider | | |
| 127 | +| `--chat-model` | TEXT | auto | Override chat model | | |
| 128 | + | |
| 129 | +**Examples:** | |
| 130 | + | |
| 131 | +```bash | |
| 132 | +# Start companion with auto-detected knowledge base | |
| 133 | +planopticon companion | |
| 134 | + | |
| 135 | +# Point to a specific knowledge base | |
| 136 | +planopticon companion --kb ./my-kb | |
| 137 | + | |
| 138 | +# Use a specific provider | |
| 139 | +planopticon companion --kb ./kb --provider anthropic --chat-model claude-sonnet-4-20250514 | |
| 140 | +``` | |
| 141 | + | |
| 142 | +--- | |
| 143 | + | |
| 144 | +## `planopticon agent` | |
| 145 | + | |
| 146 | +Planning agent with adaptive analysis. Runs an agentic loop that reasons about your knowledge base, plans actions, and executes them. | |
| 147 | + | |
| 148 | +```bash | |
| 149 | +planopticon agent [OPTIONS] | |
| 150 | +``` | |
| 151 | + | |
| 152 | +| Option | Type | Default | Description | | |
| 153 | +|--------|------|---------|-------------| | |
| 154 | +| `--kb` | PATH | auto-detect | Path to knowledge base directory | | |
| 155 | +| `-I`, `--interactive` | FLAG | off | Interactive mode (ask before each action) | | |
| 156 | +| `--export` | PATH | none | Export agent results to a file | | |
| 157 | +| `-p`, `--provider` | TEXT | `auto` | AI provider | | |
| 158 | +| `--chat-model` | TEXT | auto | Override chat model | | |
| 159 | + | |
| 160 | +**Examples:** | |
| 161 | + | |
| 162 | +```bash | |
| 163 | +# Run the agent interactively | |
| 164 | +planopticon agent --kb ./kb --interactive | |
| 165 | + | |
| 166 | +# Run agent and export results | |
| 167 | +planopticon agent --kb ./kb --export ./plan.md | |
| 168 | + | |
| 169 | +# Use a specific model | |
| 170 | +planopticon agent --kb ./kb --provider openai --chat-model gpt-4o | |
| 171 | +``` | |
| 172 | + | |
| 173 | +--- | |
| 174 | + | |
| 175 | +## `planopticon query` | |
| 176 | + | |
| 177 | +Query the knowledge graph directly or with natural language. | |
| 178 | + | |
| 179 | +```bash | |
| 180 | +planopticon query [OPTIONS] [QUERY] | |
| 181 | +``` | |
| 182 | + | |
| 183 | +| Option | Type | Default | Description | | |
| 184 | +|--------|------|---------|-------------| | |
| 185 | +| `--db-path` | PATH | auto-detect | Path to knowledge graph database | | |
| 186 | +| `--mode` | `direct\|agentic` | auto | Query mode (direct for structured, agentic for natural language) | | |
| 187 | +| `--format` | `text\|json\|mermaid` | `text` | Output format | | |
| 188 | +| `-I`, `--interactive` | FLAG | off | Interactive REPL mode | | |
| 189 | + | |
| 190 | +**Examples:** | |
| 191 | + | |
| 192 | +```bash | |
| 193 | +# Show graph stats | |
| 194 | +planopticon query stats | |
| 195 | + | |
| 196 | +# List entities by type | |
| 197 | +planopticon query "entities --type technology" | |
| 198 | +planopticon query "entities --type person" | |
| 199 | + | |
| 200 | +# Find neighbors of an entity | |
| 201 | +planopticon query "neighbors Alice" | |
| 202 | + | |
| 203 | +# List relationships | |
| 204 | +planopticon query "relationships --source Alice" | |
| 205 | + | |
| 206 | +# Natural language query (requires API key) | |
| 207 | +planopticon query "What technologies were discussed?" | |
| 208 | + | |
| 209 | +# Output as Mermaid diagram | |
| 210 | +planopticon query --format mermaid "neighbors ProjectX" | |
| 211 | + | |
| 212 | +# Output as JSON | |
| 213 | +planopticon query --format json stats | |
| 214 | + | |
| 215 | +# Interactive REPL | |
| 216 | +planopticon query -I | |
| 217 | +``` | |
| 218 | + | |
| 219 | +--- | |
| 220 | + | |
| 221 | +## `planopticon ingest` | |
| 222 | + | |
| 223 | +Ingest documents and files into a knowledge graph. | |
| 224 | + | |
| 225 | +```bash | |
| 226 | +planopticon ingest [OPTIONS] INPUT | |
| 227 | +``` | |
| 228 | + | |
| 229 | +| Option | Type | Default | Description | | |
| 230 | +|--------|------|---------|-------------| | |
| 231 | +| `--output` | PATH | `./knowledge-base` | Output directory for the knowledge base | | |
| 232 | +| `--db-path` | PATH | auto | Path to existing knowledge graph database to merge into | | |
| 233 | +| `--recursive` | FLAG | off | Recursively process directories | | |
| 234 | +| `-p`, `--provider` | TEXT | `auto` | AI provider | | |
| 235 | + | |
| 236 | +**Examples:** | |
| 237 | + | |
| 238 | +```bash | |
| 239 | +# Ingest a single file | |
| 240 | +planopticon ingest ./meeting-notes.md --output ./kb | |
| 241 | + | |
| 242 | +# Ingest a directory recursively | |
| 243 | +planopticon ingest ./docs/ --output ./kb --recursive | |
| 244 | + | |
| 245 | +# Merge into an existing knowledge graph | |
| 246 | +planopticon ingest ./new-notes/ --db-path ./kb/knowledge_graph.db --recursive | |
| 247 | +``` | |
| 248 | + | |
| 249 | +--- | |
| 100 | 250 | |
| 101 | 251 | ## `planopticon auth` |
| 102 | 252 | |
| 103 | -Authenticate with a cloud storage service for batch processing. | |
| 253 | +Authenticate with cloud services via OAuth or API keys. | |
| 104 | 254 | |
| 105 | 255 | ```bash |
| 106 | -planopticon auth SERVICE | |
| 256 | +planopticon auth SERVICE [OPTIONS] | |
| 107 | 257 | ``` |
| 108 | 258 | |
| 109 | 259 | | Argument | Values | Description | |
| 110 | 260 | |----------|--------|-------------| |
| 111 | -| `SERVICE` | `google\|dropbox` | Cloud service to authenticate with | | |
| 261 | +| `SERVICE` | `google\|dropbox\|zoom\|notion\|github\|microsoft` | Cloud service to authenticate with | | |
| 262 | + | |
| 263 | +| Option | Type | Default | Description | | |
| 264 | +|--------|------|---------|-------------| | |
| 265 | +| `--logout` | FLAG | off | Remove stored credentials for the service | | |
| 112 | 266 | |
| 113 | 267 | **Examples:** |
| 114 | 268 | |
| 115 | 269 | ```bash |
| 116 | -# Authenticate with Google Drive (interactive OAuth2) | |
| 270 | +# Authenticate with Google (Drive, Meet, YouTube, etc.) | |
| 117 | 271 | planopticon auth google |
| 118 | 272 | |
| 119 | 273 | # Authenticate with Dropbox |
| 120 | 274 | planopticon auth dropbox |
| 275 | + | |
| 276 | +# Authenticate with Zoom (for recording access) | |
| 277 | +planopticon auth zoom | |
| 278 | + | |
| 279 | +# Authenticate with Notion | |
| 280 | +planopticon auth notion | |
| 281 | + | |
| 282 | +# Authenticate with GitHub | |
| 283 | +planopticon auth github | |
| 284 | + | |
| 285 | +# Authenticate with Microsoft 365 (OneDrive, Teams, etc.) | |
| 286 | +planopticon auth microsoft | |
| 287 | + | |
| 288 | +# Log out of a service | |
| 289 | +planopticon auth google --logout | |
| 290 | +``` | |
| 291 | + | |
| 292 | +--- | |
| 293 | + | |
| 294 | +## `planopticon gws` | |
| 295 | + | |
| 296 | +Google Workspace commands. List, fetch, and ingest content from Google Workspace (Drive, Docs, Sheets, Slides, Meet). | |
| 297 | + | |
| 298 | +### `planopticon gws list` | |
| 299 | + | |
| 300 | +List available files and recordings from Google Workspace. | |
| 301 | + | |
| 302 | +```bash | |
| 303 | +planopticon gws list [OPTIONS] | |
| 304 | +``` | |
| 305 | + | |
| 306 | +| Option | Type | Default | Description | | |
| 307 | +|--------|------|---------|-------------| | |
| 308 | +| `--type` | `drive\|docs\|sheets\|slides\|meet` | all | Filter by content type | | |
| 309 | +| `--folder-id` | TEXT | none | Google Drive folder ID | | |
| 310 | +| `--limit` | INT | 50 | Maximum results to return | | |
| 311 | + | |
| 312 | +### `planopticon gws fetch` | |
| 313 | + | |
| 314 | +Download content from Google Workspace. | |
| 315 | + | |
| 316 | +```bash | |
| 317 | +planopticon gws fetch [OPTIONS] RESOURCE_ID | |
| 318 | +``` | |
| 319 | + | |
| 320 | +| Option | Type | Default | Description | | |
| 321 | +|--------|------|---------|-------------| | |
| 322 | +| `--output` | PATH | `./downloads` | Output directory | | |
| 323 | +| `--format` | TEXT | auto | Export format (pdf, docx, etc.) | | |
| 324 | + | |
| 325 | +### `planopticon gws ingest` | |
| 326 | + | |
| 327 | +Ingest Google Workspace content directly into a knowledge graph. | |
| 328 | + | |
| 329 | +```bash | |
| 330 | +planopticon gws ingest [OPTIONS] | |
| 331 | +``` | |
| 332 | + | |
| 333 | +| Option | Type | Default | Description | | |
| 334 | +|--------|------|---------|-------------| | |
| 335 | +| `--folder-id` | TEXT | none | Google Drive folder ID | | |
| 336 | +| `--output` | PATH | `./knowledge-base` | Knowledge base output directory | | |
| 337 | +| `--recursive` | FLAG | off | Recurse into subfolders | | |
| 338 | + | |
| 339 | +**Examples:** | |
| 340 | + | |
| 341 | +```bash | |
| 342 | +# List all Google Workspace files | |
| 343 | +planopticon gws list | |
| 344 | + | |
| 345 | +# List only Google Docs | |
| 346 | +planopticon gws list --type docs | |
| 347 | + | |
| 348 | +# Fetch a specific file | |
| 349 | +planopticon gws fetch abc123def --output ./downloads | |
| 350 | + | |
| 351 | +# Ingest an entire Drive folder into a knowledge base | |
| 352 | +planopticon gws ingest --folder-id abc123 --output ./kb --recursive | |
| 353 | +``` | |
| 354 | + | |
| 355 | +--- | |
| 356 | + | |
| 357 | +## `planopticon m365` | |
| 358 | + | |
| 359 | +Microsoft 365 commands. List, fetch, and ingest content from Microsoft 365 (OneDrive, SharePoint, Teams, Outlook). | |
| 360 | + | |
| 361 | +### `planopticon m365 list` | |
| 362 | + | |
| 363 | +List available files and recordings from Microsoft 365. | |
| 364 | + | |
| 365 | +```bash | |
| 366 | +planopticon m365 list [OPTIONS] | |
| 367 | +``` | |
| 368 | + | |
| 369 | +| Option | Type | Default | Description | | |
| 370 | +|--------|------|---------|-------------| | |
| 371 | +| `--type` | `onedrive\|sharepoint\|teams\|outlook` | all | Filter by content type | | |
| 372 | +| `--site` | TEXT | none | SharePoint site name | | |
| 373 | +| `--limit` | INT | 50 | Maximum results to return | | |
| 374 | + | |
| 375 | +### `planopticon m365 fetch` | |
| 376 | + | |
| 377 | +Download content from Microsoft 365. | |
| 378 | + | |
| 379 | +```bash | |
| 380 | +planopticon m365 fetch [OPTIONS] RESOURCE_ID | |
| 381 | +``` | |
| 382 | + | |
| 383 | +| Option | Type | Default | Description | | |
| 384 | +|--------|------|---------|-------------| | |
| 385 | +| `--output` | PATH | `./downloads` | Output directory | | |
| 386 | + | |
| 387 | +### `planopticon m365 ingest` | |
| 388 | + | |
| 389 | +Ingest Microsoft 365 content directly into a knowledge graph. | |
| 390 | + | |
| 391 | +```bash | |
| 392 | +planopticon m365 ingest [OPTIONS] | |
| 393 | +``` | |
| 394 | + | |
| 395 | +| Option | Type | Default | Description | | |
| 396 | +|--------|------|---------|-------------| | |
| 397 | +| `--site` | TEXT | none | SharePoint site name | | |
| 398 | +| `--path` | TEXT | `/` | Folder path in OneDrive/SharePoint | | |
| 399 | +| `--output` | PATH | `./knowledge-base` | Knowledge base output directory | | |
| 400 | +| `--recursive` | FLAG | off | Recurse into subfolders | | |
| 401 | + | |
| 402 | +**Examples:** | |
| 403 | + | |
| 404 | +```bash | |
| 405 | +# List all Microsoft 365 content | |
| 406 | +planopticon m365 list | |
| 407 | + | |
| 408 | +# List only Teams recordings | |
| 409 | +planopticon m365 list --type teams | |
| 410 | + | |
| 411 | +# Fetch a specific file | |
| 412 | +planopticon m365 fetch item-id-123 --output ./downloads | |
| 413 | + | |
| 414 | +# Ingest SharePoint content | |
| 415 | +planopticon m365 ingest --site "Engineering" --path "/Shared Documents" --output ./kb --recursive | |
| 416 | +``` | |
| 417 | + | |
| 418 | +--- | |
| 419 | + | |
| 420 | +## `planopticon recordings` | |
| 421 | + | |
| 422 | +List meeting recordings from video conferencing platforms. | |
| 423 | + | |
| 424 | +### `planopticon recordings zoom-list` | |
| 425 | + | |
| 426 | +List Zoom cloud recordings. | |
| 427 | + | |
| 428 | +```bash | |
| 429 | +planopticon recordings zoom-list [OPTIONS] | |
| 430 | +``` | |
| 431 | + | |
| 432 | +| Option | Type | Default | Description | | |
| 433 | +|--------|------|---------|-------------| | |
| 434 | +| `--from` | DATE | 30 days ago | Start date (YYYY-MM-DD) | | |
| 435 | +| `--to` | DATE | today | End date (YYYY-MM-DD) | | |
| 436 | +| `--limit` | INT | 50 | Maximum results | | |
| 437 | + | |
| 438 | +### `planopticon recordings teams-list` | |
| 439 | + | |
| 440 | +List Microsoft Teams meeting recordings. | |
| 441 | + | |
| 442 | +```bash | |
| 443 | +planopticon recordings teams-list [OPTIONS] | |
| 444 | +``` | |
| 445 | + | |
| 446 | +| Option | Type | Default | Description | | |
| 447 | +|--------|------|---------|-------------| | |
| 448 | +| `--from` | DATE | 30 days ago | Start date (YYYY-MM-DD) | | |
| 449 | +| `--to` | DATE | today | End date (YYYY-MM-DD) | | |
| 450 | +| `--limit` | INT | 50 | Maximum results | | |
| 451 | + | |
| 452 | +### `planopticon recordings meet-list` | |
| 453 | + | |
| 454 | +List Google Meet recordings. | |
| 455 | + | |
| 456 | +```bash | |
| 457 | +planopticon recordings meet-list [OPTIONS] | |
| 458 | +``` | |
| 459 | + | |
| 460 | +| Option | Type | Default | Description | | |
| 461 | +|--------|------|---------|-------------| | |
| 462 | +| `--from` | DATE | 30 days ago | Start date (YYYY-MM-DD) | | |
| 463 | +| `--to` | DATE | today | End date (YYYY-MM-DD) | | |
| 464 | +| `--limit` | INT | 50 | Maximum results | | |
| 465 | + | |
| 466 | +**Examples:** | |
| 467 | + | |
| 468 | +```bash | |
| 469 | +# List recent Zoom recordings | |
| 470 | +planopticon recordings zoom-list | |
| 471 | + | |
| 472 | +# List Teams recordings from a specific date range | |
| 473 | +planopticon recordings teams-list --from 2026-01-01 --to 2026-02-01 | |
| 474 | + | |
| 475 | +# List Google Meet recordings | |
| 476 | +planopticon recordings meet-list --limit 10 | |
| 477 | +``` | |
| 478 | + | |
| 479 | +--- | |
| 480 | + | |
| 481 | +## `planopticon export` | |
| 482 | + | |
| 483 | +Export knowledge base content to various formats. | |
| 484 | + | |
| 485 | +### `planopticon export markdown` | |
| 486 | + | |
| 487 | +Export knowledge base as Markdown files. | |
| 488 | + | |
| 489 | +```bash | |
| 490 | +planopticon export markdown [OPTIONS] | |
| 491 | +``` | |
| 492 | + | |
| 493 | +| Option | Type | Default | Description | | |
| 494 | +|--------|------|---------|-------------| | |
| 495 | +| `--input` | PATH | auto-detect | Knowledge base path | | |
| 496 | +| `--output` | PATH | `./export` | Output directory | | |
| 497 | + | |
| 498 | +### `planopticon export obsidian` | |
| 499 | + | |
| 500 | +Export knowledge base as an Obsidian vault with wikilinks and graph metadata. | |
| 501 | + | |
| 502 | +```bash | |
| 503 | +planopticon export obsidian [OPTIONS] | |
| 504 | +``` | |
| 505 | + | |
| 506 | +| Option | Type | Default | Description | | |
| 507 | +|--------|------|---------|-------------| | |
| 508 | +| `--input` | PATH | auto-detect | Knowledge base path | | |
| 509 | +| `--output` | PATH | `./obsidian-vault` | Output vault directory | | |
| 510 | + | |
| 511 | +### `planopticon export notion` | |
| 512 | + | |
| 513 | +Export knowledge base to Notion. | |
| 514 | + | |
| 515 | +```bash | |
| 516 | +planopticon export notion [OPTIONS] | |
| 517 | +``` | |
| 518 | + | |
| 519 | +| Option | Type | Default | Description | | |
| 520 | +|--------|------|---------|-------------| | |
| 521 | +| `--input` | PATH | auto-detect | Knowledge base path | | |
| 522 | +| `--parent-page` | TEXT | none | Notion parent page ID | | |
| 523 | + | |
| 524 | +### `planopticon export exchange` | |
| 525 | + | |
| 526 | +Export knowledge base as PlanOpticon Exchange Format (JSON). | |
| 527 | + | |
| 528 | +```bash | |
| 529 | +planopticon export exchange [OPTIONS] | |
| 530 | +``` | |
| 531 | + | |
| 532 | +| Option | Type | Default | Description | | |
| 533 | +|--------|------|---------|-------------| | |
| 534 | +| `--input` | PATH | auto-detect | Knowledge base path | | |
| 535 | +| `--output` | PATH | `./exchange.json` | Output file path | | |
| 536 | + | |
| 537 | +**Examples:** | |
| 538 | + | |
| 539 | +```bash | |
| 540 | +# Export to Markdown | |
| 541 | +planopticon export markdown --input ./kb --output ./docs | |
| 542 | + | |
| 543 | +# Export to Obsidian vault | |
| 544 | +planopticon export obsidian --input ./kb --output ~/Obsidian/PlanOpticon | |
| 545 | + | |
| 546 | +# Export to Notion | |
| 547 | +planopticon export notion --input ./kb --parent-page abc123 | |
| 548 | + | |
| 549 | +# Export as exchange format for interoperability | |
| 550 | +planopticon export exchange --input ./kb --output ./export.json | |
| 551 | +``` | |
| 552 | + | |
| 553 | +--- | |
| 554 | + | |
| 555 | +## `planopticon wiki` | |
| 556 | + | |
| 557 | +Generate and publish wiki documentation from your knowledge base. | |
| 558 | + | |
| 559 | +### `planopticon wiki generate` | |
| 560 | + | |
| 561 | +Generate a static wiki site from the knowledge base. | |
| 562 | + | |
| 563 | +```bash | |
| 564 | +planopticon wiki generate [OPTIONS] | |
| 565 | +``` | |
| 566 | + | |
| 567 | +| Option | Type | Default | Description | | |
| 568 | +|--------|------|---------|-------------| | |
| 569 | +| `--input` | PATH | auto-detect | Knowledge base path | | |
| 570 | +| `--output` | PATH | `./wiki` | Output directory | | |
| 571 | + | |
| 572 | +### `planopticon wiki push` | |
| 573 | + | |
| 574 | +Push a generated wiki to a remote target (e.g., GitHub Wiki, Confluence). | |
| 575 | + | |
| 576 | +```bash | |
| 577 | +planopticon wiki push [OPTIONS] | |
| 121 | 578 | ``` |
| 122 | 579 | |
| 123 | -After authentication, use `planopticon batch --source gdrive` or `--source dropbox` to process cloud videos. | |
| 580 | +| Option | Type | Default | Description | | |
| 581 | +|--------|------|---------|-------------| | |
| 582 | +| `--input` | PATH | `./wiki` | Wiki directory to push | | |
| 583 | +| `--target` | TEXT | *required* | Push target (e.g., `github://org/repo`, `confluence://space`) | | |
| 584 | + | |
| 585 | +**Examples:** | |
| 586 | + | |
| 587 | +```bash | |
| 588 | +# Generate a wiki from the knowledge base | |
| 589 | +planopticon wiki generate --input ./kb --output ./wiki | |
| 590 | + | |
| 591 | +# Push wiki to GitHub | |
| 592 | +planopticon wiki push --input ./wiki --target "github://ConflictHQ/project-wiki" | |
| 593 | +``` | |
| 124 | 594 | |
| 125 | 595 | --- |
| 126 | 596 | |
| 127 | -## Global options | |
| 597 | +## `planopticon kg` | |
| 598 | + | |
| 599 | +Knowledge graph management commands. | |
| 600 | + | |
| 601 | +### `planopticon kg convert` | |
| 602 | + | |
| 603 | +Convert a knowledge graph between formats. | |
| 604 | + | |
| 605 | +```bash | |
| 606 | +planopticon kg convert [OPTIONS] | |
| 607 | +``` | |
| 608 | + | |
| 609 | +| Option | Type | Default | Description | | |
| 610 | +|--------|------|---------|-------------| | |
| 611 | +| `--input` | PATH | *required* | Input knowledge graph file | | |
| 612 | +| `--output` | PATH | *required* | Output file path | | |
| 613 | +| `--format` | `json\|db\|graphml\|csv` | auto (from extension) | Target format | | |
| 614 | + | |
| 615 | +### `planopticon kg sync` | |
| 616 | + | |
| 617 | +Synchronize two knowledge graphs (merge new data). | |
| 618 | + | |
| 619 | +```bash | |
| 620 | +planopticon kg sync [OPTIONS] | |
| 621 | +``` | |
| 622 | + | |
| 623 | +| Option | Type | Default | Description | | |
| 624 | +|--------|------|---------|-------------| | |
| 625 | +| `--source` | PATH | *required* | Source knowledge graph | | |
| 626 | +| `--target` | PATH | *required* | Target knowledge graph to merge into | | |
| 627 | + | |
| 628 | +### `planopticon kg inspect` | |
| 629 | + | |
| 630 | +Inspect a knowledge graph and display statistics. | |
| 631 | + | |
| 632 | +```bash | |
| 633 | +planopticon kg inspect [OPTIONS] [PATH] | |
| 634 | +``` | |
| 635 | + | |
| 636 | +| Option | Type | Default | Description | | |
| 637 | +|--------|------|---------|-------------| | |
| 638 | +| `PATH` | PATH | auto-detect | Knowledge graph file | | |
| 639 | + | |
| 640 | +### `planopticon kg classify` | |
| 641 | + | |
| 642 | +Classify and tag entities in a knowledge graph. | |
| 643 | + | |
| 644 | +```bash | |
| 645 | +planopticon kg classify [OPTIONS] | |
| 646 | +``` | |
| 647 | + | |
| 648 | +| Option | Type | Default | Description | | |
| 649 | +|--------|------|---------|-------------| | |
| 650 | +| `--db-path` | PATH | auto-detect | Knowledge graph database | | |
| 651 | +| `-p`, `--provider` | TEXT | `auto` | AI provider for classification | | |
| 652 | + | |
| 653 | +### `planopticon kg from-exchange` | |
| 654 | + | |
| 655 | +Import a knowledge graph from PlanOpticon Exchange Format. | |
| 656 | + | |
| 657 | +```bash | |
| 658 | +planopticon kg from-exchange [OPTIONS] INPUT | |
| 659 | +``` | |
| 660 | + | |
| 661 | +| Option | Type | Default | Description | | |
| 662 | +|--------|------|---------|-------------| | |
| 663 | +| `INPUT` | PATH | *required* | Exchange format JSON file | | |
| 664 | +| `--output` | PATH | `./knowledge-base` | Output knowledge base directory | | |
| 665 | + | |
| 666 | +**Examples:** | |
| 667 | + | |
| 668 | +```bash | |
| 669 | +# Convert JSON knowledge graph to SQLite database format | |
| 670 | +planopticon kg convert --input ./kg.json --output ./kg.db | |
| 671 | + | |
| 672 | +# Merge two knowledge graphs | |
| 673 | +planopticon kg sync --source ./new-kg.db --target ./main-kg.db | |
| 674 | + | |
| 675 | +# Inspect a knowledge graph | |
| 676 | +planopticon kg inspect ./knowledge_graph.db | |
| 677 | + | |
| 678 | +# Classify entities with AI | |
| 679 | +planopticon kg classify --db-path ./kg.db --provider anthropic | |
| 128 | 680 | |
| 129 | -| Option | Description | | |
| 130 | -|--------|-------------| | |
| 131 | -| `-v`, `--verbose` | Enable debug-level logging | | |
| 132 | -| `--version` | Show version and exit | | |
| 133 | -| `--help` | Show help and exit | | |
| 681 | +# Import from exchange format | |
| 682 | +planopticon kg from-exchange ./export.json --output ./kb | |
| 683 | +``` | |
| 134 | 684 |
| --- docs/cli-reference.md | |
| +++ docs/cli-reference.md | |
| @@ -1,6 +1,20 @@ | |
| 1 | # CLI Reference |
| 2 | |
| 3 | ## `planopticon analyze` |
| 4 | |
| 5 | Analyze a single video and extract structured knowledge. |
| 6 | |
| @@ -95,39 +109,575 @@ | |
| 95 | | `-p`, `--provider` | `auto\|openai\|anthropic\|gemini\|ollama` | `auto` | API provider | |
| 96 | | `--vision-model` | TEXT | auto | Override vision model | |
| 97 | | `--chat-model` | TEXT | auto | Override chat model | |
| 98 | |
| 99 | --- |
| 100 | |
| 101 | ## `planopticon auth` |
| 102 | |
| 103 | Authenticate with a cloud storage service for batch processing. |
| 104 | |
| 105 | ```bash |
| 106 | planopticon auth SERVICE |
| 107 | ``` |
| 108 | |
| 109 | | Argument | Values | Description | |
| 110 | |----------|--------|-------------| |
| 111 | | `SERVICE` | `google\|dropbox` | Cloud service to authenticate with | |
| 112 | |
| 113 | **Examples:** |
| 114 | |
| 115 | ```bash |
| 116 | # Authenticate with Google Drive (interactive OAuth2) |
| 117 | planopticon auth google |
| 118 | |
| 119 | # Authenticate with Dropbox |
| 120 | planopticon auth dropbox |
| 121 | ``` |
| 122 | |
| 123 | After authentication, use `planopticon batch --source gdrive` or `--source dropbox` to process cloud videos. |
| 124 | |
| 125 | --- |
| 126 | |
| 127 | ## Global options |
| 128 | |
| 129 | | Option | Description | |
| 130 | |--------|-------------| |
| 131 | | `-v`, `--verbose` | Enable debug-level logging | |
| 132 | | `--version` | Show version and exit | |
| 133 | | `--help` | Show help and exit | |
| 134 |
| --- docs/cli-reference.md | |
| +++ docs/cli-reference.md | |
| @@ -1,6 +1,20 @@ | |
| 1 | # CLI Reference |
| 2 | |
| 3 | ## Global options |
| 4 | |
| 5 | These options are available on all commands. |
| 6 | |
| 7 | | Option | Description | |
| 8 | |--------|-------------| |
| 9 | | `-v`, `--verbose` | Enable debug-level logging | |
| 10 | | `-C`, `--chat` | Enable chat mode (interactive follow-up after command completes) | |
| 11 | | `-I`, `--interactive` | Enable interactive REPL mode | |
| 12 | | `--version` | Show version and exit | |
| 13 | | `--help` | Show help and exit | |
| 14 | |
| 15 | --- |
| 16 | |
| 17 | ## `planopticon analyze` |
| 18 | |
| 19 | Analyze a single video and extract structured knowledge. |
| 20 | |
| @@ -95,39 +109,575 @@ | |
| 109 | | `-p`, `--provider` | `auto\|openai\|anthropic\|gemini\|ollama` | `auto` | API provider | |
| 110 | | `--vision-model` | TEXT | auto | Override vision model | |
| 111 | | `--chat-model` | TEXT | auto | Override chat model | |
| 112 | |
| 113 | --- |
| 114 | |
| 115 | ## `planopticon companion` |
| 116 | |
| 117 | Interactive knowledge base companion. Opens a REPL for conversational exploration of your knowledge base. |
| 118 | |
| 119 | ```bash |
| 120 | planopticon companion [OPTIONS] |
| 121 | ``` |
| 122 | |
| 123 | | Option | Type | Default | Description | |
| 124 | |--------|------|---------|-------------| |
| 125 | | `--kb` | PATH | auto-detect | Path to knowledge base directory | |
| 126 | | `-p`, `--provider` | TEXT | `auto` | AI provider | |
| 127 | | `--chat-model` | TEXT | auto | Override chat model | |
| 128 | |
| 129 | **Examples:** |
| 130 | |
| 131 | ```bash |
| 132 | # Start companion with auto-detected knowledge base |
| 133 | planopticon companion |
| 134 | |
| 135 | # Point to a specific knowledge base |
| 136 | planopticon companion --kb ./my-kb |
| 137 | |
| 138 | # Use a specific provider |
| 139 | planopticon companion --kb ./kb --provider anthropic --chat-model claude-sonnet-4-20250514 |
| 140 | ``` |
| 141 | |
| 142 | --- |
| 143 | |
| 144 | ## `planopticon agent` |
| 145 | |
| 146 | Planning agent with adaptive analysis. Runs an agentic loop that reasons about your knowledge base, plans actions, and executes them. |
| 147 | |
| 148 | ```bash |
| 149 | planopticon agent [OPTIONS] |
| 150 | ``` |
| 151 | |
| 152 | | Option | Type | Default | Description | |
| 153 | |--------|------|---------|-------------| |
| 154 | | `--kb` | PATH | auto-detect | Path to knowledge base directory | |
| 155 | | `-I`, `--interactive` | FLAG | off | Interactive mode (ask before each action) | |
| 156 | | `--export` | PATH | none | Export agent results to a file | |
| 157 | | `-p`, `--provider` | TEXT | `auto` | AI provider | |
| 158 | | `--chat-model` | TEXT | auto | Override chat model | |
| 159 | |
| 160 | **Examples:** |
| 161 | |
| 162 | ```bash |
| 163 | # Run the agent interactively |
| 164 | planopticon agent --kb ./kb --interactive |
| 165 | |
| 166 | # Run agent and export results |
| 167 | planopticon agent --kb ./kb --export ./plan.md |
| 168 | |
| 169 | # Use a specific model |
| 170 | planopticon agent --kb ./kb --provider openai --chat-model gpt-4o |
| 171 | ``` |
| 172 | |
| 173 | --- |
| 174 | |
| 175 | ## `planopticon query` |
| 176 | |
| 177 | Query the knowledge graph directly or with natural language. |
| 178 | |
| 179 | ```bash |
| 180 | planopticon query [OPTIONS] [QUERY] |
| 181 | ``` |
| 182 | |
| 183 | | Option | Type | Default | Description | |
| 184 | |--------|------|---------|-------------| |
| 185 | | `--db-path` | PATH | auto-detect | Path to knowledge graph database | |
| 186 | | `--mode` | `direct\|agentic` | auto | Query mode (direct for structured, agentic for natural language) | |
| 187 | | `--format` | `text\|json\|mermaid` | `text` | Output format | |
| 188 | | `-I`, `--interactive` | FLAG | off | Interactive REPL mode | |
| 189 | |
| 190 | **Examples:** |
| 191 | |
| 192 | ```bash |
| 193 | # Show graph stats |
| 194 | planopticon query stats |
| 195 | |
| 196 | # List entities by type |
| 197 | planopticon query "entities --type technology" |
| 198 | planopticon query "entities --type person" |
| 199 | |
| 200 | # Find neighbors of an entity |
| 201 | planopticon query "neighbors Alice" |
| 202 | |
| 203 | # List relationships |
| 204 | planopticon query "relationships --source Alice" |
| 205 | |
| 206 | # Natural language query (requires API key) |
| 207 | planopticon query "What technologies were discussed?" |
| 208 | |
| 209 | # Output as Mermaid diagram |
| 210 | planopticon query --format mermaid "neighbors ProjectX" |
| 211 | |
| 212 | # Output as JSON |
| 213 | planopticon query --format json stats |
| 214 | |
| 215 | # Interactive REPL |
| 216 | planopticon query -I |
| 217 | ``` |
| 218 | |
| 219 | --- |
| 220 | |
| 221 | ## `planopticon ingest` |
| 222 | |
| 223 | Ingest documents and files into a knowledge graph. |
| 224 | |
| 225 | ```bash |
| 226 | planopticon ingest [OPTIONS] INPUT |
| 227 | ``` |
| 228 | |
| 229 | | Option | Type | Default | Description | |
| 230 | |--------|------|---------|-------------| |
| 231 | | `--output` | PATH | `./knowledge-base` | Output directory for the knowledge base | |
| 232 | | `--db-path` | PATH | auto | Path to existing knowledge graph database to merge into | |
| 233 | | `--recursive` | FLAG | off | Recursively process directories | |
| 234 | | `-p`, `--provider` | TEXT | `auto` | AI provider | |
| 235 | |
| 236 | **Examples:** |
| 237 | |
| 238 | ```bash |
| 239 | # Ingest a single file |
| 240 | planopticon ingest ./meeting-notes.md --output ./kb |
| 241 | |
| 242 | # Ingest a directory recursively |
| 243 | planopticon ingest ./docs/ --output ./kb --recursive |
| 244 | |
| 245 | # Merge into an existing knowledge graph |
| 246 | planopticon ingest ./new-notes/ --db-path ./kb/knowledge_graph.db --recursive |
| 247 | ``` |
| 248 | |
| 249 | --- |
| 250 | |
| 251 | ## `planopticon auth` |
| 252 | |
| 253 | Authenticate with cloud services via OAuth or API keys. |
| 254 | |
| 255 | ```bash |
| 256 | planopticon auth SERVICE [OPTIONS] |
| 257 | ``` |
| 258 | |
| 259 | | Argument | Values | Description | |
| 260 | |----------|--------|-------------| |
| 261 | | `SERVICE` | `google\|dropbox\|zoom\|notion\|github\|microsoft` | Cloud service to authenticate with | |
| 262 | |
| 263 | | Option | Type | Default | Description | |
| 264 | |--------|------|---------|-------------| |
| 265 | | `--logout` | FLAG | off | Remove stored credentials for the service | |
| 266 | |
| 267 | **Examples:** |
| 268 | |
| 269 | ```bash |
| 270 | # Authenticate with Google (Drive, Meet, YouTube, etc.) |
| 271 | planopticon auth google |
| 272 | |
| 273 | # Authenticate with Dropbox |
| 274 | planopticon auth dropbox |
| 275 | |
| 276 | # Authenticate with Zoom (for recording access) |
| 277 | planopticon auth zoom |
| 278 | |
| 279 | # Authenticate with Notion |
| 280 | planopticon auth notion |
| 281 | |
| 282 | # Authenticate with GitHub |
| 283 | planopticon auth github |
| 284 | |
| 285 | # Authenticate with Microsoft 365 (OneDrive, Teams, etc.) |
| 286 | planopticon auth microsoft |
| 287 | |
| 288 | # Log out of a service |
| 289 | planopticon auth google --logout |
| 290 | ``` |
| 291 | |
| 292 | --- |
| 293 | |
| 294 | ## `planopticon gws` |
| 295 | |
| 296 | Google Workspace commands. List, fetch, and ingest content from Google Workspace (Drive, Docs, Sheets, Slides, Meet). |
| 297 | |
| 298 | ### `planopticon gws list` |
| 299 | |
| 300 | List available files and recordings from Google Workspace. |
| 301 | |
| 302 | ```bash |
| 303 | planopticon gws list [OPTIONS] |
| 304 | ``` |
| 305 | |
| 306 | | Option | Type | Default | Description | |
| 307 | |--------|------|---------|-------------| |
| 308 | | `--type` | `drive\|docs\|sheets\|slides\|meet` | all | Filter by content type | |
| 309 | | `--folder-id` | TEXT | none | Google Drive folder ID | |
| 310 | | `--limit` | INT | 50 | Maximum results to return | |
| 311 | |
| 312 | ### `planopticon gws fetch` |
| 313 | |
| 314 | Download content from Google Workspace. |
| 315 | |
| 316 | ```bash |
| 317 | planopticon gws fetch [OPTIONS] RESOURCE_ID |
| 318 | ``` |
| 319 | |
| 320 | | Option | Type | Default | Description | |
| 321 | |--------|------|---------|-------------| |
| 322 | | `--output` | PATH | `./downloads` | Output directory | |
| 323 | | `--format` | TEXT | auto | Export format (pdf, docx, etc.) | |
| 324 | |
| 325 | ### `planopticon gws ingest` |
| 326 | |
| 327 | Ingest Google Workspace content directly into a knowledge graph. |
| 328 | |
| 329 | ```bash |
| 330 | planopticon gws ingest [OPTIONS] |
| 331 | ``` |
| 332 | |
| 333 | | Option | Type | Default | Description | |
| 334 | |--------|------|---------|-------------| |
| 335 | | `--folder-id` | TEXT | none | Google Drive folder ID | |
| 336 | | `--output` | PATH | `./knowledge-base` | Knowledge base output directory | |
| 337 | | `--recursive` | FLAG | off | Recurse into subfolders | |
| 338 | |
| 339 | **Examples:** |
| 340 | |
| 341 | ```bash |
| 342 | # List all Google Workspace files |
| 343 | planopticon gws list |
| 344 | |
| 345 | # List only Google Docs |
| 346 | planopticon gws list --type docs |
| 347 | |
| 348 | # Fetch a specific file |
| 349 | planopticon gws fetch abc123def --output ./downloads |
| 350 | |
| 351 | # Ingest an entire Drive folder into a knowledge base |
| 352 | planopticon gws ingest --folder-id abc123 --output ./kb --recursive |
| 353 | ``` |
| 354 | |
| 355 | --- |
| 356 | |
| 357 | ## `planopticon m365` |
| 358 | |
| 359 | Microsoft 365 commands. List, fetch, and ingest content from Microsoft 365 (OneDrive, SharePoint, Teams, Outlook). |
| 360 | |
| 361 | ### `planopticon m365 list` |
| 362 | |
| 363 | List available files and recordings from Microsoft 365. |
| 364 | |
| 365 | ```bash |
| 366 | planopticon m365 list [OPTIONS] |
| 367 | ``` |
| 368 | |
| 369 | | Option | Type | Default | Description | |
| 370 | |--------|------|---------|-------------| |
| 371 | | `--type` | `onedrive\|sharepoint\|teams\|outlook` | all | Filter by content type | |
| 372 | | `--site` | TEXT | none | SharePoint site name | |
| 373 | | `--limit` | INT | 50 | Maximum results to return | |
| 374 | |
| 375 | ### `planopticon m365 fetch` |
| 376 | |
| 377 | Download content from Microsoft 365. |
| 378 | |
| 379 | ```bash |
| 380 | planopticon m365 fetch [OPTIONS] RESOURCE_ID |
| 381 | ``` |
| 382 | |
| 383 | | Option | Type | Default | Description | |
| 384 | |--------|------|---------|-------------| |
| 385 | | `--output` | PATH | `./downloads` | Output directory | |
| 386 | |
| 387 | ### `planopticon m365 ingest` |
| 388 | |
| 389 | Ingest Microsoft 365 content directly into a knowledge graph. |
| 390 | |
| 391 | ```bash |
| 392 | planopticon m365 ingest [OPTIONS] |
| 393 | ``` |
| 394 | |
| 395 | | Option | Type | Default | Description | |
| 396 | |--------|------|---------|-------------| |
| 397 | | `--site` | TEXT | none | SharePoint site name | |
| 398 | | `--path` | TEXT | `/` | Folder path in OneDrive/SharePoint | |
| 399 | | `--output` | PATH | `./knowledge-base` | Knowledge base output directory | |
| 400 | | `--recursive` | FLAG | off | Recurse into subfolders | |
| 401 | |
| 402 | **Examples:** |
| 403 | |
| 404 | ```bash |
| 405 | # List all Microsoft 365 content |
| 406 | planopticon m365 list |
| 407 | |
| 408 | # List only Teams recordings |
| 409 | planopticon m365 list --type teams |
| 410 | |
| 411 | # Fetch a specific file |
| 412 | planopticon m365 fetch item-id-123 --output ./downloads |
| 413 | |
| 414 | # Ingest SharePoint content |
| 415 | planopticon m365 ingest --site "Engineering" --path "/Shared Documents" --output ./kb --recursive |
| 416 | ``` |
| 417 | |
| 418 | --- |
| 419 | |
| 420 | ## `planopticon recordings` |
| 421 | |
| 422 | List meeting recordings from video conferencing platforms. |
| 423 | |
| 424 | ### `planopticon recordings zoom-list` |
| 425 | |
| 426 | List Zoom cloud recordings. |
| 427 | |
| 428 | ```bash |
| 429 | planopticon recordings zoom-list [OPTIONS] |
| 430 | ``` |
| 431 | |
| 432 | | Option | Type | Default | Description | |
| 433 | |--------|------|---------|-------------| |
| 434 | | `--from` | DATE | 30 days ago | Start date (YYYY-MM-DD) | |
| 435 | | `--to` | DATE | today | End date (YYYY-MM-DD) | |
| 436 | | `--limit` | INT | 50 | Maximum results | |
| 437 | |
| 438 | ### `planopticon recordings teams-list` |
| 439 | |
| 440 | List Microsoft Teams meeting recordings. |
| 441 | |
| 442 | ```bash |
| 443 | planopticon recordings teams-list [OPTIONS] |
| 444 | ``` |
| 445 | |
| 446 | | Option | Type | Default | Description | |
| 447 | |--------|------|---------|-------------| |
| 448 | | `--from` | DATE | 30 days ago | Start date (YYYY-MM-DD) | |
| 449 | | `--to` | DATE | today | End date (YYYY-MM-DD) | |
| 450 | | `--limit` | INT | 50 | Maximum results | |
| 451 | |
| 452 | ### `planopticon recordings meet-list` |
| 453 | |
| 454 | List Google Meet recordings. |
| 455 | |
| 456 | ```bash |
| 457 | planopticon recordings meet-list [OPTIONS] |
| 458 | ``` |
| 459 | |
| 460 | | Option | Type | Default | Description | |
| 461 | |--------|------|---------|-------------| |
| 462 | | `--from` | DATE | 30 days ago | Start date (YYYY-MM-DD) | |
| 463 | | `--to` | DATE | today | End date (YYYY-MM-DD) | |
| 464 | | `--limit` | INT | 50 | Maximum results | |
| 465 | |
| 466 | **Examples:** |
| 467 | |
| 468 | ```bash |
| 469 | # List recent Zoom recordings |
| 470 | planopticon recordings zoom-list |
| 471 | |
| 472 | # List Teams recordings from a specific date range |
| 473 | planopticon recordings teams-list --from 2026-01-01 --to 2026-02-01 |
| 474 | |
| 475 | # List Google Meet recordings |
| 476 | planopticon recordings meet-list --limit 10 |
| 477 | ``` |
| 478 | |
| 479 | --- |
| 480 | |
| 481 | ## `planopticon export` |
| 482 | |
| 483 | Export knowledge base content to various formats. |
| 484 | |
| 485 | ### `planopticon export markdown` |
| 486 | |
| 487 | Export knowledge base as Markdown files. |
| 488 | |
| 489 | ```bash |
| 490 | planopticon export markdown [OPTIONS] |
| 491 | ``` |
| 492 | |
| 493 | | Option | Type | Default | Description | |
| 494 | |--------|------|---------|-------------| |
| 495 | | `--input` | PATH | auto-detect | Knowledge base path | |
| 496 | | `--output` | PATH | `./export` | Output directory | |
| 497 | |
| 498 | ### `planopticon export obsidian` |
| 499 | |
| 500 | Export knowledge base as an Obsidian vault with wikilinks and graph metadata. |
| 501 | |
| 502 | ```bash |
| 503 | planopticon export obsidian [OPTIONS] |
| 504 | ``` |
| 505 | |
| 506 | | Option | Type | Default | Description | |
| 507 | |--------|------|---------|-------------| |
| 508 | | `--input` | PATH | auto-detect | Knowledge base path | |
| 509 | | `--output` | PATH | `./obsidian-vault` | Output vault directory | |
| 510 | |
| 511 | ### `planopticon export notion` |
| 512 | |
| 513 | Export knowledge base to Notion. |
| 514 | |
| 515 | ```bash |
| 516 | planopticon export notion [OPTIONS] |
| 517 | ``` |
| 518 | |
| 519 | | Option | Type | Default | Description | |
| 520 | |--------|------|---------|-------------| |
| 521 | | `--input` | PATH | auto-detect | Knowledge base path | |
| 522 | | `--parent-page` | TEXT | none | Notion parent page ID | |
| 523 | |
| 524 | ### `planopticon export exchange` |
| 525 | |
| 526 | Export knowledge base as PlanOpticon Exchange Format (JSON). |
| 527 | |
| 528 | ```bash |
| 529 | planopticon export exchange [OPTIONS] |
| 530 | ``` |
| 531 | |
| 532 | | Option | Type | Default | Description | |
| 533 | |--------|------|---------|-------------| |
| 534 | | `--input` | PATH | auto-detect | Knowledge base path | |
| 535 | | `--output` | PATH | `./exchange.json` | Output file path | |
| 536 | |
| 537 | **Examples:** |
| 538 | |
| 539 | ```bash |
| 540 | # Export to Markdown |
| 541 | planopticon export markdown --input ./kb --output ./docs |
| 542 | |
| 543 | # Export to Obsidian vault |
| 544 | planopticon export obsidian --input ./kb --output ~/Obsidian/PlanOpticon |
| 545 | |
| 546 | # Export to Notion |
| 547 | planopticon export notion --input ./kb --parent-page abc123 |
| 548 | |
| 549 | # Export as exchange format for interoperability |
| 550 | planopticon export exchange --input ./kb --output ./export.json |
| 551 | ``` |
| 552 | |
| 553 | --- |
| 554 | |
| 555 | ## `planopticon wiki` |
| 556 | |
| 557 | Generate and publish wiki documentation from your knowledge base. |
| 558 | |
| 559 | ### `planopticon wiki generate` |
| 560 | |
| 561 | Generate a static wiki site from the knowledge base. |
| 562 | |
| 563 | ```bash |
| 564 | planopticon wiki generate [OPTIONS] |
| 565 | ``` |
| 566 | |
| 567 | | Option | Type | Default | Description | |
| 568 | |--------|------|---------|-------------| |
| 569 | | `--input` | PATH | auto-detect | Knowledge base path | |
| 570 | | `--output` | PATH | `./wiki` | Output directory | |
| 571 | |
| 572 | ### `planopticon wiki push` |
| 573 | |
| 574 | Push a generated wiki to a remote target (e.g., GitHub Wiki, Confluence). |
| 575 | |
| 576 | ```bash |
| 577 | planopticon wiki push [OPTIONS] |
| 578 | ``` |
| 579 | |
| 580 | | Option | Type | Default | Description | |
| 581 | |--------|------|---------|-------------| |
| 582 | | `--input` | PATH | `./wiki` | Wiki directory to push | |
| 583 | | `--target` | TEXT | *required* | Push target (e.g., `github://org/repo`, `confluence://space`) | |
| 584 | |
| 585 | **Examples:** |
| 586 | |
| 587 | ```bash |
| 588 | # Generate a wiki from the knowledge base |
| 589 | planopticon wiki generate --input ./kb --output ./wiki |
| 590 | |
| 591 | # Push wiki to GitHub |
| 592 | planopticon wiki push --input ./wiki --target "github://ConflictHQ/project-wiki" |
| 593 | ``` |
| 594 | |
| 595 | --- |
| 596 | |
| 597 | ## `planopticon kg` |
| 598 | |
| 599 | Knowledge graph management commands. |
| 600 | |
| 601 | ### `planopticon kg convert` |
| 602 | |
| 603 | Convert a knowledge graph between formats. |
| 604 | |
| 605 | ```bash |
| 606 | planopticon kg convert [OPTIONS] |
| 607 | ``` |
| 608 | |
| 609 | | Option | Type | Default | Description | |
| 610 | |--------|------|---------|-------------| |
| 611 | | `--input` | PATH | *required* | Input knowledge graph file | |
| 612 | | `--output` | PATH | *required* | Output file path | |
| 613 | | `--format` | `json\|db\|graphml\|csv` | auto (from extension) | Target format | |
| 614 | |
| 615 | ### `planopticon kg sync` |
| 616 | |
| 617 | Synchronize two knowledge graphs (merge new data). |
| 618 | |
| 619 | ```bash |
| 620 | planopticon kg sync [OPTIONS] |
| 621 | ``` |
| 622 | |
| 623 | | Option | Type | Default | Description | |
| 624 | |--------|------|---------|-------------| |
| 625 | | `--source` | PATH | *required* | Source knowledge graph | |
| 626 | | `--target` | PATH | *required* | Target knowledge graph to merge into | |
| 627 | |
| 628 | ### `planopticon kg inspect` |
| 629 | |
| 630 | Inspect a knowledge graph and display statistics. |
| 631 | |
| 632 | ```bash |
| 633 | planopticon kg inspect [OPTIONS] [PATH] |
| 634 | ``` |
| 635 | |
| 636 | | Argument | Type | Default | Description | |
| 637 | |----------|------|---------|-------------| |
| 638 | | `PATH` | PATH | auto-detect | Knowledge graph file | |
| 639 | |
| 640 | ### `planopticon kg classify` |
| 641 | |
| 642 | Classify and tag entities in a knowledge graph. |
| 643 | |
| 644 | ```bash |
| 645 | planopticon kg classify [OPTIONS] |
| 646 | ``` |
| 647 | |
| 648 | | Option | Type | Default | Description | |
| 649 | |--------|------|---------|-------------| |
| 650 | | `--db-path` | PATH | auto-detect | Knowledge graph database | |
| 651 | | `-p`, `--provider` | TEXT | `auto` | AI provider for classification | |
| 652 | |
| 653 | ### `planopticon kg from-exchange` |
| 654 | |
| 655 | Import a knowledge graph from PlanOpticon Exchange Format. |
| 656 | |
| 657 | ```bash |
| 658 | planopticon kg from-exchange [OPTIONS] INPUT |
| 659 | ``` |
| 660 | |
| 661 | | Option | Type | Default | Description | |
| 662 | |--------|------|---------|-------------| |
| 663 | | `INPUT` | PATH | *required* | Exchange format JSON file | |
| 664 | | `--output` | PATH | `./knowledge-base` | Output knowledge base directory | |
| 665 | |
| 666 | **Examples:** |
| 667 | |
| 668 | ```bash |
| 669 | # Convert JSON knowledge graph to SQLite database format |
| 670 | planopticon kg convert --input ./kg.json --output ./kg.db |
| 671 | |
| 672 | # Merge two knowledge graphs |
| 673 | planopticon kg sync --source ./new-kg.db --target ./main-kg.db |
| 674 | |
| 675 | # Inspect a knowledge graph |
| 676 | planopticon kg inspect ./knowledge_graph.db |
| 677 | |
| 678 | # Classify entities with AI |
| 679 | planopticon kg classify --db-path ./kg.db --provider anthropic |
| 680 | |
| 681 | # Import from exchange format |
| 682 | planopticon kg from-exchange ./export.json --output ./kb |
| 683 | ``` |
| 684 |
| --- docs/getting-started/configuration.md | ||
| +++ docs/getting-started/configuration.md | ||
| @@ -1,27 +1,70 @@ | ||
| 1 | 1 | # Configuration |
| 2 | 2 | |
| 3 | 3 | ## Environment variables |
| 4 | + | |
| 5 | +### AI providers | |
| 4 | 6 | |
| 5 | 7 | | Variable | Description | |
| 6 | 8 | |----------|-------------| |
| 7 | 9 | | `OPENAI_API_KEY` | OpenAI API key | |
| 8 | 10 | | `ANTHROPIC_API_KEY` | Anthropic API key | |
| 9 | 11 | | `GEMINI_API_KEY` | Google Gemini API key | |
| 12 | +| `AZURE_OPENAI_API_KEY` | Azure OpenAI API key | | |
| 13 | +| `AZURE_OPENAI_ENDPOINT` | Azure OpenAI endpoint URL | | |
| 14 | +| `TOGETHER_API_KEY` | Together AI API key | | |
| 15 | +| `FIREWORKS_API_KEY` | Fireworks AI API key | | |
| 16 | +| `CEREBRAS_API_KEY` | Cerebras API key | | |
| 17 | +| `XAI_API_KEY` | xAI (Grok) API key | | |
| 10 | 18 | | `OLLAMA_HOST` | Ollama server URL (default: `http://localhost:11434`) | |
| 11 | -| `GOOGLE_APPLICATION_CREDENTIALS` | Path to Google service account JSON (for Drive) | | |
| 19 | + | |
| 20 | +### Cloud services | |
| 21 | + | |
| 22 | +| Variable | Description | | |
| 23 | +|----------|-------------| | |
| 24 | +| `GOOGLE_APPLICATION_CREDENTIALS` | Path to Google service account JSON (for server-side Drive access) | | |
| 25 | +| `ZOOM_CLIENT_ID` | Zoom OAuth app client ID | | |
| 26 | +| `ZOOM_CLIENT_SECRET` | Zoom OAuth app client secret | | |
| 27 | +| `NOTION_API_KEY` | Notion integration token | | |
| 28 | +| `GITHUB_TOKEN` | GitHub personal access token | | |
| 29 | +| `MICROSOFT_CLIENT_ID` | Azure AD app client ID (for Microsoft 365) | | |
| 30 | +| `MICROSOFT_CLIENT_SECRET` | Azure AD app client secret | | |
| 31 | + | |
| 32 | +### General | |
| 33 | + | |
| 34 | +| Variable | Description | | |
| 35 | +|----------|-------------| | |
| 12 | 36 | | `CACHE_DIR` | Directory for API response caching | |
| 13 | 37 | |
| 38 | +## Authentication | |
| 39 | + | |
| 40 | +Most cloud services use OAuth via the `planopticon auth` command. Run it once per service to store credentials locally: | |
| 41 | + | |
| 42 | +```bash | |
| 43 | +planopticon auth google # Google Drive, Docs, Meet, YouTube | |
| 44 | +planopticon auth dropbox # Dropbox | |
| 45 | +planopticon auth zoom # Zoom recordings | |
| 46 | +planopticon auth notion # Notion pages | |
| 47 | +planopticon auth github # GitHub repos and wikis | |
| 48 | +planopticon auth microsoft # OneDrive, SharePoint, Teams | |
| 49 | +``` | |
| 50 | + | |
| 51 | +Credentials are stored in `~/.config/planopticon/`. Use `planopticon auth SERVICE --logout` to remove them. | |
| 52 | + | |
| 53 | +For Zoom and Microsoft 365, you also need to set the client ID and secret environment variables before running `planopticon auth`. | |
| 54 | + | |
| 14 | 55 | ## Provider routing |
| 15 | 56 | |
| 16 | -PlanOpticon auto-discovers available models and routes each task to the best option: | |
| 57 | +PlanOpticon auto-discovers available models and routes each task to the cheapest capable option: | |
| 17 | 58 | |
| 18 | 59 | | Task | Default preference | |
| 19 | 60 | |------|--------------------| |
| 20 | -| Vision (diagrams) | Gemini Flash > GPT-4o > Claude Sonnet > Ollama | | |
| 21 | -| Chat (analysis) | Claude Sonnet > GPT-4o > Gemini Flash > Ollama | | |
| 61 | +| Vision (diagrams) | Gemini Flash > GPT-4o-mini > Claude Haiku > Ollama | | |
| 62 | +| Chat (analysis) | Claude Haiku > GPT-4o-mini > Gemini Flash > Ollama | | |
| 22 | 63 | | Transcription | Local Whisper > Whisper-1 > Gemini Flash | |
| 64 | + | |
| 65 | +Default models prioritize cost efficiency. For complex or high-stakes analysis, override with more capable models using `--chat-model` or `--vision-model`. | |
| 23 | 66 | |
| 24 | 67 | If no cloud API keys are configured, PlanOpticon automatically falls back to Ollama when a local server is running. This enables fully offline operation when paired with local Whisper for transcription. |
| 25 | 68 | |
| 26 | 69 | Override with `--provider`, `--vision-model`, or `--chat-model` flags. |
| 27 | 70 | |
| 28 | 71 |
| --- docs/getting-started/configuration.md | |
| +++ docs/getting-started/configuration.md | |
| @@ -1,27 +1,70 @@ | |
| 1 | # Configuration |
| 2 | |
| 3 | ## Environment variables |
| 4 | |
| 5 | | Variable | Description | |
| 6 | |----------|-------------| |
| 7 | | `OPENAI_API_KEY` | OpenAI API key | |
| 8 | | `ANTHROPIC_API_KEY` | Anthropic API key | |
| 9 | | `GEMINI_API_KEY` | Google Gemini API key | |
| 10 | | `OLLAMA_HOST` | Ollama server URL (default: `http://localhost:11434`) | |
| 11 | | `GOOGLE_APPLICATION_CREDENTIALS` | Path to Google service account JSON (for Drive) | |
| 12 | | `CACHE_DIR` | Directory for API response caching | |
| 13 | |
| 14 | ## Provider routing |
| 15 | |
| 16 | PlanOpticon auto-discovers available models and routes each task to the best option: |
| 17 | |
| 18 | | Task | Default preference | |
| 19 | |------|--------------------| |
| 20 | | Vision (diagrams) | Gemini Flash > GPT-4o > Claude Sonnet > Ollama | |
| 21 | | Chat (analysis) | Claude Sonnet > GPT-4o > Gemini Flash > Ollama | |
| 22 | | Transcription | Local Whisper > Whisper-1 > Gemini Flash | |
| 23 | |
| 24 | If no cloud API keys are configured, PlanOpticon automatically falls back to Ollama when a local server is running. This enables fully offline operation when paired with local Whisper for transcription. |
| 25 | |
| 26 | Override with `--provider`, `--vision-model`, or `--chat-model` flags. |
| 27 | |
| 28 |
| --- docs/getting-started/configuration.md | |
| +++ docs/getting-started/configuration.md | |
| @@ -1,27 +1,70 @@ | |
| 1 | # Configuration |
| 2 | |
| 3 | ## Environment variables |
| 4 | |
| 5 | ### AI providers |
| 6 | |
| 7 | | Variable | Description | |
| 8 | |----------|-------------| |
| 9 | | `OPENAI_API_KEY` | OpenAI API key | |
| 10 | | `ANTHROPIC_API_KEY` | Anthropic API key | |
| 11 | | `GEMINI_API_KEY` | Google Gemini API key | |
| 12 | | `AZURE_OPENAI_API_KEY` | Azure OpenAI API key | |
| 13 | | `AZURE_OPENAI_ENDPOINT` | Azure OpenAI endpoint URL | |
| 14 | | `TOGETHER_API_KEY` | Together AI API key | |
| 15 | | `FIREWORKS_API_KEY` | Fireworks AI API key | |
| 16 | | `CEREBRAS_API_KEY` | Cerebras API key | |
| 17 | | `XAI_API_KEY` | xAI (Grok) API key | |
| 18 | | `OLLAMA_HOST` | Ollama server URL (default: `http://localhost:11434`) | |
| 19 | |
| 20 | ### Cloud services |
| 21 | |
| 22 | | Variable | Description | |
| 23 | |----------|-------------| |
| 24 | | `GOOGLE_APPLICATION_CREDENTIALS` | Path to Google service account JSON (for server-side Drive access) | |
| 25 | | `ZOOM_CLIENT_ID` | Zoom OAuth app client ID | |
| 26 | | `ZOOM_CLIENT_SECRET` | Zoom OAuth app client secret | |
| 27 | | `NOTION_API_KEY` | Notion integration token | |
| 28 | | `GITHUB_TOKEN` | GitHub personal access token | |
| 29 | | `MICROSOFT_CLIENT_ID` | Azure AD app client ID (for Microsoft 365) | |
| 30 | | `MICROSOFT_CLIENT_SECRET` | Azure AD app client secret | |
| 31 | |
| 32 | ### General |
| 33 | |
| 34 | | Variable | Description | |
| 35 | |----------|-------------| |
| 36 | | `CACHE_DIR` | Directory for API response caching | |
| 37 | |
| 38 | ## Authentication |
| 39 | |
| 40 | Most cloud services use OAuth via the `planopticon auth` command. Run it once per service to store credentials locally: |
| 41 | |
| 42 | ```bash |
| 43 | planopticon auth google # Google Drive, Docs, Meet, YouTube |
| 44 | planopticon auth dropbox # Dropbox |
| 45 | planopticon auth zoom # Zoom recordings |
| 46 | planopticon auth notion # Notion pages |
| 47 | planopticon auth github # GitHub repos and wikis |
| 48 | planopticon auth microsoft # OneDrive, SharePoint, Teams |
| 49 | ``` |
| 50 | |
| 51 | Credentials are stored in `~/.config/planopticon/`. Use `planopticon auth SERVICE --logout` to remove them. |
| 52 | |
| 53 | For Zoom and Microsoft 365, you also need to set the client ID and secret environment variables before running `planopticon auth`. |
| 54 | |
| 55 | ## Provider routing |
| 56 | |
| 57 | PlanOpticon auto-discovers available models and routes each task to the cheapest capable option: |
| 58 | |
| 59 | | Task | Default preference | |
| 60 | |------|--------------------| |
| 61 | | Vision (diagrams) | Gemini Flash > GPT-4o-mini > Claude Haiku > Ollama | |
| 62 | | Chat (analysis) | Claude Haiku > GPT-4o-mini > Gemini Flash > Ollama | |
| 63 | | Transcription | Local Whisper > Whisper-1 > Gemini Flash | |
| 64 | |
| 65 | Default models prioritize cost efficiency. For complex or high-stakes analysis, override with more capable models using `--chat-model` or `--vision-model`. |
| 66 | |
| 67 | If no cloud API keys are configured, PlanOpticon automatically falls back to Ollama when a local server is running. This enables fully offline operation when paired with local Whisper for transcription. |
| 68 | |
| 69 | Override with `--provider`, `--vision-model`, or `--chat-model` flags. |
| 70 | |
| 71 |
| --- docs/getting-started/quickstart.md | ||
| +++ docs/getting-started/quickstart.md | ||
| @@ -63,10 +63,90 @@ | ||
| 63 | 63 | Batch mode produces per-video outputs plus: |
| 64 | 64 | |
| 65 | 65 | - Merged knowledge graph across all videos |
| 66 | 66 | - Batch summary with aggregated action items |
| 67 | 67 | - Cross-referenced entities |
| 68 | + | |
| 69 | +## Ingest documents | |
| 70 | + | |
| 71 | +Build a knowledge graph from documents, notes, or any text content: | |
| 72 | + | |
| 73 | +```bash | |
| 74 | +# Ingest a single file | |
| 75 | +planopticon ingest ./meeting-notes.md --output ./kb | |
| 76 | + | |
| 77 | +# Ingest a directory recursively | |
| 78 | +planopticon ingest ./docs/ --output ./kb --recursive | |
| 79 | + | |
| 80 | +# Ingest from a URL | |
| 81 | +planopticon ingest "https://www.youtube.com/watch?v=example" --output ./kb | |
| 82 | +``` | |
| 83 | + | |
| 84 | +## Companion REPL | |
| 85 | + | |
| 86 | +Chat with your knowledge base interactively: | |
| 87 | + | |
| 88 | +```bash | |
| 89 | +# Start the companion | |
| 90 | +planopticon companion --kb ./kb | |
| 91 | + | |
| 92 | +# Use a specific provider | |
| 93 | +planopticon companion --kb ./kb --provider anthropic | |
| 94 | +``` | |
| 95 | + | |
| 96 | +The companion understands your knowledge graph and can answer questions, find connections, and summarize topics conversationally. | |
| 97 | + | |
| 98 | +## Planning agent | |
| 99 | + | |
| 100 | +Run the planning agent for adaptive, goal-directed analysis: | |
| 101 | + | |
| 102 | +```bash | |
| 103 | +# Interactive mode — the agent asks before each action | |
| 104 | +planopticon agent --kb ./kb --interactive | |
| 105 | + | |
| 106 | +# Non-interactive with export | |
| 107 | +planopticon agent --kb ./kb --export ./plan.md | |
| 108 | +``` | |
| 109 | + | |
| 110 | +## Query the knowledge graph | |
| 111 | + | |
| 112 | +Query your knowledge graph directly without an AI provider: | |
| 113 | + | |
| 114 | +```bash | |
| 115 | +# Show graph stats (entity/relationship counts) | |
| 116 | +planopticon query stats | |
| 117 | + | |
| 118 | +# List entities by type | |
| 119 | +planopticon query "entities --type technology" | |
| 120 | + | |
| 121 | +# Find neighbors of an entity | |
| 122 | +planopticon query "neighbors Alice" | |
| 123 | + | |
| 124 | +# Natural language query (requires API key) | |
| 125 | +planopticon query "What technologies were discussed?" | |
| 126 | + | |
| 127 | +# Interactive REPL | |
| 128 | +planopticon query -I | |
| 129 | +``` | |
| 130 | + | |
| 131 | +## Export | |
| 132 | + | |
| 133 | +Export your knowledge base to various formats: | |
| 134 | + | |
| 135 | +```bash | |
| 136 | +# Export to Markdown files | |
| 137 | +planopticon export markdown --input ./kb --output ./docs | |
| 138 | + | |
| 139 | +# Export to an Obsidian vault | |
| 140 | +planopticon export obsidian --input ./kb --output ~/Obsidian/PlanOpticon | |
| 141 | + | |
| 142 | +# Export to Notion | |
| 143 | +planopticon export notion --input ./kb --parent-page abc123 | |
| 144 | + | |
| 145 | +# Export as exchange format (portable JSON) | |
| 146 | +planopticon export exchange --input ./kb --output ./export.json | |
| 147 | +``` | |
| 68 | 148 | |
| 69 | 149 | ## Discover available models |
| 70 | 150 | |
| 71 | 151 | ```bash |
| 72 | 152 | planopticon list-models |
| 73 | 153 |
| --- docs/getting-started/quickstart.md | |
| +++ docs/getting-started/quickstart.md | |
| @@ -63,10 +63,90 @@ | |
| 63 | Batch mode produces per-video outputs plus: |
| 64 | |
| 65 | - Merged knowledge graph across all videos |
| 66 | - Batch summary with aggregated action items |
| 67 | - Cross-referenced entities |
| 68 | |
| 69 | ## Discover available models |
| 70 | |
| 71 | ```bash |
| 72 | planopticon list-models |
| 73 |
| --- docs/getting-started/quickstart.md | |
| +++ docs/getting-started/quickstart.md | |
| @@ -63,10 +63,90 @@ | |
| 63 | Batch mode produces per-video outputs plus: |
| 64 | |
| 65 | - Merged knowledge graph across all videos |
| 66 | - Batch summary with aggregated action items |
| 67 | - Cross-referenced entities |
| 68 | |
| 69 | ## Ingest documents |
| 70 | |
| 71 | Build a knowledge graph from documents, notes, or any text content: |
| 72 | |
| 73 | ```bash |
| 74 | # Ingest a single file |
| 75 | planopticon ingest ./meeting-notes.md --output ./kb |
| 76 | |
| 77 | # Ingest a directory recursively |
| 78 | planopticon ingest ./docs/ --output ./kb --recursive |
| 79 | |
| 80 | # Ingest from a URL |
| 81 | planopticon ingest "https://www.youtube.com/watch?v=example" --output ./kb |
| 82 | ``` |
| 83 | |
| 84 | ## Companion REPL |
| 85 | |
| 86 | Chat with your knowledge base interactively: |
| 87 | |
| 88 | ```bash |
| 89 | # Start the companion |
| 90 | planopticon companion --kb ./kb |
| 91 | |
| 92 | # Use a specific provider |
| 93 | planopticon companion --kb ./kb --provider anthropic |
| 94 | ``` |
| 95 | |
| 96 | The companion understands your knowledge graph and can answer questions, find connections, and summarize topics conversationally. |
| 97 | |
| 98 | ## Planning agent |
| 99 | |
| 100 | Run the planning agent for adaptive, goal-directed analysis: |
| 101 | |
| 102 | ```bash |
| 103 | # Interactive mode — the agent asks before each action |
| 104 | planopticon agent --kb ./kb --interactive |
| 105 | |
| 106 | # Non-interactive with export |
| 107 | planopticon agent --kb ./kb --export ./plan.md |
| 108 | ``` |
| 109 | |
| 110 | ## Query the knowledge graph |
| 111 | |
| 112 | Query your knowledge graph directly without an AI provider: |
| 113 | |
| 114 | ```bash |
| 115 | # Show graph stats (entity/relationship counts) |
| 116 | planopticon query stats |
| 117 | |
| 118 | # List entities by type |
| 119 | planopticon query "entities --type technology" |
| 120 | |
| 121 | # Find neighbors of an entity |
| 122 | planopticon query "neighbors Alice" |
| 123 | |
| 124 | # Natural language query (requires API key) |
| 125 | planopticon query "What technologies were discussed?" |
| 126 | |
| 127 | # Interactive REPL |
| 128 | planopticon query -I |
| 129 | ``` |
| 130 | |
| 131 | ## Export |
| 132 | |
| 133 | Export your knowledge base to various formats: |
| 134 | |
| 135 | ```bash |
| 136 | # Export to Markdown files |
| 137 | planopticon export markdown --input ./kb --output ./docs |
| 138 | |
| 139 | # Export to an Obsidian vault |
| 140 | planopticon export obsidian --input ./kb --output ~/Obsidian/PlanOpticon |
| 141 | |
| 142 | # Export to Notion |
| 143 | planopticon export notion --input ./kb --parent-page abc123 |
| 144 | |
| 145 | # Export as exchange format (portable JSON) |
| 146 | planopticon export exchange --input ./kb --output ./export.json |
| 147 | ``` |
| 148 | |
| 149 | ## Discover available models |
| 150 | |
| 151 | ```bash |
| 152 | planopticon list-models |
| 153 |
+174
-1
| --- docs/guide/cloud-sources.md | ||
| +++ docs/guide/cloud-sources.md | ||
| @@ -1,8 +1,8 @@ | ||
| 1 | 1 | # Cloud Sources |
| 2 | 2 | |
| 3 | -PlanOpticon can fetch videos directly from cloud storage services. | |
| 3 | +PlanOpticon connects to 20+ source platforms for fetching videos, documents, and notes. | |
| 4 | 4 | |
| 5 | 5 | ## Google Drive |
| 6 | 6 | |
| 7 | 7 | ### Service account auth |
| 8 | 8 | |
| @@ -25,10 +25,75 @@ | ||
| 25 | 25 | ### Install |
| 26 | 26 | |
| 27 | 27 | ```bash |
| 28 | 28 | pip install planopticon[gdrive] |
| 29 | 29 | ``` |
| 30 | + | |
| 31 | +## Google Workspace (gws) | |
| 32 | + | |
| 33 | +Full Google Workspace integration beyond just Drive. Access Docs, Sheets, Slides, and Meet recordings through the `gws` CLI group. | |
| 34 | + | |
| 35 | +### Setup | |
| 36 | + | |
| 37 | +```bash | |
| 38 | +planopticon auth google | |
| 39 | +``` | |
| 40 | + | |
| 41 | +A single Google OAuth session covers Drive, Docs, Sheets, Slides, and Meet. | |
| 42 | + | |
| 43 | +### Usage | |
| 44 | + | |
| 45 | +```bash | |
| 46 | +# List all Google Workspace content | |
| 47 | +planopticon gws list | |
| 48 | + | |
| 49 | +# List only Google Docs | |
| 50 | +planopticon gws list --type docs | |
| 51 | + | |
| 52 | +# List Meet recordings | |
| 53 | +planopticon gws list --type meet | |
| 54 | + | |
| 55 | +# Fetch a specific file | |
| 56 | +planopticon gws fetch abc123def --output ./downloads | |
| 57 | + | |
| 58 | +# Ingest an entire Drive folder into a knowledge base | |
| 59 | +planopticon gws ingest --folder-id abc123 --output ./kb --recursive | |
| 60 | +``` | |
| 61 | + | |
| 62 | +## Microsoft 365 (m365) | |
| 63 | + | |
| 64 | +Access OneDrive, SharePoint, Teams recordings, and Outlook content. | |
| 65 | + | |
| 66 | +### Setup | |
| 67 | + | |
| 68 | +```bash | |
| 69 | +# Set your Azure AD app credentials | |
| 70 | +export MICROSOFT_CLIENT_ID="your-client-id" | |
| 71 | +export MICROSOFT_CLIENT_SECRET="your-client-secret" | |
| 72 | + | |
| 73 | +# Authenticate | |
| 74 | +planopticon auth microsoft | |
| 75 | +``` | |
| 76 | + | |
| 77 | +### Usage | |
| 78 | + | |
| 79 | +```bash | |
| 80 | +# List all Microsoft 365 content | |
| 81 | +planopticon m365 list | |
| 82 | + | |
| 83 | +# List only Teams recordings | |
| 84 | +planopticon m365 list --type teams | |
| 85 | + | |
| 86 | +# List SharePoint files from a specific site | |
| 87 | +planopticon m365 list --type sharepoint --site "Engineering" | |
| 88 | + | |
| 89 | +# Fetch a specific file | |
| 90 | +planopticon m365 fetch item-id-123 --output ./downloads | |
| 91 | + | |
| 92 | +# Ingest SharePoint content into a knowledge base | |
| 93 | +planopticon m365 ingest --site "Engineering" --path "/Shared Documents" --output ./kb --recursive | |
| 94 | +``` | |
| 30 | 95 | |
| 31 | 96 | ## Dropbox |
| 32 | 97 | |
| 33 | 98 | ### OAuth2 auth |
| 34 | 99 | |
| @@ -40,11 +105,119 @@ | ||
| 40 | 105 | ### Install |
| 41 | 106 | |
| 42 | 107 | ```bash |
| 43 | 108 | pip install planopticon[dropbox] |
| 44 | 109 | ``` |
| 110 | + | |
| 111 | +## YouTube | |
| 112 | + | |
| 113 | +PlanOpticon can ingest YouTube videos by URL. Audio is extracted and transcribed, and any visible content (slides, diagrams) is captured from frames. | |
| 114 | + | |
| 115 | +```bash | |
| 116 | +# Ingest a YouTube video | |
| 117 | +planopticon ingest "https://www.youtube.com/watch?v=example" --output ./kb | |
| 118 | + | |
| 119 | +# Ingest a playlist | |
| 120 | +planopticon ingest "https://www.youtube.com/playlist?list=example" --output ./kb | |
| 121 | +``` | |
| 122 | + | |
| 123 | +YouTube ingestion uses `yt-dlp` under the hood. Install it separately if not already available: | |
| 124 | + | |
| 125 | +```bash | |
| 126 | +pip install yt-dlp | |
| 127 | +``` | |
| 128 | + | |
| 129 | +## Meeting recordings | |
| 130 | + | |
| 131 | +Access cloud recordings from Zoom, Microsoft Teams, and Google Meet. | |
| 132 | + | |
| 133 | +### Zoom | |
| 134 | + | |
| 135 | +```bash | |
| 136 | +# Set credentials and authenticate | |
| 137 | +export ZOOM_CLIENT_ID="your-client-id" | |
| 138 | +export ZOOM_CLIENT_SECRET="your-client-secret" | |
| 139 | +planopticon auth zoom | |
| 140 | + | |
| 141 | +# List recent Zoom recordings | |
| 142 | +planopticon recordings zoom-list | |
| 143 | + | |
| 144 | +# List recordings from a date range | |
| 145 | +planopticon recordings zoom-list --from 2026-01-01 --to 2026-02-01 | |
| 146 | +``` | |
| 147 | + | |
| 148 | +### Microsoft Teams | |
| 149 | + | |
| 150 | +```bash | |
| 151 | +# Authenticate with Microsoft (covers Teams) | |
| 152 | +planopticon auth microsoft | |
| 153 | + | |
| 154 | +# List Teams recordings | |
| 155 | +planopticon recordings teams-list | |
| 156 | + | |
| 157 | +# List recordings from a date range | |
| 158 | +planopticon recordings teams-list --from 2026-01-01 --to 2026-02-01 | |
| 159 | +``` | |
| 160 | + | |
| 161 | +### Google Meet | |
| 162 | + | |
| 163 | +```bash | |
| 164 | +# Authenticate with Google (covers Meet) | |
| 165 | +planopticon auth google | |
| 166 | + | |
| 167 | +# List Meet recordings | |
| 168 | +planopticon recordings meet-list --limit 10 | |
| 169 | +``` | |
| 170 | + | |
| 171 | +## Notes sources | |
| 172 | + | |
| 173 | +PlanOpticon can ingest notes and documents from several note-taking platforms. | |
| 174 | + | |
| 175 | +### Obsidian | |
| 176 | + | |
| 177 | +Ingest an Obsidian vault directly. PlanOpticon follows wikilinks and parses frontmatter. | |
| 178 | + | |
| 179 | +```bash | |
| 180 | +planopticon ingest ~/Obsidian/MyVault --output ./kb --recursive | |
| 181 | +``` | |
| 182 | + | |
| 183 | +### Notion | |
| 184 | + | |
| 185 | +```bash | |
| 186 | +# Set your Notion integration token | |
| 187 | +export NOTION_API_KEY="secret_..." | |
| 188 | + | |
| 189 | +# Authenticate | |
| 190 | +planopticon auth notion | |
| 191 | + | |
| 192 | +# Export knowledge base to Notion | |
| 193 | +planopticon export notion --input ./kb --parent-page abc123 | |
| 194 | +``` | |
| 195 | + | |
| 196 | +### Apple Notes | |
| 197 | + | |
| 198 | +PlanOpticon can read Apple Notes on macOS via the system AppleScript bridge. | |
| 199 | + | |
| 200 | +```bash | |
| 201 | +planopticon ingest --source apple-notes --output ./kb | |
| 202 | +``` | |
| 203 | + | |
| 204 | +### GitHub | |
| 205 | + | |
| 206 | +Ingest README files, wikis, and documentation from GitHub repositories. | |
| 207 | + | |
| 208 | +```bash | |
| 209 | +# Set your GitHub token | |
| 210 | +export GITHUB_TOKEN="ghp_..." | |
| 211 | + | |
| 212 | +# Authenticate | |
| 213 | +planopticon auth github | |
| 214 | + | |
| 215 | +# Ingest a repo's docs | |
| 216 | +planopticon ingest "github://ConflictHQ/PlanOpticon" --output ./kb | |
| 217 | +``` | |
| 45 | 218 | |
| 46 | 219 | ## All cloud sources |
| 47 | 220 | |
| 48 | 221 | ```bash |
| 49 | 222 | pip install planopticon[cloud] |
| 50 | 223 | ``` |
| 51 | 224 |
| --- docs/guide/cloud-sources.md | |
| +++ docs/guide/cloud-sources.md | |
| @@ -1,8 +1,8 @@ | |
| 1 | # Cloud Sources |
| 2 | |
| 3 | PlanOpticon can fetch videos directly from cloud storage services. |
| 4 | |
| 5 | ## Google Drive |
| 6 | |
| 7 | ### Service account auth |
| 8 | |
| @@ -25,10 +25,75 @@ | |
| 25 | ### Install |
| 26 | |
| 27 | ```bash |
| 28 | pip install planopticon[gdrive] |
| 29 | ``` |
| 30 | |
| 31 | ## Dropbox |
| 32 | |
| 33 | ### OAuth2 auth |
| 34 | |
| @@ -40,11 +105,119 @@ | |
| 40 | ### Install |
| 41 | |
| 42 | ```bash |
| 43 | pip install planopticon[dropbox] |
| 44 | ``` |
| 45 | |
| 46 | ## All cloud sources |
| 47 | |
| 48 | ```bash |
| 49 | pip install planopticon[cloud] |
| 50 | ``` |
| 51 |
| --- docs/guide/cloud-sources.md | |
| +++ docs/guide/cloud-sources.md | |
| @@ -1,8 +1,8 @@ | |
| 1 | # Cloud Sources |
| 2 | |
| 3 | PlanOpticon connects to 20+ source platforms for fetching videos, documents, and notes. |
| 4 | |
| 5 | ## Google Drive |
| 6 | |
| 7 | ### Service account auth |
| 8 | |
| @@ -25,10 +25,75 @@ | |
| 25 | ### Install |
| 26 | |
| 27 | ```bash |
| 28 | pip install planopticon[gdrive] |
| 29 | ``` |
| 30 | |
| 31 | ## Google Workspace (gws) |
| 32 | |
| 33 | Full Google Workspace integration beyond just Drive. Access Docs, Sheets, Slides, and Meet recordings through the `gws` CLI group. |
| 34 | |
| 35 | ### Setup |
| 36 | |
| 37 | ```bash |
| 38 | planopticon auth google |
| 39 | ``` |
| 40 | |
| 41 | A single Google OAuth session covers Drive, Docs, Sheets, Slides, and Meet. |
| 42 | |
| 43 | ### Usage |
| 44 | |
| 45 | ```bash |
| 46 | # List all Google Workspace content |
| 47 | planopticon gws list |
| 48 | |
| 49 | # List only Google Docs |
| 50 | planopticon gws list --type docs |
| 51 | |
| 52 | # List Meet recordings |
| 53 | planopticon gws list --type meet |
| 54 | |
| 55 | # Fetch a specific file |
| 56 | planopticon gws fetch abc123def --output ./downloads |
| 57 | |
| 58 | # Ingest an entire Drive folder into a knowledge base |
| 59 | planopticon gws ingest --folder-id abc123 --output ./kb --recursive |
| 60 | ``` |
| 61 | |
| 62 | ## Microsoft 365 (m365) |
| 63 | |
| 64 | Access OneDrive, SharePoint, Teams recordings, and Outlook content. |
| 65 | |
| 66 | ### Setup |
| 67 | |
| 68 | ```bash |
| 69 | # Set your Azure AD app credentials |
| 70 | export MICROSOFT_CLIENT_ID="your-client-id" |
| 71 | export MICROSOFT_CLIENT_SECRET="your-client-secret" |
| 72 | |
| 73 | # Authenticate |
| 74 | planopticon auth microsoft |
| 75 | ``` |
| 76 | |
| 77 | ### Usage |
| 78 | |
| 79 | ```bash |
| 80 | # List all Microsoft 365 content |
| 81 | planopticon m365 list |
| 82 | |
| 83 | # List only Teams recordings |
| 84 | planopticon m365 list --type teams |
| 85 | |
| 86 | # List SharePoint files from a specific site |
| 87 | planopticon m365 list --type sharepoint --site "Engineering" |
| 88 | |
| 89 | # Fetch a specific file |
| 90 | planopticon m365 fetch item-id-123 --output ./downloads |
| 91 | |
| 92 | # Ingest SharePoint content into a knowledge base |
| 93 | planopticon m365 ingest --site "Engineering" --path "/Shared Documents" --output ./kb --recursive |
| 94 | ``` |
| 95 | |
| 96 | ## Dropbox |
| 97 | |
| 98 | ### OAuth2 auth |
| 99 | |
| @@ -40,11 +105,119 @@ | |
| 105 | ### Install |
| 106 | |
| 107 | ```bash |
| 108 | pip install planopticon[dropbox] |
| 109 | ``` |
| 110 | |
| 111 | ## YouTube |
| 112 | |
| 113 | PlanOpticon can ingest YouTube videos by URL. Audio is extracted and transcribed, and any visible content (slides, diagrams) is captured from frames. |
| 114 | |
| 115 | ```bash |
| 116 | # Ingest a YouTube video |
| 117 | planopticon ingest "https://www.youtube.com/watch?v=example" --output ./kb |
| 118 | |
| 119 | # Ingest a playlist |
| 120 | planopticon ingest "https://www.youtube.com/playlist?list=example" --output ./kb |
| 121 | ``` |
| 122 | |
| 123 | YouTube ingestion uses `yt-dlp` under the hood. Install it separately if not already available: |
| 124 | |
| 125 | ```bash |
| 126 | pip install yt-dlp |
| 127 | ``` |
| 128 | |
| 129 | ## Meeting recordings |
| 130 | |
| 131 | Access cloud recordings from Zoom, Microsoft Teams, and Google Meet. |
| 132 | |
| 133 | ### Zoom |
| 134 | |
| 135 | ```bash |
| 136 | # Set credentials and authenticate |
| 137 | export ZOOM_CLIENT_ID="your-client-id" |
| 138 | export ZOOM_CLIENT_SECRET="your-client-secret" |
| 139 | planopticon auth zoom |
| 140 | |
| 141 | # List recent Zoom recordings |
| 142 | planopticon recordings zoom-list |
| 143 | |
| 144 | # List recordings from a date range |
| 145 | planopticon recordings zoom-list --from 2026-01-01 --to 2026-02-01 |
| 146 | ``` |
| 147 | |
| 148 | ### Microsoft Teams |
| 149 | |
| 150 | ```bash |
| 151 | # Authenticate with Microsoft (covers Teams) |
| 152 | planopticon auth microsoft |
| 153 | |
| 154 | # List Teams recordings |
| 155 | planopticon recordings teams-list |
| 156 | |
| 157 | # List recordings from a date range |
| 158 | planopticon recordings teams-list --from 2026-01-01 --to 2026-02-01 |
| 159 | ``` |
| 160 | |
| 161 | ### Google Meet |
| 162 | |
| 163 | ```bash |
| 164 | # Authenticate with Google (covers Meet) |
| 165 | planopticon auth google |
| 166 | |
| 167 | # List Meet recordings |
| 168 | planopticon recordings meet-list --limit 10 |
| 169 | ``` |
| 170 | |
| 171 | ## Notes sources |
| 172 | |
| 173 | PlanOpticon can ingest notes and documents from several note-taking platforms. |
| 174 | |
| 175 | ### Obsidian |
| 176 | |
| 177 | Ingest an Obsidian vault directly. PlanOpticon follows wikilinks and parses frontmatter. |
| 178 | |
| 179 | ```bash |
| 180 | planopticon ingest ~/Obsidian/MyVault --output ./kb --recursive |
| 181 | ``` |
| 182 | |
| 183 | ### Notion |
| 184 | |
| 185 | ```bash |
| 186 | # Set your Notion integration token |
| 187 | export NOTION_API_KEY="secret_..." |
| 188 | |
| 189 | # Authenticate |
| 190 | planopticon auth notion |
| 191 | |
| 192 | # Export knowledge base to Notion |
| 193 | planopticon export notion --input ./kb --parent-page abc123 |
| 194 | ``` |
| 195 | |
| 196 | ### Apple Notes |
| 197 | |
| 198 | PlanOpticon can read Apple Notes on macOS via the system AppleScript bridge. |
| 199 | |
| 200 | ```bash |
| 201 | planopticon ingest --source apple-notes --output ./kb |
| 202 | ``` |
| 203 | |
| 204 | ### GitHub |
| 205 | |
| 206 | Ingest README files, wikis, and documentation from GitHub repositories. |
| 207 | |
| 208 | ```bash |
| 209 | # Set your GitHub token |
| 210 | export GITHUB_TOKEN="ghp_..." |
| 211 | |
| 212 | # Authenticate |
| 213 | planopticon auth github |
| 214 | |
| 215 | # Ingest a repo's docs |
| 216 | planopticon ingest "github://ConflictHQ/PlanOpticon" --output ./kb |
| 217 | ``` |
| 218 | |
| 219 | ## All cloud sources |
| 220 | |
| 221 | ```bash |
| 222 | pip install planopticon[cloud] |
| 223 | ``` |
| 224 |
+24
-4
| --- docs/index.md | ||
| +++ docs/index.md | ||
| @@ -1,24 +1,29 @@ | ||
| 1 | 1 | # PlanOpticon |
| 2 | 2 | |
| 3 | -**AI-powered video analysis and knowledge extraction.** | |
| 3 | +**AI-powered video analysis, knowledge extraction, and planning.** | |
| 4 | 4 | |
| 5 | -PlanOpticon processes video recordings into structured knowledge — transcripts, diagrams, action items, key points, and knowledge graphs. It auto-discovers available models across OpenAI, Anthropic, and Gemini, and produces rich multi-format output. | |
| 5 | +PlanOpticon processes video recordings and documents into structured knowledge — transcripts, diagrams, action items, key points, and knowledge graphs. It connects to 20+ source platforms, auto-discovers available models across multiple AI providers, and produces rich multi-format output with an interactive companion REPL and planning agent. | |
| 6 | 6 | |
| 7 | 7 | --- |
| 8 | 8 | |
| 9 | 9 | ## Features |
| 10 | 10 | |
| 11 | -- **Multi-provider AI** — Automatically discovers and routes to the best available model across OpenAI, Anthropic, and Google Gemini | |
| 11 | +- **Multi-provider AI** — Automatically discovers and routes to the best available model across OpenAI, Anthropic, Google Gemini, and more | |
| 12 | +- **Planning agent** — Agentic analysis that adaptively adjusts depth, focus, and strategy based on content | |
| 13 | +- **Companion REPL** — Interactive chat interface for exploring your knowledge base conversationally | |
| 14 | +- **20+ source connectors** — Google Workspace, Microsoft 365, Zoom, Teams, Meet, Notion, GitHub, YouTube, Obsidian, Apple Notes, and more | |
| 15 | +- **Document export** — Export knowledge to Markdown, Obsidian, Notion, and exchange formats | |
| 16 | +- **OAuth authentication** — Built-in `planopticon auth` for Google, Dropbox, Zoom, Notion, GitHub, and Microsoft | |
| 12 | 17 | - **Smart frame extraction** — Change detection for transitions + periodic capture (every 30s) for slow-evolving content like document scrolling |
| 13 | 18 | - **People frame filtering** — OpenCV face detection removes webcam/video conference frames, keeping only shared content (slides, documents, screen shares) |
| 14 | 19 | - **Diagram extraction** — Vision model-based classification detects flowcharts, architecture diagrams, charts, and whiteboards |
| 15 | 20 | - **Knowledge graphs** — Extracts entities and relationships, builds and merges knowledge graphs across videos |
| 16 | 21 | - **Action item detection** — Finds commitments, tasks, and follow-ups with assignees and deadlines |
| 17 | 22 | - **Batch processing** — Process entire folders of videos with merged knowledge graphs and cross-referencing |
| 18 | 23 | - **Rich output** — Markdown, HTML, PDF, Mermaid diagrams, SVG/PNG renderings, JSON manifests |
| 19 | -- **Cloud sources** — Fetch videos from Google Drive and Dropbox shared folders | |
| 24 | +- **Cloud sources** — Fetch videos from Google Drive, Dropbox, and many more cloud platforms | |
| 20 | 25 | - **Checkpoint/resume** — Pipeline resumes from where it left off if interrupted — no wasted work |
| 21 | 26 | - **Screengrab fallback** — When extraction isn't perfect, captures frames with captions — something is always better than nothing |
| 22 | 27 | |
| 23 | 28 | ## Quick Start |
| 24 | 29 | |
| @@ -27,10 +32,25 @@ | ||
| 27 | 32 | pip install planopticon |
| 28 | 33 | |
| 29 | 34 | # Analyze a single video |
| 30 | 35 | planopticon analyze -i meeting.mp4 -o ./output |
| 31 | 36 | |
| 37 | +# Ingest documents and build a knowledge graph | |
| 38 | +planopticon ingest ./notes/ --output ./kb --recursive | |
| 39 | + | |
| 40 | +# Chat with your knowledge base | |
| 41 | +planopticon companion --kb ./kb | |
| 42 | + | |
| 43 | +# Run the planning agent interactively | |
| 44 | +planopticon agent --kb ./kb --interactive | |
| 45 | + | |
| 46 | +# Query the knowledge graph | |
| 47 | +planopticon query stats | |
| 48 | + | |
| 49 | +# Export to Obsidian | |
| 50 | +planopticon export obsidian --input ./kb --output ./vault | |
| 51 | + | |
| 32 | 52 | # Process a folder of videos |
| 33 | 53 | planopticon batch -i ./recordings -o ./output --title "Weekly Meetings" |
| 34 | 54 | |
| 35 | 55 | # See available AI models |
| 36 | 56 | planopticon list-models |
| 37 | 57 | |
| 38 | 58 | ADDED knowledge-base/viewer.html |
| --- docs/index.md | |
| +++ docs/index.md | |
| @@ -1,24 +1,29 @@ | |
| 1 | # PlanOpticon |
| 2 | |
| 3 | **AI-powered video analysis and knowledge extraction.** |
| 4 | |
| 5 | PlanOpticon processes video recordings into structured knowledge — transcripts, diagrams, action items, key points, and knowledge graphs. It auto-discovers available models across OpenAI, Anthropic, and Gemini, and produces rich multi-format output. |
| 6 | |
| 7 | --- |
| 8 | |
| 9 | ## Features |
| 10 | |
| 11 | - **Multi-provider AI** — Automatically discovers and routes to the best available model across OpenAI, Anthropic, and Google Gemini |
| 12 | - **Smart frame extraction** — Change detection for transitions + periodic capture (every 30s) for slow-evolving content like document scrolling |
| 13 | - **People frame filtering** — OpenCV face detection removes webcam/video conference frames, keeping only shared content (slides, documents, screen shares) |
| 14 | - **Diagram extraction** — Vision model-based classification detects flowcharts, architecture diagrams, charts, and whiteboards |
| 15 | - **Knowledge graphs** — Extracts entities and relationships, builds and merges knowledge graphs across videos |
| 16 | - **Action item detection** — Finds commitments, tasks, and follow-ups with assignees and deadlines |
| 17 | - **Batch processing** — Process entire folders of videos with merged knowledge graphs and cross-referencing |
| 18 | - **Rich output** — Markdown, HTML, PDF, Mermaid diagrams, SVG/PNG renderings, JSON manifests |
| 19 | - **Cloud sources** — Fetch videos from Google Drive and Dropbox shared folders |
| 20 | - **Checkpoint/resume** — Pipeline resumes from where it left off if interrupted — no wasted work |
| 21 | - **Screengrab fallback** — When extraction isn't perfect, captures frames with captions — something is always better than nothing |
| 22 | |
| 23 | ## Quick Start |
| 24 | |
| @@ -27,10 +32,25 @@ | |
| 27 | pip install planopticon |
| 28 | |
| 29 | # Analyze a single video |
| 30 | planopticon analyze -i meeting.mp4 -o ./output |
| 31 | |
| 32 | # Process a folder of videos |
| 33 | planopticon batch -i ./recordings -o ./output --title "Weekly Meetings" |
| 34 | |
| 35 | # See available AI models |
| 36 | planopticon list-models |
| 37 | |
| 38 | ADDED knowledge-base/viewer.html
| --- docs/index.md | |
| +++ docs/index.md | |
| @@ -1,24 +1,29 @@ | |
| 1 | # PlanOpticon |
| 2 | |
| 3 | **AI-powered video analysis, knowledge extraction, and planning.** |
| 4 | |
| 5 | PlanOpticon processes video recordings and documents into structured knowledge — transcripts, diagrams, action items, key points, and knowledge graphs. It connects to 20+ source platforms, auto-discovers available models across multiple AI providers, and produces rich multi-format output with an interactive companion REPL and planning agent. |
| 6 | |
| 7 | --- |
| 8 | |
| 9 | ## Features |
| 10 | |
| 11 | - **Multi-provider AI** — Automatically discovers and routes to the best available model across OpenAI, Anthropic, Google Gemini, and more |
| 12 | - **Planning agent** — Agentic analysis that adaptively adjusts depth, focus, and strategy based on content |
| 13 | - **Companion REPL** — Interactive chat interface for exploring your knowledge base conversationally |
| 14 | - **20+ source connectors** — Google Workspace, Microsoft 365, Zoom, Teams, Meet, Notion, GitHub, YouTube, Obsidian, Apple Notes, and more |
| 15 | - **Document export** — Export knowledge to Markdown, Obsidian, Notion, and exchange formats |
| 16 | - **OAuth authentication** — Built-in `planopticon auth` for Google, Dropbox, Zoom, Notion, GitHub, and Microsoft |
| 17 | - **Smart frame extraction** — Change detection for transitions + periodic capture (every 30s) for slow-evolving content like document scrolling |
| 18 | - **People frame filtering** — OpenCV face detection removes webcam/video conference frames, keeping only shared content (slides, documents, screen shares) |
| 19 | - **Diagram extraction** — Vision model-based classification detects flowcharts, architecture diagrams, charts, and whiteboards |
| 20 | - **Knowledge graphs** — Extracts entities and relationships, builds and merges knowledge graphs across videos |
| 21 | - **Action item detection** — Finds commitments, tasks, and follow-ups with assignees and deadlines |
| 22 | - **Batch processing** — Process entire folders of videos with merged knowledge graphs and cross-referencing |
| 23 | - **Rich output** — Markdown, HTML, PDF, Mermaid diagrams, SVG/PNG renderings, JSON manifests |
| 24 | - **Cloud sources** — Fetch videos from Google Drive, Dropbox, and many more cloud platforms |
| 25 | - **Checkpoint/resume** — Pipeline resumes from where it left off if interrupted — no wasted work |
| 26 | - **Screengrab fallback** — When extraction isn't perfect, captures frames with captions — something is always better than nothing |
| 27 | |
| 28 | ## Quick Start |
| 29 | |
| @@ -27,10 +32,25 @@ | |
| 32 | pip install planopticon |
| 33 | |
| 34 | # Analyze a single video |
| 35 | planopticon analyze -i meeting.mp4 -o ./output |
| 36 | |
| 37 | # Ingest documents and build a knowledge graph |
| 38 | planopticon ingest ./notes/ --output ./kb --recursive |
| 39 | |
| 40 | # Chat with your knowledge base |
| 41 | planopticon companion --kb ./kb |
| 42 | |
| 43 | # Run the planning agent interactively |
| 44 | planopticon agent --kb ./kb --interactive |
| 45 | |
| 46 | # Query the knowledge graph |
| 47 | planopticon query stats |
| 48 | |
| 49 | # Export to Obsidian |
| 50 | planopticon export obsidian --input ./kb --output ./vault |
| 51 | |
| 52 | # Process a folder of videos |
| 53 | planopticon batch -i ./recordings -o ./output --title "Weekly Meetings" |
| 54 | |
| 55 | # See available AI models |
| 56 | planopticon list-models |
| 57 | |
| 58 | ADDED knowledge-base/viewer.html
+406
| --- a/knowledge-base/viewer.html | ||
| +++ b/knowledge-base/viewer.html | ||
| @@ -0,0 +1,406 @@ | ||
| 1 | +<!DOCTYPE html> | |
| 2 | +<html lang="en"> | |
| 3 | +<head> | |
| 4 | +<meta charset="UTF-8"> | |
| 5 | +<meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| 6 | +<title>PlanOpticon Knowledge Graph Viewer</title> | |
| 7 | +<script src="https://d3js.org/d3.v7.min.js"></script> | |
| 8 | +<style> | |
| 9 | + * { margin: 0; padding: 0; box-sizing: border-box; } | |
| 10 | + body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; background: #1a1a2e; color: #e0e0e0; overflow: hidden; height: 100vh; } | |
| 11 | + #toolbar { position: fixed; top: 0; left: 0; right: 0; z-index: 10; background: #16213e; padding: 8px 16px; display: flex; align-items: center; gap: 12px; border-bottom: 1px solid #0f3460; flex-wrap: wrap; } | |
| 12 | + #toolbar h1 { font-size: 14px; font-weight: 600; color: #e94560; white-space: nowrap; } | |
| 13 | + #search { background: #1a1a2e; border: 1px solid #0f3460; color: #e0e0e0; padding: 5px 10px; border-radius: 4px; font-size: 13px; width: 200px; } | |
| 14 | + #search::placeholder { color: #666; } | |
| 15 | + .filter-btn { background: #1a1a2e; border: 1px solid #0f3460; color: #e0e0e0; padding: 4px 10px; border-radius: 12px; font-size: 12px; cursor: pointer; display: flex; align-items: center; gap: 4px; } | |
| 16 | + .filter-btn.active { border-color: #e94560; } | |
| 17 | + .filter-btn .dot { width: 8px; height: 8px; border-radius: 50%; display: inline-block; } | |
| 18 | + #drop-zone { position: fixed; top: 50%; left: 50%; transform: translate(-50%, -50%); z-index: 20; background: #16213e; border: 2px dashed #0f3460; border-radius: 12px; padding: 60px; text-align: center; } | |
| 19 | + #drop-zone.hidden { display: none; } | |
| 20 | + #drop-zone h2 { color: #e94560; margin-bottom: 12px; } | |
| 21 | + #drop-zone p { color: #888; margin-bottom: 16px; font-size: 14px; } | |
| 22 | + #file-input { display: none; } | |
| 23 | + #drop-zone label { background: #e94560; color: white; padding: 8px 20px; border-radius: 6px; cursor: pointer; font-size: 14px; } | |
| 24 | + #drop-zone label:hover { background: #c73e54; } | |
| 25 | + #graph-container { width: 100%; height: 100vh; padding-top: 44px; } | |
| 26 | + svg { width: 100%; height: 100%; } | |
| 27 | + .node circle { stroke: #333; stroke-width: 1.5px; cursor: pointer; } | |
| 28 | + .node text { font-size: 10px; fill: #e0e0e0; pointer-events: none; text-anchor: middle; } | |
| 29 | + .link { stroke: #334; stroke-opacity: 0.6; } | |
| 30 | + .link-label { font-size: 8px; fill: #666; pointer-events: none; } | |
| 31 | + .node.highlighted circle { stroke: #e94560; stroke-width: 3px; } | |
| 32 | + .node.dimmed { opacity: 0.15; } | |
| 33 | + .link.dimmed { opacity: 0.05; } | |
| 34 | + #detail-panel { position: fixed; top: 44px; right: 0; width: 300px; height: calc(100vh - 44px); background: #16213e; border-left: 1px solid #0f3460; padding: 16px; overflow-y: auto; transform: translateX(100%); transition: transform 0.2s; z-index: 10; } | |
| 35 | + #detail-panel.open { transform: translateX(0); } | |
| 36 | + #detail-panel h3 { color: #e94560; margin-bottom: 4px; font-size: 16px; } | |
| 37 | + #detail-panel .type-badge { display: inline-block; padding: 2px 8px; border-radius: 8px; font-size: 11px; margin-bottom: 12px; } | |
| 38 | + #detail-panel .section { margin-bottom: 14px; } | |
| 39 | + #detail-panel .section h4 { font-size: 12px; color: #888; text-transform: uppercase; margin-bottom: 6px; } | |
| 40 | + #detail-panel .section p, #detail-panel .section li { font-size: 13px; line-height: 1.5; } | |
| 41 | + #detail-panel ul { list-style: none; padding: 0; } | |
| 42 | + #detail-panel ul li { padding: 3px 0; border-bottom: 1px solid #0f3460; } | |
| 43 | + #detail-panel .close-btn { position: absolute; top: 12px; right: 12px; background: none; border: none; color: #888; cursor: pointer; font-size: 18px; } | |
| 44 | + #stats { font-size: 11px; color: #666; white-space: nowrap; } | |
| 45 | + .drag-over #drop-zone { border-color: #e94560; background: #1a1a3e; } | |
| 46 | + #no-d3-msg { display: none; position: fixed; top: 50%; left: 50%; transform: translate(-50%, -50%); background: #16213e; padding: 40px; border-radius: 12px; text-align: center; border: 1px solid #e94560; z-index: 100; } | |
| 47 | + #no-d3-msg h2 { color: #e94560; margin-bottom: 8px; } | |
| 48 | +</style> | |
| 49 | +</head> | |
| 50 | +<body> | |
| 51 | +<div id="no-d3-msg"> | |
| 52 | + <h2>D3.js Required</h2> | |
| 53 | + <p>This viewer requires an internet connection on first load to fetch D3.js.</p> | |
| 54 | + <p style="color:#888; font-size:13px;">Load from: https://d3js.org/d3.v7.min.js</p> | |
| 55 | +</div> | |
| 56 | + | |
| 57 | +<div id="toolbar"> | |
| 58 | + <h1>PlanOpticon</h1> | |
| 59 | + <input type="text" id="search" placeholder="Search entities..."> | |
| 60 | + <button class="filter-btn active" data-type="person"><span class="dot" style="background:#f9d5e5"></span> Person</button> | |
| 61 | + <button class="filter-btn active" data-type="concept"><span class="dot" style="background:#eeeeee"></span> Concept</button> | |
| 62 | + <button class="filter-btn active" data-type="technology"><span class="dot" style="background:#d5e5f9"></span> Technology</button> | |
| 63 | + <button class="filter-btn active" data-type="organization"><span class="dot" style="background:#f9e5d5"></span> Organization</button> | |
| 64 | + <span id="stats"></span> | |
| 65 | +</div> | |
| 66 | + | |
| 67 | +<div id="drop-zone"> | |
| 68 | + <h2>Load Knowledge Graph</h2> | |
| 69 | + <p>Drag and drop a knowledge_graph.json file here, or click to browse.</p> | |
| 70 | + <label for="file-input">Choose File</label> | |
| 71 | + <input type="file" id="file-input" accept=".json"> | |
| 72 | +</div> | |
| 73 | + | |
| 74 | +<div id="graph-container"> | |
| 75 | + <svg id="graph-svg"></svg> | |
| 76 | +</div> | |
| 77 | + | |
| 78 | +<div id="detail-panel"> | |
| 79 | + <button class="close-btn" id="close-detail">×</button> | |
| 80 | + <h3 id="detail-name"></h3> | |
| 81 | + <div class="type-badge" id="detail-type"></div> | |
| 82 | + <div class="section" id="desc-section"> | |
| 83 | + <h4>Descriptions</h4> | |
| 84 | + <ul id="detail-descriptions"></ul> | |
| 85 | + </div> | |
| 86 | + <div class="section" id="conn-section"> | |
| 87 | + <h4>Connections</h4> | |
| 88 | + <ul id="detail-connections"></ul> | |
| 89 | + </div> | |
| 90 | +</div> | |
| 91 | + | |
| 92 | +<script> | |
| 93 | +(function() { | |
| 94 | + // Check D3 loaded | |
| 95 | + if (typeof d3 === 'undefined') { | |
| 96 | + document.getElementById('no-d3-msg').style.display = 'block'; | |
| 97 | + return; | |
| 98 | + } | |
| 99 | + | |
| 100 | + const TYPE_COLORS = { | |
| 101 | + person: '#f9d5e5', | |
| 102 | + concept: '#eeeeee', | |
| 103 | + technology: '#d5e5f9', | |
| 104 | + organization: '#f9e5d5', | |
| 105 | + time: '#e5d5f9', | |
| 106 | + diagram: '#d5f9e5' | |
| 107 | + }; | |
| 108 | + const DEFAULT_COLOR = '#cccccc'; | |
| 109 | + | |
| 110 | + let graphData = null; // { nodes: [], relationships: [] } | |
| 111 | + let simulation = null; | |
| 112 | + let nodesG, linksG, labelsG; | |
| 113 | + let activeTypes = new Set(['person', 'concept', 'technology', 'organization', 'time', 'diagram']); | |
| 114 | + | |
| 115 | + // --- File loading --- | |
| 116 | + const dropZone = document.getElementById('drop-zone'); | |
| 117 | + const fileInput = document.getElementById('file-input'); | |
| 118 | + | |
| 119 | + dropZone.addEventListener('dragover', e => { e.preventDefault(); document.body.classList.add('drag-over'); }); | |
| 120 | + dropZone.addEventListener('dragleave', () => document.body.classList.remove('drag-over')); | |
| 121 | + dropZone.addEventListener('drop', e => { | |
| 122 | + e.preventDefault(); | |
| 123 | + document.body.classList.remove('drag-over'); | |
| 124 | + const file = e.dataTransfer.files[0]; | |
| 125 | + if (file) loadFile(file); | |
| 126 | + }); | |
| 127 | + fileInput.addEventListener('change', e => { if (e.target.files[0]) loadFile(e.target.files[0]); }); | |
| 128 | + | |
| 129 | + function loadFile(file) { | |
| 130 | + const reader = new FileReader(); | |
| 131 | + reader.onload = e => { | |
| 132 | + try { | |
| 133 | + const data = JSON.parse(e.target.result); | |
| 134 | + initGraph(data); | |
| 135 | + } catch (err) { | |
| 136 | + alert('Invalid JSON file: ' + err.message); | |
| 137 | + } | |
| 138 | + }; | |
| 139 | + reader.readAsText(file); | |
| 140 | + } | |
| 141 | + | |
| 142 | + // --- Pre-embedded data support --- | |
| 143 | + if (window.__PLANOPTICON_DATA__) { | |
| 144 | + window.addEventListener('DOMContentLoaded', () => initGraph(window.__PLANOPTICON_DATA__)); | |
| 145 | + } | |
| 146 | + // Also check after script runs (if DOM already ready) | |
| 147 | + if (document.readyState !== 'loading' && window.__PLANOPTICON_DATA__) { | |
| 148 | + initGraph(window.__PLANOPTICON_DATA__); | |
| 149 | + } | |
| 150 | + | |
| 151 | + // --- Graph init --- | |
| 152 | + function initGraph(raw) { | |
| 153 | + dropZone.classList.add('hidden'); | |
| 154 | + graphData = normalize(raw); | |
| 155 | + render(); | |
| 156 | + } | |
| 157 | + | |
| 158 | + function normalize(raw) { | |
| 159 | + // Accept both { nodes: [], relationships: [] } and { entities: [], relationships: [] } | |
| 160 | + const rawNodes = raw.nodes || raw.entities || []; | |
| 161 | + const rawRels = raw.relationships || raw.edges || []; | |
| 162 | + | |
| 163 | + const nodes = rawNodes.map(n => ({ | |
| 164 | + id: n.name || n.id, | |
| 165 | + name: n.name || n.id, | |
| 166 | + type: (n.type || 'concept').toLowerCase(), | |
| 167 | + descriptions: n.descriptions || (n.description ? [n.description] : []), | |
| 168 | + occurrences: n.occurrences || [] | |
| 169 | + })); | |
| 170 | + | |
| 171 | + const nodeSet = new Set(nodes.map(n => n.id)); | |
| 172 | + | |
| 173 | + const links = rawRels | |
| 174 | + .filter(r => nodeSet.has(r.source) && nodeSet.has(r.target)) | |
| 175 | + .map(r => ({ | |
| 176 | + source: r.source, | |
| 177 | + target: r.target, | |
| 178 | + type: r.type || 'related_to' | |
| 179 | + })); | |
| 180 | + | |
| 181 | + // Compute connection counts | |
| 182 | + const connCount = {}; | |
| 183 | + links.forEach(l => { | |
| 184 | + const s = typeof l.source === 'object' ? l.source.id : l.source; | |
| 185 | + const t = typeof l.target === 'object' ? l.target.id : l.target; | |
| 186 | + connCount[s] = (connCount[s] || 0) + 1; | |
| 187 | + connCount[t] = (connCount[t] || 0) + 1; | |
| 188 | + }); | |
| 189 | + nodes.forEach(n => { n.connections = connCount[n.id] || 0; }); | |
| 190 | + | |
| 191 | + return { nodes, links }; | |
| 192 | + } | |
| 193 | + | |
| 194 | + function render() { | |
| 195 | + const svg = d3.select('#graph-svg'); | |
| 196 | + svg.selectAll('*').remove(); | |
| 197 | + | |
| 198 | + const width = window.innerWidth; | |
| 199 | + const height = window.innerHeight - 44; | |
| 200 | + | |
| 201 | + const g = svg.append('g'); | |
| 202 | + | |
| 203 | + // Zoom | |
| 204 | + const zoom = d3.zoom() | |
| 205 | + .scaleExtent([0.1, 8]) | |
| 206 | + .on('zoom', e => g.attr('transform', e.transform)); | |
| 207 | + svg.call(zoom); | |
| 208 | + | |
| 209 | + // Filter nodes/links by active types | |
| 210 | + const visibleNodes = graphData.nodes.filter(n => activeTypes.has(n.type)); | |
| 211 | + const visibleIds = new Set(visibleNodes.map(n => n.id)); | |
| 212 | + const visibleLinks = graphData.links.filter(l => { | |
| 213 | + const s = typeof l.source === 'object' ? l.source.id : l.source; | |
| 214 | + const t = typeof l.target === 'object' ? l.target.id : l.target; | |
| 215 | + return visibleIds.has(s) && visibleIds.has(t); | |
| 216 | + }); | |
| 217 | + | |
| 218 | + // Stats | |
| 219 | + document.getElementById('stats').textContent = visibleNodes.length + ' nodes, ' + visibleLinks.length + ' edges'; | |
| 220 | + | |
| 221 | + // Links | |
| 222 | + linksG = g.append('g').selectAll('line') | |
| 223 | + .data(visibleLinks) | |
| 224 | + .join('line') | |
| 225 | + .attr('class', 'link') | |
| 226 | + .attr('stroke-width', 1); | |
| 227 | + | |
| 228 | + // Link labels | |
| 229 | + labelsG = g.append('g').selectAll('text') | |
| 230 | + .data(visibleLinks) | |
| 231 | + .join('text') | |
| 232 | + .attr('class', 'link-label') | |
| 233 | + .text(d => d.type); | |
| 234 | + | |
| 235 | + // Nodes | |
| 236 | + const maxConn = Math.max(1, d3.max(visibleNodes, d => d.connections)); | |
| 237 | + const radiusScale = d3.scaleSqrt().domain([0, maxConn]).range([5, 24]); | |
| 238 | + | |
| 239 | + nodesG = g.append('g').selectAll('g') | |
| 240 | + .data(visibleNodes) | |
| 241 | + .join('g') | |
| 242 | + .attr('class', 'node') | |
| 243 | + .call(d3.drag() | |
| 244 | + .on('start', dragStart) | |
| 245 | + .on('drag', dragged) | |
| 246 | + .on('end', dragEnd)) | |
| 247 | + .on('click', (e, d) => showDetail(d)); | |
| 248 | + | |
| 249 | + nodesG.append('circle') | |
| 250 | + .attr('r', d => radiusScale(d.connections)) | |
| 251 | + .attr('fill', d => TYPE_COLORS[d.type] || DEFAULT_COLOR); | |
| 252 | + | |
| 253 | + nodesG.append('text') | |
| 254 | + .attr('dy', d => radiusScale(d.connections) + 12) | |
| 255 | + .text(d => d.name.length > 20 ? d.name.slice(0, 18) + '..' : d.name); | |
| 256 | + | |
| 257 | + // Simulation | |
| 258 | + simulation = d3.forceSimulation(visibleNodes) | |
| 259 | + .force('link', d3.forceLink(visibleLinks).id(d => d.id).distance(80)) | |
| 260 | + .force('charge', d3.forceManyBody().strength(-200)) | |
| 261 | + .force('center', d3.forceCenter(width / 2, height / 2)) | |
| 262 | + .force('collision', d3.forceCollide().radius(d => radiusScale(d.connections) + 4)) | |
| 263 | + .on('tick', () => { | |
| 264 | + linksG | |
| 265 | + .attr('x1', d => d.source.x).attr('y1', d => d.source.y) | |
| 266 | + .attr('x2', d => d.target.x).attr('y2', d => d.target.y); | |
| 267 | + labelsG | |
| 268 | + .attr('x', d => (d.source.x + d.target.x) / 2) | |
| 269 | + .attr('y', d => (d.source.y + d.target.y) / 2); | |
| 270 | + nodesG.attr('transform', d => 'translate(' + d.x + ',' + d.y + ')'); | |
| 271 | + }); | |
| 272 | + | |
| 273 | + // Drag handlers | |
| 274 | + function dragStart(e, d) { | |
| 275 | + if (!e.active) simulation.alphaTarget(0.3).restart(); | |
| 276 | + d.fx = d.x; d.fy = d.y; | |
| 277 | + } | |
| 278 | + function dragged(e, d) { d.fx = e.x; d.fy = e.y; } | |
| 279 | + function dragEnd(e, d) { | |
| 280 | + if (!e.active) simulation.alphaTarget(0); | |
| 281 | + d.fx = null; d.fy = null; | |
| 282 | + } | |
| 283 | + | |
| 284 | + // Store zoom for centering | |
| 285 | + svg._zoom = zoom; | |
| 286 | + svg._g = g; | |
| 287 | + } | |
| 288 | + | |
| 289 | + // --- Search --- | |
| 290 | + const searchInput = document.getElementById('search'); | |
| 291 | + searchInput.addEventListener('input', () => { | |
| 292 | + const q = searchInput.value.toLowerCase().trim(); | |
| 293 | + if (!graphData || !nodesG) return; | |
| 294 | + | |
| 295 | + if (!q) { | |
| 296 | + nodesG.classed('highlighted', false).classed('dimmed', false); | |
| 297 | + linksG.classed('dimmed', false); | |
| 298 | + return; | |
| 299 | + } | |
| 300 | + | |
| 301 | + const matches = new Set(); | |
| 302 | + graphData.nodes.forEach(n => { | |
| 303 | + if (n.name.toLowerCase().includes(q)) matches.add(n.id); | |
| 304 | + }); | |
| 305 | + | |
| 306 | + // Also include direct neighbors of matches | |
| 307 | + const neighbors = new Set(matches); | |
| 308 | + graphData.links.forEach(l => { | |
| 309 | + const s = typeof l.source === 'object' ? l.source.id : l.source; | |
| 310 | + const t = typeof l.target === 'object' ? l.target.id : l.target; | |
| 311 | + if (matches.has(s)) neighbors.add(t); | |
| 312 | + if (matches.has(t)) neighbors.add(s); | |
| 313 | + }); | |
| 314 | + | |
| 315 | + nodesG.classed('highlighted', d => matches.has(d.id)); | |
| 316 | + nodesG.classed('dimmed', d => !neighbors.has(d.id)); | |
| 317 | + linksG.classed('dimmed', d => { | |
| 318 | + const s = typeof d.source === 'object' ? d.source.id : d.source; | |
| 319 | + const t = typeof d.target === 'object' ? d.target.id : d.target; | |
| 320 | + return !neighbors.has(s) || !neighbors.has(t); | |
| 321 | + }); | |
| 322 | + | |
| 323 | + // Center on first match | |
| 324 | + if (matches.size > 0) { | |
| 325 | + const first = graphData.nodes.find(n => matches.has(n.id)); | |
| 326 | + if (first && first.x != null) { | |
| 327 | + const svg = d3.select('#graph-svg'); | |
| 328 | + const width = window.innerWidth; | |
| 329 | + const height = window.innerHeight - 44; | |
| 330 | + svg.transition().duration(500).call( | |
| 331 | + svg._zoom.transform, | |
| 332 | + d3.zoomIdentity.translate(width / 2 - first.x, height / 2 - first.y) | |
| 333 | + ); | |
| 334 | + } | |
| 335 | + } | |
| 336 | + }); | |
| 337 | + | |
| 338 | + // --- Filter toggles --- | |
| 339 | + document.querySelectorAll('.filter-btn').forEach(btn => { | |
| 340 | + btn.addEventListener('click', () => { | |
| 341 | + const type = btn.dataset.type; | |
| 342 | + btn.classList.toggle('active'); | |
| 343 | + if (activeTypes.has(type)) activeTypes.delete(type); | |
| 344 | + else activeTypes.add(type); | |
| 345 | + if (graphData) render(); | |
| 346 | + }); | |
| 347 | + }); | |
| 348 | + | |
| 349 | + // --- Detail panel --- | |
| 350 | + function showDetail(node) { | |
| 351 | + const panel = document.getElementById('detail-panel'); | |
| 352 | + document.getElementById('detail-name').textContent = node.name; | |
| 353 | + const badge = document.getElementById('detail-type'); | |
| 354 | + badge.textContent = node.type; | |
| 355 | + badge.style.background = TYPE_COLORS[node.type] || DEFAULT_COLOR; | |
| 356 | + badge.style.color = '#1a1a2e'; | |
| 357 | + | |
| 358 | + const descList = document.getElementById('detail-descriptions'); | |
| 359 | + descList.innerHTML = ''; | |
| 360 | + if (node.descriptions.length === 0) { | |
| 361 | + descList.innerHTML = '<li style="color:#666">No descriptions</li>'; | |
| 362 | + } else { | |
| 363 | + node.descriptions.forEach(d => { | |
| 364 | + const li = document.createElement('li'); | |
| 365 | + li.textContent = d; | |
| 366 | + descList.appendChild(li); | |
| 367 | + }); | |
| 368 | + } | |
| 369 | + | |
| 370 | + const connList = document.getElementById('detail-connections'); | |
| 371 | + connList.innerHTML = ''; | |
| 372 | + const connections = []; | |
| 373 | + graphData.links.forEach(l => { | |
| 374 | + const s = typeof l.source === 'object' ? l.source.id : l.source; | |
| 375 | + const t = typeof l.target === 'object' ? l.target.id : l.target; | |
| 376 | + if (s === node.id) connections.push({ target: t, type: l.type, dir: '->' }); | |
| 377 | + if (t === node.id) connections.push({ target: s, type: l.type, dir: '<-' }); | |
| 378 | + }); | |
| 379 | + if (connections.length === 0) { | |
| 380 | + connList.innerHTML = '<li style="color:#666">No connections</li>'; | |
| 381 | + } else { | |
| 382 | + connections.forEach(c => { | |
| 383 | + const li = document.createElement('li'); | |
| 384 | + li.textContent = c.dir + ' ' + c.target + ' (' + c.type + ')'; | |
| 385 | + li.style.cursor = 'pointer'; | |
| 386 | + li.addEventListener('click', () => { | |
| 387 | + searchInput.value = c.target; | |
| 388 | + searchInput.dispatchEvent(new Event('input')); | |
| 389 | + }); | |
| 390 | + connList.appendChild(li); | |
| 391 | + }); | |
| 392 | + } | |
| 393 | + | |
| 394 | + panel.classList.add('open'); | |
| 395 | + } | |
| 396 | + | |
| 397 | + document.getElementById('close-detail').addEventListener('click', () => { | |
| 398 | + document.getElementById('detail-panel').classList.remove('open'); | |
| 399 | + }); | |
| 400 | + | |
| 401 | + // --- Resize --- | |
| 402 | + window.addEventListener('resize', () => { if (graphData) render(); }); | |
| 403 | +})(); | |
| 404 | +</script> | |
| 405 | +</body> | |
| 406 | +</html> |
| --- a/knowledge-base/viewer.html | |
| +++ b/knowledge-base/viewer.html | |
| @@ -0,0 +1,406 @@ | |
| --- a/knowledge-base/viewer.html | |
| +++ b/knowledge-base/viewer.html | |
| @@ -0,0 +1,406 @@ | |
| 1 | <!DOCTYPE html> |
| 2 | <html lang="en"> |
| 3 | <head> |
| 4 | <meta charset="UTF-8"> |
| 5 | <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| 6 | <title>PlanOpticon Knowledge Graph Viewer</title> |
| 7 | <script src="https://d3js.org/d3.v7.min.js"></script> |
| 8 | <style> |
| 9 | * { margin: 0; padding: 0; box-sizing: border-box; } |
| 10 | body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; background: #1a1a2e; color: #e0e0e0; overflow: hidden; height: 100vh; } |
| 11 | #toolbar { position: fixed; top: 0; left: 0; right: 0; z-index: 10; background: #16213e; padding: 8px 16px; display: flex; align-items: center; gap: 12px; border-bottom: 1px solid #0f3460; flex-wrap: wrap; } |
| 12 | #toolbar h1 { font-size: 14px; font-weight: 600; color: #e94560; white-space: nowrap; } |
| 13 | #search { background: #1a1a2e; border: 1px solid #0f3460; color: #e0e0e0; padding: 5px 10px; border-radius: 4px; font-size: 13px; width: 200px; } |
| 14 | #search::placeholder { color: #666; } |
| 15 | .filter-btn { background: #1a1a2e; border: 1px solid #0f3460; color: #e0e0e0; padding: 4px 10px; border-radius: 12px; font-size: 12px; cursor: pointer; display: flex; align-items: center; gap: 4px; } |
| 16 | .filter-btn.active { border-color: #e94560; } |
| 17 | .filter-btn .dot { width: 8px; height: 8px; border-radius: 50%; display: inline-block; } |
| 18 | #drop-zone { position: fixed; top: 50%; left: 50%; transform: translate(-50%, -50%); z-index: 20; background: #16213e; border: 2px dashed #0f3460; border-radius: 12px; padding: 60px; text-align: center; } |
| 19 | #drop-zone.hidden { display: none; } |
| 20 | #drop-zone h2 { color: #e94560; margin-bottom: 12px; } |
| 21 | #drop-zone p { color: #888; margin-bottom: 16px; font-size: 14px; } |
| 22 | #file-input { display: none; } |
| 23 | #drop-zone label { background: #e94560; color: white; padding: 8px 20px; border-radius: 6px; cursor: pointer; font-size: 14px; } |
| 24 | #drop-zone label:hover { background: #c73e54; } |
| 25 | #graph-container { width: 100%; height: 100vh; padding-top: 44px; } |
| 26 | svg { width: 100%; height: 100%; } |
| 27 | .node circle { stroke: #333; stroke-width: 1.5px; cursor: pointer; } |
| 28 | .node text { font-size: 10px; fill: #e0e0e0; pointer-events: none; text-anchor: middle; } |
| 29 | .link { stroke: #334; stroke-opacity: 0.6; } |
| 30 | .link-label { font-size: 8px; fill: #666; pointer-events: none; } |
| 31 | .node.highlighted circle { stroke: #e94560; stroke-width: 3px; } |
| 32 | .node.dimmed { opacity: 0.15; } |
| 33 | .link.dimmed { opacity: 0.05; } |
| 34 | #detail-panel { position: fixed; top: 44px; right: 0; width: 300px; height: calc(100vh - 44px); background: #16213e; border-left: 1px solid #0f3460; padding: 16px; overflow-y: auto; transform: translateX(100%); transition: transform 0.2s; z-index: 10; } |
| 35 | #detail-panel.open { transform: translateX(0); } |
| 36 | #detail-panel h3 { color: #e94560; margin-bottom: 4px; font-size: 16px; } |
| 37 | #detail-panel .type-badge { display: inline-block; padding: 2px 8px; border-radius: 8px; font-size: 11px; margin-bottom: 12px; } |
| 38 | #detail-panel .section { margin-bottom: 14px; } |
| 39 | #detail-panel .section h4 { font-size: 12px; color: #888; text-transform: uppercase; margin-bottom: 6px; } |
| 40 | #detail-panel .section p, #detail-panel .section li { font-size: 13px; line-height: 1.5; } |
| 41 | #detail-panel ul { list-style: none; padding: 0; } |
| 42 | #detail-panel ul li { padding: 3px 0; border-bottom: 1px solid #0f3460; } |
| 43 | #detail-panel .close-btn { position: absolute; top: 12px; right: 12px; background: none; border: none; color: #888; cursor: pointer; font-size: 18px; } |
| 44 | #stats { font-size: 11px; color: #666; white-space: nowrap; } |
| 45 | .drag-over #drop-zone { border-color: #e94560; background: #1a1a3e; } |
| 46 | #no-d3-msg { display: none; position: fixed; top: 50%; left: 50%; transform: translate(-50%, -50%); background: #16213e; padding: 40px; border-radius: 12px; text-align: center; border: 1px solid #e94560; z-index: 100; } |
| 47 | #no-d3-msg h2 { color: #e94560; margin-bottom: 8px; } |
| 48 | </style> |
| 49 | </head> |
| 50 | <body> |
| 51 | <div id="no-d3-msg"> |
| 52 | <h2>D3.js Required</h2> |
| 53 | <p>This viewer requires an internet connection on first load to fetch D3.js.</p> |
| 54 | <p style="color:#888; font-size:13px;">Load from: https://d3js.org/d3.v7.min.js</p> |
| 55 | </div> |
| 56 | |
| 57 | <div id="toolbar"> |
| 58 | <h1>PlanOpticon</h1> |
| 59 | <input type="text" id="search" placeholder="Search entities..."> |
| 60 | <button class="filter-btn active" data-type="person"><span class="dot" style="background:#f9d5e5"></span> Person</button> |
| 61 | <button class="filter-btn active" data-type="concept"><span class="dot" style="background:#eeeeee"></span> Concept</button> |
| 62 | <button class="filter-btn active" data-type="technology"><span class="dot" style="background:#d5e5f9"></span> Technology</button> |
| 63 | <button class="filter-btn active" data-type="organization"><span class="dot" style="background:#f9e5d5"></span> Organization</button> |
| 64 | <span id="stats"></span> |
| 65 | </div> |
| 66 | |
| 67 | <div id="drop-zone"> |
| 68 | <h2>Load Knowledge Graph</h2> |
| 69 | <p>Drag and drop a knowledge_graph.json file here, or click to browse.</p> |
| 70 | <label for="file-input">Choose File</label> |
| 71 | <input type="file" id="file-input" accept=".json"> |
| 72 | </div> |
| 73 | |
| 74 | <div id="graph-container"> |
| 75 | <svg id="graph-svg"></svg> |
| 76 | </div> |
| 77 | |
| 78 | <div id="detail-panel"> |
| 79 | <button class="close-btn" id="close-detail">×</button> |
| 80 | <h3 id="detail-name"></h3> |
| 81 | <div class="type-badge" id="detail-type"></div> |
| 82 | <div class="section" id="desc-section"> |
| 83 | <h4>Descriptions</h4> |
| 84 | <ul id="detail-descriptions"></ul> |
| 85 | </div> |
| 86 | <div class="section" id="conn-section"> |
| 87 | <h4>Connections</h4> |
| 88 | <ul id="detail-connections"></ul> |
| 89 | </div> |
| 90 | </div> |
| 91 | |
| 92 | <script> |
| 93 | (function() { |
| 94 | // Check D3 loaded |
| 95 | if (typeof d3 === 'undefined') { |
| 96 | document.getElementById('no-d3-msg').style.display = 'block'; |
| 97 | return; |
| 98 | } |
| 99 | |
| 100 | const TYPE_COLORS = { |
| 101 | person: '#f9d5e5', |
| 102 | concept: '#eeeeee', |
| 103 | technology: '#d5e5f9', |
| 104 | organization: '#f9e5d5', |
| 105 | time: '#e5d5f9', |
| 106 | diagram: '#d5f9e5' |
| 107 | }; |
| 108 | const DEFAULT_COLOR = '#cccccc'; |
| 109 | |
| 110 | let graphData = null; // { nodes: [], relationships: [] } |
| 111 | let simulation = null; |
| 112 | let nodesG, linksG, labelsG; |
| 113 | let activeTypes = new Set(['person', 'concept', 'technology', 'organization', 'time', 'diagram']); |
| 114 | |
| 115 | // --- File loading --- |
| 116 | const dropZone = document.getElementById('drop-zone'); |
| 117 | const fileInput = document.getElementById('file-input'); |
| 118 | |
| 119 | dropZone.addEventListener('dragover', e => { e.preventDefault(); document.body.classList.add('drag-over'); }); |
| 120 | dropZone.addEventListener('dragleave', () => document.body.classList.remove('drag-over')); |
| 121 | dropZone.addEventListener('drop', e => { |
| 122 | e.preventDefault(); |
| 123 | document.body.classList.remove('drag-over'); |
| 124 | const file = e.dataTransfer.files[0]; |
| 125 | if (file) loadFile(file); |
| 126 | }); |
| 127 | fileInput.addEventListener('change', e => { if (e.target.files[0]) loadFile(e.target.files[0]); }); |
| 128 | |
| 129 | function loadFile(file) { |
| 130 | const reader = new FileReader(); |
| 131 | reader.onload = e => { |
| 132 | try { |
| 133 | const data = JSON.parse(e.target.result); |
| 134 | initGraph(data); |
| 135 | } catch (err) { |
| 136 | alert('Invalid JSON file: ' + err.message); |
| 137 | } |
| 138 | }; |
| 139 | reader.readAsText(file); |
| 140 | } |
| 141 | |
| 142 | // --- Pre-embedded data support --- |
| 143 | if (window.__PLANOPTICON_DATA__) { |
| 144 | window.addEventListener('DOMContentLoaded', () => initGraph(window.__PLANOPTICON_DATA__)); |
| 145 | } |
| 146 | // Also check after script runs (if DOM already ready) |
| 147 | if (document.readyState !== 'loading' && window.__PLANOPTICON_DATA__) { |
| 148 | initGraph(window.__PLANOPTICON_DATA__); |
| 149 | } |
| 150 | |
| 151 | // --- Graph init --- |
| 152 | function initGraph(raw) { |
| 153 | dropZone.classList.add('hidden'); |
| 154 | graphData = normalize(raw); |
| 155 | render(); |
| 156 | } |
| 157 | |
| 158 | function normalize(raw) { |
| 159 | // Accept both { nodes: [], relationships: [] } and { entities: [], relationships: [] } |
| 160 | const rawNodes = raw.nodes || raw.entities || []; |
| 161 | const rawRels = raw.relationships || raw.edges || []; |
| 162 | |
| 163 | const nodes = rawNodes.map(n => ({ |
| 164 | id: n.name || n.id, |
| 165 | name: n.name || n.id, |
| 166 | type: (n.type || 'concept').toLowerCase(), |
| 167 | descriptions: n.descriptions || (n.description ? [n.description] : []), |
| 168 | occurrences: n.occurrences || [] |
| 169 | })); |
| 170 | |
| 171 | const nodeSet = new Set(nodes.map(n => n.id)); |
| 172 | |
| 173 | const links = rawRels |
| 174 | .filter(r => nodeSet.has(r.source) && nodeSet.has(r.target)) |
| 175 | .map(r => ({ |
| 176 | source: r.source, |
| 177 | target: r.target, |
| 178 | type: r.type || 'related_to' |
| 179 | })); |
| 180 | |
| 181 | // Compute connection counts |
| 182 | const connCount = {}; |
| 183 | links.forEach(l => { |
| 184 | const s = typeof l.source === 'object' ? l.source.id : l.source; |
| 185 | const t = typeof l.target === 'object' ? l.target.id : l.target; |
| 186 | connCount[s] = (connCount[s] || 0) + 1; |
| 187 | connCount[t] = (connCount[t] || 0) + 1; |
| 188 | }); |
| 189 | nodes.forEach(n => { n.connections = connCount[n.id] || 0; }); |
| 190 | |
| 191 | return { nodes, links }; |
| 192 | } |
| 193 | |
| 194 | function render() { |
| 195 | const svg = d3.select('#graph-svg'); |
| 196 | svg.selectAll('*').remove(); |
| 197 | |
| 198 | const width = window.innerWidth; |
| 199 | const height = window.innerHeight - 44; |
| 200 | |
| 201 | const g = svg.append('g'); |
| 202 | |
| 203 | // Zoom |
| 204 | const zoom = d3.zoom() |
| 205 | .scaleExtent([0.1, 8]) |
| 206 | .on('zoom', e => g.attr('transform', e.transform)); |
| 207 | svg.call(zoom); |
| 208 | |
| 209 | // Filter nodes/links by active types |
| 210 | const visibleNodes = graphData.nodes.filter(n => activeTypes.has(n.type)); |
| 211 | const visibleIds = new Set(visibleNodes.map(n => n.id)); |
| 212 | const visibleLinks = graphData.links.filter(l => { |
| 213 | const s = typeof l.source === 'object' ? l.source.id : l.source; |
| 214 | const t = typeof l.target === 'object' ? l.target.id : l.target; |
| 215 | return visibleIds.has(s) && visibleIds.has(t); |
| 216 | }); |
| 217 | |
| 218 | // Stats |
| 219 | document.getElementById('stats').textContent = visibleNodes.length + ' nodes, ' + visibleLinks.length + ' edges'; |
| 220 | |
| 221 | // Links |
| 222 | linksG = g.append('g').selectAll('line') |
| 223 | .data(visibleLinks) |
| 224 | .join('line') |
| 225 | .attr('class', 'link') |
| 226 | .attr('stroke-width', 1); |
| 227 | |
| 228 | // Link labels |
| 229 | labelsG = g.append('g').selectAll('text') |
| 230 | .data(visibleLinks) |
| 231 | .join('text') |
| 232 | .attr('class', 'link-label') |
| 233 | .text(d => d.type); |
| 234 | |
| 235 | // Nodes |
| 236 | const maxConn = Math.max(1, d3.max(visibleNodes, d => d.connections)); |
| 237 | const radiusScale = d3.scaleSqrt().domain([0, maxConn]).range([5, 24]); |
| 238 | |
| 239 | nodesG = g.append('g').selectAll('g') |
| 240 | .data(visibleNodes) |
| 241 | .join('g') |
| 242 | .attr('class', 'node') |
| 243 | .call(d3.drag() |
| 244 | .on('start', dragStart) |
| 245 | .on('drag', dragged) |
| 246 | .on('end', dragEnd)) |
| 247 | .on('click', (e, d) => showDetail(d)); |
| 248 | |
| 249 | nodesG.append('circle') |
| 250 | .attr('r', d => radiusScale(d.connections)) |
| 251 | .attr('fill', d => TYPE_COLORS[d.type] || DEFAULT_COLOR); |
| 252 | |
| 253 | nodesG.append('text') |
| 254 | .attr('dy', d => radiusScale(d.connections) + 12) |
| 255 | .text(d => d.name.length > 20 ? d.name.slice(0, 18) + '..' : d.name); |
| 256 | |
| 257 | // Simulation |
| 258 | simulation = d3.forceSimulation(visibleNodes) |
| 259 | .force('link', d3.forceLink(visibleLinks).id(d => d.id).distance(80)) |
| 260 | .force('charge', d3.forceManyBody().strength(-200)) |
| 261 | .force('center', d3.forceCenter(width / 2, height / 2)) |
| 262 | .force('collision', d3.forceCollide().radius(d => radiusScale(d.connections) + 4)) |
| 263 | .on('tick', () => { |
| 264 | linksG |
| 265 | .attr('x1', d => d.source.x).attr('y1', d => d.source.y) |
| 266 | .attr('x2', d => d.target.x).attr('y2', d => d.target.y); |
| 267 | labelsG |
| 268 | .attr('x', d => (d.source.x + d.target.x) / 2) |
| 269 | .attr('y', d => (d.source.y + d.target.y) / 2); |
| 270 | nodesG.attr('transform', d => 'translate(' + d.x + ',' + d.y + ')'); |
| 271 | }); |
| 272 | |
| 273 | // Drag handlers |
| 274 | function dragStart(e, d) { |
| 275 | if (!e.active) simulation.alphaTarget(0.3).restart(); |
| 276 | d.fx = d.x; d.fy = d.y; |
| 277 | } |
| 278 | function dragged(e, d) { d.fx = e.x; d.fy = e.y; } |
| 279 | function dragEnd(e, d) { |
| 280 | if (!e.active) simulation.alphaTarget(0); |
| 281 | d.fx = null; d.fy = null; |
| 282 | } |
| 283 | |
| 284 | // Store zoom for centering |
| 285 | svg._zoom = zoom; |
| 286 | svg._g = g; |
| 287 | } |
| 288 | |
| 289 | // --- Search --- |
| 290 | const searchInput = document.getElementById('search'); |
| 291 | searchInput.addEventListener('input', () => { |
| 292 | const q = searchInput.value.toLowerCase().trim(); |
| 293 | if (!graphData || !nodesG) return; |
| 294 | |
| 295 | if (!q) { |
| 296 | nodesG.classed('highlighted', false).classed('dimmed', false); |
| 297 | linksG.classed('dimmed', false); |
| 298 | return; |
| 299 | } |
| 300 | |
| 301 | const matches = new Set(); |
| 302 | graphData.nodes.forEach(n => { |
| 303 | if (n.name.toLowerCase().includes(q)) matches.add(n.id); |
| 304 | }); |
| 305 | |
| 306 | // Also include direct neighbors of matches |
| 307 | const neighbors = new Set(matches); |
| 308 | graphData.links.forEach(l => { |
| 309 | const s = typeof l.source === 'object' ? l.source.id : l.source; |
| 310 | const t = typeof l.target === 'object' ? l.target.id : l.target; |
| 311 | if (matches.has(s)) neighbors.add(t); |
| 312 | if (matches.has(t)) neighbors.add(s); |
| 313 | }); |
| 314 | |
| 315 | nodesG.classed('highlighted', d => matches.has(d.id)); |
| 316 | nodesG.classed('dimmed', d => !neighbors.has(d.id)); |
| 317 | linksG.classed('dimmed', d => { |
| 318 | const s = typeof d.source === 'object' ? d.source.id : d.source; |
| 319 | const t = typeof d.target === 'object' ? d.target.id : d.target; |
| 320 | return !neighbors.has(s) || !neighbors.has(t); |
| 321 | }); |
| 322 | |
| 323 | // Center on first match |
| 324 | if (matches.size > 0) { |
| 325 | const first = graphData.nodes.find(n => matches.has(n.id)); |
| 326 | if (first && first.x != null) { |
| 327 | const svg = d3.select('#graph-svg'); |
| 328 | const width = window.innerWidth; |
| 329 | const height = window.innerHeight - 44; |
| 330 | svg.transition().duration(500).call( |
| 331 | svg._zoom.transform, |
| 332 | d3.zoomIdentity.translate(width / 2 - first.x, height / 2 - first.y) |
| 333 | ); |
| 334 | } |
| 335 | } |
| 336 | }); |
| 337 | |
| 338 | // --- Filter toggles --- |
| 339 | document.querySelectorAll('.filter-btn').forEach(btn => { |
| 340 | btn.addEventListener('click', () => { |
| 341 | const type = btn.dataset.type; |
| 342 | btn.classList.toggle('active'); |
| 343 | if (activeTypes.has(type)) activeTypes.delete(type); |
| 344 | else activeTypes.add(type); |
| 345 | if (graphData) render(); |
| 346 | }); |
| 347 | }); |
| 348 | |
| 349 | // --- Detail panel --- |
| 350 | function showDetail(node) { |
| 351 | const panel = document.getElementById('detail-panel'); |
| 352 | document.getElementById('detail-name').textContent = node.name; |
| 353 | const badge = document.getElementById('detail-type'); |
| 354 | badge.textContent = node.type; |
| 355 | badge.style.background = TYPE_COLORS[node.type] || DEFAULT_COLOR; |
| 356 | badge.style.color = '#1a1a2e'; |
| 357 | |
| 358 | const descList = document.getElementById('detail-descriptions'); |
| 359 | descList.innerHTML = ''; |
| 360 | if (node.descriptions.length === 0) { |
| 361 | descList.innerHTML = '<li style="color:#666">No descriptions</li>'; |
| 362 | } else { |
| 363 | node.descriptions.forEach(d => { |
| 364 | const li = document.createElement('li'); |
| 365 | li.textContent = d; |
| 366 | descList.appendChild(li); |
| 367 | }); |
| 368 | } |
| 369 | |
| 370 | const connList = document.getElementById('detail-connections'); |
| 371 | connList.innerHTML = ''; |
| 372 | const connections = []; |
| 373 | graphData.links.forEach(l => { |
| 374 | const s = typeof l.source === 'object' ? l.source.id : l.source; |
| 375 | const t = typeof l.target === 'object' ? l.target.id : l.target; |
| 376 | if (s === node.id) connections.push({ target: t, type: l.type, dir: '->' }); |
| 377 | if (t === node.id) connections.push({ target: s, type: l.type, dir: '<-' }); |
| 378 | }); |
| 379 | if (connections.length === 0) { |
| 380 | connList.innerHTML = '<li style="color:#666">No connections</li>'; |
| 381 | } else { |
| 382 | connections.forEach(c => { |
| 383 | const li = document.createElement('li'); |
| 384 | li.textContent = c.dir + ' ' + c.target + ' (' + c.type + ')'; |
| 385 | li.style.cursor = 'pointer'; |
| 386 | li.addEventListener('click', () => { |
| 387 | searchInput.value = c.target; |
| 388 | searchInput.dispatchEvent(new Event('input')); |
| 389 | }); |
| 390 | connList.appendChild(li); |
| 391 | }); |
| 392 | } |
| 393 | |
| 394 | panel.classList.add('open'); |
| 395 | } |
| 396 | |
| 397 | document.getElementById('close-detail').addEventListener('click', () => { |
| 398 | document.getElementById('detail-panel').classList.remove('open'); |
| 399 | }); |
| 400 | |
| 401 | // --- Resize --- |
| 402 | window.addEventListener('resize', () => { if (graphData) render(); }); |
| 403 | })(); |
| 404 | </script> |
| 405 | </body> |
| 406 | </html> |
+27
-3
| --- pyproject.toml | ||
| +++ pyproject.toml | ||
| @@ -2,11 +2,11 @@ | ||
| 2 | 2 | requires = ["setuptools>=69.0", "wheel"] |
| 3 | 3 | build-backend = "setuptools.build_meta" |
| 4 | 4 | |
| 5 | 5 | [project] |
| 6 | 6 | name = "planopticon" |
| 7 | -version = "0.3.0" | |
| 7 | +version = "0.4.0" | |
| 8 | 8 | description = "AI-powered video analysis and knowledge extraction tool" |
| 9 | 9 | readme = "README.md" |
| 10 | 10 | license = "MIT" |
| 11 | 11 | requires-python = ">=3.10" |
| 12 | 12 | authors = [ |
| @@ -50,18 +50,41 @@ | ||
| 50 | 50 | "requests>=2.31.0", |
| 51 | 51 | "tenacity>=8.2.0", |
| 52 | 52 | ] |
| 53 | 53 | |
| 54 | 54 | [project.optional-dependencies] |
| 55 | -pdf = ["weasyprint>=60.0"] | |
| 55 | +pdf = ["pymupdf>=1.24.0"] | |
| 56 | 56 | gpu = ["torch>=2.0.0", "torchvision>=0.15.0"] |
| 57 | 57 | gdrive = ["google-auth>=2.0.0", "google-auth-oauthlib>=1.0.0", "google-api-python-client>=2.0.0"] |
| 58 | 58 | dropbox = ["dropbox>=12.0.0"] |
| 59 | -graph = ["falkordblite>=0.4.0", "redis>=4.5"] | |
| 59 | +azure = ["openai>=1.0.0"] | |
| 60 | +together = ["openai>=1.0.0"] | |
| 61 | +fireworks = ["openai>=1.0.0"] | |
| 62 | +cerebras = ["openai>=1.0.0"] | |
| 63 | +xai = ["openai>=1.0.0"] | |
| 64 | +bedrock = ["boto3>=1.28"] | |
| 65 | +vertex = ["google-cloud-aiplatform>=1.38"] | |
| 66 | +mistral = ["mistralai>=1.0"] | |
| 67 | +cohere = ["cohere>=5.0"] | |
| 68 | +ai21 = ["ai21>=3.0"] | |
| 69 | +huggingface = ["huggingface_hub>=0.20"] | |
| 70 | +qianfan = ["qianfan>=0.4"] | |
| 71 | +litellm = ["litellm>=1.0"] | |
| 72 | +youtube = ["yt-dlp>=2023.0"] | |
| 73 | +s3 = ["boto3>=1.28"] | |
| 74 | +web = ["beautifulsoup4>=4.12"] | |
| 75 | +rss = ["feedparser>=6.0"] | |
| 76 | +graph = [] | |
| 60 | 77 | cloud = [ |
| 61 | 78 | "planopticon[gdrive]", |
| 62 | 79 | "planopticon[dropbox]", |
| 80 | + "planopticon[s3]", | |
| 81 | +] | |
| 82 | +sources = [ | |
| 83 | + "planopticon[youtube]", | |
| 84 | + "planopticon[web]", | |
| 85 | + "planopticon[rss]", | |
| 63 | 86 | ] |
| 64 | 87 | dev = [ |
| 65 | 88 | "pytest>=7.3.0", |
| 66 | 89 | "pytest-cov>=4.1.0", |
| 67 | 90 | "black>=23.3.0", |
| @@ -103,10 +126,11 @@ | ||
| 103 | 126 | [tool.ruff.lint] |
| 104 | 127 | select = ["E", "F", "W", "I"] |
| 105 | 128 | |
| 106 | 129 | [tool.ruff.lint.per-file-ignores] |
| 107 | 130 | "video_processor/utils/prompt_templates.py" = ["E501"] |
| 131 | +"video_processor/api/openapi_spec.py" = ["E501"] | |
| 108 | 132 | |
| 109 | 133 | [tool.mypy] |
| 110 | 134 | python_version = "3.10" |
| 111 | 135 | warn_return_any = true |
| 112 | 136 | warn_unused_configs = true |
| 113 | 137 |
| --- pyproject.toml | |
| +++ pyproject.toml | |
| @@ -2,11 +2,11 @@ | |
| 2 | requires = ["setuptools>=69.0", "wheel"] |
| 3 | build-backend = "setuptools.build_meta" |
| 4 | |
| 5 | [project] |
| 6 | name = "planopticon" |
| 7 | version = "0.3.0" |
| 8 | description = "AI-powered video analysis and knowledge extraction tool" |
| 9 | readme = "README.md" |
| 10 | license = "MIT" |
| 11 | requires-python = ">=3.10" |
| 12 | authors = [ |
| @@ -50,18 +50,41 @@ | |
| 50 | "requests>=2.31.0", |
| 51 | "tenacity>=8.2.0", |
| 52 | ] |
| 53 | |
| 54 | [project.optional-dependencies] |
| 55 | pdf = ["weasyprint>=60.0"] |
| 56 | gpu = ["torch>=2.0.0", "torchvision>=0.15.0"] |
| 57 | gdrive = ["google-auth>=2.0.0", "google-auth-oauthlib>=1.0.0", "google-api-python-client>=2.0.0"] |
| 58 | dropbox = ["dropbox>=12.0.0"] |
| 59 | graph = ["falkordblite>=0.4.0", "redis>=4.5"] |
| 60 | cloud = [ |
| 61 | "planopticon[gdrive]", |
| 62 | "planopticon[dropbox]", |
| 63 | ] |
| 64 | dev = [ |
| 65 | "pytest>=7.3.0", |
| 66 | "pytest-cov>=4.1.0", |
| 67 | "black>=23.3.0", |
| @@ -103,10 +126,11 @@ | |
| 103 | [tool.ruff.lint] |
| 104 | select = ["E", "F", "W", "I"] |
| 105 | |
| 106 | [tool.ruff.lint.per-file-ignores] |
| 107 | "video_processor/utils/prompt_templates.py" = ["E501"] |
| 108 | |
| 109 | [tool.mypy] |
| 110 | python_version = "3.10" |
| 111 | warn_return_any = true |
| 112 | warn_unused_configs = true |
| 113 |
| --- pyproject.toml | |
| +++ pyproject.toml | |
| @@ -2,11 +2,11 @@ | |
| 2 | requires = ["setuptools>=69.0", "wheel"] |
| 3 | build-backend = "setuptools.build_meta" |
| 4 | |
| 5 | [project] |
| 6 | name = "planopticon" |
| 7 | version = "0.4.0" |
| 8 | description = "AI-powered video analysis and knowledge extraction tool" |
| 9 | readme = "README.md" |
| 10 | license = "MIT" |
| 11 | requires-python = ">=3.10" |
| 12 | authors = [ |
| @@ -50,18 +50,41 @@ | |
| 50 | "requests>=2.31.0", |
| 51 | "tenacity>=8.2.0", |
| 52 | ] |
| 53 | |
| 54 | [project.optional-dependencies] |
| 55 | pdf = ["pymupdf>=1.24.0"] |
| 56 | gpu = ["torch>=2.0.0", "torchvision>=0.15.0"] |
| 57 | gdrive = ["google-auth>=2.0.0", "google-auth-oauthlib>=1.0.0", "google-api-python-client>=2.0.0"] |
| 58 | dropbox = ["dropbox>=12.0.0"] |
| 59 | azure = ["openai>=1.0.0"] |
| 60 | together = ["openai>=1.0.0"] |
| 61 | fireworks = ["openai>=1.0.0"] |
| 62 | cerebras = ["openai>=1.0.0"] |
| 63 | xai = ["openai>=1.0.0"] |
| 64 | bedrock = ["boto3>=1.28"] |
| 65 | vertex = ["google-cloud-aiplatform>=1.38"] |
| 66 | mistral = ["mistralai>=1.0"] |
| 67 | cohere = ["cohere>=5.0"] |
| 68 | ai21 = ["ai21>=3.0"] |
| 69 | huggingface = ["huggingface_hub>=0.20"] |
| 70 | qianfan = ["qianfan>=0.4"] |
| 71 | litellm = ["litellm>=1.0"] |
| 72 | youtube = ["yt-dlp>=2023.0"] |
| 73 | s3 = ["boto3>=1.28"] |
| 74 | web = ["beautifulsoup4>=4.12"] |
| 75 | rss = ["feedparser>=6.0"] |
| 76 | graph = [] |
| 77 | cloud = [ |
| 78 | "planopticon[gdrive]", |
| 79 | "planopticon[dropbox]", |
| 80 | "planopticon[s3]", |
| 81 | ] |
| 82 | sources = [ |
| 83 | "planopticon[youtube]", |
| 84 | "planopticon[web]", |
| 85 | "planopticon[rss]", |
| 86 | ] |
| 87 | dev = [ |
| 88 | "pytest>=7.3.0", |
| 89 | "pytest-cov>=4.1.0", |
| 90 | "black>=23.3.0", |
| @@ -103,10 +126,11 @@ | |
| 126 | [tool.ruff.lint] |
| 127 | select = ["E", "F", "W", "I"] |
| 128 | |
| 129 | [tool.ruff.lint.per-file-ignores] |
| 130 | "video_processor/utils/prompt_templates.py" = ["E501"] |
| 131 | "video_processor/api/openapi_spec.py" = ["E501"] |
| 132 | |
| 133 | [tool.mypy] |
| 134 | python_version = "3.10" |
| 135 | warn_return_any = true |
| 136 | warn_unused_configs = true |
| 137 |
+413
-11
| --- tests/test_agent.py | ||
| +++ tests/test_agent.py | ||
| @@ -1,15 +1,392 @@ | ||
| 1 | -"""Tests for the agentic processing orchestrator.""" | |
| 1 | +"""Tests for the planning agent, skill registry, KB context, and agent loop.""" | |
| 2 | 2 | |
| 3 | 3 | import json |
| 4 | -from unittest.mock import MagicMock | |
| 4 | +from pathlib import Path | |
| 5 | +from unittest.mock import MagicMock, patch | |
| 6 | + | |
| 7 | +import pytest | |
| 8 | + | |
| 9 | +from video_processor.agent.skills.base import ( | |
| 10 | + AgentContext, | |
| 11 | + Artifact, | |
| 12 | + Skill, | |
| 13 | + _skills, | |
| 14 | + get_skill, | |
| 15 | + list_skills, | |
| 16 | + register_skill, | |
| 17 | +) | |
| 18 | + | |
| 19 | +# --------------------------------------------------------------------------- | |
| 20 | +# Fixtures | |
| 21 | +# --------------------------------------------------------------------------- | |
| 22 | + | |
| 23 | + | |
| 24 | +@pytest.fixture(autouse=True) | |
| 25 | +def _clean_skill_registry(): | |
| 26 | + """Save and restore the global skill registry between tests.""" | |
| 27 | + original = dict(_skills) | |
| 28 | + yield | |
| 29 | + _skills.clear() | |
| 30 | + _skills.update(original) | |
| 31 | + | |
| 32 | + | |
| 33 | +class _DummySkill(Skill): | |
| 34 | + name = "dummy_test_skill" | |
| 35 | + description = "A dummy skill for testing" | |
| 36 | + | |
| 37 | + def execute(self, context: AgentContext, **kwargs) -> Artifact: | |
| 38 | + return Artifact( | |
| 39 | + name="dummy artifact", | |
| 40 | + content="dummy content", | |
| 41 | + artifact_type="document", | |
| 42 | + ) | |
| 43 | + | |
| 44 | + | |
| 45 | +class _NoLLMSkill(Skill): | |
| 46 | + """Skill that doesn't require provider_manager.""" | |
| 47 | + | |
| 48 | + name = "nollm_skill" | |
| 49 | + description = "Works without LLM" | |
| 50 | + | |
| 51 | + def execute(self, context: AgentContext, **kwargs) -> Artifact: | |
| 52 | + return Artifact( | |
| 53 | + name="nollm artifact", | |
| 54 | + content="generated", | |
| 55 | + artifact_type="document", | |
| 56 | + ) | |
| 57 | + | |
| 58 | + def can_execute(self, context: AgentContext) -> bool: | |
| 59 | + return context.knowledge_graph is not None | |
| 60 | + | |
| 61 | + | |
| 62 | +# --------------------------------------------------------------------------- | |
| 63 | +# Skill registry | |
| 64 | +# --------------------------------------------------------------------------- | |
| 65 | + | |
| 66 | + | |
| 67 | +class TestSkillRegistry: | |
| 68 | + def test_register_and_get(self): | |
| 69 | + skill = _DummySkill() | |
| 70 | + register_skill(skill) | |
| 71 | + assert get_skill("dummy_test_skill") is skill | |
| 72 | + | |
| 73 | + def test_get_unknown_returns_none(self): | |
| 74 | + assert get_skill("no_such_skill_xyz") is None | |
| 75 | + | |
| 76 | + def test_list_skills(self): | |
| 77 | + s1 = _DummySkill() | |
| 78 | + register_skill(s1) | |
| 79 | + skills = list_skills() | |
| 80 | + assert any(s.name == "dummy_test_skill" for s in skills) | |
| 81 | + | |
| 82 | + def test_list_skills_empty(self): | |
| 83 | + _skills.clear() | |
| 84 | + assert list_skills() == [] | |
| 85 | + | |
| 86 | + | |
| 87 | +# --------------------------------------------------------------------------- | |
| 88 | +# AgentContext dataclass | |
| 89 | +# --------------------------------------------------------------------------- | |
| 90 | + | |
| 91 | + | |
| 92 | +class TestAgentContext: | |
| 93 | + def test_defaults(self): | |
| 94 | + ctx = AgentContext() | |
| 95 | + assert ctx.knowledge_graph is None | |
| 96 | + assert ctx.query_engine is None | |
| 97 | + assert ctx.provider_manager is None | |
| 98 | + assert ctx.planning_entities == [] | |
| 99 | + assert ctx.user_requirements == {} | |
| 100 | + assert ctx.conversation_history == [] | |
| 101 | + assert ctx.artifacts == [] | |
| 102 | + assert ctx.config == {} | |
| 103 | + | |
| 104 | + def test_with_values(self): | |
| 105 | + mock_kg = MagicMock() | |
| 106 | + mock_qe = MagicMock() | |
| 107 | + mock_pm = MagicMock() | |
| 108 | + ctx = AgentContext( | |
| 109 | + knowledge_graph=mock_kg, | |
| 110 | + query_engine=mock_qe, | |
| 111 | + provider_manager=mock_pm, | |
| 112 | + config={"key": "value"}, | |
| 113 | + ) | |
| 114 | + assert ctx.knowledge_graph is mock_kg | |
| 115 | + assert ctx.config == {"key": "value"} | |
| 116 | + | |
| 117 | + def test_conversation_history_is_mutable(self): | |
| 118 | + ctx = AgentContext() | |
| 119 | + ctx.conversation_history.append({"role": "user", "content": "hello"}) | |
| 120 | + assert len(ctx.conversation_history) == 1 | |
| 121 | + | |
| 122 | + | |
| 123 | +# --------------------------------------------------------------------------- | |
| 124 | +# Artifact dataclass | |
| 125 | +# --------------------------------------------------------------------------- | |
| 126 | + | |
| 127 | + | |
| 128 | +class TestArtifact: | |
| 129 | + def test_basic(self): | |
| 130 | + a = Artifact(name="Plan", content="# Plan\n...", artifact_type="project_plan") | |
| 131 | + assert a.name == "Plan" | |
| 132 | + assert a.format == "markdown" # default | |
| 133 | + assert a.metadata == {} | |
| 134 | + | |
| 135 | + def test_with_metadata(self): | |
| 136 | + a = Artifact( | |
| 137 | + name="Tasks", | |
| 138 | + content="[]", | |
| 139 | + artifact_type="task_list", | |
| 140 | + format="json", | |
| 141 | + metadata={"source": "kg"}, | |
| 142 | + ) | |
| 143 | + assert a.format == "json" | |
| 144 | + assert a.metadata["source"] == "kg" | |
| 145 | + | |
| 146 | + | |
| 147 | +# --------------------------------------------------------------------------- | |
| 148 | +# Skill.can_execute | |
| 149 | +# --------------------------------------------------------------------------- | |
| 150 | + | |
| 151 | + | |
| 152 | +class TestSkillCanExecute: | |
| 153 | + def test_default_requires_kg_and_pm(self): | |
| 154 | + skill = _DummySkill() | |
| 155 | + ctx_no_kg = AgentContext(provider_manager=MagicMock()) | |
| 156 | + assert not skill.can_execute(ctx_no_kg) | |
| 157 | + | |
| 158 | + ctx_no_pm = AgentContext(knowledge_graph=MagicMock()) | |
| 159 | + assert not skill.can_execute(ctx_no_pm) | |
| 160 | + | |
| 161 | + ctx_both = AgentContext(knowledge_graph=MagicMock(), provider_manager=MagicMock()) | |
| 162 | + assert skill.can_execute(ctx_both) | |
| 163 | + | |
| 164 | + | |
| 165 | +# --------------------------------------------------------------------------- | |
| 166 | +# KBContext | |
| 167 | +# --------------------------------------------------------------------------- | |
| 168 | + | |
| 169 | + | |
| 170 | +class TestKBContext: | |
| 171 | + def test_add_source_nonexistent_raises(self, tmp_path): | |
| 172 | + from video_processor.agent.kb_context import KBContext | |
| 173 | + | |
| 174 | + ctx = KBContext() | |
| 175 | + with pytest.raises(FileNotFoundError, match="Not found"): | |
| 176 | + ctx.add_source(tmp_path / "nonexistent.json") | |
| 177 | + | |
| 178 | + def test_add_source_file(self, tmp_path): | |
| 179 | + from video_processor.agent.kb_context import KBContext | |
| 180 | + | |
| 181 | + f = tmp_path / "kg.json" | |
| 182 | + f.write_text("{}") | |
| 183 | + ctx = KBContext() | |
| 184 | + ctx.add_source(f) | |
| 185 | + assert len(ctx.sources) == 1 | |
| 186 | + assert ctx.sources[0] == f.resolve() | |
| 187 | + | |
| 188 | + def test_add_source_directory(self, tmp_path): | |
| 189 | + from video_processor.agent.kb_context import KBContext | |
| 190 | + | |
| 191 | + with patch( | |
| 192 | + "video_processor.integrators.graph_discovery.find_knowledge_graphs", | |
| 193 | + return_value=[tmp_path / "a.db"], | |
| 194 | + ): | |
| 195 | + ctx = KBContext() | |
| 196 | + ctx.add_source(tmp_path) | |
| 197 | + assert len(ctx.sources) == 1 | |
| 198 | + | |
| 199 | + def test_knowledge_graph_before_load_raises(self): | |
| 200 | + from video_processor.agent.kb_context import KBContext | |
| 201 | + | |
| 202 | + ctx = KBContext() | |
| 203 | + with pytest.raises(RuntimeError, match="Call load"): | |
| 204 | + _ = ctx.knowledge_graph | |
| 205 | + | |
| 206 | + def test_query_engine_before_load_raises(self): | |
| 207 | + from video_processor.agent.kb_context import KBContext | |
| 208 | + | |
| 209 | + ctx = KBContext() | |
| 210 | + with pytest.raises(RuntimeError, match="Call load"): | |
| 211 | + _ = ctx.query_engine | |
| 212 | + | |
| 213 | + def test_summary_no_data(self): | |
| 214 | + from video_processor.agent.kb_context import KBContext | |
| 215 | + | |
| 216 | + ctx = KBContext() | |
| 217 | + assert ctx.summary() == "No knowledge base loaded." | |
| 218 | + | |
| 219 | + def test_load_json_and_summary(self, tmp_path): | |
| 220 | + from video_processor.agent.kb_context import KBContext | |
| 221 | + | |
| 222 | + kg_data = {"nodes": [], "relationships": []} | |
| 223 | + f = tmp_path / "kg.json" | |
| 224 | + f.write_text(json.dumps(kg_data)) | |
| 225 | + | |
| 226 | + ctx = KBContext() | |
| 227 | + ctx.add_source(f) | |
| 228 | + ctx.load() | |
| 229 | + | |
| 230 | + summary = ctx.summary() | |
| 231 | + assert "Knowledge base" in summary | |
| 232 | + assert "Entities" in summary | |
| 233 | + assert "Relationships" in summary | |
| 234 | + | |
| 235 | + | |
| 236 | +# --------------------------------------------------------------------------- | |
| 237 | +# PlanningAgent | |
| 238 | +# --------------------------------------------------------------------------- | |
| 239 | + | |
| 240 | + | |
| 241 | +class TestPlanningAgent: | |
| 242 | + def test_from_kb_paths(self, tmp_path): | |
| 243 | + from video_processor.agent.agent_loop import PlanningAgent | |
| 244 | + | |
| 245 | + kg_data = {"nodes": [], "relationships": []} | |
| 246 | + f = tmp_path / "kg.json" | |
| 247 | + f.write_text(json.dumps(kg_data)) | |
| 248 | + | |
| 249 | + agent = PlanningAgent.from_kb_paths([f], provider_manager=None) | |
| 250 | + assert agent.context.knowledge_graph is not None | |
| 251 | + assert agent.context.provider_manager is None | |
| 252 | + | |
| 253 | + def test_execute_with_mock_provider(self, tmp_path): | |
| 254 | + from video_processor.agent.agent_loop import PlanningAgent | |
| 255 | + | |
| 256 | + # Register a dummy skill | |
| 257 | + skill = _DummySkill() | |
| 258 | + register_skill(skill) | |
| 259 | + | |
| 260 | + mock_pm = MagicMock() | |
| 261 | + mock_pm.chat.return_value = json.dumps([{"skill": "dummy_test_skill", "params": {}}]) | |
| 262 | + | |
| 263 | + ctx = AgentContext( | |
| 264 | + knowledge_graph=MagicMock(), | |
| 265 | + query_engine=MagicMock(), | |
| 266 | + provider_manager=mock_pm, | |
| 267 | + ) | |
| 268 | + # Mock stats().to_text() | |
| 269 | + ctx.query_engine.stats.return_value.to_text.return_value = "3 entities" | |
| 270 | + | |
| 271 | + agent = PlanningAgent(context=ctx) | |
| 272 | + artifacts = agent.execute("generate a plan") | |
| 273 | + | |
| 274 | + assert len(artifacts) == 1 | |
| 275 | + assert artifacts[0].name == "dummy artifact" | |
| 276 | + mock_pm.chat.assert_called_once() | |
| 277 | + | |
| 278 | + def test_execute_no_provider_keyword_match(self): | |
| 279 | + from video_processor.agent.agent_loop import PlanningAgent | |
| 280 | + | |
| 281 | + skill = _DummySkill() | |
| 282 | + register_skill(skill) | |
| 283 | + | |
| 284 | + ctx = AgentContext( | |
| 285 | + knowledge_graph=MagicMock(), | |
| 286 | + provider_manager=None, | |
| 287 | + ) | |
| 288 | + | |
| 289 | + agent = PlanningAgent(context=ctx) | |
| 290 | + # "dummy" is a keyword in the skill name, but can_execute needs provider_manager | |
| 291 | + # so it should return empty | |
| 292 | + artifacts = agent.execute("dummy request") | |
| 293 | + assert artifacts == [] | |
| 294 | + | |
| 295 | + def test_execute_keyword_match_nollm_skill(self): | |
| 296 | + from video_processor.agent.agent_loop import PlanningAgent | |
| 297 | + | |
| 298 | + skill = _NoLLMSkill() | |
| 299 | + register_skill(skill) | |
| 300 | + | |
| 301 | + ctx = AgentContext( | |
| 302 | + knowledge_graph=MagicMock(), | |
| 303 | + provider_manager=None, | |
| 304 | + ) | |
| 305 | + | |
| 306 | + agent = PlanningAgent(context=ctx) | |
| 307 | + # "nollm" is in the skill name | |
| 308 | + artifacts = agent.execute("nollm stuff") | |
| 309 | + assert len(artifacts) == 1 | |
| 310 | + assert artifacts[0].name == "nollm artifact" | |
| 311 | + | |
| 312 | + def test_execute_skips_unknown_skills(self): | |
| 313 | + from video_processor.agent.agent_loop import PlanningAgent | |
| 314 | + | |
| 315 | + mock_pm = MagicMock() | |
| 316 | + mock_pm.chat.return_value = json.dumps([{"skill": "nonexistent_skill_xyz", "params": {}}]) | |
| 317 | + | |
| 318 | + ctx = AgentContext( | |
| 319 | + knowledge_graph=MagicMock(), | |
| 320 | + query_engine=MagicMock(), | |
| 321 | + provider_manager=mock_pm, | |
| 322 | + ) | |
| 323 | + ctx.query_engine.stats.return_value.to_text.return_value = "" | |
| 324 | + | |
| 325 | + agent = PlanningAgent(context=ctx) | |
| 326 | + artifacts = agent.execute("do something") | |
| 327 | + assert artifacts == [] | |
| 328 | + | |
| 329 | + def test_chat_no_provider(self): | |
| 330 | + from video_processor.agent.agent_loop import PlanningAgent | |
| 331 | + | |
| 332 | + ctx = AgentContext(provider_manager=None) | |
| 333 | + agent = PlanningAgent(context=ctx) | |
| 334 | + | |
| 335 | + reply = agent.chat("hello") | |
| 336 | + assert "requires" in reply.lower() or "provider" in reply.lower() | |
| 337 | + | |
| 338 | + def test_chat_with_provider(self): | |
| 339 | + from video_processor.agent.agent_loop import PlanningAgent | |
| 340 | + | |
| 341 | + mock_pm = MagicMock() | |
| 342 | + mock_pm.chat.return_value = "I can help you plan." | |
| 343 | + | |
| 344 | + ctx = AgentContext( | |
| 345 | + knowledge_graph=MagicMock(), | |
| 346 | + query_engine=MagicMock(), | |
| 347 | + provider_manager=mock_pm, | |
| 348 | + ) | |
| 349 | + ctx.query_engine.stats.return_value.to_text.return_value = "5 entities" | |
| 350 | + | |
| 351 | + agent = PlanningAgent(context=ctx) | |
| 352 | + reply = agent.chat("help me plan") | |
| 353 | + | |
| 354 | + assert reply == "I can help you plan." | |
| 355 | + assert len(ctx.conversation_history) == 2 # user + assistant | |
| 356 | + assert ctx.conversation_history[0]["role"] == "user" | |
| 357 | + assert ctx.conversation_history[1]["role"] == "assistant" | |
| 358 | + | |
| 359 | + def test_chat_accumulates_history(self): | |
| 360 | + from video_processor.agent.agent_loop import PlanningAgent | |
| 361 | + | |
| 362 | + mock_pm = MagicMock() | |
| 363 | + mock_pm.chat.side_effect = ["reply1", "reply2"] | |
| 364 | + | |
| 365 | + ctx = AgentContext(provider_manager=mock_pm) | |
| 366 | + agent = PlanningAgent(context=ctx) | |
| 367 | + | |
| 368 | + agent.chat("msg1") | |
| 369 | + agent.chat("msg2") | |
| 370 | + | |
| 371 | + assert len(ctx.conversation_history) == 4 # 2 user + 2 assistant | |
| 372 | + # The system message is constructed each time but not stored in history | |
| 373 | + # Provider should receive progressively longer message lists | |
| 374 | + second_call_messages = mock_pm.chat.call_args_list[1][0][0] | |
| 375 | + # Should include system + 3 prior messages (user, assistant, user) | |
| 376 | + assert len(second_call_messages) == 4 # system + user + assistant + user | |
| 377 | + | |
| 5 | 378 | |
| 6 | -from video_processor.agent.orchestrator import AgentOrchestrator | |
| 379 | +# --------------------------------------------------------------------------- | |
| 380 | +# Orchestrator tests (from existing test_agent.py — kept for coverage) | |
| 381 | +# --------------------------------------------------------------------------- | |
| 7 | 382 | |
| 8 | 383 | |
| 9 | 384 | class TestPlanCreation: |
| 10 | 385 | def test_basic_plan(self): |
| 386 | + from video_processor.agent.orchestrator import AgentOrchestrator | |
| 387 | + | |
| 11 | 388 | agent = AgentOrchestrator() |
| 12 | 389 | plan = agent._create_plan("test.mp4", "basic") |
| 13 | 390 | steps = [s["step"] for s in plan] |
| 14 | 391 | assert "extract_frames" in steps |
| 15 | 392 | assert "extract_audio" in steps |
| @@ -18,18 +395,22 @@ | ||
| 18 | 395 | assert "extract_action_items" in steps |
| 19 | 396 | assert "generate_reports" in steps |
| 20 | 397 | assert "detect_diagrams" not in steps |
| 21 | 398 | |
| 22 | 399 | def test_standard_plan(self): |
| 400 | + from video_processor.agent.orchestrator import AgentOrchestrator | |
| 401 | + | |
| 23 | 402 | agent = AgentOrchestrator() |
| 24 | 403 | plan = agent._create_plan("test.mp4", "standard") |
| 25 | 404 | steps = [s["step"] for s in plan] |
| 26 | 405 | assert "detect_diagrams" in steps |
| 27 | 406 | assert "build_knowledge_graph" in steps |
| 28 | 407 | assert "deep_analysis" not in steps |
| 29 | 408 | |
| 30 | 409 | def test_comprehensive_plan(self): |
| 410 | + from video_processor.agent.orchestrator import AgentOrchestrator | |
| 411 | + | |
| 31 | 412 | agent = AgentOrchestrator() |
| 32 | 413 | plan = agent._create_plan("test.mp4", "comprehensive") |
| 33 | 414 | steps = [s["step"] for s in plan] |
| 34 | 415 | assert "detect_diagrams" in steps |
| 35 | 416 | assert "deep_analysis" in steps |
| @@ -36,42 +417,52 @@ | ||
| 36 | 417 | assert "cross_reference" in steps |
| 37 | 418 | |
| 38 | 419 | |
| 39 | 420 | class TestAdaptPlan: |
| 40 | 421 | def test_adapts_for_long_transcript(self): |
| 422 | + from video_processor.agent.orchestrator import AgentOrchestrator | |
| 423 | + | |
| 41 | 424 | agent = AgentOrchestrator() |
| 42 | 425 | agent._plan = [{"step": "generate_reports", "priority": "required"}] |
| 43 | - long_text = "word " * 3000 # > 10000 chars | |
| 426 | + long_text = "word " * 3000 | |
| 44 | 427 | agent._adapt_plan("transcribe", {"text": long_text}) |
| 45 | 428 | steps = [s["step"] for s in agent._plan] |
| 46 | 429 | assert "deep_analysis" in steps |
| 47 | 430 | |
| 48 | 431 | def test_no_adapt_for_short_transcript(self): |
| 432 | + from video_processor.agent.orchestrator import AgentOrchestrator | |
| 433 | + | |
| 49 | 434 | agent = AgentOrchestrator() |
| 50 | 435 | agent._plan = [{"step": "generate_reports", "priority": "required"}] |
| 51 | 436 | agent._adapt_plan("transcribe", {"text": "Short text"}) |
| 52 | 437 | steps = [s["step"] for s in agent._plan] |
| 53 | 438 | assert "deep_analysis" not in steps |
| 54 | 439 | |
| 55 | 440 | def test_adapts_for_many_diagrams(self): |
| 441 | + from video_processor.agent.orchestrator import AgentOrchestrator | |
| 442 | + | |
| 56 | 443 | agent = AgentOrchestrator() |
| 57 | 444 | agent._plan = [{"step": "generate_reports", "priority": "required"}] |
| 58 | 445 | diagrams = [MagicMock() for _ in range(5)] |
| 59 | 446 | agent._adapt_plan("detect_diagrams", {"diagrams": diagrams, "captures": []}) |
| 60 | 447 | steps = [s["step"] for s in agent._plan] |
| 61 | 448 | assert "cross_reference" in steps |
| 62 | 449 | |
| 63 | 450 | def test_insight_for_many_captures(self): |
| 451 | + from video_processor.agent.orchestrator import AgentOrchestrator | |
| 452 | + | |
| 64 | 453 | agent = AgentOrchestrator() |
| 65 | 454 | agent._plan = [] |
| 66 | 455 | captures = [MagicMock() for _ in range(5)] |
| 67 | 456 | diagrams = [MagicMock() for _ in range(2)] |
| 68 | 457 | agent._adapt_plan("detect_diagrams", {"diagrams": diagrams, "captures": captures}) |
| 69 | 458 | assert len(agent._insights) == 1 |
| 70 | 459 | assert "uncertain frames" in agent._insights[0] |
| 71 | 460 | |
| 72 | 461 | def test_no_duplicate_steps(self): |
| 462 | + from video_processor.agent.orchestrator import AgentOrchestrator | |
| 463 | + | |
| 73 | 464 | agent = AgentOrchestrator() |
| 74 | 465 | agent._plan = [{"step": "deep_analysis", "priority": "comprehensive"}] |
| 75 | 466 | long_text = "word " * 3000 |
| 76 | 467 | agent._adapt_plan("transcribe", {"text": long_text}) |
| 77 | 468 | deep_steps = [s for s in agent._plan if s["step"] == "deep_analysis"] |
| @@ -78,28 +469,35 @@ | ||
| 78 | 469 | assert len(deep_steps) == 1 |
| 79 | 470 | |
| 80 | 471 | |
| 81 | 472 | class TestFallbacks: |
| 82 | 473 | def test_diagram_fallback(self): |
| 474 | + from video_processor.agent.orchestrator import AgentOrchestrator | |
| 475 | + | |
| 83 | 476 | agent = AgentOrchestrator() |
| 84 | 477 | assert agent._get_fallback("detect_diagrams") == "screengrab_fallback" |
| 85 | 478 | |
| 86 | 479 | def test_no_fallback_for_unknown(self): |
| 480 | + from video_processor.agent.orchestrator import AgentOrchestrator | |
| 481 | + | |
| 87 | 482 | agent = AgentOrchestrator() |
| 88 | 483 | assert agent._get_fallback("transcribe") is None |
| 89 | 484 | |
| 90 | 485 | |
| 91 | 486 | class TestInsights: |
| 92 | 487 | def test_insights_property(self): |
| 488 | + from video_processor.agent.orchestrator import AgentOrchestrator | |
| 489 | + | |
| 93 | 490 | agent = AgentOrchestrator() |
| 94 | 491 | agent._insights = ["Insight 1", "Insight 2"] |
| 95 | 492 | assert agent.insights == ["Insight 1", "Insight 2"] |
| 96 | - # Should return a copy | |
| 97 | 493 | agent.insights.append("should not modify internal") |
| 98 | 494 | assert len(agent._insights) == 2 |
| 99 | 495 | |
| 100 | 496 | def test_deep_analysis_populates_insights(self): |
| 497 | + from video_processor.agent.orchestrator import AgentOrchestrator | |
| 498 | + | |
| 101 | 499 | pm = MagicMock() |
| 102 | 500 | pm.chat.return_value = json.dumps( |
| 103 | 501 | { |
| 104 | 502 | "decisions": ["Decided to use microservices"], |
| 105 | 503 | "risks": ["Timeline is tight"], |
| @@ -113,57 +511,61 @@ | ||
| 113 | 511 | assert "decisions" in result |
| 114 | 512 | assert any("microservices" in i for i in agent._insights) |
| 115 | 513 | assert any("Timeline" in i for i in agent._insights) |
| 116 | 514 | |
| 117 | 515 | def test_deep_analysis_handles_error(self): |
| 516 | + from video_processor.agent.orchestrator import AgentOrchestrator | |
| 517 | + | |
| 118 | 518 | pm = MagicMock() |
| 119 | 519 | pm.chat.side_effect = Exception("API error") |
| 120 | 520 | agent = AgentOrchestrator(provider_manager=pm) |
| 121 | 521 | agent._results["transcribe"] = {"text": "some text"} |
| 122 | 522 | result = agent._deep_analysis("/tmp") |
| 123 | 523 | assert result == {} |
| 124 | 524 | |
| 125 | 525 | def test_deep_analysis_no_transcript(self): |
| 526 | + from video_processor.agent.orchestrator import AgentOrchestrator | |
| 527 | + | |
| 126 | 528 | agent = AgentOrchestrator() |
| 127 | 529 | agent._results["transcribe"] = {"text": ""} |
| 128 | 530 | result = agent._deep_analysis("/tmp") |
| 129 | 531 | assert result == {} |
| 130 | 532 | |
| 131 | 533 | |
| 132 | 534 | class TestBuildManifest: |
| 133 | 535 | def test_builds_from_results(self): |
| 536 | + from video_processor.agent.orchestrator import AgentOrchestrator | |
| 537 | + | |
| 134 | 538 | agent = AgentOrchestrator() |
| 135 | 539 | agent._results = { |
| 136 | 540 | "extract_frames": {"frames": [1, 2, 3], "paths": ["/a.jpg", "/b.jpg"]}, |
| 137 | 541 | "extract_audio": {"audio_path": "/audio.wav", "properties": {"duration": 60.0}}, |
| 138 | 542 | "detect_diagrams": {"diagrams": [], "captures": []}, |
| 139 | 543 | "extract_key_points": {"key_points": []}, |
| 140 | 544 | "extract_action_items": {"action_items": []}, |
| 141 | 545 | } |
| 142 | - from pathlib import Path | |
| 143 | - | |
| 144 | 546 | manifest = agent._build_manifest(Path("test.mp4"), Path("/out"), "Test", 5.0) |
| 145 | 547 | assert manifest.video.title == "Test" |
| 146 | 548 | assert manifest.stats.frames_extracted == 3 |
| 147 | 549 | assert manifest.stats.duration_seconds == 5.0 |
| 148 | 550 | assert manifest.video.duration_seconds == 60.0 |
| 149 | 551 | |
| 150 | 552 | def test_handles_missing_results(self): |
| 553 | + from video_processor.agent.orchestrator import AgentOrchestrator | |
| 554 | + | |
| 151 | 555 | agent = AgentOrchestrator() |
| 152 | 556 | agent._results = {} |
| 153 | - from pathlib import Path | |
| 154 | - | |
| 155 | 557 | manifest = agent._build_manifest(Path("test.mp4"), Path("/out"), None, 1.0) |
| 156 | 558 | assert manifest.video.title == "Analysis of test" |
| 157 | 559 | assert manifest.stats.frames_extracted == 0 |
| 158 | 560 | |
| 159 | 561 | def test_handles_error_results(self): |
| 562 | + from video_processor.agent.orchestrator import AgentOrchestrator | |
| 563 | + | |
| 160 | 564 | agent = AgentOrchestrator() |
| 161 | 565 | agent._results = { |
| 162 | 566 | "extract_frames": {"error": "failed"}, |
| 163 | 567 | "detect_diagrams": {"error": "also failed"}, |
| 164 | 568 | } |
| 165 | - from pathlib import Path | |
| 166 | - | |
| 167 | 569 | manifest = agent._build_manifest(Path("vid.mp4"), Path("/out"), None, 2.0) |
| 168 | 570 | assert manifest.stats.frames_extracted == 0 |
| 169 | 571 | assert len(manifest.diagrams) == 0 |
| 170 | 572 | |
| 171 | 573 | ADDED tests/test_agent_skills.py |
| 172 | 574 | ADDED tests/test_api_spec.py |
| 173 | 575 | ADDED tests/test_auth.py |
| 174 | 576 | ADDED tests/test_callbacks.py |
| 175 | 577 | ADDED tests/test_cli.py |
| 176 | 578 | ADDED tests/test_companion.py |
| 177 | 579 | ADDED tests/test_exchange.py |
| --- tests/test_agent.py | |
| +++ tests/test_agent.py | |
| @@ -1,15 +1,392 @@ | |
| 1 | """Tests for the agentic processing orchestrator.""" |
| 2 | |
| 3 | import json |
| 4 | from unittest.mock import MagicMock |
| 5 | |
| 6 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 7 | |
| 8 | |
| 9 | class TestPlanCreation: |
| 10 | def test_basic_plan(self): |
| 11 | agent = AgentOrchestrator() |
| 12 | plan = agent._create_plan("test.mp4", "basic") |
| 13 | steps = [s["step"] for s in plan] |
| 14 | assert "extract_frames" in steps |
| 15 | assert "extract_audio" in steps |
| @@ -18,18 +395,22 @@ | |
| 18 | assert "extract_action_items" in steps |
| 19 | assert "generate_reports" in steps |
| 20 | assert "detect_diagrams" not in steps |
| 21 | |
| 22 | def test_standard_plan(self): |
| 23 | agent = AgentOrchestrator() |
| 24 | plan = agent._create_plan("test.mp4", "standard") |
| 25 | steps = [s["step"] for s in plan] |
| 26 | assert "detect_diagrams" in steps |
| 27 | assert "build_knowledge_graph" in steps |
| 28 | assert "deep_analysis" not in steps |
| 29 | |
| 30 | def test_comprehensive_plan(self): |
| 31 | agent = AgentOrchestrator() |
| 32 | plan = agent._create_plan("test.mp4", "comprehensive") |
| 33 | steps = [s["step"] for s in plan] |
| 34 | assert "detect_diagrams" in steps |
| 35 | assert "deep_analysis" in steps |
| @@ -36,42 +417,52 @@ | |
| 36 | assert "cross_reference" in steps |
| 37 | |
| 38 | |
| 39 | class TestAdaptPlan: |
| 40 | def test_adapts_for_long_transcript(self): |
| 41 | agent = AgentOrchestrator() |
| 42 | agent._plan = [{"step": "generate_reports", "priority": "required"}] |
| 43 | long_text = "word " * 3000 # > 10000 chars |
| 44 | agent._adapt_plan("transcribe", {"text": long_text}) |
| 45 | steps = [s["step"] for s in agent._plan] |
| 46 | assert "deep_analysis" in steps |
| 47 | |
| 48 | def test_no_adapt_for_short_transcript(self): |
| 49 | agent = AgentOrchestrator() |
| 50 | agent._plan = [{"step": "generate_reports", "priority": "required"}] |
| 51 | agent._adapt_plan("transcribe", {"text": "Short text"}) |
| 52 | steps = [s["step"] for s in agent._plan] |
| 53 | assert "deep_analysis" not in steps |
| 54 | |
| 55 | def test_adapts_for_many_diagrams(self): |
| 56 | agent = AgentOrchestrator() |
| 57 | agent._plan = [{"step": "generate_reports", "priority": "required"}] |
| 58 | diagrams = [MagicMock() for _ in range(5)] |
| 59 | agent._adapt_plan("detect_diagrams", {"diagrams": diagrams, "captures": []}) |
| 60 | steps = [s["step"] for s in agent._plan] |
| 61 | assert "cross_reference" in steps |
| 62 | |
| 63 | def test_insight_for_many_captures(self): |
| 64 | agent = AgentOrchestrator() |
| 65 | agent._plan = [] |
| 66 | captures = [MagicMock() for _ in range(5)] |
| 67 | diagrams = [MagicMock() for _ in range(2)] |
| 68 | agent._adapt_plan("detect_diagrams", {"diagrams": diagrams, "captures": captures}) |
| 69 | assert len(agent._insights) == 1 |
| 70 | assert "uncertain frames" in agent._insights[0] |
| 71 | |
| 72 | def test_no_duplicate_steps(self): |
| 73 | agent = AgentOrchestrator() |
| 74 | agent._plan = [{"step": "deep_analysis", "priority": "comprehensive"}] |
| 75 | long_text = "word " * 3000 |
| 76 | agent._adapt_plan("transcribe", {"text": long_text}) |
| 77 | deep_steps = [s for s in agent._plan if s["step"] == "deep_analysis"] |
| @@ -78,28 +469,35 @@ | |
| 78 | assert len(deep_steps) == 1 |
| 79 | |
| 80 | |
| 81 | class TestFallbacks: |
| 82 | def test_diagram_fallback(self): |
| 83 | agent = AgentOrchestrator() |
| 84 | assert agent._get_fallback("detect_diagrams") == "screengrab_fallback" |
| 85 | |
| 86 | def test_no_fallback_for_unknown(self): |
| 87 | agent = AgentOrchestrator() |
| 88 | assert agent._get_fallback("transcribe") is None |
| 89 | |
| 90 | |
| 91 | class TestInsights: |
| 92 | def test_insights_property(self): |
| 93 | agent = AgentOrchestrator() |
| 94 | agent._insights = ["Insight 1", "Insight 2"] |
| 95 | assert agent.insights == ["Insight 1", "Insight 2"] |
| 96 | # Should return a copy |
| 97 | agent.insights.append("should not modify internal") |
| 98 | assert len(agent._insights) == 2 |
| 99 | |
| 100 | def test_deep_analysis_populates_insights(self): |
| 101 | pm = MagicMock() |
| 102 | pm.chat.return_value = json.dumps( |
| 103 | { |
| 104 | "decisions": ["Decided to use microservices"], |
| 105 | "risks": ["Timeline is tight"], |
| @@ -113,57 +511,61 @@ | |
| 113 | assert "decisions" in result |
| 114 | assert any("microservices" in i for i in agent._insights) |
| 115 | assert any("Timeline" in i for i in agent._insights) |
| 116 | |
| 117 | def test_deep_analysis_handles_error(self): |
| 118 | pm = MagicMock() |
| 119 | pm.chat.side_effect = Exception("API error") |
| 120 | agent = AgentOrchestrator(provider_manager=pm) |
| 121 | agent._results["transcribe"] = {"text": "some text"} |
| 122 | result = agent._deep_analysis("/tmp") |
| 123 | assert result == {} |
| 124 | |
| 125 | def test_deep_analysis_no_transcript(self): |
| 126 | agent = AgentOrchestrator() |
| 127 | agent._results["transcribe"] = {"text": ""} |
| 128 | result = agent._deep_analysis("/tmp") |
| 129 | assert result == {} |
| 130 | |
| 131 | |
| 132 | class TestBuildManifest: |
| 133 | def test_builds_from_results(self): |
| 134 | agent = AgentOrchestrator() |
| 135 | agent._results = { |
| 136 | "extract_frames": {"frames": [1, 2, 3], "paths": ["/a.jpg", "/b.jpg"]}, |
| 137 | "extract_audio": {"audio_path": "/audio.wav", "properties": {"duration": 60.0}}, |
| 138 | "detect_diagrams": {"diagrams": [], "captures": []}, |
| 139 | "extract_key_points": {"key_points": []}, |
| 140 | "extract_action_items": {"action_items": []}, |
| 141 | } |
| 142 | from pathlib import Path |
| 143 | |
| 144 | manifest = agent._build_manifest(Path("test.mp4"), Path("/out"), "Test", 5.0) |
| 145 | assert manifest.video.title == "Test" |
| 146 | assert manifest.stats.frames_extracted == 3 |
| 147 | assert manifest.stats.duration_seconds == 5.0 |
| 148 | assert manifest.video.duration_seconds == 60.0 |
| 149 | |
| 150 | def test_handles_missing_results(self): |
| 151 | agent = AgentOrchestrator() |
| 152 | agent._results = {} |
| 153 | from pathlib import Path |
| 154 | |
| 155 | manifest = agent._build_manifest(Path("test.mp4"), Path("/out"), None, 1.0) |
| 156 | assert manifest.video.title == "Analysis of test" |
| 157 | assert manifest.stats.frames_extracted == 0 |
| 158 | |
| 159 | def test_handles_error_results(self): |
| 160 | agent = AgentOrchestrator() |
| 161 | agent._results = { |
| 162 | "extract_frames": {"error": "failed"}, |
| 163 | "detect_diagrams": {"error": "also failed"}, |
| 164 | } |
| 165 | from pathlib import Path |
| 166 | |
| 167 | manifest = agent._build_manifest(Path("vid.mp4"), Path("/out"), None, 2.0) |
| 168 | assert manifest.stats.frames_extracted == 0 |
| 169 | assert len(manifest.diagrams) == 0 |
| 170 | |
| 171 | DDED tests/test_agent_skills.py |
| 172 | DDED tests/test_api_spec.py |
| 173 | DDED tests/test_auth.py |
| 174 | DDED tests/test_callbacks.py |
| 175 | DDED tests/test_cli.py |
| 176 | DDED tests/test_companion.py |
| 177 | DDED tests/test_exchange.py |
| --- tests/test_agent.py | |
| +++ tests/test_agent.py | |
| @@ -1,15 +1,392 @@ | |
| 1 | """Tests for the planning agent, skill registry, KB context, and agent loop.""" |
| 2 | |
| 3 | import json |
| 4 | from pathlib import Path |
| 5 | from unittest.mock import MagicMock, patch |
| 6 | |
| 7 | import pytest |
| 8 | |
| 9 | from video_processor.agent.skills.base import ( |
| 10 | AgentContext, |
| 11 | Artifact, |
| 12 | Skill, |
| 13 | _skills, |
| 14 | get_skill, |
| 15 | list_skills, |
| 16 | register_skill, |
| 17 | ) |
| 18 | |
| 19 | # --------------------------------------------------------------------------- |
| 20 | # Fixtures |
| 21 | # --------------------------------------------------------------------------- |
| 22 | |
| 23 | |
| 24 | @pytest.fixture(autouse=True) |
| 25 | def _clean_skill_registry(): |
| 26 | """Save and restore the global skill registry between tests.""" |
| 27 | original = dict(_skills) |
| 28 | yield |
| 29 | _skills.clear() |
| 30 | _skills.update(original) |
| 31 | |
| 32 | |
| 33 | class _DummySkill(Skill): |
| 34 | name = "dummy_test_skill" |
| 35 | description = "A dummy skill for testing" |
| 36 | |
| 37 | def execute(self, context: AgentContext, **kwargs) -> Artifact: |
| 38 | return Artifact( |
| 39 | name="dummy artifact", |
| 40 | content="dummy content", |
| 41 | artifact_type="document", |
| 42 | ) |
| 43 | |
| 44 | |
| 45 | class _NoLLMSkill(Skill): |
| 46 | """Skill that doesn't require provider_manager.""" |
| 47 | |
| 48 | name = "nollm_skill" |
| 49 | description = "Works without LLM" |
| 50 | |
| 51 | def execute(self, context: AgentContext, **kwargs) -> Artifact: |
| 52 | return Artifact( |
| 53 | name="nollm artifact", |
| 54 | content="generated", |
| 55 | artifact_type="document", |
| 56 | ) |
| 57 | |
| 58 | def can_execute(self, context: AgentContext) -> bool: |
| 59 | return context.knowledge_graph is not None |
| 60 | |
| 61 | |
| 62 | # --------------------------------------------------------------------------- |
| 63 | # Skill registry |
| 64 | # --------------------------------------------------------------------------- |
| 65 | |
| 66 | |
| 67 | class TestSkillRegistry: |
| 68 | def test_register_and_get(self): |
| 69 | skill = _DummySkill() |
| 70 | register_skill(skill) |
| 71 | assert get_skill("dummy_test_skill") is skill |
| 72 | |
| 73 | def test_get_unknown_returns_none(self): |
| 74 | assert get_skill("no_such_skill_xyz") is None |
| 75 | |
| 76 | def test_list_skills(self): |
| 77 | s1 = _DummySkill() |
| 78 | register_skill(s1) |
| 79 | skills = list_skills() |
| 80 | assert any(s.name == "dummy_test_skill" for s in skills) |
| 81 | |
| 82 | def test_list_skills_empty(self): |
| 83 | _skills.clear() |
| 84 | assert list_skills() == [] |
| 85 | |
| 86 | |
| 87 | # --------------------------------------------------------------------------- |
| 88 | # AgentContext dataclass |
| 89 | # --------------------------------------------------------------------------- |
| 90 | |
| 91 | |
| 92 | class TestAgentContext: |
| 93 | def test_defaults(self): |
| 94 | ctx = AgentContext() |
| 95 | assert ctx.knowledge_graph is None |
| 96 | assert ctx.query_engine is None |
| 97 | assert ctx.provider_manager is None |
| 98 | assert ctx.planning_entities == [] |
| 99 | assert ctx.user_requirements == {} |
| 100 | assert ctx.conversation_history == [] |
| 101 | assert ctx.artifacts == [] |
| 102 | assert ctx.config == {} |
| 103 | |
| 104 | def test_with_values(self): |
| 105 | mock_kg = MagicMock() |
| 106 | mock_qe = MagicMock() |
| 107 | mock_pm = MagicMock() |
| 108 | ctx = AgentContext( |
| 109 | knowledge_graph=mock_kg, |
| 110 | query_engine=mock_qe, |
| 111 | provider_manager=mock_pm, |
| 112 | config={"key": "value"}, |
| 113 | ) |
| 114 | assert ctx.knowledge_graph is mock_kg |
| 115 | assert ctx.config == {"key": "value"} |
| 116 | |
| 117 | def test_conversation_history_is_mutable(self): |
| 118 | ctx = AgentContext() |
| 119 | ctx.conversation_history.append({"role": "user", "content": "hello"}) |
| 120 | assert len(ctx.conversation_history) == 1 |
| 121 | |
| 122 | |
| 123 | # --------------------------------------------------------------------------- |
| 124 | # Artifact dataclass |
| 125 | # --------------------------------------------------------------------------- |
| 126 | |
| 127 | |
| 128 | class TestArtifact: |
| 129 | def test_basic(self): |
| 130 | a = Artifact(name="Plan", content="# Plan\n...", artifact_type="project_plan") |
| 131 | assert a.name == "Plan" |
| 132 | assert a.format == "markdown" # default |
| 133 | assert a.metadata == {} |
| 134 | |
| 135 | def test_with_metadata(self): |
| 136 | a = Artifact( |
| 137 | name="Tasks", |
| 138 | content="[]", |
| 139 | artifact_type="task_list", |
| 140 | format="json", |
| 141 | metadata={"source": "kg"}, |
| 142 | ) |
| 143 | assert a.format == "json" |
| 144 | assert a.metadata["source"] == "kg" |
| 145 | |
| 146 | |
| 147 | # --------------------------------------------------------------------------- |
| 148 | # Skill.can_execute |
| 149 | # --------------------------------------------------------------------------- |
| 150 | |
| 151 | |
| 152 | class TestSkillCanExecute: |
| 153 | def test_default_requires_kg_and_pm(self): |
| 154 | skill = _DummySkill() |
| 155 | ctx_no_kg = AgentContext(provider_manager=MagicMock()) |
| 156 | assert not skill.can_execute(ctx_no_kg) |
| 157 | |
| 158 | ctx_no_pm = AgentContext(knowledge_graph=MagicMock()) |
| 159 | assert not skill.can_execute(ctx_no_pm) |
| 160 | |
| 161 | ctx_both = AgentContext(knowledge_graph=MagicMock(), provider_manager=MagicMock()) |
| 162 | assert skill.can_execute(ctx_both) |
| 163 | |
| 164 | |
| 165 | # --------------------------------------------------------------------------- |
| 166 | # KBContext |
| 167 | # --------------------------------------------------------------------------- |
| 168 | |
| 169 | |
| 170 | class TestKBContext: |
| 171 | def test_add_source_nonexistent_raises(self, tmp_path): |
| 172 | from video_processor.agent.kb_context import KBContext |
| 173 | |
| 174 | ctx = KBContext() |
| 175 | with pytest.raises(FileNotFoundError, match="Not found"): |
| 176 | ctx.add_source(tmp_path / "nonexistent.json") |
| 177 | |
| 178 | def test_add_source_file(self, tmp_path): |
| 179 | from video_processor.agent.kb_context import KBContext |
| 180 | |
| 181 | f = tmp_path / "kg.json" |
| 182 | f.write_text("{}") |
| 183 | ctx = KBContext() |
| 184 | ctx.add_source(f) |
| 185 | assert len(ctx.sources) == 1 |
| 186 | assert ctx.sources[0] == f.resolve() |
| 187 | |
| 188 | def test_add_source_directory(self, tmp_path): |
| 189 | from video_processor.agent.kb_context import KBContext |
| 190 | |
| 191 | with patch( |
| 192 | "video_processor.integrators.graph_discovery.find_knowledge_graphs", |
| 193 | return_value=[tmp_path / "a.db"], |
| 194 | ): |
| 195 | ctx = KBContext() |
| 196 | ctx.add_source(tmp_path) |
| 197 | assert len(ctx.sources) == 1 |
| 198 | |
| 199 | def test_knowledge_graph_before_load_raises(self): |
| 200 | from video_processor.agent.kb_context import KBContext |
| 201 | |
| 202 | ctx = KBContext() |
| 203 | with pytest.raises(RuntimeError, match="Call load"): |
| 204 | _ = ctx.knowledge_graph |
| 205 | |
| 206 | def test_query_engine_before_load_raises(self): |
| 207 | from video_processor.agent.kb_context import KBContext |
| 208 | |
| 209 | ctx = KBContext() |
| 210 | with pytest.raises(RuntimeError, match="Call load"): |
| 211 | _ = ctx.query_engine |
| 212 | |
| 213 | def test_summary_no_data(self): |
| 214 | from video_processor.agent.kb_context import KBContext |
| 215 | |
| 216 | ctx = KBContext() |
| 217 | assert ctx.summary() == "No knowledge base loaded." |
| 218 | |
| 219 | def test_load_json_and_summary(self, tmp_path): |
| 220 | from video_processor.agent.kb_context import KBContext |
| 221 | |
| 222 | kg_data = {"nodes": [], "relationships": []} |
| 223 | f = tmp_path / "kg.json" |
| 224 | f.write_text(json.dumps(kg_data)) |
| 225 | |
| 226 | ctx = KBContext() |
| 227 | ctx.add_source(f) |
| 228 | ctx.load() |
| 229 | |
| 230 | summary = ctx.summary() |
| 231 | assert "Knowledge base" in summary |
| 232 | assert "Entities" in summary |
| 233 | assert "Relationships" in summary |
| 234 | |
| 235 | |
| 236 | # --------------------------------------------------------------------------- |
| 237 | # PlanningAgent |
| 238 | # --------------------------------------------------------------------------- |
| 239 | |
| 240 | |
| 241 | class TestPlanningAgent: |
| 242 | def test_from_kb_paths(self, tmp_path): |
| 243 | from video_processor.agent.agent_loop import PlanningAgent |
| 244 | |
| 245 | kg_data = {"nodes": [], "relationships": []} |
| 246 | f = tmp_path / "kg.json" |
| 247 | f.write_text(json.dumps(kg_data)) |
| 248 | |
| 249 | agent = PlanningAgent.from_kb_paths([f], provider_manager=None) |
| 250 | assert agent.context.knowledge_graph is not None |
| 251 | assert agent.context.provider_manager is None |
| 252 | |
| 253 | def test_execute_with_mock_provider(self, tmp_path): |
| 254 | from video_processor.agent.agent_loop import PlanningAgent |
| 255 | |
| 256 | # Register a dummy skill |
| 257 | skill = _DummySkill() |
| 258 | register_skill(skill) |
| 259 | |
| 260 | mock_pm = MagicMock() |
| 261 | mock_pm.chat.return_value = json.dumps([{"skill": "dummy_test_skill", "params": {}}]) |
| 262 | |
| 263 | ctx = AgentContext( |
| 264 | knowledge_graph=MagicMock(), |
| 265 | query_engine=MagicMock(), |
| 266 | provider_manager=mock_pm, |
| 267 | ) |
| 268 | # Mock stats().to_text() |
| 269 | ctx.query_engine.stats.return_value.to_text.return_value = "3 entities" |
| 270 | |
| 271 | agent = PlanningAgent(context=ctx) |
| 272 | artifacts = agent.execute("generate a plan") |
| 273 | |
| 274 | assert len(artifacts) == 1 |
| 275 | assert artifacts[0].name == "dummy artifact" |
| 276 | mock_pm.chat.assert_called_once() |
| 277 | |
| 278 | def test_execute_no_provider_keyword_match(self): |
| 279 | from video_processor.agent.agent_loop import PlanningAgent |
| 280 | |
| 281 | skill = _DummySkill() |
| 282 | register_skill(skill) |
| 283 | |
| 284 | ctx = AgentContext( |
| 285 | knowledge_graph=MagicMock(), |
| 286 | provider_manager=None, |
| 287 | ) |
| 288 | |
| 289 | agent = PlanningAgent(context=ctx) |
| 290 | # "dummy" is a keyword in the skill name, but can_execute needs provider_manager |
| 291 | # so it should return empty |
| 292 | artifacts = agent.execute("dummy request") |
| 293 | assert artifacts == [] |
| 294 | |
| 295 | def test_execute_keyword_match_nollm_skill(self): |
| 296 | from video_processor.agent.agent_loop import PlanningAgent |
| 297 | |
| 298 | skill = _NoLLMSkill() |
| 299 | register_skill(skill) |
| 300 | |
| 301 | ctx = AgentContext( |
| 302 | knowledge_graph=MagicMock(), |
| 303 | provider_manager=None, |
| 304 | ) |
| 305 | |
| 306 | agent = PlanningAgent(context=ctx) |
| 307 | # "nollm" is in the skill name |
| 308 | artifacts = agent.execute("nollm stuff") |
| 309 | assert len(artifacts) == 1 |
| 310 | assert artifacts[0].name == "nollm artifact" |
| 311 | |
| 312 | def test_execute_skips_unknown_skills(self): |
| 313 | from video_processor.agent.agent_loop import PlanningAgent |
| 314 | |
| 315 | mock_pm = MagicMock() |
| 316 | mock_pm.chat.return_value = json.dumps([{"skill": "nonexistent_skill_xyz", "params": {}}]) |
| 317 | |
| 318 | ctx = AgentContext( |
| 319 | knowledge_graph=MagicMock(), |
| 320 | query_engine=MagicMock(), |
| 321 | provider_manager=mock_pm, |
| 322 | ) |
| 323 | ctx.query_engine.stats.return_value.to_text.return_value = "" |
| 324 | |
| 325 | agent = PlanningAgent(context=ctx) |
| 326 | artifacts = agent.execute("do something") |
| 327 | assert artifacts == [] |
| 328 | |
| 329 | def test_chat_no_provider(self): |
| 330 | from video_processor.agent.agent_loop import PlanningAgent |
| 331 | |
| 332 | ctx = AgentContext(provider_manager=None) |
| 333 | agent = PlanningAgent(context=ctx) |
| 334 | |
| 335 | reply = agent.chat("hello") |
| 336 | assert "requires" in reply.lower() or "provider" in reply.lower() |
| 337 | |
| 338 | def test_chat_with_provider(self): |
| 339 | from video_processor.agent.agent_loop import PlanningAgent |
| 340 | |
| 341 | mock_pm = MagicMock() |
| 342 | mock_pm.chat.return_value = "I can help you plan." |
| 343 | |
| 344 | ctx = AgentContext( |
| 345 | knowledge_graph=MagicMock(), |
| 346 | query_engine=MagicMock(), |
| 347 | provider_manager=mock_pm, |
| 348 | ) |
| 349 | ctx.query_engine.stats.return_value.to_text.return_value = "5 entities" |
| 350 | |
| 351 | agent = PlanningAgent(context=ctx) |
| 352 | reply = agent.chat("help me plan") |
| 353 | |
| 354 | assert reply == "I can help you plan." |
| 355 | assert len(ctx.conversation_history) == 2 # user + assistant |
| 356 | assert ctx.conversation_history[0]["role"] == "user" |
| 357 | assert ctx.conversation_history[1]["role"] == "assistant" |
| 358 | |
| 359 | def test_chat_accumulates_history(self): |
| 360 | from video_processor.agent.agent_loop import PlanningAgent |
| 361 | |
| 362 | mock_pm = MagicMock() |
| 363 | mock_pm.chat.side_effect = ["reply1", "reply2"] |
| 364 | |
| 365 | ctx = AgentContext(provider_manager=mock_pm) |
| 366 | agent = PlanningAgent(context=ctx) |
| 367 | |
| 368 | agent.chat("msg1") |
| 369 | agent.chat("msg2") |
| 370 | |
| 371 | assert len(ctx.conversation_history) == 4 # 2 user + 2 assistant |
| 372 | # The system message is constructed each time but not stored in history |
| 373 | # Provider should receive progressively longer message lists |
| 374 | second_call_messages = mock_pm.chat.call_args_list[1][0][0] |
| 375 | # Should include system + 3 prior messages (user, assistant, user) |
| 376 | assert len(second_call_messages) == 4 # system + user + assistant + user |
| 377 | |
| 378 | |
| 379 | # --------------------------------------------------------------------------- |
| 380 | # Orchestrator tests (from existing test_agent.py — kept for coverage) |
| 381 | # --------------------------------------------------------------------------- |
| 382 | |
| 383 | |
| 384 | class TestPlanCreation: |
| 385 | def test_basic_plan(self): |
| 386 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 387 | |
| 388 | agent = AgentOrchestrator() |
| 389 | plan = agent._create_plan("test.mp4", "basic") |
| 390 | steps = [s["step"] for s in plan] |
| 391 | assert "extract_frames" in steps |
| 392 | assert "extract_audio" in steps |
| @@ -18,18 +395,22 @@ | |
| 395 | assert "extract_action_items" in steps |
| 396 | assert "generate_reports" in steps |
| 397 | assert "detect_diagrams" not in steps |
| 398 | |
| 399 | def test_standard_plan(self): |
| 400 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 401 | |
| 402 | agent = AgentOrchestrator() |
| 403 | plan = agent._create_plan("test.mp4", "standard") |
| 404 | steps = [s["step"] for s in plan] |
| 405 | assert "detect_diagrams" in steps |
| 406 | assert "build_knowledge_graph" in steps |
| 407 | assert "deep_analysis" not in steps |
| 408 | |
| 409 | def test_comprehensive_plan(self): |
| 410 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 411 | |
| 412 | agent = AgentOrchestrator() |
| 413 | plan = agent._create_plan("test.mp4", "comprehensive") |
| 414 | steps = [s["step"] for s in plan] |
| 415 | assert "detect_diagrams" in steps |
| 416 | assert "deep_analysis" in steps |
| @@ -36,42 +417,52 @@ | |
| 417 | assert "cross_reference" in steps |
| 418 | |
| 419 | |
| 420 | class TestAdaptPlan: |
| 421 | def test_adapts_for_long_transcript(self): |
| 422 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 423 | |
| 424 | agent = AgentOrchestrator() |
| 425 | agent._plan = [{"step": "generate_reports", "priority": "required"}] |
| 426 | long_text = "word " * 3000 |
| 427 | agent._adapt_plan("transcribe", {"text": long_text}) |
| 428 | steps = [s["step"] for s in agent._plan] |
| 429 | assert "deep_analysis" in steps |
| 430 | |
| 431 | def test_no_adapt_for_short_transcript(self): |
| 432 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 433 | |
| 434 | agent = AgentOrchestrator() |
| 435 | agent._plan = [{"step": "generate_reports", "priority": "required"}] |
| 436 | agent._adapt_plan("transcribe", {"text": "Short text"}) |
| 437 | steps = [s["step"] for s in agent._plan] |
| 438 | assert "deep_analysis" not in steps |
| 439 | |
| 440 | def test_adapts_for_many_diagrams(self): |
| 441 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 442 | |
| 443 | agent = AgentOrchestrator() |
| 444 | agent._plan = [{"step": "generate_reports", "priority": "required"}] |
| 445 | diagrams = [MagicMock() for _ in range(5)] |
| 446 | agent._adapt_plan("detect_diagrams", {"diagrams": diagrams, "captures": []}) |
| 447 | steps = [s["step"] for s in agent._plan] |
| 448 | assert "cross_reference" in steps |
| 449 | |
| 450 | def test_insight_for_many_captures(self): |
| 451 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 452 | |
| 453 | agent = AgentOrchestrator() |
| 454 | agent._plan = [] |
| 455 | captures = [MagicMock() for _ in range(5)] |
| 456 | diagrams = [MagicMock() for _ in range(2)] |
| 457 | agent._adapt_plan("detect_diagrams", {"diagrams": diagrams, "captures": captures}) |
| 458 | assert len(agent._insights) == 1 |
| 459 | assert "uncertain frames" in agent._insights[0] |
| 460 | |
| 461 | def test_no_duplicate_steps(self): |
| 462 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 463 | |
| 464 | agent = AgentOrchestrator() |
| 465 | agent._plan = [{"step": "deep_analysis", "priority": "comprehensive"}] |
| 466 | long_text = "word " * 3000 |
| 467 | agent._adapt_plan("transcribe", {"text": long_text}) |
| 468 | deep_steps = [s for s in agent._plan if s["step"] == "deep_analysis"] |
| @@ -78,28 +469,35 @@ | |
| 469 | assert len(deep_steps) == 1 |
| 470 | |
| 471 | |
| 472 | class TestFallbacks: |
| 473 | def test_diagram_fallback(self): |
| 474 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 475 | |
| 476 | agent = AgentOrchestrator() |
| 477 | assert agent._get_fallback("detect_diagrams") == "screengrab_fallback" |
| 478 | |
| 479 | def test_no_fallback_for_unknown(self): |
| 480 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 481 | |
| 482 | agent = AgentOrchestrator() |
| 483 | assert agent._get_fallback("transcribe") is None |
| 484 | |
| 485 | |
| 486 | class TestInsights: |
| 487 | def test_insights_property(self): |
| 488 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 489 | |
| 490 | agent = AgentOrchestrator() |
| 491 | agent._insights = ["Insight 1", "Insight 2"] |
| 492 | assert agent.insights == ["Insight 1", "Insight 2"] |
| 493 | agent.insights.append("should not modify internal") |
| 494 | assert len(agent._insights) == 2 |
| 495 | |
| 496 | def test_deep_analysis_populates_insights(self): |
| 497 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 498 | |
| 499 | pm = MagicMock() |
| 500 | pm.chat.return_value = json.dumps( |
| 501 | { |
| 502 | "decisions": ["Decided to use microservices"], |
| 503 | "risks": ["Timeline is tight"], |
| @@ -113,57 +511,61 @@ | |
| 511 | assert "decisions" in result |
| 512 | assert any("microservices" in i for i in agent._insights) |
| 513 | assert any("Timeline" in i for i in agent._insights) |
| 514 | |
| 515 | def test_deep_analysis_handles_error(self): |
| 516 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 517 | |
| 518 | pm = MagicMock() |
| 519 | pm.chat.side_effect = Exception("API error") |
| 520 | agent = AgentOrchestrator(provider_manager=pm) |
| 521 | agent._results["transcribe"] = {"text": "some text"} |
| 522 | result = agent._deep_analysis("/tmp") |
| 523 | assert result == {} |
| 524 | |
| 525 | def test_deep_analysis_no_transcript(self): |
| 526 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 527 | |
| 528 | agent = AgentOrchestrator() |
| 529 | agent._results["transcribe"] = {"text": ""} |
| 530 | result = agent._deep_analysis("/tmp") |
| 531 | assert result == {} |
| 532 | |
| 533 | |
| 534 | class TestBuildManifest: |
| 535 | def test_builds_from_results(self): |
| 536 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 537 | |
| 538 | agent = AgentOrchestrator() |
| 539 | agent._results = { |
| 540 | "extract_frames": {"frames": [1, 2, 3], "paths": ["/a.jpg", "/b.jpg"]}, |
| 541 | "extract_audio": {"audio_path": "/audio.wav", "properties": {"duration": 60.0}}, |
| 542 | "detect_diagrams": {"diagrams": [], "captures": []}, |
| 543 | "extract_key_points": {"key_points": []}, |
| 544 | "extract_action_items": {"action_items": []}, |
| 545 | } |
| 546 | manifest = agent._build_manifest(Path("test.mp4"), Path("/out"), "Test", 5.0) |
| 547 | assert manifest.video.title == "Test" |
| 548 | assert manifest.stats.frames_extracted == 3 |
| 549 | assert manifest.stats.duration_seconds == 5.0 |
| 550 | assert manifest.video.duration_seconds == 60.0 |
| 551 | |
| 552 | def test_handles_missing_results(self): |
| 553 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 554 | |
| 555 | agent = AgentOrchestrator() |
| 556 | agent._results = {} |
| 557 | manifest = agent._build_manifest(Path("test.mp4"), Path("/out"), None, 1.0) |
| 558 | assert manifest.video.title == "Analysis of test" |
| 559 | assert manifest.stats.frames_extracted == 0 |
| 560 | |
| 561 | def test_handles_error_results(self): |
| 562 | from video_processor.agent.orchestrator import AgentOrchestrator |
| 563 | |
| 564 | agent = AgentOrchestrator() |
| 565 | agent._results = { |
| 566 | "extract_frames": {"error": "failed"}, |
| 567 | "detect_diagrams": {"error": "also failed"}, |
| 568 | } |
| 569 | manifest = agent._build_manifest(Path("vid.mp4"), Path("/out"), None, 2.0) |
| 570 | assert manifest.stats.frames_extracted == 0 |
| 571 | assert len(manifest.diagrams) == 0 |
| 572 | |
| 573 | DDED tests/test_agent_skills.py |
| 574 | DDED tests/test_api_spec.py |
| 575 | DDED tests/test_auth.py |
| 576 | DDED tests/test_callbacks.py |
| 577 | DDED tests/test_cli.py |
| 578 | DDED tests/test_companion.py |
| 579 | DDED tests/test_exchange.py |
+627
| --- a/tests/test_agent_skills.py | ||
| +++ b/tests/test_agent_skills.py | ||
| @@ -0,0 +1,627 @@ | ||
| 1 | +"""Tests for agent skill execute() methods with mocked context.""" | |
| 2 | + | |
| 3 | +import json | |
| 4 | +from dataclasses import dataclass | |
| 5 | +from unittest.mock import MagicMock | |
| 6 | + | |
| 7 | +import pytest | |
| 8 | + | |
| 9 | +from video_processor.agent.skills.base import ( | |
| 10 | + AgentContext, | |
| 11 | + Artifact, | |
| 12 | + _skills, | |
| 13 | +) | |
| 14 | + | |
| 15 | +# --------------------------------------------------------------------------- | |
| 16 | +# Fixtures | |
| 17 | +# --------------------------------------------------------------------------- | |
| 18 | + | |
| 19 | + | |
| 20 | +@pytest.fixture(autouse=True) | |
| 21 | +def _clean_skill_registry(): | |
| 22 | + """Save and restore the global skill registry between tests.""" | |
| 23 | + original = dict(_skills) | |
| 24 | + yield | |
| 25 | + _skills.clear() | |
| 26 | + _skills.update(original) | |
| 27 | + | |
| 28 | + | |
| 29 | +@dataclass | |
| 30 | +class FakeEntity: | |
| 31 | + name: str | |
| 32 | + type: str | |
| 33 | + | |
| 34 | + def __str__(self): | |
| 35 | + return self.name | |
| 36 | + | |
| 37 | + | |
| 38 | +class FakeQueryResult: | |
| 39 | + """Mimics QueryResult.to_text().""" | |
| 40 | + | |
| 41 | + def __init__(self, text="Stats: 10 entities, 5 relationships"): | |
| 42 | + self._text = text | |
| 43 | + | |
| 44 | + def to_text(self): | |
| 45 | + return self._text | |
| 46 | + | |
| 47 | + | |
| 48 | +def _make_context( | |
| 49 | + chat_response="# Generated Content\n\nSome markdown content.", | |
| 50 | + planning_entities=None, | |
| 51 | +): | |
| 52 | + """Build an AgentContext with mocked query_engine and provider_manager.""" | |
| 53 | + ctx = AgentContext() | |
| 54 | + | |
| 55 | + qe = MagicMock() | |
| 56 | + qe.stats.return_value = FakeQueryResult("Stats: 10 entities, 5 rels") | |
| 57 | + qe.entities.return_value = FakeQueryResult("Entity1, Entity2") | |
| 58 | + qe.relationships.return_value = FakeQueryResult("Entity1 -> Entity2") | |
| 59 | + ctx.query_engine = qe | |
| 60 | + | |
| 61 | + pm = MagicMock() | |
| 62 | + pm.chat.return_value = chat_response | |
| 63 | + ctx.provider_manager = pm | |
| 64 | + | |
| 65 | + ctx.knowledge_graph = MagicMock() | |
| 66 | + | |
| 67 | + if planning_entities is not None: | |
| 68 | + ctx.planning_entities = planning_entities | |
| 69 | + else: | |
| 70 | + ctx.planning_entities = [ | |
| 71 | + FakeEntity(name="Auth system", type="feature"), | |
| 72 | + FakeEntity(name="Launch v1", type="milestone"), | |
| 73 | + FakeEntity(name="Must be fast", type="constraint"), | |
| 74 | + FakeEntity(name="Build dashboard", type="goal"), | |
| 75 | + FakeEntity(name="API depends on auth", type="dependency"), | |
| 76 | + FakeEntity(name="User login", type="requirement"), | |
| 77 | + ] | |
| 78 | + | |
| 79 | + return ctx | |
| 80 | + | |
| 81 | + | |
| 82 | +# --------------------------------------------------------------------------- | |
| 83 | +# ProjectPlanSkill | |
| 84 | +# --------------------------------------------------------------------------- | |
| 85 | + | |
| 86 | + | |
| 87 | +class TestProjectPlanSkill: | |
| 88 | + def test_execute_returns_artifact(self): | |
| 89 | + from video_processor.agent.skills.project_plan import ProjectPlanSkill | |
| 90 | + | |
| 91 | + skill = ProjectPlanSkill() | |
| 92 | + ctx = _make_context() | |
| 93 | + artifact = skill.execute(ctx) | |
| 94 | + | |
| 95 | + assert isinstance(artifact, Artifact) | |
| 96 | + assert artifact.artifact_type == "project_plan" | |
| 97 | + assert artifact.format == "markdown" | |
| 98 | + assert len(artifact.content) > 0 | |
| 99 | + | |
| 100 | + def test_execute_calls_provider(self): | |
| 101 | + from video_processor.agent.skills.project_plan import ProjectPlanSkill | |
| 102 | + | |
| 103 | + skill = ProjectPlanSkill() | |
| 104 | + ctx = _make_context() | |
| 105 | + skill.execute(ctx) | |
| 106 | + | |
| 107 | + ctx.provider_manager.chat.assert_called_once() | |
| 108 | + call_args = ctx.provider_manager.chat.call_args | |
| 109 | + messages = call_args[1]["messages"] if "messages" in call_args[1] else call_args[0][0] | |
| 110 | + assert len(messages) == 1 | |
| 111 | + assert messages[0]["role"] == "user" | |
| 112 | + | |
| 113 | + def test_execute_queries_graph(self): | |
| 114 | + from video_processor.agent.skills.project_plan import ProjectPlanSkill | |
| 115 | + | |
| 116 | + skill = ProjectPlanSkill() | |
| 117 | + ctx = _make_context() | |
| 118 | + skill.execute(ctx) | |
| 119 | + | |
| 120 | + ctx.query_engine.stats.assert_called_once() | |
| 121 | + ctx.query_engine.entities.assert_called_once() | |
| 122 | + ctx.query_engine.relationships.assert_called_once() | |
| 123 | + | |
| 124 | + | |
| 125 | +# --------------------------------------------------------------------------- | |
| 126 | +# PRDSkill | |
| 127 | +# --------------------------------------------------------------------------- | |
| 128 | + | |
| 129 | + | |
class TestPRDSkill:
    """PRDSkill.execute(): artifact type/format and entity filtering."""

    def test_execute_returns_artifact(self):
        from video_processor.agent.skills.prd import PRDSkill

        result = PRDSkill().execute(_make_context())

        assert isinstance(result, Artifact)
        assert result.artifact_type == "prd"
        assert result.format == "markdown"

    def test_execute_filters_relevant_entities(self):
        from video_processor.agent.skills.prd import PRDSkill

        context = _make_context()
        PRDSkill().execute(context)

        # Entity filtering must not prevent the LLM call.
        context.provider_manager.chat.assert_called_once()

    def test_execute_with_no_relevant_entities(self):
        from video_processor.agent.skills.prd import PRDSkill

        # Only a goal entity — the skill still has to produce a PRD.
        context = _make_context(
            planning_entities=[FakeEntity(name="Some goal", type="goal")]
        )
        result = PRDSkill().execute(context)

        assert isinstance(result, Artifact)
        assert result.artifact_type == "prd"
| 165 | + | |
| 166 | + | |
| 167 | +# --------------------------------------------------------------------------- | |
| 168 | +# RoadmapSkill | |
| 169 | +# --------------------------------------------------------------------------- | |
| 170 | + | |
| 171 | + | |
class TestRoadmapSkill:
    """RoadmapSkill.execute(): artifact shape and sparse-entity handling."""

    def test_execute_returns_artifact(self):
        from video_processor.agent.skills.roadmap import RoadmapSkill

        result = RoadmapSkill().execute(_make_context())

        assert isinstance(result, Artifact)
        assert result.artifact_type == "roadmap"
        assert result.format == "markdown"

    def test_execute_with_no_relevant_entities(self):
        from video_processor.agent.skills.roadmap import RoadmapSkill

        # Only a constraint entity — nothing roadmap-specific in the context.
        context = _make_context(
            planning_entities=[FakeEntity(name="Some constraint", type="constraint")]
        )
        result = RoadmapSkill().execute(context)

        assert isinstance(result, Artifact)
| 196 | + | |
| 197 | + | |
| 198 | +# --------------------------------------------------------------------------- | |
| 199 | +# TaskBreakdownSkill | |
| 200 | +# --------------------------------------------------------------------------- | |
| 201 | + | |
| 202 | + | |
class TestTaskBreakdownSkill:
    """TaskBreakdownSkill.execute(): JSON task parsing and non-JSON fallback."""

    def test_execute_returns_artifact_json(self):
        from video_processor.agent.skills.task_breakdown import TaskBreakdownSkill

        single_task = {
            "id": "T1",
            "title": "Setup",
            "description": "Init",
            "depends_on": [],
            "priority": "high",
            "estimate": "1d",
            "assignee_role": "dev",
        }
        context = _make_context(chat_response=json.dumps([single_task]))
        result = TaskBreakdownSkill().execute(context)

        assert isinstance(result, Artifact)
        assert result.artifact_type == "task_list"
        assert result.format == "json"
        # Parsed tasks surface in the artifact metadata.
        assert "tasks" in result.metadata
        assert len(result.metadata["tasks"]) == 1

    def test_execute_with_non_json_response(self):
        from video_processor.agent.skills.task_breakdown import TaskBreakdownSkill

        # A malformed LLM response must not crash execution.
        context = _make_context(chat_response="Not valid JSON at all")
        result = TaskBreakdownSkill().execute(context)

        assert isinstance(result, Artifact)
        assert result.artifact_type == "task_list"

    def test_execute_with_no_relevant_entities(self):
        from video_processor.agent.skills.task_breakdown import TaskBreakdownSkill

        context = _make_context(
            chat_response=json.dumps([]),
            planning_entities=[FakeEntity(name="X", type="constraint")],
        )
        result = TaskBreakdownSkill().execute(context)
        assert result.metadata["tasks"] == []
| 251 | + | |
| 252 | + | |
| 253 | +# --------------------------------------------------------------------------- | |
| 254 | +# DocGeneratorSkill | |
| 255 | +# --------------------------------------------------------------------------- | |
| 256 | + | |
| 257 | + | |
class TestDocGeneratorSkill:
    """DocGeneratorSkill.execute(): doc_type selection and unknown-type fallback."""

    def test_execute_default_type(self):
        from video_processor.agent.skills.doc_generator import DocGeneratorSkill

        result = DocGeneratorSkill().execute(_make_context())

        assert isinstance(result, Artifact)
        assert result.artifact_type == "document"
        assert result.format == "markdown"
        # Omitting doc_type selects the technical_doc template.
        assert result.metadata["doc_type"] == "technical_doc"

    def test_execute_adr_type(self):
        from video_processor.agent.skills.doc_generator import DocGeneratorSkill

        result = DocGeneratorSkill().execute(_make_context(), doc_type="adr")

        assert result.metadata["doc_type"] == "adr"

    def test_execute_meeting_notes_type(self):
        from video_processor.agent.skills.doc_generator import DocGeneratorSkill

        result = DocGeneratorSkill().execute(_make_context(), doc_type="meeting_notes")

        assert result.metadata["doc_type"] == "meeting_notes"

    def test_execute_unknown_type_falls_back(self):
        from video_processor.agent.skills.doc_generator import DocGeneratorSkill

        result = DocGeneratorSkill().execute(_make_context(), doc_type="unknown_type")

        # Unrecognized types fall back to the technical_doc prompt.
        assert result.artifact_type == "document"
| 298 | + | |
| 299 | + | |
| 300 | +# --------------------------------------------------------------------------- | |
| 301 | +# RequirementsChatSkill | |
| 302 | +# --------------------------------------------------------------------------- | |
| 303 | + | |
| 304 | + | |
class TestRequirementsChatSkill:
    """RequirementsChatSkill: questionnaire artifact and gather_requirements()."""

    def test_execute_returns_artifact(self):
        from video_processor.agent.skills.requirements_chat import RequirementsChatSkill

        payload = {
            "questions": [
                {"id": "Q1", "category": "goals", "question": "What?", "context": "Why"},
            ]
        }
        context = _make_context(chat_response=json.dumps(payload))
        result = RequirementsChatSkill().execute(context)

        assert isinstance(result, Artifact)
        assert result.artifact_type == "requirements"
        assert result.format == "json"
        assert result.metadata["stage"] == "questionnaire"

    def test_gather_requirements(self):
        from video_processor.agent.skills.requirements_chat import RequirementsChatSkill

        payload = {
            "goals": ["Build auth"],
            "constraints": ["Budget < 10k"],
            "priorities": ["Security"],
            "scope": {"in_scope": ["Login"], "out_of_scope": ["SSO"]},
        }
        context = _make_context(chat_response=json.dumps(payload))
        answers = {"Q1": "We need auth", "Q2": "Budget is limited"}
        result = RequirementsChatSkill().gather_requirements(context, answers)

        assert isinstance(result, dict)

    def test_gather_requirements_non_json_response(self):
        from video_processor.agent.skills.requirements_chat import RequirementsChatSkill

        # A non-JSON LLM reply must still yield a dict, not raise.
        context = _make_context(chat_response="Not JSON")
        result = RequirementsChatSkill().gather_requirements(context, {"Q1": "answer"})

        assert isinstance(result, dict)
| 346 | + | |
| 347 | + | |
| 348 | +# --------------------------------------------------------------------------- | |
| 349 | +# Skill metadata | |
| 350 | +# --------------------------------------------------------------------------- | |
| 351 | + | |
| 352 | + | |
class TestSkillMetadata:
    """Class-level skill names and can_execute() preconditions."""

    def test_project_plan_name(self):
        from video_processor.agent.skills.project_plan import ProjectPlanSkill

        assert ProjectPlanSkill.name == "project_plan"

    def test_prd_name(self):
        from video_processor.agent.skills.prd import PRDSkill

        assert PRDSkill.name == "prd"

    def test_roadmap_name(self):
        from video_processor.agent.skills.roadmap import RoadmapSkill

        assert RoadmapSkill.name == "roadmap"

    def test_task_breakdown_name(self):
        from video_processor.agent.skills.task_breakdown import TaskBreakdownSkill

        assert TaskBreakdownSkill.name == "task_breakdown"

    def test_doc_generator_name(self):
        from video_processor.agent.skills.doc_generator import DocGeneratorSkill

        assert DocGeneratorSkill.name == "doc_generator"

    def test_requirements_chat_name(self):
        from video_processor.agent.skills.requirements_chat import RequirementsChatSkill

        assert RequirementsChatSkill.name == "requirements_chat"

    def test_can_execute_with_context(self):
        from video_processor.agent.skills.project_plan import ProjectPlanSkill

        context = _make_context()
        assert ProjectPlanSkill().can_execute(context) is True

    def test_can_execute_without_kg(self):
        from video_processor.agent.skills.project_plan import ProjectPlanSkill

        # A missing knowledge graph disables the skill.
        context = _make_context()
        context.knowledge_graph = None
        assert ProjectPlanSkill().can_execute(context) is False

    def test_can_execute_without_provider(self):
        from video_processor.agent.skills.project_plan import ProjectPlanSkill

        # A missing LLM provider likewise disables the skill.
        context = _make_context()
        context.provider_manager = None
        assert ProjectPlanSkill().can_execute(context) is False
| 406 | + | |
| 407 | + | |
| 408 | +# --------------------------------------------------------------------------- | |
| 409 | +# WikiGeneratorSkill | |
| 410 | +# --------------------------------------------------------------------------- | |
| 411 | + | |
| 412 | + | |
class TestWikiGeneratorSkill:
    """generate_wiki()/write_wiki() page structure and filename helpers."""

    def _sample_kg_data(self):
        # Tiny graph: two technologies, one person, two relationships.
        return {
            "nodes": [
                {
                    "name": "Python",
                    "type": "technology",
                    "descriptions": ["A programming language"],
                },
                {
                    "name": "Alice",
                    "type": "person",
                    "descriptions": ["Lead developer"],
                },
                {
                    "name": "FastAPI",
                    "type": "technology",
                    "descriptions": ["Web framework"],
                },
            ],
            "relationships": [
                {"source": "Alice", "target": "Python", "type": "uses"},
                {"source": "FastAPI", "target": "Python", "type": "built_with"},
            ],
        }

    def test_generate_wiki(self):
        from video_processor.agent.skills.wiki_generator import generate_wiki

        wiki = generate_wiki(self._sample_kg_data(), title="Test Wiki")

        assert "Home" in wiki
        assert "_Sidebar" in wiki
        home = wiki["Home"]
        assert "Test Wiki" in home
        assert "3" in home  # 3 entities
        assert "2" in home  # 2 relationships

        # One page per entity.
        for entity_page in ("Python", "Alice", "FastAPI"):
            assert entity_page in wiki

        # One index page per entity type.
        assert "Technology" in wiki
        assert "Person" in wiki

        # Alice's page links to Python through the "uses" relationship.
        assert "Python" in wiki["Alice"]
        assert "uses" in wiki["Alice"]

    def test_generate_wiki_with_artifacts(self):
        from video_processor.agent.skills.wiki_generator import generate_wiki

        plan = Artifact(
            name="Project Plan",
            content="# Plan\n\nDo the thing.",
            artifact_type="project_plan",
            format="markdown",
        )
        wiki = generate_wiki(self._sample_kg_data(), artifacts=[plan])

        assert "Project-Plan" in wiki
        assert "Do the thing." in wiki["Project-Plan"]
        assert "Planning Artifacts" in wiki["Home"]

    def test_write_wiki(self, tmp_path):
        from video_processor.agent.skills.wiki_generator import write_wiki

        wiki_dir = tmp_path / "wiki"
        written = write_wiki(
            {"Home": "# Home\n\nWelcome.", "Page-One": "# Page One\n\nContent."},
            wiki_dir,
        )

        assert len(written) == 2
        assert (wiki_dir / "Home.md").exists()
        assert (wiki_dir / "Page-One.md").exists()
        assert "Welcome." in (wiki_dir / "Home.md").read_text()

    def test_sanitize_filename(self):
        from video_processor.agent.skills.wiki_generator import _sanitize_filename

        # Spaces, path separators, and dots all collapse to hyphens.
        cases = {
            "Hello World": "Hello-World",
            "path/to\\file": "path-to-file",
            "version.2": "version-2",
        }
        for raw, expected in cases.items():
            assert _sanitize_filename(raw) == expected

    def test_wiki_link(self):
        from video_processor.agent.skills.wiki_generator import _wiki_link

        assert _wiki_link("My Page") == "[My Page](My-Page)"
        assert _wiki_link("Simple") == "[Simple](Simple)"
| 507 | + | |
| 508 | + | |
| 509 | +# --------------------------------------------------------------------------- | |
| 510 | +# NotesExportSkill | |
| 511 | +# --------------------------------------------------------------------------- | |
| 512 | + | |
| 513 | + | |
class TestNotesExportSkill:
    """export_to_obsidian()/export_to_notion_md(): files written and their contents."""

    def _sample_kg_data(self):
        # Two entities linked by a single "uses" relationship.
        return {
            "nodes": [
                {
                    "name": "Python",
                    "type": "technology",
                    "descriptions": ["A programming language"],
                },
                {
                    "name": "Alice",
                    "type": "person",
                    "descriptions": ["Lead developer"],
                },
            ],
            "relationships": [
                {"source": "Alice", "target": "Python", "type": "uses"},
            ],
        }

    def test_export_to_obsidian(self, tmp_path):
        from video_processor.agent.skills.notes_export import export_to_obsidian

        vault = tmp_path / "obsidian_vault"
        export_to_obsidian(self._sample_kg_data(), vault)

        assert vault.is_dir()

        # One markdown note per entity.
        python_note = vault / "Python.md"
        alice_note = vault / "Alice.md"
        assert python_note.exists()
        assert alice_note.exists()

        # Entity notes carry frontmatter and a title heading.
        python_text = python_note.read_text()
        assert "---" in python_text
        assert "type: technology" in python_text
        assert "# Python" in python_text

        # Relationships become [[wiki-links]].
        alice_text = alice_note.read_text()
        assert "[[Python]]" in alice_text
        assert "uses" in alice_text

        # The vault index links every entity.
        index_note = vault / "_Index.md"
        assert index_note.exists()
        index_text = index_note.read_text()
        assert "[[Python]]" in index_text
        assert "[[Alice]]" in index_text

    def test_export_to_obsidian_with_artifacts(self, tmp_path):
        from video_processor.agent.skills.notes_export import export_to_obsidian

        plan = Artifact(
            name="Test Plan",
            content="# Plan\n\nSteps here.",
            artifact_type="project_plan",
            format="markdown",
        )
        vault = tmp_path / "obsidian_arts"
        export_to_obsidian(self._sample_kg_data(), vault, artifacts=[plan])

        note = vault / "Test Plan.md"
        assert note.exists()
        note_text = note.read_text()
        assert "artifact" in note_text
        assert "Steps here." in note_text

    def test_export_to_notion_md(self, tmp_path):
        from video_processor.agent.skills.notes_export import export_to_notion_md

        export_dir = tmp_path / "notion_export"
        export_to_notion_md(self._sample_kg_data(), export_dir)

        assert export_dir.is_dir()

        # Entities are flattened into a CSV database for Notion import.
        csv_file = export_dir / "entities_database.csv"
        assert csv_file.exists()
        csv_text = csv_file.read_text()
        for expected in ("Name", "Type", "Python", "Alice"):
            assert expected in csv_text

        # Plus one markdown page per entity.
        python_page = export_dir / "Python.md"
        assert python_page.exists()
        python_text = python_page.read_text()
        assert "# Python" in python_text
        assert "technology" in python_text

        # And an overview page.
        assert (export_dir / "Overview.md").exists()

    def test_export_to_notion_md_with_artifacts(self, tmp_path):
        from video_processor.agent.skills.notes_export import export_to_notion_md

        roadmap = Artifact(
            name="Roadmap",
            content="# Roadmap\n\nQ1 goals.",
            artifact_type="roadmap",
            format="markdown",
        )
        export_dir = tmp_path / "notion_arts"
        export_to_notion_md(self._sample_kg_data(), export_dir, artifacts=[roadmap])

        page = export_dir / "Roadmap.md"
        assert page.exists()
        assert "Q1 goals." in page.read_text()
| --- a/tests/test_agent_skills.py | |
| +++ b/tests/test_agent_skills.py | |
| @@ -0,0 +1,627 @@ | |
| --- a/tests/test_agent_skills.py | |
| +++ b/tests/test_agent_skills.py | |
| @@ -0,0 +1,627 @@ | |
| 1 | """Tests for agent skill execute() methods with mocked context.""" |
| 2 | |
| 3 | import json |
| 4 | from dataclasses import dataclass |
| 5 | from unittest.mock import MagicMock |
| 6 | |
| 7 | import pytest |
| 8 | |
| 9 | from video_processor.agent.skills.base import ( |
| 10 | AgentContext, |
| 11 | Artifact, |
| 12 | _skills, |
| 13 | ) |
| 14 | |
| 15 | # --------------------------------------------------------------------------- |
| 16 | # Fixtures |
| 17 | # --------------------------------------------------------------------------- |
| 18 | |
| 19 | |
@pytest.fixture(autouse=True)
def _clean_skill_registry():
    """Snapshot the global skill registry and roll it back after each test."""
    snapshot = _skills.copy()
    yield
    _skills.clear()
    _skills.update(snapshot)
| 27 | |
| 28 | |
@dataclass
class FakeEntity:
    """Minimal stand-in for a planning entity: only the fields skills read."""

    name: str
    type: str

    def __str__(self) -> str:
        # Skills interpolate entities into prompts via str().
        return self.name
| 36 | |
| 37 | |
class FakeQueryResult:
    """Duck-typed stand-in for QueryResult: exposes only to_text()."""

    def __init__(self, text: str = "Stats: 10 entities, 5 relationships"):
        # Canned text that skills embed into their prompts.
        self._text = text

    def to_text(self) -> str:
        """Return the canned query text."""
        return self._text
| 46 | |
| 47 | |
def _make_context(
    chat_response="# Generated Content\n\nSome markdown content.",
    planning_entities=None,
):
    """Build an AgentContext with mocked query_engine and provider_manager.

    chat_response is what the mocked provider returns from chat();
    planning_entities overrides the default mixed-type entity list.
    """
    context = AgentContext()

    query_engine = MagicMock()
    query_engine.stats.return_value = FakeQueryResult("Stats: 10 entities, 5 rels")
    query_engine.entities.return_value = FakeQueryResult("Entity1, Entity2")
    query_engine.relationships.return_value = FakeQueryResult("Entity1 -> Entity2")
    context.query_engine = query_engine

    provider = MagicMock()
    provider.chat.return_value = chat_response
    context.provider_manager = provider

    context.knowledge_graph = MagicMock()

    # One entity of each planning type, so every skill finds something relevant.
    default_entities = [
        FakeEntity(name="Auth system", type="feature"),
        FakeEntity(name="Launch v1", type="milestone"),
        FakeEntity(name="Must be fast", type="constraint"),
        FakeEntity(name="Build dashboard", type="goal"),
        FakeEntity(name="API depends on auth", type="dependency"),
        FakeEntity(name="User login", type="requirement"),
    ]
    context.planning_entities = (
        planning_entities if planning_entities is not None else default_entities
    )

    return context
| 80 | |
| 81 | |
| 82 | # --------------------------------------------------------------------------- |
| 83 | # ProjectPlanSkill |
| 84 | # --------------------------------------------------------------------------- |
| 85 | |
| 86 | |
class TestProjectPlanSkill:
    """ProjectPlanSkill.execute(): artifact shape, provider usage, graph queries."""

    def test_execute_returns_artifact(self):
        from video_processor.agent.skills.project_plan import ProjectPlanSkill

        context = _make_context()
        result = ProjectPlanSkill().execute(context)

        assert isinstance(result, Artifact)
        assert result.artifact_type == "project_plan"
        assert result.format == "markdown"
        assert len(result.content) > 0

    def test_execute_calls_provider(self):
        from video_processor.agent.skills.project_plan import ProjectPlanSkill

        context = _make_context()
        ProjectPlanSkill().execute(context)

        chat_mock = context.provider_manager.chat
        chat_mock.assert_called_once()
        # The messages may be passed positionally or as a keyword argument.
        args, kwargs = chat_mock.call_args
        messages = kwargs["messages"] if "messages" in kwargs else args[0]
        assert len(messages) == 1
        assert messages[0]["role"] == "user"

    def test_execute_queries_graph(self):
        from video_processor.agent.skills.project_plan import ProjectPlanSkill

        context = _make_context()
        ProjectPlanSkill().execute(context)

        # The skill should pull stats, entities, and relationships from the KG.
        engine = context.query_engine
        engine.stats.assert_called_once()
        engine.entities.assert_called_once()
        engine.relationships.assert_called_once()
| 123 | |
| 124 | |
| 125 | # --------------------------------------------------------------------------- |
| 126 | # PRDSkill |
| 127 | # --------------------------------------------------------------------------- |
| 128 | |
| 129 | |
class TestPRDSkill:
    """PRDSkill.execute(): artifact type/format and entity filtering."""

    def test_execute_returns_artifact(self):
        from video_processor.agent.skills.prd import PRDSkill

        result = PRDSkill().execute(_make_context())

        assert isinstance(result, Artifact)
        assert result.artifact_type == "prd"
        assert result.format == "markdown"

    def test_execute_filters_relevant_entities(self):
        from video_processor.agent.skills.prd import PRDSkill

        context = _make_context()
        PRDSkill().execute(context)

        # Entity filtering must not prevent the LLM call.
        context.provider_manager.chat.assert_called_once()

    def test_execute_with_no_relevant_entities(self):
        from video_processor.agent.skills.prd import PRDSkill

        # Only a goal entity — the skill still has to produce a PRD.
        context = _make_context(
            planning_entities=[FakeEntity(name="Some goal", type="goal")]
        )
        result = PRDSkill().execute(context)

        assert isinstance(result, Artifact)
        assert result.artifact_type == "prd"
| 165 | |
| 166 | |
| 167 | # --------------------------------------------------------------------------- |
| 168 | # RoadmapSkill |
| 169 | # --------------------------------------------------------------------------- |
| 170 | |
| 171 | |
class TestRoadmapSkill:
    """RoadmapSkill.execute(): artifact shape and sparse-entity handling."""

    def test_execute_returns_artifact(self):
        from video_processor.agent.skills.roadmap import RoadmapSkill

        result = RoadmapSkill().execute(_make_context())

        assert isinstance(result, Artifact)
        assert result.artifact_type == "roadmap"
        assert result.format == "markdown"

    def test_execute_with_no_relevant_entities(self):
        from video_processor.agent.skills.roadmap import RoadmapSkill

        # Only a constraint entity — nothing roadmap-specific in the context.
        context = _make_context(
            planning_entities=[FakeEntity(name="Some constraint", type="constraint")]
        )
        result = RoadmapSkill().execute(context)

        assert isinstance(result, Artifact)
| 196 | |
| 197 | |
| 198 | # --------------------------------------------------------------------------- |
| 199 | # TaskBreakdownSkill |
| 200 | # --------------------------------------------------------------------------- |
| 201 | |
| 202 | |
class TestTaskBreakdownSkill:
    """TaskBreakdownSkill.execute(): JSON task parsing and non-JSON fallback."""

    def test_execute_returns_artifact_json(self):
        from video_processor.agent.skills.task_breakdown import TaskBreakdownSkill

        single_task = {
            "id": "T1",
            "title": "Setup",
            "description": "Init",
            "depends_on": [],
            "priority": "high",
            "estimate": "1d",
            "assignee_role": "dev",
        }
        context = _make_context(chat_response=json.dumps([single_task]))
        result = TaskBreakdownSkill().execute(context)

        assert isinstance(result, Artifact)
        assert result.artifact_type == "task_list"
        assert result.format == "json"
        # Parsed tasks surface in the artifact metadata.
        assert "tasks" in result.metadata
        assert len(result.metadata["tasks"]) == 1

    def test_execute_with_non_json_response(self):
        from video_processor.agent.skills.task_breakdown import TaskBreakdownSkill

        # A malformed LLM response must not crash execution.
        context = _make_context(chat_response="Not valid JSON at all")
        result = TaskBreakdownSkill().execute(context)

        assert isinstance(result, Artifact)
        assert result.artifact_type == "task_list"

    def test_execute_with_no_relevant_entities(self):
        from video_processor.agent.skills.task_breakdown import TaskBreakdownSkill

        context = _make_context(
            chat_response=json.dumps([]),
            planning_entities=[FakeEntity(name="X", type="constraint")],
        )
        result = TaskBreakdownSkill().execute(context)
        assert result.metadata["tasks"] == []
| 251 | |
| 252 | |
| 253 | # --------------------------------------------------------------------------- |
| 254 | # DocGeneratorSkill |
| 255 | # --------------------------------------------------------------------------- |
| 256 | |
| 257 | |
class TestDocGeneratorSkill:
    """DocGeneratorSkill.execute(): doc_type selection and unknown-type fallback."""

    def test_execute_default_type(self):
        from video_processor.agent.skills.doc_generator import DocGeneratorSkill

        result = DocGeneratorSkill().execute(_make_context())

        assert isinstance(result, Artifact)
        assert result.artifact_type == "document"
        assert result.format == "markdown"
        # Omitting doc_type selects the technical_doc template.
        assert result.metadata["doc_type"] == "technical_doc"

    def test_execute_adr_type(self):
        from video_processor.agent.skills.doc_generator import DocGeneratorSkill

        result = DocGeneratorSkill().execute(_make_context(), doc_type="adr")

        assert result.metadata["doc_type"] == "adr"

    def test_execute_meeting_notes_type(self):
        from video_processor.agent.skills.doc_generator import DocGeneratorSkill

        result = DocGeneratorSkill().execute(_make_context(), doc_type="meeting_notes")

        assert result.metadata["doc_type"] == "meeting_notes"

    def test_execute_unknown_type_falls_back(self):
        from video_processor.agent.skills.doc_generator import DocGeneratorSkill

        result = DocGeneratorSkill().execute(_make_context(), doc_type="unknown_type")

        # Unrecognized types fall back to the technical_doc prompt.
        assert result.artifact_type == "document"
| 298 | |
| 299 | |
| 300 | # --------------------------------------------------------------------------- |
| 301 | # RequirementsChatSkill |
| 302 | # --------------------------------------------------------------------------- |
| 303 | |
| 304 | |
class TestRequirementsChatSkill:
    """RequirementsChatSkill: questionnaire artifact and gather_requirements()."""

    def test_execute_returns_artifact(self):
        from video_processor.agent.skills.requirements_chat import RequirementsChatSkill

        payload = {
            "questions": [
                {"id": "Q1", "category": "goals", "question": "What?", "context": "Why"},
            ]
        }
        context = _make_context(chat_response=json.dumps(payload))
        result = RequirementsChatSkill().execute(context)

        assert isinstance(result, Artifact)
        assert result.artifact_type == "requirements"
        assert result.format == "json"
        assert result.metadata["stage"] == "questionnaire"

    def test_gather_requirements(self):
        from video_processor.agent.skills.requirements_chat import RequirementsChatSkill

        payload = {
            "goals": ["Build auth"],
            "constraints": ["Budget < 10k"],
            "priorities": ["Security"],
            "scope": {"in_scope": ["Login"], "out_of_scope": ["SSO"]},
        }
        context = _make_context(chat_response=json.dumps(payload))
        answers = {"Q1": "We need auth", "Q2": "Budget is limited"}
        result = RequirementsChatSkill().gather_requirements(context, answers)

        assert isinstance(result, dict)

    def test_gather_requirements_non_json_response(self):
        from video_processor.agent.skills.requirements_chat import RequirementsChatSkill

        # A non-JSON LLM reply must still yield a dict, not raise.
        context = _make_context(chat_response="Not JSON")
        result = RequirementsChatSkill().gather_requirements(context, {"Q1": "answer"})

        assert isinstance(result, dict)
| 346 | |
| 347 | |
| 348 | # --------------------------------------------------------------------------- |
| 349 | # Skill metadata |
| 350 | # --------------------------------------------------------------------------- |
| 351 | |
| 352 | |
class TestSkillMetadata:
    """Each skill exposes the expected ``name`` and execution gating."""

    def test_project_plan_name(self):
        from video_processor.agent.skills.project_plan import ProjectPlanSkill

        assert ProjectPlanSkill.name == "project_plan"

    def test_prd_name(self):
        from video_processor.agent.skills.prd import PRDSkill

        assert PRDSkill.name == "prd"

    def test_roadmap_name(self):
        from video_processor.agent.skills.roadmap import RoadmapSkill

        assert RoadmapSkill.name == "roadmap"

    def test_task_breakdown_name(self):
        from video_processor.agent.skills.task_breakdown import TaskBreakdownSkill

        assert TaskBreakdownSkill.name == "task_breakdown"

    def test_doc_generator_name(self):
        from video_processor.agent.skills.doc_generator import DocGeneratorSkill

        assert DocGeneratorSkill.name == "doc_generator"

    def test_requirements_chat_name(self):
        from video_processor.agent.skills.requirements_chat import RequirementsChatSkill

        assert RequirementsChatSkill.name == "requirements_chat"

    def test_can_execute_with_context(self):
        from video_processor.agent.skills.project_plan import ProjectPlanSkill

        # A fully populated context satisfies the gate.
        assert ProjectPlanSkill().can_execute(_make_context()) is True

    def test_can_execute_without_kg(self):
        from video_processor.agent.skills.project_plan import ProjectPlanSkill

        ctx = _make_context()
        ctx.knowledge_graph = None
        assert ProjectPlanSkill().can_execute(ctx) is False

    def test_can_execute_without_provider(self):
        from video_processor.agent.skills.project_plan import ProjectPlanSkill

        ctx = _make_context()
        ctx.provider_manager = None
        assert ProjectPlanSkill().can_execute(ctx) is False
| 406 | |
| 407 | |
| 408 | # --------------------------------------------------------------------------- |
| 409 | # WikiGeneratorSkill |
| 410 | # --------------------------------------------------------------------------- |
| 411 | |
| 412 | |
class TestWikiGeneratorSkill:
    """Wiki page generation from knowledge-graph data."""

    def _sample_kg_data(self):
        """Three entities and two relationships — enough for every page type."""
        return {
            "nodes": [
                {"name": "Python", "type": "technology", "descriptions": ["A programming language"]},
                {"name": "Alice", "type": "person", "descriptions": ["Lead developer"]},
                {"name": "FastAPI", "type": "technology", "descriptions": ["Web framework"]},
            ],
            "relationships": [
                {"source": "Alice", "target": "Python", "type": "uses"},
                {"source": "FastAPI", "target": "Python", "type": "built_with"},
            ],
        }

    def test_generate_wiki(self):
        from video_processor.agent.skills.wiki_generator import generate_wiki

        pages = generate_wiki(self._sample_kg_data(), title="Test Wiki")

        assert "Home" in pages
        assert "_Sidebar" in pages
        home = pages["Home"]
        assert "Test Wiki" in home
        assert "3" in home  # 3 entities
        assert "2" in home  # 2 relationships

        # One page per entity, plus one index page per entity type.
        for page_name in ("Python", "Alice", "FastAPI", "Technology", "Person"):
            assert page_name in pages

        # Alice's page should reference Python via her "uses" relationship.
        alice_page = pages["Alice"]
        assert "Python" in alice_page
        assert "uses" in alice_page

    def test_generate_wiki_with_artifacts(self):
        from video_processor.agent.skills.wiki_generator import generate_wiki

        plan = Artifact(
            name="Project Plan",
            content="# Plan\n\nDo the thing.",
            artifact_type="project_plan",
            format="markdown",
        )
        pages = generate_wiki(self._sample_kg_data(), artifacts=[plan])

        assert "Project-Plan" in pages
        assert "Do the thing." in pages["Project-Plan"]
        assert "Planning Artifacts" in pages["Home"]

    def test_write_wiki(self, tmp_path):
        from video_processor.agent.skills.wiki_generator import write_wiki

        wiki_dir = tmp_path / "wiki"
        written = write_wiki(
            {"Home": "# Home\n\nWelcome.", "Page-One": "# Page One\n\nContent."},
            wiki_dir,
        )

        assert len(written) == 2
        assert (wiki_dir / "Home.md").exists()
        assert (wiki_dir / "Page-One.md").exists()
        assert "Welcome." in (wiki_dir / "Home.md").read_text()

    def test_sanitize_filename(self):
        from video_processor.agent.skills.wiki_generator import _sanitize_filename

        expectations = {
            "Hello World": "Hello-World",
            "path/to\\file": "path-to-file",
            "version.2": "version-2",
        }
        for raw, cleaned in expectations.items():
            assert _sanitize_filename(raw) == cleaned

    def test_wiki_link(self):
        from video_processor.agent.skills.wiki_generator import _wiki_link

        assert _wiki_link("My Page") == "[My Page](My-Page)"
        assert _wiki_link("Simple") == "[Simple](Simple)"
| 507 | |
| 508 | |
| 509 | # --------------------------------------------------------------------------- |
| 510 | # NotesExportSkill |
| 511 | # --------------------------------------------------------------------------- |
| 512 | |
| 513 | |
class TestNotesExportSkill:
    """Exports of knowledge-graph data to Obsidian and Notion formats."""

    def _sample_kg_data(self):
        """Two entities joined by a single "uses" relationship."""
        return {
            "nodes": [
                {"name": "Python", "type": "technology", "descriptions": ["A programming language"]},
                {"name": "Alice", "type": "person", "descriptions": ["Lead developer"]},
            ],
            "relationships": [
                {"source": "Alice", "target": "Python", "type": "uses"},
            ],
        }

    def test_export_to_obsidian(self, tmp_path):
        from video_processor.agent.skills.notes_export import export_to_obsidian

        vault = tmp_path / "obsidian_vault"
        export_to_obsidian(self._sample_kg_data(), vault)

        assert vault.is_dir()

        # One note per entity.
        python_note = vault / "Python.md"
        alice_note = vault / "Alice.md"
        assert python_note.exists()
        assert alice_note.exists()

        # Entity notes carry YAML frontmatter and a heading.
        python_text = python_note.read_text()
        assert "---" in python_text
        assert "type: technology" in python_text
        assert "# Python" in python_text

        # Relationships render as Obsidian wiki-links.
        alice_text = alice_note.read_text()
        assert "[[Python]]" in alice_text
        assert "uses" in alice_text

        # The vault index links every entity.
        index_note = vault / "_Index.md"
        assert index_note.exists()
        index_text = index_note.read_text()
        assert "[[Python]]" in index_text
        assert "[[Alice]]" in index_text

    def test_export_to_obsidian_with_artifacts(self, tmp_path):
        from video_processor.agent.skills.notes_export import export_to_obsidian

        plan = Artifact(
            name="Test Plan",
            content="# Plan\n\nSteps here.",
            artifact_type="project_plan",
            format="markdown",
        )
        vault = tmp_path / "obsidian_arts"
        export_to_obsidian(self._sample_kg_data(), vault, artifacts=[plan])

        note = vault / "Test Plan.md"
        assert note.exists()
        note_text = note.read_text()
        assert "artifact" in note_text
        assert "Steps here." in note_text

    def test_export_to_notion_md(self, tmp_path):
        from video_processor.agent.skills.notes_export import export_to_notion_md

        export_dir = tmp_path / "notion_export"
        export_to_notion_md(self._sample_kg_data(), export_dir)

        assert export_dir.is_dir()

        # The CSV database enumerates entities with Name/Type columns.
        csv_file = export_dir / "entities_database.csv"
        assert csv_file.exists()
        csv_text = csv_file.read_text()
        for needle in ("Name", "Type", "Python", "Alice"):
            assert needle in csv_text

        # Per-entity markdown pages.
        python_page = export_dir / "Python.md"
        assert python_page.exists()
        page_text = python_page.read_text()
        assert "# Python" in page_text
        assert "technology" in page_text

        # Top-level overview page.
        assert (export_dir / "Overview.md").exists()

    def test_export_to_notion_md_with_artifacts(self, tmp_path):
        from video_processor.agent.skills.notes_export import export_to_notion_md

        roadmap = Artifact(
            name="Roadmap",
            content="# Roadmap\n\nQ1 goals.",
            artifact_type="roadmap",
            format="markdown",
        )
        export_dir = tmp_path / "notion_arts"
        export_to_notion_md(self._sample_kg_data(), export_dir, artifacts=[roadmap])

        art_page = export_dir / "Roadmap.md"
        assert art_page.exists()
        assert "Q1 goals." in art_page.read_text()
+114
| --- a/tests/test_api_spec.py | ||
| +++ b/tests/test_api_spec.py | ||
| @@ -0,0 +1,114 @@ | ||
| 1 | +"""Tests for video_processor.api.openapi_spec.""" | |
| 2 | + | |
| 3 | +from video_processor.api.openapi_spec import get_openapi_spec | |
| 4 | + | |
| 5 | + | |
def test_returns_dict():
    """The spec builder produces a plain dictionary."""
    assert isinstance(get_openapi_spec(), dict)


def test_has_top_level_keys():
    """All four required OpenAPI sections are present."""
    spec = get_openapi_spec()
    for key in ("openapi", "info", "paths", "components"):
        assert key in spec, f"Missing top-level key: {key}"


def test_openapi_version():
    """The document declares an OpenAPI 3.0.x version."""
    assert get_openapi_spec()["openapi"].startswith("3.0")


def test_info_section():
    """The info section carries a title/version and brands PlanOpticon."""
    info = get_openapi_spec()["info"]
    assert "title" in info
    assert "version" in info
    assert "PlanOpticon" in info["title"]


def test_expected_paths():
    """Every documented endpoint path exists in the spec."""
    paths = get_openapi_spec()["paths"]
    for path in (
        "/analyze",
        "/jobs/{id}",
        "/knowledge-graph/{id}/entities",
        "/knowledge-graph/{id}/relationships",
        "/knowledge-graph/{id}/query",
    ):
        assert path in paths, f"Missing path: {path}"
| 41 | + | |
| 42 | + | |
def test_analyze_endpoint():
    """POST /analyze documents a request body and a 202 response."""
    analyze = get_openapi_spec()["paths"]["/analyze"]
    assert "post" in analyze
    post_op = analyze["post"]
    for section in ("summary", "requestBody", "responses"):
        assert section in post_op
    assert "202" in post_op["responses"]


def test_jobs_endpoint():
    """GET /jobs/{id} takes an ``id`` path parameter."""
    jobs = get_openapi_spec()["paths"]["/jobs/{id}"]
    assert "get" in jobs
    get_op = jobs["get"]
    assert "parameters" in get_op
    assert get_op["parameters"][0]["name"] == "id"


def test_entities_endpoint():
    """GET is defined for the entity listing."""
    assert "get" in get_openapi_spec()["paths"]["/knowledge-graph/{id}/entities"]


def test_relationships_endpoint():
    """GET is defined for the relationship listing."""
    assert "get" in get_openapi_spec()["paths"]["/knowledge-graph/{id}/relationships"]


def test_query_endpoint():
    """The query endpoint accepts a ``q`` parameter."""
    query = get_openapi_spec()["paths"]["/knowledge-graph/{id}/query"]
    assert "get" in query
    names = [param["name"] for param in query["get"]["parameters"]]
    assert "q" in names
| 82 | + | |
| 83 | + | |
def test_component_schemas():
    """Job, Entity and Relationship schemas are declared."""
    schemas = get_openapi_spec()["components"]["schemas"]
    for schema_name in ("Job", "Entity", "Relationship"):
        assert schema_name in schemas, f"Missing schema: {schema_name}"


def test_job_schema_properties():
    """The Job schema exposes id, status and progress."""
    props = get_openapi_spec()["components"]["schemas"]["Job"]["properties"]
    for prop in ("id", "status", "progress"):
        assert prop in props


def test_job_status_enum():
    """Job.status is an enum covering at least pending and completed."""
    status = get_openapi_spec()["components"]["schemas"]["Job"]["properties"]["status"]
    assert "enum" in status
    for value in ("pending", "completed"):
        assert value in status["enum"]


def test_analyze_request_body_schema():
    """video_url is both a declared property and required in the analyze body."""
    body = get_openapi_spec()["paths"]["/analyze"]["post"]["requestBody"]
    schema = body["content"]["application/json"]["schema"]
    assert "video_url" in schema["properties"]
    assert "video_url" in schema["required"]
| --- a/tests/test_api_spec.py |
| +++ b/tests/test_api_spec.py |
| @@ -0,0 +1,114 @@ |
| 1 | """Tests for video_processor.api.openapi_spec.""" |
| 2 | |
| 3 | from video_processor.api.openapi_spec import get_openapi_spec |
| 4 | |
| 5 | |
def test_returns_dict():
    """The spec builder produces a plain dictionary."""
    assert isinstance(get_openapi_spec(), dict)


def test_has_top_level_keys():
    """All four required OpenAPI sections are present."""
    spec = get_openapi_spec()
    for key in ("openapi", "info", "paths", "components"):
        assert key in spec, f"Missing top-level key: {key}"


def test_openapi_version():
    """The document declares an OpenAPI 3.0.x version."""
    assert get_openapi_spec()["openapi"].startswith("3.0")


def test_info_section():
    """The info section carries a title/version and brands PlanOpticon."""
    info = get_openapi_spec()["info"]
    assert "title" in info
    assert "version" in info
    assert "PlanOpticon" in info["title"]


def test_expected_paths():
    """Every documented endpoint path exists in the spec."""
    paths = get_openapi_spec()["paths"]
    for path in (
        "/analyze",
        "/jobs/{id}",
        "/knowledge-graph/{id}/entities",
        "/knowledge-graph/{id}/relationships",
        "/knowledge-graph/{id}/query",
    ):
        assert path in paths, f"Missing path: {path}"
| 41 | |
| 42 | |
def test_analyze_endpoint():
    """POST /analyze documents a request body and a 202 response."""
    analyze = get_openapi_spec()["paths"]["/analyze"]
    assert "post" in analyze
    post_op = analyze["post"]
    for section in ("summary", "requestBody", "responses"):
        assert section in post_op
    assert "202" in post_op["responses"]


def test_jobs_endpoint():
    """GET /jobs/{id} takes an ``id`` path parameter."""
    jobs = get_openapi_spec()["paths"]["/jobs/{id}"]
    assert "get" in jobs
    get_op = jobs["get"]
    assert "parameters" in get_op
    assert get_op["parameters"][0]["name"] == "id"


def test_entities_endpoint():
    """GET is defined for the entity listing."""
    assert "get" in get_openapi_spec()["paths"]["/knowledge-graph/{id}/entities"]


def test_relationships_endpoint():
    """GET is defined for the relationship listing."""
    assert "get" in get_openapi_spec()["paths"]["/knowledge-graph/{id}/relationships"]


def test_query_endpoint():
    """The query endpoint accepts a ``q`` parameter."""
    query = get_openapi_spec()["paths"]["/knowledge-graph/{id}/query"]
    assert "get" in query
    names = [param["name"] for param in query["get"]["parameters"]]
    assert "q" in names
| 82 | |
| 83 | |
def test_component_schemas():
    """Job, Entity and Relationship schemas are declared."""
    schemas = get_openapi_spec()["components"]["schemas"]
    for schema_name in ("Job", "Entity", "Relationship"):
        assert schema_name in schemas, f"Missing schema: {schema_name}"


def test_job_schema_properties():
    """The Job schema exposes id, status and progress."""
    props = get_openapi_spec()["components"]["schemas"]["Job"]["properties"]
    for prop in ("id", "status", "progress"):
        assert prop in props


def test_job_status_enum():
    """Job.status is an enum covering at least pending and completed."""
    status = get_openapi_spec()["components"]["schemas"]["Job"]["properties"]["status"]
    assert "enum" in status
    for value in ("pending", "completed"):
        assert value in status["enum"]


def test_analyze_request_body_schema():
    """video_url is both a declared property and required in the analyze body."""
    body = get_openapi_spec()["paths"]["/analyze"]["post"]["requestBody"]
    schema = body["content"]["application/json"]["schema"]
    assert "video_url" in schema["properties"]
    assert "video_url" in schema["required"]
+309
| --- a/tests/test_auth.py | ||
| +++ b/tests/test_auth.py | ||
| @@ -0,0 +1,309 @@ | ||
| 1 | +"""Tests for the unified auth module.""" | |
| 2 | + | |
| 3 | +import json | |
| 4 | +import os | |
| 5 | +import time | |
| 6 | +from pathlib import Path | |
| 7 | +from unittest.mock import MagicMock, patch | |
| 8 | + | |
| 9 | +from video_processor.auth import ( | |
| 10 | + KNOWN_CONFIGS, | |
| 11 | + AuthConfig, | |
| 12 | + AuthResult, | |
| 13 | + OAuthManager, | |
| 14 | + get_auth_config, | |
| 15 | + get_auth_manager, | |
| 16 | +) | |
| 17 | + | |
| 18 | +# ----------------------------------------------------------------------- | |
| 19 | +# AuthConfig | |
| 20 | +# ----------------------------------------------------------------------- | |
| 21 | + | |
| 22 | + | |
class TestAuthConfig:
    """Resolution of credentials from explicit values, env vars and defaults."""

    def test_basic_creation(self):
        cfg = AuthConfig(service="test")
        assert cfg.service == "test"
        assert cfg.supports_oauth is False

    def test_with_oauth_endpoints(self):
        # Both endpoints present -> OAuth is supported.
        cfg = AuthConfig(
            service="test",
            oauth_authorize_url="https://example.com/auth",
            oauth_token_url="https://example.com/token",
        )
        assert cfg.supports_oauth is True

    def test_resolved_client_id_from_env(self):
        cfg = AuthConfig(service="test", client_id_env="TEST_CLIENT_ID")
        with patch.dict(os.environ, {"TEST_CLIENT_ID": "my-id"}):
            assert cfg.resolved_client_id == "my-id"

    def test_resolved_client_id_explicit(self):
        # An explicit client_id wins over the env-var indirection.
        cfg = AuthConfig(
            service="test",
            client_id="explicit-id",
            client_id_env="TEST_CLIENT_ID",
        )
        assert cfg.resolved_client_id == "explicit-id"

    def test_resolved_api_key(self):
        cfg = AuthConfig(service="test", api_key_env="TEST_API_KEY")
        with patch.dict(os.environ, {"TEST_API_KEY": "sk-123"}):
            assert cfg.resolved_api_key == "sk-123"

    def test_resolved_api_key_empty(self):
        cfg = AuthConfig(service="test", api_key_env="TEST_API_KEY")
        with patch.dict(os.environ, {}, clear=True):
            assert cfg.resolved_api_key is None

    def test_resolved_token_path_default(self):
        # Default token filename is derived from the service name.
        assert AuthConfig(service="zoom").resolved_token_path.name == "zoom_token.json"

    def test_resolved_token_path_custom(self):
        cfg = AuthConfig(service="zoom", token_path=Path("/tmp/custom.json"))
        assert cfg.resolved_token_path == Path("/tmp/custom.json")

    def test_resolved_account_id(self):
        cfg = AuthConfig(service="test", account_id_env="TEST_ACCOUNT_ID")
        with patch.dict(os.environ, {"TEST_ACCOUNT_ID": "acc-123"}):
            assert cfg.resolved_account_id == "acc-123"
| 81 | + | |
| 82 | + | |
| 83 | +# ----------------------------------------------------------------------- | |
| 84 | +# AuthResult | |
| 85 | +# ----------------------------------------------------------------------- | |
| 86 | + | |
| 87 | + | |
class TestAuthResult:
    """Shape of successful and failed authentication results."""

    def test_success(self):
        res = AuthResult(success=True, access_token="tok", method="api_key")
        assert res.success
        assert res.access_token == "tok"

    def test_failure(self):
        res = AuthResult(success=False, error="no key")
        assert not res.success
        assert res.error == "no key"
| 102 | + | |
| 103 | + | |
| 104 | +# ----------------------------------------------------------------------- | |
| 105 | +# OAuthManager | |
| 106 | +# ----------------------------------------------------------------------- | |
| 107 | + | |
| 108 | + | |
class TestOAuthManager:
    """Authentication flow: saved token, refresh, then API-key fallback."""

    @staticmethod
    def _write_token(path, payload):
        """Drop a JSON token file for the manager to pick up."""
        path.write_text(json.dumps(payload))

    def test_api_key_fallback(self):
        manager = OAuthManager(AuthConfig(service="test", api_key_env="TEST_KEY"))
        with patch.dict(os.environ, {"TEST_KEY": "sk-abc"}):
            result = manager.authenticate()
        assert result.success
        assert result.access_token == "sk-abc"
        assert result.method == "api_key"

    def test_no_auth_available(self):
        manager = OAuthManager(AuthConfig(service="test"))
        with patch.dict(os.environ, {}, clear=True):
            result = manager.authenticate()
        assert not result.success
        assert "No auth method" in result.error

    def test_saved_token_valid(self, tmp_path):
        token_file = tmp_path / "token.json"
        self._write_token(
            token_file,
            {"access_token": "saved-tok", "expires_at": time.time() + 3600},
        )
        manager = OAuthManager(AuthConfig(service="test", token_path=token_file))
        result = manager.authenticate()
        assert result.success
        assert result.access_token == "saved-tok"
        assert result.method == "saved_token"

    def test_saved_token_expired_no_refresh(self, tmp_path):
        token_file = tmp_path / "token.json"
        self._write_token(
            token_file,
            {"access_token": "old-tok", "expires_at": time.time() - 100},
        )
        manager = OAuthManager(AuthConfig(service="test", token_path=token_file))
        # Expired with no refresh token or token URL: authentication fails.
        assert not manager.authenticate().success

    def test_get_token_convenience(self):
        manager = OAuthManager(AuthConfig(service="test", api_key_env="TEST_KEY"))
        with patch.dict(os.environ, {"TEST_KEY": "sk-xyz"}):
            assert manager.get_token() == "sk-xyz"

    def test_get_token_none_on_failure(self):
        manager = OAuthManager(AuthConfig(service="test"))
        with patch.dict(os.environ, {}, clear=True):
            assert manager.get_token() is None

    def test_clear_token(self, tmp_path):
        token_file = tmp_path / "token.json"
        token_file.write_text("{}")
        OAuthManager(AuthConfig(service="test", token_path=token_file)).clear_token()
        assert not token_file.exists()

    def test_clear_token_no_file(self, tmp_path):
        manager = OAuthManager(
            AuthConfig(service="test", token_path=tmp_path / "nonexistent.json")
        )
        manager.clear_token()  # should not raise

    def test_save_token_creates_dir(self, tmp_path):
        nested = tmp_path / "deep" / "dir" / "token.json"
        manager = OAuthManager(AuthConfig(service="test", token_path=nested))
        manager._save_token({"access_token": "tok"})
        assert nested.exists()
        assert json.loads(nested.read_text())["access_token"] == "tok"

    def test_saved_token_expired_with_refresh(self, tmp_path):
        token_file = tmp_path / "token.json"
        self._write_token(
            token_file,
            {
                "access_token": "old-tok",
                "refresh_token": "ref-tok",
                "expires_at": time.time() - 100,
                "client_id": "cid",
                "client_secret": "csec",
            },
        )
        manager = OAuthManager(
            AuthConfig(
                service="test",
                oauth_token_url="https://example.com/token",
                token_path=token_file,
            )
        )

        refresh_resp = MagicMock()
        refresh_resp.json.return_value = {
            "access_token": "new-tok",
            "refresh_token": "new-ref",
            "expires_in": 7200,
        }
        refresh_resp.raise_for_status = MagicMock()

        with patch("requests.post", return_value=refresh_resp):
            result = manager.authenticate()

        assert result.success
        assert result.access_token == "new-tok"
        assert result.method == "saved_token"

    def test_oauth_prefers_saved_over_api_key(self, tmp_path):
        """Saved token should be tried before API key fallback."""
        token_file = tmp_path / "token.json"
        self._write_token(
            token_file,
            {"access_token": "saved-tok", "expires_at": time.time() + 3600},
        )
        manager = OAuthManager(
            AuthConfig(service="test", api_key_env="TEST_KEY", token_path=token_file)
        )
        with patch.dict(os.environ, {"TEST_KEY": "api-key"}):
            result = manager.authenticate()

        assert result.access_token == "saved-tok"
        assert result.method == "saved_token"
| 259 | + | |
| 260 | + | |
| 261 | +# ----------------------------------------------------------------------- | |
| 262 | +# Known configs and helpers | |
| 263 | +# ----------------------------------------------------------------------- | |
| 264 | + | |
| 265 | + | |
class TestKnownConfigs:
    """Bundled per-service configs and the module-level lookup helpers."""

    def test_zoom_config(self):
        zoom = KNOWN_CONFIGS["zoom"]
        assert zoom.service == "zoom"
        assert zoom.supports_oauth
        assert zoom.client_id_env == "ZOOM_CLIENT_ID"

    def test_notion_config(self):
        notion = KNOWN_CONFIGS["notion"]
        assert notion.api_key_env == "NOTION_API_KEY"
        assert notion.supports_oauth

    def test_github_config(self):
        github = KNOWN_CONFIGS["github"]
        assert github.api_key_env == "GITHUB_TOKEN"
        assert "repo" in github.scopes

    def test_dropbox_config(self):
        assert KNOWN_CONFIGS["dropbox"].api_key_env == "DROPBOX_ACCESS_TOKEN"

    def test_google_config(self):
        assert KNOWN_CONFIGS["google"].supports_oauth

    def test_microsoft_config(self):
        assert KNOWN_CONFIGS["microsoft"].supports_oauth

    def test_all_configs_have_service(self):
        # Every registry entry's service field matches its dictionary key.
        for name, cfg in KNOWN_CONFIGS.items():
            assert cfg.service == name

    def test_get_auth_config(self):
        assert get_auth_config("zoom") is not None
        assert get_auth_config("nonexistent") is None

    def test_get_auth_manager(self):
        manager = get_auth_manager("zoom")
        assert manager is not None
        assert isinstance(manager, OAuthManager)

    def test_get_auth_manager_unknown(self):
        assert get_auth_manager("nonexistent") is None
| --- a/tests/test_auth.py | |
| +++ b/tests/test_auth.py | |
| @@ -0,0 +1,309 @@ | |
| 1 | """Tests for the unified auth module.""" |
| 2 | |
| 3 | import json |
| 4 | import os |
| 5 | import time |
| 6 | from pathlib import Path |
| 7 | from unittest.mock import MagicMock, patch |
| 8 | |
| 9 | from video_processor.auth import ( |
| 10 | KNOWN_CONFIGS, |
| 11 | AuthConfig, |
| 12 | AuthResult, |
| 13 | OAuthManager, |
| 14 | get_auth_config, |
| 15 | get_auth_manager, |
| 16 | ) |
| 17 | |
| 18 | # ----------------------------------------------------------------------- |
| 19 | # AuthConfig |
| 20 | # ----------------------------------------------------------------------- |
| 21 | |
| 22 | |
class TestAuthConfig:
    """AuthConfig construction, OAuth detection, and env-var resolution."""

    def test_basic_creation(self):
        cfg = AuthConfig(service="test")
        assert cfg.service == "test"
        assert cfg.supports_oauth is False

    def test_with_oauth_endpoints(self):
        # Both OAuth endpoints present -> the config advertises OAuth support.
        cfg = AuthConfig(
            service="test",
            oauth_authorize_url="https://example.com/auth",
            oauth_token_url="https://example.com/token",
        )
        assert cfg.supports_oauth is True

    def test_resolved_client_id_from_env(self):
        cfg = AuthConfig(service="test", client_id_env="TEST_CLIENT_ID")
        with patch.dict(os.environ, {"TEST_CLIENT_ID": "my-id"}):
            assert cfg.resolved_client_id == "my-id"

    def test_resolved_client_id_explicit(self):
        # An explicit client_id takes precedence over the env-var indirection.
        cfg = AuthConfig(
            service="test",
            client_id="explicit-id",
            client_id_env="TEST_CLIENT_ID",
        )
        assert cfg.resolved_client_id == "explicit-id"

    def test_resolved_api_key(self):
        cfg = AuthConfig(service="test", api_key_env="TEST_API_KEY")
        with patch.dict(os.environ, {"TEST_API_KEY": "sk-123"}):
            assert cfg.resolved_api_key == "sk-123"

    def test_resolved_api_key_empty(self):
        cfg = AuthConfig(service="test", api_key_env="TEST_API_KEY")
        with patch.dict(os.environ, {}, clear=True):
            assert cfg.resolved_api_key is None

    def test_resolved_token_path_default(self):
        # Default token file is named after the service.
        assert AuthConfig(service="zoom").resolved_token_path.name == "zoom_token.json"

    def test_resolved_token_path_custom(self):
        custom = Path("/tmp/custom.json")
        cfg = AuthConfig(service="zoom", token_path=custom)
        assert cfg.resolved_token_path == custom

    def test_resolved_account_id(self):
        cfg = AuthConfig(service="test", account_id_env="TEST_ACCOUNT_ID")
        with patch.dict(os.environ, {"TEST_ACCOUNT_ID": "acc-123"}):
            assert cfg.resolved_account_id == "acc-123"
| 81 | |
| 82 | |
| 83 | # ----------------------------------------------------------------------- |
| 84 | # AuthResult |
| 85 | # ----------------------------------------------------------------------- |
| 86 | |
| 87 | |
class TestAuthResult:
    """AuthResult is a plain success/failure record with token metadata."""

    def test_success(self):
        ok = AuthResult(success=True, access_token="tok", method="api_key")
        assert ok.success
        assert ok.access_token == "tok"

    def test_failure(self):
        failed = AuthResult(success=False, error="no key")
        assert not failed.success
        assert failed.error == "no key"
| 102 | |
| 103 | |
| 104 | # ----------------------------------------------------------------------- |
| 105 | # OAuthManager |
| 106 | # ----------------------------------------------------------------------- |
| 107 | |
| 108 | |
class TestOAuthManager:
    """Behaviour of OAuthManager.authenticate() and its token helpers."""

    @staticmethod
    def _write_token(path, payload):
        # Persist a token file in the exact shape the manager reads back.
        path.write_text(json.dumps(payload))

    def test_api_key_fallback(self):
        mgr = OAuthManager(AuthConfig(service="test", api_key_env="TEST_KEY"))
        with patch.dict(os.environ, {"TEST_KEY": "sk-abc"}):
            outcome = mgr.authenticate()
            assert outcome.success
            assert outcome.access_token == "sk-abc"
            assert outcome.method == "api_key"

    def test_no_auth_available(self):
        mgr = OAuthManager(AuthConfig(service="test"))
        with patch.dict(os.environ, {}, clear=True):
            outcome = mgr.authenticate()
            assert not outcome.success
            assert "No auth method" in outcome.error

    def test_saved_token_valid(self, tmp_path):
        token_file = tmp_path / "token.json"
        self._write_token(
            token_file,
            {"access_token": "saved-tok", "expires_at": time.time() + 3600},
        )
        mgr = OAuthManager(AuthConfig(service="test", token_path=token_file))
        outcome = mgr.authenticate()
        assert outcome.success
        assert outcome.access_token == "saved-tok"
        assert outcome.method == "saved_token"

    def test_saved_token_expired_no_refresh(self, tmp_path):
        # Expired token, no refresh_token, no other credentials -> failure.
        token_file = tmp_path / "token.json"
        self._write_token(
            token_file,
            {"access_token": "old-tok", "expires_at": time.time() - 100},
        )
        mgr = OAuthManager(AuthConfig(service="test", token_path=token_file))
        assert not mgr.authenticate().success

    def test_get_token_convenience(self):
        mgr = OAuthManager(AuthConfig(service="test", api_key_env="TEST_KEY"))
        with patch.dict(os.environ, {"TEST_KEY": "sk-xyz"}):
            assert mgr.get_token() == "sk-xyz"

    def test_get_token_none_on_failure(self):
        mgr = OAuthManager(AuthConfig(service="test"))
        with patch.dict(os.environ, {}, clear=True):
            assert mgr.get_token() is None

    def test_clear_token(self, tmp_path):
        token_file = tmp_path / "token.json"
        token_file.write_text("{}")
        OAuthManager(AuthConfig(service="test", token_path=token_file)).clear_token()
        assert not token_file.exists()

    def test_clear_token_no_file(self, tmp_path):
        mgr = OAuthManager(
            AuthConfig(service="test", token_path=tmp_path / "nonexistent.json")
        )
        mgr.clear_token()  # must be a silent no-op, not an error

    def test_save_token_creates_dir(self, tmp_path):
        # _save_token should create intermediate directories as needed.
        nested = tmp_path / "deep" / "dir" / "token.json"
        mgr = OAuthManager(AuthConfig(service="test", token_path=nested))
        mgr._save_token({"access_token": "tok"})
        assert nested.exists()
        assert json.loads(nested.read_text())["access_token"] == "tok"

    def test_saved_token_expired_with_refresh(self, tmp_path):
        # An expired token carrying a refresh_token is refreshed via the token URL.
        token_file = tmp_path / "token.json"
        self._write_token(
            token_file,
            {
                "access_token": "old-tok",
                "refresh_token": "ref-tok",
                "expires_at": time.time() - 100,
                "client_id": "cid",
                "client_secret": "csec",
            },
        )
        mgr = OAuthManager(
            AuthConfig(
                service="test",
                oauth_token_url="https://example.com/token",
                token_path=token_file,
            )
        )

        fake_response = MagicMock()
        fake_response.json.return_value = {
            "access_token": "new-tok",
            "refresh_token": "new-ref",
            "expires_in": 7200,
        }
        fake_response.raise_for_status = MagicMock()

        with patch("requests.post", return_value=fake_response):
            outcome = mgr.authenticate()

        assert outcome.success
        assert outcome.access_token == "new-tok"
        assert outcome.method == "saved_token"

    def test_oauth_prefers_saved_over_api_key(self, tmp_path):
        """A valid saved token wins over the API-key fallback."""
        token_file = tmp_path / "token.json"
        self._write_token(
            token_file,
            {"access_token": "saved-tok", "expires_at": time.time() + 3600},
        )
        mgr = OAuthManager(
            AuthConfig(service="test", api_key_env="TEST_KEY", token_path=token_file)
        )
        with patch.dict(os.environ, {"TEST_KEY": "api-key"}):
            outcome = mgr.authenticate()

        assert outcome.access_token == "saved-tok"
        assert outcome.method == "saved_token"
| 259 | |
| 260 | |
| 261 | # ----------------------------------------------------------------------- |
| 262 | # Known configs and helpers |
| 263 | # ----------------------------------------------------------------------- |
| 264 | |
| 265 | |
class TestKnownConfigs:
    """The KNOWN_CONFIGS registry and the get_auth_* lookup helpers."""

    def test_zoom_config(self):
        cfg = KNOWN_CONFIGS["zoom"]
        assert (cfg.service, cfg.client_id_env) == ("zoom", "ZOOM_CLIENT_ID")
        assert cfg.supports_oauth

    def test_notion_config(self):
        cfg = KNOWN_CONFIGS["notion"]
        assert cfg.api_key_env == "NOTION_API_KEY"
        assert cfg.supports_oauth

    def test_github_config(self):
        cfg = KNOWN_CONFIGS["github"]
        assert cfg.api_key_env == "GITHUB_TOKEN"
        assert "repo" in cfg.scopes

    def test_dropbox_config(self):
        assert KNOWN_CONFIGS["dropbox"].api_key_env == "DROPBOX_ACCESS_TOKEN"

    def test_google_config(self):
        assert KNOWN_CONFIGS["google"].supports_oauth

    def test_microsoft_config(self):
        assert KNOWN_CONFIGS["microsoft"].supports_oauth

    def test_all_configs_have_service(self):
        # Registry keys and config.service must agree for every entry.
        mismatched = [k for k, c in KNOWN_CONFIGS.items() if c.service != k]
        assert mismatched == []

    def test_get_auth_config(self):
        assert get_auth_config("zoom") is not None
        assert get_auth_config("nonexistent") is None

    def test_get_auth_manager(self):
        mgr = get_auth_manager("zoom")
        assert isinstance(mgr, OAuthManager)

    def test_get_auth_manager_unknown(self):
        assert get_auth_manager("nonexistent") is None
+114
| --- a/tests/test_callbacks.py | ||
| +++ b/tests/test_callbacks.py | ||
| @@ -0,0 +1,114 @@ | ||
| 1 | +"""Tests for video_processor.utils.callbacks.WebhookCallback.""" | |
| 2 | + | |
| 3 | +import json | |
| 4 | +from unittest.mock import patch | |
| 5 | + | |
| 6 | +import pytest | |
| 7 | + | |
| 8 | +from video_processor.utils.callbacks import WebhookCallback | |
| 9 | + | |
| 10 | + | |
@pytest.fixture()
def callback():
    """A WebhookCallback aimed at a dummy endpoint (never actually contacted)."""
    return WebhookCallback(url="https://example.com/webhook")


# --- Constructor ---


def test_default_headers():
    hook = WebhookCallback(url="https://example.com/hook")
    assert hook.headers == {"Content-Type": "application/json"}


def test_custom_headers():
    supplied = {"Authorization": "Bearer tok", "Content-Type": "application/json"}
    hook = WebhookCallback(url="https://example.com/hook", headers=supplied)
    assert hook.headers["Authorization"] == "Bearer tok"


def test_custom_timeout():
    assert WebhookCallback(url="https://example.com/hook", timeout=5.0).timeout == 5.0
| 33 | + | |
| 34 | + | |
| 35 | +# --- _post --- | |
| 36 | + | |
| 37 | + | |
@patch("urllib.request.urlopen")
@patch("urllib.request.Request")
def test_post_sends_json_payload(mock_request_cls, mock_urlopen, callback):
    callback._post({"event": "test"})

    mock_request_cls.assert_called_once()
    args, kwargs = mock_request_cls.call_args
    # The JSON body may be passed positionally or via data=...
    body = kwargs["data"] if "data" in kwargs else args[1]
    assert json.loads(body)["event"] == "test"
    mock_urlopen.assert_called_once()


@patch("urllib.request.urlopen", side_effect=Exception("Connection refused"))
@patch("urllib.request.Request")
def test_post_logs_failure_does_not_raise(mock_request_cls, mock_urlopen, callback):
    # Delivery failures are logged, never propagated to the pipeline.
    callback._post({"event": "fail_test"})
| 55 | + | |
| 56 | + | |
| 57 | +# --- on_step_start --- | |
| 58 | + | |
| 59 | + | |
@patch.object(WebhookCallback, "_post")
def test_on_step_start_payload(mock_post, callback):
    callback.on_step_start("transcription", 1, 5)
    mock_post.assert_called_once_with(
        {"event": "step_start", "step": "transcription", "index": 1, "total": 5}
    )


# --- on_step_complete ---


@patch.object(WebhookCallback, "_post")
def test_on_step_complete_payload(mock_post, callback):
    callback.on_step_complete("analysis", 3, 5)
    mock_post.assert_called_once_with(
        {"event": "step_complete", "step": "analysis", "index": 3, "total": 5}
    )
| 89 | + | |
| 90 | + | |
| 91 | +# --- on_progress --- | |
| 92 | + | |
| 93 | + | |
@patch.object(WebhookCallback, "_post")
def test_on_progress_payload(mock_post, callback):
    callback.on_progress("transcription", 42.5, "Processing chunk 3/7")
    mock_post.assert_called_once_with(
        {
            "event": "progress",
            "step": "transcription",
            "percent": 42.5,
            "message": "Processing chunk 3/7",
        }
    )


@patch.object(WebhookCallback, "_post")
def test_on_progress_default_message(mock_post, callback):
    # Omitting the message argument defaults it to an empty string.
    callback.on_progress("extraction", 100.0)
    sent = mock_post.call_args[0][0]
    assert sent["message"] == ""
    assert sent["percent"] == 100.0
| --- a/tests/test_callbacks.py | |
| +++ b/tests/test_callbacks.py | |
| @@ -0,0 +1,114 @@ | |
| 1 | """Tests for video_processor.utils.callbacks.WebhookCallback.""" |
| 2 | |
| 3 | import json |
| 4 | from unittest.mock import patch |
| 5 | |
| 6 | import pytest |
| 7 | |
| 8 | from video_processor.utils.callbacks import WebhookCallback |
| 9 | |
| 10 | |
@pytest.fixture()
def callback():
    """Shared WebhookCallback fixture pointed at a placeholder URL."""
    return WebhookCallback(url="https://example.com/webhook")


# --- Constructor ---


def test_default_headers():
    # With no headers argument, only the JSON content type is set.
    cb = WebhookCallback(url="https://example.com/hook")
    assert cb.headers == {"Content-Type": "application/json"}


def test_custom_headers():
    cb = WebhookCallback(
        url="https://example.com/hook",
        headers={"Authorization": "Bearer tok", "Content-Type": "application/json"},
    )
    assert cb.headers["Authorization"] == "Bearer tok"


def test_custom_timeout():
    cb = WebhookCallback(url="https://example.com/hook", timeout=5.0)
    assert cb.timeout == 5.0
| 33 | |
| 34 | |
| 35 | # --- _post --- |
| 36 | |
| 37 | |
@patch("urllib.request.urlopen")
@patch("urllib.request.Request")
def test_post_sends_json_payload(mock_request_cls, mock_urlopen, callback):
    """_post serializes the payload to JSON and performs exactly one request."""
    callback._post({"event": "test"})

    mock_request_cls.assert_called_once()
    positional, keyword = mock_request_cls.call_args
    raw = keyword.get("data", positional[1] if len(positional) > 1 else None)
    assert json.loads(raw)["event"] == "test"
    mock_urlopen.assert_called_once()


@patch("urllib.request.urlopen", side_effect=Exception("Connection refused"))
@patch("urllib.request.Request")
def test_post_logs_failure_does_not_raise(mock_request_cls, mock_urlopen, callback):
    """A network failure inside _post must be swallowed, not raised."""
    callback._post({"event": "fail_test"})
| 55 | |
| 56 | |
| 57 | # --- on_step_start --- |
| 58 | |
| 59 | |
@patch.object(WebhookCallback, "_post")
def test_on_step_start_payload(mock_post, callback):
    """on_step_start forwards a step_start event with step/index/total."""
    callback.on_step_start("transcription", 1, 5)

    expected = {
        "event": "step_start",
        "step": "transcription",
        "index": 1,
        "total": 5,
    }
    mock_post.assert_called_once_with(expected)


# --- on_step_complete ---


@patch.object(WebhookCallback, "_post")
def test_on_step_complete_payload(mock_post, callback):
    """on_step_complete forwards a step_complete event with step/index/total."""
    callback.on_step_complete("analysis", 3, 5)

    expected = {
        "event": "step_complete",
        "step": "analysis",
        "index": 3,
        "total": 5,
    }
    mock_post.assert_called_once_with(expected)
| 89 | |
| 90 | |
| 91 | # --- on_progress --- |
| 92 | |
| 93 | |
@patch.object(WebhookCallback, "_post")
def test_on_progress_payload(mock_post, callback):
    """on_progress forwards percent and message verbatim."""
    callback.on_progress("transcription", 42.5, "Processing chunk 3/7")

    expected = {
        "event": "progress",
        "step": "transcription",
        "percent": 42.5,
        "message": "Processing chunk 3/7",
    }
    mock_post.assert_called_once_with(expected)


@patch.object(WebhookCallback, "_post")
def test_on_progress_default_message(mock_post, callback):
    """The message parameter defaults to the empty string."""
    callback.on_progress("extraction", 100.0)

    payload = mock_post.call_args[0][0]
    assert (payload["message"], payload["percent"]) == ("", 100.0)
+252
| --- a/tests/test_cli.py | ||
| +++ b/tests/test_cli.py | ||
| @@ -0,0 +1,252 @@ | ||
| 1 | +"""Tests for the CLI commands (help text, version, option presence).""" | |
| 2 | + | |
| 3 | +from click.testing import CliRunner | |
| 4 | + | |
| 5 | +from video_processor.cli.commands import cli | |
| 6 | + | |
| 7 | + | |
class TestCLIRoot:
    """Top-level CLI: version banner and registered command names."""

    def test_version(self):
        result = CliRunner().invoke(cli, ["--version"])
        assert result.exit_code == 0
        assert "PlanOpticon" in result.output
        assert "0.4.0" in result.output  # matches @click.version_option

    def test_help(self):
        result = CliRunner().invoke(cli, ["--help"])
        assert result.exit_code == 0
        expected_tokens = (
            "PlanOpticon",
            "analyze",
            "query",
            "agent",
            "kg",
            "gws",
            "m365",
            "recordings",
            "ingest",
            "batch",
            "--chat",
            "--interactive",
            "companion",
        )
        for token in expected_tokens:
            assert token in result.output
| 33 | + | |
| 34 | + | |
class TestAnalyzeHelp:
    def test_help(self):
        result = CliRunner().invoke(cli, ["analyze", "--help"])
        assert result.exit_code == 0
        # Short or long spellings acceptable for the I/O options.
        assert "--input" in result.output or "-i" in result.output
        assert "--output" in result.output or "-o" in result.output
        for flag in (
            "--depth",
            "--provider",
            "--output-format",
            "--templates-dir",
            "--speakers",
            "--vision-model",
            "--chat-model",
            "--sampling-rate",
            "--change-threshold",
            "--periodic-capture",
        ):
            assert flag in result.output
| 52 | + | |
| 53 | + | |
class TestQueryHelp:
    def test_help(self):
        result = CliRunner().invoke(cli, ["query", "--help"])
        assert result.exit_code == 0
        assert "--db-path" in result.output
        assert "--mode" in result.output
        assert "--format" in result.output
        # Interactive flag may render as long or short form.
        assert "--interactive" in result.output or "-I" in result.output
        for token in ("--provider", "--chat-model", "QUESTION"):
            assert token in result.output
| 66 | + | |
| 67 | + | |
class TestAgentHelp:
    def test_help(self):
        result = CliRunner().invoke(cli, ["agent", "--help"])
        assert result.exit_code == 0
        assert "--kb" in result.output
        assert "--interactive" in result.output or "-I" in result.output
        for token in ("--export", "--provider", "--chat-model", "REQUEST"):
            assert token in result.output
| 79 | + | |
| 80 | + | |
class TestKGHelp:
    """Help output for the knowledge-graph command group."""

    def test_help(self):
        result = CliRunner().invoke(cli, ["kg", "--help"])
        assert result.exit_code == 0
        assert "convert" in result.output
        assert "Knowledge graph" in result.output

    def test_convert_help(self):
        result = CliRunner().invoke(cli, ["kg", "convert", "--help"])
        assert result.exit_code == 0
        for arg in ("SOURCE_PATH", "DEST_PATH"):
            assert arg in result.output
| 95 | + | |
| 96 | + | |
class TestIngestHelp:
    def test_help(self):
        result = CliRunner().invoke(cli, ["ingest", "--help"])
        assert result.exit_code == 0
        assert "INPUT_PATH" in result.output
        assert "--output" in result.output or "-o" in result.output
        for flag in ("--db-path", "--recursive", "--provider", "--chat-model"):
            assert flag in result.output
| 108 | + | |
| 109 | + | |
class TestBatchHelp:
    def test_help(self):
        result = CliRunner().invoke(cli, ["batch", "--help"])
        assert result.exit_code == 0
        assert "--input-dir" in result.output or "-i" in result.output
        assert "--output" in result.output or "-o" in result.output
        for flag in ("--depth", "--pattern", "--source", "--folder-id"):
            assert flag in result.output
| 121 | + | |
| 122 | + | |
class TestAgentAnalyzeHelp:
    def test_help(self):
        result = CliRunner().invoke(cli, ["agent-analyze", "--help"])
        assert result.exit_code == 0
        assert "--input" in result.output or "-i" in result.output
        assert "--output" in result.output or "-o" in result.output
        for flag in ("--depth", "--provider"):
            assert flag in result.output
| 132 | + | |
| 133 | + | |
class TestListModelsHelp:
    def test_help(self):
        result = CliRunner().invoke(cli, ["list-models", "--help"])
        assert result.exit_code == 0
        # Either the verb or the noun from the command description must appear.
        assert "Discover" in result.output or "models" in result.output
| 140 | + | |
| 141 | + | |
class TestClearCacheHelp:
    def test_help(self):
        result = CliRunner().invoke(cli, ["clear-cache", "--help"])
        assert result.exit_code == 0
        for flag in ("--cache-dir", "--older-than", "--all"):
            assert flag in result.output
| 150 | + | |
| 151 | + | |
class TestGWSHelp:
    """Help output for the Google Workspace (gws) command group."""

    def test_help(self):
        result = CliRunner().invoke(cli, ["gws", "--help"])
        assert result.exit_code == 0
        for sub in ("list", "fetch", "ingest"):
            assert sub in result.output

    def test_list_help(self):
        result = CliRunner().invoke(cli, ["gws", "list", "--help"])
        assert result.exit_code == 0
        for flag in ("--folder-id", "--query"):
            assert flag in result.output

    def test_ingest_help(self):
        result = CliRunner().invoke(cli, ["gws", "ingest", "--help"])
        assert result.exit_code == 0
        for flag in ("--folder-id", "--doc-id", "--db-path"):
            assert flag in result.output
| 175 | + | |
| 176 | + | |
class TestM365Help:
    """Help output for the Microsoft 365 (m365) command group."""

    def test_help(self):
        result = CliRunner().invoke(cli, ["m365", "--help"])
        assert result.exit_code == 0
        for sub in ("list", "fetch", "ingest"):
            assert sub in result.output

    def test_list_help(self):
        result = CliRunner().invoke(cli, ["m365", "list", "--help"])
        assert result.exit_code == 0
        for flag in ("--web-url", "--folder-url", "--recursive"):
            assert flag in result.output

    def test_ingest_help(self):
        result = CliRunner().invoke(cli, ["m365", "ingest", "--help"])
        assert result.exit_code == 0
        for flag in ("--web-url", "--file-id", "--db-path"):
            assert flag in result.output
| 201 | + | |
| 202 | + | |
class TestRecordingsHelp:
    """Help output for the recordings command group and its subcommands."""

    def test_group_help(self):
        result = CliRunner().invoke(cli, ["recordings", "--help"])
        assert result.exit_code == 0
        for sub in ("zoom-list", "teams-list", "meet-list"):
            assert sub in result.output

    def test_zoom_list_help(self):
        result = CliRunner().invoke(cli, ["recordings", "zoom-list", "--help"])
        assert result.exit_code == 0
        # The help text documents the required env var.
        assert "ZOOM_CLIENT_ID" in result.output

    def test_teams_list_help(self):
        result = CliRunner().invoke(cli, ["recordings", "teams-list", "--help"])
        assert result.exit_code == 0
        assert "--user-id" in result.output

    def test_meet_list_help(self):
        result = CliRunner().invoke(cli, ["recordings", "meet-list", "--help"])
        assert result.exit_code == 0
        assert "--folder-id" in result.output
| 229 | + | |
| 230 | + | |
class TestAuthHelp:
    def test_help(self):
        result = CliRunner().invoke(cli, ["auth", "--help"])
        assert result.exit_code == 0
        for token in (
            "google",
            "dropbox",
            "zoom",
            "notion",
            "github",
            "microsoft",
            "--logout",
        ):
            assert token in result.output
| 243 | + | |
| 244 | + | |
class TestCompanionHelp:
    def test_help(self):
        result = CliRunner().invoke(cli, ["companion", "--help"])
        assert result.exit_code == 0
        for flag in ("--kb", "--provider", "--chat-model"):
            assert flag in result.output
| --- a/tests/test_cli.py | |
| +++ b/tests/test_cli.py | |
| @@ -0,0 +1,252 @@ | |
| 1 | """Tests for the CLI commands (help text, version, option presence).""" |
| 2 | |
| 3 | from click.testing import CliRunner |
| 4 | |
| 5 | from video_processor.cli.commands import cli |
| 6 | |
| 7 | |
class TestCLIRoot:
    """Root CLI behaviour: --version banner and --help command listing."""

    @staticmethod
    def _run(*args):
        # Small helper so each test reads as a single invocation + assertions.
        return CliRunner().invoke(cli, list(args))

    def test_version(self):
        res = self._run("--version")
        assert res.exit_code == 0
        assert "PlanOpticon" in res.output
        assert "0.4.0" in res.output  # keep in sync with @click.version_option

    def test_help(self):
        res = self._run("--help")
        assert res.exit_code == 0
        assert "PlanOpticon" in res.output
        commands = ("analyze", "query", "agent", "kg", "gws", "m365",
                    "recordings", "ingest", "batch", "companion")
        for name in commands:
            assert name in res.output
        assert "--chat" in res.output
        assert "--interactive" in res.output
| 33 | |
| 34 | |
| 35 | class TestAnalyzeHelp: |
| 36 | def test_help(self): |
| 37 | runner = CliRunner() |
| 38 | result = runner.invoke(cli, ["analyze", "--help"]) |
| 39 | assert result.exit_code == 0 |
| 40 | assert "--input" in result.output or "-i" in result.output |
| 41 | assert "--output" in result.output or "-o" in result.output |
| 42 | assert "--depth" in result.output |
| 43 | assert "--provider" in result.output |
| 44 | assert "--output-format" in result.output |
| 45 | assert "--templates-dir" in result.output |
| 46 | assert "--speakers" in result.output |
| 47 | assert "--vision-model" in result.output |
| 48 | assert "--chat-model" in result.output |
| 49 | assert "--sampling-rate" in result.output |
| 50 | assert "--change-threshold" in result.output |
| 51 | assert "--periodic-capture" in result.output |
| 52 | |
| 53 | |
| 54 | class TestQueryHelp: |
| 55 | def test_help(self): |
| 56 | runner = CliRunner() |
| 57 | result = runner.invoke(cli, ["query", "--help"]) |
| 58 | assert result.exit_code == 0 |
| 59 | assert "--db-path" in result.output |
| 60 | assert "--mode" in result.output |
| 61 | assert "--format" in result.output |
| 62 | assert "--interactive" in result.output or "-I" in result.output |
| 63 | assert "--provider" in result.output |
| 64 | assert "--chat-model" in result.output |
| 65 | assert "QUESTION" in result.output |
| 66 | |
| 67 | |
| 68 | class TestAgentHelp: |
| 69 | def test_help(self): |
| 70 | runner = CliRunner() |
| 71 | result = runner.invoke(cli, ["agent", "--help"]) |
| 72 | assert result.exit_code == 0 |
| 73 | assert "--kb" in result.output |
| 74 | assert "--interactive" in result.output or "-I" in result.output |
| 75 | assert "--export" in result.output |
| 76 | assert "--provider" in result.output |
| 77 | assert "--chat-model" in result.output |
| 78 | assert "REQUEST" in result.output |
| 79 | |
| 80 | |
| 81 | class TestKGHelp: |
| 82 | def test_help(self): |
| 83 | runner = CliRunner() |
| 84 | result = runner.invoke(cli, ["kg", "--help"]) |
| 85 | assert result.exit_code == 0 |
| 86 | assert "convert" in result.output |
| 87 | assert "Knowledge graph" in result.output |
| 88 | |
| 89 | def test_convert_help(self): |
| 90 | runner = CliRunner() |
| 91 | result = runner.invoke(cli, ["kg", "convert", "--help"]) |
| 92 | assert result.exit_code == 0 |
| 93 | assert "SOURCE_PATH" in result.output |
| 94 | assert "DEST_PATH" in result.output |
| 95 | |
| 96 | |
| 97 | class TestIngestHelp: |
| 98 | def test_help(self): |
| 99 | runner = CliRunner() |
| 100 | result = runner.invoke(cli, ["ingest", "--help"]) |
| 101 | assert result.exit_code == 0 |
| 102 | assert "INPUT_PATH" in result.output |
| 103 | assert "--output" in result.output or "-o" in result.output |
| 104 | assert "--db-path" in result.output |
| 105 | assert "--recursive" in result.output |
| 106 | assert "--provider" in result.output |
| 107 | assert "--chat-model" in result.output |
| 108 | |
| 109 | |
| 110 | class TestBatchHelp: |
| 111 | def test_help(self): |
| 112 | runner = CliRunner() |
| 113 | result = runner.invoke(cli, ["batch", "--help"]) |
| 114 | assert result.exit_code == 0 |
| 115 | assert "--input-dir" in result.output or "-i" in result.output |
| 116 | assert "--output" in result.output or "-o" in result.output |
| 117 | assert "--depth" in result.output |
| 118 | assert "--pattern" in result.output |
| 119 | assert "--source" in result.output |
| 120 | assert "--folder-id" in result.output |
| 121 | |
| 122 | |
| 123 | class TestAgentAnalyzeHelp: |
| 124 | def test_help(self): |
| 125 | runner = CliRunner() |
| 126 | result = runner.invoke(cli, ["agent-analyze", "--help"]) |
| 127 | assert result.exit_code == 0 |
| 128 | assert "--input" in result.output or "-i" in result.output |
| 129 | assert "--output" in result.output or "-o" in result.output |
| 130 | assert "--depth" in result.output |
| 131 | assert "--provider" in result.output |
| 132 | |
| 133 | |
| 134 | class TestListModelsHelp: |
| 135 | def test_help(self): |
| 136 | runner = CliRunner() |
| 137 | result = runner.invoke(cli, ["list-models", "--help"]) |
| 138 | assert result.exit_code == 0 |
| 139 | assert "Discover" in result.output or "models" in result.output |
| 140 | |
| 141 | |
| 142 | class TestClearCacheHelp: |
| 143 | def test_help(self): |
| 144 | runner = CliRunner() |
| 145 | result = runner.invoke(cli, ["clear-cache", "--help"]) |
| 146 | assert result.exit_code == 0 |
| 147 | assert "--cache-dir" in result.output |
| 148 | assert "--older-than" in result.output |
| 149 | assert "--all" in result.output |
| 150 | |
| 151 | |
| 152 | class TestGWSHelp: |
| 153 | def test_help(self): |
| 154 | runner = CliRunner() |
| 155 | result = runner.invoke(cli, ["gws", "--help"]) |
| 156 | assert result.exit_code == 0 |
| 157 | assert "list" in result.output |
| 158 | assert "fetch" in result.output |
| 159 | assert "ingest" in result.output |
| 160 | |
| 161 | def test_list_help(self): |
| 162 | runner = CliRunner() |
| 163 | result = runner.invoke(cli, ["gws", "list", "--help"]) |
| 164 | assert result.exit_code == 0 |
| 165 | assert "--folder-id" in result.output |
| 166 | assert "--query" in result.output |
| 167 | |
| 168 | def test_ingest_help(self): |
| 169 | runner = CliRunner() |
| 170 | result = runner.invoke(cli, ["gws", "ingest", "--help"]) |
| 171 | assert result.exit_code == 0 |
| 172 | assert "--folder-id" in result.output |
| 173 | assert "--doc-id" in result.output |
| 174 | assert "--db-path" in result.output |
| 175 | |
| 176 | |
| 177 | class TestM365Help: |
| 178 | def test_help(self): |
| 179 | runner = CliRunner() |
| 180 | result = runner.invoke(cli, ["m365", "--help"]) |
| 181 | assert result.exit_code == 0 |
| 182 | assert "list" in result.output |
| 183 | assert "fetch" in result.output |
| 184 | assert "ingest" in result.output |
| 185 | |
| 186 | def test_list_help(self): |
| 187 | runner = CliRunner() |
| 188 | result = runner.invoke(cli, ["m365", "list", "--help"]) |
| 189 | assert result.exit_code == 0 |
| 190 | assert "--web-url" in result.output |
| 191 | assert "--folder-url" in result.output |
| 192 | assert "--recursive" in result.output |
| 193 | |
| 194 | def test_ingest_help(self): |
| 195 | runner = CliRunner() |
| 196 | result = runner.invoke(cli, ["m365", "ingest", "--help"]) |
| 197 | assert result.exit_code == 0 |
| 198 | assert "--web-url" in result.output |
| 199 | assert "--file-id" in result.output |
| 200 | assert "--db-path" in result.output |
| 201 | |
| 202 | |
| 203 | class TestRecordingsHelp: |
| 204 | def test_group_help(self): |
| 205 | runner = CliRunner() |
| 206 | result = runner.invoke(cli, ["recordings", "--help"]) |
| 207 | assert result.exit_code == 0 |
| 208 | assert "zoom-list" in result.output |
| 209 | assert "teams-list" in result.output |
| 210 | assert "meet-list" in result.output |
| 211 | |
| 212 | def test_zoom_list_help(self): |
| 213 | runner = CliRunner() |
| 214 | result = runner.invoke(cli, ["recordings", "zoom-list", "--help"]) |
| 215 | assert result.exit_code == 0 |
| 216 | assert "ZOOM_CLIENT_ID" in result.output |
| 217 | |
| 218 | def test_teams_list_help(self): |
| 219 | runner = CliRunner() |
| 220 | result = runner.invoke(cli, ["recordings", "teams-list", "--help"]) |
| 221 | assert result.exit_code == 0 |
| 222 | assert "--user-id" in result.output |
| 223 | |
| 224 | def test_meet_list_help(self): |
| 225 | runner = CliRunner() |
| 226 | result = runner.invoke(cli, ["recordings", "meet-list", "--help"]) |
| 227 | assert result.exit_code == 0 |
| 228 | assert "--folder-id" in result.output |
| 229 | |
| 230 | |
| 231 | class TestAuthHelp: |
| 232 | def test_help(self): |
| 233 | runner = CliRunner() |
| 234 | result = runner.invoke(cli, ["auth", "--help"]) |
| 235 | assert result.exit_code == 0 |
| 236 | assert "google" in result.output |
| 237 | assert "dropbox" in result.output |
| 238 | assert "zoom" in result.output |
| 239 | assert "notion" in result.output |
| 240 | assert "github" in result.output |
| 241 | assert "microsoft" in result.output |
| 242 | assert "--logout" in result.output |
| 243 | |
| 244 | |
| 245 | class TestCompanionHelp: |
| 246 | def test_help(self): |
| 247 | runner = CliRunner() |
| 248 | result = runner.invoke(cli, ["companion", "--help"]) |
| 249 | assert result.exit_code == 0 |
| 250 | assert "--kb" in result.output |
| 251 | assert "--provider" in result.output |
| 252 | assert "--chat-model" in result.output |
+162
| --- a/tests/test_companion.py | ||
| +++ b/tests/test_companion.py | ||
| @@ -0,0 +1,162 @@ | ||
| 1 | +"""Tests for the CompanionREPL (without launching the loop).""" | |
| 2 | + | |
| 3 | +from unittest.mock import patch | |
| 4 | + | |
| 5 | +from video_processor.cli.companion import CompanionREPL | |
| 6 | + | |
| 7 | + | |
| 8 | +class TestImport: | |
| 9 | + def test_import(self): | |
| 10 | + from video_processor.cli import companion # noqa: F401 | |
| 11 | + | |
| 12 | + assert hasattr(companion, "CompanionREPL") | |
| 13 | + | |
| 14 | + | |
| 15 | +class TestConstructor: | |
| 16 | + def test_defaults(self): | |
| 17 | + repl = CompanionREPL() | |
| 18 | + assert repl.kg is None | |
| 19 | + assert repl.query_engine is None | |
| 20 | + assert repl.agent is None | |
| 21 | + assert repl.provider_manager is None | |
| 22 | + | |
| 23 | + def test_explicit_args(self): | |
| 24 | + repl = CompanionREPL( | |
| 25 | + kb_paths=["/tmp/fake.db"], | |
| 26 | + provider="openai", | |
| 27 | + chat_model="gpt-4", | |
| 28 | + ) | |
| 29 | + assert repl._kb_paths == ["/tmp/fake.db"] | |
| 30 | + assert repl._provider_name == "openai" | |
| 31 | + assert repl._chat_model == "gpt-4" | |
| 32 | + | |
| 33 | + | |
| 34 | +class TestAutoDiscovery: | |
| 35 | + @patch( | |
| 36 | + "video_processor.integrators.graph_discovery.find_nearest_graph", | |
| 37 | + return_value=None, | |
| 38 | + ) | |
| 39 | + def test_no_graph_found(self, mock_find): | |
| 40 | + repl = CompanionREPL() | |
| 41 | + repl._discover() | |
| 42 | + assert repl.query_engine is None | |
| 43 | + assert repl.kg is None | |
| 44 | + mock_find.assert_called_once() | |
| 45 | + | |
| 46 | + | |
| 47 | +class TestHandleHelp: | |
| 48 | + def test_handle_help(self): | |
| 49 | + repl = CompanionREPL() | |
| 50 | + output = repl.handle_input("/help") | |
| 51 | + assert "Available commands" in output | |
| 52 | + assert "/status" in output | |
| 53 | + assert "/skills" in output | |
| 54 | + assert "/entities" in output | |
| 55 | + assert "/quit" in output | |
| 56 | + | |
| 57 | + | |
| 58 | +class TestHandleStatus: | |
| 59 | + def test_handle_status_no_kg(self): | |
| 60 | + repl = CompanionREPL() | |
| 61 | + output = repl.handle_input("/status") | |
| 62 | + assert "Workspace status" in output | |
| 63 | + assert "not loaded" in output | |
| 64 | + | |
| 65 | + | |
| 66 | +class TestHandleSkills: | |
| 67 | + def test_handle_skills(self): | |
| 68 | + repl = CompanionREPL() | |
| 69 | + output = repl.handle_input("/skills") | |
| 70 | + # Either lists skills or says none registered | |
| 71 | + assert "skills" in output.lower() or "No skills" in output | |
| 72 | + | |
| 73 | + | |
| 74 | +class TestHandleQuit: | |
| 75 | + def test_quit(self): | |
| 76 | + repl = CompanionREPL() | |
| 77 | + assert repl.handle_input("/quit") == "__QUIT__" | |
| 78 | + | |
| 79 | + def test_exit(self): | |
| 80 | + repl = CompanionREPL() | |
| 81 | + assert repl.handle_input("/exit") == "__QUIT__" | |
| 82 | + | |
| 83 | + def test_bare_quit(self): | |
| 84 | + repl = CompanionREPL() | |
| 85 | + assert repl.handle_input("quit") == "__QUIT__" | |
| 86 | + | |
| 87 | + def test_bare_exit(self): | |
| 88 | + repl = CompanionREPL() | |
| 89 | + assert repl.handle_input("exit") == "__QUIT__" | |
| 90 | + | |
| 91 | + def test_bare_bye(self): | |
| 92 | + repl = CompanionREPL() | |
| 93 | + assert repl.handle_input("bye") == "__QUIT__" | |
| 94 | + | |
| 95 | + def test_bare_q(self): | |
| 96 | + repl = CompanionREPL() | |
| 97 | + assert repl.handle_input("q") == "__QUIT__" | |
| 98 | + | |
| 99 | + | |
| 100 | +class TestHandleUnknownSlash: | |
| 101 | + def test_unknown_command(self): | |
| 102 | + repl = CompanionREPL() | |
| 103 | + output = repl.handle_input("/foobar") | |
| 104 | + assert "Unknown command" in output | |
| 105 | + assert "/help" in output | |
| 106 | + | |
| 107 | + | |
| 108 | +class TestHandleChatNoProvider: | |
| 109 | + def test_chat_no_provider(self): | |
| 110 | + repl = CompanionREPL() | |
| 111 | + output = repl.handle_input("What is this project about?") | |
| 112 | + assert "LLM provider" in output or "API" in output | |
| 113 | + # Should not crash | |
| 114 | + | |
| 115 | + | |
| 116 | +class TestHandleEntitiesNoKG: | |
| 117 | + def test_entities_no_kg(self): | |
| 118 | + repl = CompanionREPL() | |
| 119 | + output = repl.handle_input("/entities") | |
| 120 | + assert "No knowledge graph loaded" in output | |
| 121 | + | |
| 122 | + def test_search_no_kg(self): | |
| 123 | + repl = CompanionREPL() | |
| 124 | + output = repl.handle_input("/search python") | |
| 125 | + assert "No knowledge graph loaded" in output | |
| 126 | + | |
| 127 | + def test_neighbors_no_kg(self): | |
| 128 | + repl = CompanionREPL() | |
| 129 | + output = repl.handle_input("/neighbors Alice") | |
| 130 | + assert "No knowledge graph loaded" in output | |
| 131 | + | |
| 132 | + | |
| 133 | +class TestProviderCommand: | |
| 134 | + def test_provider_list(self): | |
| 135 | + repl = CompanionREPL() | |
| 136 | + output = repl.handle_input("/provider") | |
| 137 | + assert "Available providers" in output | |
| 138 | + assert "openai" in output | |
| 139 | + assert "anthropic" in output | |
| 140 | + | |
| 141 | + def test_provider_switch(self): | |
| 142 | + repl = CompanionREPL() | |
| 143 | + output = repl.handle_input("/provider openai") | |
| 144 | + # Will fail to init without key, but shouldn't crash | |
| 145 | + assert "openai" in output.lower() | |
| 146 | + | |
| 147 | + def test_model_show(self): | |
| 148 | + repl = CompanionREPL() | |
| 149 | + output = repl.handle_input("/model") | |
| 150 | + assert "Current model" in output | |
| 151 | + | |
| 152 | + def test_model_switch(self): | |
| 153 | + repl = CompanionREPL() | |
| 154 | + output = repl.handle_input("/model gpt-4o") | |
| 155 | + # Will fail without provider, but shouldn't crash | |
| 156 | + assert "gpt-4o" in output | |
| 157 | + | |
| 158 | + def test_help_includes_provider(self): | |
| 159 | + repl = CompanionREPL() | |
| 160 | + output = repl.handle_input("/help") | |
| 161 | + assert "/provider" in output | |
| 162 | + assert "/model" in output |
| --- a/tests/test_companion.py | |
| +++ b/tests/test_companion.py | |
| @@ -0,0 +1,162 @@ | |
| --- a/tests/test_companion.py | |
| +++ b/tests/test_companion.py | |
| @@ -0,0 +1,162 @@ | |
| 1 | """Tests for the CompanionREPL (without launching the loop).""" |
| 2 | |
| 3 | from unittest.mock import patch |
| 4 | |
| 5 | from video_processor.cli.companion import CompanionREPL |
| 6 | |
| 7 | |
| 8 | class TestImport: |
| 9 | def test_import(self): |
| 10 | from video_processor.cli import companion # noqa: F401 |
| 11 | |
| 12 | assert hasattr(companion, "CompanionREPL") |
| 13 | |
| 14 | |
| 15 | class TestConstructor: |
| 16 | def test_defaults(self): |
| 17 | repl = CompanionREPL() |
| 18 | assert repl.kg is None |
| 19 | assert repl.query_engine is None |
| 20 | assert repl.agent is None |
| 21 | assert repl.provider_manager is None |
| 22 | |
| 23 | def test_explicit_args(self): |
| 24 | repl = CompanionREPL( |
| 25 | kb_paths=["/tmp/fake.db"], |
| 26 | provider="openai", |
| 27 | chat_model="gpt-4", |
| 28 | ) |
| 29 | assert repl._kb_paths == ["/tmp/fake.db"] |
| 30 | assert repl._provider_name == "openai" |
| 31 | assert repl._chat_model == "gpt-4" |
| 32 | |
| 33 | |
| 34 | class TestAutoDiscovery: |
| 35 | @patch( |
| 36 | "video_processor.integrators.graph_discovery.find_nearest_graph", |
| 37 | return_value=None, |
| 38 | ) |
| 39 | def test_no_graph_found(self, mock_find): |
| 40 | repl = CompanionREPL() |
| 41 | repl._discover() |
| 42 | assert repl.query_engine is None |
| 43 | assert repl.kg is None |
| 44 | mock_find.assert_called_once() |
| 45 | |
| 46 | |
| 47 | class TestHandleHelp: |
| 48 | def test_handle_help(self): |
| 49 | repl = CompanionREPL() |
| 50 | output = repl.handle_input("/help") |
| 51 | assert "Available commands" in output |
| 52 | assert "/status" in output |
| 53 | assert "/skills" in output |
| 54 | assert "/entities" in output |
| 55 | assert "/quit" in output |
| 56 | |
| 57 | |
| 58 | class TestHandleStatus: |
| 59 | def test_handle_status_no_kg(self): |
| 60 | repl = CompanionREPL() |
| 61 | output = repl.handle_input("/status") |
| 62 | assert "Workspace status" in output |
| 63 | assert "not loaded" in output |
| 64 | |
| 65 | |
| 66 | class TestHandleSkills: |
| 67 | def test_handle_skills(self): |
| 68 | repl = CompanionREPL() |
| 69 | output = repl.handle_input("/skills") |
| 70 | # Either lists skills or says none registered |
| 71 | assert "skills" in output.lower() or "No skills" in output |
| 72 | |
| 73 | |
| 74 | class TestHandleQuit: |
| 75 | def test_quit(self): |
| 76 | repl = CompanionREPL() |
| 77 | assert repl.handle_input("/quit") == "__QUIT__" |
| 78 | |
| 79 | def test_exit(self): |
| 80 | repl = CompanionREPL() |
| 81 | assert repl.handle_input("/exit") == "__QUIT__" |
| 82 | |
| 83 | def test_bare_quit(self): |
| 84 | repl = CompanionREPL() |
| 85 | assert repl.handle_input("quit") == "__QUIT__" |
| 86 | |
| 87 | def test_bare_exit(self): |
| 88 | repl = CompanionREPL() |
| 89 | assert repl.handle_input("exit") == "__QUIT__" |
| 90 | |
| 91 | def test_bare_bye(self): |
| 92 | repl = CompanionREPL() |
| 93 | assert repl.handle_input("bye") == "__QUIT__" |
| 94 | |
| 95 | def test_bare_q(self): |
| 96 | repl = CompanionREPL() |
| 97 | assert repl.handle_input("q") == "__QUIT__" |
| 98 | |
| 99 | |
| 100 | class TestHandleUnknownSlash: |
| 101 | def test_unknown_command(self): |
| 102 | repl = CompanionREPL() |
| 103 | output = repl.handle_input("/foobar") |
| 104 | assert "Unknown command" in output |
| 105 | assert "/help" in output |
| 106 | |
| 107 | |
| 108 | class TestHandleChatNoProvider: |
| 109 | def test_chat_no_provider(self): |
| 110 | repl = CompanionREPL() |
| 111 | output = repl.handle_input("What is this project about?") |
| 112 | assert "LLM provider" in output or "API" in output |
| 113 | # Should not crash |
| 114 | |
| 115 | |
| 116 | class TestHandleEntitiesNoKG: |
| 117 | def test_entities_no_kg(self): |
| 118 | repl = CompanionREPL() |
| 119 | output = repl.handle_input("/entities") |
| 120 | assert "No knowledge graph loaded" in output |
| 121 | |
| 122 | def test_search_no_kg(self): |
| 123 | repl = CompanionREPL() |
| 124 | output = repl.handle_input("/search python") |
| 125 | assert "No knowledge graph loaded" in output |
| 126 | |
| 127 | def test_neighbors_no_kg(self): |
| 128 | repl = CompanionREPL() |
| 129 | output = repl.handle_input("/neighbors Alice") |
| 130 | assert "No knowledge graph loaded" in output |
| 131 | |
| 132 | |
| 133 | class TestProviderCommand: |
| 134 | def test_provider_list(self): |
| 135 | repl = CompanionREPL() |
| 136 | output = repl.handle_input("/provider") |
| 137 | assert "Available providers" in output |
| 138 | assert "openai" in output |
| 139 | assert "anthropic" in output |
| 140 | |
| 141 | def test_provider_switch(self): |
| 142 | repl = CompanionREPL() |
| 143 | output = repl.handle_input("/provider openai") |
| 144 | # Will fail to init without key, but shouldn't crash |
| 145 | assert "openai" in output.lower() |
| 146 | |
| 147 | def test_model_show(self): |
| 148 | repl = CompanionREPL() |
| 149 | output = repl.handle_input("/model") |
| 150 | assert "Current model" in output |
| 151 | |
| 152 | def test_model_switch(self): |
| 153 | repl = CompanionREPL() |
| 154 | output = repl.handle_input("/model gpt-4o") |
| 155 | # Will fail without provider, but shouldn't crash |
| 156 | assert "gpt-4o" in output |
| 157 | |
| 158 | def test_help_includes_provider(self): |
| 159 | repl = CompanionREPL() |
| 160 | output = repl.handle_input("/help") |
| 161 | assert "/provider" in output |
| 162 | assert "/model" in output |
+224
| --- a/tests/test_exchange.py | ||
| +++ b/tests/test_exchange.py | ||
| @@ -0,0 +1,224 @@ | ||
| 1 | +"""Tests for the PlanOpticonExchange interchange format.""" | |
| 2 | + | |
| 3 | +import json | |
| 4 | + | |
| 5 | +from video_processor.exchange import ( | |
| 6 | + ArtifactMeta, | |
| 7 | + PlanOpticonExchange, | |
| 8 | + ProjectMeta, | |
| 9 | +) | |
| 10 | +from video_processor.models import Entity, Relationship, SourceRecord | |
| 11 | + | |
| 12 | +# ------------------------------------------------------------------ | |
| 13 | +# Fixtures | |
| 14 | +# ------------------------------------------------------------------ | |
| 15 | + | |
| 16 | + | |
| 17 | +def _sample_entity(name: str = "Python", etype: str = "technology"): | |
| 18 | + return Entity( | |
| 19 | + name=name, | |
| 20 | + type=etype, | |
| 21 | + descriptions=["A programming language"], | |
| 22 | + ) | |
| 23 | + | |
| 24 | + | |
| 25 | +def _sample_relationship(): | |
| 26 | + return Relationship( | |
| 27 | + source="Alice", | |
| 28 | + target="Python", | |
| 29 | + type="uses", | |
| 30 | + ) | |
| 31 | + | |
| 32 | + | |
| 33 | +def _sample_source(): | |
| 34 | + return SourceRecord( | |
| 35 | + source_id="src-1", | |
| 36 | + source_type="video", | |
| 37 | + title="Intro recording", | |
| 38 | + ) | |
| 39 | + | |
| 40 | + | |
| 41 | +def _sample_artifact(): | |
| 42 | + return ArtifactMeta( | |
| 43 | + name="roadmap", | |
| 44 | + content="# Roadmap\n- Phase 1", | |
| 45 | + artifact_type="roadmap", | |
| 46 | + format="markdown", | |
| 47 | + ) | |
| 48 | + | |
| 49 | + | |
| 50 | +def _sample_project(): | |
| 51 | + return ProjectMeta(name="TestProject", description="A test") | |
| 52 | + | |
| 53 | + | |
| 54 | +# ------------------------------------------------------------------ | |
| 55 | +# Tests | |
| 56 | +# ------------------------------------------------------------------ | |
| 57 | + | |
| 58 | + | |
| 59 | +def test_create_empty_exchange(): | |
| 60 | + ex = PlanOpticonExchange(project=_sample_project()) | |
| 61 | + assert ex.version == "1.0" | |
| 62 | + assert ex.entities == [] | |
| 63 | + assert ex.relationships == [] | |
| 64 | + assert ex.artifacts == [] | |
| 65 | + assert ex.sources == [] | |
| 66 | + assert ex.project.name == "TestProject" | |
| 67 | + | |
| 68 | + | |
| 69 | +def test_create_with_data(): | |
| 70 | + ex = PlanOpticonExchange( | |
| 71 | + project=_sample_project(), | |
| 72 | + entities=[_sample_entity()], | |
| 73 | + relationships=[_sample_relationship()], | |
| 74 | + artifacts=[_sample_artifact()], | |
| 75 | + sources=[_sample_source()], | |
| 76 | + ) | |
| 77 | + assert len(ex.entities) == 1 | |
| 78 | + assert ex.entities[0].name == "Python" | |
| 79 | + assert len(ex.relationships) == 1 | |
| 80 | + assert len(ex.artifacts) == 1 | |
| 81 | + assert len(ex.sources) == 1 | |
| 82 | + | |
| 83 | + | |
| 84 | +def test_json_roundtrip(tmp_path): | |
| 85 | + original = PlanOpticonExchange( | |
| 86 | + project=_sample_project(), | |
| 87 | + entities=[_sample_entity()], | |
| 88 | + relationships=[_sample_relationship()], | |
| 89 | + artifacts=[_sample_artifact()], | |
| 90 | + sources=[_sample_source()], | |
| 91 | + ) | |
| 92 | + out = tmp_path / "exchange.json" | |
| 93 | + original.to_file(out) | |
| 94 | + | |
| 95 | + assert out.exists() | |
| 96 | + loaded = PlanOpticonExchange.from_file(out) | |
| 97 | + assert loaded.project.name == original.project.name | |
| 98 | + assert len(loaded.entities) == 1 | |
| 99 | + assert loaded.entities[0].name == "Python" | |
| 100 | + assert len(loaded.relationships) == 1 | |
| 101 | + assert len(loaded.artifacts) == 1 | |
| 102 | + assert len(loaded.sources) == 1 | |
| 103 | + | |
| 104 | + # Verify valid JSON on disk | |
| 105 | + raw = json.loads(out.read_text()) | |
| 106 | + assert raw["version"] == "1.0" | |
| 107 | + | |
| 108 | + | |
| 109 | +def test_json_schema_export(): | |
| 110 | + schema = PlanOpticonExchange.json_schema() | |
| 111 | + assert isinstance(schema, dict) | |
| 112 | + assert "properties" in schema | |
| 113 | + assert "version" in schema["properties"] | |
| 114 | + assert "project" in schema["properties"] | |
| 115 | + assert "entities" in schema["properties"] | |
| 116 | + | |
| 117 | + | |
| 118 | +def test_from_knowledge_graph(): | |
| 119 | + kg_dict = { | |
| 120 | + "nodes": [ | |
| 121 | + { | |
| 122 | + "id": "python", | |
| 123 | + "name": "Python", | |
| 124 | + "type": "technology", | |
| 125 | + "descriptions": ["A language"], | |
| 126 | + "occurrences": [], | |
| 127 | + }, | |
| 128 | + { | |
| 129 | + "id": "alice", | |
| 130 | + "name": "Alice", | |
| 131 | + "type": "person", | |
| 132 | + "descriptions": ["Engineer"], | |
| 133 | + "occurrences": [], | |
| 134 | + }, | |
| 135 | + ], | |
| 136 | + "relationships": [ | |
| 137 | + { | |
| 138 | + "source": "Alice", | |
| 139 | + "target": "Python", | |
| 140 | + "type": "uses", | |
| 141 | + }, | |
| 142 | + ], | |
| 143 | + "sources": [ | |
| 144 | + { | |
| 145 | + "source_id": "s1", | |
| 146 | + "source_type": "video", | |
| 147 | + "title": "Recording", | |
| 148 | + }, | |
| 149 | + ], | |
| 150 | + } | |
| 151 | + | |
| 152 | + ex = PlanOpticonExchange.from_knowledge_graph( | |
| 153 | + kg_dict, | |
| 154 | + project_name="Demo", | |
| 155 | + tags=["test"], | |
| 156 | + ) | |
| 157 | + assert ex.project.name == "Demo" | |
| 158 | + assert len(ex.entities) == 2 | |
| 159 | + assert len(ex.relationships) == 1 | |
| 160 | + assert len(ex.sources) == 1 | |
| 161 | + assert "test" in ex.project.tags | |
| 162 | + | |
| 163 | + | |
| 164 | +def test_merge_deduplicates_entities(): | |
| 165 | + ex1 = PlanOpticonExchange( | |
| 166 | + project=_sample_project(), | |
| 167 | + entities=[_sample_entity("Python"), _sample_entity("Rust")], | |
| 168 | + relationships=[_sample_relationship()], | |
| 169 | + sources=[_sample_source()], | |
| 170 | + ) | |
| 171 | + ex2 = PlanOpticonExchange( | |
| 172 | + project=ProjectMeta(name="Other"), | |
| 173 | + entities=[ | |
| 174 | + _sample_entity("Python"), # duplicate | |
| 175 | + _sample_entity("Go"), # new | |
| 176 | + ], | |
| 177 | + relationships=[ | |
| 178 | + Relationship(source="Bob", target="Go", type="uses"), | |
| 179 | + ], | |
| 180 | + sources=[ | |
| 181 | + SourceRecord( | |
| 182 | + source_id="src-2", | |
| 183 | + source_type="document", | |
| 184 | + title="Notes", | |
| 185 | + ), | |
| 186 | + ], | |
| 187 | + ) | |
| 188 | + | |
| 189 | + ex1.merge(ex2) | |
| 190 | + | |
| 191 | + entity_names = [e.name for e in ex1.entities] | |
| 192 | + assert entity_names.count("Python") == 1 | |
| 193 | + assert "Go" in entity_names | |
| 194 | + assert "Rust" in entity_names | |
| 195 | + assert len(ex1.entities) == 3 | |
| 196 | + assert len(ex1.relationships) == 2 | |
| 197 | + assert len(ex1.sources) == 2 | |
| 198 | + | |
| 199 | + | |
| 200 | +def test_version_field(): | |
| 201 | + ex = PlanOpticonExchange( | |
| 202 | + version="2.0", | |
| 203 | + project=_sample_project(), | |
| 204 | + ) | |
| 205 | + assert ex.version == "2.0" | |
| 206 | + | |
| 207 | + | |
| 208 | +def test_artifact_meta_model(): | |
| 209 | + art = ArtifactMeta( | |
| 210 | + name="plan", | |
| 211 | + content="# Plan\nDo stuff", | |
| 212 | + artifact_type="project_plan", | |
| 213 | + format="markdown", | |
| 214 | + metadata={"author": "agent"}, | |
| 215 | + ) | |
| 216 | + assert art.name == "plan" | |
| 217 | + assert art.artifact_type == "project_plan" | |
| 218 | + assert art.format == "markdown" | |
| 219 | + assert art.metadata == {"author": "agent"} | |
| 220 | + | |
| 221 | + # Roundtrip via dict | |
| 222 | + d = art.model_dump() | |
| 223 | + restored = ArtifactMeta.model_validate(d) | |
| 224 | + assert restored == art |
| --- a/tests/test_exchange.py | |
| +++ b/tests/test_exchange.py | |
| @@ -0,0 +1,224 @@ | |
| --- a/tests/test_exchange.py | |
| +++ b/tests/test_exchange.py | |
| @@ -0,0 +1,224 @@ | |
| 1 | """Tests for the PlanOpticonExchange interchange format.""" |
| 2 | |
| 3 | import json |
| 4 | |
| 5 | from video_processor.exchange import ( |
| 6 | ArtifactMeta, |
| 7 | PlanOpticonExchange, |
| 8 | ProjectMeta, |
| 9 | ) |
| 10 | from video_processor.models import Entity, Relationship, SourceRecord |
| 11 | |
| 12 | # ------------------------------------------------------------------ |
| 13 | # Fixtures |
| 14 | # ------------------------------------------------------------------ |
| 15 | |
| 16 | |
| 17 | def _sample_entity(name: str = "Python", etype: str = "technology"): |
| 18 | return Entity( |
| 19 | name=name, |
| 20 | type=etype, |
| 21 | descriptions=["A programming language"], |
| 22 | ) |
| 23 | |
| 24 | |
| 25 | def _sample_relationship(): |
| 26 | return Relationship( |
| 27 | source="Alice", |
| 28 | target="Python", |
| 29 | type="uses", |
| 30 | ) |
| 31 | |
| 32 | |
| 33 | def _sample_source(): |
| 34 | return SourceRecord( |
| 35 | source_id="src-1", |
| 36 | source_type="video", |
| 37 | title="Intro recording", |
| 38 | ) |
| 39 | |
| 40 | |
| 41 | def _sample_artifact(): |
| 42 | return ArtifactMeta( |
| 43 | name="roadmap", |
| 44 | content="# Roadmap\n- Phase 1", |
| 45 | artifact_type="roadmap", |
| 46 | format="markdown", |
| 47 | ) |
| 48 | |
| 49 | |
| 50 | def _sample_project(): |
| 51 | return ProjectMeta(name="TestProject", description="A test") |
| 52 | |
| 53 | |
| 54 | # ------------------------------------------------------------------ |
| 55 | # Tests |
| 56 | # ------------------------------------------------------------------ |
| 57 | |
| 58 | |
| 59 | def test_create_empty_exchange(): |
| 60 | ex = PlanOpticonExchange(project=_sample_project()) |
| 61 | assert ex.version == "1.0" |
| 62 | assert ex.entities == [] |
| 63 | assert ex.relationships == [] |
| 64 | assert ex.artifacts == [] |
| 65 | assert ex.sources == [] |
| 66 | assert ex.project.name == "TestProject" |
| 67 | |
| 68 | |
| 69 | def test_create_with_data(): |
| 70 | ex = PlanOpticonExchange( |
| 71 | project=_sample_project(), |
| 72 | entities=[_sample_entity()], |
| 73 | relationships=[_sample_relationship()], |
| 74 | artifacts=[_sample_artifact()], |
| 75 | sources=[_sample_source()], |
| 76 | ) |
| 77 | assert len(ex.entities) == 1 |
| 78 | assert ex.entities[0].name == "Python" |
| 79 | assert len(ex.relationships) == 1 |
| 80 | assert len(ex.artifacts) == 1 |
| 81 | assert len(ex.sources) == 1 |
| 82 | |
| 83 | |
| 84 | def test_json_roundtrip(tmp_path): |
| 85 | original = PlanOpticonExchange( |
| 86 | project=_sample_project(), |
| 87 | entities=[_sample_entity()], |
| 88 | relationships=[_sample_relationship()], |
| 89 | artifacts=[_sample_artifact()], |
| 90 | sources=[_sample_source()], |
| 91 | ) |
| 92 | out = tmp_path / "exchange.json" |
| 93 | original.to_file(out) |
| 94 | |
| 95 | assert out.exists() |
| 96 | loaded = PlanOpticonExchange.from_file(out) |
| 97 | assert loaded.project.name == original.project.name |
| 98 | assert len(loaded.entities) == 1 |
| 99 | assert loaded.entities[0].name == "Python" |
| 100 | assert len(loaded.relationships) == 1 |
| 101 | assert len(loaded.artifacts) == 1 |
| 102 | assert len(loaded.sources) == 1 |
| 103 | |
| 104 | # Verify valid JSON on disk |
| 105 | raw = json.loads(out.read_text()) |
| 106 | assert raw["version"] == "1.0" |
| 107 | |
| 108 | |
def test_json_schema_export():
    """The exported JSON schema declares the top-level exchange fields."""
    schema = PlanOpticonExchange.json_schema()
    assert isinstance(schema, dict)
    assert "properties" in schema
    # The three core fields must all be present in the schema's properties.
    for field in ("version", "project", "entities"):
        assert field in schema["properties"]
| 116 | |
| 117 | |
def test_from_knowledge_graph():
    """Nodes, relationships, and sources from a KG dict map into an exchange."""
    python_node = {
        "id": "python",
        "name": "Python",
        "type": "technology",
        "descriptions": ["A language"],
        "occurrences": [],
    }
    alice_node = {
        "id": "alice",
        "name": "Alice",
        "type": "person",
        "descriptions": ["Engineer"],
        "occurrences": [],
    }
    kg_dict = {
        "nodes": [python_node, alice_node],
        "relationships": [
            {"source": "Alice", "target": "Python", "type": "uses"},
        ],
        "sources": [
            {"source_id": "s1", "source_type": "video", "title": "Recording"},
        ],
    }

    exchange = PlanOpticonExchange.from_knowledge_graph(
        kg_dict,
        project_name="Demo",
        tags=["test"],
    )
    # Two nodes, one edge, one source — and the project metadata carries over.
    assert exchange.project.name == "Demo"
    assert len(exchange.entities) == 2
    assert len(exchange.relationships) == 1
    assert len(exchange.sources) == 1
    assert "test" in exchange.project.tags
| 162 | |
| 163 | |
def test_merge_deduplicates_entities():
    """merge() drops duplicate entities but keeps everything that is new."""
    base = PlanOpticonExchange(
        project=_sample_project(),
        entities=[_sample_entity("Python"), _sample_entity("Rust")],
        relationships=[_sample_relationship()],
        sources=[_sample_source()],
    )
    # "Python" already exists in `base`; "Go" is genuinely new.
    incoming = PlanOpticonExchange(
        project=ProjectMeta(name="Other"),
        entities=[_sample_entity("Python"), _sample_entity("Go")],
        relationships=[
            Relationship(source="Bob", target="Go", type="uses"),
        ],
        sources=[
            SourceRecord(
                source_id="src-2",
                source_type="document",
                title="Notes",
            ),
        ],
    )

    base.merge(incoming)

    names = [entity.name for entity in base.entities]
    # The duplicate was collapsed; both new and pre-existing entities remain.
    assert names.count("Python") == 1
    assert "Go" in names
    assert "Rust" in names
    assert len(base.entities) == 3
    assert len(base.relationships) == 2
    assert len(base.sources) == 2
| 198 | |
| 199 | |
def test_version_field():
    """An explicitly supplied version string overrides the "1.0" default."""
    exchange = PlanOpticonExchange(project=_sample_project(), version="2.0")
    assert exchange.version == "2.0"
| 206 | |
| 207 | |
def test_artifact_meta_model():
    """ArtifactMeta stores its fields and survives a dict round-trip."""
    artifact = ArtifactMeta(
        name="plan",
        content="# Plan\nDo stuff",
        artifact_type="project_plan",
        format="markdown",
        metadata={"author": "agent"},
    )
    assert artifact.name == "plan"
    assert artifact.artifact_type == "project_plan"
    assert artifact.format == "markdown"
    assert artifact.metadata == {"author": "agent"}

    # Round-trip through model_dump / model_validate must be lossless.
    restored = ArtifactMeta.model_validate(artifact.model_dump())
    assert restored == artifact
+10
-25
| --- tests/test_graph_query.py | ||
| +++ tests/test_graph_query.py | ||
| @@ -4,11 +4,11 @@ | ||
| 4 | 4 | from unittest.mock import MagicMock |
| 5 | 5 | |
| 6 | 6 | import pytest |
| 7 | 7 | |
| 8 | 8 | from video_processor.integrators.graph_query import GraphQueryEngine, QueryResult |
| 9 | -from video_processor.integrators.graph_store import InMemoryStore | |
| 9 | +from video_processor.integrators.graph_store import InMemoryStore, SQLiteStore | |
| 10 | 10 | |
| 11 | 11 | |
| 12 | 12 | def _make_populated_store(): |
| 13 | 13 | """Create a store with test data.""" |
| 14 | 14 | store = InMemoryStore() |
| @@ -167,15 +167,15 @@ | ||
| 167 | 167 | engine = GraphQueryEngine(store) |
| 168 | 168 | result = engine.neighbors("Ghost") |
| 169 | 169 | assert result.data == [] |
| 170 | 170 | assert "not found" in result.explanation |
| 171 | 171 | |
| 172 | - def test_cypher_raises_on_inmemory(self): | |
| 172 | + def test_sql_raises_on_inmemory(self): | |
| 173 | 173 | store = InMemoryStore() |
| 174 | 174 | engine = GraphQueryEngine(store) |
| 175 | 175 | with pytest.raises(NotImplementedError): |
| 176 | - engine.cypher("MATCH (n) RETURN n") | |
| 176 | + engine.sql("SELECT * FROM entities") | |
| 177 | 177 | |
| 178 | 178 | def test_entities_limit(self): |
| 179 | 179 | store = _make_populated_store() |
| 180 | 180 | engine = GraphQueryEngine(store) |
| 181 | 181 | result = engine.entities(limit=2) |
| @@ -199,39 +199,24 @@ | ||
| 199 | 199 | result = engine.stats() |
| 200 | 200 | assert result.data["entity_count"] == 2 |
| 201 | 201 | assert result.data["relationship_count"] == 1 |
| 202 | 202 | |
| 203 | 203 | |
| 204 | -# Conditional FalkorDB tests | |
| 205 | -_falkordb_available = False | |
| 206 | -try: | |
| 207 | - import redislite # noqa: F401 | |
| 208 | - | |
| 209 | - _falkordb_available = True | |
| 210 | -except ImportError: | |
| 211 | - pass | |
| 212 | - | |
| 213 | - | |
| 214 | -@pytest.mark.skipif(not _falkordb_available, reason="falkordblite not installed") | |
| 215 | -class TestFalkorDBQuery: | |
| 216 | - def test_cypher_passthrough(self, tmp_path): | |
| 217 | - from video_processor.integrators.graph_store import FalkorDBStore | |
| 218 | - | |
| 219 | - store = FalkorDBStore(tmp_path / "test.db") | |
| 204 | +class TestSQLiteQuery: | |
| 205 | + def test_sql_passthrough(self, tmp_path): | |
| 206 | + store = SQLiteStore(tmp_path / "test.db") | |
| 220 | 207 | store.merge_entity("Python", "technology", ["A language"]) |
| 221 | 208 | engine = GraphQueryEngine(store) |
| 222 | - result = engine.cypher("MATCH (e:Entity) RETURN e.name") | |
| 209 | + result = engine.sql("SELECT name FROM entities") | |
| 223 | 210 | assert len(result.data) >= 1 |
| 224 | - assert result.query_type == "cypher" | |
| 211 | + assert result.query_type == "sql" | |
| 225 | 212 | store.close() |
| 226 | 213 | |
| 227 | 214 | def test_raw_query_on_store(self, tmp_path): |
| 228 | - from video_processor.integrators.graph_store import FalkorDBStore | |
| 229 | - | |
| 230 | - store = FalkorDBStore(tmp_path / "test.db") | |
| 215 | + store = SQLiteStore(tmp_path / "test.db") | |
| 231 | 216 | store.merge_entity("Alice", "person", ["Engineer"]) |
| 232 | - rows = store.raw_query("MATCH (e:Entity) RETURN e.name") | |
| 217 | + rows = store.raw_query("SELECT name FROM entities") | |
| 233 | 218 | assert len(rows) >= 1 |
| 234 | 219 | store.close() |
| 235 | 220 | |
| 236 | 221 | |
| 237 | 222 | class TestAgenticMode: |
| 238 | 223 |
| --- tests/test_graph_query.py | |
| +++ tests/test_graph_query.py | |
| @@ -4,11 +4,11 @@ | |
| 4 | from unittest.mock import MagicMock |
| 5 | |
| 6 | import pytest |
| 7 | |
| 8 | from video_processor.integrators.graph_query import GraphQueryEngine, QueryResult |
| 9 | from video_processor.integrators.graph_store import InMemoryStore |
| 10 | |
| 11 | |
| 12 | def _make_populated_store(): |
| 13 | """Create a store with test data.""" |
| 14 | store = InMemoryStore() |
| @@ -167,15 +167,15 @@ | |
| 167 | engine = GraphQueryEngine(store) |
| 168 | result = engine.neighbors("Ghost") |
| 169 | assert result.data == [] |
| 170 | assert "not found" in result.explanation |
| 171 | |
| 172 | def test_cypher_raises_on_inmemory(self): |
| 173 | store = InMemoryStore() |
| 174 | engine = GraphQueryEngine(store) |
| 175 | with pytest.raises(NotImplementedError): |
| 176 | engine.cypher("MATCH (n) RETURN n") |
| 177 | |
| 178 | def test_entities_limit(self): |
| 179 | store = _make_populated_store() |
| 180 | engine = GraphQueryEngine(store) |
| 181 | result = engine.entities(limit=2) |
| @@ -199,39 +199,24 @@ | |
| 199 | result = engine.stats() |
| 200 | assert result.data["entity_count"] == 2 |
| 201 | assert result.data["relationship_count"] == 1 |
| 202 | |
| 203 | |
| 204 | # Conditional FalkorDB tests |
| 205 | _falkordb_available = False |
| 206 | try: |
| 207 | import redislite # noqa: F401 |
| 208 | |
| 209 | _falkordb_available = True |
| 210 | except ImportError: |
| 211 | pass |
| 212 | |
| 213 | |
| 214 | @pytest.mark.skipif(not _falkordb_available, reason="falkordblite not installed") |
| 215 | class TestFalkorDBQuery: |
| 216 | def test_cypher_passthrough(self, tmp_path): |
| 217 | from video_processor.integrators.graph_store import FalkorDBStore |
| 218 | |
| 219 | store = FalkorDBStore(tmp_path / "test.db") |
| 220 | store.merge_entity("Python", "technology", ["A language"]) |
| 221 | engine = GraphQueryEngine(store) |
| 222 | result = engine.cypher("MATCH (e:Entity) RETURN e.name") |
| 223 | assert len(result.data) >= 1 |
| 224 | assert result.query_type == "cypher" |
| 225 | store.close() |
| 226 | |
| 227 | def test_raw_query_on_store(self, tmp_path): |
| 228 | from video_processor.integrators.graph_store import FalkorDBStore |
| 229 | |
| 230 | store = FalkorDBStore(tmp_path / "test.db") |
| 231 | store.merge_entity("Alice", "person", ["Engineer"]) |
| 232 | rows = store.raw_query("MATCH (e:Entity) RETURN e.name") |
| 233 | assert len(rows) >= 1 |
| 234 | store.close() |
| 235 | |
| 236 | |
| 237 | class TestAgenticMode: |
| 238 |
| --- tests/test_graph_query.py | |
| +++ tests/test_graph_query.py | |
| @@ -4,11 +4,11 @@ | |
| 4 | from unittest.mock import MagicMock |
| 5 | |
| 6 | import pytest |
| 7 | |
| 8 | from video_processor.integrators.graph_query import GraphQueryEngine, QueryResult |
| 9 | from video_processor.integrators.graph_store import InMemoryStore, SQLiteStore |
| 10 | |
| 11 | |
| 12 | def _make_populated_store(): |
| 13 | """Create a store with test data.""" |
| 14 | store = InMemoryStore() |
| @@ -167,15 +167,15 @@ | |
| 167 | engine = GraphQueryEngine(store) |
| 168 | result = engine.neighbors("Ghost") |
| 169 | assert result.data == [] |
| 170 | assert "not found" in result.explanation |
| 171 | |
| 172 | def test_sql_raises_on_inmemory(self): |
| 173 | store = InMemoryStore() |
| 174 | engine = GraphQueryEngine(store) |
| 175 | with pytest.raises(NotImplementedError): |
| 176 | engine.sql("SELECT * FROM entities") |
| 177 | |
| 178 | def test_entities_limit(self): |
| 179 | store = _make_populated_store() |
| 180 | engine = GraphQueryEngine(store) |
| 181 | result = engine.entities(limit=2) |
| @@ -199,39 +199,24 @@ | |
| 199 | result = engine.stats() |
| 200 | assert result.data["entity_count"] == 2 |
| 201 | assert result.data["relationship_count"] == 1 |
| 202 | |
| 203 | |
| 204 | class TestSQLiteQuery: |
| 205 | def test_sql_passthrough(self, tmp_path): |
| 206 | store = SQLiteStore(tmp_path / "test.db") |
| 207 | store.merge_entity("Python", "technology", ["A language"]) |
| 208 | engine = GraphQueryEngine(store) |
| 209 | result = engine.sql("SELECT name FROM entities") |
| 210 | assert len(result.data) >= 1 |
| 211 | assert result.query_type == "sql" |
| 212 | store.close() |
| 213 | |
| 214 | def test_raw_query_on_store(self, tmp_path): |
| 215 | store = SQLiteStore(tmp_path / "test.db") |
| 216 | store.merge_entity("Alice", "person", ["Engineer"]) |
| 217 | rows = store.raw_query("SELECT name FROM entities") |
| 218 | assert len(rows) >= 1 |
| 219 | store.close() |
| 220 | |
| 221 | |
| 222 | class TestAgenticMode: |
| 223 |
+56
-42
| --- tests/test_graph_store.py | ||
| +++ tests/test_graph_store.py | ||
| @@ -1,10 +1,8 @@ | ||
| 1 | 1 | """Tests for graph storage backends.""" |
| 2 | 2 | |
| 3 | -import pytest | |
| 4 | - | |
| 5 | -from video_processor.integrators.graph_store import InMemoryStore, create_store | |
| 3 | +from video_processor.integrators.graph_store import InMemoryStore, SQLiteStore, create_store | |
| 6 | 4 | |
| 7 | 5 | |
| 8 | 6 | class TestInMemoryStore: |
| 9 | 7 | def test_merge_entity_creates_new(self): |
| 10 | 8 | store = InMemoryStore() |
| @@ -144,58 +142,40 @@ | ||
| 144 | 142 | |
| 145 | 143 | def test_returns_in_memory_with_none_path(self): |
| 146 | 144 | store = create_store(db_path=None) |
| 147 | 145 | assert isinstance(store, InMemoryStore) |
| 148 | 146 | |
| 149 | - def test_fallback_to_in_memory_when_falkordb_unavailable(self, tmp_path): | |
| 150 | - """When falkordblite is not installed, should fall back gracefully.""" | |
| 147 | + def test_returns_sqlite_with_path(self, tmp_path): | |
| 151 | 148 | store = create_store(db_path=tmp_path / "test.db") |
| 152 | - # Will be FalkorDBStore if installed, InMemoryStore if not | |
| 153 | - # Either way, it should work | |
| 149 | + assert isinstance(store, SQLiteStore) | |
| 154 | 150 | store.merge_entity("Test", "concept", ["test entity"]) |
| 155 | 151 | assert store.get_entity_count() == 1 |
| 152 | + store.close() | |
| 156 | 153 | |
| 157 | 154 | |
| 158 | -# Conditional FalkorDB tests | |
| 159 | -_falkordb_available = False | |
| 160 | -try: | |
| 161 | - import redislite # noqa: F401 | |
| 162 | - | |
| 163 | - _falkordb_available = True | |
| 164 | -except ImportError: | |
| 165 | - pass | |
| 166 | - | |
| 167 | - | |
| 168 | -@pytest.mark.skipif(not _falkordb_available, reason="falkordblite not installed") | |
| 169 | -class TestFalkorDBStore: | |
| 155 | +class TestSQLiteStore: | |
| 170 | 156 | def test_create_and_query_entity(self, tmp_path): |
| 171 | - from video_processor.integrators.graph_store import FalkorDBStore | |
| 172 | - | |
| 173 | - store = FalkorDBStore(tmp_path / "test.db") | |
| 157 | + store = SQLiteStore(tmp_path / "test.db") | |
| 174 | 158 | store.merge_entity("Python", "technology", ["A language"]) |
| 175 | 159 | assert store.get_entity_count() == 1 |
| 176 | 160 | entity = store.get_entity("python") |
| 177 | 161 | assert entity is not None |
| 178 | 162 | assert entity["name"] == "Python" |
| 179 | 163 | store.close() |
| 180 | 164 | |
| 181 | 165 | def test_case_insensitive_merge(self, tmp_path): |
| 182 | - from video_processor.integrators.graph_store import FalkorDBStore | |
| 183 | - | |
| 184 | - store = FalkorDBStore(tmp_path / "test.db") | |
| 166 | + store = SQLiteStore(tmp_path / "test.db") | |
| 185 | 167 | store.merge_entity("Python", "technology", ["Language"]) |
| 186 | 168 | store.merge_entity("python", "technology", ["Snake-based"]) |
| 187 | 169 | assert store.get_entity_count() == 1 |
| 188 | 170 | entity = store.get_entity("python") |
| 189 | 171 | assert "Language" in entity["descriptions"] |
| 190 | 172 | assert "Snake-based" in entity["descriptions"] |
| 191 | 173 | store.close() |
| 192 | 174 | |
| 193 | 175 | def test_relationships(self, tmp_path): |
| 194 | - from video_processor.integrators.graph_store import FalkorDBStore | |
| 195 | - | |
| 196 | - store = FalkorDBStore(tmp_path / "test.db") | |
| 176 | + store = SQLiteStore(tmp_path / "test.db") | |
| 197 | 177 | store.merge_entity("Alice", "person", []) |
| 198 | 178 | store.merge_entity("Bob", "person", []) |
| 199 | 179 | store.add_relationship("Alice", "Bob", "knows") |
| 200 | 180 | assert store.get_relationship_count() == 1 |
| 201 | 181 | rels = store.get_all_relationships() |
| @@ -202,40 +182,39 @@ | ||
| 202 | 182 | assert rels[0]["source"] == "Alice" |
| 203 | 183 | assert rels[0]["target"] == "Bob" |
| 204 | 184 | store.close() |
| 205 | 185 | |
| 206 | 186 | def test_occurrences(self, tmp_path): |
| 207 | - from video_processor.integrators.graph_store import FalkorDBStore | |
| 208 | - | |
| 209 | - store = FalkorDBStore(tmp_path / "test.db") | |
| 187 | + store = SQLiteStore(tmp_path / "test.db") | |
| 210 | 188 | store.merge_entity("Alice", "person", ["Engineer"]) |
| 211 | 189 | store.add_occurrence("Alice", "transcript_0", timestamp=10.5, text="Alice said...") |
| 212 | 190 | entity = store.get_entity("alice") |
| 213 | 191 | assert len(entity["occurrences"]) == 1 |
| 214 | 192 | assert entity["occurrences"][0]["source"] == "transcript_0" |
| 215 | 193 | store.close() |
| 216 | 194 | |
| 217 | - def test_persistence(self, tmp_path): | |
| 218 | - from video_processor.integrators.graph_store import FalkorDBStore | |
| 195 | + def test_occurrence_nonexistent_entity(self, tmp_path): | |
| 196 | + store = SQLiteStore(tmp_path / "test.db") | |
| 197 | + store.add_occurrence("Ghost", "transcript_0") | |
| 198 | + assert store.get_entity_count() == 0 | |
| 199 | + store.close() | |
| 219 | 200 | |
| 201 | + def test_persistence(self, tmp_path): | |
| 220 | 202 | db_path = tmp_path / "persist.db" |
| 221 | 203 | |
| 222 | - store1 = FalkorDBStore(db_path) | |
| 204 | + store1 = SQLiteStore(db_path) | |
| 223 | 205 | store1.merge_entity("Python", "technology", ["A language"]) |
| 224 | - store1.add_relationship_count = 0 # just to trigger write | |
| 225 | 206 | store1.close() |
| 226 | 207 | |
| 227 | - store2 = FalkorDBStore(db_path) | |
| 208 | + store2 = SQLiteStore(db_path) | |
| 228 | 209 | assert store2.get_entity_count() == 1 |
| 229 | 210 | entity = store2.get_entity("python") |
| 230 | 211 | assert entity["name"] == "Python" |
| 231 | 212 | store2.close() |
| 232 | 213 | |
| 233 | 214 | def test_to_dict_format(self, tmp_path): |
| 234 | - from video_processor.integrators.graph_store import FalkorDBStore | |
| 235 | - | |
| 236 | - store = FalkorDBStore(tmp_path / "test.db") | |
| 215 | + store = SQLiteStore(tmp_path / "test.db") | |
| 237 | 216 | store.merge_entity("Python", "technology", ["A language"]) |
| 238 | 217 | store.merge_entity("Django", "technology", ["A framework"]) |
| 239 | 218 | store.add_relationship("Django", "Python", "uses") |
| 240 | 219 | |
| 241 | 220 | data = store.to_dict() |
| @@ -248,13 +227,48 @@ | ||
| 248 | 227 | assert "name" in node |
| 249 | 228 | |
| 250 | 229 | store.close() |
| 251 | 230 | |
| 252 | 231 | def test_has_entity(self, tmp_path): |
| 253 | - from video_processor.integrators.graph_store import FalkorDBStore | |
| 254 | - | |
| 255 | - store = FalkorDBStore(tmp_path / "test.db") | |
| 232 | + store = SQLiteStore(tmp_path / "test.db") | |
| 256 | 233 | assert not store.has_entity("Python") |
| 257 | 234 | store.merge_entity("Python", "technology", []) |
| 258 | 235 | assert store.has_entity("Python") |
| 259 | 236 | assert store.has_entity("python") |
| 260 | 237 | store.close() |
| 238 | + | |
| 239 | + def test_raw_query(self, tmp_path): | |
| 240 | + store = SQLiteStore(tmp_path / "test.db") | |
| 241 | + store.merge_entity("Alice", "person", ["Engineer"]) | |
| 242 | + rows = store.raw_query("SELECT name FROM entities") | |
| 243 | + assert len(rows) >= 1 | |
| 244 | + assert rows[0][0] == "Alice" | |
| 245 | + store.close() | |
| 246 | + | |
| 247 | + def test_typed_relationship(self, tmp_path): | |
| 248 | + store = SQLiteStore(tmp_path / "test.db") | |
| 249 | + store.merge_entity("Django", "technology", []) | |
| 250 | + store.merge_entity("Python", "technology", []) | |
| 251 | + store.add_typed_relationship("Django", "Python", "DEPENDS_ON", {"version": "3.10"}) | |
| 252 | + rels = store.get_all_relationships() | |
| 253 | + assert len(rels) == 1 | |
| 254 | + assert rels[0]["type"] == "DEPENDS_ON" | |
| 255 | + store.close() | |
| 256 | + | |
| 257 | + def test_set_entity_properties(self, tmp_path): | |
| 258 | + store = SQLiteStore(tmp_path / "test.db") | |
| 259 | + store.merge_entity("Python", "technology", []) | |
| 260 | + assert store.set_entity_properties("Python", {"version": "3.12", "stable": True}) | |
| 261 | + assert not store.set_entity_properties("Ghost", {"key": "val"}) | |
| 262 | + store.close() | |
| 263 | + | |
| 264 | + def test_has_relationship(self, tmp_path): | |
| 265 | + store = SQLiteStore(tmp_path / "test.db") | |
| 266 | + store.merge_entity("Alice", "person", []) | |
| 267 | + store.merge_entity("Bob", "person", []) | |
| 268 | + store.add_relationship("Alice", "Bob", "knows") | |
| 269 | + assert store.has_relationship("Alice", "Bob") | |
| 270 | + assert store.has_relationship("alice", "bob") | |
| 271 | + assert store.has_relationship("Alice", "Bob", "knows") | |
| 272 | + assert not store.has_relationship("Alice", "Bob", "hates") | |
| 273 | + assert not store.has_relationship("Bob", "Alice") | |
| 274 | + store.close() | |
| 261 | 275 | |
| 262 | 276 | ADDED tests/test_knowledge_graph.py |
| 263 | 277 | ADDED tests/test_output_formatter.py |
| --- tests/test_graph_store.py | |
| +++ tests/test_graph_store.py | |
| @@ -1,10 +1,8 @@ | |
| 1 | """Tests for graph storage backends.""" |
| 2 | |
| 3 | import pytest |
| 4 | |
| 5 | from video_processor.integrators.graph_store import InMemoryStore, create_store |
| 6 | |
| 7 | |
| 8 | class TestInMemoryStore: |
| 9 | def test_merge_entity_creates_new(self): |
| 10 | store = InMemoryStore() |
| @@ -144,58 +142,40 @@ | |
| 144 | |
| 145 | def test_returns_in_memory_with_none_path(self): |
| 146 | store = create_store(db_path=None) |
| 147 | assert isinstance(store, InMemoryStore) |
| 148 | |
| 149 | def test_fallback_to_in_memory_when_falkordb_unavailable(self, tmp_path): |
| 150 | """When falkordblite is not installed, should fall back gracefully.""" |
| 151 | store = create_store(db_path=tmp_path / "test.db") |
| 152 | # Will be FalkorDBStore if installed, InMemoryStore if not |
| 153 | # Either way, it should work |
| 154 | store.merge_entity("Test", "concept", ["test entity"]) |
| 155 | assert store.get_entity_count() == 1 |
| 156 | |
| 157 | |
| 158 | # Conditional FalkorDB tests |
| 159 | _falkordb_available = False |
| 160 | try: |
| 161 | import redislite # noqa: F401 |
| 162 | |
| 163 | _falkordb_available = True |
| 164 | except ImportError: |
| 165 | pass |
| 166 | |
| 167 | |
| 168 | @pytest.mark.skipif(not _falkordb_available, reason="falkordblite not installed") |
| 169 | class TestFalkorDBStore: |
| 170 | def test_create_and_query_entity(self, tmp_path): |
| 171 | from video_processor.integrators.graph_store import FalkorDBStore |
| 172 | |
| 173 | store = FalkorDBStore(tmp_path / "test.db") |
| 174 | store.merge_entity("Python", "technology", ["A language"]) |
| 175 | assert store.get_entity_count() == 1 |
| 176 | entity = store.get_entity("python") |
| 177 | assert entity is not None |
| 178 | assert entity["name"] == "Python" |
| 179 | store.close() |
| 180 | |
| 181 | def test_case_insensitive_merge(self, tmp_path): |
| 182 | from video_processor.integrators.graph_store import FalkorDBStore |
| 183 | |
| 184 | store = FalkorDBStore(tmp_path / "test.db") |
| 185 | store.merge_entity("Python", "technology", ["Language"]) |
| 186 | store.merge_entity("python", "technology", ["Snake-based"]) |
| 187 | assert store.get_entity_count() == 1 |
| 188 | entity = store.get_entity("python") |
| 189 | assert "Language" in entity["descriptions"] |
| 190 | assert "Snake-based" in entity["descriptions"] |
| 191 | store.close() |
| 192 | |
| 193 | def test_relationships(self, tmp_path): |
| 194 | from video_processor.integrators.graph_store import FalkorDBStore |
| 195 | |
| 196 | store = FalkorDBStore(tmp_path / "test.db") |
| 197 | store.merge_entity("Alice", "person", []) |
| 198 | store.merge_entity("Bob", "person", []) |
| 199 | store.add_relationship("Alice", "Bob", "knows") |
| 200 | assert store.get_relationship_count() == 1 |
| 201 | rels = store.get_all_relationships() |
| @@ -202,40 +182,39 @@ | |
| 202 | assert rels[0]["source"] == "Alice" |
| 203 | assert rels[0]["target"] == "Bob" |
| 204 | store.close() |
| 205 | |
| 206 | def test_occurrences(self, tmp_path): |
| 207 | from video_processor.integrators.graph_store import FalkorDBStore |
| 208 | |
| 209 | store = FalkorDBStore(tmp_path / "test.db") |
| 210 | store.merge_entity("Alice", "person", ["Engineer"]) |
| 211 | store.add_occurrence("Alice", "transcript_0", timestamp=10.5, text="Alice said...") |
| 212 | entity = store.get_entity("alice") |
| 213 | assert len(entity["occurrences"]) == 1 |
| 214 | assert entity["occurrences"][0]["source"] == "transcript_0" |
| 215 | store.close() |
| 216 | |
| 217 | def test_persistence(self, tmp_path): |
| 218 | from video_processor.integrators.graph_store import FalkorDBStore |
| 219 | |
| 220 | db_path = tmp_path / "persist.db" |
| 221 | |
| 222 | store1 = FalkorDBStore(db_path) |
| 223 | store1.merge_entity("Python", "technology", ["A language"]) |
| 224 | store1.add_relationship_count = 0 # just to trigger write |
| 225 | store1.close() |
| 226 | |
| 227 | store2 = FalkorDBStore(db_path) |
| 228 | assert store2.get_entity_count() == 1 |
| 229 | entity = store2.get_entity("python") |
| 230 | assert entity["name"] == "Python" |
| 231 | store2.close() |
| 232 | |
| 233 | def test_to_dict_format(self, tmp_path): |
| 234 | from video_processor.integrators.graph_store import FalkorDBStore |
| 235 | |
| 236 | store = FalkorDBStore(tmp_path / "test.db") |
| 237 | store.merge_entity("Python", "technology", ["A language"]) |
| 238 | store.merge_entity("Django", "technology", ["A framework"]) |
| 239 | store.add_relationship("Django", "Python", "uses") |
| 240 | |
| 241 | data = store.to_dict() |
| @@ -248,13 +227,48 @@ | |
| 248 | assert "name" in node |
| 249 | |
| 250 | store.close() |
| 251 | |
| 252 | def test_has_entity(self, tmp_path): |
| 253 | from video_processor.integrators.graph_store import FalkorDBStore |
| 254 | |
| 255 | store = FalkorDBStore(tmp_path / "test.db") |
| 256 | assert not store.has_entity("Python") |
| 257 | store.merge_entity("Python", "technology", []) |
| 258 | assert store.has_entity("Python") |
| 259 | assert store.has_entity("python") |
| 260 | store.close() |
| 261 | |
| 262 | ADDED tests/test_knowledge_graph.py |
| 263 | ADDED tests/test_output_formatter.py |
| --- tests/test_graph_store.py | |
| +++ tests/test_graph_store.py | |
| @@ -1,10 +1,8 @@ | |
| 1 | """Tests for graph storage backends.""" |
| 2 | |
| 3 | from video_processor.integrators.graph_store import InMemoryStore, SQLiteStore, create_store |
| 4 | |
| 5 | |
| 6 | class TestInMemoryStore: |
| 7 | def test_merge_entity_creates_new(self): |
| 8 | store = InMemoryStore() |
| @@ -144,58 +142,40 @@ | |
| 142 | |
| 143 | def test_returns_in_memory_with_none_path(self): |
| 144 | store = create_store(db_path=None) |
| 145 | assert isinstance(store, InMemoryStore) |
| 146 | |
| 147 | def test_returns_sqlite_with_path(self, tmp_path): |
| 148 | store = create_store(db_path=tmp_path / "test.db") |
| 149 | assert isinstance(store, SQLiteStore) |
| 150 | store.merge_entity("Test", "concept", ["test entity"]) |
| 151 | assert store.get_entity_count() == 1 |
| 152 | store.close() |
| 153 | |
| 154 | |
| 155 | class TestSQLiteStore: |
| 156 | def test_create_and_query_entity(self, tmp_path): |
| 157 | store = SQLiteStore(tmp_path / "test.db") |
| 158 | store.merge_entity("Python", "technology", ["A language"]) |
| 159 | assert store.get_entity_count() == 1 |
| 160 | entity = store.get_entity("python") |
| 161 | assert entity is not None |
| 162 | assert entity["name"] == "Python" |
| 163 | store.close() |
| 164 | |
| 165 | def test_case_insensitive_merge(self, tmp_path): |
| 166 | store = SQLiteStore(tmp_path / "test.db") |
| 167 | store.merge_entity("Python", "technology", ["Language"]) |
| 168 | store.merge_entity("python", "technology", ["Snake-based"]) |
| 169 | assert store.get_entity_count() == 1 |
| 170 | entity = store.get_entity("python") |
| 171 | assert "Language" in entity["descriptions"] |
| 172 | assert "Snake-based" in entity["descriptions"] |
| 173 | store.close() |
| 174 | |
| 175 | def test_relationships(self, tmp_path): |
| 176 | store = SQLiteStore(tmp_path / "test.db") |
| 177 | store.merge_entity("Alice", "person", []) |
| 178 | store.merge_entity("Bob", "person", []) |
| 179 | store.add_relationship("Alice", "Bob", "knows") |
| 180 | assert store.get_relationship_count() == 1 |
| 181 | rels = store.get_all_relationships() |
| @@ -202,40 +182,39 @@ | |
| 182 | assert rels[0]["source"] == "Alice" |
| 183 | assert rels[0]["target"] == "Bob" |
| 184 | store.close() |
| 185 | |
| 186 | def test_occurrences(self, tmp_path): |
| 187 | store = SQLiteStore(tmp_path / "test.db") |
| 188 | store.merge_entity("Alice", "person", ["Engineer"]) |
| 189 | store.add_occurrence("Alice", "transcript_0", timestamp=10.5, text="Alice said...") |
| 190 | entity = store.get_entity("alice") |
| 191 | assert len(entity["occurrences"]) == 1 |
| 192 | assert entity["occurrences"][0]["source"] == "transcript_0" |
| 193 | store.close() |
| 194 | |
| 195 | def test_occurrence_nonexistent_entity(self, tmp_path): |
| 196 | store = SQLiteStore(tmp_path / "test.db") |
| 197 | store.add_occurrence("Ghost", "transcript_0") |
| 198 | assert store.get_entity_count() == 0 |
| 199 | store.close() |
| 200 | |
| 201 | def test_persistence(self, tmp_path): |
| 202 | db_path = tmp_path / "persist.db" |
| 203 | |
| 204 | store1 = SQLiteStore(db_path) |
| 205 | store1.merge_entity("Python", "technology", ["A language"]) |
| 206 | store1.close() |
| 207 | |
| 208 | store2 = SQLiteStore(db_path) |
| 209 | assert store2.get_entity_count() == 1 |
| 210 | entity = store2.get_entity("python") |
| 211 | assert entity["name"] == "Python" |
| 212 | store2.close() |
| 213 | |
| 214 | def test_to_dict_format(self, tmp_path): |
| 215 | store = SQLiteStore(tmp_path / "test.db") |
| 216 | store.merge_entity("Python", "technology", ["A language"]) |
| 217 | store.merge_entity("Django", "technology", ["A framework"]) |
| 218 | store.add_relationship("Django", "Python", "uses") |
| 219 | |
| 220 | data = store.to_dict() |
| @@ -248,13 +227,48 @@ | |
| 227 | assert "name" in node |
| 228 | |
| 229 | store.close() |
| 230 | |
| 231 | def test_has_entity(self, tmp_path): |
| 232 | store = SQLiteStore(tmp_path / "test.db") |
| 233 | assert not store.has_entity("Python") |
| 234 | store.merge_entity("Python", "technology", []) |
| 235 | assert store.has_entity("Python") |
| 236 | assert store.has_entity("python") |
| 237 | store.close() |
| 238 | |
| 239 | def test_raw_query(self, tmp_path): |
| 240 | store = SQLiteStore(tmp_path / "test.db") |
| 241 | store.merge_entity("Alice", "person", ["Engineer"]) |
| 242 | rows = store.raw_query("SELECT name FROM entities") |
| 243 | assert len(rows) >= 1 |
| 244 | assert rows[0][0] == "Alice" |
| 245 | store.close() |
| 246 | |
| 247 | def test_typed_relationship(self, tmp_path): |
| 248 | store = SQLiteStore(tmp_path / "test.db") |
| 249 | store.merge_entity("Django", "technology", []) |
| 250 | store.merge_entity("Python", "technology", []) |
| 251 | store.add_typed_relationship("Django", "Python", "DEPENDS_ON", {"version": "3.10"}) |
| 252 | rels = store.get_all_relationships() |
| 253 | assert len(rels) == 1 |
| 254 | assert rels[0]["type"] == "DEPENDS_ON" |
| 255 | store.close() |
| 256 | |
| 257 | def test_set_entity_properties(self, tmp_path): |
| 258 | store = SQLiteStore(tmp_path / "test.db") |
| 259 | store.merge_entity("Python", "technology", []) |
| 260 | assert store.set_entity_properties("Python", {"version": "3.12", "stable": True}) |
| 261 | assert not store.set_entity_properties("Ghost", {"key": "val"}) |
| 262 | store.close() |
| 263 | |
| 264 | def test_has_relationship(self, tmp_path): |
| 265 | store = SQLiteStore(tmp_path / "test.db") |
| 266 | store.merge_entity("Alice", "person", []) |
| 267 | store.merge_entity("Bob", "person", []) |
| 268 | store.add_relationship("Alice", "Bob", "knows") |
| 269 | assert store.has_relationship("Alice", "Bob") |
| 270 | assert store.has_relationship("alice", "bob") |
| 271 | assert store.has_relationship("Alice", "Bob", "knows") |
| 272 | assert not store.has_relationship("Alice", "Bob", "hates") |
| 273 | assert not store.has_relationship("Bob", "Alice") |
| 274 | store.close() |
| 275 | |
| 276 | ADDED tests/test_knowledge_graph.py |
| 277 | ADDED tests/test_output_formatter.py |
| --- a/tests/test_knowledge_graph.py | ||
| +++ b/tests/test_knowledge_graph.py | ||
| @@ -0,0 +1,422 @@ | ||
| 1 | +"""Tests for the KnowledgeGraph class.""" | |
| 2 | + | |
| 3 | +import json | |
| 4 | +from unittest.mock import MagicMock, patch | |
| 5 | + | |
| 6 | +import pytest | |
| 7 | + | |
| 8 | +from video_processor.integrators.knowledge_graph import KnowledgeGraph | |
| 9 | + | |
| 10 | + | |
| 11 | +@pytest.fixture | |
| 12 | +def mock_pm(): | |
| 13 | + """A mock ProviderManager that returns predictable JSON from chat().""" | |
| 14 | + pm = MagicMock() | |
| 15 | + pm.chat.return_value = json.dumps( | |
| 16 | + { | |
| 17 | + "entities": [ | |
| 18 | + {"name": "Python", "type": "technology", "description": "A programming language"}, | |
| 19 | + {"name": "Alice", "type": "person", "description": "Lead developer"}, | |
| 20 | + ], | |
| 21 | + "relationships": [ | |
| 22 | + {"source": "Alice", "target": "Python", "type": "uses"}, | |
| 23 | + ], | |
| 24 | + } | |
| 25 | + ) | |
| 26 | + return pm | |
| 27 | + | |
| 28 | + | |
| 29 | +@pytest.fixture | |
| 30 | +def kg_no_provider(): | |
| 31 | + """KnowledgeGraph with no provider (in-memory store).""" | |
| 32 | + return KnowledgeGraph() | |
| 33 | + | |
| 34 | + | |
| 35 | +@pytest.fixture | |
| 36 | +def kg_with_provider(mock_pm): | |
| 37 | + """KnowledgeGraph with a mock provider (in-memory store).""" | |
| 38 | + return KnowledgeGraph(provider_manager=mock_pm) | |
| 39 | + | |
| 40 | + | |
| 41 | +class TestCreation: | |
| 42 | + def test_create_without_db_path(self): | |
| 43 | + kg = KnowledgeGraph() | |
| 44 | + assert kg.pm is None | |
| 45 | + assert kg._store.get_entity_count() == 0 | |
| 46 | + assert kg._store.get_relationship_count() == 0 | |
| 47 | + | |
| 48 | + def test_create_with_db_path(self, tmp_path): | |
| 49 | + db_path = tmp_path / "test.db" | |
| 50 | + kg = KnowledgeGraph(db_path=db_path) | |
| 51 | + assert kg._store.get_entity_count() == 0 | |
| 52 | + assert db_path.exists() | |
| 53 | + | |
| 54 | + def test_create_with_provider(self, mock_pm): | |
| 55 | + kg = KnowledgeGraph(provider_manager=mock_pm) | |
| 56 | + assert kg.pm is mock_pm | |
| 57 | + | |
| 58 | + | |
| 59 | +class TestProcessTranscript: | |
| 60 | + def test_process_transcript_extracts_entities(self, kg_with_provider, mock_pm): | |
| 61 | + transcript = { | |
| 62 | + "segments": [ | |
| 63 | + {"text": "Alice is using Python for the project", "start": 0.0, "speaker": "Alice"}, | |
| 64 | + {"text": "It works great for data processing", "start": 5.0}, | |
| 65 | + ] | |
| 66 | + } | |
| 67 | + kg_with_provider.process_transcript(transcript) | |
| 68 | + | |
| 69 | + # The mock returns Python and Alice as entities | |
| 70 | + nodes = kg_with_provider.nodes | |
| 71 | + assert "Python" in nodes | |
| 72 | + assert "Alice" in nodes | |
| 73 | + assert nodes["Python"]["type"] == "technology" | |
| 74 | + | |
| 75 | + def test_process_transcript_registers_speakers(self, kg_with_provider): | |
| 76 | + transcript = { | |
| 77 | + "segments": [ | |
| 78 | + {"text": "Hello everyone", "start": 0.0, "speaker": "Bob"}, | |
| 79 | + ] | |
| 80 | + } | |
| 81 | + kg_with_provider.process_transcript(transcript) | |
| 82 | + assert kg_with_provider._store.has_entity("Bob") | |
| 83 | + | |
| 84 | + def test_process_transcript_missing_segments(self, kg_with_provider): | |
| 85 | + """Should log warning and return without error.""" | |
| 86 | + kg_with_provider.process_transcript({}) | |
| 87 | + assert kg_with_provider._store.get_entity_count() == 0 | |
| 88 | + | |
| 89 | + def test_process_transcript_empty_text_skipped(self, kg_with_provider, mock_pm): | |
| 90 | + transcript = { | |
| 91 | + "segments": [ | |
| 92 | + {"text": " ", "start": 0.0}, | |
| 93 | + ] | |
| 94 | + } | |
| 95 | + kg_with_provider.process_transcript(transcript) | |
| 96 | + # chat should not be called for empty batches (speaker registration may still happen) | |
| 97 | + mock_pm.chat.assert_not_called() | |
| 98 | + | |
| 99 | + def test_process_transcript_batching(self, kg_with_provider, mock_pm): | |
| 100 | + """With batch_size=2, 5 segments should produce 3 batches.""" | |
| 101 | + segments = [{"text": f"Segment {i}", "start": float(i)} for i in range(5)] | |
| 102 | + transcript = {"segments": segments} | |
| 103 | + kg_with_provider.process_transcript(transcript, batch_size=2) | |
| 104 | + assert mock_pm.chat.call_count == 3 | |
| 105 | + | |
| 106 | + | |
| 107 | +class TestProcessDiagrams: | |
| 108 | + def test_process_diagrams_with_text(self, kg_with_provider, mock_pm): | |
| 109 | + diagrams = [ | |
| 110 | + {"text_content": "Architecture shows Python microservices", "frame_index": 0}, | |
| 111 | + ] | |
| 112 | + kg_with_provider.process_diagrams(diagrams) | |
| 113 | + | |
| 114 | + # Should have called chat once for the text content | |
| 115 | + assert mock_pm.chat.call_count == 1 | |
| 116 | + # diagram_0 entity should exist | |
| 117 | + assert kg_with_provider._store.has_entity("diagram_0") | |
| 118 | + | |
| 119 | + def test_process_diagrams_without_text(self, kg_with_provider, mock_pm): | |
| 120 | + diagrams = [ | |
| 121 | + {"text_content": "", "frame_index": 5}, | |
| 122 | + ] | |
| 123 | + kg_with_provider.process_diagrams(diagrams) | |
| 124 | + # No chat call for empty text | |
| 125 | + mock_pm.chat.assert_not_called() | |
| 126 | + # But diagram entity still created | |
| 127 | + assert kg_with_provider._store.has_entity("diagram_0") | |
| 128 | + | |
| 129 | + def test_process_multiple_diagrams(self, kg_with_provider, mock_pm): | |
| 130 | + diagrams = [ | |
| 131 | + {"text_content": "Diagram A content", "frame_index": 0}, | |
| 132 | + {"text_content": "Diagram B content", "frame_index": 10}, | |
| 133 | + ] | |
| 134 | + kg_with_provider.process_diagrams(diagrams) | |
| 135 | + assert kg_with_provider._store.has_entity("diagram_0") | |
| 136 | + assert kg_with_provider._store.has_entity("diagram_1") | |
| 137 | + | |
| 138 | + | |
| 139 | +class Testcess_screenshots(screenshots) | |
| 140 | + # LLM extraction from text_content | |
| 141 | + mock_pm.chat.assert_called() | |
| 142 | + # Explicitly listed entities should be added | |
| 143 | + assert kg_with_provider._store.has_entity("Flask") | |
| 144 | + assert kg_with_provider._store.has_entity("Python") | |
| 145 | + | |
| 146 | + def test_process_screenshots_without_text(self, kg_with_provider, mock_pm): | |
| 147 | + screenshots = [ | |
| 148 | + { | |
| 149 | + "text_content": "", | |
| 150 | + "content_type": "other", | |
| 151 | + "entities": ["Docker"], | |
| 152 | + "frame_index": 5, | |
| 153 | + }, | |
| 154 | + ] | |
| 155 | + kg_with_provider.process_screenshots(screenshots) | |
| 156 | + # No chat call for empty text | |
| 157 | + mock_pm.chat.assert_not_called() | |
| 158 | + # But explicit entities still added | |
| 159 | + assert kg_with_provider._store.has_entity("Docker") | |
| 160 | + | |
| 161 | + def test_process_screenshots_empty_entities(self, kg_with_provider): | |
| 162 | + screenshots = [ | |
| 163 | + { | |
| 164 | + "text_content": "", | |
| 165 | + "content_type": "slide", | |
| 166 | + "entities": [], | |
| 167 | + "frame_index": 0, | |
| 168 | + }, | |
| 169 | + ] | |
| 170 | + kg_with_provider.process_screenshots(screenshots) | |
| 171 | + # No crash, no entities added | |
| 172 | + | |
| 173 | + def test_process_screenshots_filters_short_names(self, kg_with_provider): | |
| 174 | + screenshots = [ | |
| 175 | + { | |
| 176 | + "text_content": "", | |
| 177 | + "entities": ["A", "Go", "Python"], | |
| 178 | + "frame_index": 0, | |
| 179 | + }, | |
| 180 | + ] | |
| 181 | + kg_with_provider.process_screenshots(screenshots) | |
| 182 | + # "A" is too short (< 2 chars), filtered out | |
| 183 | + assert not kg_with_provider._store.has_entity("A") | |
| 184 | + assert kg_with_provider._store.has_entity("Go") | |
| 185 | + assert kg_with_provider._store.has_entity("Python") | |
| 186 | + | |
| 187 | + | |
| 188 | +class TestToDictFromDict: | |
| 189 | + def test_round_trip_empty(self): | |
| 190 | + kg = KnowledgeGraph() | |
| 191 | + data = kg.to_dict() | |
| 192 | + kg2 = KnowledgeGraph.from_dict(data) | |
| 193 | + assert kg2._store.get_entity_count() == 0 | |
| 194 | + assert kg2._store.get_relationship_count() == 0 | |
| 195 | + | |
| 196 | + def test_round_trip_with_entities(self, kg_with_provider, mock_pm): | |
| 197 | + # Add some content to populate the graph | |
| 198 | + kg_with_provider.add_content("Alice uses Python", "test_source") | |
| 199 | + original = kg_with_provider.to_dict() | |
| 200 | + | |
| 201 | + restored = KnowledgeGraph.from_dict(original) | |
| 202 | + restored_dict = restored.to_dict() | |
| 203 | + | |
| 204 | + assert len(restored_dict["nodes"]) == len(original["nodes"]) | |
| 205 | + assert len(restored_dict["relationships"]) == len(original["relationships"]) | |
| 206 | + | |
| 207 | + original_names = {n["name"] for n in original["nodes"]} | |
| 208 | + restored_names = {n["name"] for n in restored_dict["nodes"]} | |
| 209 | + assert original_names == restored_names | |
| 210 | + | |
| 211 | + def test_round_trip_with_sources(self): | |
| 212 | + kg = KnowledgeGraph() | |
| 213 | + kg.register_source( | |
| 214 | + { | |
| 215 | + "source_id": "src1", | |
| 216 | + "source_type": "video", | |
| 217 | + "title": "Test Video", | |
| 218 | + "ingested_at": "2025-01-01T00:00:00", | |
| 219 | + } | |
| 220 | + ) | |
| 221 | + data = kg.to_dict() | |
| 222 | + assert "sources" in data | |
| 223 | + assert data["sources"][0]["source_id"] == "src1" | |
| 224 | + | |
| 225 | + kg2 = KnowledgeGraph.from_dict(data) | |
| 226 | + sources = kg2._store.get_sources() | |
| 227 | + assert len(sources) == 1 | |
| 228 | + assert sources[0]["source_id"] == "src1" | |
| 229 | + | |
| 230 | + def test_from_dict_with_db_path(self, tmp_path): | |
| 231 | + data = { | |
| 232 | + "nodes": [ | |
| 233 | + {"name": "TestEntity", "type": "concept", "descriptions": ["A test"]}, | |
| 234 | + ], | |
| 235 | + "relationships": [], | |
| 236 | + } | |
| 237 | + db_path = tmp_path / "restored.db" | |
| 238 | + kg = KnowledgeGraph.from_dict(data, db_path=db_path) | |
| 239 | + assert kg._store.has_entity("TestEntity") | |
| 240 | + assert db_path.exists() | |
| 241 | + | |
| 242 | + | |
| 243 | +class TestSave: | |
| 244 | + def test_save_json(self, tmp_path, kg_with_provider, mock_pm): | |
| 245 | + kg_with_provider.add_content("Alice uses Python", "source1") | |
| 246 | + path = tmp_path / "graph.json" | |
| 247 | + result = kg_with_provider.save(path) | |
| 248 | + | |
| 249 | + assert result == path | |
| 250 | + assert path.exists() | |
| 251 | + data = json.loads(path.read_text()) | |
| 252 | + assert "nodes" in data | |
| 253 | + assert "relationships" in data | |
| 254 | + | |
| 255 | + def test_save_db(self, tmp_path, kg_with_provider, mock_pm): | |
| 256 | + kg_with_provider.add_content("Alice uses Python", "source1") | |
| 257 | + path = tmp_path / "graph.db" | |
| 258 | + result = kg_with_provider.save(path) | |
| 259 | + | |
| 260 | + assert result == path | |
| 261 | + assert path.exists() | |
| 262 | + | |
| 263 | + def test_save_no_suffix_defaults_to_db(self, tmp_path, kg_with_provider, mock_pm): | |
| 264 | + kg_with_provider.add_content("Alice uses Python", "source1") | |
| 265 | + path = tmp_path / "graph" | |
| 266 | + result = kg_with_provider.save(path) | |
| 267 | + assert result.suffix == ".db" | |
| 268 | + assert result.exists() | |
| 269 | + | |
| 270 | + def test_save_creates_parent_dirs(self, tmp_path, kg_with_provider, mock_pm): | |
| 271 | + kg_with_provider.add_content("Alice uses Python", "source1") | |
| 272 | + path = tmp_path / "nested" / "dir" / "graph.json" | |
| 273 | + result = kg_with_provider.save(path) | |
| 274 | + assert result.exists() | |
| 275 | + | |
| 276 | + def test_save_unknown_suffix_falls_back_to_json(self, tmp_path): | |
| 277 | + kg = KnowledgeGraph() | |
| 278 | + kg._store.merge_entity("TestNode", "concept", ["test"]) | |
| 279 | + path = tmp_path / "graph.xyz" | |
| 280 | + result = kg.save(path) | |
| 281 | + assert result.exists() | |
| 282 | + # Should be valid JSON | |
| 283 | + data = json.loads(path.read_text()) | |
| 284 | + assert "nodes" in data | |
| 285 | + | |
| 286 | + | |
| 287 | +class TestMerge: | |
| 288 | + def test_merge_disjoint(self): | |
| 289 | + kg1 = KnowledgeGraph() | |
| 290 | + kg1._store.merge_entity("Alice", "person", ["Developer"]) | |
| 291 | + | |
| 292 | + kg2 = KnowledgeGraph() | |
| 293 | + kg2._store.merge_entity("Bob", "person", ["Manager"]) | |
| 294 | + | |
| 295 | + kg1.merge(kg2) | |
| 296 | + assert kg1._store.has_entity("Alice") | |
| 297 | + assert kg1._store.has_entity("Bob") | |
| 298 | + assert kg1._store.get_entity_count() == 2 | |
| 299 | + | |
| 300 | + def test_merge_overlapping_entities_descriptions_merged(self): | |
| 301 | + kg1 = KnowledgeGraph() | |
| 302 | + kg1._store.merge_entity("Python", "concept", ["A language"]) | |
| 303 | + | |
| 304 | + kg2 = KnowledgeGraph() | |
| 305 | + kg2._store.merge_entity("Python", "technology", ["Programming language"]) | |
| 306 | + | |
| 307 | + kg1.merge(kg2) | |
| 308 | + entity = kg1._store.get_entity("Python") | |
| 309 | + # Descriptions from both should be present | |
| 310 | + descs = entity["descriptions"] | |
| 311 | + if isinstance(descs, set): | |
| 312 | + descs = list(descs) | |
| 313 | + assert "A language" in descs | |
| 314 | + assert "Programming language" in descs | |
| 315 | + | |
| 316 | + def test_merge_overlapping_entities_with_sqlite(self, tmp_path): | |
| 317 | + """SQLiteStore does update type on merge_entity, so type resolution works there.""" | |
| 318 | + kg1 = KnowledgeGraph(db_path=tmp_path / "kg1.db") | |
| 319 | + kg1._store.merge_entity("Python", "concept", ["A language"]) | |
| 320 | + | |
| 321 | + kg2 = KnowledgeGraph(db_path=tmp_path / "kg2.db") | |
| 322 | + kg2._store.merge_entity("Python", "technology", ["Programming language"]) | |
| 323 | + | |
| 324 | + kg1.merge(kg2) | |
| 325 | + entity = kg1._store.get_entity("Python") | |
| 326 | + # SQLiteStore overwrites type — merge resolves to more specific | |
| 327 | + # (The merge method computes the resolved type and passes it to merge_entity, | |
| 328 | + # but InMemoryStore ignores type for existing entities while SQLiteStore does not) | |
| 329 | + assert entity is not None | |
| 330 | + assert kg1._store.get_entity_count() == 1 | |
| 331 | + | |
| 332 | + def test_merge_fuzzy_match(self): | |
| 333 | + kg1 = KnowledgeGraph() | |
| 334 | + kg1._store.merge_entity("JavaScript", "technology", ["A language"]) | |
| 335 | + | |
| 336 | + kg2 = KnowledgeGraph() | |
| 337 | + kg2._store.merge_entity("Javascript", "technology", ["Web language"]) | |
| 338 | + | |
| 339 | + kg1.merge(kg2) | |
| 340 | + # Should fuzzy-match and merge, not create two entities | |
| 341 | + assert kg1._store.get_entity_count() == 1 | |
| 342 | + entity = kg1._store.get_entity("JavaScript") | |
| 343 | + assert entity is not None | |
| 344 | + | |
| 345 | + def test_merge_relationships(self): | |
| 346 | + kg1 = KnowledgeGraph() | |
| 347 | + kg1._store.merge_entity("Alice", "person", []) | |
| 348 | + | |
| 349 | + kg2 = KnowledgeGraph() | |
| 350 | + kg2._store.merge_entity("Bob", "person", []) | |
| 351 | + kg2._store.add_relationship("Alice", "Bob", "collaborates_with") | |
| 352 | + | |
| 353 | + kg1.merge(kg2) | |
| 354 | + rels = kg1._store.get_all_relationships() | |
| 355 | + assert len(rels) == 1 | |
| 356 | + assert rels[0]["type"] == "collaborates_with" | |
| 357 | + | |
| 358 | + def test_merge_sources(self): | |
| 359 | + kg1 = KnowledgeGraph() | |
| 360 | + kg2 = KnowledgeGraph() | |
| 361 | + kg2.register_source( | |
| 362 | + { | |
| 363 | + "source_id": "vid2", | |
| 364 | + "source_type": "video", | |
| 365 | + "title": "Video 2", | |
| 366 | + "ingested_at": "2025-01-01T00:00:00", | |
| 367 | + } | |
| 368 | + ) | |
| 369 | + kg1.merge(kg2) | |
| 370 | + sources = kg1._store.get_sources() | |
| 371 | + assert len(sources) == 1 | |
| 372 | + assert sources[0]["source_id"] == "vid2" | |
| 373 | + | |
| 374 | + def test_merge_type_specificity_with_sqlite(self, tmp_path): | |
| 375 | + """Type specificity resolution works with SQLiteStore which updates type.""" | |
| 376 | + kg1 = KnowledgeGraph(db_path=tmp_path / "kg1.db") | |
| 377 | + kg1._store.merge_entity("React", "concept", []) | |
| 378 | + | |
| 379 | + kg2 = KnowledgeGraph(db_path=tmp_path / "kg2.db") | |
| 380 | + kg2._store.merge_entity("React", "technology", []) | |
| 381 | + | |
| 382 | + kg1.merge(kg2) | |
| 383 | + entity = kg1._store.get_entity("React") | |
| 384 | + assert entity is not None | |
| 385 | + assert kg1._store.get_entity_count() == 1 | |
| 386 | + | |
| 387 | + | |
| 388 | +class TestRegisterSource: | |
| 389 | + def test_register_and_retrieve(self): | |
| 390 | + kg = KnowledgeGraph() | |
| 391 | + source = { | |
| 392 | + "source_id": "src123", | |
| 393 | + "source_type": "video", | |
| 394 | + "title": "Meeting Recording", | |
| 395 | + "path": "/tmp/meeting.mp4", | |
| 396 | + "ingested_at": "2025-06-01T10:00:00", | |
| 397 | + } | |
| 398 | + kg.register_source(source) | |
| 399 | + sources = kg._store.get_sources() | |
| 400 | + assert len(sources) == 1 | |
| 401 | + assert sources[0]["source_id"] == "src123" | |
| 402 | + assert sources[0]["title"] == "Meeting Recording" | |
| 403 | + | |
| 404 | + def test_register_multiple_sources(self): | |
| 405 | + kg = KnowledgeGraph() | |
| 406 | + for i in range(3): | |
| 407 | + kg.register_source( | |
| 408 | + { | |
| 409 | + "source_id": f"src{i}", | |
| 410 | + "source_type": "video", | |
| 411 | + "title": f"Video {i}", | |
| 412 | + "ingested_at": "2025-01-01", | |
| 413 | + } | |
| 414 | + ) | |
| 415 | + assert len(kg._store.get_sources()) == 3 | |
| 416 | + | |
| 417 | + | |
| 418 | +class TestClassifyForPlanning: | |
| 419 | + @patch("video_processor.integrators.knowledge_graph.TaxonomyClassifier", create=True) | |
| 420 | + def test_classify_calls_taxonomy(self, mock_cls): | |
| 421 | + """classify_for_planning should delegate to TaxonomyClassifier.""" | |
| 422 | + mock_ |
| --- a/tests/test_knowledge_graph.py | |
| +++ b/tests/test_knowledge_graph.py | |
| @@ -0,0 +1,422 @@ | |
| --- a/tests/test_knowledge_graph.py | |
| +++ b/tests/test_knowledge_graph.py | |
| @@ -0,0 +1,422 @@ | |
| 1 | """Tests for the KnowledgeGraph class.""" |
| 2 | |
| 3 | import json |
| 4 | from unittest.mock import MagicMock, patch |
| 5 | |
| 6 | import pytest |
| 7 | |
| 8 | from video_processor.integrators.knowledge_graph import KnowledgeGraph |
| 9 | |
| 10 | |
| 11 | @pytest.fixture |
| 12 | def mock_pm(): |
| 13 | """A mock ProviderManager that returns predictable JSON from chat().""" |
| 14 | pm = MagicMock() |
| 15 | pm.chat.return_value = json.dumps( |
| 16 | { |
| 17 | "entities": [ |
| 18 | {"name": "Python", "type": "technology", "description": "A programming language"}, |
| 19 | {"name": "Alice", "type": "person", "description": "Lead developer"}, |
| 20 | ], |
| 21 | "relationships": [ |
| 22 | {"source": "Alice", "target": "Python", "type": "uses"}, |
| 23 | ], |
| 24 | } |
| 25 | ) |
| 26 | return pm |
| 27 | |
| 28 | |
| 29 | @pytest.fixture |
| 30 | def kg_no_provider(): |
| 31 | """KnowledgeGraph with no provider (in-memory store).""" |
| 32 | return KnowledgeGraph() |
| 33 | |
| 34 | |
| 35 | @pytest.fixture |
| 36 | def kg_with_provider(mock_pm): |
| 37 | """KnowledgeGraph with a mock provider (in-memory store).""" |
| 38 | return KnowledgeGraph(provider_manager=mock_pm) |
| 39 | |
| 40 | |
| 41 | class TestCreation: |
| 42 | def test_create_without_db_path(self): |
| 43 | kg = KnowledgeGraph() |
| 44 | assert kg.pm is None |
| 45 | assert kg._store.get_entity_count() == 0 |
| 46 | assert kg._store.get_relationship_count() == 0 |
| 47 | |
| 48 | def test_create_with_db_path(self, tmp_path): |
| 49 | db_path = tmp_path / "test.db" |
| 50 | kg = KnowledgeGraph(db_path=db_path) |
| 51 | assert kg._store.get_entity_count() == 0 |
| 52 | assert db_path.exists() |
| 53 | |
| 54 | def test_create_with_provider(self, mock_pm): |
| 55 | kg = KnowledgeGraph(provider_manager=mock_pm) |
| 56 | assert kg.pm is mock_pm |
| 57 | |
| 58 | |
| 59 | class TestProcessTranscript: |
| 60 | def test_process_transcript_extracts_entities(self, kg_with_provider, mock_pm): |
| 61 | transcript = { |
| 62 | "segments": [ |
| 63 | {"text": "Alice is using Python for the project", "start": 0.0, "speaker": "Alice"}, |
| 64 | {"text": "It works great for data processing", "start": 5.0}, |
| 65 | ] |
| 66 | } |
| 67 | kg_with_provider.process_transcript(transcript) |
| 68 | |
| 69 | # The mock returns Python and Alice as entities |
| 70 | nodes = kg_with_provider.nodes |
| 71 | assert "Python" in nodes |
| 72 | assert "Alice" in nodes |
| 73 | assert nodes["Python"]["type"] == "technology" |
| 74 | |
| 75 | def test_process_transcript_registers_speakers(self, kg_with_provider): |
| 76 | transcript = { |
| 77 | "segments": [ |
| 78 | {"text": "Hello everyone", "start": 0.0, "speaker": "Bob"}, |
| 79 | ] |
| 80 | } |
| 81 | kg_with_provider.process_transcript(transcript) |
| 82 | assert kg_with_provider._store.has_entity("Bob") |
| 83 | |
| 84 | def test_process_transcript_missing_segments(self, kg_with_provider): |
| 85 | """Should log warning and return without error.""" |
| 86 | kg_with_provider.process_transcript({}) |
| 87 | assert kg_with_provider._store.get_entity_count() == 0 |
| 88 | |
| 89 | def test_process_transcript_empty_text_skipped(self, kg_with_provider, mock_pm): |
| 90 | transcript = { |
| 91 | "segments": [ |
| 92 | {"text": " ", "start": 0.0}, |
| 93 | ] |
| 94 | } |
| 95 | kg_with_provider.process_transcript(transcript) |
| 96 | # chat should not be called for empty batches (speaker registration may still happen) |
| 97 | mock_pm.chat.assert_not_called() |
| 98 | |
| 99 | def test_process_transcript_batching(self, kg_with_provider, mock_pm): |
| 100 | """With batch_size=2, 5 segments should produce 3 batches.""" |
| 101 | segments = [{"text": f"Segment {i}", "start": float(i)} for i in range(5)] |
| 102 | transcript = {"segments": segments} |
| 103 | kg_with_provider.process_transcript(transcript, batch_size=2) |
| 104 | assert mock_pm.chat.call_count == 3 |
| 105 | |
| 106 | |
| 107 | class TestProcessDiagrams: |
| 108 | def test_process_diagrams_with_text(self, kg_with_provider, mock_pm): |
| 109 | diagrams = [ |
| 110 | {"text_content": "Architecture shows Python microservices", "frame_index": 0}, |
| 111 | ] |
| 112 | kg_with_provider.process_diagrams(diagrams) |
| 113 | |
| 114 | # Should have called chat once for the text content |
| 115 | assert mock_pm.chat.call_count == 1 |
| 116 | # diagram_0 entity should exist |
| 117 | assert kg_with_provider._store.has_entity("diagram_0") |
| 118 | |
| 119 | def test_process_diagrams_without_text(self, kg_with_provider, mock_pm): |
| 120 | diagrams = [ |
| 121 | {"text_content": "", "frame_index": 5}, |
| 122 | ] |
| 123 | kg_with_provider.process_diagrams(diagrams) |
| 124 | # No chat call for empty text |
| 125 | mock_pm.chat.assert_not_called() |
| 126 | # But diagram entity still created |
| 127 | assert kg_with_provider._store.has_entity("diagram_0") |
| 128 | |
| 129 | def test_process_multiple_diagrams(self, kg_with_provider, mock_pm): |
| 130 | diagrams = [ |
| 131 | {"text_content": "Diagram A content", "frame_index": 0}, |
| 132 | {"text_content": "Diagram B content", "frame_index": 10}, |
| 133 | ] |
| 134 | kg_with_provider.process_diagrams(diagrams) |
| 135 | assert kg_with_provider._store.has_entity("diagram_0") |
| 136 | assert kg_with_provider._store.has_entity("diagram_1") |
| 137 | |
| 138 | |
| 139 | class Testcess_screenshots(screenshots) |
| 140 | # LLM extraction from text_content |
| 141 | mock_pm.chat.assert_called() |
| 142 | # Explicitly listed entities should be added |
| 143 | assert kg_with_provider._store.has_entity("Flask") |
| 144 | assert kg_with_provider._store.has_entity("Python") |
| 145 | |
| 146 | def test_process_screenshots_without_text(self, kg_with_provider, mock_pm): |
| 147 | screenshots = [ |
| 148 | { |
| 149 | "text_content": "", |
| 150 | "content_type": "other", |
| 151 | "entities": ["Docker"], |
| 152 | "frame_index": 5, |
| 153 | }, |
| 154 | ] |
| 155 | kg_with_provider.process_screenshots(screenshots) |
| 156 | # No chat call for empty text |
| 157 | mock_pm.chat.assert_not_called() |
| 158 | # But explicit entities still added |
| 159 | assert kg_with_provider._store.has_entity("Docker") |
| 160 | |
| 161 | def test_process_screenshots_empty_entities(self, kg_with_provider): |
| 162 | screenshots = [ |
| 163 | { |
| 164 | "text_content": "", |
| 165 | "content_type": "slide", |
| 166 | "entities": [], |
| 167 | "frame_index": 0, |
| 168 | }, |
| 169 | ] |
| 170 | kg_with_provider.process_screenshots(screenshots) |
| 171 | # No crash, no entities added |
| 172 | |
| 173 | def test_process_screenshots_filters_short_names(self, kg_with_provider): |
| 174 | screenshots = [ |
| 175 | { |
| 176 | "text_content": "", |
| 177 | "entities": ["A", "Go", "Python"], |
| 178 | "frame_index": 0, |
| 179 | }, |
| 180 | ] |
| 181 | kg_with_provider.process_screenshots(screenshots) |
| 182 | # "A" is too short (< 2 chars), filtered out |
| 183 | assert not kg_with_provider._store.has_entity("A") |
| 184 | assert kg_with_provider._store.has_entity("Go") |
| 185 | assert kg_with_provider._store.has_entity("Python") |
| 186 | |
| 187 | |
| 188 | class TestToDictFromDict: |
| 189 | def test_round_trip_empty(self): |
| 190 | kg = KnowledgeGraph() |
| 191 | data = kg.to_dict() |
| 192 | kg2 = KnowledgeGraph.from_dict(data) |
| 193 | assert kg2._store.get_entity_count() == 0 |
| 194 | assert kg2._store.get_relationship_count() == 0 |
| 195 | |
| 196 | def test_round_trip_with_entities(self, kg_with_provider, mock_pm): |
| 197 | # Add some content to populate the graph |
| 198 | kg_with_provider.add_content("Alice uses Python", "test_source") |
| 199 | original = kg_with_provider.to_dict() |
| 200 | |
| 201 | restored = KnowledgeGraph.from_dict(original) |
| 202 | restored_dict = restored.to_dict() |
| 203 | |
| 204 | assert len(restored_dict["nodes"]) == len(original["nodes"]) |
| 205 | assert len(restored_dict["relationships"]) == len(original["relationships"]) |
| 206 | |
| 207 | original_names = {n["name"] for n in original["nodes"]} |
| 208 | restored_names = {n["name"] for n in restored_dict["nodes"]} |
| 209 | assert original_names == restored_names |
| 210 | |
| 211 | def test_round_trip_with_sources(self): |
| 212 | kg = KnowledgeGraph() |
| 213 | kg.register_source( |
| 214 | { |
| 215 | "source_id": "src1", |
| 216 | "source_type": "video", |
| 217 | "title": "Test Video", |
| 218 | "ingested_at": "2025-01-01T00:00:00", |
| 219 | } |
| 220 | ) |
| 221 | data = kg.to_dict() |
| 222 | assert "sources" in data |
| 223 | assert data["sources"][0]["source_id"] == "src1" |
| 224 | |
| 225 | kg2 = KnowledgeGraph.from_dict(data) |
| 226 | sources = kg2._store.get_sources() |
| 227 | assert len(sources) == 1 |
| 228 | assert sources[0]["source_id"] == "src1" |
| 229 | |
| 230 | def test_from_dict_with_db_path(self, tmp_path): |
| 231 | data = { |
| 232 | "nodes": [ |
| 233 | {"name": "TestEntity", "type": "concept", "descriptions": ["A test"]}, |
| 234 | ], |
| 235 | "relationships": [], |
| 236 | } |
| 237 | db_path = tmp_path / "restored.db" |
| 238 | kg = KnowledgeGraph.from_dict(data, db_path=db_path) |
| 239 | assert kg._store.has_entity("TestEntity") |
| 240 | assert db_path.exists() |
| 241 | |
| 242 | |
| 243 | class TestSave: |
| 244 | def test_save_json(self, tmp_path, kg_with_provider, mock_pm): |
| 245 | kg_with_provider.add_content("Alice uses Python", "source1") |
| 246 | path = tmp_path / "graph.json" |
| 247 | result = kg_with_provider.save(path) |
| 248 | |
| 249 | assert result == path |
| 250 | assert path.exists() |
| 251 | data = json.loads(path.read_text()) |
| 252 | assert "nodes" in data |
| 253 | assert "relationships" in data |
| 254 | |
| 255 | def test_save_db(self, tmp_path, kg_with_provider, mock_pm): |
| 256 | kg_with_provider.add_content("Alice uses Python", "source1") |
| 257 | path = tmp_path / "graph.db" |
| 258 | result = kg_with_provider.save(path) |
| 259 | |
| 260 | assert result == path |
| 261 | assert path.exists() |
| 262 | |
| 263 | def test_save_no_suffix_defaults_to_db(self, tmp_path, kg_with_provider, mock_pm): |
| 264 | kg_with_provider.add_content("Alice uses Python", "source1") |
| 265 | path = tmp_path / "graph" |
| 266 | result = kg_with_provider.save(path) |
| 267 | assert result.suffix == ".db" |
| 268 | assert result.exists() |
| 269 | |
| 270 | def test_save_creates_parent_dirs(self, tmp_path, kg_with_provider, mock_pm): |
| 271 | kg_with_provider.add_content("Alice uses Python", "source1") |
| 272 | path = tmp_path / "nested" / "dir" / "graph.json" |
| 273 | result = kg_with_provider.save(path) |
| 274 | assert result.exists() |
| 275 | |
| 276 | def test_save_unknown_suffix_falls_back_to_json(self, tmp_path): |
| 277 | kg = KnowledgeGraph() |
| 278 | kg._store.merge_entity("TestNode", "concept", ["test"]) |
| 279 | path = tmp_path / "graph.xyz" |
| 280 | result = kg.save(path) |
| 281 | assert result.exists() |
| 282 | # Should be valid JSON |
| 283 | data = json.loads(path.read_text()) |
| 284 | assert "nodes" in data |
| 285 | |
| 286 | |
| 287 | class TestMerge: |
| 288 | def test_merge_disjoint(self): |
| 289 | kg1 = KnowledgeGraph() |
| 290 | kg1._store.merge_entity("Alice", "person", ["Developer"]) |
| 291 | |
| 292 | kg2 = KnowledgeGraph() |
| 293 | kg2._store.merge_entity("Bob", "person", ["Manager"]) |
| 294 | |
| 295 | kg1.merge(kg2) |
| 296 | assert kg1._store.has_entity("Alice") |
| 297 | assert kg1._store.has_entity("Bob") |
| 298 | assert kg1._store.get_entity_count() == 2 |
| 299 | |
| 300 | def test_merge_overlapping_entities_descriptions_merged(self): |
| 301 | kg1 = KnowledgeGraph() |
| 302 | kg1._store.merge_entity("Python", "concept", ["A language"]) |
| 303 | |
| 304 | kg2 = KnowledgeGraph() |
| 305 | kg2._store.merge_entity("Python", "technology", ["Programming language"]) |
| 306 | |
| 307 | kg1.merge(kg2) |
| 308 | entity = kg1._store.get_entity("Python") |
| 309 | # Descriptions from both should be present |
| 310 | descs = entity["descriptions"] |
| 311 | if isinstance(descs, set): |
| 312 | descs = list(descs) |
| 313 | assert "A language" in descs |
| 314 | assert "Programming language" in descs |
| 315 | |
| 316 | def test_merge_overlapping_entities_with_sqlite(self, tmp_path): |
| 317 | """SQLiteStore does update type on merge_entity, so type resolution works there.""" |
| 318 | kg1 = KnowledgeGraph(db_path=tmp_path / "kg1.db") |
| 319 | kg1._store.merge_entity("Python", "concept", ["A language"]) |
| 320 | |
| 321 | kg2 = KnowledgeGraph(db_path=tmp_path / "kg2.db") |
| 322 | kg2._store.merge_entity("Python", "technology", ["Programming language"]) |
| 323 | |
| 324 | kg1.merge(kg2) |
| 325 | entity = kg1._store.get_entity("Python") |
| 326 | # SQLiteStore overwrites type — merge resolves to more specific |
| 327 | # (The merge method computes the resolved type and passes it to merge_entity, |
| 328 | # but InMemoryStore ignores type for existing entities while SQLiteStore does not) |
| 329 | assert entity is not None |
| 330 | assert kg1._store.get_entity_count() == 1 |
| 331 | |
| 332 | def test_merge_fuzzy_match(self): |
| 333 | kg1 = KnowledgeGraph() |
| 334 | kg1._store.merge_entity("JavaScript", "technology", ["A language"]) |
| 335 | |
| 336 | kg2 = KnowledgeGraph() |
| 337 | kg2._store.merge_entity("Javascript", "technology", ["Web language"]) |
| 338 | |
| 339 | kg1.merge(kg2) |
| 340 | # Should fuzzy-match and merge, not create two entities |
| 341 | assert kg1._store.get_entity_count() == 1 |
| 342 | entity = kg1._store.get_entity("JavaScript") |
| 343 | assert entity is not None |
| 344 | |
| 345 | def test_merge_relationships(self): |
| 346 | kg1 = KnowledgeGraph() |
| 347 | kg1._store.merge_entity("Alice", "person", []) |
| 348 | |
| 349 | kg2 = KnowledgeGraph() |
| 350 | kg2._store.merge_entity("Bob", "person", []) |
| 351 | kg2._store.add_relationship("Alice", "Bob", "collaborates_with") |
| 352 | |
| 353 | kg1.merge(kg2) |
| 354 | rels = kg1._store.get_all_relationships() |
| 355 | assert len(rels) == 1 |
| 356 | assert rels[0]["type"] == "collaborates_with" |
| 357 | |
| 358 | def test_merge_sources(self): |
| 359 | kg1 = KnowledgeGraph() |
| 360 | kg2 = KnowledgeGraph() |
| 361 | kg2.register_source( |
| 362 | { |
| 363 | "source_id": "vid2", |
| 364 | "source_type": "video", |
| 365 | "title": "Video 2", |
| 366 | "ingested_at": "2025-01-01T00:00:00", |
| 367 | } |
| 368 | ) |
| 369 | kg1.merge(kg2) |
| 370 | sources = kg1._store.get_sources() |
| 371 | assert len(sources) == 1 |
| 372 | assert sources[0]["source_id"] == "vid2" |
| 373 | |
| 374 | def test_merge_type_specificity_with_sqlite(self, tmp_path): |
| 375 | """Type specificity resolution works with SQLiteStore which updates type.""" |
| 376 | kg1 = KnowledgeGraph(db_path=tmp_path / "kg1.db") |
| 377 | kg1._store.merge_entity("React", "concept", []) |
| 378 | |
| 379 | kg2 = KnowledgeGraph(db_path=tmp_path / "kg2.db") |
| 380 | kg2._store.merge_entity("React", "technology", []) |
| 381 | |
| 382 | kg1.merge(kg2) |
| 383 | entity = kg1._store.get_entity("React") |
| 384 | assert entity is not None |
| 385 | assert kg1._store.get_entity_count() == 1 |
| 386 | |
| 387 | |
class TestRegisterSource:
    """register_source() persists source metadata retrievable via the store."""

    def test_register_and_retrieve(self):
        kg = KnowledgeGraph()
        record = {
            "source_id": "src123",
            "source_type": "video",
            "title": "Meeting Recording",
            "path": "/tmp/meeting.mp4",
            "ingested_at": "2025-06-01T10:00:00",
        }
        kg.register_source(record)

        stored = kg._store.get_sources()
        assert len(stored) == 1
        assert stored[0]["source_id"] == "src123"
        assert stored[0]["title"] == "Meeting Recording"

    def test_register_multiple_sources(self):
        kg = KnowledgeGraph()
        records = [
            {
                "source_id": f"src{idx}",
                "source_type": "video",
                "title": f"Video {idx}",
                "ingested_at": "2025-01-01",
            }
            for idx in range(3)
        ]
        for record in records:
            kg.register_source(record)
        assert len(kg._store.get_sources()) == 3
| 416 | |
| 417 | |
| 418 | class TestClassifyForPlanning: |
| 419 | @patch("video_processor.integrators.knowledge_graph.TaxonomyClassifier", create=True) |
| 420 | def test_classify_calls_taxonomy(self, mock_cls): |
| 421 | """classify_for_planning should delegate to TaxonomyClassifier.""" |
| 422 | mock_ |
| --- a/tests/test_output_formatter.py | ||
| +++ b/tests/test_output_formatter.py | ||
| @@ -0,0 +1,275 @@ | ||
| 1 | +"""Tests for video_processor.cli.output_formatter.OutputFormatter.""" | |
| 2 | + | |
| 3 | +from pathlib import Path | |
| 4 | + | |
| 5 | +import pytest | |
| 6 | + | |
| 7 | +from video_processor.cli.output_formatter import OutputFormatter | |
| 8 | + | |
| 9 | + | |
| 10 | +@pytest.fixture() | |
| 11 | +def tmp_dir(tmp_path): | |
| 12 | + """Return a fresh temp directory that is cleaned up automatically.""" | |
| 13 | + return tmp_path | |
| 14 | + | |
| 15 | + | |
| 16 | +@pytest.fixture() | |
| 17 | +def formatter(tmp_dir): | |
| 18 | + """Return an OutputFormatter pointed at a temp output directory.""" | |
| 19 | + return OutputFormatter(tmp_dir / "output") | |
| 20 | + | |
| 21 | + | |
| 22 | +# --- Constructor --- | |
| 23 | + | |
| 24 | + | |
| 25 | +def test_constructor_creates_output_dir(tmp_dir): | |
| 26 | + out = tmp_dir / "new_output" | |
| 27 | + assert not out.exists() | |
| 28 | + OutputFormatter(out) | |
| 29 | + assert out.is_dir() | |
| 30 | + | |
| 31 | + | |
| 32 | +def test_constructor_accepts_string(tmp_dir): | |
| 33 | + fmt = OutputFormatter(str(tmp_dir / "str_output")) | |
| 34 | + assert fmt.output_dir.is_dir() | |
| 35 | + | |
| 36 | + | |
| 37 | +# --- organize_outputs --- | |
| 38 | + | |
| 39 | + | |
| 40 | +def _create_file(path: Path, content: str = "test") -> Path: | |
| 41 | + path.parent.mkdir(parents=True, exist_ok=True) | |
| 42 | + path.write_text(content) | |
| 43 | + return path | |
| 44 | + | |
| 45 | + | |
| 46 | +def test_organize_outputs_basic(formatter, tmp_dir): | |
| 47 | + md = _create_file(tmp_dir / "analysis.md", "# Title") | |
| 48 | + kg = _create_file(tmp_dir / "kg.json", "{}") | |
| 49 | + | |
| 50 | + result = formatter.organize_outputs( | |
| 51 | + markdown_path=md, | |
| 52 | + knowledge_graph_path=kg, | |
| 53 | + diagrams=[], | |
| 54 | + ) | |
| 55 | + | |
| 56 | + assert "markdown" in result | |
| 57 | + assert "knowledge_graph" in result | |
| 58 | + assert Path(result["markdown"]).exists() | |
| 59 | + assert Path(result["knowledge_graph"]).exists() | |
| 60 | + assert result["diagram_images"] == [] | |
| 61 | + assert result["frames"] == [] | |
| 62 | + assert result["transcript"] is None | |
| 63 | + | |
| 64 | + | |
| 65 | +def test_organize_outputs_with_transcript(formatter, tmp_dir): | |
| 66 | + md = _create_file(tmp_dir / "analysis.md") | |
| 67 | + kg = _create_file(tmp_dir / "kg.json") | |
| 68 | + transcript = _create_file(tmp_dir / "transcript.txt", "Hello world") | |
| 69 | + | |
| 70 | + result = formatter.organize_outputs( | |
| 71 | + markdown_path=md, | |
| 72 | + knowledge_graph_path=kg, | |
| 73 | + diagrams=[], | |
| 74 | + transcript_path=transcript, | |
| 75 | + ) | |
| 76 | + | |
| 77 | + assert result["transcript"] is not None | |
| 78 | + assert Path(result["transcript"]).exists() | |
| 79 | + | |
| 80 | + | |
| 81 | +def test_organize_outputs_with_diagrams(formatter, tmp_dir): | |
| 82 | + md = _create_file(tmp_dir / "analysis.md") | |
| 83 | + kg = _create_file(tmp_dir / "kg.json") | |
| 84 | + img = _create_file(tmp_dir / "diagram1.png", "fake-png") | |
| 85 | + | |
| 86 | + result = formatter.organize_outputs( | |
| 87 | + markdown_path=md, | |
| 88 | + knowledge_graph_path=kg, | |
| 89 | + diagrams=[{"image_path": str(img)}], | |
| 90 | + ) | |
| 91 | + | |
| 92 | + assert len(result["diagram_images"]) == 1 | |
| 93 | + assert Path(result["diagram_images"][0]).exists() | |
| 94 | + | |
| 95 | + | |
| 96 | +def test_organize_outputs_skips_missing_diagram(formatter, tmp_dir): | |
| 97 | + md = _create_file(tmp_dir / "analysis.md") | |
| 98 | + kg = _create_file(tmp_dir / "kg.json") | |
| 99 | + | |
| 100 | + result = formatter.organize_outputs( | |
| 101 | + markdown_path=md, | |
| 102 | + knowledge_graph_path=kg, | |
| 103 | + diagrams=[{"image_path": "/nonexistent/diagram.png"}], | |
| 104 | + ) | |
| 105 | + | |
| 106 | + assert result["diagram_images"] == [] | |
| 107 | + | |
| 108 | + | |
| 109 | +def test_organize_outputs_diagram_without_image_path(formatter, tmp_dir): | |
| 110 | + md = _create_file(tmp_dir / "analysis.md") | |
| 111 | + kg = _create_file(tmp_dir / "kg.json") | |
| 112 | + | |
| 113 | + result = formatter.organize_outputs( | |
| 114 | + markdown_path=md, | |
| 115 | + knowledge_graph_path=kg, | |
| 116 | + diagrams=[{"description": "A diagram"}], | |
| 117 | + ) | |
| 118 | + | |
| 119 | + assert result["diagram_images"] == [] | |
| 120 | + | |
| 121 | + | |
| 122 | +def test_organize_outputs_with_frames(formatter, tmp_dir): | |
| 123 | + md = _create_file(tmp_dir / "analysis.md") | |
| 124 | + kg = _create_file(tmp_dir / "kg.json") | |
| 125 | + frames_dir = tmp_dir / "frames" | |
| 126 | + frames_dir.mkdir() | |
| 127 | + for i in range(5): | |
| 128 | + _create_file(frames_dir / f"frame_{i:03d}.jpg", f"frame{i}") | |
| 129 | + | |
| 130 | + result = formatter.organize_outputs( | |
| 131 | + markdown_path=md, | |
| 132 | + knowledge_graph_path=kg, | |
| 133 | + diagrams=[], | |
| 134 | + frames_dir=frames_dir, | |
| 135 | + ) | |
| 136 | + | |
| 137 | + assert len(result["frames"]) == 5 | |
| 138 | + | |
| 139 | + | |
| 140 | +def test_organize_outputs_limits_frames_to_10(formatter, tmp_dir): | |
| 141 | + md = _create_file(tmp_dir / "analysis.md") | |
| 142 | + kg = _create_file(tmp_dir / "kg.json") | |
| 143 | + frames_dir = tmp_dir / "frames" | |
| 144 | + frames_dir.mkdir() | |
| 145 | + for i in range(25): | |
| 146 | + _create_file(frames_dir / f"frame_{i:03d}.jpg", f"frame{i}") | |
| 147 | + | |
| 148 | + result = formatter.organize_outputs( | |
| 149 | + markdown_path=md, | |
| 150 | + knowledge_graph_path=kg, | |
| 151 | + diagrams=[], | |
| 152 | + frames_dir=frames_dir, | |
| 153 | + ) | |
| 154 | + | |
| 155 | + assert len(result["frames"]) <= 10 | |
| 156 | + | |
| 157 | + | |
| 158 | +def test_organize_outputs_missing_frames_dir(formatter, tmp_dir): | |
| 159 | + md = _create_file(tmp_dir / "analysis.md") | |
| 160 | + kg = _create_file(tmp_dir / "kg.json") | |
| 161 | + | |
| 162 | + result = formatter.organize_outputs( | |
| 163 | + markdown_path=md, | |
| 164 | + knowledge_graph_path=kg, | |
| 165 | + diagrams=[], | |
| 166 | + frames_dir=tmp_dir / "nonexistent_frames", | |
| 167 | + ) | |
| 168 | + | |
| 169 | + assert result["frames"] == [] | |
| 170 | + | |
| 171 | + | |
| 172 | +# --- create_html_index --- | |
| 173 | + | |
| 174 | + | |
| 175 | +def test_create_html_index_returns_path(formatter, tmp_dir): | |
| 176 | + outputs = { | |
| 177 | + "markdown": str(formatter.output_dir / "markdown" / "analysis.md"), | |
| 178 | + "knowledge_graph": str(formatter.output_dir / "data" / "kg.json"), | |
| 179 | + "diagram_images": [], | |
| 180 | + "frames": [], | |
| 181 | + "transcript": None, | |
| 182 | + } | |
| 183 | + # Create the referenced files so relative_to works | |
| 184 | + for key in ("markdown", "knowledge_graph"): | |
| 185 | + _create_file(Path(outputs[key])) | |
| 186 | + | |
| 187 | + index = formatter.create_html_index(outputs) | |
| 188 | + assert index.exists() | |
| 189 | + assert index.name == "index.html" | |
| 190 | + | |
| 191 | + | |
| 192 | +def test_create_html_index_contains_analysis_link(formatter, tmp_dir): | |
| 193 | + md_path = formatter.output_dir / "markdown" / "analysis.md" | |
| 194 | + _create_file(md_path) | |
| 195 | + outputs = { | |
| 196 | + "markdown": str(md_path), | |
| 197 | + "knowledge_graph": None, | |
| 198 | + "diagram_images": [], | |
| 199 | + "frames": [], | |
| 200 | + "transcript": None, | |
| 201 | + } | |
| 202 | + | |
| 203 | + index = formatter.create_html_index(outputs) | |
| 204 | + content = index.read_text() | |
| 205 | + assert "Analysis Report" in content | |
| 206 | + assert "analysis.md" in content | |
| 207 | + | |
| 208 | + | |
| 209 | +def test_create_html_index_with_diagrams(formatter, tmp_dir): | |
| 210 | + img_path = formatter.output_dir / "diagrams" / "d1.png" | |
| 211 | + _create_file(img_path) | |
| 212 | + outputs = { | |
| 213 | + "markdown": None, | |
| 214 | + "knowledge_graph": None, | |
| 215 | + "diagram_images": [str(img_path)], | |
| 216 | + "frames": [], | |
| 217 | + "transcript": None, | |
| 218 | + } | |
| 219 | + | |
| 220 | + index = formatter.create_html_index(outputs) | |
| 221 | + content = index.read_text() | |
| 222 | + assert "Diagrams" in content | |
| 223 | + assert "d1.png" in content | |
| 224 | + | |
| 225 | + | |
| 226 | +def test_create_html_index_with_frames(formatter, tmp_dir): | |
| 227 | + frame_path = formatter.output_dir / "frames" / "frame_001.jpg" | |
| 228 | + _create_file(frame_path) | |
| 229 | + outputs = { | |
| 230 | + "markdown": None, | |
| 231 | + "knowledge_graph": None, | |
| 232 | + "diagram_images": [], | |
| 233 | + "frames": [str(frame_path)], | |
| 234 | + "transcript": None, | |
| 235 | + } | |
| 236 | + | |
| 237 | + index = formatter.create_html_index(outputs) | |
| 238 | + content = index.read_text() | |
| 239 | + assert "Key Frames" in content | |
| 240 | + assert "frame_001.jpg" in content | |
| 241 | + | |
| 242 | + | |
| 243 | +def test_create_html_index_with_data_files(formatter, tmp_dir): | |
| 244 | + kg_path = formatter.output_dir / "data" / "kg.json" | |
| 245 | + transcript_path = formatter.output_dir / "data" / "transcript.txt" | |
| 246 | + _create_file(kg_path) | |
| 247 | + _create_file(transcript_path) | |
| 248 | + outputs = { | |
| 249 | + "markdown": None, | |
| 250 | + "knowledge_graph": str(kg_path), | |
| 251 | + "diagram_images": [], | |
| 252 | + "frames": [], | |
| 253 | + "transcript": str(transcript_path), | |
| 254 | + } | |
| 255 | + | |
| 256 | + index = formatter.create_html_index(outputs) | |
| 257 | + content = index.read_text() | |
| 258 | + assert "Data Files" in content | |
| 259 | + assert "kg.json" in content | |
| 260 | + assert "transcript.txt" in content | |
| 261 | + | |
| 262 | + | |
| 263 | +def test_create_html_index_empty_outputs(formatter): | |
| 264 | + outputs = { | |
| 265 | + "markdown": None, | |
| 266 | + "knowledge_graph": None, | |
| 267 | + "diagram_images": [], | |
| 268 | + "frames": [], | |
| 269 | + "transcript": None, | |
| 270 | + } | |
| 271 | + | |
| 272 | + index = formatter.create_html_index(outputs) | |
| 273 | + content = index.read_text() | |
| 274 | + assert "PlanOpticon Analysis Results" in content | |
| 275 | + assert "<!DOCTYPE html>" in content |
--- a/tests/test_output_formatter.py
+++ b/tests/test_output_formatter.py
@@ -0,0 +1,275 @@
| 1 | """Tests for video_processor.cli.output_formatter.OutputFormatter.""" |
| 2 | |
| 3 | from pathlib import Path |
| 4 | |
| 5 | import pytest |
| 6 | |
| 7 | from video_processor.cli.output_formatter import OutputFormatter |
| 8 | |
| 9 | |
@pytest.fixture()
def tmp_dir(tmp_path):
    """Alias for pytest's tmp_path: a fresh, auto-cleaned temp directory."""
    return tmp_path
| 14 | |
| 15 | |
@pytest.fixture()
def formatter(tmp_dir):
    """An OutputFormatter rooted at an "output" subdir of the temp directory."""
    return OutputFormatter(tmp_dir / "output")
| 20 | |
| 21 | |
| 22 | # --- Constructor --- |
| 23 | |
| 24 | |
def test_constructor_creates_output_dir(tmp_dir):
    """Constructing the formatter materialises a missing output directory."""
    target = tmp_dir / "new_output"
    assert not target.exists()
    OutputFormatter(target)
    assert target.is_dir()
| 30 | |
| 31 | |
def test_constructor_accepts_string(tmp_dir):
    """A plain string path is accepted and coerced to a real directory."""
    instance = OutputFormatter(str(tmp_dir / "str_output"))
    assert instance.output_dir.is_dir()
| 35 | |
| 36 | |
| 37 | # --- organize_outputs --- |
| 38 | |
| 39 | |
def _create_file(path: Path, content: str = "test") -> Path:
    """Write *content* to *path*, creating any missing parents, and return it."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content)
    return path
| 44 | |
| 45 | |
def test_organize_outputs_basic(formatter, tmp_dir):
    """Markdown and KG files are copied in; optional sections stay empty."""
    report = _create_file(tmp_dir / "analysis.md", "# Title")
    graph = _create_file(tmp_dir / "kg.json", "{}")

    result = formatter.organize_outputs(
        markdown_path=report,
        knowledge_graph_path=graph,
        diagrams=[],
    )

    assert "markdown" in result
    assert "knowledge_graph" in result
    assert Path(result["markdown"]).exists()
    assert Path(result["knowledge_graph"]).exists()
    assert result["diagram_images"] == []
    assert result["frames"] == []
    assert result["transcript"] is None
| 63 | |
| 64 | |
def test_organize_outputs_with_transcript(formatter, tmp_dir):
    """A transcript path, when supplied, appears in the organized outputs."""
    report = _create_file(tmp_dir / "analysis.md")
    graph = _create_file(tmp_dir / "kg.json")
    spoken = _create_file(tmp_dir / "transcript.txt", "Hello world")

    result = formatter.organize_outputs(
        markdown_path=report,
        knowledge_graph_path=graph,
        diagrams=[],
        transcript_path=spoken,
    )

    assert result["transcript"] is not None
    assert Path(result["transcript"]).exists()
| 79 | |
| 80 | |
def test_organize_outputs_with_diagrams(formatter, tmp_dir):
    """Diagram images referenced by path are copied into the output tree."""
    report = _create_file(tmp_dir / "analysis.md")
    graph = _create_file(tmp_dir / "kg.json")
    picture = _create_file(tmp_dir / "diagram1.png", "fake-png")

    result = formatter.organize_outputs(
        markdown_path=report,
        knowledge_graph_path=graph,
        diagrams=[{"image_path": str(picture)}],
    )

    images = result["diagram_images"]
    assert len(images) == 1
    assert Path(images[0]).exists()
| 94 | |
| 95 | |
def test_organize_outputs_skips_missing_diagram(formatter, tmp_dir):
    """Diagram entries pointing at nonexistent files are silently dropped."""
    report = _create_file(tmp_dir / "analysis.md")
    graph = _create_file(tmp_dir / "kg.json")

    result = formatter.organize_outputs(
        markdown_path=report,
        knowledge_graph_path=graph,
        diagrams=[{"image_path": "/nonexistent/diagram.png"}],
    )

    assert result["diagram_images"] == []
| 107 | |
| 108 | |
def test_organize_outputs_diagram_without_image_path(formatter, tmp_dir):
    """Diagram entries lacking an "image_path" key contribute no images."""
    report = _create_file(tmp_dir / "analysis.md")
    graph = _create_file(tmp_dir / "kg.json")

    result = formatter.organize_outputs(
        markdown_path=report,
        knowledge_graph_path=graph,
        diagrams=[{"description": "A diagram"}],
    )

    assert result["diagram_images"] == []
| 120 | |
| 121 | |
def test_organize_outputs_with_frames(formatter, tmp_dir):
    """Frames found in frames_dir are picked up into the outputs."""
    report = _create_file(tmp_dir / "analysis.md")
    graph = _create_file(tmp_dir / "kg.json")
    source_frames = tmp_dir / "frames"
    source_frames.mkdir()
    for idx in range(5):
        _create_file(source_frames / f"frame_{idx:03d}.jpg", f"frame{idx}")

    result = formatter.organize_outputs(
        markdown_path=report,
        knowledge_graph_path=graph,
        diagrams=[],
        frames_dir=source_frames,
    )

    assert len(result["frames"]) == 5
| 138 | |
| 139 | |
def test_organize_outputs_limits_frames_to_10(formatter, tmp_dir):
    """With 25 source frames available, at most 10 are carried over."""
    report = _create_file(tmp_dir / "analysis.md")
    graph = _create_file(tmp_dir / "kg.json")
    source_frames = tmp_dir / "frames"
    source_frames.mkdir()
    for idx in range(25):
        _create_file(source_frames / f"frame_{idx:03d}.jpg", f"frame{idx}")

    result = formatter.organize_outputs(
        markdown_path=report,
        knowledge_graph_path=graph,
        diagrams=[],
        frames_dir=source_frames,
    )

    assert len(result["frames"]) <= 10
| 156 | |
| 157 | |
def test_organize_outputs_missing_frames_dir(formatter, tmp_dir):
    """A nonexistent frames_dir yields an empty frames list, not an error."""
    report = _create_file(tmp_dir / "analysis.md")
    graph = _create_file(tmp_dir / "kg.json")

    result = formatter.organize_outputs(
        markdown_path=report,
        knowledge_graph_path=graph,
        diagrams=[],
        frames_dir=tmp_dir / "nonexistent_frames",
    )

    assert result["frames"] == []
| 170 | |
| 171 | |
| 172 | # --- create_html_index --- |
| 173 | |
| 174 | |
def test_create_html_index_returns_path(formatter, tmp_dir):
    """create_html_index writes index.html inside the output directory."""
    md_file = formatter.output_dir / "markdown" / "analysis.md"
    kg_file = formatter.output_dir / "data" / "kg.json"
    # The referenced files must exist so relative_to() succeeds.
    _create_file(md_file)
    _create_file(kg_file)
    outputs = {
        "markdown": str(md_file),
        "knowledge_graph": str(kg_file),
        "diagram_images": [],
        "frames": [],
        "transcript": None,
    }

    index = formatter.create_html_index(outputs)
    assert index.exists()
    assert index.name == "index.html"
| 190 | |
| 191 | |
def test_create_html_index_contains_analysis_link(formatter, tmp_dir):
    """The index links the markdown analysis report when one is present."""
    md_file = _create_file(formatter.output_dir / "markdown" / "analysis.md")
    outputs = {
        "markdown": str(md_file),
        "knowledge_graph": None,
        "diagram_images": [],
        "frames": [],
        "transcript": None,
    }

    html = formatter.create_html_index(outputs).read_text()
    assert "Analysis Report" in html
    assert "analysis.md" in html
| 207 | |
| 208 | |
def test_create_html_index_with_diagrams(formatter, tmp_dir):
    """Diagram images produce a Diagrams section with each image referenced."""
    img_file = _create_file(formatter.output_dir / "diagrams" / "d1.png")
    outputs = {
        "markdown": None,
        "knowledge_graph": None,
        "diagram_images": [str(img_file)],
        "frames": [],
        "transcript": None,
    }

    html = formatter.create_html_index(outputs).read_text()
    assert "Diagrams" in html
    assert "d1.png" in html
| 224 | |
| 225 | |
def test_create_html_index_with_frames(formatter, tmp_dir):
    """Frame images produce a Key Frames section with each frame referenced."""
    frame_file = _create_file(formatter.output_dir / "frames" / "frame_001.jpg")
    outputs = {
        "markdown": None,
        "knowledge_graph": None,
        "diagram_images": [],
        "frames": [str(frame_file)],
        "transcript": None,
    }

    html = formatter.create_html_index(outputs).read_text()
    assert "Key Frames" in html
    assert "frame_001.jpg" in html
| 241 | |
| 242 | |
def test_create_html_index_with_data_files(formatter, tmp_dir):
    """KG and transcript files show up under a Data Files section."""
    kg_file = _create_file(formatter.output_dir / "data" / "kg.json")
    transcript_file = _create_file(formatter.output_dir / "data" / "transcript.txt")
    outputs = {
        "markdown": None,
        "knowledge_graph": str(kg_file),
        "diagram_images": [],
        "frames": [],
        "transcript": str(transcript_file),
    }

    html = formatter.create_html_index(outputs).read_text()
    assert "Data Files" in html
    assert "kg.json" in html
    assert "transcript.txt" in html
| 261 | |
| 262 | |
def test_create_html_index_empty_outputs(formatter):
    """Even with nothing to link, a valid titled HTML page is produced."""
    empty_outputs = {
        "markdown": None,
        "knowledge_graph": None,
        "diagram_images": [],
        "frames": [],
        "transcript": None,
    }

    html = formatter.create_html_index(empty_outputs).read_text()
    assert "PlanOpticon Analysis Results" in html
    assert "<!DOCTYPE html>" in html
+433
-2
| --- tests/test_pipeline.py | ||
| +++ tests/test_pipeline.py | ||
| @@ -1,11 +1,19 @@ | ||
| 1 | 1 | """Tests for the core video processing pipeline.""" |
| 2 | 2 | |
| 3 | 3 | import json |
| 4 | -from unittest.mock import MagicMock | |
| 4 | +from pathlib import Path | |
| 5 | +from unittest.mock import MagicMock, patch | |
| 5 | 6 | |
| 6 | -from video_processor.pipeline import _extract_action_items, _extract_key_points, _format_srt_time | |
| 7 | +import pytest | |
| 8 | + | |
| 9 | +from video_processor.pipeline import ( | |
| 10 | + _extract_action_items, | |
| 11 | + _extract_key_points, | |
| 12 | + _format_srt_time, | |
| 13 | + process_single_video, | |
| 14 | +) | |
| 7 | 15 | |
| 8 | 16 | |
| 9 | 17 | class TestFormatSrtTime: |
| 10 | 18 | def test_zero(self): |
| 11 | 19 | assert _format_srt_time(0) == "00:00:00,000" |
| @@ -99,5 +107,428 @@ | ||
| 99 | 107 | def test_handles_error(self): |
| 100 | 108 | pm = MagicMock() |
| 101 | 109 | pm.chat.side_effect = Exception("API down") |
| 102 | 110 | result = _extract_action_items(pm, "text") |
| 103 | 111 | assert result == [] |
| 112 | + | |
| 113 | + | |
| 114 | +# --------------------------------------------------------------------------- | |
| 115 | +# process_single_video tests (heavily mocked) | |
| 116 | +# --------------------------------------------------------------------------- | |
| 117 | + | |
| 118 | + | |
def _make_mock_pm():
    """Build a mock ProviderManager with a usage tracker and canned responses."""
    pm = MagicMock()

    # Usage-tracker stub: the pipeline brackets each phase with start/end calls.
    pm.usage = MagicMock()
    pm.usage.start_step = MagicMock()
    pm.usage.end_step = MagicMock()

    # Canned transcription result.
    pm.transcribe_audio.return_value = {
        "text": "Alice discussed the Python deployment strategy with Bob.",
        "segments": [
            {"start": 0.0, "end": 5.0, "text": "Alice discussed the Python deployment strategy."},
            {"start": 5.0, "end": 10.0, "text": "Bob agreed on the timeline."},
        ],
        "duration": 10.0,
        "language": "en",
        "provider": "mock",
        "model": "mock-whisper",
    }

    # Pre-serialized chat replies, one per prompt family.
    key_points_reply = json.dumps(
        [{"point": "Deployment strategy discussed", "topic": "DevOps", "details": "Python"}]
    )
    action_items_reply = json.dumps(
        [{"action": "Deploy to production", "assignee": "Bob", "priority": "high"}]
    )
    entities_reply = json.dumps(
        {
            "entities": [
                {"name": "Python", "type": "technology", "description": "Programming language"},
                {"name": "Alice", "type": "person", "description": "Engineer"},
            ],
            "relationships": [
                {"source": "Alice", "target": "Python", "type": "uses"},
            ],
        }
    )

    def _chat_side_effect(messages, **kwargs):
        # Route on the prompt text: key points / action items / entity extraction.
        prompt = messages[0]["content"].lower() if messages else ""
        if "key points" in prompt:
            return key_points_reply
        if "action items" in prompt:
            return action_items_reply
        return entities_reply

    pm.chat.side_effect = _chat_side_effect
    pm.get_models_used.return_value = {"chat": "mock-gpt", "transcription": "mock-whisper"}
    return pm
| 168 | + | |
| 169 | + | |
def _make_tqdm_passthrough(mock_tqdm):
    """Make the mocked tqdm yield its iterable unchanged while still tolerating
    progress-bar method calls such as .set_description()."""

    def _passthrough(iterable, **kwargs):
        bar = MagicMock()
        # Delegate iteration to the wrapped iterable; all other attribute
        # access falls through to MagicMock and is silently accepted.
        bar.__iter__ = lambda self: iter(iterable)
        return bar

    mock_tqdm.side_effect = _passthrough
| 179 | + | |
| 180 | + | |
def _create_fake_video(path: Path) -> Path:
    """Write a tiny placeholder file standing in for a video (extractors are mocked)."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(bytes(64))  # 64 zero bytes — content is never read
    return path
| 186 | + | |
| 187 | + | |
| 188 | +class TestProcessSingleVideo: | |
| 189 | + """Integration-level tests for process_single_video with heavy mocking.""" | |
| 190 | + | |
| 191 | + @pytest.fixture | |
| 192 | + def setup(self, tmp_path): | |
| 193 | + """Create fake video, output dir, and mock PM.""" | |
| 194 | + video_path = _create_fake_video(tmp_path / "input" / "meeting.mp4") | |
| 195 | + output_dir = tmp_path / "output" | |
| 196 | + pm = _make_mock_pm() | |
| 197 | + return video_path, output_dir, pm | |
| 198 | + | |
| 199 | + @patch("video_processor.pipeline.export_all_formats") | |
| 200 | + @patch("video_processor.pipeline.PlanGenerator") | |
| 201 | + @patch("video_processor.pipeline.DiagramAnalyzer") | |
| 202 | + @patch("video_processor.pipeline.AudioExtractor") | |
| 203 | + @patch("video_processor.pipeline.filter_people_frames") | |
| 204 | + @patch("video_processor.pipeline.save_frames") | |
| 205 | + @patch("video_processor.pipeline.extract_frames") | |
| 206 | + @patch("video_processor.pipeline.tqdm") | |
| 207 | + def test_returns_manifest( | |
| 208 | + self, | |
| 209 | + mock_tqdm, | |
| 210 | + mock_extract_frames, | |
| 211 | + mock_save_frames, | |
| 212 | + mock_filter_people, | |
| 213 | + mock_audio_extractor_cls, | |
| 214 | + mock_diagram_analyzer_cls, | |
| 215 | + mock_plan_gen_cls, | |
| 216 | + mock_export, | |
| 217 | + setup, | |
| 218 | + ): | |
| 219 | + video_path, output_dir, pm = setup | |
| 220 | + | |
| 221 | + # tqdm pass-through | |
| 222 | + _make_tqdm_passthrough(mock_tqdm) | |
| 223 | + | |
| 224 | + # Frame extraction mocks | |
| 225 | + mock_extract_frames.return_value = [b"fake_frame_1", b"fake_frame_2"] | |
| 226 | + mock_filter_people.return_value = ([b"fake_frame_1", b"fake_frame_2"], 0) | |
| 227 | + | |
| 228 | + frames_dir = output_dir / "frames" | |
| 229 | + frames_dir.mkdir(parents=True, exist_ok=True) | |
| 230 | + frame_paths = [] | |
| 231 | + for i in range(2): | |
| 232 | + fp = frames_dir / f"frame_{i:04d}.jpg" | |
| 233 | + fp.write_bytes(b"\xff") | |
| 234 | + frame_paths.append(fp) | |
| 235 | + mock_save_frames.return_value = frame_paths | |
| 236 | + | |
| 237 | + # Audio extractor mock | |
| 238 | + audio_ext = MagicMock() | |
| 239 | + audio_ext.extract_audio.return_value = output_dir / "audio" / "meeting.wav" | |
| 240 | + audio_ext.get_audio_properties.return_value = {"duration": 10.0} | |
| 241 | + mock_audio_extractor_cls.return_value = audio_ext | |
| 242 | + | |
| 243 | + # Diagram analyzer mock | |
| 244 | + diag_analyzer = MagicMock() | |
| 245 | + diag_analyzer.process_frames.return_value = ([], []) | |
| 246 | + mock_diagram_analyzer_cls.return_value = diag_analyzer | |
| 247 | + | |
| 248 | + # Plan generator mock | |
| 249 | + plan_gen = MagicMock() | |
| 250 | + mock_plan_gen_cls.return_value = plan_gen | |
| 251 | + | |
| 252 | + # export_all_formats returns the manifest it receives | |
| 253 | + mock_export.side_effect = lambda out_dir, manifest: manifest | |
| 254 | + | |
| 255 | + manifest = process_single_video( | |
| 256 | + input_path=video_path, | |
| 257 | + output_dir=output_dir, | |
| 258 | + provider_manager=pm, | |
| 259 | + depth="standard", | |
| 260 | + ) | |
| 261 | + | |
| 262 | + from video_processor.models import VideoManifest | |
| 263 | + | |
| 264 | + assert isinstance(manifest, VideoManifest) | |
| 265 | + assert manifest.video.title == "Analysis of meeting" | |
| 266 | + assert manifest.stats.frames_extracted == 2 | |
| 267 | + assert manifest.transcript_json == "transcript/transcript.json" | |
| 268 | + assert manifest.knowledge_graph_json == "results/knowledge_graph.json" | |
| 269 | + | |
| 270 | + @patch("video_processor.pipeline.export_all_formats") | |
| 271 | + @patch("video_processor.pipeline.PlanGenerator") | |
| 272 | + @patch("video_processor.pipeline.DiagramAnalyzer") | |
| 273 | + @patch("video_processor.pipeline.AudioExtractor") | |
| 274 | + @patch("video_processor.pipeline.filter_people_frames") | |
| 275 | + @patch("video_processor.pipeline.save_frames") | |
| 276 | + @patch("video_processor.pipeline.extract_frames") | |
| 277 | + @patch("video_processor.pipeline.tqdm") | |
| 278 | + def test_creates_output_directories( | |
| 279 | + self, | |
| 280 | + mock_tqdm, | |
| 281 | + mock_extract_frames, | |
| 282 | + mock_save_frames, | |
| 283 | + mock_filter_people, | |
| 284 | + mock_audio_extractor_cls, | |
| 285 | + mock_diagram_analyzer_cls, | |
| 286 | + mock_plan_gen_cls, | |
| 287 | + mock_export, | |
| 288 | + setup, | |
| 289 | + ): | |
| 290 | + video_path, output_dir, pm = setup | |
| 291 | + | |
| 292 | + _make_tqdm_passthrough(mock_tqdm) | |
| 293 | + mock_extract_frames.return_value = [] | |
| 294 | + mock_filter_people.return_value = ([], 0) | |
| 295 | + mock_save_frames.return_value = [] | |
| 296 | + | |
| 297 | + audio_ext = MagicMock() | |
| 298 | + audio_ext.extract_audio.return_value = output_dir / "audio" / "meeting.wav" | |
| 299 | + audio_ext.get_audio_properties.return_value = {"duration": 5.0} | |
| 300 | + mock_audio_extractor_cls.return_value = audio_ext | |
| 301 | + | |
| 302 | + diag_analyzer = MagicMock() | |
| 303 | + diag_analyzer.process_frames.return_value = ([], []) | |
| 304 | + mock_diagram_analyzer_cls.return_value = diag_analyzer | |
| 305 | + | |
| 306 | + plan_gen = MagicMock() | |
| 307 | + mock_plan_gen_cls.return_value = plan_gen | |
| 308 | + | |
| 309 | + mock_export.side_effect = lambda out_dir, manifest: manifest | |
| 310 | + | |
| 311 | + process_single_video( | |
| 312 | + input_path=video_path, | |
| 313 | + output_dir=output_dir, | |
| 314 | + provider_manager=pm, | |
| 315 | + ) | |
| 316 | + | |
| 317 | + # Verify standard output directories were created | |
| 318 | + assert (output_dir / "transcript").is_dir() | |
| 319 | + assert (output_dir / "frames").is_dir() | |
| 320 | + assert (output_dir / "results").is_dir() | |
| 321 | + | |
| 322 | + @patch("video_processor.pipeline.export_all_formats") | |
| 323 | + @patch("video_processor.pipeline.PlanGenerator") | |
| 324 | + @patch("video_processor.pipeline.DiagramAnalyzer") | |
| 325 | + @patch("video_processor.pipeline.AudioExtractor") | |
| 326 | + @patch("video_processor.pipeline.filter_people_frames") | |
| 327 | + @patch("video_processor.pipeline.save_frames") | |
| 328 | + @patch("video_processor.pipeline.extract_frames") | |
| 329 | + @patch("video_processor.pipeline.tqdm") | |
| 330 | + def test_resume_existing_frames( | |
| 331 | + self, | |
| 332 | + mock_tqdm, | |
| 333 | + mock_extract_frames, | |
| 334 | + mock_save_frames, | |
| 335 | + mock_filter_people, | |
| 336 | + mock_audio_extractor_cls, | |
| 337 | + mock_diagram_analyzer_cls, | |
| 338 | + mock_plan_gen_cls, | |
| 339 | + mock_export, | |
| 340 | + setup, | |
| 341 | + ): | |
| 342 | + """When frames already exist on disk, extraction should be skipped.""" | |
| 343 | + video_path, output_dir, pm = setup | |
| 344 | + | |
| 345 | + _make_tqdm_passthrough(mock_tqdm) | |
| 346 | + | |
| 347 | + # Pre-create frames directory with existing frames | |
| 348 | + frames_dir = output_dir / "frames" | |
| 349 | + frames_dir.mkdir(parents=True, exist_ok=True) | |
| 350 | + for i in range(3): | |
| 351 | + (frames_dir / f"frame_{i:04d}.jpg").write_bytes(b"\xff") | |
| 352 | + | |
| 353 | + audio_ext = MagicMock() | |
| 354 | + audio_ext.extract_audio.return_value = output_dir / "audio" / "meeting.wav" | |
| 355 | + audio_ext.get_audio_properties.return_value = {"duration": 10.0} | |
| 356 | + mock_audio_extractor_cls.return_value = audio_ext | |
| 357 | + | |
| 358 | + diag_analyzer = MagicMock() | |
| 359 | + diag_analyzer.process_frames.return_value = ([], []) | |
| 360 | + mock_diagram_analyzer_cls.return_value = diag_analyzer | |
| 361 | + | |
| 362 | + plan_gen = MagicMock() | |
| 363 | + mock_plan_gen_cls.return_value = plan_gen | |
| 364 | + mock_export.side_effect = lambda out_dir, manifest: manifest | |
| 365 | + | |
| 366 | + manifest = process_single_video( | |
| 367 | + input_path=video_path, | |
| 368 | + output_dir=output_dir, | |
| 369 | + provider_manager=pm, | |
| 370 | + ) | |
| 371 | + | |
| 372 | + # extract_frames should NOT have been called (resume path) | |
| 373 | + mock_extract_frames.assert_not_called() | |
| 374 | + assert manifest.stats.frames_extracted == 3 | |
| 375 | + | |
| 376 | + @patch("video_processor.pipeline.export_all_formats") | |
| 377 | + @patch("video_processor.pipeline.PlanGenerator") | |
| 378 | + @patch("video_processor.pipeline.DiagramAnalyzer") | |
| 379 | + @patch("video_processor.pipeline.AudioExtractor") | |
| 380 | + @patch("video_processor.pipeline.filter_people_frames") | |
| 381 | + @patch("video_processor.pipeline.save_frames") | |
| 382 | + @patch("video_processor.pipeline.extract_frames") | |
| 383 | + @patch("video_processor.pipeline.tqdm") | |
| 384 | + def test_resume_existing_transcript( | |
| 385 | + self, | |
| 386 | + mock_tqdm, | |
| 387 | + mock_extract_frames, | |
| 388 | + mock_save_frames, | |
| 389 | + mock_filter_people, | |
| 390 | + mock_audio_extractor_cls, | |
| 391 | + mock_diagram_analyzer_cls, | |
| 392 | + mock_plan_gen_cls, | |
| 393 | + mock_export, | |
| 394 | + setup, | |
| 395 | + ): | |
| 396 | + """When transcript exists on disk, transcription should be skipped.""" | |
| 397 | + video_path, output_dir, pm = setup | |
| 398 | + | |
| 399 | + _make_tqdm_passthrough(mock_tqdm) | |
| 400 | + mock_extract_frames.return_value = [] | |
| 401 | + mock_filter_people.return_value = ([], 0) | |
| 402 | + mock_save_frames.return_value = [] | |
| 403 | + | |
| 404 | + audio_ext = MagicMock() | |
| 405 | + audio_ext.extract_audio.return_value = output_dir / "audio" / "meeting.wav" | |
| 406 | + audio_ext.get_audio_properties.return_value = {"duration": 10.0} | |
| 407 | + mock_audio_extractor_cls.return_value = audio_ext | |
| 408 | + | |
| 409 | + # Pre-create transcript file | |
| 410 | + transcript_dir = output_dir / "transcript" | |
| 411 | + transcript_dir.mkdir(parents=True, exist_ok=True) | |
| 412 | + transcript_data = { | |
| 413 | + "text": "Pre-existing transcript text.", | |
| 414 | + "segments": [{"start": 0.0, "end": 5.0, "text": "Pre-existing transcript text."}], | |
| 415 | + "duration": 5.0, | |
| 416 | + } | |
| 417 | + (transcript_dir / "transcript.json").write_text(json.dumps(transcript_data)) | |
| 418 | + | |
| 419 | + diag_analyzer = MagicMock() | |
| 420 | + diag_analyzer.process_frames.return_value = ([], []) | |
| 421 | + mock_diagram_analyzer_cls.return_value = diag_analyzer | |
| 422 | + | |
| 423 | + plan_gen = MagicMock() | |
| 424 | + mock_plan_gen_cls.return_value = plan_gen | |
| 425 | + mock_export.side_effect = lambda out_dir, manifest: manifest | |
| 426 | + | |
| 427 | + process_single_video( | |
| 428 | + input_path=video_path, | |
| 429 | + output_dir=output_dir, | |
| 430 | + provider_manager=pm, | |
| 431 | + ) | |
| 432 | + | |
| 433 | + # transcribe_audio should NOT have been called (resume path) | |
| 434 | + pm.transcribe_audio.assert_not_called() | |
| 435 | + | |
| 436 | + @patch("video_processor.pipeline.export_all_formats") | |
| 437 | + @patch("video_processor.pipeline.PlanGenerator") | |
| 438 | + @patch("video_processor.pipeline.DiagramAnalyzer") | |
| 439 | + @patch("video_processor.pipeline.AudioExtractor") | |
| 440 | + @patch("video_processor.pipeline.filter_people_frames") | |
| 441 | + @patch("video_processor.pipeline.save_frames") | |
| 442 | + @patch("video_processor.pipeline.extract_frames") | |
| 443 | + @patch("video_processor.pipeline.tqdm") | |
| 444 | + def test_custom_title( | |
| 445 | + self, | |
| 446 | + mock_tqdm, | |
| 447 | + mock_extract_frames, | |
| 448 | + mock_save_frames, | |
| 449 | + mock_filter_people, | |
| 450 | + mock_audio_extractor_cls, | |
| 451 | + mock_diagram_analyzer_cls, | |
| 452 | + mock_plan_gen_cls, | |
| 453 | + mock_export, | |
| 454 | + setup, | |
| 455 | + ): | |
| 456 | + video_path, output_dir, pm = setup | |
| 457 | + | |
| 458 | + _make_tqdm_passthrough(mock_tqdm) | |
| 459 | + mock_extract_frames.return_value = [] | |
| 460 | + mock_filter_people.return_value = ([], 0) | |
| 461 | + mock_save_frames.return_value = [] | |
| 462 | + | |
| 463 | + audio_ext = MagicMock() | |
| 464 | + audio_ext.extract_audio.return_value = output_dir / "audio" / "meeting.wav" | |
| 465 | + audio_ext.get_audio_properties.return_value = {"duration": 5.0} | |
| 466 | + mock_audio_extractor_cls.return_value = audio_ext | |
| 467 | + | |
| 468 | + diag_analyzer = MagicMock() | |
| 469 | + diag_analyzer.process_frames.return_value = ([], []) | |
| 470 | + mock_diagram_analyzer_cls.return_value = diag_analyzer | |
| 471 | + | |
| 472 | + plan_gen = MagicMock() | |
| 473 | + mock_plan_gen_cls.return_value = plan_gen | |
| 474 | + mock_export.side_effect = lambda out_dir, manifest: manifest | |
| 475 | + | |
| 476 | + manifest = process_single_video( | |
| 477 | + input_path=video_path, | |
| 478 | + output_dir=output_dir, | |
| 479 | + provider_manager=pm, | |
| 480 | + title="My Custom Title", | |
| 481 | + ) | |
| 482 | + | |
| 483 | + assert manifest.video.title == "My Custom Title" | |
| 484 | + | |
| 485 | + @patch("video_processor.pipeline.export_all_formats") | |
| 486 | + @patch("video_processor.pipeline.PlanGenerator") | |
| 487 | + @patch("video_processor.pipeline.DiagramAnalyzer") | |
| 488 | + @patch("video_processor.pipeline.AudioExtractor") | |
| 489 | + @patch("video_processor.pipeline.filter_people_frames") | |
| 490 | + @patch("video_processor.pipeline.save_frames") | |
| 491 | + @patch("video_processor.pipeline.extract_frames") | |
| 492 | + @patch("video_processor.pipeline.tqdm") | |
| 493 | + def test_key_points_and_action_items_extracted( | |
| 494 | + self, | |
| 495 | + mock_tqdm, | |
| 496 | + mock_extract_frames, | |
| 497 | + mock_save_frames, | |
| 498 | + mock_filter_people, | |
| 499 | + mock_audio_extractor_cls, | |
| 500 | + mock_diagram_analyzer_cls, | |
| 501 | + mock_plan_gen_cls, | |
| 502 | + mock_export, | |
| 503 | + setup, | |
| 504 | + ): | |
| 505 | + video_path, output_dir, pm = setup | |
| 506 | + | |
| 507 | + _make_tqdm_passthrough(mock_tqdm) | |
| 508 | + mock_extract_frames.return_value = [] | |
| 509 | + mock_filter_people.return_value = ([], 0) | |
| 510 | + mock_save_frames.return_value = [] | |
| 511 | + | |
| 512 | + audio_ext = MagicMock() | |
| 513 | + audio_ext.extract_audio.return_value = output_dir / "audio" / "meeting.wav" | |
| 514 | + audio_ext.get_audio_properties.return_value = {"duration": 10.0} | |
| 515 | + mock_audio_extractor_cls.return_value = audio_ext | |
| 516 | + | |
| 517 | + diag_analyzer = MagicMock() | |
| 518 | + diag_analyzer.process_frames.return_value = ([], []) | |
| 519 | + mock_diagram_analyzer_cls.return_value = diag_analyzer | |
| 520 | + | |
| 521 | + plan_gen = MagicMock() | |
| 522 | + mock_plan_gen_cls.return_value = plan_gen | |
| 523 | + mock_export.side_effect = lambda out_dir, manifest: manifest | |
| 524 | + | |
| 525 | + manifest = process_single_video( | |
| 526 | + input_path=video_path, | |
| 527 | + output_dir=output_dir, | |
| 528 | + provider_manager=pm, | |
| 529 | + ) | |
| 530 | + | |
| 531 | + assert len(manifest.key_points) == 1 | |
| 532 | + assert manifest.key_points[0].point == "Deployment strategy discussed" | |
| 533 | + assert len(manifest.action_items) == 1 | |
| 534 | + assert manifest.action_items[0].action == "Deploy to production" | |
| 104 | 535 | |
| 105 | 536 | ADDED tests/test_processors.py |
| --- tests/test_pipeline.py | |
| +++ tests/test_pipeline.py | |
| @@ -1,11 +1,19 @@ | |
| 1 | """Tests for the core video processing pipeline.""" |
| 2 | |
| 3 | import json |
| 4 | from unittest.mock import MagicMock |
| 5 | |
| 6 | from video_processor.pipeline import _extract_action_items, _extract_key_points, _format_srt_time |
| 7 | |
| 8 | |
| 9 | class TestFormatSrtTime: |
| 10 | def test_zero(self): |
| 11 | assert _format_srt_time(0) == "00:00:00,000" |
| @@ -99,5 +107,428 @@ | |
| 99 | def test_handles_error(self): |
| 100 | pm = MagicMock() |
| 101 | pm.chat.side_effect = Exception("API down") |
| 102 | result = _extract_action_items(pm, "text") |
| 103 | assert result == [] |
| 104 | |
| 105 | ADDED tests/test_processors.py |
| --- tests/test_pipeline.py | |
| +++ tests/test_pipeline.py | |
| @@ -1,11 +1,19 @@ | |
| 1 | """Tests for the core video processing pipeline.""" |
| 2 | |
| 3 | import json |
| 4 | from pathlib import Path |
| 5 | from unittest.mock import MagicMock, patch |
| 6 | |
| 7 | import pytest |
| 8 | |
| 9 | from video_processor.pipeline import ( |
| 10 | _extract_action_items, |
| 11 | _extract_key_points, |
| 12 | _format_srt_time, |
| 13 | process_single_video, |
| 14 | ) |
| 15 | |
| 16 | |
| 17 | class TestFormatSrtTime: |
| 18 | def test_zero(self): |
| 19 | assert _format_srt_time(0) == "00:00:00,000" |
| @@ -99,5 +107,428 @@ | |
| 107 | def test_handles_error(self): |
| 108 | pm = MagicMock() |
| 109 | pm.chat.side_effect = Exception("API down") |
| 110 | result = _extract_action_items(pm, "text") |
| 111 | assert result == [] |
| 112 | |
| 113 | |
| 114 | # --------------------------------------------------------------------------- |
| 115 | # process_single_video tests (heavily mocked) |
| 116 | # --------------------------------------------------------------------------- |
| 117 | |
| 118 | |
| 119 | def _make_mock_pm(): |
| 120 | """Build a mock ProviderManager with usage tracker and predictable responses.""" |
| 121 | pm = MagicMock() |
| 122 | |
| 123 | # Usage tracker stub |
| 124 | pm.usage = MagicMock() |
| 125 | pm.usage.start_step = MagicMock() |
| 126 | pm.usage.end_step = MagicMock() |
| 127 | |
| 128 | # transcribe_audio returns a simple transcript |
| 129 | pm.transcribe_audio.return_value = { |
| 130 | "text": "Alice discussed the Python deployment strategy with Bob.", |
| 131 | "segments": [ |
| 132 | {"start": 0.0, "end": 5.0, "text": "Alice discussed the Python deployment strategy."}, |
| 133 | {"start": 5.0, "end": 10.0, "text": "Bob agreed on the timeline."}, |
| 134 | ], |
| 135 | "duration": 10.0, |
| 136 | "language": "en", |
| 137 | "provider": "mock", |
| 138 | "model": "mock-whisper", |
| 139 | } |
| 140 | |
| 141 | # chat returns predictable JSON depending on the call |
| 142 | def _chat_side_effect(messages, **kwargs): |
| 143 | content = messages[0]["content"] if messages else "" |
| 144 | if "key points" in content.lower(): |
| 145 | return json.dumps( |
| 146 | [{"point": "Deployment strategy discussed", "topic": "DevOps", "details": "Python"}] |
| 147 | ) |
| 148 | if "action items" in content.lower(): |
| 149 | return json.dumps( |
| 150 | [{"action": "Deploy to production", "assignee": "Bob", "priority": "high"}] |
| 151 | ) |
| 152 | # Default: entity extraction for knowledge graph |
| 153 | return json.dumps( |
| 154 | { |
| 155 | "entities": [ |
| 156 | {"name": "Python", "type": "technology", "description": "Programming language"}, |
| 157 | {"name": "Alice", "type": "person", "description": "Engineer"}, |
| 158 | ], |
| 159 | "relationships": [ |
| 160 | {"source": "Alice", "target": "Python", "type": "uses"}, |
| 161 | ], |
| 162 | } |
| 163 | ) |
| 164 | |
| 165 | pm.chat.side_effect = _chat_side_effect |
| 166 | pm.get_models_used.return_value = {"chat": "mock-gpt", "transcription": "mock-whisper"} |
| 167 | return pm |
| 168 | |
| 169 | |
| 170 | def _make_tqdm_passthrough(mock_tqdm): |
| 171 | """Configure mock tqdm to pass through iterables while supporting .set_description() etc.""" |
| 172 | |
| 173 | def _tqdm_side_effect(iterable, **kw): |
| 174 | wrapper = MagicMock() |
| 175 | wrapper.__iter__ = lambda self: iter(iterable) |
| 176 | return wrapper |
| 177 | |
| 178 | mock_tqdm.side_effect = _tqdm_side_effect |
| 179 | |
| 180 | |
| 181 | def _create_fake_video(path: Path) -> Path: |
| 182 | """Create a tiny file that stands in for a video (all extractors are mocked).""" |
| 183 | path.parent.mkdir(parents=True, exist_ok=True) |
| 184 | path.write_bytes(b"\x00" * 64) |
| 185 | return path |
| 186 | |
| 187 | |
class TestProcessSingleVideo:
    """Integration-level tests for process_single_video with heavy mocking.

    Every test patches the same set of pipeline collaborators in the same
    decorator order, so the positional mock arguments always line up.  The
    identical mock wiring that was previously copy-pasted into every test now
    lives in the private ``_stub_*`` helpers below.
    """

    @pytest.fixture
    def setup(self, tmp_path):
        """Create fake video, output dir, and mock PM."""
        video_path = _create_fake_video(tmp_path / "input" / "meeting.mp4")
        output_dir = tmp_path / "output"
        pm = _make_mock_pm()
        return video_path, output_dir, pm

    @staticmethod
    def _stub_no_frames(mock_extract_frames, mock_filter_people, mock_save_frames):
        """Make frame extraction, people-filtering, and saving all yield nothing."""
        mock_extract_frames.return_value = []
        mock_filter_people.return_value = ([], 0)
        mock_save_frames.return_value = []

    @staticmethod
    def _stub_support_mocks(
        output_dir,
        mock_audio_extractor_cls,
        mock_diagram_analyzer_cls,
        mock_plan_gen_cls,
        mock_export,
        duration,
    ):
        """Wire the audio/diagram/plan/export mocks shared by every test.

        ``duration`` is the clip length (seconds) the audio extractor reports.
        """
        # Audio extractor mock
        audio_ext = MagicMock()
        audio_ext.extract_audio.return_value = output_dir / "audio" / "meeting.wav"
        audio_ext.get_audio_properties.return_value = {"duration": duration}
        mock_audio_extractor_cls.return_value = audio_ext

        # Diagram analyzer mock: no diagrams found
        diag_analyzer = MagicMock()
        diag_analyzer.process_frames.return_value = ([], [])
        mock_diagram_analyzer_cls.return_value = diag_analyzer

        # Plan generator mock (inert)
        mock_plan_gen_cls.return_value = MagicMock()

        # export_all_formats returns the manifest it receives
        mock_export.side_effect = lambda out_dir, manifest: manifest

    @patch("video_processor.pipeline.export_all_formats")
    @patch("video_processor.pipeline.PlanGenerator")
    @patch("video_processor.pipeline.DiagramAnalyzer")
    @patch("video_processor.pipeline.AudioExtractor")
    @patch("video_processor.pipeline.filter_people_frames")
    @patch("video_processor.pipeline.save_frames")
    @patch("video_processor.pipeline.extract_frames")
    @patch("video_processor.pipeline.tqdm")
    def test_returns_manifest(
        self,
        mock_tqdm,
        mock_extract_frames,
        mock_save_frames,
        mock_filter_people,
        mock_audio_extractor_cls,
        mock_diagram_analyzer_cls,
        mock_plan_gen_cls,
        mock_export,
        setup,
    ):
        """A full run returns a populated VideoManifest."""
        video_path, output_dir, pm = setup
        _make_tqdm_passthrough(mock_tqdm)

        # Two frames survive extraction and people-filtering.
        mock_extract_frames.return_value = [b"fake_frame_1", b"fake_frame_2"]
        mock_filter_people.return_value = ([b"fake_frame_1", b"fake_frame_2"], 0)

        frames_dir = output_dir / "frames"
        frames_dir.mkdir(parents=True, exist_ok=True)
        frame_paths = []
        for i in range(2):
            fp = frames_dir / f"frame_{i:04d}.jpg"
            fp.write_bytes(b"\xff")
            frame_paths.append(fp)
        mock_save_frames.return_value = frame_paths

        self._stub_support_mocks(
            output_dir,
            mock_audio_extractor_cls,
            mock_diagram_analyzer_cls,
            mock_plan_gen_cls,
            mock_export,
            duration=10.0,
        )

        manifest = process_single_video(
            input_path=video_path,
            output_dir=output_dir,
            provider_manager=pm,
            depth="standard",
        )

        from video_processor.models import VideoManifest

        assert isinstance(manifest, VideoManifest)
        assert manifest.video.title == "Analysis of meeting"
        assert manifest.stats.frames_extracted == 2
        assert manifest.transcript_json == "transcript/transcript.json"
        assert manifest.knowledge_graph_json == "results/knowledge_graph.json"

    @patch("video_processor.pipeline.export_all_formats")
    @patch("video_processor.pipeline.PlanGenerator")
    @patch("video_processor.pipeline.DiagramAnalyzer")
    @patch("video_processor.pipeline.AudioExtractor")
    @patch("video_processor.pipeline.filter_people_frames")
    @patch("video_processor.pipeline.save_frames")
    @patch("video_processor.pipeline.extract_frames")
    @patch("video_processor.pipeline.tqdm")
    def test_creates_output_directories(
        self,
        mock_tqdm,
        mock_extract_frames,
        mock_save_frames,
        mock_filter_people,
        mock_audio_extractor_cls,
        mock_diagram_analyzer_cls,
        mock_plan_gen_cls,
        mock_export,
        setup,
    ):
        """The pipeline materialises the standard output directory layout."""
        video_path, output_dir, pm = setup
        _make_tqdm_passthrough(mock_tqdm)
        self._stub_no_frames(mock_extract_frames, mock_filter_people, mock_save_frames)
        self._stub_support_mocks(
            output_dir,
            mock_audio_extractor_cls,
            mock_diagram_analyzer_cls,
            mock_plan_gen_cls,
            mock_export,
            duration=5.0,
        )

        process_single_video(
            input_path=video_path,
            output_dir=output_dir,
            provider_manager=pm,
        )

        # Verify standard output directories were created
        assert (output_dir / "transcript").is_dir()
        assert (output_dir / "frames").is_dir()
        assert (output_dir / "results").is_dir()

    @patch("video_processor.pipeline.export_all_formats")
    @patch("video_processor.pipeline.PlanGenerator")
    @patch("video_processor.pipeline.DiagramAnalyzer")
    @patch("video_processor.pipeline.AudioExtractor")
    @patch("video_processor.pipeline.filter_people_frames")
    @patch("video_processor.pipeline.save_frames")
    @patch("video_processor.pipeline.extract_frames")
    @patch("video_processor.pipeline.tqdm")
    def test_resume_existing_frames(
        self,
        mock_tqdm,
        mock_extract_frames,
        mock_save_frames,
        mock_filter_people,
        mock_audio_extractor_cls,
        mock_diagram_analyzer_cls,
        mock_plan_gen_cls,
        mock_export,
        setup,
    ):
        """When frames already exist on disk, extraction should be skipped."""
        video_path, output_dir, pm = setup
        _make_tqdm_passthrough(mock_tqdm)

        # Pre-create frames directory with existing frames
        frames_dir = output_dir / "frames"
        frames_dir.mkdir(parents=True, exist_ok=True)
        for i in range(3):
            (frames_dir / f"frame_{i:04d}.jpg").write_bytes(b"\xff")

        self._stub_support_mocks(
            output_dir,
            mock_audio_extractor_cls,
            mock_diagram_analyzer_cls,
            mock_plan_gen_cls,
            mock_export,
            duration=10.0,
        )

        manifest = process_single_video(
            input_path=video_path,
            output_dir=output_dir,
            provider_manager=pm,
        )

        # extract_frames should NOT have been called (resume path)
        mock_extract_frames.assert_not_called()
        assert manifest.stats.frames_extracted == 3

    @patch("video_processor.pipeline.export_all_formats")
    @patch("video_processor.pipeline.PlanGenerator")
    @patch("video_processor.pipeline.DiagramAnalyzer")
    @patch("video_processor.pipeline.AudioExtractor")
    @patch("video_processor.pipeline.filter_people_frames")
    @patch("video_processor.pipeline.save_frames")
    @patch("video_processor.pipeline.extract_frames")
    @patch("video_processor.pipeline.tqdm")
    def test_resume_existing_transcript(
        self,
        mock_tqdm,
        mock_extract_frames,
        mock_save_frames,
        mock_filter_people,
        mock_audio_extractor_cls,
        mock_diagram_analyzer_cls,
        mock_plan_gen_cls,
        mock_export,
        setup,
    ):
        """When transcript exists on disk, transcription should be skipped."""
        video_path, output_dir, pm = setup
        _make_tqdm_passthrough(mock_tqdm)
        self._stub_no_frames(mock_extract_frames, mock_filter_people, mock_save_frames)
        self._stub_support_mocks(
            output_dir,
            mock_audio_extractor_cls,
            mock_diagram_analyzer_cls,
            mock_plan_gen_cls,
            mock_export,
            duration=10.0,
        )

        # Pre-create transcript file
        transcript_dir = output_dir / "transcript"
        transcript_dir.mkdir(parents=True, exist_ok=True)
        transcript_data = {
            "text": "Pre-existing transcript text.",
            "segments": [{"start": 0.0, "end": 5.0, "text": "Pre-existing transcript text."}],
            "duration": 5.0,
        }
        (transcript_dir / "transcript.json").write_text(json.dumps(transcript_data))

        process_single_video(
            input_path=video_path,
            output_dir=output_dir,
            provider_manager=pm,
        )

        # transcribe_audio should NOT have been called (resume path)
        pm.transcribe_audio.assert_not_called()

    @patch("video_processor.pipeline.export_all_formats")
    @patch("video_processor.pipeline.PlanGenerator")
    @patch("video_processor.pipeline.DiagramAnalyzer")
    @patch("video_processor.pipeline.AudioExtractor")
    @patch("video_processor.pipeline.filter_people_frames")
    @patch("video_processor.pipeline.save_frames")
    @patch("video_processor.pipeline.extract_frames")
    @patch("video_processor.pipeline.tqdm")
    def test_custom_title(
        self,
        mock_tqdm,
        mock_extract_frames,
        mock_save_frames,
        mock_filter_people,
        mock_audio_extractor_cls,
        mock_diagram_analyzer_cls,
        mock_plan_gen_cls,
        mock_export,
        setup,
    ):
        """An explicit title argument is carried into the manifest."""
        video_path, output_dir, pm = setup
        _make_tqdm_passthrough(mock_tqdm)
        self._stub_no_frames(mock_extract_frames, mock_filter_people, mock_save_frames)
        self._stub_support_mocks(
            output_dir,
            mock_audio_extractor_cls,
            mock_diagram_analyzer_cls,
            mock_plan_gen_cls,
            mock_export,
            duration=5.0,
        )

        manifest = process_single_video(
            input_path=video_path,
            output_dir=output_dir,
            provider_manager=pm,
            title="My Custom Title",
        )

        assert manifest.video.title == "My Custom Title"

    @patch("video_processor.pipeline.export_all_formats")
    @patch("video_processor.pipeline.PlanGenerator")
    @patch("video_processor.pipeline.DiagramAnalyzer")
    @patch("video_processor.pipeline.AudioExtractor")
    @patch("video_processor.pipeline.filter_people_frames")
    @patch("video_processor.pipeline.save_frames")
    @patch("video_processor.pipeline.extract_frames")
    @patch("video_processor.pipeline.tqdm")
    def test_key_points_and_action_items_extracted(
        self,
        mock_tqdm,
        mock_extract_frames,
        mock_save_frames,
        mock_filter_people,
        mock_audio_extractor_cls,
        mock_diagram_analyzer_cls,
        mock_plan_gen_cls,
        mock_export,
        setup,
    ):
        """The mock provider's key points and action items reach the manifest."""
        video_path, output_dir, pm = setup
        _make_tqdm_passthrough(mock_tqdm)
        self._stub_no_frames(mock_extract_frames, mock_filter_people, mock_save_frames)
        self._stub_support_mocks(
            output_dir,
            mock_audio_extractor_cls,
            mock_diagram_analyzer_cls,
            mock_plan_gen_cls,
            mock_export,
            duration=10.0,
        )

        manifest = process_single_video(
            input_path=video_path,
            output_dir=output_dir,
            provider_manager=pm,
        )

        assert len(manifest.key_points) == 1
        assert manifest.key_points[0].point == "Deployment strategy discussed"
        assert len(manifest.action_items) == 1
        assert manifest.action_items[0].action == "Deploy to production"
| 535 | |
| 536 | ADDED tests/test_processors.py |
+359
| --- a/tests/test_processors.py | ||
| +++ b/tests/test_processors.py | ||
| @@ -0,0 +1,359 @@ | ||
| 1 | +"""Tests for document processors and ingestion pipeline.""" | |
| 2 | + | |
| 3 | +import textwrap | |
| 4 | +from pathlib import Path | |
| 5 | +from unittest.mock import MagicMock, patch | |
| 6 | + | |
| 7 | +import pytest | |
| 8 | + | |
| 9 | +from video_processor.processors.base import ( | |
| 10 | + DocumentChunk, | |
| 11 | + DocumentProcessor, | |
| 12 | + get_processor, | |
| 13 | + list_supported_extensions, | |
| 14 | + register_processor, | |
| 15 | +) | |
| 16 | +from video_processor.processors.markdown_processor import ( | |
| 17 | + MarkdownProcessor, | |
| 18 | + PlaintextProcessor, | |
| 19 | + _chunk_by_paragraphs, | |
| 20 | +) | |
| 21 | +from video_processor.processors.pdf_processor import PdfProcessor | |
| 22 | + | |
| 23 | +# --- Base / Registry --- | |
| 24 | + | |
| 25 | + | |
class TestRegistry:
    """Extension dispatch behaviour of the document-processor registry."""

    def test_list_supported_extensions_includes_builtins(self):
        exts = list_supported_extensions()
        for ext in (".md", ".txt", ".pdf"):
            assert ext in exts

    def test_get_processor_markdown(self, tmp_path):
        doc = tmp_path / "doc.md"
        doc.write_text("hello")
        assert isinstance(get_processor(doc), MarkdownProcessor)

    def test_get_processor_txt(self, tmp_path):
        doc = tmp_path / "doc.txt"
        doc.write_text("hello")
        assert isinstance(get_processor(doc), PlaintextProcessor)

    def test_get_processor_pdf(self, tmp_path):
        doc = tmp_path / "doc.pdf"
        doc.write_text("")
        assert isinstance(get_processor(doc), PdfProcessor)

    def test_get_processor_unknown(self, tmp_path):
        doc = tmp_path / "doc.xyz"
        doc.write_text("")
        assert get_processor(doc) is None

    def test_register_custom_processor(self, tmp_path):
        class CustomProcessor(DocumentProcessor):
            supported_extensions = [".custom"]

            def can_process(self, path):
                return path.suffix == ".custom"

            def process(self, path):
                return [DocumentChunk(text="custom", source_file=str(path), chunk_index=0)]

        # Registering the new extension should make dispatch pick it up.
        register_processor([".custom"], CustomProcessor)
        target = tmp_path / "test.custom"
        target.write_text("data")
        proc = get_processor(target)
        assert isinstance(proc, CustomProcessor)
        chunks = proc.process(target)
        assert len(chunks) == 1
        assert chunks[0].text == "custom"
| 74 | + | |
| 75 | + | |
| 76 | +# --- Markdown --- | |
| 77 | + | |
| 78 | + | |
class TestMarkdownProcessor:
    """Tests for MarkdownProcessor: heading-based splitting, preamble handling,
    paragraph fallback, and per-chunk metadata."""

    def test_splits_by_headings(self, tmp_path):
        md = tmp_path / "test.md"
        md.write_text(
            textwrap.dedent("""\
# Introduction
Some intro text.

## Details
More details here.

## Conclusion
Final thoughts.
""")
        )
        proc = MarkdownProcessor()
        assert proc.can_process(md)
        chunks = proc.process(md)

        # One chunk per heading, with the heading text recorded as the section.
        assert len(chunks) == 3
        assert chunks[0].section == "Introduction"
        assert "intro text" in chunks[0].text
        assert chunks[1].section == "Details"
        assert chunks[2].section == "Conclusion"

    def test_preamble_before_first_heading(self, tmp_path):
        md = tmp_path / "test.md"
        md.write_text(
            textwrap.dedent("""\
Some preamble text.

# First Heading
Content here.
""")
        )
        proc = MarkdownProcessor()
        chunks = proc.process(md)
        # Text before the first heading becomes its own "(preamble)" chunk.
        assert len(chunks) == 2
        assert chunks[0].section == "(preamble)"
        assert "preamble" in chunks[0].text

    def test_no_headings_falls_back_to_paragraphs(self, tmp_path):
        md = tmp_path / "test.md"
        md.write_text("Paragraph one.\n\nParagraph two.\n\nParagraph three.")
        proc = MarkdownProcessor()
        chunks = proc.process(md)
        assert len(chunks) >= 1
        # All text should be captured
        full_text = " ".join(c.text for c in chunks)
        assert "Paragraph one" in full_text
        assert "Paragraph three" in full_text

    def test_chunk_index_increments(self, tmp_path):
        md = tmp_path / "test.md"
        md.write_text("# A\ntext\n# B\ntext\n# C\ntext")
        proc = MarkdownProcessor()
        chunks = proc.process(md)
        # chunk_index must be a contiguous 0-based sequence.
        indices = [c.chunk_index for c in chunks]
        assert indices == list(range(len(chunks)))

    def test_source_file_set(self, tmp_path):
        md = tmp_path / "test.md"
        md.write_text("# Heading\nContent")
        proc = MarkdownProcessor()
        chunks = proc.process(md)
        assert chunks[0].source_file == str(md)
| 145 | + | |
| 146 | + | |
| 147 | +# --- Plaintext --- | |
| 148 | + | |
| 149 | + | |
class TestPlaintextProcessor:
    """Tests for PlaintextProcessor across text-like formats (.txt, .log, .csv)."""

    def test_basic_paragraphs(self, tmp_path):
        txt = tmp_path / "test.txt"
        txt.write_text("First paragraph.\n\nSecond paragraph.\n\nThird paragraph.")
        proc = PlaintextProcessor()
        assert proc.can_process(txt)
        chunks = proc.process(txt)
        assert len(chunks) >= 1
        # All paragraphs must be captured across the produced chunks.
        full_text = " ".join(c.text for c in chunks)
        assert "First paragraph" in full_text
        assert "Third paragraph" in full_text

    def test_handles_log_files(self, tmp_path):
        # .log files are treated as plain text.
        log = tmp_path / "app.log"
        log.write_text("line 1\nline 2\nline 3")
        proc = PlaintextProcessor()
        assert proc.can_process(log)
        chunks = proc.process(log)
        assert len(chunks) >= 1

    def test_handles_csv(self, tmp_path):
        # .csv files are also accepted as plain text (no structured parsing).
        csv = tmp_path / "data.csv"
        csv.write_text("a,b,c\n1,2,3\n4,5,6")
        proc = PlaintextProcessor()
        assert proc.can_process(csv)
        chunks = proc.process(csv)
        assert len(chunks) >= 1

    def test_empty_file(self, tmp_path):
        # An empty file produces no chunks at all.
        txt = tmp_path / "empty.txt"
        txt.write_text("")
        proc = PlaintextProcessor()
        chunks = proc.process(txt)
        assert chunks == []
| 184 | + | |
| 185 | + | |
class TestChunkByParagraphs:
    """Tests for the _chunk_by_paragraphs helper: size limits and overlap."""

    def test_respects_max_chunk_size(self):
        # Create text with many paragraphs that exceed max size
        paragraphs = ["A" * 500 for _ in range(10)]
        text = "\n\n".join(paragraphs)
        chunks = _chunk_by_paragraphs(text, "test.txt", max_chunk_size=1200, overlap=100)
        assert len(chunks) > 1
        for chunk in chunks:
            # Each chunk should be reasonably sized (allowing for overlap)
            assert len(chunk.text) < 2000

    def test_overlap(self):
        text = "Para A " * 300 + "\n\n" + "Para B " * 300 + "\n\n" + "Para C " * 300
        chunks = _chunk_by_paragraphs(text, "test.txt", max_chunk_size=2500, overlap=200)
        # NOTE(review): the assertion below is skipped entirely when only one
        # chunk is produced, so this test can pass without checking overlap —
        # consider asserting len(chunks) > 1 for this input.
        if len(chunks) > 1:
            # The second chunk should contain some overlap from the first
            assert len(chunks[1].text) > 200
| 203 | + | |
| 204 | + | |
| 205 | +# --- PDF --- | |
| 206 | + | |
| 207 | + | |
class TestPdfProcessor:
    """Tests for PdfProcessor, mocking the optional pymupdf/pdfplumber backends."""

    def test_can_process(self, tmp_path):
        f = tmp_path / "doc.pdf"
        f.write_text("")
        proc = PdfProcessor()
        assert proc.can_process(f)
        assert not proc.can_process(tmp_path / "doc.txt")

    def test_process_pymupdf(self, tmp_path):
        f = tmp_path / "doc.pdf"
        f.write_text("")

        # Build a fake pymupdf document: iterable of pages, usable as a
        # context manager (with ... as doc).
        mock_page = MagicMock()
        mock_page.get_text.return_value = "Page 1 content"
        mock_doc = MagicMock()
        mock_doc.__iter__ = MagicMock(return_value=iter([mock_page]))
        mock_doc.__enter__ = MagicMock(return_value=mock_doc)
        mock_doc.__exit__ = MagicMock(return_value=False)

        mock_pymupdf = MagicMock()
        mock_pymupdf.open.return_value = mock_doc

        # Inject the fake module so the processor's `import pymupdf` resolves
        # to the mock even when the library is not installed.
        with patch.dict("sys.modules", {"pymupdf": mock_pymupdf}):
            proc = PdfProcessor()
            chunks = proc._process_pymupdf(f)
            assert len(chunks) == 1
            assert chunks[0].text == "Page 1 content"
            # Pages are 1-based in the chunk metadata.
            assert chunks[0].page == 1
            assert chunks[0].metadata["extraction_method"] == "pymupdf"

    def test_process_pdfplumber(self, tmp_path):
        f = tmp_path / "doc.pdf"
        f.write_text("")

        # Fake pdfplumber document: pages list plus context-manager protocol.
        mock_page = MagicMock()
        mock_page.extract_text.return_value = "Page 1 via pdfplumber"
        mock_pdf = MagicMock()
        mock_pdf.pages = [mock_page]
        mock_pdf.__enter__ = MagicMock(return_value=mock_pdf)
        mock_pdf.__exit__ = MagicMock(return_value=False)

        mock_pdfplumber = MagicMock()
        mock_pdfplumber.open.return_value = mock_pdf

        with patch.dict("sys.modules", {"pdfplumber": mock_pdfplumber}):
            proc = PdfProcessor()
            chunks = proc._process_pdfplumber(f)
            assert len(chunks) == 1
            assert chunks[0].text == "Page 1 via pdfplumber"
            assert chunks[0].metadata["extraction_method"] == "pdfplumber"

    def test_raises_if_no_library(self, tmp_path):
        # When both backends fail to import, process() surfaces an ImportError
        # that names the installable packages.
        f = tmp_path / "doc.pdf"
        f.write_text("")
        proc = PdfProcessor()

        with patch.object(proc, "_process_pymupdf", side_effect=ImportError):
            with patch.object(proc, "_process_pdfplumber", side_effect=ImportError):
                with pytest.raises(ImportError, match="pymupdf or pdfplumber"):
                    proc.process(f)
| 268 | + | |
| 269 | + | |
| 270 | +# --- Ingest --- | |
| 271 | + | |
| 272 | + | |
class TestIngest:
    """Tests for file/directory ingestion into a (mocked) knowledge graph."""

    def test_ingest_file(self, tmp_path):
        md = tmp_path / "doc.md"
        md.write_text("# Title\nSome content here.")

        mock_kg = MagicMock()
        mock_kg.register_source = MagicMock()
        mock_kg.add_content = MagicMock()

        from video_processor.processors.ingest import ingest_file

        # One markdown section -> one ingested chunk.
        count = ingest_file(md, mock_kg)
        assert count == 1
        mock_kg.register_source.assert_called_once()
        source_arg = mock_kg.register_source.call_args[0][0]
        assert source_arg["source_type"] == "document"
        # The title defaults to the file stem.
        assert source_arg["title"] == "doc"
        mock_kg.add_content.assert_called_once()

    def test_ingest_file_unsupported(self, tmp_path):
        # Files with no registered processor are rejected loudly.
        f = tmp_path / "data.xyz"
        f.write_text("stuff")
        mock_kg = MagicMock()

        from video_processor.processors.ingest import ingest_file

        with pytest.raises(ValueError, match="No processor"):
            ingest_file(f, mock_kg)

    def test_ingest_directory(self, tmp_path):
        (tmp_path / "a.md").write_text("# A\nContent A")
        (tmp_path / "b.txt").write_text("Content B")
        (tmp_path / "c.xyz").write_text("Ignored")

        mock_kg = MagicMock()

        from video_processor.processors.ingest import ingest_directory

        results = ingest_directory(tmp_path, mock_kg, recursive=False)
        # Should process a.md and b.txt but not c.xyz
        assert len(results) == 2
        processed_names = {Path(p).name for p in results}
        assert "a.md" in processed_names
        assert "b.txt" in processed_names

    def test_ingest_directory_recursive(self, tmp_path):
        sub = tmp_path / "sub"
        sub.mkdir()
        (tmp_path / "top.md").write_text("# Top\nTop level")
        (sub / "nested.md").write_text("# Nested\nNested content")

        mock_kg = MagicMock()

        from video_processor.processors.ingest import ingest_directory

        # recursive=True also picks up files in subdirectories.
        results = ingest_directory(tmp_path, mock_kg, recursive=True)
        assert len(results) == 2
        processed_names = {Path(p).name for p in results}
        assert "top.md" in processed_names
        assert "nested.md" in processed_names

    def test_ingest_file_custom_source_id(self, tmp_path):
        md = tmp_path / "doc.md"
        md.write_text("# Title\nContent")

        mock_kg = MagicMock()

        from video_processor.processors.ingest import ingest_file

        # An explicit source_id overrides the generated one.
        ingest_file(md, mock_kg, source_id="custom-123")
        source_arg = mock_kg.register_source.call_args[0][0]
        assert source_arg["source_id"] == "custom-123"

    def test_ingest_content_source_format_with_section(self, tmp_path):
        md = tmp_path / "doc.md"
        md.write_text("# Introduction\nSome text\n\n## Details\nMore text")

        mock_kg = MagicMock()

        from video_processor.processors.ingest import ingest_file

        ingest_file(md, mock_kg)
        # Check content_source includes section info
        calls = mock_kg.add_content.call_args_list
        assert len(calls) == 2
        assert "document:doc.md:section:Introduction" in calls[0][0][1]
        assert "document:doc.md:section:Details" in calls[1][0][1]
| --- a/tests/test_processors.py | |
| +++ b/tests/test_processors.py | |
| @@ -0,0 +1,359 @@ | |
| --- a/tests/test_processors.py | |
| +++ b/tests/test_processors.py | |
| @@ -0,0 +1,359 @@ | |
| 1 | """Tests for document processors and ingestion pipeline.""" |
| 2 | |
| 3 | import textwrap |
| 4 | from pathlib import Path |
| 5 | from unittest.mock import MagicMock, patch |
| 6 | |
| 7 | import pytest |
| 8 | |
| 9 | from video_processor.processors.base import ( |
| 10 | DocumentChunk, |
| 11 | DocumentProcessor, |
| 12 | get_processor, |
| 13 | list_supported_extensions, |
| 14 | register_processor, |
| 15 | ) |
| 16 | from video_processor.processors.markdown_processor import ( |
| 17 | MarkdownProcessor, |
| 18 | PlaintextProcessor, |
| 19 | _chunk_by_paragraphs, |
| 20 | ) |
| 21 | from video_processor.processors.pdf_processor import PdfProcessor |
| 22 | |
| 23 | # --- Base / Registry --- |
| 24 | |
| 25 | |
class TestRegistry:
    """Exercise processor lookup by extension and custom registration."""

    def test_list_supported_extensions_includes_builtins(self):
        extensions = list_supported_extensions()
        for builtin in (".md", ".txt", ".pdf"):
            assert builtin in extensions

    def test_get_processor_markdown(self, tmp_path):
        target = tmp_path / "doc.md"
        target.write_text("hello")
        assert isinstance(get_processor(target), MarkdownProcessor)

    def test_get_processor_txt(self, tmp_path):
        target = tmp_path / "doc.txt"
        target.write_text("hello")
        assert isinstance(get_processor(target), PlaintextProcessor)

    def test_get_processor_pdf(self, tmp_path):
        target = tmp_path / "doc.pdf"
        target.write_text("")
        assert isinstance(get_processor(target), PdfProcessor)

    def test_get_processor_unknown(self, tmp_path):
        # An unrecognised extension resolves to no processor at all.
        target = tmp_path / "doc.xyz"
        target.write_text("")
        assert get_processor(target) is None

    def test_register_custom_processor(self, tmp_path):
        class CustomProcessor(DocumentProcessor):
            supported_extensions = [".custom"]

            def can_process(self, path):
                return path.suffix == ".custom"

            def process(self, path):
                return [DocumentChunk(text="custom", source_file=str(path), chunk_index=0)]

        register_processor([".custom"], CustomProcessor)
        target = tmp_path / "test.custom"
        target.write_text("data")
        handler = get_processor(target)
        assert isinstance(handler, CustomProcessor)
        produced = handler.process(target)
        assert len(produced) == 1
        assert produced[0].text == "custom"
| 74 | |
| 75 | |
| 76 | # --- Markdown --- |
| 77 | |
| 78 | |
class TestMarkdownProcessor:
    """Behavioural tests for heading-based markdown chunking."""

    def test_splits_by_headings(self, tmp_path):
        doc = tmp_path / "test.md"
        doc.write_text(
            textwrap.dedent("""\
# Introduction
Some intro text.

## Details
More details here.

## Conclusion
Final thoughts.
""")
        )
        processor = MarkdownProcessor()
        assert processor.can_process(doc)
        pieces = processor.process(doc)

        # Three headings -> three chunks, sections named after the headings.
        assert len(pieces) == 3
        assert pieces[0].section == "Introduction"
        assert pieces[1].section == "Details"
        assert pieces[2].section == "Conclusion"
        assert "intro text" in pieces[0].text

    def test_preamble_before_first_heading(self, tmp_path):
        doc = tmp_path / "test.md"
        doc.write_text(
            textwrap.dedent("""\
Some preamble text.

# First Heading
Content here.
""")
        )
        pieces = MarkdownProcessor().process(doc)
        # Content ahead of the first heading lands in a "(preamble)" chunk.
        assert len(pieces) == 2
        assert pieces[0].section == "(preamble)"
        assert "preamble" in pieces[0].text

    def test_no_headings_falls_back_to_paragraphs(self, tmp_path):
        doc = tmp_path / "test.md"
        doc.write_text("Paragraph one.\n\nParagraph two.\n\nParagraph three.")
        pieces = MarkdownProcessor().process(doc)
        assert len(pieces) >= 1
        # Nothing is dropped when falling back to paragraph chunking.
        combined = " ".join(piece.text for piece in pieces)
        assert "Paragraph one" in combined
        assert "Paragraph three" in combined

    def test_chunk_index_increments(self, tmp_path):
        doc = tmp_path / "test.md"
        doc.write_text("# A\ntext\n# B\ntext\n# C\ntext")
        pieces = MarkdownProcessor().process(doc)
        # Indices form a contiguous 0-based run.
        for expected, piece in enumerate(pieces):
            assert piece.chunk_index == expected

    def test_source_file_set(self, tmp_path):
        doc = tmp_path / "test.md"
        doc.write_text("# Heading\nContent")
        pieces = MarkdownProcessor().process(doc)
        assert pieces[0].source_file == str(doc)
| 145 | |
| 146 | |
| 147 | # --- Plaintext --- |
| 148 | |
| 149 | |
class TestPlaintextProcessor:
    """PlaintextProcessor accepts generic text formats and chunks by paragraph."""

    def test_basic_paragraphs(self, tmp_path):
        doc = tmp_path / "test.txt"
        doc.write_text("First paragraph.\n\nSecond paragraph.\n\nThird paragraph.")
        processor = PlaintextProcessor()
        assert processor.can_process(doc)
        pieces = processor.process(doc)
        assert len(pieces) >= 1
        combined = " ".join(piece.text for piece in pieces)
        assert "First paragraph" in combined
        assert "Third paragraph" in combined

    def test_handles_log_files(self, tmp_path):
        # Log files go through the plaintext path.
        doc = tmp_path / "app.log"
        doc.write_text("line 1\nline 2\nline 3")
        processor = PlaintextProcessor()
        assert processor.can_process(doc)
        assert len(processor.process(doc)) >= 1

    def test_handles_csv(self, tmp_path):
        # CSV is treated as plain text, not parsed into rows.
        doc = tmp_path / "data.csv"
        doc.write_text("a,b,c\n1,2,3\n4,5,6")
        processor = PlaintextProcessor()
        assert processor.can_process(doc)
        assert len(processor.process(doc)) >= 1

    def test_empty_file(self, tmp_path):
        # No content means no chunks.
        doc = tmp_path / "empty.txt"
        doc.write_text("")
        assert PlaintextProcessor().process(doc) == []
| 184 | |
| 185 | |
class TestChunkByParagraphs:
    """Size-limit and overlap behaviour of the paragraph chunking helper."""

    def test_respects_max_chunk_size(self):
        # Ten 500-char paragraphs must overflow a 1200-char limit into
        # several chunks, none of which grows unreasonably large.
        body = "\n\n".join("A" * 500 for _ in range(10))
        pieces = _chunk_by_paragraphs(body, "test.txt", max_chunk_size=1200, overlap=100)
        assert len(pieces) > 1
        assert all(len(piece.text) < 2000 for piece in pieces)

    def test_overlap(self):
        body = "Para A " * 300 + "\n\n" + "Para B " * 300 + "\n\n" + "Para C " * 300
        pieces = _chunk_by_paragraphs(body, "test.txt", max_chunk_size=2500, overlap=200)
        if len(pieces) > 1:
            # Overlap from the previous chunk should pad out the second one.
            assert len(pieces[1].text) > 200
| 203 | |
| 204 | |
| 205 | # --- PDF --- |
| 206 | |
| 207 | |
class TestPdfProcessor:
    """Tests for PdfProcessor with both optional PDF backends mocked out."""

    def test_can_process(self, tmp_path):
        # Only the .pdf extension is accepted.
        f = tmp_path / "doc.pdf"
        f.write_text("")
        proc = PdfProcessor()
        assert proc.can_process(f)
        assert not proc.can_process(tmp_path / "doc.txt")

    def test_process_pymupdf(self, tmp_path):
        f = tmp_path / "doc.pdf"
        f.write_text("")

        # Fake pymupdf document: iterating yields pages, and the object
        # supports the context-manager protocol used by the processor.
        mock_page = MagicMock()
        mock_page.get_text.return_value = "Page 1 content"
        mock_doc = MagicMock()
        mock_doc.__iter__ = MagicMock(return_value=iter([mock_page]))
        mock_doc.__enter__ = MagicMock(return_value=mock_doc)
        mock_doc.__exit__ = MagicMock(return_value=False)

        mock_pymupdf = MagicMock()
        mock_pymupdf.open.return_value = mock_doc

        # Substitute the module in sys.modules so the processor's import
        # picks up the mock regardless of what is installed.
        with patch.dict("sys.modules", {"pymupdf": mock_pymupdf}):
            proc = PdfProcessor()
            chunks = proc._process_pymupdf(f)
            assert len(chunks) == 1
            assert chunks[0].text == "Page 1 content"
            # Page numbers are reported 1-based.
            assert chunks[0].page == 1
            assert chunks[0].metadata["extraction_method"] == "pymupdf"

    def test_process_pdfplumber(self, tmp_path):
        f = tmp_path / "doc.pdf"
        f.write_text("")

        # Fake pdfplumber document exposing .pages and the context-manager
        # protocol.
        mock_page = MagicMock()
        mock_page.extract_text.return_value = "Page 1 via pdfplumber"
        mock_pdf = MagicMock()
        mock_pdf.pages = [mock_page]
        mock_pdf.__enter__ = MagicMock(return_value=mock_pdf)
        mock_pdf.__exit__ = MagicMock(return_value=False)

        mock_pdfplumber = MagicMock()
        mock_pdfplumber.open.return_value = mock_pdf

        with patch.dict("sys.modules", {"pdfplumber": mock_pdfplumber}):
            proc = PdfProcessor()
            chunks = proc._process_pdfplumber(f)
            assert len(chunks) == 1
            assert chunks[0].text == "Page 1 via pdfplumber"
            assert chunks[0].metadata["extraction_method"] == "pdfplumber"

    def test_raises_if_no_library(self, tmp_path):
        # If both backends raise ImportError, process() re-raises with a
        # message naming the packages the user can install.
        f = tmp_path / "doc.pdf"
        f.write_text("")
        proc = PdfProcessor()

        with patch.object(proc, "_process_pymupdf", side_effect=ImportError):
            with patch.object(proc, "_process_pdfplumber", side_effect=ImportError):
                with pytest.raises(ImportError, match="pymupdf or pdfplumber"):
                    proc.process(f)
| 268 | |
| 269 | |
| 270 | # --- Ingest --- |
| 271 | |
| 272 | |
class TestIngest:
    """End-to-end ingestion of files and directories into a mocked knowledge graph."""

    def test_ingest_file(self, tmp_path):
        from video_processor.processors.ingest import ingest_file

        doc = tmp_path / "doc.md"
        doc.write_text("# Title\nSome content here.")

        kg = MagicMock()
        kg.register_source = MagicMock()
        kg.add_content = MagicMock()

        # A single markdown section ingests as exactly one chunk.
        assert ingest_file(doc, kg) == 1
        kg.register_source.assert_called_once()
        registered = kg.register_source.call_args[0][0]
        assert registered["source_type"] == "document"
        assert registered["title"] == "doc"
        kg.add_content.assert_called_once()

    def test_ingest_file_unsupported(self, tmp_path):
        from video_processor.processors.ingest import ingest_file

        bogus = tmp_path / "data.xyz"
        bogus.write_text("stuff")

        # Unsupported extensions are rejected with an explicit error.
        with pytest.raises(ValueError, match="No processor"):
            ingest_file(bogus, MagicMock())

    def test_ingest_directory(self, tmp_path):
        from video_processor.processors.ingest import ingest_directory

        (tmp_path / "a.md").write_text("# A\nContent A")
        (tmp_path / "b.txt").write_text("Content B")
        (tmp_path / "c.xyz").write_text("Ignored")

        results = ingest_directory(tmp_path, MagicMock(), recursive=False)
        # a.md and b.txt are supported; c.xyz must be skipped.
        assert len(results) == 2
        assert {Path(entry).name for entry in results} == {"a.md", "b.txt"}

    def test_ingest_directory_recursive(self, tmp_path):
        from video_processor.processors.ingest import ingest_directory

        nested_dir = tmp_path / "sub"
        nested_dir.mkdir()
        (tmp_path / "top.md").write_text("# Top\nTop level")
        (nested_dir / "nested.md").write_text("# Nested\nNested content")

        results = ingest_directory(tmp_path, MagicMock(), recursive=True)
        # Recursion reaches the file in the subdirectory too.
        assert len(results) == 2
        assert {Path(entry).name for entry in results} == {"top.md", "nested.md"}

    def test_ingest_file_custom_source_id(self, tmp_path):
        from video_processor.processors.ingest import ingest_file

        doc = tmp_path / "doc.md"
        doc.write_text("# Title\nContent")
        kg = MagicMock()

        # A caller-supplied source_id overrides the generated one.
        ingest_file(doc, kg, source_id="custom-123")
        assert kg.register_source.call_args[0][0]["source_id"] == "custom-123"

    def test_ingest_content_source_format_with_section(self, tmp_path):
        from video_processor.processors.ingest import ingest_file

        doc = tmp_path / "doc.md"
        doc.write_text("# Introduction\nSome text\n\n## Details\nMore text")
        kg = MagicMock()

        ingest_file(doc, kg)
        # One add_content call per section, each tagged with its heading.
        recorded = kg.add_content.call_args_list
        assert len(recorded) == 2
        assert "document:doc.md:section:Introduction" in recorded[0][0][1]
        assert "document:doc.md:section:Details" in recorded[1][0][1]
+286
-33
| --- tests/test_providers.py | ||
| +++ tests/test_providers.py | ||
| @@ -1,13 +1,23 @@ | ||
| 1 | 1 | """Tests for the provider abstraction layer.""" |
| 2 | 2 | |
| 3 | +import importlib | |
| 3 | 4 | from unittest.mock import MagicMock, patch |
| 4 | 5 | |
| 5 | 6 | import pytest |
| 6 | 7 | |
| 7 | -from video_processor.providers.base import BaseProvider, ModelInfo | |
| 8 | +from video_processor.providers.base import ( | |
| 9 | + BaseProvider, | |
| 10 | + ModelInfo, | |
| 11 | + OpenAICompatibleProvider, | |
| 12 | + ProviderRegistry, | |
| 13 | +) | |
| 8 | 14 | from video_processor.providers.manager import ProviderManager |
| 15 | + | |
| 16 | +# --------------------------------------------------------------------------- | |
| 17 | +# ModelInfo | |
| 18 | +# --------------------------------------------------------------------------- | |
| 9 | 19 | |
| 10 | 20 | |
| 11 | 21 | class TestModelInfo: |
| 12 | 22 | def test_basic(self): |
| 13 | 23 | m = ModelInfo(id="gpt-4o", provider="openai", capabilities=["chat", "vision"]) |
| @@ -22,14 +32,97 @@ | ||
| 22 | 32 | capabilities=["chat", "vision"], |
| 23 | 33 | ) |
| 24 | 34 | restored = ModelInfo.model_validate_json(m.model_dump_json()) |
| 25 | 35 | assert restored == m |
| 26 | 36 | |
| 37 | + def test_defaults(self): | |
| 38 | + m = ModelInfo(id="x", provider="y") | |
| 39 | + assert m.display_name == "" | |
| 40 | + assert m.capabilities == [] | |
| 41 | + | |
| 42 | + | |
| 43 | +# --------------------------------------------------------------------------- | |
| 44 | +# ProviderRegistry | |
| 45 | +# --------------------------------------------------------------------------- | |
| 46 | + | |
| 47 | + | |
class TestProviderRegistry:
    """Test ProviderRegistry class methods.

    The registry is process-global, so an autouse fixture snapshots the
    internal _providers mapping before each test and restores it afterwards,
    keeping registrations from leaking between tests.
    """

    @pytest.fixture(autouse=True)
    def _save_restore_registry(self):
        snapshot = dict(ProviderRegistry._providers)
        yield
        ProviderRegistry._providers = snapshot

    def test_register_and_get(self):
        stub = type("Dummy", (), {})
        ProviderRegistry.register("test_prov", stub, env_var="TEST_KEY")
        assert ProviderRegistry.get("test_prov") is stub

    def test_get_unknown_raises(self):
        with pytest.raises(ValueError, match="Unknown provider"):
            ProviderRegistry.get("nonexistent_provider_xyz")

    def test_get_by_model_prefix(self):
        stub = type("Dummy", (), {})
        ProviderRegistry.register("myprov", stub, model_prefixes=["mymodel-"])
        assert ProviderRegistry.get_by_model("mymodel-7b") == "myprov"
        assert ProviderRegistry.get_by_model("othermodel-7b") is None

    def test_get_by_model_returns_none_for_no_match(self):
        assert ProviderRegistry.get_by_model("totally_unknown_model_xyz") is None

    def test_available_with_env_var(self):
        stub = type("Dummy", (), {})
        ProviderRegistry.register("envprov", stub, env_var="ENVPROV_KEY")
        # Missing env var -> provider is hidden from the available list.
        with patch.dict("os.environ", {}, clear=True):
            assert "envprov" not in ProviderRegistry.available()
        # Present env var -> provider is advertised.
        with patch.dict("os.environ", {"ENVPROV_KEY": "secret"}):
            assert "envprov" in ProviderRegistry.available()

    def test_available_no_env_var_required(self):
        stub = type("Dummy", (), {})
        ProviderRegistry.register("noenvprov", stub, env_var="")
        assert "noenvprov" in ProviderRegistry.available()

    def test_all_registered(self):
        stub = type("Dummy", (), {})
        ProviderRegistry.register("regprov", stub, env_var="X", default_models={"chat": "m1"})
        catalogue = ProviderRegistry.all_registered()
        assert "regprov" in catalogue
        assert catalogue["regprov"]["class"] is stub

    def test_get_default_models(self):
        stub = type("Dummy", (), {})
        ProviderRegistry.register(
            "defprov", stub, default_models={"chat": "c1", "vision": "v1"}
        )
        assert ProviderRegistry.get_default_models("defprov") == {"chat": "c1", "vision": "v1"}

    def test_get_default_models_unknown(self):
        assert ProviderRegistry.get_default_models("unknown_prov_xyz") == {}
| 115 | + | |
| 116 | + | |
| 117 | +# --------------------------------------------------------------------------- | |
| 118 | +# ProviderManager | |
| 119 | +# --------------------------------------------------------------------------- | |
| 120 | + | |
| 27 | 121 | |
| 28 | 122 | class TestProviderManager: |
| 29 | 123 | def _make_mock_provider(self, name="openai"): |
| 30 | - """Create a mock provider.""" | |
| 31 | 124 | provider = MagicMock(spec=BaseProvider) |
| 32 | 125 | provider.provider_name = name |
| 33 | 126 | provider.chat.return_value = "test response" |
| 34 | 127 | provider.analyze_image.return_value = "image analysis" |
| 35 | 128 | provider.transcribe_audio.return_value = { |
| @@ -53,18 +146,58 @@ | ||
| 53 | 146 | def test_init_forced_provider(self): |
| 54 | 147 | mgr = ProviderManager(provider="gemini") |
| 55 | 148 | assert mgr.vision_model == "gemini-2.5-flash" |
| 56 | 149 | assert mgr.chat_model == "gemini-2.5-flash" |
| 57 | 150 | assert mgr.transcription_model == "gemini-2.5-flash" |
| 151 | + | |
| 152 | + def test_init_forced_provider_ollama(self): | |
| 153 | + mgr = ProviderManager(provider="ollama") | |
| 154 | + assert mgr.vision_model == "" | |
| 155 | + assert mgr.chat_model == "" | |
| 156 | + assert mgr.transcription_model == "" | |
| 157 | + | |
| 158 | + def test_init_no_overrides(self): | |
| 159 | + mgr = ProviderManager() | |
| 160 | + assert mgr.vision_model is None | |
| 161 | + assert mgr.chat_model is None | |
| 162 | + assert mgr.transcription_model is None | |
| 163 | + assert mgr.auto is True | |
| 164 | + | |
| 165 | + def test_default_for_provider_gemini(self): | |
| 166 | + result = ProviderManager._default_for_provider("gemini", "vision") | |
| 167 | + assert result == "gemini-2.5-flash" | |
| 168 | + | |
| 169 | + def test_default_for_provider_openai(self): | |
| 170 | + result = ProviderManager._default_for_provider("openai", "chat") | |
| 171 | + assert isinstance(result, str) | |
| 172 | + assert len(result) > 0 | |
| 173 | + | |
| 174 | + def test_default_for_provider_unknown(self): | |
| 175 | + result = ProviderManager._default_for_provider("nonexistent_xyz", "chat") | |
| 176 | + assert result == "" | |
| 58 | 177 | |
| 59 | 178 | def test_provider_for_model(self): |
| 60 | 179 | mgr = ProviderManager() |
| 61 | 180 | assert mgr._provider_for_model("gpt-4o") == "openai" |
| 62 | 181 | assert mgr._provider_for_model("claude-sonnet-4-5-20250929") == "anthropic" |
| 63 | 182 | assert mgr._provider_for_model("gemini-2.5-flash") == "gemini" |
| 64 | 183 | assert mgr._provider_for_model("whisper-1") == "openai" |
| 65 | 184 | |
| 185 | + def test_provider_for_model_ollama_via_discovery(self): | |
| 186 | + mgr = ProviderManager() | |
| 187 | + mgr._available_models = [ | |
| 188 | + ModelInfo(id="llama3.2:latest", provider="ollama", capabilities=["chat"]), | |
| 189 | + ] | |
| 190 | + assert mgr._provider_for_model("llama3.2:latest") == "ollama" | |
| 191 | + | |
| 192 | + def test_provider_for_model_ollama_fuzzy_tag(self): | |
| 193 | + mgr = ProviderManager() | |
| 194 | + mgr._available_models = [ | |
| 195 | + ModelInfo(id="llama3.2:latest", provider="ollama", capabilities=["chat"]), | |
| 196 | + ] | |
| 197 | + assert mgr._provider_for_model("llama3.2") == "ollama" | |
| 198 | + | |
| 66 | 199 | @patch.dict("os.environ", {"OPENAI_API_KEY": "test-key"}) |
| 67 | 200 | def test_chat_routes_to_provider(self): |
| 68 | 201 | mgr = ProviderManager(chat_model="gpt-4o") |
| 69 | 202 | mock_prov = self._make_mock_provider("openai") |
| 70 | 203 | mgr._providers["openai"] = mock_prov |
| @@ -97,36 +230,126 @@ | ||
| 97 | 230 | mgr = ProviderManager( |
| 98 | 231 | vision_model="gpt-4o", |
| 99 | 232 | chat_model="claude-sonnet-4-5-20250929", |
| 100 | 233 | transcription_model="whisper-1", |
| 101 | 234 | ) |
| 102 | - # Pre-fill providers so _resolve_model doesn't try to instantiate real ones | |
| 103 | 235 | for name in ["openai", "anthropic"]: |
| 104 | 236 | mgr._providers[name] = self._make_mock_provider(name) |
| 105 | 237 | |
| 106 | 238 | used = mgr.get_models_used() |
| 107 | 239 | assert "vision" in used |
| 108 | - assert "openai/gpt-4o" == used["vision"] | |
| 109 | - assert "anthropic/claude-sonnet-4-5-20250929" == used["chat"] | |
| 240 | + assert used["vision"] == "openai/gpt-4o" | |
| 241 | + assert used["chat"] == "anthropic/claude-sonnet-4-5-20250929" | |
| 242 | + | |
| 243 | + def test_track_records_usage(self): | |
| 244 | + mgr = ProviderManager(chat_model="gpt-4o") | |
| 245 | + mock_prov = self._make_mock_provider("openai") | |
| 246 | + mock_prov._last_usage = {"input_tokens": 10, "output_tokens": 20} | |
| 247 | + mgr._providers["openai"] = mock_prov | |
| 248 | + | |
| 249 | + mgr.chat([{"role": "user", "content": "hi"}]) | |
| 250 | + assert mgr.usage.total_input_tokens == 10 | |
| 251 | + assert mgr.usage.total_output_tokens == 20 | |
| 252 | + | |
| 253 | + | |
| 254 | +# --------------------------------------------------------------------------- | |
| 255 | +# OpenAICompatibleProvider | |
| 256 | +# --------------------------------------------------------------------------- | |
| 257 | + | |
| 258 | + | |
| 259 | +class TestOpenAICompatibleProvider: | |
| 260 | + @patch("openai.OpenAI") | |
| 261 | + def test_chat(self, mock_openai_cls): | |
| 262 | + mock_client = MagicMock() | |
| 263 | + mock_openai_cls.return_value = mock_client | |
| 264 | + | |
| 265 | + mock_choice = MagicMock() | |
| 266 | + mock_choice.message.content = "hello back" | |
| 267 | + mock_response = MagicMock() | |
| 268 | + mock_response.choices = [mock_choice] | |
| 269 | + mock_response.usage.prompt_tokens = 5 | |
| 270 | + mock_response.usage.completion_tokens = 10 | |
| 271 | + mock_client.chat.completions.create.return_value = mock_response | |
| 272 | + | |
| 273 | + provider = OpenAICompatibleProvider(api_key="test", base_url="http://test") | |
| 274 | + result = provider.chat([{"role": "user", "content": "hi"}], model="test-model") | |
| 275 | + assert result == "hello back" | |
| 276 | + assert provider._last_usage == {"input_tokens": 5, "output_tokens": 10} | |
| 277 | + | |
| 278 | + @patch("openai.OpenAI") | |
| 279 | + def test_analyze_image(self, mock_openai_cls): | |
| 280 | + mock_client = MagicMock() | |
| 281 | + mock_openai_cls.return_value = mock_client | |
| 282 | + | |
| 283 | + mock_choice = MagicMock() | |
| 284 | + mock_choice.message.content = "a cat" | |
| 285 | + mock_response = MagicMock() | |
| 286 | + mock_response.choices = [mock_choice] | |
| 287 | + mock_response.usage.prompt_tokens = 100 | |
| 288 | + mock_response.usage.completion_tokens = 5 | |
| 289 | + mock_client.chat.completions.create.return_value = mock_response | |
| 290 | + | |
| 291 | + provider = OpenAICompatibleProvider(api_key="test", base_url="http://test") | |
| 292 | + result = provider.analyze_image(b"\x89PNG", "what is this?") | |
| 293 | + assert result == "a cat" | |
| 294 | + assert provider._last_usage["input_tokens"] == 100 | |
| 295 | + | |
| 296 | + @patch("openai.OpenAI") | |
| 297 | + def test_transcribe_raises(self, mock_openai_cls): | |
| 298 | + provider = OpenAICompatibleProvider(api_key="test", base_url="http://test") | |
| 299 | + with pytest.raises(NotImplementedError): | |
| 300 | + provider.transcribe_audio("/tmp/audio.wav") | |
| 301 | + | |
| 302 | + @patch("openai.OpenAI") | |
| 303 | + def test_list_models(self, mock_openai_cls): | |
| 304 | + mock_client = MagicMock() | |
| 305 | + mock_openai_cls.return_value = mock_client | |
| 306 | + | |
| 307 | + mock_model = MagicMock() | |
| 308 | + mock_model.id = "test-model-1" | |
| 309 | + mock_client.models.list.return_value = [mock_model] | |
| 310 | + | |
| 311 | + provider = OpenAICompatibleProvider(api_key="test", base_url="http://test") | |
| 312 | + provider.provider_name = "testprov" | |
| 313 | + models = provider.list_models() | |
| 314 | + assert len(models) == 1 | |
| 315 | + assert models[0].id == "test-model-1" | |
| 316 | + assert models[0].provider == "testprov" | |
| 317 | + | |
| 318 | + @patch("openai.OpenAI") | |
| 319 | + def test_list_models_handles_error(self, mock_openai_cls): | |
| 320 | + mock_client = MagicMock() | |
| 321 | + mock_openai_cls.return_value = mock_client | |
| 322 | + mock_client.models.list.side_effect = Exception("connection error") | |
| 323 | + | |
| 324 | + provider = OpenAICompatibleProvider(api_key="test", base_url="http://test") | |
| 325 | + models = provider.list_models() | |
| 326 | + assert models == [] | |
| 327 | + | |
| 328 | + | |
| 329 | +# --------------------------------------------------------------------------- | |
| 330 | +# Discovery | |
| 331 | +# --------------------------------------------------------------------------- | |
| 110 | 332 | |
| 111 | 333 | |
| 112 | 334 | class TestDiscovery: |
| 113 | 335 | @patch("video_processor.providers.discovery._cached_models", None) |
| 114 | 336 | @patch( |
| 115 | - "video_processor.providers.ollama_provider.OllamaProvider.is_available", return_value=False | |
| 337 | + "video_processor.providers.ollama_provider.OllamaProvider.is_available", | |
| 338 | + return_value=False, | |
| 116 | 339 | ) |
| 117 | 340 | @patch.dict("os.environ", {}, clear=True) |
| 118 | 341 | def test_discover_skips_missing_keys(self, mock_ollama): |
| 119 | 342 | from video_processor.providers.discovery import discover_available_models |
| 120 | 343 | |
| 121 | - # No API keys and no Ollama -> empty list, no errors | |
| 122 | 344 | models = discover_available_models(api_keys={"openai": "", "anthropic": "", "gemini": ""}) |
| 123 | 345 | assert models == [] |
| 124 | 346 | |
| 125 | 347 | @patch.dict("os.environ", {}, clear=True) |
| 126 | 348 | @patch( |
| 127 | - "video_processor.providers.ollama_provider.OllamaProvider.is_available", return_value=False | |
| 349 | + "video_processor.providers.ollama_provider.OllamaProvider.is_available", | |
| 350 | + return_value=False, | |
| 128 | 351 | ) |
| 129 | 352 | @patch("video_processor.providers.discovery._cached_models", None) |
| 130 | 353 | def test_discover_caches_results(self, mock_ollama): |
| 131 | 354 | from video_processor.providers import discovery |
| 132 | 355 | |
| @@ -136,13 +359,41 @@ | ||
| 136 | 359 | assert models == [] |
| 137 | 360 | # Second call should use cache |
| 138 | 361 | models2 = discovery.discover_available_models(api_keys={"openai": "key"}) |
| 139 | 362 | assert models2 == [] # Still cached empty result |
| 140 | 363 | |
| 141 | - # Force refresh | |
| 364 | + discovery.clear_discovery_cache() | |
| 365 | + | |
| 366 | + @patch("video_processor.providers.discovery._cached_models", None) | |
| 367 | + @patch( | |
| 368 | + "video_processor.providers.ollama_provider.OllamaProvider.is_available", | |
| 369 | + return_value=False, | |
| 370 | + ) | |
| 371 | + @patch.dict("os.environ", {}, clear=True) | |
| 372 | + def test_force_refresh_clears_cache(self, mock_ollama): | |
| 373 | + from video_processor.providers import discovery | |
| 374 | + | |
| 375 | + # Warm the cache | |
| 376 | + discovery.discover_available_models(api_keys={"openai": "", "anthropic": "", "gemini": ""}) | |
| 377 | + # Force refresh should re-run | |
| 378 | + models = discovery.discover_available_models( | |
| 379 | + api_keys={"openai": "", "anthropic": "", "gemini": ""}, | |
| 380 | + force_refresh=True, | |
| 381 | + ) | |
| 382 | + assert models == [] | |
| 383 | + | |
| 384 | + def test_clear_discovery_cache(self): | |
| 385 | + from video_processor.providers import discovery | |
| 386 | + | |
| 387 | + discovery._cached_models = [ModelInfo(id="x", provider="y")] | |
| 142 | 388 | discovery.clear_discovery_cache() |
| 143 | - # Would try to connect with real key, so skip that test | |
| 389 | + assert discovery._cached_models is None | |
| 390 | + | |
| 391 | + | |
| 392 | +# --------------------------------------------------------------------------- | |
| 393 | +# OllamaProvider | |
| 394 | +# --------------------------------------------------------------------------- | |
| 144 | 395 | |
| 145 | 396 | |
| 146 | 397 | class TestOllamaProvider: |
| 147 | 398 | @patch("video_processor.providers.ollama_provider.requests") |
| 148 | 399 | def test_is_available_when_running(self, mock_requests): |
| @@ -189,35 +440,37 @@ | ||
| 189 | 440 | provider = OllamaProvider() |
| 190 | 441 | models = provider.list_models() |
| 191 | 442 | assert len(models) == 2 |
| 192 | 443 | assert models[0].provider == "ollama" |
| 193 | 444 | |
| 194 | - # llava should have vision capability | |
| 195 | 445 | llava = [m for m in models if "llava" in m.id][0] |
| 196 | 446 | assert "vision" in llava.capabilities |
| 197 | 447 | |
| 198 | - # llama should have only chat | |
| 199 | 448 | llama = [m for m in models if "llama" in m.id][0] |
| 200 | 449 | assert "chat" in llama.capabilities |
| 201 | 450 | assert "vision" not in llama.capabilities |
| 202 | 451 | |
| 203 | - def test_provider_for_model_ollama_via_discovery(self): | |
| 204 | - mgr = ProviderManager() | |
| 205 | - mgr._available_models = [ | |
| 206 | - ModelInfo(id="llama3.2:latest", provider="ollama", capabilities=["chat"]), | |
| 207 | - ] | |
| 208 | - assert mgr._provider_for_model("llama3.2:latest") == "ollama" | |
| 209 | - | |
| 210 | - def test_provider_for_model_ollama_fuzzy_tag(self): | |
| 211 | - mgr = ProviderManager() | |
| 212 | - mgr._available_models = [ | |
| 213 | - ModelInfo(id="llama3.2:latest", provider="ollama", capabilities=["chat"]), | |
| 214 | - ] | |
| 215 | - # Should match "llama3.2" to "llama3.2:latest" via prefix | |
| 216 | - assert mgr._provider_for_model("llama3.2") == "ollama" | |
| 217 | - | |
| 218 | - def test_init_forced_provider_ollama(self): | |
| 219 | - mgr = ProviderManager(provider="ollama") | |
| 220 | - # Ollama defaults are empty (resolved dynamically) | |
| 221 | - assert mgr.vision_model == "" | |
| 222 | - assert mgr.chat_model == "" | |
| 223 | - assert mgr.transcription_model == "" | |
| 452 | + | |
| 453 | +# --------------------------------------------------------------------------- | |
| 454 | +# Provider module imports | |
| 455 | +# --------------------------------------------------------------------------- | |
| 456 | + | |
| 457 | + | |
| 458 | +class TestProviderImports: | |
| 459 | + """Verify that all provider modules import without errors.""" | |
| 460 | + | |
| 461 | + PROVIDER_MODULES = [ | |
| 462 | + "video_processor.providers.openai_provider", | |
| 463 | + "video_processor.providers.anthropic_provider", | |
| 464 | + "video_processor.providers.gemini_provider", | |
| 465 | + "video_processor.providers.ollama_provider", | |
| 466 | + "video_processor.providers.azure_provider", | |
| 467 | + "video_processor.providers.together_provider", | |
| 468 | + "video_processor.providers.fireworks_provider", | |
| 469 | + "video_processor.providers.cerebras_provider", | |
| 470 | + "video_processor.providers.xai_provider", | |
| 471 | + ] | |
| 472 | + | |
| 473 | + @pytest.mark.parametrize("module_name", PROVIDER_MODULES) | |
| 474 | + def test_import(self, module_name): | |
| 475 | + mod = importlib.import_module(module_name) | |
| 476 | + assert mod is not None | |
| 224 | 477 | |
| 225 | 478 | ADDED tests/test_sources.py |
| 226 | 479 | ADDED tests/test_taxonomy.py |
| 227 | 480 | ADDED tests/test_usage_tracker.py |
| 228 | 481 | ADDED tests/test_visualization.py |
| 229 | 482 | ADDED video_processor/agent/agent_loop.py |
| 230 | 483 | ADDED video_processor/agent/kb_context.py |
| --- tests/test_providers.py | |
| +++ tests/test_providers.py | |
| @@ -1,13 +1,23 @@ | |
| 1 | """Tests for the provider abstraction layer.""" |
| 2 | |
| 3 | from unittest.mock import MagicMock, patch |
| 4 | |
| 5 | import pytest |
| 6 | |
| 7 | from video_processor.providers.base import BaseProvider, ModelInfo |
| 8 | from video_processor.providers.manager import ProviderManager |
| 9 | |
| 10 | |
| 11 | class TestModelInfo: |
| 12 | def test_basic(self): |
| 13 | m = ModelInfo(id="gpt-4o", provider="openai", capabilities=["chat", "vision"]) |
| @@ -22,14 +32,97 @@ | |
| 22 | capabilities=["chat", "vision"], |
| 23 | ) |
| 24 | restored = ModelInfo.model_validate_json(m.model_dump_json()) |
| 25 | assert restored == m |
| 26 | |
| 27 | |
| 28 | class TestProviderManager: |
| 29 | def _make_mock_provider(self, name="openai"): |
| 30 | """Create a mock provider.""" |
| 31 | provider = MagicMock(spec=BaseProvider) |
| 32 | provider.provider_name = name |
| 33 | provider.chat.return_value = "test response" |
| 34 | provider.analyze_image.return_value = "image analysis" |
| 35 | provider.transcribe_audio.return_value = { |
| @@ -53,18 +146,58 @@ | |
| 53 | def test_init_forced_provider(self): |
| 54 | mgr = ProviderManager(provider="gemini") |
| 55 | assert mgr.vision_model == "gemini-2.5-flash" |
| 56 | assert mgr.chat_model == "gemini-2.5-flash" |
| 57 | assert mgr.transcription_model == "gemini-2.5-flash" |
| 58 | |
| 59 | def test_provider_for_model(self): |
| 60 | mgr = ProviderManager() |
| 61 | assert mgr._provider_for_model("gpt-4o") == "openai" |
| 62 | assert mgr._provider_for_model("claude-sonnet-4-5-20250929") == "anthropic" |
| 63 | assert mgr._provider_for_model("gemini-2.5-flash") == "gemini" |
| 64 | assert mgr._provider_for_model("whisper-1") == "openai" |
| 65 | |
| 66 | @patch.dict("os.environ", {"OPENAI_API_KEY": "test-key"}) |
| 67 | def test_chat_routes_to_provider(self): |
| 68 | mgr = ProviderManager(chat_model="gpt-4o") |
| 69 | mock_prov = self._make_mock_provider("openai") |
| 70 | mgr._providers["openai"] = mock_prov |
| @@ -97,36 +230,126 @@ | |
| 97 | mgr = ProviderManager( |
| 98 | vision_model="gpt-4o", |
| 99 | chat_model="claude-sonnet-4-5-20250929", |
| 100 | transcription_model="whisper-1", |
| 101 | ) |
| 102 | # Pre-fill providers so _resolve_model doesn't try to instantiate real ones |
| 103 | for name in ["openai", "anthropic"]: |
| 104 | mgr._providers[name] = self._make_mock_provider(name) |
| 105 | |
| 106 | used = mgr.get_models_used() |
| 107 | assert "vision" in used |
| 108 | assert "openai/gpt-4o" == used["vision"] |
| 109 | assert "anthropic/claude-sonnet-4-5-20250929" == used["chat"] |
| 110 | |
| 111 | |
| 112 | class TestDiscovery: |
| 113 | @patch("video_processor.providers.discovery._cached_models", None) |
| 114 | @patch( |
| 115 | "video_processor.providers.ollama_provider.OllamaProvider.is_available", return_value=False |
| 116 | ) |
| 117 | @patch.dict("os.environ", {}, clear=True) |
| 118 | def test_discover_skips_missing_keys(self, mock_ollama): |
| 119 | from video_processor.providers.discovery import discover_available_models |
| 120 | |
| 121 | # No API keys and no Ollama -> empty list, no errors |
| 122 | models = discover_available_models(api_keys={"openai": "", "anthropic": "", "gemini": ""}) |
| 123 | assert models == [] |
| 124 | |
| 125 | @patch.dict("os.environ", {}, clear=True) |
| 126 | @patch( |
| 127 | "video_processor.providers.ollama_provider.OllamaProvider.is_available", return_value=False |
| 128 | ) |
| 129 | @patch("video_processor.providers.discovery._cached_models", None) |
| 130 | def test_discover_caches_results(self, mock_ollama): |
| 131 | from video_processor.providers import discovery |
| 132 | |
| @@ -136,13 +359,41 @@ | |
| 136 | assert models == [] |
| 137 | # Second call should use cache |
| 138 | models2 = discovery.discover_available_models(api_keys={"openai": "key"}) |
| 139 | assert models2 == [] # Still cached empty result |
| 140 | |
| 141 | # Force refresh |
| 142 | discovery.clear_discovery_cache() |
| 143 | # Would try to connect with real key, so skip that test |
| 144 | |
| 145 | |
| 146 | class TestOllamaProvider: |
| 147 | @patch("video_processor.providers.ollama_provider.requests") |
| 148 | def test_is_available_when_running(self, mock_requests): |
| @@ -189,35 +440,37 @@ | |
| 189 | provider = OllamaProvider() |
| 190 | models = provider.list_models() |
| 191 | assert len(models) == 2 |
| 192 | assert models[0].provider == "ollama" |
| 193 | |
| 194 | # llava should have vision capability |
| 195 | llava = [m for m in models if "llava" in m.id][0] |
| 196 | assert "vision" in llava.capabilities |
| 197 | |
| 198 | # llama should have only chat |
| 199 | llama = [m for m in models if "llama" in m.id][0] |
| 200 | assert "chat" in llama.capabilities |
| 201 | assert "vision" not in llama.capabilities |
| 202 | |
| 203 | def test_provider_for_model_ollama_via_discovery(self): |
| 204 | mgr = ProviderManager() |
| 205 | mgr._available_models = [ |
| 206 | ModelInfo(id="llama3.2:latest", provider="ollama", capabilities=["chat"]), |
| 207 | ] |
| 208 | assert mgr._provider_for_model("llama3.2:latest") == "ollama" |
| 209 | |
| 210 | def test_provider_for_model_ollama_fuzzy_tag(self): |
| 211 | mgr = ProviderManager() |
| 212 | mgr._available_models = [ |
| 213 | ModelInfo(id="llama3.2:latest", provider="ollama", capabilities=["chat"]), |
| 214 | ] |
| 215 | # Should match "llama3.2" to "llama3.2:latest" via prefix |
| 216 | assert mgr._provider_for_model("llama3.2") == "ollama" |
| 217 | |
| 218 | def test_init_forced_provider_ollama(self): |
| 219 | mgr = ProviderManager(provider="ollama") |
| 220 | # Ollama defaults are empty (resolved dynamically) |
| 221 | assert mgr.vision_model == "" |
| 222 | assert mgr.chat_model == "" |
| 223 | assert mgr.transcription_model == "" |
| 224 | |
| 225 | ADDED tests/test_sources.py |
| 226 | ADDED tests/test_taxonomy.py |
| 227 | ADDED tests/test_usage_tracker.py |
| 228 | ADDED tests/test_visualization.py |
| 229 | ADDED video_processor/agent/agent_loop.py |
| 230 | ADDED video_processor/agent/kb_context.py |
| --- tests/test_providers.py | |
| +++ tests/test_providers.py | |
| @@ -1,13 +1,23 @@ | |
| 1 | """Tests for the provider abstraction layer.""" |
| 2 | |
| 3 | import importlib |
| 4 | from unittest.mock import MagicMock, patch |
| 5 | |
| 6 | import pytest |
| 7 | |
| 8 | from video_processor.providers.base import ( |
| 9 | BaseProvider, |
| 10 | ModelInfo, |
| 11 | OpenAICompatibleProvider, |
| 12 | ProviderRegistry, |
| 13 | ) |
| 14 | from video_processor.providers.manager import ProviderManager |
| 15 | |
| 16 | # --------------------------------------------------------------------------- |
| 17 | # ModelInfo |
| 18 | # --------------------------------------------------------------------------- |
| 19 | |
| 20 | |
| 21 | class TestModelInfo: |
| 22 | def test_basic(self): |
| 23 | m = ModelInfo(id="gpt-4o", provider="openai", capabilities=["chat", "vision"]) |
| @@ -22,14 +32,97 @@ | |
| 32 | capabilities=["chat", "vision"], |
| 33 | ) |
| 34 | restored = ModelInfo.model_validate_json(m.model_dump_json()) |
| 35 | assert restored == m |
| 36 | |
| 37 | def test_defaults(self): |
| 38 | m = ModelInfo(id="x", provider="y") |
| 39 | assert m.display_name == "" |
| 40 | assert m.capabilities == [] |
| 41 | |
| 42 | |
| 43 | # --------------------------------------------------------------------------- |
| 44 | # ProviderRegistry |
| 45 | # --------------------------------------------------------------------------- |
| 46 | |
| 47 | |
| 48 | class TestProviderRegistry: |
| 49 | """Test ProviderRegistry class methods. |
| 50 | |
| 51 | We save and restore the internal _providers dict around each test so that |
| 52 | registrations from one test don't leak into another. |
| 53 | """ |
| 54 | |
| 55 | @pytest.fixture(autouse=True) |
| 56 | def _save_restore_registry(self): |
| 57 | original = dict(ProviderRegistry._providers) |
| 58 | yield |
| 59 | ProviderRegistry._providers = original |
| 60 | |
| 61 | def test_register_and_get(self): |
| 62 | dummy_cls = type("Dummy", (), {}) |
| 63 | ProviderRegistry.register("test_prov", dummy_cls, env_var="TEST_KEY") |
| 64 | assert ProviderRegistry.get("test_prov") is dummy_cls |
| 65 | |
| 66 | def test_get_unknown_raises(self): |
| 67 | with pytest.raises(ValueError, match="Unknown provider"): |
| 68 | ProviderRegistry.get("nonexistent_provider_xyz") |
| 69 | |
| 70 | def test_get_by_model_prefix(self): |
| 71 | dummy_cls = type("Dummy", (), {}) |
| 72 | ProviderRegistry.register("myprov", dummy_cls, model_prefixes=["mymodel-"]) |
| 73 | assert ProviderRegistry.get_by_model("mymodel-7b") == "myprov" |
| 74 | assert ProviderRegistry.get_by_model("othermodel-7b") is None |
| 75 | |
| 76 | def test_get_by_model_returns_none_for_no_match(self): |
| 77 | assert ProviderRegistry.get_by_model("totally_unknown_model_xyz") is None |
| 78 | |
| 79 | def test_available_with_env_var(self): |
| 80 | dummy_cls = type("Dummy", (), {}) |
| 81 | ProviderRegistry.register("envprov", dummy_cls, env_var="ENVPROV_KEY") |
| 82 | # Not in env -> should not appear |
| 83 | with patch.dict("os.environ", {}, clear=True): |
| 84 | avail = ProviderRegistry.available() |
| 85 | assert "envprov" not in avail |
| 86 | |
| 87 | # In env -> should appear |
| 88 | with patch.dict("os.environ", {"ENVPROV_KEY": "secret"}): |
| 89 | avail = ProviderRegistry.available() |
| 90 | assert "envprov" in avail |
| 91 | |
| 92 | def test_available_no_env_var_required(self): |
| 93 | dummy_cls = type("Dummy", (), {}) |
| 94 | ProviderRegistry.register("noenvprov", dummy_cls, env_var="") |
| 95 | avail = ProviderRegistry.available() |
| 96 | assert "noenvprov" in avail |
| 97 | |
| 98 | def test_all_registered(self): |
| 99 | dummy_cls = type("Dummy", (), {}) |
| 100 | ProviderRegistry.register("regprov", dummy_cls, env_var="X", default_models={"chat": "m1"}) |
| 101 | all_reg = ProviderRegistry.all_registered() |
| 102 | assert "regprov" in all_reg |
| 103 | assert all_reg["regprov"]["class"] is dummy_cls |
| 104 | |
| 105 | def test_get_default_models(self): |
| 106 | dummy_cls = type("Dummy", (), {}) |
| 107 | ProviderRegistry.register( |
| 108 | "defprov", dummy_cls, default_models={"chat": "c1", "vision": "v1"} |
| 109 | ) |
| 110 | defaults = ProviderRegistry.get_default_models("defprov") |
| 111 | assert defaults == {"chat": "c1", "vision": "v1"} |
| 112 | |
| 113 | def test_get_default_models_unknown(self): |
| 114 | assert ProviderRegistry.get_default_models("unknown_prov_xyz") == {} |
| 115 | |
| 116 | |
| 117 | # --------------------------------------------------------------------------- |
| 118 | # ProviderManager |
| 119 | # --------------------------------------------------------------------------- |
| 120 | |
| 121 | |
| 122 | class TestProviderManager: |
| 123 | def _make_mock_provider(self, name="openai"): |
| 124 | provider = MagicMock(spec=BaseProvider) |
| 125 | provider.provider_name = name |
| 126 | provider.chat.return_value = "test response" |
| 127 | provider.analyze_image.return_value = "image analysis" |
| 128 | provider.transcribe_audio.return_value = { |
| @@ -53,18 +146,58 @@ | |
| 146 | def test_init_forced_provider(self): |
| 147 | mgr = ProviderManager(provider="gemini") |
| 148 | assert mgr.vision_model == "gemini-2.5-flash" |
| 149 | assert mgr.chat_model == "gemini-2.5-flash" |
| 150 | assert mgr.transcription_model == "gemini-2.5-flash" |
| 151 | |
| 152 | def test_init_forced_provider_ollama(self): |
| 153 | mgr = ProviderManager(provider="ollama") |
| 154 | assert mgr.vision_model == "" |
| 155 | assert mgr.chat_model == "" |
| 156 | assert mgr.transcription_model == "" |
| 157 | |
| 158 | def test_init_no_overrides(self): |
| 159 | mgr = ProviderManager() |
| 160 | assert mgr.vision_model is None |
| 161 | assert mgr.chat_model is None |
| 162 | assert mgr.transcription_model is None |
| 163 | assert mgr.auto is True |
| 164 | |
| 165 | def test_default_for_provider_gemini(self): |
| 166 | result = ProviderManager._default_for_provider("gemini", "vision") |
| 167 | assert result == "gemini-2.5-flash" |
| 168 | |
| 169 | def test_default_for_provider_openai(self): |
| 170 | result = ProviderManager._default_for_provider("openai", "chat") |
| 171 | assert isinstance(result, str) |
| 172 | assert len(result) > 0 |
| 173 | |
| 174 | def test_default_for_provider_unknown(self): |
| 175 | result = ProviderManager._default_for_provider("nonexistent_xyz", "chat") |
| 176 | assert result == "" |
| 177 | |
| 178 | def test_provider_for_model(self): |
| 179 | mgr = ProviderManager() |
| 180 | assert mgr._provider_for_model("gpt-4o") == "openai" |
| 181 | assert mgr._provider_for_model("claude-sonnet-4-5-20250929") == "anthropic" |
| 182 | assert mgr._provider_for_model("gemini-2.5-flash") == "gemini" |
| 183 | assert mgr._provider_for_model("whisper-1") == "openai" |
| 184 | |
| 185 | def test_provider_for_model_ollama_via_discovery(self): |
| 186 | mgr = ProviderManager() |
| 187 | mgr._available_models = [ |
| 188 | ModelInfo(id="llama3.2:latest", provider="ollama", capabilities=["chat"]), |
| 189 | ] |
| 190 | assert mgr._provider_for_model("llama3.2:latest") == "ollama" |
| 191 | |
| 192 | def test_provider_for_model_ollama_fuzzy_tag(self): |
| 193 | mgr = ProviderManager() |
| 194 | mgr._available_models = [ |
| 195 | ModelInfo(id="llama3.2:latest", provider="ollama", capabilities=["chat"]), |
| 196 | ] |
| 197 | assert mgr._provider_for_model("llama3.2") == "ollama" |
| 198 | |
| 199 | @patch.dict("os.environ", {"OPENAI_API_KEY": "test-key"}) |
| 200 | def test_chat_routes_to_provider(self): |
| 201 | mgr = ProviderManager(chat_model="gpt-4o") |
| 202 | mock_prov = self._make_mock_provider("openai") |
| 203 | mgr._providers["openai"] = mock_prov |
| @@ -97,36 +230,126 @@ | |
| 230 | mgr = ProviderManager( |
| 231 | vision_model="gpt-4o", |
| 232 | chat_model="claude-sonnet-4-5-20250929", |
| 233 | transcription_model="whisper-1", |
| 234 | ) |
| 235 | for name in ["openai", "anthropic"]: |
| 236 | mgr._providers[name] = self._make_mock_provider(name) |
| 237 | |
| 238 | used = mgr.get_models_used() |
| 239 | assert "vision" in used |
| 240 | assert used["vision"] == "openai/gpt-4o" |
| 241 | assert used["chat"] == "anthropic/claude-sonnet-4-5-20250929" |
| 242 | |
| 243 | def test_track_records_usage(self): |
| 244 | mgr = ProviderManager(chat_model="gpt-4o") |
| 245 | mock_prov = self._make_mock_provider("openai") |
| 246 | mock_prov._last_usage = {"input_tokens": 10, "output_tokens": 20} |
| 247 | mgr._providers["openai"] = mock_prov |
| 248 | |
| 249 | mgr.chat([{"role": "user", "content": "hi"}]) |
| 250 | assert mgr.usage.total_input_tokens == 10 |
| 251 | assert mgr.usage.total_output_tokens == 20 |
| 252 | |
| 253 | |
| 254 | # --------------------------------------------------------------------------- |
| 255 | # OpenAICompatibleProvider |
| 256 | # --------------------------------------------------------------------------- |
| 257 | |
| 258 | |
| 259 | class TestOpenAICompatibleProvider: |
| 260 | @patch("openai.OpenAI") |
| 261 | def test_chat(self, mock_openai_cls): |
| 262 | mock_client = MagicMock() |
| 263 | mock_openai_cls.return_value = mock_client |
| 264 | |
| 265 | mock_choice = MagicMock() |
| 266 | mock_choice.message.content = "hello back" |
| 267 | mock_response = MagicMock() |
| 268 | mock_response.choices = [mock_choice] |
| 269 | mock_response.usage.prompt_tokens = 5 |
| 270 | mock_response.usage.completion_tokens = 10 |
| 271 | mock_client.chat.completions.create.return_value = mock_response |
| 272 | |
| 273 | provider = OpenAICompatibleProvider(api_key="test", base_url="http://test") |
| 274 | result = provider.chat([{"role": "user", "content": "hi"}], model="test-model") |
| 275 | assert result == "hello back" |
| 276 | assert provider._last_usage == {"input_tokens": 5, "output_tokens": 10} |
| 277 | |
| 278 | @patch("openai.OpenAI") |
| 279 | def test_analyze_image(self, mock_openai_cls): |
| 280 | mock_client = MagicMock() |
| 281 | mock_openai_cls.return_value = mock_client |
| 282 | |
| 283 | mock_choice = MagicMock() |
| 284 | mock_choice.message.content = "a cat" |
| 285 | mock_response = MagicMock() |
| 286 | mock_response.choices = [mock_choice] |
| 287 | mock_response.usage.prompt_tokens = 100 |
| 288 | mock_response.usage.completion_tokens = 5 |
| 289 | mock_client.chat.completions.create.return_value = mock_response |
| 290 | |
| 291 | provider = OpenAICompatibleProvider(api_key="test", base_url="http://test") |
| 292 | result = provider.analyze_image(b"\x89PNG", "what is this?") |
| 293 | assert result == "a cat" |
| 294 | assert provider._last_usage["input_tokens"] == 100 |
| 295 | |
| 296 | @patch("openai.OpenAI") |
| 297 | def test_transcribe_raises(self, mock_openai_cls): |
| 298 | provider = OpenAICompatibleProvider(api_key="test", base_url="http://test") |
| 299 | with pytest.raises(NotImplementedError): |
| 300 | provider.transcribe_audio("/tmp/audio.wav") |
| 301 | |
| 302 | @patch("openai.OpenAI") |
| 303 | def test_list_models(self, mock_openai_cls): |
| 304 | mock_client = MagicMock() |
| 305 | mock_openai_cls.return_value = mock_client |
| 306 | |
| 307 | mock_model = MagicMock() |
| 308 | mock_model.id = "test-model-1" |
| 309 | mock_client.models.list.return_value = [mock_model] |
| 310 | |
| 311 | provider = OpenAICompatibleProvider(api_key="test", base_url="http://test") |
| 312 | provider.provider_name = "testprov" |
| 313 | models = provider.list_models() |
| 314 | assert len(models) == 1 |
| 315 | assert models[0].id == "test-model-1" |
| 316 | assert models[0].provider == "testprov" |
| 317 | |
| 318 | @patch("openai.OpenAI") |
| 319 | def test_list_models_handles_error(self, mock_openai_cls): |
| 320 | mock_client = MagicMock() |
| 321 | mock_openai_cls.return_value = mock_client |
| 322 | mock_client.models.list.side_effect = Exception("connection error") |
| 323 | |
| 324 | provider = OpenAICompatibleProvider(api_key="test", base_url="http://test") |
| 325 | models = provider.list_models() |
| 326 | assert models == [] |
| 327 | |
| 328 | |
| 329 | # --------------------------------------------------------------------------- |
| 330 | # Discovery |
| 331 | # --------------------------------------------------------------------------- |
| 332 | |
| 333 | |
| 334 | class TestDiscovery: |
| 335 | @patch("video_processor.providers.discovery._cached_models", None) |
| 336 | @patch( |
| 337 | "video_processor.providers.ollama_provider.OllamaProvider.is_available", |
| 338 | return_value=False, |
| 339 | ) |
| 340 | @patch.dict("os.environ", {}, clear=True) |
| 341 | def test_discover_skips_missing_keys(self, mock_ollama): |
| 342 | from video_processor.providers.discovery import discover_available_models |
| 343 | |
| 344 | models = discover_available_models(api_keys={"openai": "", "anthropic": "", "gemini": ""}) |
| 345 | assert models == [] |
| 346 | |
| 347 | @patch.dict("os.environ", {}, clear=True) |
| 348 | @patch( |
| 349 | "video_processor.providers.ollama_provider.OllamaProvider.is_available", |
| 350 | return_value=False, |
| 351 | ) |
| 352 | @patch("video_processor.providers.discovery._cached_models", None) |
| 353 | def test_discover_caches_results(self, mock_ollama): |
| 354 | from video_processor.providers import discovery |
| 355 | |
| @@ -136,13 +359,41 @@ | |
| 359 | assert models == [] |
| 360 | # Second call should use cache |
| 361 | models2 = discovery.discover_available_models(api_keys={"openai": "key"}) |
| 362 | assert models2 == [] # Still cached empty result |
| 363 | |
| 364 | discovery.clear_discovery_cache() |
| 365 | |
| 366 | @patch("video_processor.providers.discovery._cached_models", None) |
| 367 | @patch( |
| 368 | "video_processor.providers.ollama_provider.OllamaProvider.is_available", |
| 369 | return_value=False, |
| 370 | ) |
| 371 | @patch.dict("os.environ", {}, clear=True) |
| 372 | def test_force_refresh_clears_cache(self, mock_ollama): |
| 373 | from video_processor.providers import discovery |
| 374 | |
| 375 | # Warm the cache |
| 376 | discovery.discover_available_models(api_keys={"openai": "", "anthropic": "", "gemini": ""}) |
| 377 | # Force refresh should re-run |
| 378 | models = discovery.discover_available_models( |
| 379 | api_keys={"openai": "", "anthropic": "", "gemini": ""}, |
| 380 | force_refresh=True, |
| 381 | ) |
| 382 | assert models == [] |
| 383 | |
| 384 | def test_clear_discovery_cache(self): |
| 385 | from video_processor.providers import discovery |
| 386 | |
| 387 | discovery._cached_models = [ModelInfo(id="x", provider="y")] |
| 388 | discovery.clear_discovery_cache() |
| 389 | assert discovery._cached_models is None |
| 390 | |
| 391 | |
| 392 | # --------------------------------------------------------------------------- |
| 393 | # OllamaProvider |
| 394 | # --------------------------------------------------------------------------- |
| 395 | |
| 396 | |
| 397 | class TestOllamaProvider: |
| 398 | @patch("video_processor.providers.ollama_provider.requests") |
| 399 | def test_is_available_when_running(self, mock_requests): |
| @@ -189,35 +440,37 @@ | |
| 440 | provider = OllamaProvider() |
| 441 | models = provider.list_models() |
| 442 | assert len(models) == 2 |
| 443 | assert models[0].provider == "ollama" |
| 444 | |
| 445 | llava = [m for m in models if "llava" in m.id][0] |
| 446 | assert "vision" in llava.capabilities |
| 447 | |
| 448 | llama = [m for m in models if "llama" in m.id][0] |
| 449 | assert "chat" in llama.capabilities |
| 450 | assert "vision" not in llama.capabilities |
| 451 | |
| 452 | |
| 453 | # --------------------------------------------------------------------------- |
| 454 | # Provider module imports |
| 455 | # --------------------------------------------------------------------------- |
| 456 | |
| 457 | |
class TestProviderImports:
    """Smoke-test: every provider module must import cleanly."""

    PROVIDER_MODULES = [
        "video_processor.providers.openai_provider",
        "video_processor.providers.anthropic_provider",
        "video_processor.providers.gemini_provider",
        "video_processor.providers.ollama_provider",
        "video_processor.providers.azure_provider",
        "video_processor.providers.together_provider",
        "video_processor.providers.fireworks_provider",
        "video_processor.providers.cerebras_provider",
        "video_processor.providers.xai_provider",
    ]

    @pytest.mark.parametrize("module_name", PROVIDER_MODULES)
    def test_import(self, module_name):
        # importlib raises on failure; the assert guards against a None module.
        assert importlib.import_module(module_name) is not None
| 477 | |
ADDED tests/test_sources.py
ADDED tests/test_taxonomy.py
ADDED tests/test_usage_tracker.py
ADDED tests/test_visualization.py
ADDED video_processor/agent/agent_loop.py
ADDED video_processor/agent/kb_context.py
+1572
| --- a/tests/test_sources.py | ||
| +++ b/tests/test_sources.py | ||
| @@ -0,0 +1,1572 @@ | ||
"""Tests for all source connectors: import, instantiation, authenticate, list_videos."""

import os
from unittest.mock import MagicMock, patch

import pytest

# Fixed garbled module path ("video_proes.base"): SourceFile and BaseSource are
# defined in video_processor.sources.base — every connector below is imported
# from video_processor.sources.*.
from video_processor.sources.base import BaseSource, SourceFile

# ---------------------------------------------------------------------------
# SourceFile model
# ---------------------------------------------------------------------------
| 14 | + | |
def test_source_file_creation():
    """A SourceFile needs only name and id; optional fields default to None."""
    sf = SourceFile(name="test.mp4", id="abc123")
    assert (sf.name, sf.id) == ("test.mp4", "abc123")
    assert sf.size_bytes is None
    assert sf.mime_type is None
| 21 | + | |
| 22 | + | |
def test_source_file_with_all_fields():
    """Optional metadata fields are stored exactly as given."""
    metadata = dict(
        name="video.mp4",
        id="v1",
        size_bytes=1024,
        mime_type="video/mp4",
        modified_at="2025-01-01",
        path="folder/video.mp4",
    )
    sf = SourceFile(**metadata)
    assert sf.size_bytes == 1024
    assert sf.path == "folder/video.mp4"
| 34 | + | |
| 35 | + | |
| 36 | +# --------------------------------------------------------------------------- | |
| 37 | +# YouTubeSource | |
| 38 | +# --------------------------------------------------------------------------- | |
| 39 | + | |
| 40 | + | |
class TestYouTubeSource:
    """YouTube connector: URL parsing, auth fallback, and listing."""

    def test_import(self):
        from video_processor.sources.youtube_source import YouTubeSource

        assert YouTubeSource is not None

    def test_constructor(self):
        from video_processor.sources.youtube_source import YouTubeSource

        src = YouTubeSource(url="https://www.youtube.com/watch?v=dQw4w9WgXcQ")
        assert src.video_id == "dQw4w9WgXcQ"
        assert src.audio_only is False

    def test_constructor_audio_only(self):
        from video_processor.sources.youtube_source import YouTubeSource

        src = YouTubeSource(url="https://youtu.be/dQw4w9WgXcQ", audio_only=True)
        assert src.audio_only is True

    def test_constructor_shorts_url(self):
        from video_processor.sources.youtube_source import YouTubeSource

        # Shorts URLs carry the id in the path rather than a query parameter.
        src = YouTubeSource(url="https://youtube.com/shorts/dQw4w9WgXcQ")
        assert src.video_id == "dQw4w9WgXcQ"

    def test_constructor_invalid_url(self):
        from video_processor.sources.youtube_source import YouTubeSource

        with pytest.raises(ValueError, match="Could not extract"):
            YouTubeSource(url="https://example.com/not-youtube")

    @patch.dict(os.environ, {}, clear=False)
    def test_authenticate_no_ytdlp(self):
        from video_processor.sources.youtube_source import YouTubeSource

        src = YouTubeSource(url="https://youtube.com/watch?v=dQw4w9WgXcQ")
        # Hiding yt_dlp makes its import inside authenticate() fail.
        with patch.dict("sys.modules", {"yt_dlp": None}):
            outcome = src.authenticate()
            # The outcome depends on the local environment; pin only the type.
            assert isinstance(outcome, bool)

    def test_list_videos(self):
        from video_processor.sources.youtube_source import YouTubeSource

        ydl = MagicMock()
        ydl.__enter__ = MagicMock(return_value=ydl)
        ydl.__exit__ = MagicMock(return_value=False)
        ydl.extract_info.return_value = {
            "title": "Test Video",
            "filesize": 1000,
        }
        fake_yt_dlp = MagicMock()
        fake_yt_dlp.YoutubeDL = MagicMock(return_value=ydl)

        with patch.dict("sys.modules", {"yt_dlp": fake_yt_dlp}):
            src = YouTubeSource(url="https://youtube.com/watch?v=dQw4w9WgXcQ")
            files = src.list_videos()
            assert isinstance(files, list)
            assert [f.name for f in files] == ["Test Video"]
| 103 | + | |
| 104 | + | |
| 105 | +# --------------------------------------------------------------------------- | |
| 106 | +# WebSource | |
| 107 | +# --------------------------------------------------------------------------- | |
| 108 | + | |
| 109 | + | |
class TestWebSource:
    """Web page connector: trivially authenticated, one HTML file per URL."""

    def test_import(self):
        from video_processor.sources.web_source import WebSource

        assert WebSource is not None

    def test_constructor(self):
        from video_processor.sources.web_source import WebSource

        page = WebSource(url="https://example.com/page")
        assert page.url == "https://example.com/page"

    def test_authenticate(self):
        from video_processor.sources.web_source import WebSource

        # Plain HTTP fetches need no credentials.
        assert WebSource(url="https://example.com").authenticate() is True

    def test_list_videos(self):
        from video_processor.sources.web_source import WebSource

        files = WebSource(url="https://example.com/article").list_videos()
        assert isinstance(files, list)
        assert len(files) == 1
        assert files[0].mime_type == "text/html"
| 136 | + | |
| 137 | + | |
| 138 | +# --------------------------------------------------------------------------- | |
| 139 | +# GitHubSource | |
| 140 | +# --------------------------------------------------------------------------- | |
| 141 | + | |
| 142 | + | |
class TestGitHubSource:
    """GitHub connector: env-token auth plus README/issue/PR listing."""

    def test_import(self):
        from video_processor.sources.github_source import GitHubSource

        assert GitHubSource is not None

    def test_constructor(self):
        from video_processor.sources.github_source import GitHubSource

        src = GitHubSource(repo="owner/repo")
        assert src.repo == "owner/repo"
        # Issues and PRs are included by default.
        assert src.include_issues is True
        assert src.include_prs is True

    @patch.dict(os.environ, {"GITHUB_TOKEN": "ghp_test123"})
    def test_authenticate_with_env_token(self):
        from video_processor.sources.github_source import GitHubSource

        src = GitHubSource(repo="owner/repo")
        assert src.authenticate() is True
        assert src._token == "ghp_test123"

    @patch("requests.get")
    @patch.dict(os.environ, {"GITHUB_TOKEN": "ghp_test123"})
    def test_list_videos(self, mock_get):
        from video_processor.sources.github_source import GitHubSource

        def ok_response(payload=None):
            resp = MagicMock()
            resp.ok = True
            if payload is not None:
                resp.json.return_value = payload
            return resp

        issues = [
            {"number": 1, "title": "Bug report", "id": 1},
            {"number": 2, "title": "Feature request", "id": 2, "pull_request": {}},
        ]
        prs = [
            {"number": 3, "title": "Fix bug"},
        ]
        # Served in the order list_videos fetches: README, issues, then PRs.
        mock_get.side_effect = [ok_response(), ok_response(issues), ok_response(prs)]

        src = GitHubSource(repo="owner/repo")
        src.authenticate()
        files = src.list_videos()
        assert isinstance(files, list)
        # README + one real issue (entry #2 is a PR in disguise and is filtered) + one PR
        assert len(files) == 3
| 196 | + | |
| 197 | + | |
| 198 | +# --------------------------------------------------------------------------- | |
| 199 | +# RedditSource | |
| 200 | +# --------------------------------------------------------------------------- | |
| 201 | + | |
| 202 | + | |
class TestRedditSource:
    """Reddit connector: trailing slash stripped; one text file per thread."""

    def test_import(self):
        from video_processor.sources.reddit_source import RedditSource

        assert RedditSource is not None

    def test_constructor(self):
        from video_processor.sources.reddit_source import RedditSource

        # A trailing slash on the thread URL is normalized away.
        src = RedditSource(url="https://reddit.com/r/python/comments/abc123/test/")
        assert src.url == "https://reddit.com/r/python/comments/abc123/test"

    def test_authenticate(self):
        from video_processor.sources.reddit_source import RedditSource

        assert RedditSource(url="https://reddit.com/r/test").authenticate() is True

    def test_list_videos(self):
        from video_processor.sources.reddit_source import RedditSource

        thread = RedditSource(url="https://reddit.com/r/python/comments/abc/post")
        files = thread.list_videos()
        assert isinstance(files, list)
        assert len(files) == 1
        assert files[0].mime_type == "text/plain"
| 229 | + | |
| 230 | + | |
| 231 | +# --------------------------------------------------------------------------- | |
| 232 | +# HackerNewsSource | |
| 233 | +# --------------------------------------------------------------------------- | |
| 234 | + | |
| 235 | + | |
class TestHackerNewsSource:
    """Hacker News connector: item-id plumbing and default comment cap."""

    def test_import(self):
        from video_processor.sources.hackernews_source import HackerNewsSource

        assert HackerNewsSource is not None

    def test_constructor(self):
        from video_processor.sources.hackernews_source import HackerNewsSource

        src = HackerNewsSource(item_id=12345678)
        assert src.item_id == 12345678
        # Default comment cap.
        assert src.max_comments == 200

    def test_authenticate(self):
        from video_processor.sources.hackernews_source import HackerNewsSource

        # The public Firebase API needs no credentials.
        assert HackerNewsSource(item_id=12345678).authenticate() is True

    def test_list_videos(self):
        from video_processor.sources.hackernews_source import HackerNewsSource

        files = HackerNewsSource(item_id=99999).list_videos()
        assert isinstance(files, list)
        assert len(files) == 1
        # The numeric item id becomes the string file id.
        assert files[0].id == "99999"
| 263 | + | |
| 264 | + | |
| 265 | +# --------------------------------------------------------------------------- | |
| 266 | +# RSSSource | |
| 267 | +# --------------------------------------------------------------------------- | |
| 268 | + | |
| 269 | + | |
class TestRSSSource:
    """RSS connector: feed fetch is mocked; entries become listed files."""

    def test_import(self):
        from video_processor.sources.rss_source import RSSSource

        assert RSSSource is not None

    def test_constructor(self):
        from video_processor.sources.rss_source import RSSSource

        src = RSSSource(url="https://example.com/feed.xml", max_entries=20)
        assert src.url == "https://example.com/feed.xml"
        assert src.max_entries == 20

    def test_authenticate(self):
        from video_processor.sources.rss_source import RSSSource

        assert RSSSource(url="https://example.com/feed.xml").authenticate() is True

    @patch("requests.get")
    def test_list_videos(self, mock_get):
        from video_processor.sources.rss_source import RSSSource

        rss_xml = """<?xml version="1.0"?>
        <rss version="2.0">
          <channel>
            <item>
              <title>Entry 1</title>
              <link>https://example.com/1</link>
              <description>First entry</description>
              <pubDate>Mon, 01 Jan 2025 00:00:00 GMT</pubDate>
            </item>
          </channel>
        </rss>"""
        response = MagicMock()
        response.text = rss_xml
        response.raise_for_status = MagicMock()
        mock_get.return_value = response

        files = RSSSource(url="https://example.com/feed.xml").list_videos()
        assert isinstance(files, list)
        assert len(files) >= 1
| 313 | + | |
| 314 | + | |
| 315 | +# --------------------------------------------------------------------------- | |
| 316 | +# PodcastSource | |
| 317 | +# --------------------------------------------------------------------------- | |
| 318 | + | |
| 319 | + | |
class TestPodcastSource:
    """Podcast connector: feed fetch is mocked; enclosures become audio files."""

    def test_import(self):
        from video_processor.sources.podcast_source import PodcastSource

        assert PodcastSource is not None

    def test_constructor(self):
        from video_processor.sources.podcast_source import PodcastSource

        src = PodcastSource(feed_url="https://example.com/podcast.xml", max_episodes=5)
        assert src.feed_url == "https://example.com/podcast.xml"
        assert src.max_episodes == 5

    def test_authenticate(self):
        from video_processor.sources.podcast_source import PodcastSource

        assert PodcastSource(feed_url="https://example.com/podcast.xml").authenticate() is True

    @patch("requests.get")
    def test_list_videos(self, mock_get):
        from video_processor.sources.podcast_source import PodcastSource

        podcast_xml = """<?xml version="1.0"?>
        <rss version="2.0">
          <channel>
            <item>
              <title>Episode 1</title>
              <enclosure url="https://example.com/ep1.mp3" type="audio/mpeg" />
              <pubDate>Mon, 01 Jan 2025 00:00:00 GMT</pubDate>
            </item>
          </channel>
        </rss>"""
        response = MagicMock()
        response.text = podcast_xml
        response.raise_for_status = MagicMock()
        mock_get.return_value = response

        files = PodcastSource(feed_url="https://example.com/podcast.xml").list_videos()
        assert isinstance(files, list)
        assert len(files) == 1
        # The enclosure's declared type is carried through.
        assert files[0].mime_type == "audio/mpeg"
| 363 | + | |
| 364 | + | |
| 365 | +# --------------------------------------------------------------------------- | |
| 366 | +# TwitterSource | |
| 367 | +# --------------------------------------------------------------------------- | |
| 368 | + | |
| 369 | + | |
class TestTwitterSource:
    """Twitter/X connector: bearer-token auth with gallery-dl fallback."""

    def test_import(self):
        from video_processor.sources.twitter_source import TwitterSource

        assert TwitterSource is not None

    def test_constructor(self):
        from video_processor.sources.twitter_source import TwitterSource

        src = TwitterSource(url="https://twitter.com/user/status/123456")
        assert src.url == "https://twitter.com/user/status/123456"

    @patch.dict(os.environ, {"TWITTER_BEARER_TOKEN": "test_token"})
    def test_authenticate_with_bearer_token(self):
        from video_processor.sources.twitter_source import TwitterSource

        src = TwitterSource(url="https://twitter.com/user/status/123456")
        assert src.authenticate() is True

    @patch.dict(os.environ, {}, clear=True)
    def test_authenticate_no_token_no_gallery_dl(self):
        from video_processor.sources.twitter_source import TwitterSource

        src = TwitterSource(url="https://twitter.com/user/status/123456")
        # With no env token and gallery_dl unimportable, the outcome depends on
        # remaining fallbacks; only the return type is pinned here.
        with patch.dict("sys.modules", {"gallery_dl": None}):
            outcome = src.authenticate()
        assert isinstance(outcome, bool)

    def test_list_videos(self):
        from video_processor.sources.twitter_source import TwitterSource

        files = TwitterSource(url="https://twitter.com/user/status/123456").list_videos()
        assert isinstance(files, list)
        assert len(files) == 1
| 405 | + | |
| 406 | + | |
| 407 | +# --------------------------------------------------------------------------- | |
| 408 | +# ArxivSource | |
| 409 | +# --------------------------------------------------------------------------- | |
| 410 | + | |
| 411 | + | |
class TestArxivSource:
    """arXiv connector: id extraction from raw ids and URLs, metadata fetch."""

    def test_import(self):
        from video_processor.sources.arxiv_source import ArxivSource

        assert ArxivSource is not None

    def test_constructor(self):
        from video_processor.sources.arxiv_source import ArxivSource

        assert ArxivSource(url_or_id="2301.07041").arxiv_id == "2301.07041"

    def test_constructor_from_url(self):
        from video_processor.sources.arxiv_source import ArxivSource

        # Version suffixes in abs URLs are preserved.
        src = ArxivSource(url_or_id="https://arxiv.org/abs/2301.07041v2")
        assert src.arxiv_id == "2301.07041v2"

    def test_constructor_invalid(self):
        from video_processor.sources.arxiv_source import ArxivSource

        with pytest.raises(ValueError, match="Could not extract"):
            ArxivSource(url_or_id="not-an-arxiv-id")

    def test_authenticate(self):
        from video_processor.sources.arxiv_source import ArxivSource

        assert ArxivSource(url_or_id="2301.07041").authenticate() is True

    @patch("requests.get")
    def test_list_videos(self, mock_get):
        from video_processor.sources.arxiv_source import ArxivSource

        atom_xml = """<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom"
              xmlns:arxiv="http://arxiv.org/schemas/atom">
          <entry>
            <title>Test Paper</title>
            <summary>Abstract text here.</summary>
            <author><name>Author One</name></author>
            <published>2023-01-15T00:00:00Z</published>
          </entry>
        </feed>"""
        response = MagicMock()
        response.text = atom_xml
        response.raise_for_status = MagicMock()
        mock_get.return_value = response

        files = ArxivSource(url_or_id="2301.07041").list_videos()
        assert isinstance(files, list)
        assert len(files) == 2  # metadata + pdf
| 465 | + | |
| 466 | + | |
| 467 | +# --------------------------------------------------------------------------- | |
| 468 | +# S3Source | |
| 469 | +# --------------------------------------------------------------------------- | |
| 470 | + | |
| 471 | + | |
class TestS3Source:
    """S3 connector: boto3 is faked via sys.modules; only video keys survive."""

    @staticmethod
    def _boto3_with(client):
        # Build a fake boto3 module whose client() returns the given mock.
        fake = MagicMock()
        fake.client.return_value = client
        return fake

    def test_import(self):
        from video_processor.sources.s3_source import S3Source

        assert S3Source is not None

    def test_constructor(self):
        from video_processor.sources.s3_source import S3Source

        src = S3Source(bucket="my-bucket", prefix="videos/", region="us-east-1")
        assert src.bucket == "my-bucket"
        assert src.prefix == "videos/"
        assert src.region == "us-east-1"

    def test_authenticate_success(self):
        from video_processor.sources.s3_source import S3Source

        client = MagicMock()
        client.head_bucket.return_value = {}
        with patch.dict("sys.modules", {"boto3": self._boto3_with(client)}):
            assert S3Source(bucket="my-bucket").authenticate() is True

    def test_authenticate_failure(self):
        from video_processor.sources.s3_source import S3Source

        client = MagicMock()
        client.head_bucket.side_effect = Exception("Access Denied")
        with patch.dict("sys.modules", {"boto3": self._boto3_with(client)}):
            assert S3Source(bucket="bad-bucket").authenticate() is False

    def test_list_videos(self):
        from video_processor.sources.s3_source import S3Source

        client = MagicMock()
        client.head_bucket.return_value = {}
        paginator = MagicMock()
        paginator.paginate.return_value = [
            {
                "Contents": [
                    {"Key": "videos/clip.mp4", "Size": 5000},
                    {"Key": "videos/notes.txt", "Size": 100},
                    {"Key": "videos/movie.mkv", "Size": 90000},
                ]
            }
        ]
        client.get_paginator.return_value = paginator

        with patch.dict("sys.modules", {"boto3": self._boto3_with(client)}):
            src = S3Source(bucket="my-bucket")
            src.authenticate()
            files = src.list_videos()

        assert isinstance(files, list)
        # notes.txt is filtered out: only recognized video extensions remain.
        assert sorted(f.name for f in files) == ["clip.mp4", "movie.mkv"]
| 539 | + | |
| 540 | + | |
| 541 | +# --------------------------------------------------------------------------- | |
| 542 | +# GWSSource | |
| 543 | +# --------------------------------------------------------------------------- | |
| 544 | + | |
| 545 | + | |
| 546 | +class TestGWSSource: | |
| 547 | + def test_import(self): | |
| 548 | + from video_processor.sources.gws_source import GWSSource | |
| 549 | + | |
| 550 | + assert GWSSource is not None | |
| 551 | + | |
| 552 | + def test_constructor_defaults(self): | |
| 553 | + from video_processor.sources.gws_source import GWSSource | |
| 554 | + | |
| 555 | + src = GWSSource() | |
| 556 | + assert src.folder_id is None | |
| 557 | + assert src.query is None | |
| 558 | + assert src.doc_ids == [] | |
| 559 | + | |
| 560 | + def test_constructor_with_folder(self): | |
| 561 | + from video_processor.sources.gws_source import GWSSource | |
| 562 | + | |
| 563 | + src = GWSSource(folder_id="1abc", query="name contains 'spec'") | |
| 564 | + assert src.folder_id == "1abc" | |
| 565 | + assert src.query == "name contains 'spec'" | |
| 566 | + | |
| 567 | + def test_constructor_with_doc_ids(self): | |
| 568 | + from video_processor.sources.gws_source import GWSSource | |
| 569 | + | |
| 570 | + src = GWSSource(doc_ids=["doc1", "doc2"]) | |
| 571 | + assert src.doc_ids == ["doc1", "doc2"] | |
| 572 | + | |
| 573 | + @patch("shutil.which", return_value=None) | |
| 574 | + def test_authenticate_no_gws(self, _mock_which): | |
| 575 | + from video_processor.sources.gws_source import GWSSource | |
| 576 | + | |
| 577 | + src = GWSSource() | |
| 578 | + assert src.authenticate() is False | |
| 579 | + | |
| 580 | + @patch("video_processor.sources.gws_source._run_gws") | |
| 581 | + @patch("shutil.which", return_value="/usr/local/bin/gws") | |
| 582 | + def test_authenticate_success(self, _mock_which, mock_run): | |
| 583 | + from video_processor.sources.gws_source import GWSSource | |
| 584 | + | |
| 585 | + mock_run.return_value = {"connectedAs": "[email protected]"} | |
| 586 | + src = GWSSource() | |
| 587 | + assert src.authenticate() is True | |
| 588 | + | |
| 589 | + @patch("video_processor.sources.gws_source._run_gws") | |
| 590 | + @patch("shutil.which", return_value="/usr/local/bin/gws") | |
| 591 | + def test_list_videos(self, _mock_which, mock_run): | |
| 592 | + from video_processor.sources.gws_source import GWSSource | |
| 593 | + | |
| 594 | + mock_run.return_value = { | |
| 595 | + "files": [ | |
| 596 | + { | |
| 597 | + "id": "doc123", | |
| 598 | + "name": "Project Spec", | |
| 599 | + "mimeType": "application/vnd.google-apps.document", | |
| 600 | + "modifiedTime": "2026-01-01T00:00:00Z", | |
| 601 | + }, | |
| 602 | + { | |
| 603 | + "id": "sheet456", | |
| 604 | + "name": "Budget", | |
| 605 | + "mimeType": "application/vnd.google-apps.spreadsheet", | |
| 606 | + }, | |
| 607 | + ] | |
| 608 | + } | |
| 609 | + src = GWSSource(folder_id="folder1") | |
| 610 | + files = src.list_videos() | |
| 611 | + assert len(files) == 2 | |
| 612 | + assert files[0].name == "Project Spec" | |
| 613 | + assert files[1].id == "sheet456" | |
| 614 | + | |
| 615 | + @patch("video_processor.sources.gws_source._run_gws") | |
| 616 | + @patch("shutil.which", return_value="/usr/local/bin/gws") | |
| 617 | + def test_list_videos_with_doc_ids(self, _mock_which, mock_run): | |
| 618 | + from video_processor.sources.gws_source import GWSSource | |
| 619 | + | |
| 620 | + mock_run.return_value = { | |
| 621 | + "id": "doc123", | |
| 622 | + "name": "My Doc", | |
| 623 | + "mimeType": "application/vnd.google-apps.document", | |
| 624 | + } | |
| 625 | + src = GWSSource(doc_ids=["doc123"]) | |
| 626 | + files = src.list_videos() | |
| 627 | + assert len(files) == 1 | |
| 628 | + assert files[0].name == "My Doc" | |
| 629 | + | |
| 630 | + def test_result_to_source_file(self): | |
| 631 | + from video_processor.sources.gws_source import _result_to_source_file | |
| 632 | + | |
| 633 | + sf = _result_to_source_file( | |
| 634 | + { | |
| 635 | + "id": "abc", | |
| 636 | + "name": "Test Doc", | |
| 637 | + "mimeType": "text/plain", | |
| 638 | + "size": "1024", | |
| 639 | + "modifiedTime": "2026-03-01", | |
| 640 | + } | |
| 641 | + ) | |
| 642 | + assert sf.name == "Test Doc" | |
| 643 | + assert sf.id == "abc" | |
| 644 | + assert sf.size_bytes == 1024 | |
| 645 | + assert sf.mime_type == "text/plain" | |
| 646 | + | |
| 647 | + @patch("video_processor.sources.gws_source._run_gws") | |
| 648 | + def test_get_doc_text(self, mock_run): | |
| 649 | + from video_processor.sources.gws_source import GWSSource | |
| 650 | + | |
| 651 | + mock_run.return_value = { | |
| 652 | + "body": { | |
| 653 | + "content": [ | |
| 654 | + { | |
| 655 | + "paragraph": { | |
| 656 | + "elements": [ | |
| 657 | + {"textRun": {"content": "Hello world\n"}}, | |
| 658 | + ] | |
| 659 | + } | |
| 660 | + }, | |
| 661 | + { | |
| 662 | + "paragraph": { | |
| 663 | + "elements": [ | |
| 664 | + {"textRun": {"content": "Second paragraph\n"}}, | |
| 665 | + ] | |
| 666 | + } | |
| 667 | + }, | |
| 668 | + ] | |
| 669 | + } | |
| 670 | + } | |
| 671 | + src = GWSSource() | |
| 672 | + text = src._get_doc_text("doc123") | |
| 673 | + assert "Hello world" in text | |
| 674 | + assert "Second paragraph" in text | |
| 675 | + | |
| 676 | + @patch("video_processor.sources.gws_source._run_gws") | |
| 677 | + def test_collate(self, mock_run): | |
| 678 | + from video_processor.sources.gws_source import GWSSource | |
| 679 | + | |
| 680 | + # First call: list files, second+: export each | |
| 681 | + mock_run.side_effect = [ | |
| 682 | + { | |
| 683 | + "files": [ | |
| 684 | + { | |
| 685 | + "id": "d1", | |
| 686 | + "name": "Doc A", | |
| 687 | + "mimeType": "application/vnd.google-apps.document", | |
| 688 | + }, | |
| 689 | + ] | |
| 690 | + }, | |
| 691 | + {"raw": "Content of Doc A"}, | |
| 692 | + ] | |
| 693 | + src = GWSSource(folder_id="f1") | |
| 694 | + result = src.collate() | |
| 695 | + assert "Doc A" in result | |
| 696 | + assert "Content of Doc A" in result | |
| 697 | + | |
| 698 | + | |
| 699 | +# --------------------------------------------------------------------------- | |
| 700 | +# M365Source | |
| 701 | +# --------------------------------------------------------------------------- | |
| 702 | + | |
| 703 | + | |
class TestM365Source:
    """Tests for the SharePoint/OneDrive source backed by the m365 CLI.

    All CLI interaction goes through a patched ``_run_m365``; no network
    access or real login is needed.
    """

    def test_import(self):
        """The source class is importable."""
        from video_processor.sources.m365_source import M365Source

        assert M365Source is not None

    def test_constructor(self):
        """Site and folder URLs are stored; list/flag options get safe defaults."""
        from video_processor.sources.m365_source import M365Source

        source = M365Source(
            web_url="https://contoso.sharepoint.com/sites/proj",
            folder_url="/sites/proj/Shared Documents",
        )
        assert source.web_url == "https://contoso.sharepoint.com/sites/proj"
        assert source.folder_url == "/sites/proj/Shared Documents"
        assert source.file_ids == []
        assert source.recursive is False

    def test_constructor_with_file_ids(self):
        """Explicit file IDs are kept as given."""
        from video_processor.sources.m365_source import M365Source

        source = M365Source(
            web_url="https://contoso.sharepoint.com",
            file_ids=["id1", "id2"],
        )
        assert source.file_ids == ["id1", "id2"]

    @patch("shutil.which", return_value=None)
    def test_authenticate_no_m365(self, _which):
        """authenticate() fails when the m365 CLI is not on PATH."""
        from video_processor.sources.m365_source import M365Source

        source = M365Source(web_url="https://contoso.sharepoint.com")
        assert source.authenticate() is False

    @patch("video_processor.sources.m365_source._run_m365")
    @patch("shutil.which", return_value="/usr/local/bin/m365")
    def test_authenticate_logged_in(self, _which, run_cli):
        """A status payload with connectedAs means we are logged in."""
        from video_processor.sources.m365_source import M365Source

        run_cli.return_value = {"connectedAs": "[email protected]"}
        source = M365Source(web_url="https://contoso.sharepoint.com")
        assert source.authenticate() is True

    @patch("video_processor.sources.m365_source._run_m365")
    @patch("shutil.which", return_value="/usr/local/bin/m365")
    def test_authenticate_not_logged_in(self, _which, run_cli):
        """An empty status payload means not logged in."""
        from video_processor.sources.m365_source import M365Source

        run_cli.return_value = {}
        source = M365Source(web_url="https://contoso.sharepoint.com")
        assert source.authenticate() is False

    @patch("video_processor.sources.m365_source._run_m365")
    @patch("shutil.which", return_value="/usr/local/bin/m365")
    def test_list_videos(self, _which, run_cli):
        """Folder listings keep only document-extension files."""
        from video_processor.sources.m365_source import M365Source

        listing = [
            {
                "Name": "spec.docx",
                "UniqueId": "uid-1",
                "Length": "20480",
                "ServerRelativeUrl": "/sites/proj/docs/spec.docx",
            },
            {
                "Name": "budget.xlsx",
                "UniqueId": "uid-2",
                "Length": "10240",
                "ServerRelativeUrl": "/sites/proj/docs/budget.xlsx",
            },
            {
                "Name": "image.png",
                "UniqueId": "uid-3",
                "Length": "5000",
                "ServerRelativeUrl": "/sites/proj/docs/image.png",
            },
        ]
        # First CLI call answers authenticate(); the second returns the listing.
        run_cli.side_effect = [{"connectedAs": "[email protected]"}, listing]
        source = M365Source(
            web_url="https://contoso.sharepoint.com/sites/proj",
            folder_url="/sites/proj/docs",
        )
        source.authenticate()
        listed = source.list_videos()
        # Only .docx and .xlsx match _DOC_EXTENSIONS, not .png
        assert len(listed) == 2
        listed_names = [entry.name for entry in listed]
        assert "spec.docx" in listed_names
        assert "budget.xlsx" in listed_names

    @patch("video_processor.sources.m365_source._run_m365")
    def test_list_videos_with_file_ids(self, run_cli):
        """Explicit file IDs are resolved one by one."""
        from video_processor.sources.m365_source import M365Source

        run_cli.return_value = {
            "Name": "report.pdf",
            "UniqueId": "uid-1",
            "Length": "50000",
            "ServerRelativeUrl": "/sites/proj/docs/report.pdf",
        }
        source = M365Source(
            web_url="https://contoso.sharepoint.com",
            file_ids=["uid-1"],
        )
        listed = source.list_videos()
        assert len(listed) == 1
        assert listed[0].name == "report.pdf"

    def test_result_to_source_file(self):
        """A SharePoint item maps onto the SourceFile fields."""
        from video_processor.sources.m365_source import _result_to_source_file

        item = {
            "Name": "notes.txt",
            "UniqueId": "abc-123",
            "Length": "512",
            "ServerRelativeUrl": "/sites/proj/notes.txt",
            "TimeLastModified": "2026-03-01T12:00:00Z",
        }
        sf = _result_to_source_file(item)
        assert sf.name == "notes.txt"
        assert sf.id == "abc-123"
        # The string "Length" from the API is coerced to an int.
        assert sf.size_bytes == 512
        assert sf.path == "/sites/proj/notes.txt"
        assert sf.modified_at == "2026-03-01T12:00:00Z"

    def test_extract_text_txt(self, tmp_path):
        """Plain-text files are returned verbatim."""
        from video_processor.sources.m365_source import _extract_text

        sample = tmp_path / "test.txt"
        sample.write_text("Hello from a text file")
        assert _extract_text(sample) == "Hello from a text file"

    def test_extract_text_md(self, tmp_path):
        """Markdown files keep their heading and body text."""
        from video_processor.sources.m365_source import _extract_text

        sample = tmp_path / "readme.md"
        sample.write_text("# Title\n\nSome content")
        extracted = _extract_text(sample)
        assert "Title" in extracted
        assert "Some content" in extracted

    def test_extract_text_unsupported(self, tmp_path):
        """Binary files yield an "Unsupported" marker instead of raising."""
        from video_processor.sources.m365_source import _extract_text

        sample = tmp_path / "data.bin"
        sample.write_bytes(b"\x00\x01\x02")
        assert "Unsupported" in _extract_text(sample)

    def test_list_no_folder_url(self):
        """Without a folder URL or file IDs there is nothing to list."""
        from video_processor.sources.m365_source import M365Source

        source = M365Source(web_url="https://contoso.sharepoint.com")
        assert source.list_videos() == []
| 863 | + | |
| 864 | + | |
| 865 | +# --------------------------------------------------------------------------- | |
| 866 | +# ObsidianSource | |
| 867 | +# --------------------------------------------------------------------------- | |
| 868 | + | |
| 869 | + | |
class TestObsidianSource:
    """Tests for the local Obsidian-vault source and its note parser."""

    def test_import(self):
        """The source class is importable."""
        from video_processor.sources.obsidian_source import ObsidianSource

        assert ObsidianSource is not None

    def test_constructor(self, tmp_path):
        """The vault path string is normalized to a Path."""
        from video_processor.sources.obsidian_source import ObsidianSource

        source = ObsidianSource(vault_path=str(tmp_path))
        assert source.vault_path == tmp_path

    def test_authenticate_with_vault(self, tmp_path):
        """A directory containing at least one .md note authenticates."""
        from video_processor.sources.obsidian_source import ObsidianSource

        (tmp_path / "note.md").write_text("# Hello")
        assert ObsidianSource(vault_path=str(tmp_path)).authenticate() is True

    def test_authenticate_empty_dir(self, tmp_path):
        """An existing but empty directory is not a vault."""
        from video_processor.sources.obsidian_source import ObsidianSource

        assert ObsidianSource(vault_path=str(tmp_path)).authenticate() is False

    def test_authenticate_nonexistent(self, tmp_path):
        """A missing directory is not a vault."""
        from video_processor.sources.obsidian_source import ObsidianSource

        missing = tmp_path / "nonexistent"
        assert ObsidianSource(vault_path=str(missing)).authenticate() is False

    def test_parse_note(self, tmp_path):
        """Frontmatter, wiki-links, inline tags, headings, and body are extracted."""
        from video_processor.sources.obsidian_source import parse_note

        note_file = tmp_path / "test_note.md"
        note_file.write_text(
            "---\n"
            "title: Test Note\n"
            "tags: [python, testing]\n"
            "---\n"
            "# Heading One\n\n"
            "Some text with a [[Wiki Link]] and [[Another Page|alias]].\n\n"
            "Also has #tag1 and #tag2 inline tags.\n\n"
            "## Sub Heading\n\n"
            "More content here.\n"
        )

        parsed = parse_note(note_file)

        frontmatter = parsed["frontmatter"]
        assert frontmatter["title"] == "Test Note"
        assert isinstance(frontmatter["tags"], list)
        assert "python" in frontmatter["tags"]
        # Aliased links ([[Page|alias]]) resolve to the page name.
        for link in ("Wiki Link", "Another Page"):
            assert link in parsed["links"]
        for tag in ("tag1", "tag2"):
            assert tag in parsed["tags"]
        headings = parsed["headings"]
        assert len(headings) == 2
        assert headings[0]["level"] == 1
        assert headings[0]["text"] == "Heading One"
        assert "Some text" in parsed["body"]

    def test_ingest_vault(self, tmp_path):
        """All notes are ingested and their outgoing wiki-links collected."""
        from video_processor.sources.obsidian_source import ingest_vault

        (tmp_path / "note_a.md").write_text("# A\n\nLinks to [[B]].\n")
        (tmp_path / "note_b.md").write_text("# B\n\nLinks to [[A]] and [[C]].\n")

        vault = ingest_vault(tmp_path)

        assert len(vault["notes"]) == 2
        note_names = [note["name"] for note in vault["notes"]]
        assert "note_a" in note_names
        assert "note_b" in note_names
        # note_a links to B, note_b links to A and C => 3 links
        assert len(vault["links"]) == 3

    def test_list_videos(self, tmp_path):
        """Notes are discovered recursively and tagged as markdown."""
        from video_processor.sources.obsidian_source import ObsidianSource

        (tmp_path / "note1.md").write_text("# Note 1")
        nested = tmp_path / "subdir"
        nested.mkdir()
        (nested / "note2.md").write_text("# Note 2")

        listed = ObsidianSource(vault_path=str(tmp_path)).list_videos()
        assert len(listed) == 2
        assert all(entry.mime_type == "text/markdown" for entry in listed)
| 959 | + | |
| 960 | + | |
| 961 | +# --------------------------------------------------------------------------- | |
| 962 | +# LogseqSource | |
| 963 | +# --------------------------------------------------------------------------- | |
| 964 | + | |
| 965 | + | |
class TestLogseqSource:
    """Tests for the local Logseq-graph source and its page parser."""

    def test_import(self):
        """The source class is importable."""
        from video_processor.sources.logseq_source import LogseqSource

        assert LogseqSource is not None

    def test_constructor(self, tmp_path):
        """The graph path string is normalized to a Path."""
        from video_processor.sources.logseq_source import LogseqSource

        source = LogseqSource(graph_path=str(tmp_path))
        assert source.graph_path == tmp_path

    def test_authenticate_with_pages(self, tmp_path):
        """A directory with a pages/ subfolder authenticates."""
        from video_processor.sources.logseq_source import LogseqSource

        (tmp_path / "pages").mkdir()
        assert LogseqSource(graph_path=str(tmp_path)).authenticate() is True

    def test_authenticate_no_pages_or_journals(self, tmp_path):
        """A directory with neither pages/ nor journals/ is not a graph."""
        from video_processor.sources.logseq_source import LogseqSource

        assert LogseqSource(graph_path=str(tmp_path)).authenticate() is False

    def test_authenticate_nonexistent(self, tmp_path):
        """A missing directory is not a graph."""
        from video_processor.sources.logseq_source import LogseqSource

        missing = tmp_path / "nonexistent"
        assert LogseqSource(graph_path=str(missing)).authenticate() is False

    def test_parse_page(self, tmp_path):
        """Properties, links, tags, block refs, and body are extracted."""
        from video_processor.sources.logseq_source import parse_page

        page_file = tmp_path / "my_page.md"
        page_file.write_text(
            "title:: My Page\n"
            "tags:: #project #important\n"
            "- Some block content\n"
            " - Nested with [[Another Page]] link\n"
            " - And a #todo tag\n"
            " - Block ref ((abc12345-6789-0abc-def0-123456789abc))\n"
        )

        parsed = parse_page(page_file)

        assert parsed["properties"]["title"] == "My Page"
        assert "Another Page" in parsed["links"]
        assert "todo" in parsed["tags"]
        assert "abc12345-6789-0abc-def0-123456789abc" in parsed["block_refs"]
        assert "Some block content" in parsed["body"]

    def test_ingest_graph(self, tmp_path):
        """Pages and journals are both ingested; links are collected."""
        from video_processor.sources.logseq_source import ingest_graph

        pages = tmp_path / "pages"
        pages.mkdir()
        (pages / "page_a.md").write_text("- Content linking [[Page B]]\n")
        (pages / "page_b.md").write_text("- Content linking [[Page A]]\n")

        journals = tmp_path / "journals"
        journals.mkdir()
        (journals / "2026_03_07.md").write_text("- Journal entry\n")

        graph = ingest_graph(tmp_path)

        # Two pages plus one journal note; one outgoing link per page.
        assert len(graph["notes"]) == 3
        assert len(graph["links"]) == 2

    def test_list_videos(self, tmp_path):
        """Pages are discovered and tagged as markdown."""
        from video_processor.sources.logseq_source import LogseqSource

        pages = tmp_path / "pages"
        pages.mkdir()
        (pages / "page1.md").write_text("- content")

        listed = LogseqSource(graph_path=str(tmp_path)).list_videos()
        assert len(listed) == 1
        assert listed[0].mime_type == "text/markdown"
| 1047 | + | |
| 1048 | + | |
| 1049 | +# --------------------------------------------------------------------------- | |
| 1050 | +# NotionSource | |
| 1051 | +# --------------------------------------------------------------------------- | |
| 1052 | + | |
| 1053 | + | |
class TestNotionSource:
    """Tests for the Notion REST API source (all HTTP mocked)."""

    def test_import(self):
        """The source class is importable."""
        from video_processor.sources.notion_source import NotionSource

        assert NotionSource is not None

    def test_constructor(self):
        """Token and database ID are stored; page IDs default to empty."""
        from video_processor.sources.notion_source import NotionSource

        source = NotionSource(token="ntn_test123", database_id="db-1")
        assert source.token == "ntn_test123"
        assert source.database_id == "db-1"
        assert source.page_ids == []

    @patch.dict(os.environ, {}, clear=True)
    def test_authenticate_no_token(self):
        """An empty token (and empty environment) cannot authenticate."""
        from video_processor.sources.notion_source import NotionSource

        assert NotionSource(token="").authenticate() is False

    @patch("requests.get")
    def test_authenticate_with_mock(self, get_mock):
        """A successful /users/me style response authenticates."""
        from video_processor.sources.notion_source import NotionSource

        response = MagicMock()
        response.raise_for_status = MagicMock()
        response.json.return_value = {"name": "Test Bot"}
        get_mock.return_value = response

        assert NotionSource(token="ntn_test123").authenticate() is True

    @patch("requests.post")
    def test_list_videos_database(self, post_mock):
        """Database query results become SourceFile records with title names."""
        from video_processor.sources.notion_source import NotionSource

        response = MagicMock()
        response.raise_for_status = MagicMock()
        response.json.return_value = {
            "results": [
                {
                    "id": "page-1",
                    "last_edited_time": "2026-03-01T00:00:00Z",
                    "properties": {
                        "Name": {
                            "type": "title",
                            "title": [{"plain_text": "Meeting Notes"}],
                        }
                    },
                },
            ],
            "has_more": False,
        }
        post_mock.return_value = response

        pages = NotionSource(token="ntn_test", database_id="db-1").list_videos()
        assert len(pages) == 1
        assert pages[0].name == "Meeting Notes"
        assert pages[0].id == "page-1"

    def test_blocks_to_text(self):
        """Headings, paragraphs, bullets, and dividers render to markdown-ish text."""
        from video_processor.sources.notion_source import NotionSource

        def rich(text):
            # Minimal Notion rich-text payload for one block.
            return {"rich_text": [{"plain_text": text}]}

        blocks = [
            {"type": "heading_1", "heading_1": rich("Title")},
            {"type": "paragraph", "paragraph": rich("Some paragraph text.")},
            {
                "type": "bulleted_list_item",
                "bulleted_list_item": rich("A bullet point"),
            },
            {"type": "divider", "divider": {}},
        ]
        rendered = NotionSource(token="test")._blocks_to_text(blocks)
        assert "# Title" in rendered
        assert "Some paragraph text." in rendered
        assert "- A bullet point" in rendered
        assert "---" in rendered
| 1149 | + | |
| 1150 | + | |
| 1151 | +# --------------------------------------------------------------------------- | |
| 1152 | +# AppleNotesSource | |
| 1153 | +# --------------------------------------------------------------------------- | |
| 1154 | + | |
| 1155 | + | |
class TestAppleNotesSource:
    """Tests for the macOS Apple Notes source.

    Only import/constructor/HTML-conversion behavior is exercised; no
    AppleScript is executed.
    """

    def test_import(self):
        """The source class is importable."""
        from video_processor.sources.apple_notes_source import AppleNotesSource

        assert AppleNotesSource is not None

    def test_constructor(self):
        """An explicit folder name is stored on the instance."""
        from video_processor.sources.apple_notes_source import AppleNotesSource

        src = AppleNotesSource(folder="Work")
        assert src.folder == "Work"

    def test_constructor_default(self):
        """With no arguments the folder filter defaults to None (all folders)."""
        from video_processor.sources.apple_notes_source import AppleNotesSource

        src = AppleNotesSource()
        assert src.folder is None

    def test_authenticate_platform(self):
        """authenticate() succeeds only on macOS (sys.platform == "darwin")."""
        import sys

        from video_processor.sources.apple_notes_source import AppleNotesSource

        src = AppleNotesSource()
        result = src.authenticate()
        if sys.platform == "darwin":
            assert result is True
        else:
            assert result is False

    def test_html_to_text(self):
        """Markup is stripped and entities decoded when converting note HTML."""
        from video_processor.sources.apple_notes_source import AppleNotesSource

        html = (
            "<div>Hello <b>World</b></div>"
            "<p>Paragraph one.</p>"
            "<p>Paragraph two with & entity.</p>"
            "<br/>"
            "<ul><li>Item 1</li><li>Item 2</li></ul>"
        )
        result = AppleNotesSource._html_to_text(html)
        assert "Hello World" in result
        assert "Paragraph one." in result
        assert "Paragraph two with & entity." in result
        assert "Item 1" in result

    def test_html_to_text_empty(self):
        """Empty input yields empty output."""
        from video_processor.sources.apple_notes_source import AppleNotesSource

        assert AppleNotesSource._html_to_text("") == ""

    def test_html_to_text_entities(self):
        """Escaped entities come through as their literal characters."""
        from video_processor.sources.apple_notes_source import AppleNotesSource

        # NOTE(review): this fixture looks like its HTML entity escapes were
        # decoded by an intermediate rendering — confirm the literal against
        # the repository copy before editing.
        html = "<code> "test" 'single' space"
        result = AppleNotesSource._html_to_text(html)
        assert "<code>" in result
        assert '"test"' in result
        assert "'single'" in result
| 1215 | + | |
| 1216 | + | |
| 1217 | +# --------------------------------------------------------------------------- | |
| 1218 | +# GoogleKeepSource | |
| 1219 | +# --------------------------------------------------------------------------- | |
| 1220 | + | |
| 1221 | + | |
class TestGoogleKeepSource:
    """Tests for the Google Keep source (gws CLI backed)."""

    def test_import(self):
        """The source class is importable."""
        from video_processor.sources.google_keep_source import GoogleKeepSource

        assert GoogleKeepSource is not None

    def test_constructor(self):
        """An explicit label filter is stored on the instance."""
        from video_processor.sources.google_keep_source import GoogleKeepSource

        assert GoogleKeepSource(label="meetings").label == "meetings"

    def test_constructor_default(self):
        """With no arguments the label filter defaults to None."""
        from video_processor.sources.google_keep_source import GoogleKeepSource

        assert GoogleKeepSource().label is None

    @patch("shutil.which", return_value=None)
    def test_authenticate_no_gws(self, _which):
        """authenticate() fails when the gws CLI is not on PATH."""
        from video_processor.sources.google_keep_source import GoogleKeepSource

        assert GoogleKeepSource().authenticate() is False

    def test_note_to_text(self):
        """Title, body, and checklist items are all rendered."""
        from video_processor.sources.google_keep_source import _note_to_text

        note = {
            "title": "Shopping List",
            "body": "Remember to buy groceries",
            "listContent": [
                {"text": "Milk", "checked": True},
                {"text": "Bread", "checked": False},
                {"text": "", "checked": False},
            ],
        }
        rendered = _note_to_text(note)
        assert "Shopping List" in rendered
        assert "Remember to buy groceries" in rendered
        # Checked items render as "[x]", unchecked as "[ ]".
        assert "- [x] Milk" in rendered
        assert "- [ ] Bread" in rendered

    def test_note_to_text_empty(self):
        """An empty note renders to an empty string."""
        from video_processor.sources.google_keep_source import _note_to_text

        assert _note_to_text({}) == ""

    def test_note_to_text_text_content(self):
        """Plain textContent notes include title and body text."""
        from video_processor.sources.google_keep_source import _note_to_text

        rendered = _note_to_text(
            {"title": "Simple", "textContent": "Just a plain note"}
        )
        assert "Simple" in rendered
        assert "Just a plain note" in rendered
| 1277 | + | |
| 1278 | + | |
| 1279 | +# --------------------------------------------------------------------------- | |
| 1280 | +# OneNoteSource | |
| 1281 | +# --------------------------------------------------------------------------- | |
| 1282 | + | |
| 1283 | + | |
class TestOneNoteSource:
    """Tests for the OneNote source.

    Only constructor wiring, CLI-availability checks, and the HTML→text
    helper are exercised here.
    """

    def test_import(self):
        """The source class is importable."""
        from video_processor.sources.onenote_source import OneNoteSource

        assert OneNoteSource is not None

    def test_constructor(self):
        """Notebook and section filters are stored on the instance."""
        from video_processor.sources.onenote_source import OneNoteSource

        src = OneNoteSource(notebook_name="Work Notes", section_name="Meetings")
        assert src.notebook_name == "Work Notes"
        assert src.section_name == "Meetings"

    def test_constructor_default(self):
        """Both filters default to None (all notebooks/sections)."""
        from video_processor.sources.onenote_source import OneNoteSource

        src = OneNoteSource()
        assert src.notebook_name is None
        assert src.section_name is None

    @patch("shutil.which", return_value=None)
    def test_authenticate_no_m365(self, _mock_which):
        """authenticate() fails when the m365 CLI is not on PATH."""
        from video_processor.sources.onenote_source import OneNoteSource

        src = OneNoteSource()
        assert src.authenticate() is False

    def test_html_to_text(self):
        """Markup is stripped, script/style bodies dropped, entities decoded."""
        from video_processor.sources.onenote_source import _html_to_text

        html = (
            "<html><body>"
            "<h1>Meeting Notes</h1>"
            "<p>Discussed the & project.</p>"
            "<script>var x = 1;</script>"
            "<style>.foo { color: red; }</style>"
            "<ul><li>Action item 1</li><li>Action item 2</li></ul>"
            "<p>Entity A and A decoded.</p>"
            "</body></html>"
        )
        result = _html_to_text(html)
        assert "Meeting Notes" in result
        assert "Discussed the & project." in result
        # Script and style contents must not leak into the extracted text.
        assert "var x" not in result
        assert ".foo" not in result
        assert "Action item 1" in result
        assert "Entity A and A decoded." in result

    def test_html_to_text_empty(self):
        """Empty input yields empty output."""
        from video_processor.sources.onenote_source import _html_to_text

        assert _html_to_text("") == ""

    def test_html_to_text_entities(self):
        """Escaped entities come through as their literal characters."""
        from video_processor.sources.onenote_source import _html_to_text

        # NOTE(review): this fixture looks like its HTML entity escapes were
        # decoded by an intermediate rendering — confirm the literal against
        # the repository copy before editing.
        html = "<tag> "quoted" 'apos' space"
        result = _html_to_text(html)
        assert "<tag>" in result
        assert '"quoted"' in result
        assert "'apos'" in result
| 1345 | + | |
| 1346 | + | |
| 1347 | +# --------------------------------------------------------------------------- | |
| 1348 | +# ZoomSource | |
| 1349 | +# --------------------------------------------------------------------------- | |
| 1350 | + | |
| 1351 | + | |
class TestZoomSource:
    """Tests for the Zoom cloud-recording source (no network calls made)."""

    def test_import(self):
        """The source class is importable."""
        from video_processor.sources.zoom_source import ZoomSource

        assert ZoomSource is not None

    def test_constructor_defaults(self):
        """Defaults are sane; credentials may come from the environment."""
        from video_processor.sources.zoom_source import ZoomSource

        source = ZoomSource()
        # client_id may be picked up from the environment, hence str-or-None.
        assert source.client_id is None or isinstance(source.client_id, str)
        assert source._access_token is None

    def test_constructor_explicit(self):
        """Explicit OAuth credentials are stored verbatim."""
        from video_processor.sources.zoom_source import ZoomSource

        source = ZoomSource(
            client_id="cid",
            client_secret="csec",
            account_id="aid",
        )
        assert source.client_id == "cid"
        assert source.client_secret == "csec"
        assert source.account_id == "aid"

    def test_authenticate_no_credentials(self):
        """Without token, account_id, or client_id authentication fails."""
        from video_processor.sources.zoom_source import ZoomSource

        # No saved token, no account_id, no client_id → should fail
        source = ZoomSource(client_id=None, client_secret=None, account_id=None)
        assert source.authenticate() is False

    def test_list_videos_not_authenticated(self):
        """Listing without authenticating raises RuntimeError."""
        from video_processor.sources.zoom_source import ZoomSource

        with pytest.raises(RuntimeError, match="Not authenticated"):
            ZoomSource().list_videos()

    def test_download_not_authenticated(self):
        """Downloading without authenticating raises RuntimeError."""
        from video_processor.sources.zoom_source import ZoomSource

        target = SourceFile(name="test.mp4", id="123")
        with pytest.raises(RuntimeError, match="Not authenticated"):
            ZoomSource().download(target, "/tmp/test.mp4")

    def test_fetch_transcript_not_authenticated(self):
        """Fetching a transcript without authenticating raises RuntimeError."""
        from video_processor.sources.zoom_source import ZoomSource

        with pytest.raises(RuntimeError, match="Not authenticated"):
            ZoomSource().fetch_transcript("meeting123")

    def test_mime_types_mapping(self):
        """The recording-type → MIME-type table covers the common formats."""
        from video_processor.sources.zoom_source import _MIME_TYPES

        assert _MIME_TYPES["MP4"] == "video/mp4"
        assert _MIME_TYPES["TRANSCRIPT"] == "text/vtt"
        assert _MIME_TYPES["M4A"] == "audio/mp4"
| 1412 | + | |
| 1413 | + | |
| 1414 | +# --------------------------------------------------------------------------- | |
| 1415 | +# TeamsRecordingSource | |
| 1416 | +# --------------------------------------------------------------------------- | |
| 1417 | + | |
| 1418 | + | |
class TestTeamsRecordingSource:
    """Teams recording source: construction, auth gating, and VTT parsing."""

    def test_import(self):
        from video_processor.sources.teams_recording_source import (
            TeamsRecordingSource,
        )

        assert TeamsRecordingSource is not None

    def test_constructor_default(self):
        from video_processor.sources.teams_recording_source import (
            TeamsRecordingSource,
        )

        assert TeamsRecordingSource().user_id == "me"

    def test_constructor_custom_user(self):
        from video_processor.sources.teams_recording_source import (
            TeamsRecordingSource,
        )

        source = TeamsRecordingSource(user_id="[email protected]")
        assert source.user_id == "[email protected]"

    @patch("shutil.which", return_value=None)
    def test_authenticate_no_m365(self, _mock_which):
        from video_processor.sources.teams_recording_source import (
            TeamsRecordingSource,
        )

        # The m365 CLI is absent from PATH, so authentication cannot succeed.
        assert TeamsRecordingSource().authenticate() is False

    def test_vtt_to_text(self):
        from video_processor.sources.teams_recording_source import (
            _vtt_to_text,
        )

        cue_lines = [
            "WEBVTT",
            "",
            "1",
            "00:00:01.000 --> 00:00:05.000",
            "<v Speaker1>Hello everyone",
            "",
            "2",
            "00:00:05.000 --> 00:00:10.000",
            "<v Speaker2>Welcome to the meeting",
        ]
        text = _vtt_to_text("\n".join(cue_lines) + "\n")
        assert "Hello everyone" in text
        assert "Welcome to the meeting" in text
        # Header and cue-timing lines must be stripped out.
        assert "WEBVTT" not in text
        assert "-->" not in text

    def test_vtt_to_text_empty(self):
        from video_processor.sources.teams_recording_source import (
            _vtt_to_text,
        )

        assert _vtt_to_text("") == ""

    def test_vtt_to_text_deduplicates(self):
        from video_processor.sources.teams_recording_source import (
            _vtt_to_text,
        )

        cue_lines = [
            "WEBVTT",
            "",
            "00:00:01.000 --> 00:00:03.000",
            "Same line",
            "",
            "00:00:03.000 --> 00:00:05.000",
            "Same line",
        ]
        text = _vtt_to_text("\n".join(cue_lines) + "\n")
        # Consecutive identical captions collapse to a single occurrence.
        assert text.count("Same line") == 1

    def test_extract_meetings_list_dict(self):
        from video_processor.sources.teams_recording_source import (
            TeamsRecordingSource,
        )

        meetings = TeamsRecordingSource()._extract_meetings_list(
            {"value": [{"id": "m1"}]}
        )
        assert len(meetings) == 1

    def test_extract_meetings_list_list(self):
        from video_processor.sources.teams_recording_source import (
            TeamsRecordingSource,
        )

        meetings = TeamsRecordingSource()._extract_meetings_list([{"id": "m1"}])
        assert len(meetings) == 1
| 1511 | + | |
| 1512 | + | |
| 1513 | +# --------------------------------------------------------------------------- | |
| 1514 | +# MeetRecordingSource | |
| 1515 | +# --------------------------------------------------------------------------- | |
| 1516 | + | |
| 1517 | + | |
class TestMeetRecordingSource:
    """Meet recording source, plus lazy re-exports from video_processor.sources.

    The final assertion of ``test_zoom_lazy_import`` was truncated in the
    rendered diff (``ass``); it is restored here to match its siblings.
    """

    def test_import(self):
        from video_processor.sources.meet_recording_source import (
            MeetRecordingSource,
        )

        assert MeetRecordingSource is not None

    def test_constructor_default(self):
        from video_processor.sources.meet_recording_source import (
            MeetRecordingSource,
        )

        src = MeetRecordingSource()
        assert src.drive_folder_id is None

    def test_constructor_with_folder(self):
        from video_processor.sources.meet_recording_source import (
            MeetRecordingSource,
        )

        src = MeetRecordingSource(drive_folder_id="folder123")
        assert src.drive_folder_id == "folder123"

    @patch("shutil.which", return_value=None)
    def test_authenticate_no_gws(self, _mock_which):
        from video_processor.sources.meet_recording_source import (
            MeetRecordingSource,
        )

        # The gws CLI is absent from PATH, so authentication cannot succeed.
        src = MeetRecordingSource()
        assert src.authenticate() is False

    def test_find_matching_transcript_date_extraction(self):
        import re

        # Mirrors the date-matching regex used to pair recordings with
        # transcripts by ISO date prefix.
        name = "Meet Recording 2026-03-07T14:30:00"
        match = re.search(r"\d{4}-\d{2}-\d{2}", name)
        assert match is not None
        assert match.group(0) == "2026-03-07"

    def test_lazy_import(self):
        from video_processor.sources import MeetRecordingSource

        assert MeetRecordingSource is not None

    def test_teams_lazy_import(self):
        from video_processor.sources import TeamsRecordingSource

        assert TeamsRecordingSource is not None

    def test_zoom_lazy_import(self):
        from video_processor.sources import ZoomSource

        assert ZoomSource is not None
| --- a/tests/test_sources.py | |
| +++ b/tests/test_sources.py | |
| @@ -0,0 +1,1572 @@ | |
| --- a/tests/test_sources.py | |
| +++ b/tests/test_sources.py | |
| @@ -0,0 +1,1572 @@ | |
| 1 | """Tests for all source connectors: import, instantiation, authenticate, list_videos.""" |
| 2 | |
import os
from unittest.mock import MagicMock, patch

import pytest

from video_processor.sources.base import BaseSource, SourceFile
| 9 | |
| 10 | # --------------------------------------------------------------------------- |
| 11 | # SourceFile model |
| 12 | # --------------------------------------------------------------------------- |
| 13 | |
| 14 | |
def test_source_file_creation():
    """A minimal SourceFile keeps its identity and defaults metadata to None."""
    minimal = SourceFile(name="test.mp4", id="abc123")
    assert minimal.name == "test.mp4"
    assert minimal.id == "abc123"
    assert minimal.size_bytes is None
    assert minimal.mime_type is None
| 21 | |
| 22 | |
def test_source_file_with_all_fields():
    """Every optional SourceFile field is stored exactly as provided."""
    populated = SourceFile(
        name="video.mp4",
        id="v1",
        size_bytes=1024,
        mime_type="video/mp4",
        modified_at="2025-01-01",
        path="folder/video.mp4",
    )
    assert populated.size_bytes == 1024
    assert populated.path == "folder/video.mp4"
| 34 | |
| 35 | |
| 36 | # --------------------------------------------------------------------------- |
| 37 | # YouTubeSource |
| 38 | # --------------------------------------------------------------------------- |
| 39 | |
| 40 | |
class TestYouTubeSource:
    """YouTube source: URL parsing, auth probing, and mocked metadata listing."""

    def test_import(self):
        from video_processor.sources.youtube_source import YouTubeSource

        assert YouTubeSource is not None

    def test_constructor(self):
        from video_processor.sources.youtube_source import YouTubeSource

        source = YouTubeSource(url="https://www.youtube.com/watch?v=dQw4w9WgXcQ")
        assert source.video_id == "dQw4w9WgXcQ"
        assert source.audio_only is False

    def test_constructor_audio_only(self):
        from video_processor.sources.youtube_source import YouTubeSource

        source = YouTubeSource(url="https://youtu.be/dQw4w9WgXcQ", audio_only=True)
        assert source.audio_only is True

    def test_constructor_shorts_url(self):
        from video_processor.sources.youtube_source import YouTubeSource

        # Shorts URLs carry the id in the path, not a query parameter.
        source = YouTubeSource(url="https://youtube.com/shorts/dQw4w9WgXcQ")
        assert source.video_id == "dQw4w9WgXcQ"

    def test_constructor_invalid_url(self):
        from video_processor.sources.youtube_source import YouTubeSource

        with pytest.raises(ValueError, match="Could not extract"):
            YouTubeSource(url="https://example.com/not-youtube")

    @patch.dict(os.environ, {}, clear=False)
    def test_authenticate_no_ytdlp(self):
        from video_processor.sources.youtube_source import YouTubeSource

        source = YouTubeSource(url="https://youtube.com/watch?v=dQw4w9WgXcQ")
        # Block the yt_dlp import; whether it is installed locally varies,
        # so only the return type is pinned here.
        with patch.dict("sys.modules", {"yt_dlp": None}):
            outcome = source.authenticate()
            assert isinstance(outcome, bool)

    def test_list_videos(self):
        from video_processor.sources.youtube_source import YouTubeSource

        ydl_instance = MagicMock()
        ydl_instance.__enter__ = MagicMock(return_value=ydl_instance)
        ydl_instance.__exit__ = MagicMock(return_value=False)
        ydl_instance.extract_info.return_value = {
            "title": "Test Video",
            "filesize": 1000,
        }
        fake_yt_dlp = MagicMock()
        fake_yt_dlp.YoutubeDL = MagicMock(return_value=ydl_instance)

        with patch.dict("sys.modules", {"yt_dlp": fake_yt_dlp}):
            source = YouTubeSource(url="https://youtube.com/watch?v=dQw4w9WgXcQ")
            listed = source.list_videos()
            assert isinstance(listed, list)
            assert len(listed) == 1
            assert listed[0].name == "Test Video"
| 103 | |
| 104 | |
| 105 | # --------------------------------------------------------------------------- |
| 106 | # WebSource |
| 107 | # --------------------------------------------------------------------------- |
| 108 | |
| 109 | |
class TestWebSource:
    """Web page source: trivially authenticates and lists one HTML entry."""

    def test_import(self):
        from video_processor.sources.web_source import WebSource

        assert WebSource is not None

    def test_constructor(self):
        from video_processor.sources.web_source import WebSource

        source = WebSource(url="https://example.com/page")
        assert source.url == "https://example.com/page"

    def test_authenticate(self):
        from video_processor.sources.web_source import WebSource

        # No credentials are needed for plain web pages.
        assert WebSource(url="https://example.com").authenticate() is True

    def test_list_videos(self):
        from video_processor.sources.web_source import WebSource

        listed = WebSource(url="https://example.com/article").list_videos()
        assert isinstance(listed, list)
        assert len(listed) == 1
        assert listed[0].mime_type == "text/html"
| 136 | |
| 137 | |
| 138 | # --------------------------------------------------------------------------- |
| 139 | # GitHubSource |
| 140 | # --------------------------------------------------------------------------- |
| 141 | |
| 142 | |
class TestGitHubSource:
    """GitHub source: env-token auth and mocked README/issues/PR listing."""

    def test_import(self):
        from video_processor.sources.github_source import GitHubSource

        assert GitHubSource is not None

    def test_constructor(self):
        from video_processor.sources.github_source import GitHubSource

        source = GitHubSource(repo="owner/repo")
        assert source.repo == "owner/repo"
        assert source.include_issues is True
        assert source.include_prs is True

    @patch.dict(os.environ, {"GITHUB_TOKEN": "ghp_test123"})
    def test_authenticate_with_env_token(self):
        from video_processor.sources.github_source import GitHubSource

        source = GitHubSource(repo="owner/repo")
        assert source.authenticate() is True
        # Token is picked up from the environment.
        assert source._token == "ghp_test123"

    @patch("requests.get")
    @patch.dict(os.environ, {"GITHUB_TOKEN": "ghp_test123"})
    def test_list_videos(self, mock_get):
        from video_processor.sources.github_source import GitHubSource

        def ok_response(payload=None):
            # Build a successful response mock, optionally with a JSON body.
            resp = MagicMock()
            resp.ok = True
            if payload is not None:
                resp.json.return_value = payload
            return resp

        # Served in request order: README, then issues, then PRs.
        mock_get.side_effect = [
            ok_response(),
            ok_response(
                [
                    {"number": 1, "title": "Bug report", "id": 1},
                    {"number": 2, "title": "Feature request", "id": 2, "pull_request": {}},
                ]
            ),
            ok_response([{"number": 3, "title": "Fix bug"}]),
        ]

        source = GitHubSource(repo="owner/repo")
        source.authenticate()
        listed = source.list_videos()
        assert isinstance(listed, list)
        # README + one true issue (#2 is filtered as a PR) + one PR = 3 entries.
        assert len(listed) == 3
| 196 | |
| 197 | |
| 198 | # --------------------------------------------------------------------------- |
| 199 | # RedditSource |
| 200 | # --------------------------------------------------------------------------- |
| 201 | |
| 202 | |
class TestRedditSource:
    """Reddit source: URL normalization and single plain-text listing."""

    def test_import(self):
        from video_processor.sources.reddit_source import RedditSource

        assert RedditSource is not None

    def test_constructor(self):
        from video_processor.sources.reddit_source import RedditSource

        source = RedditSource(url="https://reddit.com/r/python/comments/abc123/test/")
        # The trailing slash is stripped at construction time.
        assert source.url == "https://reddit.com/r/python/comments/abc123/test"

    def test_authenticate(self):
        from video_processor.sources.reddit_source import RedditSource

        assert RedditSource(url="https://reddit.com/r/test").authenticate() is True

    def test_list_videos(self):
        from video_processor.sources.reddit_source import RedditSource

        source = RedditSource(url="https://reddit.com/r/python/comments/abc/post")
        listed = source.list_videos()
        assert isinstance(listed, list)
        assert len(listed) == 1
        assert listed[0].mime_type == "text/plain"
| 229 | |
| 230 | |
| 231 | # --------------------------------------------------------------------------- |
| 232 | # HackerNewsSource |
| 233 | # --------------------------------------------------------------------------- |
| 234 | |
| 235 | |
class TestHackerNewsSource:
    """Hacker News source: item-id construction and single-entry listing."""

    def test_import(self):
        from video_processor.sources.hackernews_source import HackerNewsSource

        assert HackerNewsSource is not None

    def test_constructor(self):
        from video_processor.sources.hackernews_source import HackerNewsSource

        source = HackerNewsSource(item_id=12345678)
        assert source.item_id == 12345678
        # Default cap on fetched comments.
        assert source.max_comments == 200

    def test_authenticate(self):
        from video_processor.sources.hackernews_source import HackerNewsSource

        # The HN Firebase API is public; auth always succeeds.
        assert HackerNewsSource(item_id=12345678).authenticate() is True

    def test_list_videos(self):
        from video_processor.sources.hackernews_source import HackerNewsSource

        listed = HackerNewsSource(item_id=99999).list_videos()
        assert isinstance(listed, list)
        assert len(listed) == 1
        # The numeric item id is surfaced as a string id.
        assert listed[0].id == "99999"
| 263 | |
| 264 | |
| 265 | # --------------------------------------------------------------------------- |
| 266 | # RSSSource |
| 267 | # --------------------------------------------------------------------------- |
| 268 | |
| 269 | |
class TestRSSSource:
    """RSS source: construction, trivial auth, and mocked feed listing."""

    def test_import(self):
        from video_processor.sources.rss_source import RSSSource

        assert RSSSource is not None

    def test_constructor(self):
        from video_processor.sources.rss_source import RSSSource

        source = RSSSource(url="https://example.com/feed.xml", max_entries=20)
        assert source.url == "https://example.com/feed.xml"
        assert source.max_entries == 20

    def test_authenticate(self):
        from video_processor.sources.rss_source import RSSSource

        assert RSSSource(url="https://example.com/feed.xml").authenticate() is True

    @patch("requests.get")
    def test_list_videos(self, mock_get):
        from video_processor.sources.rss_source import RSSSource

        feed_xml = """<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Entry 1</title>
<link>https://example.com/1</link>
<description>First entry</description>
<pubDate>Mon, 01 Jan 2025 00:00:00 GMT</pubDate>
</item>
</channel>
</rss>"""
        response = MagicMock()
        response.text = feed_xml
        response.raise_for_status = MagicMock()
        mock_get.return_value = response

        listed = RSSSource(url="https://example.com/feed.xml").list_videos()
        assert isinstance(listed, list)
        assert len(listed) >= 1
| 313 | |
| 314 | |
| 315 | # --------------------------------------------------------------------------- |
| 316 | # PodcastSource |
| 317 | # --------------------------------------------------------------------------- |
| 318 | |
| 319 | |
class TestPodcastSource:
    """Podcast source: construction, trivial auth, and mocked episode listing."""

    def test_import(self):
        from video_processor.sources.podcast_source import PodcastSource

        assert PodcastSource is not None

    def test_constructor(self):
        from video_processor.sources.podcast_source import PodcastSource

        source = PodcastSource(feed_url="https://example.com/podcast.xml", max_episodes=5)
        assert source.feed_url == "https://example.com/podcast.xml"
        assert source.max_episodes == 5

    def test_authenticate(self):
        from video_processor.sources.podcast_source import PodcastSource

        source = PodcastSource(feed_url="https://example.com/podcast.xml")
        assert source.authenticate() is True

    @patch("requests.get")
    def test_list_videos(self, mock_get):
        from video_processor.sources.podcast_source import PodcastSource

        feed_xml = """<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Episode 1</title>
<enclosure url="https://example.com/ep1.mp3" type="audio/mpeg" />
<pubDate>Mon, 01 Jan 2025 00:00:00 GMT</pubDate>
</item>
</channel>
</rss>"""
        response = MagicMock()
        response.text = feed_xml
        response.raise_for_status = MagicMock()
        mock_get.return_value = response

        listed = PodcastSource(feed_url="https://example.com/podcast.xml").list_videos()
        assert isinstance(listed, list)
        assert len(listed) == 1
        # MIME type comes from the <enclosure> element.
        assert listed[0].mime_type == "audio/mpeg"
| 363 | |
| 364 | |
| 365 | # --------------------------------------------------------------------------- |
| 366 | # TwitterSource |
| 367 | # --------------------------------------------------------------------------- |
| 368 | |
| 369 | |
class TestTwitterSource:
    """Twitter/X source: bearer-token auth and single-item listing."""

    def test_import(self):
        from video_processor.sources.twitter_source import TwitterSource

        assert TwitterSource is not None

    def test_constructor(self):
        from video_processor.sources.twitter_source import TwitterSource

        source = TwitterSource(url="https://twitter.com/user/status/123456")
        assert source.url == "https://twitter.com/user/status/123456"

    @patch.dict(os.environ, {"TWITTER_BEARER_TOKEN": "test_token"})
    def test_authenticate_with_bearer_token(self):
        from video_processor.sources.twitter_source import TwitterSource

        source = TwitterSource(url="https://twitter.com/user/status/123456")
        assert source.authenticate() is True

    @patch.dict(os.environ, {}, clear=True)
    def test_authenticate_no_token_no_gallery_dl(self):
        from video_processor.sources.twitter_source import TwitterSource

        source = TwitterSource(url="https://twitter.com/user/status/123456")
        # gallery_dl availability varies locally; only pin the return type.
        with patch.dict("sys.modules", {"gallery_dl": None}):
            outcome = source.authenticate()
            assert isinstance(outcome, bool)

    def test_list_videos(self):
        from video_processor.sources.twitter_source import TwitterSource

        source = TwitterSource(url="https://twitter.com/user/status/123456")
        listed = source.list_videos()
        assert isinstance(listed, list)
        assert len(listed) == 1
| 405 | |
| 406 | |
| 407 | # --------------------------------------------------------------------------- |
| 408 | # ArxivSource |
| 409 | # --------------------------------------------------------------------------- |
| 410 | |
| 411 | |
class TestArxivSource:
    """arXiv source: id extraction from URLs/ids and mocked Atom listing."""

    def test_import(self):
        from video_processor.sources.arxiv_source import ArxivSource

        assert ArxivSource is not None

    def test_constructor(self):
        from video_processor.sources.arxiv_source import ArxivSource

        assert ArxivSource(url_or_id="2301.07041").arxiv_id == "2301.07041"

    def test_constructor_from_url(self):
        from video_processor.sources.arxiv_source import ArxivSource

        source = ArxivSource(url_or_id="https://arxiv.org/abs/2301.07041v2")
        # The version suffix is preserved when parsing an abs/ URL.
        assert source.arxiv_id == "2301.07041v2"

    def test_constructor_invalid(self):
        from video_processor.sources.arxiv_source import ArxivSource

        with pytest.raises(ValueError, match="Could not extract"):
            ArxivSource(url_or_id="not-an-arxiv-id")

    def test_authenticate(self):
        from video_processor.sources.arxiv_source import ArxivSource

        # The arXiv export API is public; auth always succeeds.
        assert ArxivSource(url_or_id="2301.07041").authenticate() is True

    @patch("requests.get")
    def test_list_videos(self, mock_get):
        from video_processor.sources.arxiv_source import ArxivSource

        atom_xml = """<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom"
      xmlns:arxiv="http://arxiv.org/schemas/atom">
<entry>
<title>Test Paper</title>
<summary>Abstract text here.</summary>
<author><name>Author One</name></author>
<published>2023-01-15T00:00:00Z</published>
</entry>
</feed>"""
        response = MagicMock()
        response.text = atom_xml
        response.raise_for_status = MagicMock()
        mock_get.return_value = response

        listed = ArxivSource(url_or_id="2301.07041").list_videos()
        assert isinstance(listed, list)
        # One metadata entry plus the PDF itself.
        assert len(listed) == 2
| 465 | |
| 466 | |
| 467 | # --------------------------------------------------------------------------- |
| 468 | # S3Source |
| 469 | # --------------------------------------------------------------------------- |
| 470 | |
| 471 | |
class TestS3Source:
    """S3 source: mocked boto3 auth and video-extension filtering."""

    @staticmethod
    def _fake_boto3(client):
        # Build a boto3 module stand-in whose client() yields `client`.
        boto3_mod = MagicMock()
        boto3_mod.client.return_value = client
        return boto3_mod

    def test_import(self):
        from video_processor.sources.s3_source import S3Source

        assert S3Source is not None

    def test_constructor(self):
        from video_processor.sources.s3_source import S3Source

        source = S3Source(bucket="my-bucket", prefix="videos/", region="us-east-1")
        assert source.bucket == "my-bucket"
        assert source.prefix == "videos/"
        assert source.region == "us-east-1"

    def test_authenticate_success(self):
        from video_processor.sources.s3_source import S3Source

        client = MagicMock()
        client.head_bucket.return_value = {}

        with patch.dict("sys.modules", {"boto3": self._fake_boto3(client)}):
            assert S3Source(bucket="my-bucket").authenticate() is True

    def test_authenticate_failure(self):
        from video_processor.sources.s3_source import S3Source

        client = MagicMock()
        client.head_bucket.side_effect = Exception("Access Denied")

        with patch.dict("sys.modules", {"boto3": self._fake_boto3(client)}):
            assert S3Source(bucket="bad-bucket").authenticate() is False

    def test_list_videos(self):
        from video_processor.sources.s3_source import S3Source

        client = MagicMock()
        client.head_bucket.return_value = {}
        client.get_paginator.return_value.paginate.return_value = [
            {
                "Contents": [
                    {"Key": "videos/clip.mp4", "Size": 5000},
                    {"Key": "videos/notes.txt", "Size": 100},
                    {"Key": "videos/movie.mkv", "Size": 90000},
                ]
            }
        ]

        with patch.dict("sys.modules", {"boto3": self._fake_boto3(client)}):
            source = S3Source(bucket="my-bucket")
            source.authenticate()
            listed = source.list_videos()
            assert isinstance(listed, list)
            # notes.txt lacks a video extension and is filtered out.
            assert len(listed) == 2
            names = [f.name for f in listed]
            assert "clip.mp4" in names
            assert "movie.mkv" in names
| 539 | |
| 540 | |
| 541 | # --------------------------------------------------------------------------- |
| 542 | # GWSSource |
| 543 | # --------------------------------------------------------------------------- |
| 544 | |
| 545 | |
class TestGWSSource:
    """Google Workspace source: CLI-backed auth, listing, export, and collation."""

    def test_import(self):
        from video_processor.sources.gws_source import GWSSource

        assert GWSSource is not None

    def test_constructor_defaults(self):
        from video_processor.sources.gws_source import GWSSource

        source = GWSSource()
        assert source.folder_id is None
        assert source.query is None
        assert source.doc_ids == []

    def test_constructor_with_folder(self):
        from video_processor.sources.gws_source import GWSSource

        source = GWSSource(folder_id="1abc", query="name contains 'spec'")
        assert source.folder_id == "1abc"
        assert source.query == "name contains 'spec'"

    def test_constructor_with_doc_ids(self):
        from video_processor.sources.gws_source import GWSSource

        assert GWSSource(doc_ids=["doc1", "doc2"]).doc_ids == ["doc1", "doc2"]

    @patch("shutil.which", return_value=None)
    def test_authenticate_no_gws(self, _mock_which):
        from video_processor.sources.gws_source import GWSSource

        # The gws binary is missing from PATH, so auth fails fast.
        assert GWSSource().authenticate() is False

    @patch("video_processor.sources.gws_source._run_gws")
    @patch("shutil.which", return_value="/usr/local/bin/gws")
    def test_authenticate_success(self, _mock_which, mock_run):
        from video_processor.sources.gws_source import GWSSource

        mock_run.return_value = {"connectedAs": "[email protected]"}
        assert GWSSource().authenticate() is True

    @patch("video_processor.sources.gws_source._run_gws")
    @patch("shutil.which", return_value="/usr/local/bin/gws")
    def test_list_videos(self, _mock_which, mock_run):
        from video_processor.sources.gws_source import GWSSource

        mock_run.return_value = {
            "files": [
                {
                    "id": "doc123",
                    "name": "Project Spec",
                    "mimeType": "application/vnd.google-apps.document",
                    "modifiedTime": "2026-01-01T00:00:00Z",
                },
                {
                    "id": "sheet456",
                    "name": "Budget",
                    "mimeType": "application/vnd.google-apps.spreadsheet",
                },
            ]
        }
        listed = GWSSource(folder_id="folder1").list_videos()
        assert len(listed) == 2
        assert listed[0].name == "Project Spec"
        assert listed[1].id == "sheet456"

    @patch("video_processor.sources.gws_source._run_gws")
    @patch("shutil.which", return_value="/usr/local/bin/gws")
    def test_list_videos_with_doc_ids(self, _mock_which, mock_run):
        from video_processor.sources.gws_source import GWSSource

        mock_run.return_value = {
            "id": "doc123",
            "name": "My Doc",
            "mimeType": "application/vnd.google-apps.document",
        }
        listed = GWSSource(doc_ids=["doc123"]).list_videos()
        assert len(listed) == 1
        assert listed[0].name == "My Doc"

    def test_result_to_source_file(self):
        from video_processor.sources.gws_source import _result_to_source_file

        record = {
            "id": "abc",
            "name": "Test Doc",
            "mimeType": "text/plain",
            "size": "1024",
            "modifiedTime": "2026-03-01",
        }
        sf = _result_to_source_file(record)
        assert sf.name == "Test Doc"
        assert sf.id == "abc"
        # String sizes from the Drive API are coerced to int.
        assert sf.size_bytes == 1024
        assert sf.mime_type == "text/plain"

    @patch("video_processor.sources.gws_source._run_gws")
    def test_get_doc_text(self, mock_run):
        from video_processor.sources.gws_source import GWSSource

        def paragraph(content):
            # Minimal Docs API paragraph node wrapping one text run.
            return {"paragraph": {"elements": [{"textRun": {"content": content}}]}}

        mock_run.return_value = {
            "body": {
                "content": [
                    paragraph("Hello world\n"),
                    paragraph("Second paragraph\n"),
                ]
            }
        }
        text = GWSSource()._get_doc_text("doc123")
        assert "Hello world" in text
        assert "Second paragraph" in text

    @patch("video_processor.sources.gws_source._run_gws")
    def test_collate(self, mock_run):
        from video_processor.sources.gws_source import GWSSource

        # The first _run_gws call lists the folder; later calls export each doc.
        mock_run.side_effect = [
            {
                "files": [
                    {
                        "id": "d1",
                        "name": "Doc A",
                        "mimeType": "application/vnd.google-apps.document",
                    },
                ]
            },
            {"raw": "Content of Doc A"},
        ]
        collated = GWSSource(folder_id="f1").collate()
        assert "Doc A" in collated
        assert "Content of Doc A" in collated
| 697 | |
| 698 | |
| 699 | # --------------------------------------------------------------------------- |
| 700 | # M365Source |
| 701 | # --------------------------------------------------------------------------- |
| 702 | |
| 703 | |
class TestM365Source:
    """Tests for the SharePoint/OneDrive-backed M365 source."""

    def test_import(self):
        from video_processor.sources.m365_source import M365Source

        assert M365Source is not None

    def test_constructor(self):
        from video_processor.sources.m365_source import M365Source

        source = M365Source(
            web_url="https://contoso.sharepoint.com/sites/proj",
            folder_url="/sites/proj/Shared Documents",
        )
        assert source.web_url == "https://contoso.sharepoint.com/sites/proj"
        assert source.folder_url == "/sites/proj/Shared Documents"
        assert source.file_ids == []
        assert source.recursive is False

    def test_constructor_with_file_ids(self):
        from video_processor.sources.m365_source import M365Source

        source = M365Source(
            web_url="https://contoso.sharepoint.com",
            file_ids=["id1", "id2"],
        )
        assert source.file_ids == ["id1", "id2"]

    @patch("shutil.which", return_value=None)
    def test_authenticate_no_m365(self, _mock_which):
        """Missing m365 CLI binary means authentication cannot succeed."""
        from video_processor.sources.m365_source import M365Source

        source = M365Source(web_url="https://contoso.sharepoint.com")
        assert source.authenticate() is False

    @patch("video_processor.sources.m365_source._run_m365")
    @patch("shutil.which", return_value="/usr/local/bin/m365")
    def test_authenticate_logged_in(self, _mock_which, mock_run):
        from video_processor.sources.m365_source import M365Source

        mock_run.return_value = {"connectedAs": "[email protected]"}
        source = M365Source(web_url="https://contoso.sharepoint.com")
        assert source.authenticate() is True

    @patch("video_processor.sources.m365_source._run_m365")
    @patch("shutil.which", return_value="/usr/local/bin/m365")
    def test_authenticate_not_logged_in(self, _mock_which, mock_run):
        """An empty status payload (no connectedAs) reads as not logged in."""
        from video_processor.sources.m365_source import M365Source

        mock_run.return_value = {}
        source = M365Source(web_url="https://contoso.sharepoint.com")
        assert source.authenticate() is False

    @patch("video_processor.sources.m365_source._run_m365")
    @patch("shutil.which", return_value="/usr/local/bin/m365")
    def test_list_videos(self, _mock_which, mock_run):
        """Folder listing keeps document extensions and drops everything else."""
        from video_processor.sources.m365_source import M365Source

        def entry(name, uid, length, url):
            return {
                "Name": name,
                "UniqueId": uid,
                "Length": length,
                "ServerRelativeUrl": url,
            }

        mock_run.side_effect = [
            {"connectedAs": "[email protected]"},  # authenticate
            [
                entry("spec.docx", "uid-1", "20480", "/sites/proj/docs/spec.docx"),
                entry("budget.xlsx", "uid-2", "10240", "/sites/proj/docs/budget.xlsx"),
                entry("image.png", "uid-3", "5000", "/sites/proj/docs/image.png"),
            ],
        ]
        source = M365Source(
            web_url="https://contoso.sharepoint.com/sites/proj",
            folder_url="/sites/proj/docs",
        )
        source.authenticate()
        listed = source.list_videos()
        # Only .docx and .xlsx match _DOC_EXTENSIONS, not .png
        assert sorted(f.name for f in listed) == ["budget.xlsx", "spec.docx"]

    @patch("video_processor.sources.m365_source._run_m365")
    def test_list_videos_with_file_ids(self, mock_run):
        from video_processor.sources.m365_source import M365Source

        mock_run.return_value = {
            "Name": "report.pdf",
            "UniqueId": "uid-1",
            "Length": "50000",
            "ServerRelativeUrl": "/sites/proj/docs/report.pdf",
        }
        source = M365Source(
            web_url="https://contoso.sharepoint.com",
            file_ids=["uid-1"],
        )
        assert [f.name for f in source.list_videos()] == ["report.pdf"]

    def test_result_to_source_file(self):
        """SharePoint file metadata maps onto the SourceFile fields."""
        from video_processor.sources.m365_source import _result_to_source_file

        sf = _result_to_source_file(
            {
                "Name": "notes.txt",
                "UniqueId": "abc-123",
                "Length": "512",
                "ServerRelativeUrl": "/sites/proj/notes.txt",
                "TimeLastModified": "2026-03-01T12:00:00Z",
            }
        )
        assert sf.name == "notes.txt"
        assert sf.id == "abc-123"
        assert sf.size_bytes == 512
        assert sf.path == "/sites/proj/notes.txt"
        assert sf.modified_at == "2026-03-01T12:00:00Z"

    def test_extract_text_txt(self, tmp_path):
        from video_processor.sources.m365_source import _extract_text

        target = tmp_path / "test.txt"
        target.write_text("Hello from a text file")
        assert _extract_text(target) == "Hello from a text file"

    def test_extract_text_md(self, tmp_path):
        from video_processor.sources.m365_source import _extract_text

        target = tmp_path / "readme.md"
        target.write_text("# Title\n\nSome content")
        extracted = _extract_text(target)
        assert "Title" in extracted
        assert "Some content" in extracted

    def test_extract_text_unsupported(self, tmp_path):
        """Binary content is reported as unsupported rather than raising."""
        from video_processor.sources.m365_source import _extract_text

        target = tmp_path / "data.bin"
        target.write_bytes(b"\x00\x01\x02")
        assert "Unsupported" in _extract_text(target)

    def test_list_no_folder_url(self):
        """Without a folder URL or file IDs there is nothing to list."""
        from video_processor.sources.m365_source import M365Source

        source = M365Source(web_url="https://contoso.sharepoint.com")
        assert source.list_videos() == []
| 863 | |
| 864 | |
| 865 | # --------------------------------------------------------------------------- |
| 866 | # ObsidianSource |
| 867 | # --------------------------------------------------------------------------- |
| 868 | |
| 869 | |
class TestObsidianSource:
    """Tests for the Obsidian vault source and its note parser."""

    def test_import(self):
        from video_processor.sources.obsidian_source import ObsidianSource

        assert ObsidianSource is not None

    def test_constructor(self, tmp_path):
        from video_processor.sources.obsidian_source import ObsidianSource

        source = ObsidianSource(vault_path=str(tmp_path))
        assert source.vault_path == tmp_path

    def test_authenticate_with_vault(self, tmp_path):
        """A directory containing at least one note counts as a vault."""
        from video_processor.sources.obsidian_source import ObsidianSource

        (tmp_path / "note.md").write_text("# Hello")
        assert ObsidianSource(vault_path=str(tmp_path)).authenticate() is True

    def test_authenticate_empty_dir(self, tmp_path):
        from video_processor.sources.obsidian_source import ObsidianSource

        assert ObsidianSource(vault_path=str(tmp_path)).authenticate() is False

    def test_authenticate_nonexistent(self, tmp_path):
        from video_processor.sources.obsidian_source import ObsidianSource

        missing = tmp_path / "nonexistent"
        assert ObsidianSource(vault_path=str(missing)).authenticate() is False

    def test_parse_note(self, tmp_path):
        """Frontmatter, wiki links, inline tags, and headings are all extracted."""
        from video_processor.sources.obsidian_source import parse_note

        note_file = tmp_path / "test_note.md"
        note_file.write_text(
            "---\n"
            "title: Test Note\n"
            "tags: [python, testing]\n"
            "---\n"
            "# Heading One\n\n"
            "Some text with a [[Wiki Link]] and [[Another Page|alias]].\n\n"
            "Also has #tag1 and #tag2 inline tags.\n\n"
            "## Sub Heading\n\n"
            "More content here.\n"
        )

        parsed = parse_note(note_file)

        assert parsed["frontmatter"]["title"] == "Test Note"
        assert isinstance(parsed["frontmatter"]["tags"], list)
        assert "python" in parsed["frontmatter"]["tags"]
        # Both plain and aliased wiki links resolve to their target page name.
        for target in ("Wiki Link", "Another Page"):
            assert target in parsed["links"]
        for tag in ("tag1", "tag2"):
            assert tag in parsed["tags"]
        assert len(parsed["headings"]) == 2
        first_heading = parsed["headings"][0]
        assert first_heading["level"] == 1
        assert first_heading["text"] == "Heading One"
        assert "Some text" in parsed["body"]

    def test_ingest_vault(self, tmp_path):
        from video_processor.sources.obsidian_source import ingest_vault

        (tmp_path / "note_a.md").write_text("# A\n\nLinks to [[B]].\n")
        (tmp_path / "note_b.md").write_text("# B\n\nLinks to [[A]] and [[C]].\n")

        ingested = ingest_vault(tmp_path)

        assert sorted(n["name"] for n in ingested["notes"]) == ["note_a", "note_b"]
        # note_a links to B, note_b links to A and C => 3 links
        assert len(ingested["links"]) == 3

    def test_list_videos(self, tmp_path):
        """Notes are discovered recursively and tagged as markdown."""
        from video_processor.sources.obsidian_source import ObsidianSource

        (tmp_path / "note1.md").write_text("# Note 1")
        nested = tmp_path / "subdir"
        nested.mkdir()
        (nested / "note2.md").write_text("# Note 2")

        listed = ObsidianSource(vault_path=str(tmp_path)).list_videos()
        assert len(listed) == 2
        assert all(f.mime_type == "text/markdown" for f in listed)
| 959 | |
| 960 | |
| 961 | # --------------------------------------------------------------------------- |
| 962 | # LogseqSource |
| 963 | # --------------------------------------------------------------------------- |
| 964 | |
| 965 | |
class TestLogseqSource:
    """Tests for the Logseq graph source and its page parser."""

    def test_import(self):
        from video_processor.sources.logseq_source import LogseqSource

        assert LogseqSource is not None

    def test_constructor(self, tmp_path):
        from video_processor.sources.logseq_source import LogseqSource

        source = LogseqSource(graph_path=str(tmp_path))
        assert source.graph_path == tmp_path

    def test_authenticate_with_pages(self, tmp_path):
        """A pages/ subdirectory is enough to recognize a Logseq graph."""
        from video_processor.sources.logseq_source import LogseqSource

        (tmp_path / "pages").mkdir()
        assert LogseqSource(graph_path=str(tmp_path)).authenticate() is True

    def test_authenticate_no_pages_or_journals(self, tmp_path):
        from video_processor.sources.logseq_source import LogseqSource

        assert LogseqSource(graph_path=str(tmp_path)).authenticate() is False

    def test_authenticate_nonexistent(self, tmp_path):
        from video_processor.sources.logseq_source import LogseqSource

        missing = tmp_path / "nonexistent"
        assert LogseqSource(graph_path=str(missing)).authenticate() is False

    def test_parse_page(self, tmp_path):
        """Page properties, links, tags, and block refs are all extracted."""
        from video_processor.sources.logseq_source import parse_page

        page_file = tmp_path / "my_page.md"
        page_file.write_text(
            "title:: My Page\n"
            "tags:: #project #important\n"
            "- Some block content\n"
            " - Nested with [[Another Page]] link\n"
            " - And a #todo tag\n"
            " - Block ref ((abc12345-6789-0abc-def0-123456789abc))\n"
        )

        parsed = parse_page(page_file)

        assert parsed["properties"]["title"] == "My Page"
        assert "Another Page" in parsed["links"]
        assert "todo" in parsed["tags"]
        assert "abc12345-6789-0abc-def0-123456789abc" in parsed["block_refs"]
        assert "Some block content" in parsed["body"]

    def test_ingest_graph(self, tmp_path):
        """Pages and journals are both ingested; cross-page links are counted."""
        from video_processor.sources.logseq_source import ingest_graph

        pages = tmp_path / "pages"
        pages.mkdir()
        (pages / "page_a.md").write_text("- Content linking [[Page B]]\n")
        (pages / "page_b.md").write_text("- Content linking [[Page A]]\n")

        journals = tmp_path / "journals"
        journals.mkdir()
        (journals / "2026_03_07.md").write_text("- Journal entry\n")

        ingested = ingest_graph(tmp_path)

        assert len(ingested["notes"]) == 3
        assert len(ingested["links"]) == 2

    def test_list_videos(self, tmp_path):
        from video_processor.sources.logseq_source import LogseqSource

        pages = tmp_path / "pages"
        pages.mkdir()
        (pages / "page1.md").write_text("- content")

        listed = LogseqSource(graph_path=str(tmp_path)).list_videos()
        assert len(listed) == 1
        assert listed[0].mime_type == "text/markdown"
| 1047 | |
| 1048 | |
| 1049 | # --------------------------------------------------------------------------- |
| 1050 | # NotionSource |
| 1051 | # --------------------------------------------------------------------------- |
| 1052 | |
| 1053 | |
class TestNotionSource:
    """Tests for the Notion REST API source."""

    def test_import(self):
        from video_processor.sources.notion_source import NotionSource

        assert NotionSource is not None

    def test_constructor(self):
        from video_processor.sources.notion_source import NotionSource

        source = NotionSource(token="ntn_test123", database_id="db-1")
        assert source.token == "ntn_test123"
        assert source.database_id == "db-1"
        assert source.page_ids == []

    @patch.dict(os.environ, {}, clear=True)
    def test_authenticate_no_token(self):
        """With no token and a scrubbed environment, auth must fail."""
        from video_processor.sources.notion_source import NotionSource

        assert NotionSource(token="").authenticate() is False

    @patch("requests.get")
    def test_authenticate_with_mock(self, mock_get):
        from video_processor.sources.notion_source import NotionSource

        response = MagicMock()
        response.raise_for_status = MagicMock()
        response.json.return_value = {"name": "Test Bot"}
        mock_get.return_value = response

        assert NotionSource(token="ntn_test123").authenticate() is True

    @patch("requests.post")
    def test_list_videos_database(self, mock_post):
        """Database query results become SourceFiles titled from the title prop."""
        from video_processor.sources.notion_source import NotionSource

        response = MagicMock()
        response.raise_for_status = MagicMock()
        response.json.return_value = {
            "results": [
                {
                    "id": "page-1",
                    "last_edited_time": "2026-03-01T00:00:00Z",
                    "properties": {
                        "Name": {
                            "type": "title",
                            "title": [{"plain_text": "Meeting Notes"}],
                        }
                    },
                },
            ],
            "has_more": False,
        }
        mock_post.return_value = response

        listed = NotionSource(token="ntn_test", database_id="db-1").list_videos()
        assert len(listed) == 1
        assert listed[0].name == "Meeting Notes"
        assert listed[0].id == "page-1"

    def test_blocks_to_text(self):
        """Notion blocks render to markdown-ish text by block type."""
        from video_processor.sources.notion_source import NotionSource

        def block(kind, text=None):
            # A divider has an empty payload; text blocks carry rich_text.
            payload = {} if text is None else {"rich_text": [{"plain_text": text}]}
            return {"type": kind, kind: payload}

        blocks = [
            block("heading_1", "Title"),
            block("paragraph", "Some paragraph text."),
            block("bulleted_list_item", "A bullet point"),
            block("divider"),
        ]
        rendered = NotionSource(token="test")._blocks_to_text(blocks)
        assert "# Title" in rendered
        assert "Some paragraph text." in rendered
        assert "- A bullet point" in rendered
        assert "---" in rendered
| 1149 | |
| 1150 | |
| 1151 | # --------------------------------------------------------------------------- |
| 1152 | # AppleNotesSource |
| 1153 | # --------------------------------------------------------------------------- |
| 1154 | |
| 1155 | |
class TestAppleNotesSource:
    """Tests for the macOS Notes.app source.

    Note: the entity-test input strings below must stay HTML-entity-encoded
    (``&lt;`` / ``&quot;`` / ``&#39;`` / ``&nbsp;``) — the point of the test is
    that ``_html_to_text`` decodes them to the literal characters asserted.
    """

    def test_import(self):
        from video_processor.sources.apple_notes_source import AppleNotesSource

        assert AppleNotesSource is not None

    def test_constructor(self):
        from video_processor.sources.apple_notes_source import AppleNotesSource

        assert AppleNotesSource(folder="Work").folder == "Work"

    def test_constructor_default(self):
        from video_processor.sources.apple_notes_source import AppleNotesSource

        assert AppleNotesSource().folder is None

    def test_authenticate_platform(self):
        """authenticate() succeeds only on macOS (Notes.app is Apple-only)."""
        import sys

        from video_processor.sources.apple_notes_source import AppleNotesSource

        authenticated = AppleNotesSource().authenticate()
        assert authenticated is (sys.platform == "darwin")

    def test_html_to_text(self):
        """Tags are stripped, entities decoded, and list items preserved."""
        from video_processor.sources.apple_notes_source import AppleNotesSource

        html = (
            "<div>Hello <b>World</b></div>"
            "<p>Paragraph one.</p>"
            "<p>Paragraph two with &amp; entity.</p>"
            "<br/>"
            "<ul><li>Item 1</li><li>Item 2</li></ul>"
        )
        converted = AppleNotesSource._html_to_text(html)
        assert "Hello World" in converted
        assert "Paragraph one." in converted
        assert "Paragraph two with & entity." in converted
        assert "Item 1" in converted

    def test_html_to_text_empty(self):
        from video_processor.sources.apple_notes_source import AppleNotesSource

        assert AppleNotesSource._html_to_text("") == ""

    def test_html_to_text_entities(self):
        """Named and numeric character references decode to literal characters."""
        from video_processor.sources.apple_notes_source import AppleNotesSource

        html = "&lt;code&gt; &quot;test&quot; &#39;single&#39;&nbsp;space"
        converted = AppleNotesSource._html_to_text(html)
        assert "<code>" in converted
        assert '"test"' in converted
        assert "'single'" in converted
| 1215 | |
| 1216 | |
| 1217 | # --------------------------------------------------------------------------- |
| 1218 | # GoogleKeepSource |
| 1219 | # --------------------------------------------------------------------------- |
| 1220 | |
| 1221 | |
class TestGoogleKeepSource:
    """Tests for the Google Keep source."""

    def test_import(self):
        from video_processor.sources.google_keep_source import GoogleKeepSource

        assert GoogleKeepSource is not None

    def test_constructor(self):
        from video_processor.sources.google_keep_source import GoogleKeepSource

        assert GoogleKeepSource(label="meetings").label == "meetings"

    def test_constructor_default(self):
        from video_processor.sources.google_keep_source import GoogleKeepSource

        assert GoogleKeepSource().label is None

    @patch("shutil.which", return_value=None)
    def test_authenticate_no_gws(self, _mock_which):
        """Missing gws CLI binary means authentication cannot succeed."""
        from video_processor.sources.google_keep_source import GoogleKeepSource

        assert GoogleKeepSource().authenticate() is False

    def test_note_to_text(self):
        """Checklist items render as task checkboxes; blanks are dropped."""
        from video_processor.sources.google_keep_source import _note_to_text

        note = {
            "title": "Shopping List",
            "body": "Remember to buy groceries",
            "listContent": [
                {"text": "Milk", "checked": True},
                {"text": "Bread", "checked": False},
                {"text": "", "checked": False},
            ],
        }
        rendered = _note_to_text(note)
        assert "Shopping List" in rendered
        assert "Remember to buy groceries" in rendered
        assert "- [x] Milk" in rendered
        assert "- [ ] Bread" in rendered

    def test_note_to_text_empty(self):
        from video_processor.sources.google_keep_source import _note_to_text

        assert _note_to_text({}) == ""

    def test_note_to_text_text_content(self):
        from video_processor.sources.google_keep_source import _note_to_text

        rendered = _note_to_text(
            {"title": "Simple", "textContent": "Just a plain note"}
        )
        assert "Simple" in rendered
        assert "Just a plain note" in rendered
| 1277 | |
| 1278 | |
| 1279 | # --------------------------------------------------------------------------- |
| 1280 | # OneNoteSource |
| 1281 | # --------------------------------------------------------------------------- |
| 1282 | |
| 1283 | |
class TestOneNoteSource:
    """Tests for the OneNote (Microsoft 365 CLI) source.

    Note: the entity-test inputs below must stay HTML-entity-encoded
    (``&amp;``, ``&#65;``, ``&lt;`` …) — the point of these tests is that
    ``_html_to_text`` decodes the references to the literal characters the
    assertions expect.
    """

    def test_import(self):
        from video_processor.sources.onenote_source import OneNoteSource

        assert OneNoteSource is not None

    def test_constructor(self):
        from video_processor.sources.onenote_source import OneNoteSource

        source = OneNoteSource(notebook_name="Work Notes", section_name="Meetings")
        assert source.notebook_name == "Work Notes"
        assert source.section_name == "Meetings"

    def test_constructor_default(self):
        from video_processor.sources.onenote_source import OneNoteSource

        source = OneNoteSource()
        assert source.notebook_name is None
        assert source.section_name is None

    @patch("shutil.which", return_value=None)
    def test_authenticate_no_m365(self, _mock_which):
        """Missing m365 CLI binary means authentication cannot succeed."""
        from video_processor.sources.onenote_source import OneNoteSource

        assert OneNoteSource().authenticate() is False

    def test_html_to_text(self):
        """Script/style bodies are dropped; entities decode; text survives."""
        from video_processor.sources.onenote_source import _html_to_text

        page = (
            "<html><body>"
            "<h1>Meeting Notes</h1>"
            "<p>Discussed the &amp; project.</p>"
            "<script>var x = 1;</script>"
            "<style>.foo { color: red; }</style>"
            "<ul><li>Action item 1</li><li>Action item 2</li></ul>"
            # Decimal and hex numeric references both decode to "A".
            "<p>Entity &#65; and &#x41; decoded.</p>"
            "</body></html>"
        )
        result = _html_to_text(page)
        assert "Meeting Notes" in result
        assert "Discussed the & project." in result
        assert "var x" not in result
        assert ".foo" not in result
        assert "Action item 1" in result
        assert "Entity A and A decoded." in result

    def test_html_to_text_empty(self):
        from video_processor.sources.onenote_source import _html_to_text

        assert _html_to_text("") == ""

    def test_html_to_text_entities(self):
        """Named character references decode to literal characters."""
        from video_processor.sources.onenote_source import _html_to_text

        html = "&lt;tag&gt; &quot;quoted&quot; &#39;apos&#39;&nbsp;space"
        result = _html_to_text(html)
        assert "<tag>" in result
        assert '"quoted"' in result
        assert "'apos'" in result
| 1345 | |
| 1346 | |
| 1347 | # --------------------------------------------------------------------------- |
| 1348 | # ZoomSource |
| 1349 | # --------------------------------------------------------------------------- |
| 1350 | |
| 1351 | |
class TestZoomSource:
    """Tests for the Zoom cloud-recording source."""

    def test_import(self):
        from video_processor.sources.zoom_source import ZoomSource

        assert ZoomSource is not None

    def test_constructor_defaults(self):
        from video_processor.sources.zoom_source import ZoomSource

        source = ZoomSource()
        # client_id may come from the environment, so only its type is pinned.
        assert source.client_id is None or isinstance(source.client_id, str)
        assert source._access_token is None

    def test_constructor_explicit(self):
        from video_processor.sources.zoom_source import ZoomSource

        source = ZoomSource(
            client_id="cid",
            client_secret="csec",
            account_id="aid",
        )
        assert source.client_id == "cid"
        assert source.client_secret == "csec"
        assert source.account_id == "aid"

    def test_authenticate_no_credentials(self):
        from video_processor.sources.zoom_source import ZoomSource

        # No saved token, no account_id, no client_id → should fail
        source = ZoomSource(client_id=None, client_secret=None, account_id=None)
        assert source.authenticate() is False

    def test_list_videos_not_authenticated(self):
        from video_processor.sources.zoom_source import ZoomSource

        with pytest.raises(RuntimeError, match="Not authenticated"):
            ZoomSource().list_videos()

    def test_download_not_authenticated(self):
        from video_processor.sources.zoom_source import ZoomSource

        target = SourceFile(name="test.mp4", id="123")
        with pytest.raises(RuntimeError, match="Not authenticated"):
            ZoomSource().download(target, "/tmp/test.mp4")

    def test_fetch_transcript_not_authenticated(self):
        from video_processor.sources.zoom_source import ZoomSource

        with pytest.raises(RuntimeError, match="Not authenticated"):
            ZoomSource().fetch_transcript("meeting123")

    def test_mime_types_mapping(self):
        """Zoom recording file types map to the expected MIME types."""
        from video_processor.sources.zoom_source import _MIME_TYPES

        expected = {
            "MP4": "video/mp4",
            "TRANSCRIPT": "text/vtt",
            "M4A": "audio/mp4",
        }
        for file_type, mime in expected.items():
            assert _MIME_TYPES[file_type] == mime
| 1412 | |
| 1413 | |
| 1414 | # --------------------------------------------------------------------------- |
| 1415 | # TeamsRecordingSource |
| 1416 | # --------------------------------------------------------------------------- |
| 1417 | |
| 1418 | |
class TestTeamsRecordingSource:
    """Tests for the Teams meeting-recording source and VTT helper."""

    def test_import(self):
        from video_processor.sources.teams_recording_source import (
            TeamsRecordingSource,
        )

        assert TeamsRecordingSource is not None

    def test_constructor_default(self):
        from video_processor.sources.teams_recording_source import (
            TeamsRecordingSource,
        )

        assert TeamsRecordingSource().user_id == "me"

    def test_constructor_custom_user(self):
        from video_processor.sources.teams_recording_source import (
            TeamsRecordingSource,
        )

        source = TeamsRecordingSource(user_id="[email protected]")
        assert source.user_id == "[email protected]"

    @patch("shutil.which", return_value=None)
    def test_authenticate_no_m365(self, _mock_which):
        """Missing m365 CLI binary means authentication cannot succeed."""
        from video_processor.sources.teams_recording_source import (
            TeamsRecordingSource,
        )

        assert TeamsRecordingSource().authenticate() is False

    def test_vtt_to_text(self):
        """Cue headers, timestamps, and speaker tags are stripped from VTT."""
        from video_processor.sources.teams_recording_source import (
            _vtt_to_text,
        )

        captions = (
            "WEBVTT\n\n"
            "1\n"
            "00:00:01.000 --> 00:00:05.000\n"
            "<v Speaker1>Hello everyone\n\n"
            "2\n"
            "00:00:05.000 --> 00:00:10.000\n"
            "<v Speaker2>Welcome to the meeting\n"
        )
        text = _vtt_to_text(captions)
        assert "Hello everyone" in text
        assert "Welcome to the meeting" in text
        assert "WEBVTT" not in text
        assert "-->" not in text

    def test_vtt_to_text_empty(self):
        from video_processor.sources.teams_recording_source import (
            _vtt_to_text,
        )

        assert _vtt_to_text("") == ""

    def test_vtt_to_text_deduplicates(self):
        """Consecutive identical cue lines collapse to a single occurrence."""
        from video_processor.sources.teams_recording_source import (
            _vtt_to_text,
        )

        captions = (
            "WEBVTT\n\n"
            "00:00:01.000 --> 00:00:03.000\n"
            "Same line\n\n"
            "00:00:03.000 --> 00:00:05.000\n"
            "Same line\n"
        )
        assert _vtt_to_text(captions).count("Same line") == 1

    def test_extract_meetings_list_dict(self):
        """Graph-style {'value': [...]} payloads unwrap to the inner list."""
        from video_processor.sources.teams_recording_source import (
            TeamsRecordingSource,
        )

        meetings = TeamsRecordingSource()._extract_meetings_list(
            {"value": [{"id": "m1"}]}
        )
        assert len(meetings) == 1

    def test_extract_meetings_list_list(self):
        """A bare list payload passes through unchanged."""
        from video_processor.sources.teams_recording_source import (
            TeamsRecordingSource,
        )

        meetings = TeamsRecordingSource()._extract_meetings_list([{"id": "m1"}])
        assert len(meetings) == 1
| 1511 | |
| 1512 | |
| 1513 | # --------------------------------------------------------------------------- |
| 1514 | # MeetRecordingSource |
| 1515 | # --------------------------------------------------------------------------- |
| 1516 | |
| 1517 | |
| 1518 | class TestMeetRecordingSource: |
| 1519 | def test_import(self): |
| 1520 | from video_processor.sources.meet_recording_source import ( |
| 1521 | MeetRecordingSource, |
| 1522 | ) |
| 1523 | |
| 1524 | assert MeetRecordingSource is not None |
| 1525 | |
| 1526 | def test_constructor_default(self): |
| 1527 | from video_processor.sources.meet_recording_source import ( |
| 1528 | MeetRecordingSource, |
| 1529 | ) |
| 1530 | |
| 1531 | src = MeetRecordingSource() |
| 1532 | assert src.drive_folder_id is None |
| 1533 | |
| 1534 | def test_constructor_with_folder(self): |
| 1535 | from video_processor.sources.meet_recording_source import ( |
| 1536 | MeetRecordingSource, |
| 1537 | ) |
| 1538 | |
| 1539 | src = MeetRecordingSource(drive_folder_id="folder123") |
| 1540 | assert src.drive_folder_id == "folder123" |
| 1541 | |
| 1542 | @patch("shutil.which", return_value=None) |
| 1543 | def test_authenticate_no_gws(self, _mock_which): |
| 1544 | from video_processor.sources.meet_recording_source import ( |
| 1545 | MeetRecordingSource, |
| 1546 | ) |
| 1547 | |
| 1548 | src = MeetRecordingSource() |
| 1549 | assert src.authenticate() is False |
| 1550 | |
| 1551 | def test_find_matching_transcript_date_extraction(self): |
| 1552 | import re |
| 1553 | |
| 1554 | name = "Meet Recording 2026-03-07T14:30:00" |
| 1555 | match = re.search(r"\d{4}-\d{2}-\d{2}", name) |
| 1556 | assert match is not None |
| 1557 | assert match.group(0) == "2026-03-07" |
| 1558 | |
| 1559 | def test_lazy_import(self): |
| 1560 | from video_processor.sources import MeetRecordingSource |
| 1561 | |
| 1562 | assert MeetRecordingSource is not None |
| 1563 | |
| 1564 | def test_teams_lazy_import(self): |
| 1565 | from video_processor.sources import TeamsRecordingSource |
| 1566 | |
| 1567 | assert TeamsRecordingSource is not None |
| 1568 | |
| 1569 | def test_zoom_lazy_import(self): |
| 1570 | from video_processor.sources import ZoomSource |
| 1571 | |
| 1572 | ass |
+286
| --- a/tests/test_taxonomy.py | ||
| +++ b/tests/test_taxonomy.py | ||
| @@ -0,0 +1,286 @@ | ||
| 1 | +"""Tests for the planning taxonomy classifier.""" | |
| 2 | + | |
| 3 | +from unittest.mock import MagicMock | |
| 4 | + | |
| 5 | +from video_processor.integrators.taxonomy import TaxonomyClassifier | |
| 6 | +from video_processor.models import ( | |
| 7 | + PlanningEntity, | |
| 8 | + PlanningEntityType, | |
| 9 | + PlanningRelationshipType, | |
| 10 | +) | |
| 11 | + | |
| 12 | +# ── Fixtures ────────────────────────────────────────────────────────── | |
| 13 | + | |
| 14 | + | |
| 15 | +def _entity(name, descriptions=None, entity_type="concept"): | |
| 16 | + return { | |
| 17 | + "name": name, | |
| 18 | + "type": entity_type, | |
| 19 | + "descriptions": descriptions or [], | |
| 20 | + } | |
| 21 | + | |
| 22 | + | |
| 23 | +# ── PlanningEntityType enum ────────────────────────────────────────── | |
| 24 | + | |
| 25 | + | |
class TestPlanningEntityType:
    """Sanity checks on the planning-entity type enum."""

    def test_all_values(self):
        # The enum should cover exactly these ten planning categories.
        actual = {member.value for member in PlanningEntityType}
        assert actual == {
            "goal", "requirement", "constraint", "decision", "risk",
            "assumption", "dependency", "milestone", "task", "feature",
        }

    def test_str_enum(self):
        # Members compare equal to their string values (str-enum behavior).
        assert PlanningEntityType.GOAL == "goal"
        assert PlanningEntityType.RISK.value == "risk"
| 45 | + | |
| 46 | + | |
class TestPlanningRelationshipType:
    """Sanity check on the planning-relationship enum."""

    def test_all_values(self):
        actual = {member.value for member in PlanningRelationshipType}
        assert actual == {
            "requires", "blocked_by", "has_risk", "depends_on", "addresses",
            "has_tradeoff", "delivers", "implements", "parent_of",
        }
| 61 | + | |
| 62 | + | |
| 63 | +# ── PlanningEntity model ───────────────────────────────────────────── | |
| 64 | + | |
| 65 | + | |
class TestPlanningEntity:
    """Construction defaults and serialization of PlanningEntity."""

    def test_minimal(self):
        entity = PlanningEntity(name="Ship v2", planning_type=PlanningEntityType.GOAL)
        # Optional fields fall back to empty/None defaults.
        assert entity.description == ""
        assert entity.priority is None
        assert entity.status is None
        assert entity.source_entities == []
        assert entity.metadata == {}

    def test_full(self):
        entity = PlanningEntity(
            name="Ship v2",
            planning_type=PlanningEntityType.GOAL,
            description="Release version 2",
            priority="high",
            status="identified",
            source_entities=["v2 release"],
            metadata={"quarter": "Q3"},
        )
        assert entity.priority == "high"
        assert entity.metadata["quarter"] == "Q3"

    def test_round_trip(self):
        original = PlanningEntity(
            name="Auth module",
            planning_type=PlanningEntityType.FEATURE,
            priority="medium",
            source_entities=["Auth"],
        )
        # JSON serialization round-trip must preserve equality.
        payload = original.model_dump_json()
        assert PlanningEntity.model_validate_json(payload) == original
| 97 | + | |
| 98 | + | |
| 99 | +# ── Heuristic classification ───────────────────────────────────────── | |
| 100 | + | |
| 101 | + | |
class TestHeuristicClassify:
    """Keyword-based classification with no LLM provider configured."""

    def setup_method(self):
        self.classifier = TaxonomyClassifier()

    def _classify_one(self, name, descriptions):
        # Run the classifier over a single synthetic entity.
        return self.classifier.classify_entities([_entity(name, descriptions)], [])

    def test_goal_keyword(self):
        classified = self._classify_one("Ship v2", ["Our main goal is to ship v2"])
        assert len(classified) == 1
        assert classified[0].planning_type == PlanningEntityType.GOAL

    def test_requirement_keyword(self):
        classified = self._classify_one("Auth", ["System must support SSO"])
        assert classified[0].planning_type == PlanningEntityType.REQUIREMENT

    def test_constraint_keyword(self):
        classified = self._classify_one("Budget", ["Budget limitation of $50k"])
        assert classified[0].planning_type == PlanningEntityType.CONSTRAINT

    def test_decision_keyword(self):
        classified = self._classify_one("DB choice", ["Team decided to use Postgres"])
        assert classified[0].planning_type == PlanningEntityType.DECISION

    def test_risk_keyword(self):
        classified = self._classify_one(
            "Vendor lock-in", ["There is a risk of vendor lock-in"]
        )
        assert classified[0].planning_type == PlanningEntityType.RISK

    def test_assumption_keyword(self):
        classified = self._classify_one("Team size", ["We assume the team stays at 5"])
        assert classified[0].planning_type == PlanningEntityType.ASSUMPTION

    def test_dependency_keyword(self):
        classified = self._classify_one("API v3", ["This depends on API v3 being ready"])
        assert classified[0].planning_type == PlanningEntityType.DEPENDENCY

    def test_milestone_keyword(self):
        classified = self._classify_one("Beta", ["Beta release milestone in March"])
        assert classified[0].planning_type == PlanningEntityType.MILESTONE

    def test_task_keyword(self):
        classified = self._classify_one("Setup CI", ["Action item: set up CI pipeline"])
        assert classified[0].planning_type == PlanningEntityType.TASK

    def test_feature_keyword(self):
        classified = self._classify_one("Search", ["Search feature with autocomplete"])
        assert classified[0].planning_type == PlanningEntityType.FEATURE

    def test_no_match(self):
        # Plain descriptive text triggers no planning classification.
        classified = self._classify_one("Python", ["A programming language"])
        assert len(classified) == 0

    def test_multiple_entities(self):
        raw = [
            _entity("Goal A", ["The goal is performance"]),
            _entity("Person B", ["Engineer on the team"], "person"),
            _entity("Risk C", ["Risk of data loss"]),
        ]
        classified = self.classifier.classify_entities(raw, [])
        assert len(classified) == 2
        found_types = {item.planning_type for item in classified}
        assert PlanningEntityType.GOAL in found_types
        assert PlanningEntityType.RISK in found_types

    def test_description_joined(self):
        # Multiple source descriptions are joined with "; ".
        classified = self._classify_one("Perf", ["System must handle", "1000 req/s"])
        assert classified[0].planning_type == PlanningEntityType.REQUIREMENT
        assert classified[0].description == "System must handle; 1000 req/s"

    def test_source_entities_populated(self):
        classified = self._classify_one("Ship v2", ["Our main goal"])
        assert classified[0].source_entities == ["Ship v2"]
| 184 | + | |
| 185 | + | |
| 186 | +# ── LLM classification ─────────────────────────────────────────────── | |
| 187 | + | |
| 188 | + | |
class TestLLMClassify:
    """Classification behavior when an LLM provider manager is attached."""

    @staticmethod
    def _classifier_returning(payload):
        # Classifier whose mocked provider always answers with *payload*.
        provider = MagicMock()
        provider.chat.return_value = payload
        return TaxonomyClassifier(provider_manager=provider)

    def test_llm_results_merged(self):
        classifier = self._classifier_returning(
            '[{"name": "Python", "planning_type": "feature", "priority": "medium"}]'
        )
        classified = classifier.classify_entities(
            [_entity("Python", ["A programming language"])], []
        )
        assert len(classified) == 1
        assert classified[0].planning_type == PlanningEntityType.FEATURE
        assert classified[0].priority == "medium"

    def test_llm_overrides_heuristic(self):
        # Heuristic alone would pick REQUIREMENT ("must"); the LLM says GOAL.
        classifier = self._classifier_returning(
            '[{"name": "Perf", "planning_type": "goal", "priority": "high"}]'
        )
        classified = classifier.classify_entities(
            [_entity("Perf", ["System must be fast"])], []
        )
        assert len(classified) == 1
        assert classified[0].planning_type == PlanningEntityType.GOAL

    def test_llm_invalid_type_skipped(self):
        classifier = self._classifier_returning(
            '[{"name": "X", "planning_type": "not_a_type", "priority": "low"}]'
        )
        classified = classifier.classify_entities([_entity("X", ["Something"])], [])
        assert len(classified) == 0

    def test_llm_failure_falls_back(self):
        provider = MagicMock()
        provider.chat.side_effect = RuntimeError("API down")
        classifier = TaxonomyClassifier(provider_manager=provider)
        classified = classifier.classify_entities([_entity("Ship v2", ["Our goal"])], [])
        # The heuristic pass still produces a classification.
        assert len(classified) == 1
        assert classified[0].planning_type == PlanningEntityType.GOAL

    def test_llm_empty_response(self):
        classifier = self._classifier_returning("")
        classified = classifier.classify_entities([_entity("Ship v2", ["Our goal"])], [])
        # Heuristic result survives an empty LLM reply.
        assert len(classified) == 1
| 241 | + | |
| 242 | + | |
| 243 | +# ── Workstream organization ────────────────────────────────────────── | |
| 244 | + | |
| 245 | + | |
class TestOrganizeByWorkstream:
    """Grouping of classified entities into per-type workstreams."""

    def test_groups_by_type(self):
        classifier = TaxonomyClassifier()
        planned = [
            PlanningEntity(name="A", planning_type=PlanningEntityType.GOAL),
            PlanningEntity(name="B", planning_type=PlanningEntityType.GOAL),
            PlanningEntity(name="C", planning_type=PlanningEntityType.RISK),
        ]
        workstreams = classifier.organize_by_workstream(planned)
        assert len(workstreams["goals"]) == 2
        assert len(workstreams["risks"]) == 1

    def test_empty_input(self):
        # No entities -> no workstream buckets at all.
        assert TaxonomyClassifier().organize_by_workstream([]) == {}
| 262 | + | |
| 263 | + | |
| 264 | +# ── Merge classifications ──────────────────────────────────────────── | |
| 265 | + | |
| 266 | + | |
class TestMergeClassifications:
    """Merging heuristic and LLM classification lists."""

    def test_llm_wins_conflict(self):
        heuristic = [PlanningEntity(name="X", planning_type=PlanningEntityType.GOAL)]
        from_llm = [PlanningEntity(name="X", planning_type=PlanningEntityType.RISK)]
        merged = TaxonomyClassifier._merge_classifications(heuristic, from_llm)
        # Same name on both sides: the LLM classification prevails.
        assert len(merged) == 1
        assert merged[0].planning_type == PlanningEntityType.RISK

    def test_case_insensitive_merge(self):
        heuristic = [PlanningEntity(name="Auth", planning_type=PlanningEntityType.FEATURE)]
        from_llm = [
            PlanningEntity(name="auth", planning_type=PlanningEntityType.REQUIREMENT)
        ]
        merged = TaxonomyClassifier._merge_classifications(heuristic, from_llm)
        assert len(merged) == 1
        assert merged[0].planning_type == PlanningEntityType.REQUIREMENT

    def test_union_of_distinct(self):
        heuristic = [PlanningEntity(name="A", planning_type=PlanningEntityType.GOAL)]
        from_llm = [PlanningEntity(name="B", planning_type=PlanningEntityType.RISK)]
        merged = TaxonomyClassifier._merge_classifications(heuristic, from_llm)
        assert len(merged) == 2
| --- a/tests/test_taxonomy.py | |
| +++ b/tests/test_taxonomy.py | |
| @@ -0,0 +1,286 @@ | |
| --- a/tests/test_taxonomy.py | |
| +++ b/tests/test_taxonomy.py | |
| @@ -0,0 +1,286 @@ | |
| 1 | """Tests for the planning taxonomy classifier.""" |
| 2 | |
| 3 | from unittest.mock import MagicMock |
| 4 | |
| 5 | from video_processor.integrators.taxonomy import TaxonomyClassifier |
| 6 | from video_processor.models import ( |
| 7 | PlanningEntity, |
| 8 | PlanningEntityType, |
| 9 | PlanningRelationshipType, |
| 10 | ) |
| 11 | |
| 12 | # ── Fixtures ────────────────────────────────────────────────────────── |
| 13 | |
| 14 | |
| 15 | def _entity(name, descriptions=None, entity_type="concept"): |
| 16 | return { |
| 17 | "name": name, |
| 18 | "type": entity_type, |
| 19 | "descriptions": descriptions or [], |
| 20 | } |
| 21 | |
| 22 | |
| 23 | # ── PlanningEntityType enum ────────────────────────────────────────── |
| 24 | |
| 25 | |
class TestPlanningEntityType:
    """Sanity checks on the planning-entity type enum."""

    def test_all_values(self):
        # The enum should cover exactly these ten planning categories.
        actual = {member.value for member in PlanningEntityType}
        assert actual == {
            "goal", "requirement", "constraint", "decision", "risk",
            "assumption", "dependency", "milestone", "task", "feature",
        }

    def test_str_enum(self):
        # Members compare equal to their string values (str-enum behavior).
        assert PlanningEntityType.GOAL == "goal"
        assert PlanningEntityType.RISK.value == "risk"
| 45 | |
| 46 | |
class TestPlanningRelationshipType:
    """Sanity check on the planning-relationship enum."""

    def test_all_values(self):
        actual = {member.value for member in PlanningRelationshipType}
        assert actual == {
            "requires", "blocked_by", "has_risk", "depends_on", "addresses",
            "has_tradeoff", "delivers", "implements", "parent_of",
        }
| 61 | |
| 62 | |
| 63 | # ── PlanningEntity model ───────────────────────────────────────────── |
| 64 | |
| 65 | |
class TestPlanningEntity:
    """Construction defaults and serialization of PlanningEntity."""

    def test_minimal(self):
        entity = PlanningEntity(name="Ship v2", planning_type=PlanningEntityType.GOAL)
        # Optional fields fall back to empty/None defaults.
        assert entity.description == ""
        assert entity.priority is None
        assert entity.status is None
        assert entity.source_entities == []
        assert entity.metadata == {}

    def test_full(self):
        entity = PlanningEntity(
            name="Ship v2",
            planning_type=PlanningEntityType.GOAL,
            description="Release version 2",
            priority="high",
            status="identified",
            source_entities=["v2 release"],
            metadata={"quarter": "Q3"},
        )
        assert entity.priority == "high"
        assert entity.metadata["quarter"] == "Q3"

    def test_round_trip(self):
        original = PlanningEntity(
            name="Auth module",
            planning_type=PlanningEntityType.FEATURE,
            priority="medium",
            source_entities=["Auth"],
        )
        # JSON serialization round-trip must preserve equality.
        payload = original.model_dump_json()
        assert PlanningEntity.model_validate_json(payload) == original
| 97 | |
| 98 | |
| 99 | # ── Heuristic classification ───────────────────────────────────────── |
| 100 | |
| 101 | |
class TestHeuristicClassify:
    """Keyword-based classification with no LLM provider configured."""

    def setup_method(self):
        self.classifier = TaxonomyClassifier()

    def _classify_one(self, name, descriptions):
        # Run the classifier over a single synthetic entity.
        return self.classifier.classify_entities([_entity(name, descriptions)], [])

    def test_goal_keyword(self):
        classified = self._classify_one("Ship v2", ["Our main goal is to ship v2"])
        assert len(classified) == 1
        assert classified[0].planning_type == PlanningEntityType.GOAL

    def test_requirement_keyword(self):
        classified = self._classify_one("Auth", ["System must support SSO"])
        assert classified[0].planning_type == PlanningEntityType.REQUIREMENT

    def test_constraint_keyword(self):
        classified = self._classify_one("Budget", ["Budget limitation of $50k"])
        assert classified[0].planning_type == PlanningEntityType.CONSTRAINT

    def test_decision_keyword(self):
        classified = self._classify_one("DB choice", ["Team decided to use Postgres"])
        assert classified[0].planning_type == PlanningEntityType.DECISION

    def test_risk_keyword(self):
        classified = self._classify_one(
            "Vendor lock-in", ["There is a risk of vendor lock-in"]
        )
        assert classified[0].planning_type == PlanningEntityType.RISK

    def test_assumption_keyword(self):
        classified = self._classify_one("Team size", ["We assume the team stays at 5"])
        assert classified[0].planning_type == PlanningEntityType.ASSUMPTION

    def test_dependency_keyword(self):
        classified = self._classify_one("API v3", ["This depends on API v3 being ready"])
        assert classified[0].planning_type == PlanningEntityType.DEPENDENCY

    def test_milestone_keyword(self):
        classified = self._classify_one("Beta", ["Beta release milestone in March"])
        assert classified[0].planning_type == PlanningEntityType.MILESTONE

    def test_task_keyword(self):
        classified = self._classify_one("Setup CI", ["Action item: set up CI pipeline"])
        assert classified[0].planning_type == PlanningEntityType.TASK

    def test_feature_keyword(self):
        classified = self._classify_one("Search", ["Search feature with autocomplete"])
        assert classified[0].planning_type == PlanningEntityType.FEATURE

    def test_no_match(self):
        # Plain descriptive text triggers no planning classification.
        classified = self._classify_one("Python", ["A programming language"])
        assert len(classified) == 0

    def test_multiple_entities(self):
        raw = [
            _entity("Goal A", ["The goal is performance"]),
            _entity("Person B", ["Engineer on the team"], "person"),
            _entity("Risk C", ["Risk of data loss"]),
        ]
        classified = self.classifier.classify_entities(raw, [])
        assert len(classified) == 2
        found_types = {item.planning_type for item in classified}
        assert PlanningEntityType.GOAL in found_types
        assert PlanningEntityType.RISK in found_types

    def test_description_joined(self):
        # Multiple source descriptions are joined with "; ".
        classified = self._classify_one("Perf", ["System must handle", "1000 req/s"])
        assert classified[0].planning_type == PlanningEntityType.REQUIREMENT
        assert classified[0].description == "System must handle; 1000 req/s"

    def test_source_entities_populated(self):
        classified = self._classify_one("Ship v2", ["Our main goal"])
        assert classified[0].source_entities == ["Ship v2"]
| 184 | |
| 185 | |
| 186 | # ── LLM classification ─────────────────────────────────────────────── |
| 187 | |
| 188 | |
class TestLLMClassify:
    """Classification behavior when an LLM provider manager is attached."""

    @staticmethod
    def _classifier_returning(payload):
        # Classifier whose mocked provider always answers with *payload*.
        provider = MagicMock()
        provider.chat.return_value = payload
        return TaxonomyClassifier(provider_manager=provider)

    def test_llm_results_merged(self):
        classifier = self._classifier_returning(
            '[{"name": "Python", "planning_type": "feature", "priority": "medium"}]'
        )
        classified = classifier.classify_entities(
            [_entity("Python", ["A programming language"])], []
        )
        assert len(classified) == 1
        assert classified[0].planning_type == PlanningEntityType.FEATURE
        assert classified[0].priority == "medium"

    def test_llm_overrides_heuristic(self):
        # Heuristic alone would pick REQUIREMENT ("must"); the LLM says GOAL.
        classifier = self._classifier_returning(
            '[{"name": "Perf", "planning_type": "goal", "priority": "high"}]'
        )
        classified = classifier.classify_entities(
            [_entity("Perf", ["System must be fast"])], []
        )
        assert len(classified) == 1
        assert classified[0].planning_type == PlanningEntityType.GOAL

    def test_llm_invalid_type_skipped(self):
        classifier = self._classifier_returning(
            '[{"name": "X", "planning_type": "not_a_type", "priority": "low"}]'
        )
        classified = classifier.classify_entities([_entity("X", ["Something"])], [])
        assert len(classified) == 0

    def test_llm_failure_falls_back(self):
        provider = MagicMock()
        provider.chat.side_effect = RuntimeError("API down")
        classifier = TaxonomyClassifier(provider_manager=provider)
        classified = classifier.classify_entities([_entity("Ship v2", ["Our goal"])], [])
        # The heuristic pass still produces a classification.
        assert len(classified) == 1
        assert classified[0].planning_type == PlanningEntityType.GOAL

    def test_llm_empty_response(self):
        classifier = self._classifier_returning("")
        classified = classifier.classify_entities([_entity("Ship v2", ["Our goal"])], [])
        # Heuristic result survives an empty LLM reply.
        assert len(classified) == 1
| 241 | |
| 242 | |
| 243 | # ── Workstream organization ────────────────────────────────────────── |
| 244 | |
| 245 | |
class TestOrganizeByWorkstream:
    """Grouping of classified entities into per-type workstreams."""

    def test_groups_by_type(self):
        classifier = TaxonomyClassifier()
        planned = [
            PlanningEntity(name="A", planning_type=PlanningEntityType.GOAL),
            PlanningEntity(name="B", planning_type=PlanningEntityType.GOAL),
            PlanningEntity(name="C", planning_type=PlanningEntityType.RISK),
        ]
        workstreams = classifier.organize_by_workstream(planned)
        assert len(workstreams["goals"]) == 2
        assert len(workstreams["risks"]) == 1

    def test_empty_input(self):
        # No entities -> no workstream buckets at all.
        assert TaxonomyClassifier().organize_by_workstream([]) == {}
| 262 | |
| 263 | |
| 264 | # ── Merge classifications ──────────────────────────────────────────── |
| 265 | |
| 266 | |
class TestMergeClassifications:
    """Merging heuristic and LLM classification lists."""

    def test_llm_wins_conflict(self):
        heuristic = [PlanningEntity(name="X", planning_type=PlanningEntityType.GOAL)]
        from_llm = [PlanningEntity(name="X", planning_type=PlanningEntityType.RISK)]
        merged = TaxonomyClassifier._merge_classifications(heuristic, from_llm)
        # Same name on both sides: the LLM classification prevails.
        assert len(merged) == 1
        assert merged[0].planning_type == PlanningEntityType.RISK

    def test_case_insensitive_merge(self):
        heuristic = [PlanningEntity(name="Auth", planning_type=PlanningEntityType.FEATURE)]
        from_llm = [
            PlanningEntity(name="auth", planning_type=PlanningEntityType.REQUIREMENT)
        ]
        merged = TaxonomyClassifier._merge_classifications(heuristic, from_llm)
        assert len(merged) == 1
        assert merged[0].planning_type == PlanningEntityType.REQUIREMENT

    def test_union_of_distinct(self):
        heuristic = [PlanningEntity(name="A", planning_type=PlanningEntityType.GOAL)]
        from_llm = [PlanningEntity(name="B", planning_type=PlanningEntityType.RISK)]
        merged = TaxonomyClassifier._merge_classifications(heuristic, from_llm)
        assert len(merged) == 2
+198
| --- a/tests/test_usage_tracker.py | ||
| +++ b/tests/test_usage_tracker.py | ||
| @@ -0,0 +1,198 @@ | ||
| 1 | +"""Tests for the UsageTracker class.""" | |
| 2 | + | |
| 3 | +import time | |
| 4 | + | |
| 5 | +from video_processor.utils.usage_tracker import ModelUsage, StepTiming, UsageTracker, _fmt_duration | |
| 6 | + | |
| 7 | + | |
class TestModelUsage:
    """Token totals and cost estimation for ModelUsage records."""

    def test_total_tokens(self):
        usage = ModelUsage(
            provider="openai", model="gpt-4o", input_tokens=100, output_tokens=50
        )
        assert usage.total_tokens == 150

    def test_estimated_cost_known_model(self):
        usage = ModelUsage(
            provider="openai",
            model="gpt-4o",
            input_tokens=1_000_000,
            output_tokens=500_000,
        )
        # gpt-4o pricing: $2.50 per 1M input tokens, $10.00 per 1M output.
        expected = 1_000_000 * 2.50 / 1_000_000 + 500_000 * 10.00 / 1_000_000
        assert abs(usage.estimated_cost - expected) < 0.001

    def test_estimated_cost_unknown_model(self):
        usage = ModelUsage(
            provider="local",
            model="my-custom-model",
            input_tokens=1000,
            output_tokens=500,
        )
        # No pricing entry means the cost is reported as zero.
        assert usage.estimated_cost == 0.0

    def test_estimated_cost_whisper(self):
        usage = ModelUsage(provider="openai", model="whisper-1", audio_minutes=10.0)
        # whisper-1 is priced per audio minute ($0.006/min).
        assert abs(usage.estimated_cost - 0.06) < 0.001

    def test_estimated_cost_partial_match(self):
        usage = ModelUsage(
            provider="openai",
            model="gpt-4o-2024-08-06",
            input_tokens=1_000_000,
            output_tokens=0,
        )
        # Dated model names should partial-match their base pricing entry.
        assert usage.estimated_cost > 0

    def test_calls_default_zero(self):
        usage = ModelUsage()
        assert usage.calls == 0
        assert usage.total_tokens == 0
        assert usage.estimated_cost == 0.0
| 57 | + | |
| 58 | + | |
class TestStepTiming:
    """Duration computation on StepTiming records."""

    def test_duration_with_times(self):
        timing = StepTiming(name="test", start_time=100.0, end_time=105.5)
        assert abs(timing.duration - 5.5) < 0.001

    def test_duration_no_end_time(self):
        # An open (unfinished) step reports zero duration.
        assert StepTiming(name="test", start_time=100.0).duration == 0.0

    def test_duration_no_start_time(self):
        assert StepTiming(name="test").duration == 0.0
| 71 | + | |
| 72 | + | |
class TestUsageTracker:
    """Aggregation, step timing, and summary formatting of UsageTracker."""

    def test_record_single_call(self):
        ut = UsageTracker()
        ut.record("openai", "gpt-4o", input_tokens=500, output_tokens=200)
        assert ut.total_api_calls == 1
        assert ut.total_input_tokens == 500
        assert ut.total_output_tokens == 200
        assert ut.total_tokens == 700

    def test_record_multiple_calls_same_model(self):
        ut = UsageTracker()
        for tokens_in, tokens_out in ((100, 50), (200, 100)):
            ut.record("openai", "gpt-4o", input_tokens=tokens_in, output_tokens=tokens_out)
        assert ut.total_api_calls == 2
        assert ut.total_input_tokens == 300
        assert ut.total_output_tokens == 150

    def test_record_multiple_models(self):
        ut = UsageTracker()
        ut.record("openai", "gpt-4o", input_tokens=100, output_tokens=50)
        ut.record(
            "anthropic", "claude-sonnet-4-5-20250929", input_tokens=200, output_tokens=100
        )
        assert ut.total_api_calls == 2
        assert ut.total_input_tokens == 300
        # One bucket per distinct (provider, model).
        assert len(ut._models) == 2

    def test_total_cost(self):
        ut = UsageTracker()
        ut.record("openai", "gpt-4o", input_tokens=1_000_000, output_tokens=500_000)
        assert ut.total_cost > 0

    def test_start_and_end_step(self):
        ut = UsageTracker()
        ut.start_step("Frame extraction")
        time.sleep(0.01)
        ut.end_step()
        (step,) = ut._steps
        assert step.name == "Frame extraction"
        assert step.duration > 0

    def test_start_step_auto_closes_previous(self):
        ut = UsageTracker()
        ut.start_step("Step 1")
        time.sleep(0.01)
        ut.start_step("Step 2")
        # Starting a new step implicitly finishes the previous one.
        (closed,) = ut._steps
        assert closed.name == "Step 1"
        assert closed.duration > 0
        # The new step becomes the current one.
        assert ut._current_step.name == "Step 2"

    def test_end_step_when_none(self):
        ut = UsageTracker()
        ut.end_step()  # no active step: must be a silent no-op
        assert len(ut._steps) == 0

    def test_total_duration(self):
        ut = UsageTracker()
        time.sleep(0.01)
        assert ut.total_duration > 0

    def test_format_summary_empty(self):
        text = UsageTracker().format_summary()
        assert "PROCESSING SUMMARY" in text
        assert "Total time" in text

    def test_format_summary_with_usage(self):
        ut = UsageTracker()
        ut.record("openai", "gpt-4o", input_tokens=1000, output_tokens=500)
        ut.start_step("Analysis")
        ut.end_step()
        text = ut.format_summary()
        for needle in ("API Calls", "Tokens", "gpt-4o", "Analysis"):
            assert needle in text

    def test_format_summary_with_audio(self):
        ut = UsageTracker()
        ut.record("openai", "whisper-1", audio_minutes=5.0)
        text = ut.format_summary()
        assert "whisper" in text
        assert "5.0m" in text

    def test_format_summary_cost_display(self):
        ut = UsageTracker()
        ut.record("openai", "gpt-4o", input_tokens=1_000_000, output_tokens=500_000)
        assert "Estimated total cost: $" in ut.format_summary()

    def test_format_summary_step_percentages(self):
        ut = UsageTracker()
        # Inject steps with known timings so percentages are deterministic.
        ut._steps = [
            StepTiming(name="Step A", start_time=0.0, end_time=1.0),
            StepTiming(name="Step B", start_time=1.0, end_time=3.0),
        ]
        text = ut.format_summary()
        assert "Step A" in text
        assert "Step B" in text
        assert "%" in text
| 180 | + | |
| 181 | + | |
class TestFmtDuration:
    """Human-readable duration formatting helper."""

    def test_seconds(self):
        assert _fmt_duration(5.3) == "5.3s"

    def test_minutes(self):
        assert _fmt_duration(90.0) == "1m 30s"

    def test_hours(self):
        assert _fmt_duration(3661.0) == "1h 1m 1s"

    def test_zero(self):
        assert _fmt_duration(0.0) == "0.0s"

    def test_just_under_minute(self):
        # Stays in the seconds form right up to the minute boundary.
        assert _fmt_duration(59.9) == "59.9s"
| --- a/tests/test_usage_tracker.py | |
| +++ b/tests/test_usage_tracker.py | |
| @@ -0,0 +1,198 @@ | |
| --- a/tests/test_usage_tracker.py | |
| +++ b/tests/test_usage_tracker.py | |
| @@ -0,0 +1,198 @@ | |
| 1 | """Tests for the UsageTracker class.""" |
| 2 | |
| 3 | import time |
| 4 | |
| 5 | from video_processor.utils.usage_tracker import ModelUsage, StepTiming, UsageTracker, _fmt_duration |
| 6 | |
| 7 | |
class TestModelUsage:
    """Token accounting and cost estimation on ModelUsage."""

    def test_total_tokens(self):
        usage = ModelUsage(provider="openai", model="gpt-4o", input_tokens=100, output_tokens=50)
        assert usage.total_tokens == 150

    def test_estimated_cost_known_model(self):
        usage = ModelUsage(
            provider="openai",
            model="gpt-4o",
            input_tokens=1_000_000,
            output_tokens=500_000,
        )
        # gpt-4o pricing: $2.50 per 1M input tokens, $10.00 per 1M output tokens.
        expected = 2.50 + 0.5 * 10.00
        assert abs(usage.estimated_cost - expected) < 0.001

    def test_estimated_cost_unknown_model(self):
        usage = ModelUsage(
            provider="local",
            model="my-custom-model",
            input_tokens=1000,
            output_tokens=500,
        )
        # No price table entry: cost falls back to zero.
        assert usage.estimated_cost == 0.0

    def test_estimated_cost_whisper(self):
        usage = ModelUsage(provider="openai", model="whisper-1", audio_minutes=10.0)
        # whisper-1 is billed per audio minute at $0.006/min.
        assert abs(usage.estimated_cost - 0.06) < 0.001

    def test_estimated_cost_partial_match(self):
        usage = ModelUsage(
            provider="openai",
            model="gpt-4o-2024-08-06",
            input_tokens=1_000_000,
            output_tokens=0,
        )
        # Dated snapshot names should fall back to the base "gpt-4o" price entry.
        assert usage.estimated_cost > 0

    def test_calls_default_zero(self):
        usage = ModelUsage()
        assert usage.calls == 0
        assert usage.total_tokens == 0
        assert usage.estimated_cost == 0.0
| 57 | |
| 58 | |
class TestStepTiming:
    """Duration semantics of StepTiming records."""

    def test_duration_with_times(self):
        timing = StepTiming(name="test", start_time=100.0, end_time=105.5)
        assert abs(timing.duration - 5.5) < 0.001

    def test_duration_no_end_time(self):
        # An unfinished step reports zero duration rather than a partial one.
        timing = StepTiming(name="test", start_time=100.0)
        assert timing.duration == 0.0

    def test_duration_no_start_time(self):
        timing = StepTiming(name="test")
        assert timing.duration == 0.0
| 71 | |
| 72 | |
class TestUsageTracker:
    """Aggregation, step timing, and summary formatting of UsageTracker."""

    def test_record_single_call(self):
        ut = UsageTracker()
        ut.record("openai", "gpt-4o", input_tokens=500, output_tokens=200)
        assert ut.total_api_calls == 1
        assert ut.total_input_tokens == 500
        assert ut.total_output_tokens == 200
        assert ut.total_tokens == 700

    def test_record_multiple_calls_same_model(self):
        ut = UsageTracker()
        for tokens_in, tokens_out in ((100, 50), (200, 100)):
            ut.record("openai", "gpt-4o", input_tokens=tokens_in, output_tokens=tokens_out)
        assert ut.total_api_calls == 2
        assert ut.total_input_tokens == 300
        assert ut.total_output_tokens == 150

    def test_record_multiple_models(self):
        ut = UsageTracker()
        ut.record("openai", "gpt-4o", input_tokens=100, output_tokens=50)
        ut.record(
            "anthropic", "claude-sonnet-4-5-20250929", input_tokens=200, output_tokens=100
        )
        assert ut.total_api_calls == 2
        assert ut.total_input_tokens == 300
        # One bucket per distinct (provider, model).
        assert len(ut._models) == 2

    def test_total_cost(self):
        ut = UsageTracker()
        ut.record("openai", "gpt-4o", input_tokens=1_000_000, output_tokens=500_000)
        assert ut.total_cost > 0

    def test_start_and_end_step(self):
        ut = UsageTracker()
        ut.start_step("Frame extraction")
        time.sleep(0.01)
        ut.end_step()
        (step,) = ut._steps
        assert step.name == "Frame extraction"
        assert step.duration > 0

    def test_start_step_auto_closes_previous(self):
        ut = UsageTracker()
        ut.start_step("Step 1")
        time.sleep(0.01)
        ut.start_step("Step 2")
        # Starting a new step implicitly finishes the previous one.
        (closed,) = ut._steps
        assert closed.name == "Step 1"
        assert closed.duration > 0
        # The new step becomes the current one.
        assert ut._current_step.name == "Step 2"

    def test_end_step_when_none(self):
        ut = UsageTracker()
        ut.end_step()  # no active step: must be a silent no-op
        assert len(ut._steps) == 0

    def test_total_duration(self):
        ut = UsageTracker()
        time.sleep(0.01)
        assert ut.total_duration > 0

    def test_format_summary_empty(self):
        text = UsageTracker().format_summary()
        assert "PROCESSING SUMMARY" in text
        assert "Total time" in text

    def test_format_summary_with_usage(self):
        ut = UsageTracker()
        ut.record("openai", "gpt-4o", input_tokens=1000, output_tokens=500)
        ut.start_step("Analysis")
        ut.end_step()
        text = ut.format_summary()
        for needle in ("API Calls", "Tokens", "gpt-4o", "Analysis"):
            assert needle in text

    def test_format_summary_with_audio(self):
        ut = UsageTracker()
        ut.record("openai", "whisper-1", audio_minutes=5.0)
        text = ut.format_summary()
        assert "whisper" in text
        assert "5.0m" in text

    def test_format_summary_cost_display(self):
        ut = UsageTracker()
        ut.record("openai", "gpt-4o", input_tokens=1_000_000, output_tokens=500_000)
        assert "Estimated total cost: $" in ut.format_summary()

    def test_format_summary_step_percentages(self):
        ut = UsageTracker()
        # Inject steps with known timings so percentages are deterministic.
        ut._steps = [
            StepTiming(name="Step A", start_time=0.0, end_time=1.0),
            StepTiming(name="Step B", start_time=1.0, end_time=3.0),
        ]
        text = ut.format_summary()
        assert "Step A" in text
        assert "Step B" in text
        assert "%" in text
| 180 | |
| 181 | |
class TestFmtDuration:
    """Human-readable duration formatting helper."""

    def test_seconds(self):
        assert _fmt_duration(5.3) == "5.3s"

    def test_minutes(self):
        assert _fmt_duration(90.0) == "1m 30s"

    def test_hours(self):
        assert _fmt_duration(3661.0) == "1h 1m 1s"

    def test_zero(self):
        assert _fmt_duration(0.0) == "0.0s"

    def test_just_under_minute(self):
        # Stays in the seconds form right up to the minute boundary.
        assert _fmt_duration(59.9) == "59.9s"
+339
| --- a/tests/test_visualization.py | ||
| +++ b/tests/test_visualization.py | ||
| @@ -0,0 +1,339 @@ | ||
| 1 | +"""Tests for video_processor.utils.visualization module.""" | |
| 2 | + | |
| 3 | +import pytest | |
| 4 | + | |
| 5 | +nx = pytest.importorskip("networkx", reason="networkx not installed") | |
| 6 | + | |
| 7 | +from video_processor.utils.visualization import ( # noqa: E402 | |
| 8 | + compute_graph_stats, | |
| 9 | + filter_graph, | |
| 10 | + generate_mermaid, | |
| 11 | + graph_to_d3_json, | |
| 12 | + graph_to_dot, | |
| 13 | + graph_to_networkx, | |
| 14 | +) | |
| 15 | + | |
| 16 | + | |
@pytest.fixture
def sample_kg_data():
    """Mock knowledge graph payload mirroring the KnowledgeGraph.to_dict() format."""

    def _node(name, entity_type, description, occurrences=None):
        # Helper producing one node record in to_dict() shape.
        return {
            "id": name,
            "name": name,
            "type": entity_type,
            "descriptions": [description],
            "occurrences": occurrences if occurrences is not None else [],
        }

    def _rel(source, target, rel_type, batch, timestamp):
        # Helper producing one relationship record in to_dict() shape.
        return {
            "source": source,
            "target": target,
            "type": rel_type,
            "content_source": batch,
            "timestamp": timestamp,
        }

    return {
        "nodes": [
            _node(
                "Alice",
                "person",
                "Project lead",
                occurrences=[{"source": "transcript_batch_0", "timestamp": 0.0}],
            ),
            _node("Bob", "person", "Developer"),
            _node("Python", "technology", "Programming language"),
            _node("Acme Corp", "organization", "The company"),
            _node("Microservices", "concept", "Architecture pattern"),
        ],
        "relationships": [
            _rel("Alice", "Python", "uses", "transcript_batch_0", 1.5),
            _rel("Bob", "Python", "uses", "transcript_batch_0", 2.0),
            _rel("Alice", "Bob", "works_with", "transcript_batch_0", 3.0),
            _rel("Alice", "Acme Corp", "employed_by", "transcript_batch_1", 10.0),
            _rel("Acme Corp", "Microservices", "adopts", "transcript_batch_1", 12.0),
        ],
    }
| 96 | + | |
| 97 | + | |
@pytest.fixture
def sample_graph(sample_kg_data):
    """NetworkX graph built from the sample KG payload."""
    return graph_to_networkx(sample_kg_data)
| 102 | + | |
| 103 | + | |
class TestGraphToNetworkx:
    """Conversion from KG dict payloads to a NetworkX graph."""

    def test_node_count(self, sample_graph):
        assert sample_graph.number_of_nodes() == 5

    def test_edge_count(self, sample_graph):
        assert sample_graph.number_of_edges() == 5

    def test_node_attributes(self, sample_graph):
        attrs = sample_graph.nodes["Alice"]
        assert attrs["type"] == "person"
        assert attrs["descriptions"] == ["Project lead"]

    def test_edge_attributes(self, sample_graph):
        attrs = sample_graph.edges["Alice", "Python"]
        assert attrs["type"] == "uses"
        assert attrs["content_source"] == "transcript_batch_0"
        assert attrs["timestamp"] == 1.5

    def test_empty_data(self):
        graph = graph_to_networkx({})
        assert graph.number_of_nodes() == 0
        assert graph.number_of_edges() == 0

    def test_nodes_only(self):
        graph = graph_to_networkx({"nodes": [{"name": "X", "type": "concept"}]})
        assert graph.number_of_nodes() == 1
        assert graph.number_of_edges() == 0

    def test_skips_empty_names(self):
        # Nodes with an empty name are dropped during conversion.
        payload = {"nodes": [{"name": "", "type": "concept"}, {"name": "A"}]}
        assert graph_to_networkx(payload).number_of_nodes() == 1

    def test_skips_empty_relationship_endpoints(self):
        # A relationship with a blank endpoint must not create an edge.
        payload = {
            "nodes": [{"name": "A"}],
            "relationships": [{"source": "", "target": "A", "type": "x"}],
        }
        assert graph_to_networkx(payload).number_of_edges() == 0
| 145 | + | |
| 146 | + | |
class TestComputeGraphStats:
    """Summary statistics computed over the sample graph."""

    def test_basic_counts(self, sample_graph):
        stats = compute_graph_stats(sample_graph)
        assert stats["node_count"] == 5
        assert stats["edge_count"] == 5

    def test_density_range(self, sample_graph):
        assert 0.0 <= compute_graph_stats(sample_graph)["density"] <= 1.0

    def test_connected_components(self, sample_graph):
        assert compute_graph_stats(sample_graph)["connected_components"] == 1

    def test_type_breakdown(self, sample_graph):
        breakdown = compute_graph_stats(sample_graph)["type_breakdown"]
        assert breakdown["person"] == 2
        assert breakdown["technology"] == 1
        assert breakdown["organization"] == 1
        assert breakdown["concept"] == 1

    def test_top_entities(self, sample_graph):
        top = compute_graph_stats(sample_graph)["top_entities"]
        assert len(top) <= 10
        # In a DiGraph degree = in + out. Alice has 3 outgoing edges
        # (Python, Bob, Acme Corp) and none incoming — degree 3, the
        # highest in the fixture — so she must rank first.
        assert top[0]["name"] == "Alice"

    def test_empty_graph(self):
        stats = compute_graph_stats(nx.DiGraph())
        assert stats["node_count"] == 0
        assert stats["connected_components"] == 0
        assert stats["top_entities"] == []
| 185 | + | |
| 186 | + | |
class TestFilterGraph:
    """Subgraph filtering by entity type and minimum degree."""

    def test_filter_by_type(self, sample_graph):
        result = filter_graph(sample_graph, entity_types=["person"])
        assert result.number_of_nodes() == 2
        assert all(attrs["type"] == "person" for _, attrs in result.nodes(data=True))

    def test_filter_by_min_degree(self, sample_graph):
        # Alice (degree 3) is guaranteed to survive min_degree=3.
        result = filter_graph(sample_graph, min_degree=3)
        assert "Alice" in result.nodes
        assert result.number_of_nodes() >= 1

    def test_filter_combined(self, sample_graph):
        result = filter_graph(sample_graph, entity_types=["person"], min_degree=1)
        assert all(result.nodes[name]["type"] == "person" for name in result.nodes)

    def test_filter_no_criteria(self, sample_graph):
        # With no filters the full node set is returned.
        assert filter_graph(sample_graph).number_of_nodes() == sample_graph.number_of_nodes()

    def test_filter_nonexistent_type(self, sample_graph):
        assert filter_graph(sample_graph, entity_types=["alien"]).number_of_nodes() == 0

    def test_filter_preserves_edges(self, sample_graph):
        # Both endpoints of Alice -> Bob are persons, so the edge survives.
        assert filter_graph(sample_graph, entity_types=["person"]).has_edge("Alice", "Bob")

    def test_filter_returns_copy(self, sample_graph):
        result = filter_graph(sample_graph, entity_types=["person"])
        result.add_node("NewNode")
        # The filtered graph must be independent of the input graph.
        assert "NewNode" not in sample_graph
| 222 | + | |
| 223 | + | |
class TestGenerateMermaid:
    """Mermaid diagram generation from a graph."""

    def test_output_starts_with_graph(self, sample_graph):
        assert generate_mermaid(sample_graph).startswith("graph LR")

    def test_custom_layout(self, sample_graph):
        assert generate_mermaid(sample_graph, layout="TD").startswith("graph TD")

    def test_contains_nodes(self, sample_graph):
        diagram = generate_mermaid(sample_graph)
        assert "Alice" in diagram
        assert "Python" in diagram

    def test_contains_edges(self, sample_graph):
        assert "uses" in generate_mermaid(sample_graph)

    def test_contains_class_defs(self, sample_graph):
        diagram = generate_mermaid(sample_graph)
        assert "classDef person" in diagram
        assert "classDef concept" in diagram

    def test_max_nodes_limit(self, sample_graph):
        diagram = generate_mermaid(sample_graph, max_nodes=2)
        # Only the top-2 nodes by degree may appear as node declarations.
        node_lines = [line for line in diagram.split("\n") if '["' in line]
        assert len(node_lines) <= 2

    def test_empty_graph(self):
        assert "graph LR" in generate_mermaid(nx.DiGraph())

    def test_sanitizes_special_chars(self):
        graph = nx.DiGraph()
        graph.add_node("foo bar/baz", type="concept")
        diagram = generate_mermaid(graph)
        # IDs are sanitized for Mermaid syntax while the display label is kept.
        assert "foo_bar_baz" in diagram
        assert "foo bar/baz" in diagram
| 269 | + | |
| 270 | + | |
class TestGraphToD3Json:
    """D3 force-layout JSON export."""

    def test_structure(self, sample_graph):
        payload = graph_to_d3_json(sample_graph)
        assert "nodes" in payload
        assert "links" in payload

    def test_node_format(self, sample_graph):
        payload = graph_to_d3_json(sample_graph)
        by_id = {node["id"]: node for node in payload["nodes"]}
        assert "Alice" in by_id
        assert by_id["Alice"]["group"] == "person"

    def test_link_format(self, sample_graph):
        links = graph_to_d3_json(sample_graph)["links"]
        assert len(links) == 5
        first = links[0]
        assert "source" in first
        assert "target" in first
        assert "type" in first

    def test_empty_graph(self):
        assert graph_to_d3_json(nx.DiGraph()) == {"nodes": [], "links": []}
| 298 | + | |
| 299 | + | |
class TestGraphToDot:
    """Graphviz DOT export."""

    def test_starts_with_digraph(self, sample_graph):
        assert graph_to_dot(sample_graph).startswith("digraph KnowledgeGraph {")

    def test_ends_with_closing_brace(self, sample_graph):
        assert graph_to_dot(sample_graph).strip().endswith("}")

    def test_contains_nodes(self, sample_graph):
        dot = graph_to_dot(sample_graph)
        assert '"Alice"' in dot
        assert '"Python"' in dot

    def test_contains_edges(self, sample_graph):
        assert '"Alice" -> "Python"' in graph_to_dot(sample_graph)

    def test_edge_labels(self, sample_graph):
        assert 'label="uses"' in graph_to_dot(sample_graph)

    def test_node_colors(self, sample_graph):
        # "#f9d5e5" is the fill color used for the "person" entity type.
        assert 'fillcolor="#f9d5e5"' in graph_to_dot(sample_graph)

    def test_empty_graph(self):
        assert "digraph" in graph_to_dot(nx.DiGraph())

    def test_special_chars_escaped(self):
        graph = nx.DiGraph()
        graph.add_node('He said "hello"', type="person")
        # Embedded quotes must be backslash-escaped in DOT output.
        assert 'He said \\"hello\\"' in graph_to_dot(graph)
| --- a/tests/test_visualization.py | |
| +++ b/tests/test_visualization.py | |
| @@ -0,0 +1,339 @@ | |
| --- a/tests/test_visualization.py | |
| +++ b/tests/test_visualization.py | |
| @@ -0,0 +1,339 @@ | |
| 1 | """Tests for video_processor.utils.visualization module.""" |
| 2 | |
| 3 | import pytest |
| 4 | |
| 5 | nx = pytest.importorskip("networkx", reason="networkx not installed") |
| 6 | |
| 7 | from video_processor.utils.visualization import ( # noqa: E402 |
| 8 | compute_graph_stats, |
| 9 | filter_graph, |
| 10 | generate_mermaid, |
| 11 | graph_to_d3_json, |
| 12 | graph_to_dot, |
| 13 | graph_to_networkx, |
| 14 | ) |
| 15 | |
| 16 | |
@pytest.fixture
def sample_kg_data():
    """Mock knowledge graph payload mirroring the KnowledgeGraph.to_dict() format."""

    def _node(name, entity_type, description, occurrences=None):
        # Helper producing one node record in to_dict() shape.
        return {
            "id": name,
            "name": name,
            "type": entity_type,
            "descriptions": [description],
            "occurrences": occurrences if occurrences is not None else [],
        }

    def _rel(source, target, rel_type, batch, timestamp):
        # Helper producing one relationship record in to_dict() shape.
        return {
            "source": source,
            "target": target,
            "type": rel_type,
            "content_source": batch,
            "timestamp": timestamp,
        }

    return {
        "nodes": [
            _node(
                "Alice",
                "person",
                "Project lead",
                occurrences=[{"source": "transcript_batch_0", "timestamp": 0.0}],
            ),
            _node("Bob", "person", "Developer"),
            _node("Python", "technology", "Programming language"),
            _node("Acme Corp", "organization", "The company"),
            _node("Microservices", "concept", "Architecture pattern"),
        ],
        "relationships": [
            _rel("Alice", "Python", "uses", "transcript_batch_0", 1.5),
            _rel("Bob", "Python", "uses", "transcript_batch_0", 2.0),
            _rel("Alice", "Bob", "works_with", "transcript_batch_0", 3.0),
            _rel("Alice", "Acme Corp", "employed_by", "transcript_batch_1", 10.0),
            _rel("Acme Corp", "Microservices", "adopts", "transcript_batch_1", 12.0),
        ],
    }
| 96 | |
| 97 | |
@pytest.fixture
def sample_graph(sample_kg_data):
    """NetworkX graph built from the sample KG payload."""
    return graph_to_networkx(sample_kg_data)
| 102 | |
| 103 | |
class TestGraphToNetworkx:
    """Conversion from KG dict payloads to a NetworkX graph."""

    def test_node_count(self, sample_graph):
        assert sample_graph.number_of_nodes() == 5

    def test_edge_count(self, sample_graph):
        assert sample_graph.number_of_edges() == 5

    def test_node_attributes(self, sample_graph):
        attrs = sample_graph.nodes["Alice"]
        assert attrs["type"] == "person"
        assert attrs["descriptions"] == ["Project lead"]

    def test_edge_attributes(self, sample_graph):
        attrs = sample_graph.edges["Alice", "Python"]
        assert attrs["type"] == "uses"
        assert attrs["content_source"] == "transcript_batch_0"
        assert attrs["timestamp"] == 1.5

    def test_empty_data(self):
        graph = graph_to_networkx({})
        assert graph.number_of_nodes() == 0
        assert graph.number_of_edges() == 0

    def test_nodes_only(self):
        graph = graph_to_networkx({"nodes": [{"name": "X", "type": "concept"}]})
        assert graph.number_of_nodes() == 1
        assert graph.number_of_edges() == 0

    def test_skips_empty_names(self):
        # Nodes with an empty name are dropped during conversion.
        payload = {"nodes": [{"name": "", "type": "concept"}, {"name": "A"}]}
        assert graph_to_networkx(payload).number_of_nodes() == 1

    def test_skips_empty_relationship_endpoints(self):
        # A relationship with a blank endpoint must not create an edge.
        payload = {
            "nodes": [{"name": "A"}],
            "relationships": [{"source": "", "target": "A", "type": "x"}],
        }
        assert graph_to_networkx(payload).number_of_edges() == 0
| 145 | |
| 146 | |
class TestComputeGraphStats:
    """Summary statistics computed over the sample graph."""

    def test_basic_counts(self, sample_graph):
        stats = compute_graph_stats(sample_graph)
        assert stats["node_count"] == 5
        assert stats["edge_count"] == 5

    def test_density_range(self, sample_graph):
        assert 0.0 <= compute_graph_stats(sample_graph)["density"] <= 1.0

    def test_connected_components(self, sample_graph):
        assert compute_graph_stats(sample_graph)["connected_components"] == 1

    def test_type_breakdown(self, sample_graph):
        breakdown = compute_graph_stats(sample_graph)["type_breakdown"]
        assert breakdown["person"] == 2
        assert breakdown["technology"] == 1
        assert breakdown["organization"] == 1
        assert breakdown["concept"] == 1

    def test_top_entities(self, sample_graph):
        top = compute_graph_stats(sample_graph)["top_entities"]
        assert len(top) <= 10
        # In a DiGraph degree = in + out. Alice has 3 outgoing edges
        # (Python, Bob, Acme Corp) and none incoming — degree 3, the
        # highest in the fixture — so she must rank first.
        assert top[0]["name"] == "Alice"

    def test_empty_graph(self):
        stats = compute_graph_stats(nx.DiGraph())
        assert stats["node_count"] == 0
        assert stats["connected_components"] == 0
        assert stats["top_entities"] == []
| 185 | |
| 186 | |
class TestFilterGraph:
    """Subgraph filtering by entity type and minimum degree."""

    def test_filter_by_type(self, sample_graph):
        result = filter_graph(sample_graph, entity_types=["person"])
        assert result.number_of_nodes() == 2
        assert all(attrs["type"] == "person" for _, attrs in result.nodes(data=True))

    def test_filter_by_min_degree(self, sample_graph):
        # Alice (degree 3) is guaranteed to survive min_degree=3.
        result = filter_graph(sample_graph, min_degree=3)
        assert "Alice" in result.nodes
        assert result.number_of_nodes() >= 1

    def test_filter_combined(self, sample_graph):
        result = filter_graph(sample_graph, entity_types=["person"], min_degree=1)
        assert all(result.nodes[name]["type"] == "person" for name in result.nodes)

    def test_filter_no_criteria(self, sample_graph):
        # With no filters the full node set is returned.
        assert filter_graph(sample_graph).number_of_nodes() == sample_graph.number_of_nodes()

    def test_filter_nonexistent_type(self, sample_graph):
        assert filter_graph(sample_graph, entity_types=["alien"]).number_of_nodes() == 0

    def test_filter_preserves_edges(self, sample_graph):
        # Both endpoints of Alice -> Bob are persons, so the edge survives.
        assert filter_graph(sample_graph, entity_types=["person"]).has_edge("Alice", "Bob")

    def test_filter_returns_copy(self, sample_graph):
        result = filter_graph(sample_graph, entity_types=["person"])
        result.add_node("NewNode")
        # The filtered graph must be independent of the input graph.
        assert "NewNode" not in sample_graph
| 222 | |
| 223 | |
class TestGenerateMermaid:
    """Tests for the Mermaid diagram renderer."""

    def test_output_starts_with_graph(self, sample_graph):
        assert generate_mermaid(sample_graph).startswith("graph LR")

    def test_custom_layout(self, sample_graph):
        assert generate_mermaid(sample_graph, layout="TD").startswith("graph TD")

    def test_contains_nodes(self, sample_graph):
        rendered = generate_mermaid(sample_graph)
        for name in ("Alice", "Python"):
            assert name in rendered

    def test_contains_edges(self, sample_graph):
        assert "uses" in generate_mermaid(sample_graph)

    def test_contains_class_defs(self, sample_graph):
        rendered = generate_mermaid(sample_graph)
        assert "classDef person" in rendered
        assert "classDef concept" in rendered

    def test_max_nodes_limit(self, sample_graph):
        rendered = generate_mermaid(sample_graph, max_nodes=2)
        # Only the top-2 nodes by degree should be emitted.
        node_rows = [row for row in rendered.split("\n") if '["' in row]
        assert len(node_rows) <= 2

    def test_empty_graph(self):
        import networkx as nx

        assert "graph LR" in generate_mermaid(nx.DiGraph())

    def test_sanitizes_special_chars(self):
        import networkx as nx

        G = nx.DiGraph()
        G.add_node("foo bar/baz", type="concept")
        rendered = generate_mermaid(G)
        # The node ID is sanitized while the display label keeps the raw text.
        assert "foo_bar_baz" in rendered
        assert "foo bar/baz" in rendered
| 269 | |
| 270 | |
class TestGraphToD3Json:
    """Tests for the D3-compatible JSON export."""

    def test_structure(self, sample_graph):
        payload = graph_to_d3_json(sample_graph)
        assert "nodes" in payload
        assert "links" in payload

    def test_node_format(self, sample_graph):
        payload = graph_to_d3_json(sample_graph)
        by_id = {node["id"]: node for node in payload["nodes"]}
        assert "Alice" in by_id
        assert by_id["Alice"]["group"] == "person"

    def test_link_format(self, sample_graph):
        payload = graph_to_d3_json(sample_graph)
        assert len(payload["links"]) == 5
        first_link = payload["links"][0]
        for key in ("source", "target", "type"):
            assert key in first_link

    def test_empty_graph(self):
        import networkx as nx

        assert graph_to_d3_json(nx.DiGraph()) == {"nodes": [], "links": []}
| 298 | |
| 299 | |
class TestGraphToDot:
    """Tests for the Graphviz DOT export."""

    def test_starts_with_digraph(self, sample_graph):
        assert graph_to_dot(sample_graph).startswith("digraph KnowledgeGraph {")

    def test_ends_with_closing_brace(self, sample_graph):
        assert graph_to_dot(sample_graph).strip().endswith("}")

    def test_contains_nodes(self, sample_graph):
        rendered = graph_to_dot(sample_graph)
        assert '"Alice"' in rendered
        assert '"Python"' in rendered

    def test_contains_edges(self, sample_graph):
        assert '"Alice" -> "Python"' in graph_to_dot(sample_graph)

    def test_edge_labels(self, sample_graph):
        assert 'label="uses"' in graph_to_dot(sample_graph)

    def test_node_colors(self, sample_graph):
        # Person nodes (e.g. Alice) use the pink fill colour.
        assert 'fillcolor="#f9d5e5"' in graph_to_dot(sample_graph)

    def test_empty_graph(self):
        import networkx as nx

        assert "digraph" in graph_to_dot(nx.DiGraph())

    def test_special_chars_escaped(self):
        import networkx as nx

        G = nx.DiGraph()
        G.add_node('He said "hello"', type="person")
        assert 'He said \\"hello\\"' in graph_to_dot(G)
| --- a/video_processor/agent/agent_loop.py | ||
| +++ b/video_processor/agent/agent_loop.py | ||
| @@ -0,0 +1,159 @@ | ||
| 1 | +"""Planning agent loop for synthesizing knowledge into artifacts.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +from pathlib import Path | |
| 5 | +from typing import List | |
| 6 | + | |
| 7 | +from video_processor.agent.kb_context import KBContext | |
| 8 | +from video_processor.agent.skills.base import ( | |
| 9 | + AgentContext, | |
| 10 | + Artifact, | |
| 11 | + get_skill, | |
| 12 | + list_skills, | |
| 13 | +) | |
| 14 | + | |
| 15 | +logger = logging.getLogger(__name__) | |
| 16 | + | |
| 17 | + | |
class PlanningAgent:
    """AI agent that synthesizes knowledge into planning artifacts.

    Wraps an AgentContext (knowledge graph, query engine, LLM provider,
    accumulated artifacts and conversation history) and drives skill
    selection either via the LLM or via keyword matching.
    """

    def __init__(self, context: AgentContext):
        self.context = context

    @classmethod
    def from_kb_paths(cls, kb_paths: List[Path], provider_manager=None) -> "PlanningAgent":
        """Create an agent from knowledge base paths.

        Args:
            kb_paths: Knowledge-graph files or directories to load and merge.
            provider_manager: Optional LLM provider manager; without one the
                agent falls back to keyword skill matching and cannot chat.
        """
        kb = KBContext()
        for path in kb_paths:
            kb.add_source(path)
        kb.load(provider_manager=provider_manager)

        context = AgentContext(
            knowledge_graph=kb.knowledge_graph,
            query_engine=kb.query_engine,
            provider_manager=provider_manager,
        )
        return cls(context)

    def execute(self, request: str) -> List[Artifact]:
        """Execute a user request by selecting and running appropriate skills.

        Returns the artifacts produced; each is also appended to
        ``self.context.artifacts``.
        """
        if not self.context.provider_manager:
            # No LLM configured -- fall back to keyword matching before doing
            # any KB-stats or prompt-building work that would be discarded.
            return self._keyword_match_execute(request)

        # Step 1: Build context summary for LLM
        kb_summary = ""
        if self.context.query_engine:
            stats = self.context.query_engine.stats()
            kb_summary = stats.to_text()

        available_skills = list_skills()
        skill_descriptions = "\n".join(f"- {s.name}: {s.description}" for s in available_skills)

        # Step 2: Ask LLM to select skills
        plan_prompt = (
            "You are a planning agent. Given a user request and available skills, "
            "select which skills to execute and in what order.\n\n"
            f"Knowledge base:\n{kb_summary}\n\n"
            f"Available skills:\n{skill_descriptions}\n\n"
            f"User request: {request}\n\n"
            "Return a JSON array of skill names to execute in order:\n"
            '[{"skill": "skill_name", "params": {}}]\n'
            "Return ONLY the JSON array."
        )

        raw = self.context.provider_manager.chat(
            [{"role": "user", "content": plan_prompt}],
            max_tokens=512,
            temperature=0.1,
        )

        from video_processor.utils.json_parsing import parse_json_from_response

        plan = parse_json_from_response(raw)

        # Step 3: Run each selected skill; log (rather than silently drop)
        # anything the LLM hallucinated or that cannot run in this context.
        artifacts = []
        if isinstance(plan, list):
            for step in plan:
                if not (isinstance(step, dict) and "skill" in step):
                    continue
                skill = get_skill(step["skill"])
                if skill is None:
                    logger.warning("Planner selected unknown skill: %s", step["skill"])
                    continue
                if not skill.can_execute(self.context):
                    logger.info("Skipping skill %s: prerequisites not met", skill.name)
                    continue
                params = step.get("params", {})
                artifact = skill.execute(self.context, **params)
                artifacts.append(artifact)
                self.context.artifacts.append(artifact)

        return artifacts

    def _keyword_match_execute(self, request: str) -> List[Artifact]:
        """Fallback: match skills by keywords in the request."""
        request_lower = request.lower()
        artifacts = []
        for skill in list_skills():
            # Simple keyword matching on the words of the skill name.
            skill_words = skill.name.replace("_", " ").split()
            if any(word in request_lower for word in skill_words):
                if skill.can_execute(self.context):
                    artifact = skill.execute(self.context)
                    artifacts.append(artifact)
                    self.context.artifacts.append(artifact)
        return artifacts

    def chat(self, message: str) -> str:
        """Interactive chat -- accumulate context and answer questions.

        Appends both the user message and the assistant reply to the shared
        conversation history so follow-up turns retain context.
        """
        self.context.conversation_history.append({"role": "user", "content": message})

        if not self.context.provider_manager:
            return "Agent requires a configured LLM provider for chat mode."

        # Build system context: KB stats plus any artifacts generated so far.
        kb_summary = ""
        if self.context.query_engine:
            stats = self.context.query_engine.stats()
            kb_summary = f"\n\nKnowledge base:\n{stats.to_text()}"

        artifacts_summary = ""
        if self.context.artifacts:
            artifacts_summary = "\n\nGenerated artifacts:\n" + "\n".join(
                f"- {a.name} ({a.artifact_type})" for a in self.context.artifacts
            )

        system_msg = (
            "You are PlanOpticon, an AI planning companion built into the PlanOpticon CLI. "
            "PlanOpticon is a video analysis and knowledge extraction tool that processes "
            "recordings into structured knowledge graphs.\n\n"
            "You are running inside the interactive companion REPL. The user can use these "
            "built-in commands (suggest them when relevant):\n"
            " /status - Show workspace status (loaded KG, videos, docs)\n"
            " /entities [--type T] - List knowledge graph entities\n"
            " /search TERM - Search entities by name\n"
            " /neighbors ENTITY - Show entity relationships\n"
            " /export FORMAT - Export KG (markdown, obsidian, notion, csv)\n"
            " /analyze PATH - Analyze a video or document\n"
            " /ingest PATH - Ingest a file into the knowledge graph\n"
            " /auth SERVICE - Authenticate with a service "
            "(zoom, google, microsoft, notion, dropbox, github)\n"
            " /provider [NAME] - List or switch LLM provider\n"
            " /model [NAME] - Show or switch chat model\n"
            " /plan - Generate a project plan\n"
            " /prd - Generate a PRD\n"
            " /tasks - Generate a task breakdown\n\n"
            "PlanOpticon CLI commands the user can run outside the REPL:\n"
            " planopticon auth zoom|google|microsoft - Authenticate with cloud services\n"
            " planopticon recordings zoom-list|teams-list|meet-list - List cloud recordings\n"
            " planopticon analyze -i VIDEO - Analyze a video file\n"
            " planopticon query - Query the knowledge graph\n"
            " planopticon export FORMAT PATH - Export knowledge graph\n\n"
            f"{kb_summary}{artifacts_summary}\n\n"
            "Help the user with their planning tasks. When they ask about capabilities, "
            "refer them to the appropriate built-in commands. Ask clarifying questions "
            "to gather requirements. When ready, suggest using specific skills or commands "
            "to generate artifacts."
        )

        messages = [{"role": "system", "content": system_msg}] + self.context.conversation_history

        response = self.context.provider_manager.chat(messages, max_tokens=2048, temperature=0.5)
        self.context.conversation_history.append({"role": "assistant", "content": response})
        return response
| --- a/video_processor/agent/agent_loop.py | |
| +++ b/video_processor/agent/agent_loop.py | |
| @@ -0,0 +1,159 @@ | |
| --- a/video_processor/agent/agent_loop.py | |
| +++ b/video_processor/agent/agent_loop.py | |
| @@ -0,0 +1,159 @@ | |
| 1 | """Planning agent loop for synthesizing knowledge into artifacts.""" |
| 2 | |
| 3 | import logging |
| 4 | from pathlib import Path |
| 5 | from typing import List |
| 6 | |
| 7 | from video_processor.agent.kb_context import KBContext |
| 8 | from video_processor.agent.skills.base import ( |
| 9 | AgentContext, |
| 10 | Artifact, |
| 11 | get_skill, |
| 12 | list_skills, |
| 13 | ) |
| 14 | |
| 15 | logger = logging.getLogger(__name__) |
| 16 | |
| 17 | |
class PlanningAgent:
    """AI agent that synthesizes knowledge into planning artifacts.

    Wraps an AgentContext (knowledge graph, query engine, LLM provider,
    accumulated artifacts and conversation history) and drives skill
    selection either via the LLM or via keyword matching.
    """

    def __init__(self, context: AgentContext):
        self.context = context

    @classmethod
    def from_kb_paths(cls, kb_paths: List[Path], provider_manager=None) -> "PlanningAgent":
        """Create an agent from knowledge base paths.

        Args:
            kb_paths: Knowledge-graph files or directories to load and merge.
            provider_manager: Optional LLM provider manager; without one the
                agent falls back to keyword skill matching and cannot chat.
        """
        kb = KBContext()
        for path in kb_paths:
            kb.add_source(path)
        kb.load(provider_manager=provider_manager)

        context = AgentContext(
            knowledge_graph=kb.knowledge_graph,
            query_engine=kb.query_engine,
            provider_manager=provider_manager,
        )
        return cls(context)

    def execute(self, request: str) -> List[Artifact]:
        """Execute a user request by selecting and running appropriate skills.

        Returns the artifacts produced; each is also appended to
        ``self.context.artifacts``.
        """
        if not self.context.provider_manager:
            # No LLM configured -- fall back to keyword matching before doing
            # any KB-stats or prompt-building work that would be discarded.
            return self._keyword_match_execute(request)

        # Step 1: Build context summary for LLM
        kb_summary = ""
        if self.context.query_engine:
            stats = self.context.query_engine.stats()
            kb_summary = stats.to_text()

        available_skills = list_skills()
        skill_descriptions = "\n".join(f"- {s.name}: {s.description}" for s in available_skills)

        # Step 2: Ask LLM to select skills
        plan_prompt = (
            "You are a planning agent. Given a user request and available skills, "
            "select which skills to execute and in what order.\n\n"
            f"Knowledge base:\n{kb_summary}\n\n"
            f"Available skills:\n{skill_descriptions}\n\n"
            f"User request: {request}\n\n"
            "Return a JSON array of skill names to execute in order:\n"
            '[{"skill": "skill_name", "params": {}}]\n'
            "Return ONLY the JSON array."
        )

        raw = self.context.provider_manager.chat(
            [{"role": "user", "content": plan_prompt}],
            max_tokens=512,
            temperature=0.1,
        )

        from video_processor.utils.json_parsing import parse_json_from_response

        plan = parse_json_from_response(raw)

        # Step 3: Run each selected skill; log (rather than silently drop)
        # anything the LLM hallucinated or that cannot run in this context.
        artifacts = []
        if isinstance(plan, list):
            for step in plan:
                if not (isinstance(step, dict) and "skill" in step):
                    continue
                skill = get_skill(step["skill"])
                if skill is None:
                    logger.warning("Planner selected unknown skill: %s", step["skill"])
                    continue
                if not skill.can_execute(self.context):
                    logger.info("Skipping skill %s: prerequisites not met", skill.name)
                    continue
                params = step.get("params", {})
                artifact = skill.execute(self.context, **params)
                artifacts.append(artifact)
                self.context.artifacts.append(artifact)

        return artifacts

    def _keyword_match_execute(self, request: str) -> List[Artifact]:
        """Fallback: match skills by keywords in the request."""
        request_lower = request.lower()
        artifacts = []
        for skill in list_skills():
            # Simple keyword matching on the words of the skill name.
            skill_words = skill.name.replace("_", " ").split()
            if any(word in request_lower for word in skill_words):
                if skill.can_execute(self.context):
                    artifact = skill.execute(self.context)
                    artifacts.append(artifact)
                    self.context.artifacts.append(artifact)
        return artifacts

    def chat(self, message: str) -> str:
        """Interactive chat -- accumulate context and answer questions.

        Appends both the user message and the assistant reply to the shared
        conversation history so follow-up turns retain context.
        """
        self.context.conversation_history.append({"role": "user", "content": message})

        if not self.context.provider_manager:
            return "Agent requires a configured LLM provider for chat mode."

        # Build system context: KB stats plus any artifacts generated so far.
        kb_summary = ""
        if self.context.query_engine:
            stats = self.context.query_engine.stats()
            kb_summary = f"\n\nKnowledge base:\n{stats.to_text()}"

        artifacts_summary = ""
        if self.context.artifacts:
            artifacts_summary = "\n\nGenerated artifacts:\n" + "\n".join(
                f"- {a.name} ({a.artifact_type})" for a in self.context.artifacts
            )

        system_msg = (
            "You are PlanOpticon, an AI planning companion built into the PlanOpticon CLI. "
            "PlanOpticon is a video analysis and knowledge extraction tool that processes "
            "recordings into structured knowledge graphs.\n\n"
            "You are running inside the interactive companion REPL. The user can use these "
            "built-in commands (suggest them when relevant):\n"
            " /status - Show workspace status (loaded KG, videos, docs)\n"
            " /entities [--type T] - List knowledge graph entities\n"
            " /search TERM - Search entities by name\n"
            " /neighbors ENTITY - Show entity relationships\n"
            " /export FORMAT - Export KG (markdown, obsidian, notion, csv)\n"
            " /analyze PATH - Analyze a video or document\n"
            " /ingest PATH - Ingest a file into the knowledge graph\n"
            " /auth SERVICE - Authenticate with a service "
            "(zoom, google, microsoft, notion, dropbox, github)\n"
            " /provider [NAME] - List or switch LLM provider\n"
            " /model [NAME] - Show or switch chat model\n"
            " /plan - Generate a project plan\n"
            " /prd - Generate a PRD\n"
            " /tasks - Generate a task breakdown\n\n"
            "PlanOpticon CLI commands the user can run outside the REPL:\n"
            " planopticon auth zoom|google|microsoft - Authenticate with cloud services\n"
            " planopticon recordings zoom-list|teams-list|meet-list - List cloud recordings\n"
            " planopticon analyze -i VIDEO - Analyze a video file\n"
            " planopticon query - Query the knowledge graph\n"
            " planopticon export FORMAT PATH - Export knowledge graph\n\n"
            f"{kb_summary}{artifacts_summary}\n\n"
            "Help the user with their planning tasks. When they ask about capabilities, "
            "refer them to the appropriate built-in commands. Ask clarifying questions "
            "to gather requirements. When ready, suggest using specific skills or commands "
            "to generate artifacts."
        )

        messages = [{"role": "system", "content": system_msg}] + self.context.conversation_history

        response = self.context.provider_manager.chat(messages, max_tokens=2048, temperature=0.5)
        self.context.conversation_history.append({"role": "assistant", "content": response})
        return response
| --- a/video_processor/agent/kb_context.py | ||
| +++ b/video_processor/agent/kb_context.py | ||
| @@ -0,0 +1,98 @@ | ||
| 1 | +"""Knowledge base context manager for loading and merging knowledge graphs.""" | |
| 2 | + | |
| 3 | +import json | |
| 4 | +import logging | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import List, Optional | |
| 7 | + | |
| 8 | +logger = logging.getLogger(__name__) | |
| 9 | + | |
| 10 | + | |
class KBContext:
    """Load and merge multiple knowledge graphs into a unified context."""

    def __init__(self):
        # Resolved source paths queued for load().
        self._sources: List[Path] = []
        self._kg = None  # Merged KnowledgeGraph instance, set by load()
        self._engine = None  # GraphQueryEngine over the merged graph, set by load()

    def add_source(self, path) -> None:
        """Add a knowledge graph source (.db or .json file, or directory to search).

        Raises:
            FileNotFoundError: If *path* does not exist.
        """
        path = Path(path).resolve()
        if path.is_dir():
            from video_processor.integrators.graph_discovery import find_knowledge_graphs

            graphs = find_knowledge_graphs(path)
            self._sources.extend(graphs)
        elif path.is_file():
            self._sources.append(path)
        else:
            raise FileNotFoundError(f"Not found: {path}")

    def load(self, provider_manager=None) -> "KBContext":
        """Load and merge all added sources into a single knowledge graph.

        Sources with unrecognized extensions are skipped with a warning.
        Returns self for chaining.
        """
        from video_processor.integrators.graph_query import GraphQueryEngine
        from video_processor.integrators.knowledge_graph import KnowledgeGraph

        self._kg = KnowledgeGraph(provider_manager=provider_manager)

        for source_path in self._sources:
            if source_path.suffix == ".db":
                other = KnowledgeGraph(db_path=source_path)
                self._kg.merge(other)
            elif source_path.suffix == ".json":
                data = json.loads(source_path.read_text())
                other = KnowledgeGraph.from_dict(data)
                self._kg.merge(other)
            else:
                # Previously silently ignored; make the skip visible.
                logger.warning("Skipping source with unrecognized suffix: %s", source_path)

        self._engine = GraphQueryEngine(self._kg._store, provider_manager=provider_manager)
        return self

    @property
    def knowledge_graph(self):
        """Return the merged KnowledgeGraph.

        Raises:
            RuntimeError: If load() has not been called yet.
        """
        if not self._kg:
            raise RuntimeError("Call load() first")
        return self._kg

    @property
    def query_engine(self):
        """Return the GraphQueryEngine.

        Raises:
            RuntimeError: If load() has not been called yet.
        """
        if not self._engine:
            raise RuntimeError("Call load() first")
        return self._engine

    @property
    def sources(self) -> List[Path]:
        """Return a copy of the list of source paths."""
        return list(self._sources)

    def summary(self) -> str:
        """Generate a brief summary of the loaded knowledge base."""
        # Guard both handles: load() sets them together, but a load that
        # raised midway can leave the engine unset while _kg exists.
        if not self._kg or not self._engine:
            return "No knowledge base loaded."

        stats = self._engine.stats().data
        lines = [
            f"Knowledge base: {len(self._sources)} source(s)",
            f" Entities: {stats['entity_count']}",
            f" Relationships: {stats['relationship_count']}",
        ]
        if stats.get("entity_types"):
            lines.append(" Entity types:")
            for t, count in sorted(stats["entity_types"].items(), key=lambda x: -x[1]):
                lines.append(f" {t}: {count}")
        return "\n".join(lines)

    @classmethod
    def auto_discover(cls, start_dir: Optional[Path] = None, provider_manager=None) -> "KBContext":
        """Create a KBContext by auto-discovering knowledge graphs near start_dir."""
        from video_processor.integrators.graph_discovery import find_knowledge_graphs

        ctx = cls()
        ctx._sources.extend(find_knowledge_graphs(start_dir))
        if ctx._sources:
            ctx.load(provider_manager=provider_manager)
        return ctx
| --- a/video_processor/agent/kb_context.py | |
| +++ b/video_processor/agent/kb_context.py | |
| @@ -0,0 +1,98 @@ | |
| --- a/video_processor/agent/kb_context.py | |
| +++ b/video_processor/agent/kb_context.py | |
| @@ -0,0 +1,98 @@ | |
| 1 | """Knowledge base context manager for loading and merging knowledge graphs.""" |
| 2 | |
| 3 | import json |
| 4 | import logging |
| 5 | from pathlib import Path |
| 6 | from typing import List, Optional |
| 7 | |
| 8 | logger = logging.getLogger(__name__) |
| 9 | |
| 10 | |
class KBContext:
    """Load and merge multiple knowledge graphs into a unified context."""

    def __init__(self):
        # Resolved source paths queued for load().
        self._sources: List[Path] = []
        self._kg = None  # Merged KnowledgeGraph instance, set by load()
        self._engine = None  # GraphQueryEngine over the merged graph, set by load()

    def add_source(self, path) -> None:
        """Add a knowledge graph source (.db or .json file, or directory to search).

        Raises:
            FileNotFoundError: If *path* does not exist.
        """
        path = Path(path).resolve()
        if path.is_dir():
            from video_processor.integrators.graph_discovery import find_knowledge_graphs

            graphs = find_knowledge_graphs(path)
            self._sources.extend(graphs)
        elif path.is_file():
            self._sources.append(path)
        else:
            raise FileNotFoundError(f"Not found: {path}")

    def load(self, provider_manager=None) -> "KBContext":
        """Load and merge all added sources into a single knowledge graph.

        Sources with unrecognized extensions are skipped with a warning.
        Returns self for chaining.
        """
        from video_processor.integrators.graph_query import GraphQueryEngine
        from video_processor.integrators.knowledge_graph import KnowledgeGraph

        self._kg = KnowledgeGraph(provider_manager=provider_manager)

        for source_path in self._sources:
            if source_path.suffix == ".db":
                other = KnowledgeGraph(db_path=source_path)
                self._kg.merge(other)
            elif source_path.suffix == ".json":
                data = json.loads(source_path.read_text())
                other = KnowledgeGraph.from_dict(data)
                self._kg.merge(other)
            else:
                # Previously silently ignored; make the skip visible.
                logger.warning("Skipping source with unrecognized suffix: %s", source_path)

        self._engine = GraphQueryEngine(self._kg._store, provider_manager=provider_manager)
        return self

    @property
    def knowledge_graph(self):
        """Return the merged KnowledgeGraph.

        Raises:
            RuntimeError: If load() has not been called yet.
        """
        if not self._kg:
            raise RuntimeError("Call load() first")
        return self._kg

    @property
    def query_engine(self):
        """Return the GraphQueryEngine.

        Raises:
            RuntimeError: If load() has not been called yet.
        """
        if not self._engine:
            raise RuntimeError("Call load() first")
        return self._engine

    @property
    def sources(self) -> List[Path]:
        """Return a copy of the list of source paths."""
        return list(self._sources)

    def summary(self) -> str:
        """Generate a brief summary of the loaded knowledge base."""
        # Guard both handles: load() sets them together, but a load that
        # raised midway can leave the engine unset while _kg exists.
        if not self._kg or not self._engine:
            return "No knowledge base loaded."

        stats = self._engine.stats().data
        lines = [
            f"Knowledge base: {len(self._sources)} source(s)",
            f" Entities: {stats['entity_count']}",
            f" Relationships: {stats['relationship_count']}",
        ]
        if stats.get("entity_types"):
            lines.append(" Entity types:")
            for t, count in sorted(stats["entity_types"].items(), key=lambda x: -x[1]):
                lines.append(f" {t}: {count}")
        return "\n".join(lines)

    @classmethod
    def auto_discover(cls, start_dir: Optional[Path] = None, provider_manager=None) -> "KBContext":
        """Create a KBContext by auto-discovering knowledge graphs near start_dir."""
        from video_processor.integrators.graph_discovery import find_knowledge_graphs

        ctx = cls()
        ctx._sources.extend(find_knowledge_graphs(start_dir))
        if ctx._sources:
            ctx.load(provider_manager=provider_manager)
        return ctx
| --- video_processor/agent/orchestrator.py | ||
| +++ video_processor/agent/orchestrator.py | ||
| @@ -200,10 +200,11 @@ | ||
| 200 | 200 | diagram_result = self._results.get("detect_diagrams", {}) |
| 201 | 201 | diagrams = diagram_result.get("diagrams", []) |
| 202 | 202 | if diagrams: |
| 203 | 203 | kg.process_diagrams([d.model_dump() for d in diagrams]) |
| 204 | 204 | |
| 205 | + # Export JSON copy alongside the SQLite db | |
| 205 | 206 | kg.save(dirs["results"] / "knowledge_graph.json") |
| 206 | 207 | return {"knowledge_graph": kg} |
| 207 | 208 | |
| 208 | 209 | elif step_name == "extract_key_points": |
| 209 | 210 | transcript = self._results.get("transcribe", {}) |
| 210 | 211 | |
| 211 | 212 | ADDED video_processor/agent/skills/__init__.py |
| 212 | 213 | ADDED video_processor/agent/skills/artifact_export.py |
| 213 | 214 | ADDED video_processor/agent/skills/base.py |
| 214 | 215 | ADDED video_processor/agent/skills/cli_adapter.py |
| 215 | 216 | ADDED video_processor/agent/skills/doc_generator.py |
| 216 | 217 | ADDED video_processor/agent/skills/github_integration.py |
| 217 | 218 | ADDED video_processor/agent/skills/notes_export.py |
| 218 | 219 | ADDED video_processor/agent/skills/prd.py |
| 219 | 220 | ADDED video_processor/agent/skills/project_plan.py |
| 220 | 221 | ADDED video_processor/agent/skills/requirements_chat.py |
| 221 | 222 | ADDED video_processor/agent/skills/roadmap.py |
| 222 | 223 | ADDED video_processor/agent/skills/task_breakdown.py |
| 223 | 224 | ADDED video_processor/agent/skills/wiki_generator.py |
| 224 | 225 | ADDED video_processor/api/openapi_spec.py |
| 225 | 226 | ADDED video_processor/auth.py |
| --- video_processor/agent/orchestrator.py | |
| +++ video_processor/agent/orchestrator.py | |
| @@ -200,10 +200,11 @@ | |
| 200 | diagram_result = self._results.get("detect_diagrams", {}) |
| 201 | diagrams = diagram_result.get("diagrams", []) |
| 202 | if diagrams: |
| 203 | kg.process_diagrams([d.model_dump() for d in diagrams]) |
| 204 | |
| 205 | kg.save(dirs["results"] / "knowledge_graph.json") |
| 206 | return {"knowledge_graph": kg} |
| 207 | |
| 208 | elif step_name == "extract_key_points": |
| 209 | transcript = self._results.get("transcribe", {}) |
| 210 | |
| 211 | ADDED video_processor/agent/skills/__init__.py |
| 212 | ADDED video_processor/agent/skills/artifact_export.py |
| 213 | ADDED video_processor/agent/skills/base.py |
| 214 | ADDED video_processor/agent/skills/cli_adapter.py |
| 215 | ADDED video_processor/agent/skills/doc_generator.py |
| 216 | ADDED video_processor/agent/skills/github_integration.py |
| 217 | ADDED video_processor/agent/skills/notes_export.py |
| 218 | ADDED video_processor/agent/skills/prd.py |
| 219 | ADDED video_processor/agent/skills/project_plan.py |
| 220 | ADDED video_processor/agent/skills/requirements_chat.py |
| 221 | ADDED video_processor/agent/skills/roadmap.py |
| 222 | ADDED video_processor/agent/skills/task_breakdown.py |
| 223 | ADDED video_processor/agent/skills/wiki_generator.py |
| 224 | ADDED video_processor/api/openapi_spec.py |
| 225 | ADDED video_processor/auth.py |
| --- video_processor/agent/orchestrator.py | |
| +++ video_processor/agent/orchestrator.py | |
| @@ -200,10 +200,11 @@ | |
| 200 | diagram_result = self._results.get("detect_diagrams", {}) |
| 201 | diagrams = diagram_result.get("diagrams", []) |
| 202 | if diagrams: |
| 203 | kg.process_diagrams([d.model_dump() for d in diagrams]) |
| 204 | |
| 205 | # Export JSON copy alongside the SQLite db |
| 206 | kg.save(dirs["results"] / "knowledge_graph.json") |
| 207 | return {"knowledge_graph": kg} |
| 208 | |
| 209 | elif step_name == "extract_key_points": |
| 210 | transcript = self._results.get("transcribe", {}) |
| 211 | |
| 212 | DDED video_processor/agent/skills/__init__.py |
| 213 | DDED video_processor/agent/skills/artifact_export.py |
| 214 | DDED video_processor/agent/skills/base.py |
| 215 | DDED video_processor/agent/skills/cli_adapter.py |
| 216 | DDED video_processor/agent/skills/doc_generator.py |
| 217 | DDED video_processor/agent/skills/github_integration.py |
| 218 | DDED video_processor/agent/skills/notes_export.py |
| 219 | DDED video_processor/agent/skills/prd.py |
| 220 | DDED video_processor/agent/skills/project_plan.py |
| 221 | DDED video_processor/agent/skills/requirements_chat.py |
| 222 | DDED video_processor/agent/skills/roadmap.py |
| 223 | DDED video_processor/agent/skills/task_breakdown.py |
| 224 | DDED video_processor/agent/skills/wiki_generator.py |
| 225 | DDED video_processor/api/openapi_spec.py |
| 226 | DDED video_processor/auth.py |
| --- a/video_processor/agent/skills/__init__.py | ||
| +++ b/video_processor/agent/skills/__init__.py | ||
| @@ -0,0 +1,33 @@ | ||
| 1 | +"""Agent skill system for PlanOpticon.""" | |
| 2 | + | |
| 3 | +# Import skill modules so they self-register via register_skill(). | |
| 4 | +from video_processor.agent.skills import ( # noqa: F401 | |
| 5 | + artifact_export, | |
| 6 | + cli_adapter, | |
| 7 | + doc_generator, | |
| 8 | + github_integration, | |
| 9 | + notes_export, | |
| 10 | + prd, | |
| 11 | + project_plan, | |
| 12 | + requirements_chat, | |
| 13 | + roadmap, | |
| 14 | + task_breakdown, | |
| 15 | + wiki_generator, | |
| 16 | +) | |
| 17 | +from video_processor.agent.skills.base import ( | |
| 18 | + AgentContext, | |
| 19 | + Artifact, | |
| 20 | + Skill, | |
| 21 | + get_skill, | |
| 22 | + list_skills, | |
| 23 | + register_skill, | |
| 24 | +) | |
| 25 | + | |
| 26 | +__all__ = [ | |
| 27 | + "AgentContext", | |
| 28 | + "Artifact", | |
| 29 | + "Skill", | |
| 30 | + "get_skill", | |
| 31 | + "list_skills", | |
| 32 | + "register_skill", | |
| 33 | +] |
| --- a/video_processor/agent/skills/__init__.py | |
| +++ b/video_processor/agent/skills/__init__.py | |
| @@ -0,0 +1,33 @@ | |
| --- a/video_processor/agent/skills/__init__.py | |
| +++ b/video_processor/agent/skills/__init__.py | |
| @@ -0,0 +1,33 @@ | |
| 1 | """Agent skill system for PlanOpticon.""" |
| 2 | |
| 3 | # Import skill modules so they self-register via register_skill(). |
| 4 | from video_processor.agent.skills import ( # noqa: F401 |
| 5 | artifact_export, |
| 6 | cli_adapter, |
| 7 | doc_generator, |
| 8 | github_integration, |
| 9 | notes_export, |
| 10 | prd, |
| 11 | project_plan, |
| 12 | requirements_chat, |
| 13 | roadmap, |
| 14 | task_breakdown, |
| 15 | wiki_generator, |
| 16 | ) |
| 17 | from video_processor.agent.skills.base import ( |
| 18 | AgentContext, |
| 19 | Artifact, |
| 20 | Skill, |
| 21 | get_skill, |
| 22 | list_skills, |
| 23 | register_skill, |
| 24 | ) |
| 25 | |
| 26 | __all__ = [ |
| 27 | "AgentContext", |
| 28 | "Artifact", |
| 29 | "Skill", |
| 30 | "get_skill", |
| 31 | "list_skills", |
| 32 | "register_skill", |
| 33 | ] |
| --- a/video_processor/agent/skills/artifact_export.py | ||
| +++ b/video_processor/agent/skills/artifact_export.py | ||
| @@ -0,0 +1,94 @@ | ||
| 1 | +"""Skill: Export artifacts in agent-ready formats to a directory structure.""" | |
| 2 | + | |
| 3 | +import json | |
| 4 | +from pathlib import Path | |
| 5 | + | |
| 6 | +from video_processor.agent.skills.base import AgentContext, Artifact, Skill, register_skill | |
| 7 | + | |
| 8 | +# Maps artifact_type to output filename | |
| 9 | +_TYPE_TO_FILE = { | |
| 10 | + "project_plan": "project_plan.md", | |
| 11 | + "prd": "prd.md", | |
| 12 | + "roadmap": "roadmap.md", | |
| 13 | + "task_list": "tasks.json", | |
| 14 | + "issues": "issues.json", | |
| 15 | + "requirements": "requirements.json", | |
| 16 | +} | |
| 17 | + | |
| 18 | + | |
| 19 | +def _write_artifact(artifact: Artifact, output_dir: Path) -> dict: | |
| 20 | + """Write a single artifact to the appropriate file. Returns manifest entry.""" | |
| 21 | + filename = _TYPE_TO_FILE.get(artifact.artifact_type) | |
| 22 | + if filename: | |
| 23 | + dest = output_dir / filename | |
| 24 | + elif artifact.artifact_type == "document": | |
| 25 | + docs_dir = output_dir / "docs" | |
| 26 | + docs_dir.mkdir(parents=True, exist_ok=True) | |
| 27 | + safe_name = artifact.name.replace(" ", "_").replace("/", "_").lower() | |
| 28 | + ext = ".json" if artifact.format == "json" else ".md" | |
| 29 | + dest = docs_dir / f"{safe_name}{ext}" | |
| 30 | + else: | |
| 31 | + safe_name = artifact.name.replace(" ", "_").replace("/", "_").lower() | |
| 32 | + ext = ".json" if artifact.format == "json" else ".md" | |
| 33 | + dest = output_dir / f"{safe_name}{ext}" | |
| 34 | + | |
| 35 | + dest.write_text(artifact.content, encoding="utf-8") | |
| 36 | + return { | |
| 37 | + "file": str(dest), | |
| 38 | + "name": artifact.name, | |
| 39 | + "artifact_type": artifact.artifact_type, | |
| 40 | + "format": artifact.format, | |
| 41 | + } | |
| 42 | + | |
| 43 | + | |
| 44 | +class ArtifactExportSkill(Skill): | |
| 45 | + name = "artifact_export" | |
| 46 | + description = "Export artifacts in agent-ready formats" | |
| 47 | + | |
| 48 | + def execute(self, context: AgentContext, **kwargs) -> Artifact: | |
| 49 | + output_dir = Path(kwargs.get("output_dir", "plan")) | |
| 50 | + output_dir.mkdir(parents=True, exist_ok=True) | |
| 51 | + | |
| 52 | + manifest_entries = [] | |
| 53 | + for artifact in context.artifacts: | |
| 54 | + entry = _write_artifact(artifact, output_dir) | |
| 55 | + manifest_entries.append(entry) | |
| 56 | + | |
| 57 | + manifest = { | |
| 58 | + "artifact_count": len(manifest_entries), | |
| 59 | + "output_dir": str(output_dir), | |
| 60 | + "files": manifest_entries, | |
| 61 | + } | |
| 62 | + manifest_path = output_dir / "manifest.json" | |
| 63 | + manifest_json = json.dumps(manifest, indent=2) | |
| 64 | + manifest_path.write_text(manifest_json, encoding="utf-8") | |
| 65 | + | |
| 66 | + return Artifact( | |
| 67 | + name="Export Manifest", | |
| 68 | + content=manifest_json, | |
| 69 | + artifact_type="export_manifest", | |
| 70 | + format="json", | |
| 71 | + ) | |
| 72 | + | |
| 73 | + | |
| 74 | +register_skill(ArtifactExportSkill()) | |
| 75 | + | |
| 76 | + | |
| 77 | +def export_artifacts(artifacts: list, output_dir: Path) -> dict: | |
| 78 | + """Standalone helper: export a list of Artifact objects to a directory.""" | |
| 79 | + output_dir = Path(output_dir) | |
| 80 | + output_dir.mkdir(parents=True, exist_ok=True) | |
| 81 | + | |
| 82 | + manifest_entries = [] | |
| 83 | + for artifact in artifacts: | |
| 84 | + entry = _write_artifact(artifact, output_dir) | |
| 85 | + manifest_entries.append(entry) | |
| 86 | + | |
| 87 | + manifest = { | |
| 88 | + "artifact_count": len(manifest_entries), | |
| 89 | + "output_dir": str(output_dir), | |
| 90 | + "files": manifest_entries, | |
| 91 | + } | |
| 92 | + manifest_path = output_dir / "manifest.json" | |
| 93 | + manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8") | |
| 94 | + return manifest |
| --- a/video_processor/agent/skills/artifact_export.py | |
| +++ b/video_processor/agent/skills/artifact_export.py | |
| @@ -0,0 +1,94 @@ | |
| --- a/video_processor/agent/skills/artifact_export.py | |
| +++ b/video_processor/agent/skills/artifact_export.py | |
| @@ -0,0 +1,94 @@ | |
| 1 | """Skill: Export artifacts in agent-ready formats to a directory structure.""" |
| 2 | |
| 3 | import json |
| 4 | from pathlib import Path |
| 5 | |
| 6 | from video_processor.agent.skills.base import AgentContext, Artifact, Skill, register_skill |
| 7 | |
| 8 | # Maps artifact_type to output filename |
| 9 | _TYPE_TO_FILE = { |
| 10 | "project_plan": "project_plan.md", |
| 11 | "prd": "prd.md", |
| 12 | "roadmap": "roadmap.md", |
| 13 | "task_list": "tasks.json", |
| 14 | "issues": "issues.json", |
| 15 | "requirements": "requirements.json", |
| 16 | } |
| 17 | |
| 18 | |
| 19 | def _write_artifact(artifact: Artifact, output_dir: Path) -> dict: |
| 20 | """Write a single artifact to the appropriate file. Returns manifest entry.""" |
| 21 | filename = _TYPE_TO_FILE.get(artifact.artifact_type) |
| 22 | if filename: |
| 23 | dest = output_dir / filename |
| 24 | elif artifact.artifact_type == "document": |
| 25 | docs_dir = output_dir / "docs" |
| 26 | docs_dir.mkdir(parents=True, exist_ok=True) |
| 27 | safe_name = artifact.name.replace(" ", "_").replace("/", "_").lower() |
| 28 | ext = ".json" if artifact.format == "json" else ".md" |
| 29 | dest = docs_dir / f"{safe_name}{ext}" |
| 30 | else: |
| 31 | safe_name = artifact.name.replace(" ", "_").replace("/", "_").lower() |
| 32 | ext = ".json" if artifact.format == "json" else ".md" |
| 33 | dest = output_dir / f"{safe_name}{ext}" |
| 34 | |
| 35 | dest.write_text(artifact.content, encoding="utf-8") |
| 36 | return { |
| 37 | "file": str(dest), |
| 38 | "name": artifact.name, |
| 39 | "artifact_type": artifact.artifact_type, |
| 40 | "format": artifact.format, |
| 41 | } |
| 42 | |
| 43 | |
| 44 | class ArtifactExportSkill(Skill): |
| 45 | name = "artifact_export" |
| 46 | description = "Export artifacts in agent-ready formats" |
| 47 | |
| 48 | def execute(self, context: AgentContext, **kwargs) -> Artifact: |
| 49 | output_dir = Path(kwargs.get("output_dir", "plan")) |
| 50 | output_dir.mkdir(parents=True, exist_ok=True) |
| 51 | |
| 52 | manifest_entries = [] |
| 53 | for artifact in context.artifacts: |
| 54 | entry = _write_artifact(artifact, output_dir) |
| 55 | manifest_entries.append(entry) |
| 56 | |
| 57 | manifest = { |
| 58 | "artifact_count": len(manifest_entries), |
| 59 | "output_dir": str(output_dir), |
| 60 | "files": manifest_entries, |
| 61 | } |
| 62 | manifest_path = output_dir / "manifest.json" |
| 63 | manifest_json = json.dumps(manifest, indent=2) |
| 64 | manifest_path.write_text(manifest_json, encoding="utf-8") |
| 65 | |
| 66 | return Artifact( |
| 67 | name="Export Manifest", |
| 68 | content=manifest_json, |
| 69 | artifact_type="export_manifest", |
| 70 | format="json", |
| 71 | ) |
| 72 | |
| 73 | |
| 74 | register_skill(ArtifactExportSkill()) |
| 75 | |
| 76 | |
| 77 | def export_artifacts(artifacts: list, output_dir: Path) -> dict: |
| 78 | """Standalone helper: export a list of Artifact objects to a directory.""" |
| 79 | output_dir = Path(output_dir) |
| 80 | output_dir.mkdir(parents=True, exist_ok=True) |
| 81 | |
| 82 | manifest_entries = [] |
| 83 | for artifact in artifacts: |
| 84 | entry = _write_artifact(artifact, output_dir) |
| 85 | manifest_entries.append(entry) |
| 86 | |
| 87 | manifest = { |
| 88 | "artifact_count": len(manifest_entries), |
| 89 | "output_dir": str(output_dir), |
| 90 | "files": manifest_entries, |
| 91 | } |
| 92 | manifest_path = output_dir / "manifest.json" |
| 93 | manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8") |
| 94 | return manifest |
| --- a/video_processor/agent/skills/base.py | ||
| +++ b/video_processor/agent/skills/base.py | ||
| @@ -0,0 +1,65 @@ | ||
| 1 | +"""Skill interface for the PlanOpticon planning agent.""" | |
| 2 | + | |
| 3 | +from abc import ABC, abstractmethod | |
| 4 | +from dataclasses import dataclass, field | |
| 5 | +from typing import Any, Dict, List, Optional | |
| 6 | + | |
| 7 | + | |
| 8 | +@dataclass | |
| 9 | +class Artifact: | |
| 10 | + """Output from a skill execution.""" | |
| 11 | + | |
| 12 | + name: str | |
| 13 | + content: str # The generated content (markdown, json, etc.) | |
| 14 | + artifact_type: str # "project_plan", "prd", "roadmap", "task_list", "document", "issues" | |
| 15 | + format: str = "markdown" # "markdown", "json", "mermaid" | |
| 16 | + metadata: Dict[str, Any] = field(default_factory=dict) | |
| 17 | + | |
| 18 | + | |
| 19 | +@dataclass | |
| 20 | +class AgentContext: | |
| 21 | + """Shared context for agent skills.""" | |
| 22 | + | |
| 23 | + knowledge_graph: Any = None # KnowledgeGraph instance | |
| 24 | + query_engine: Any = None # GraphQueryEngine instance | |
| 25 | + provider_manager: Any = None # ProviderManager instance | |
| 26 | + planning_entities: List[Any] = field(default_factory=list) | |
| 27 | + user_requirements: Dict[str, Any] = field(default_factory=dict) | |
| 28 | + conversation_history: List[Dict[str, str]] = field(default_factory=list) | |
| 29 | + artifacts: List[Artifact] = field(default_factory=list) | |
| 30 | + config: Dict[str, Any] = field(default_factory=dict) | |
| 31 | + | |
| 32 | + | |
| 33 | +class Skill(ABC): | |
| 34 | + """Base class for agent skills.""" | |
| 35 | + | |
| 36 | + name: str = "" | |
| 37 | + description: str = "" | |
| 38 | + | |
| 39 | + @abstractmethod | |
| 40 | + def execute(self, context: AgentContext, **kwargs) -> Artifact: | |
| 41 | + """Execute this skill and return an artifact.""" | |
| 42 | + ... | |
| 43 | + | |
| 44 | + def can_execute(self, context: AgentContext) -> bool: | |
| 45 | + """Check if this skill can execute given the current context.""" | |
| 46 | + return context.knowledge_graph is not None and context.provider_manager is not None | |
| 47 | + | |
| 48 | + | |
| 49 | +# Skill registry | |
| 50 | +_skills: Dict[str, "Skill"] = {} | |
| 51 | + | |
| 52 | + | |
| 53 | +def register_skill(skill: "Skill") -> None: | |
| 54 | + """Register a skill instance in the global registry.""" | |
| 55 | + _skills[skill.name] = skill | |
| 56 | + | |
| 57 | + | |
| 58 | +def get_skill(name: str) -> Optional["Skill"]: | |
| 59 | + """Look up a skill by name.""" | |
| 60 | + return _skills.get(name) | |
| 61 | + | |
| 62 | + | |
| 63 | +def list_skills() -> List["Skill"]: | |
| 64 | + """Return all registered skills.""" | |
| 65 | + return list(_skills.values()) |
| --- a/video_processor/agent/skills/base.py | |
| +++ b/video_processor/agent/skills/base.py | |
| @@ -0,0 +1,65 @@ | |
| --- a/video_processor/agent/skills/base.py | |
| +++ b/video_processor/agent/skills/base.py | |
| @@ -0,0 +1,65 @@ | |
| 1 | """Skill interface for the PlanOpticon planning agent.""" |
| 2 | |
| 3 | from abc import ABC, abstractmethod |
| 4 | from dataclasses import dataclass, field |
| 5 | from typing import Any, Dict, List, Optional |
| 6 | |
| 7 | |
| 8 | @dataclass |
| 9 | class Artifact: |
| 10 | """Output from a skill execution.""" |
| 11 | |
| 12 | name: str |
| 13 | content: str # The generated content (markdown, json, etc.) |
| 14 | artifact_type: str # "project_plan", "prd", "roadmap", "task_list", "document", "issues" |
| 15 | format: str = "markdown" # "markdown", "json", "mermaid" |
| 16 | metadata: Dict[str, Any] = field(default_factory=dict) |
| 17 | |
| 18 | |
| 19 | @dataclass |
| 20 | class AgentContext: |
| 21 | """Shared context for agent skills.""" |
| 22 | |
| 23 | knowledge_graph: Any = None # KnowledgeGraph instance |
| 24 | query_engine: Any = None # GraphQueryEngine instance |
| 25 | provider_manager: Any = None # ProviderManager instance |
| 26 | planning_entities: List[Any] = field(default_factory=list) |
| 27 | user_requirements: Dict[str, Any] = field(default_factory=dict) |
| 28 | conversation_history: List[Dict[str, str]] = field(default_factory=list) |
| 29 | artifacts: List[Artifact] = field(default_factory=list) |
| 30 | config: Dict[str, Any] = field(default_factory=dict) |
| 31 | |
| 32 | |
| 33 | class Skill(ABC): |
| 34 | """Base class for agent skills.""" |
| 35 | |
| 36 | name: str = "" |
| 37 | description: str = "" |
| 38 | |
| 39 | @abstractmethod |
| 40 | def execute(self, context: AgentContext, **kwargs) -> Artifact: |
| 41 | """Execute this skill and return an artifact.""" |
| 42 | ... |
| 43 | |
| 44 | def can_execute(self, context: AgentContext) -> bool: |
| 45 | """Check if this skill can execute given the current context.""" |
| 46 | return context.knowledge_graph is not None and context.provider_manager is not None |
| 47 | |
| 48 | |
| 49 | # Skill registry |
| 50 | _skills: Dict[str, "Skill"] = {} |
| 51 | |
| 52 | |
| 53 | def register_skill(skill: "Skill") -> None: |
| 54 | """Register a skill instance in the global registry.""" |
| 55 | _skills[skill.name] = skill |
| 56 | |
| 57 | |
| 58 | def get_skill(name: str) -> Optional["Skill"]: |
| 59 | """Look up a skill by name.""" |
| 60 | return _skills.get(name) |
| 61 | |
| 62 | |
| 63 | def list_skills() -> List["Skill"]: |
| 64 | """Return all registered skills.""" |
| 65 | return list(_skills.values()) |
| --- a/video_processor/agent/skills/cli_adapter.py | ||
| +++ b/video_processor/agent/skills/cli_adapter.py | ||
| @@ -0,0 +1,99 @@ | ||
| 1 | +"""Skill: Push artifacts to external tools via their CLIs.""" | |
| 2 | + | |
| 3 | +import json | |
| 4 | +import shutil | |
| 5 | +import subprocess | |
| 6 | +from typing import List | |
| 7 | + | |
| 8 | +from video_processor.agent.skills.base import AgentContext, Artifact, Skill, register_skill | |
| 9 | + | |
| 10 | + | |
| 11 | +def _format_github(artifact: Artifact) -> List[str]: | |
| 12 | + """Convert artifact to gh CLI commands.""" | |
| 13 | + items = json.loads(artifact.content) if artifact.format == "json" else [] | |
| 14 | + cmds = [] | |
| 15 | + for item in items: | |
| 16 | + cmd = f"gh issue create --title {json.dumps(item.get('title', ''))}" | |
| 17 | + if item.get("body"): | |
| 18 | + cmd += f" --body {json.dumps(item['body'])}" | |
| 19 | + for label in item.get("labels", []): | |
| 20 | + cmd += f" --label {json.dumps(label)}" | |
| 21 | + cmds.append(cmd) | |
| 22 | + return cmds | |
| 23 | + | |
| 24 | + | |
| 25 | +def _format_jira(artifact: Artifact) -> List[str]: | |
| 26 | + """Convert artifact to jira-cli commands.""" | |
| 27 | + items = json.loads(artifact.content) if artifact.format == "json" else [] | |
| 28 | + return [ | |
| 29 | + f"jira issue create --summary {json.dumps(item.get('title', ''))}" | |
| 30 | + f" --description {json.dumps(item.get('body', item.get('description', '')))}" | |
| 31 | + for item in items | |
| 32 | + ] | |
| 33 | + | |
| 34 | + | |
| 35 | +def _format_linear(artifact: Artifact) -> List[str]: | |
| 36 | + """Convert artifact to linear CLI commands.""" | |
| 37 | + items = json.loads(artifact.content) if artifact.format == "json" else [] | |
| 38 | + return [ | |
| 39 | + f"linear issue create --title {json.dumps(item.get('title', ''))}" | |
| 40 | + f" --description {json.dumps(item.get('body', item.get('description', '')))}" | |
| 41 | + for item in items | |
| 42 | + ] | |
| 43 | + | |
| 44 | + | |
| 45 | +_adapters = {"github": _format_github, "jira": _format_jira, "linear": _format_linear} | |
| 46 | + | |
| 47 | + | |
| 48 | +def run_commands(commands: List[str], dry_run: bool = True) -> List[dict]: | |
| 49 | + """Execute CLI commands. In dry_run mode, just return what would run.""" | |
| 50 | + results = [] | |
| 51 | + for cmd in commands: | |
| 52 | + if dry_run: | |
| 53 | + results.append({"command": cmd, "status": "dry_run"}) | |
| 54 | + else: | |
| 55 | + proc = subprocess.run(cmd, shell=True, capture_output=True, text=True) | |
| 56 | + results.append( | |
| 57 | + { | |
| 58 | + "command": cmd, | |
| 59 | + "returncode": proc.returncode, | |
| 60 | + "stdout": proc.stdout.strip(), | |
| 61 | + "stderr": proc.stderr.strip(), | |
| 62 | + } | |
| 63 | + ) | |
| 64 | + return results | |
| 65 | + | |
| 66 | + | |
| 67 | +class CLIAdapterSkill(Skill): | |
| 68 | + name = "cli_adapter" | |
| 69 | + description = "Push artifacts to external tools via their CLIs" | |
| 70 | + | |
| 71 | + def execute(self, context: AgentContext, **kwargs) -> Artifact: | |
| 72 | + tool = kwargs.get("tool", "github") | |
| 73 | + artifact = kwargs.get("artifact") | |
| 74 | + if artifact is None and context.artifacts: | |
| 75 | + artifact = context.artifacts[-1] | |
| 76 | + if artifact is None: | |
| 77 | + return Artifact( | |
| 78 | + name="CLI Commands", content="[]", artifact_type="cli_commands", format="json" | |
| 79 | + ) | |
| 80 | + | |
| 81 | + formatter = _adapters.get(tool) | |
| 82 | + if formatter is None: | |
| 83 | + return Artifact( | |
| 84 | + name="CLI Commands", | |
| 85 | + content=json.dumps({"error": f"Unknown tool: {tool}"}), | |
| 86 | + artifact_type="cli_commands", | |
| 87 | + format="json", | |
| 88 | + ) | |
| 89 | + | |
| 90 | + cli_name = {"github": "gh", "jira": "jira", "linear": "linear"}[tool] | |
| 91 | + available = shutil.which(cli_name) is not None | |
| 92 | + commands = formatter(artifact) | |
| 93 | + content = json.dumps({"tool": tool, "available": available, "commands": commands}, indent=2) | |
| 94 | + return Artifact( | |
| 95 | + name="CLI Commands", content=content, artifact_type="cli_commands", format="json" | |
| 96 | + ) | |
| 97 | + | |
| 98 | + | |
| 99 | +register_skill(CLIAdapterSkill()) |
| --- a/video_processor/agent/skills/cli_adapter.py | |
| +++ b/video_processor/agent/skills/cli_adapter.py | |
| @@ -0,0 +1,99 @@ | |
| --- a/video_processor/agent/skills/cli_adapter.py | |
| +++ b/video_processor/agent/skills/cli_adapter.py | |
| @@ -0,0 +1,99 @@ | |
| 1 | """Skill: Push artifacts to external tools via their CLIs.""" |
| 2 | |
| 3 | import json |
| 4 | import shutil |
| 5 | import subprocess |
| 6 | from typing import List |
| 7 | |
| 8 | from video_processor.agent.skills.base import AgentContext, Artifact, Skill, register_skill |
| 9 | |
| 10 | |
| 11 | def _format_github(artifact: Artifact) -> List[str]: |
| 12 | """Convert artifact to gh CLI commands.""" |
| 13 | items = json.loads(artifact.content) if artifact.format == "json" else [] |
| 14 | cmds = [] |
| 15 | for item in items: |
| 16 | cmd = f"gh issue create --title {json.dumps(item.get('title', ''))}" |
| 17 | if item.get("body"): |
| 18 | cmd += f" --body {json.dumps(item['body'])}" |
| 19 | for label in item.get("labels", []): |
| 20 | cmd += f" --label {json.dumps(label)}" |
| 21 | cmds.append(cmd) |
| 22 | return cmds |
| 23 | |
| 24 | |
| 25 | def _format_jira(artifact: Artifact) -> List[str]: |
| 26 | """Convert artifact to jira-cli commands.""" |
| 27 | items = json.loads(artifact.content) if artifact.format == "json" else [] |
| 28 | return [ |
| 29 | f"jira issue create --summary {json.dumps(item.get('title', ''))}" |
| 30 | f" --description {json.dumps(item.get('body', item.get('description', '')))}" |
| 31 | for item in items |
| 32 | ] |
| 33 | |
| 34 | |
| 35 | def _format_linear(artifact: Artifact) -> List[str]: |
| 36 | """Convert artifact to linear CLI commands.""" |
| 37 | items = json.loads(artifact.content) if artifact.format == "json" else [] |
| 38 | return [ |
| 39 | f"linear issue create --title {json.dumps(item.get('title', ''))}" |
| 40 | f" --description {json.dumps(item.get('body', item.get('description', '')))}" |
| 41 | for item in items |
| 42 | ] |
| 43 | |
| 44 | |
| 45 | _adapters = {"github": _format_github, "jira": _format_jira, "linear": _format_linear} |
| 46 | |
| 47 | |
| 48 | def run_commands(commands: List[str], dry_run: bool = True) -> List[dict]: |
| 49 | """Execute CLI commands. In dry_run mode, just return what would run.""" |
| 50 | results = [] |
| 51 | for cmd in commands: |
| 52 | if dry_run: |
| 53 | results.append({"command": cmd, "status": "dry_run"}) |
| 54 | else: |
| 55 | proc = subprocess.run(cmd, shell=True, capture_output=True, text=True) |
| 56 | results.append( |
| 57 | { |
| 58 | "command": cmd, |
| 59 | "returncode": proc.returncode, |
| 60 | "stdout": proc.stdout.strip(), |
| 61 | "stderr": proc.stderr.strip(), |
| 62 | } |
| 63 | ) |
| 64 | return results |
| 65 | |
| 66 | |
| 67 | class CLIAdapterSkill(Skill): |
| 68 | name = "cli_adapter" |
| 69 | description = "Push artifacts to external tools via their CLIs" |
| 70 | |
| 71 | def execute(self, context: AgentContext, **kwargs) -> Artifact: |
| 72 | tool = kwargs.get("tool", "github") |
| 73 | artifact = kwargs.get("artifact") |
| 74 | if artifact is None and context.artifacts: |
| 75 | artifact = context.artifacts[-1] |
| 76 | if artifact is None: |
| 77 | return Artifact( |
| 78 | name="CLI Commands", content="[]", artifact_type="cli_commands", format="json" |
| 79 | ) |
| 80 | |
| 81 | formatter = _adapters.get(tool) |
| 82 | if formatter is None: |
| 83 | return Artifact( |
| 84 | name="CLI Commands", |
| 85 | content=json.dumps({"error": f"Unknown tool: {tool}"}), |
| 86 | artifact_type="cli_commands", |
| 87 | format="json", |
| 88 | ) |
| 89 | |
| 90 | cli_name = {"github": "gh", "jira": "jira", "linear": "linear"}[tool] |
| 91 | available = shutil.which(cli_name) is not None |
| 92 | commands = formatter(artifact) |
| 93 | content = json.dumps({"tool": tool, "available": available, "commands": commands}, indent=2) |
| 94 | return Artifact( |
| 95 | name="CLI Commands", content=content, artifact_type="cli_commands", format="json" |
| 96 | ) |
| 97 | |
| 98 | |
| 99 | register_skill(CLIAdapterSkill()) |
| --- a/video_processor/agent/skills/doc_generator.py | ||
| +++ b/video_processor/agent/skills/doc_generator.py | ||
| @@ -0,0 +1,76 @@ | ||
| 1 | +"""Skill: Generate technical documentation, ADRs, or meeting notes.""" | |
| 2 | + | |
| 3 | +from video_processor.agent.skills.base import ( | |
| 4 | + AgentContext, | |
| 5 | + Artifact, | |
| 6 | + Skill, | |
| 7 | + register_skill, | |
| 8 | +) | |
| 9 | + | |
| 10 | +_DOC_PROMPTS = { | |
| 11 | + "technical_doc": ( | |
| 12 | + "Generate technical documentation with:\n" | |
| 13 | + "1. Overview\n2. Architecture\n3. Components & Interfaces\n" | |
| 14 | + "4. Data Flow\n5. Deployment & Configuration\n" | |
| 15 | + "6. API Reference (if applicable)" | |
| 16 | + ), | |
| 17 | + "adr": ( | |
| 18 | + "Generate an Architecture Decision Record (ADR) with:\n" | |
| 19 | + "1. Title\n2. Status (Proposed)\n3. Context\n" | |
| 20 | + "4. Decision\n5. Consequences\n6. Alternatives Considered" | |
| 21 | + ), | |
| 22 | + "meeting_notes": ( | |
| 23 | + "Generate structured meeting notes with:\n" | |
| 24 | + "1. Meeting Summary\n2. Key Discussion Points\n" | |
| 25 | + "3. Decisions Made\n4. Action Items (with owners)\n" | |
| 26 | + "5. Open Questions\n6. Next Steps" | |
| 27 | + ), | |
| 28 | +} | |
| 29 | + | |
| 30 | + | |
| 31 | +class DocGeneratorSkill(Skill): | |
| 32 | + name = "doc_generator" | |
| 33 | + description = "Generate technical documentation, ADRs, or meeting notes" | |
| 34 | + | |
| 35 | + def execute(self, context: AgentContext, **kwargs) -> Artifact: | |
| 36 | + doc_type = kwargs.get("doc_type", "technical_doc") | |
| 37 | + stats = context.query_engine.stats() | |
| 38 | + entities = context.query_engine.entities() | |
| 39 | + relationships = context.query_engine.relationships() | |
| 40 | + | |
| 41 | + doc_instructions = _DOC_PROMPTS.get(doc_type, _DOC_PROMPTS["technical_doc"]) | |
| 42 | + doc_label = doc_type.replace("_", " ") | |
| 43 | + | |
| 44 | + parts = [ | |
| 45 | + f"You are a technical writer. Generate a {doc_label} " | |
| 46 | + "from the following knowledge graph context.", | |
| 47 | + "", | |
| 48 | + "## Knowledge Graph Overview", | |
| 49 | + stats.to_text(), | |
| 50 | + "", | |
| 51 | + "## Entities", | |
| 52 | + entities.to_text(), | |
| 53 | + "", | |
| 54 | + "## Relationships", | |
| 55 | + relationships.to_text(), | |
| 56 | + "", | |
| 57 | + "## Planning Entities", | |
| 58 | + ] | |
| 59 | + for e in context.planning_entities: | |
| 60 | + parts.append(f"- {e}") | |
| 61 | + | |
| 62 | + parts.append(f"\n{doc_instructions}\n\nReturn ONLY the markdown.") | |
| 63 | + | |
| 64 | + prompt = "\n".join(parts) | |
| 65 | + response = context.provider_manager.chat(messages=[{"role": "user", "content": prompt}]) | |
| 66 | + | |
| 67 | + return Artifact( | |
| 68 | + name=doc_label.title(), | |
| 69 | + content=response, | |
| 70 | + artifact_type="document", | |
| 71 | + format="markdown", | |
| 72 | + metadata={"doc_type": doc_type}, | |
| 73 | + ) | |
| 74 | + | |
| 75 | + | |
| 76 | +register_skill(DocGeneratorSkill()) |
| --- a/video_processor/agent/skills/doc_generator.py | |
| +++ b/video_processor/agent/skills/doc_generator.py | |
| @@ -0,0 +1,76 @@ | |
| --- a/video_processor/agent/skills/doc_generator.py | |
| +++ b/video_processor/agent/skills/doc_generator.py | |
| @@ -0,0 +1,76 @@ | |
| 1 | """Skill: Generate technical documentation, ADRs, or meeting notes.""" |
| 2 | |
| 3 | from video_processor.agent.skills.base import ( |
| 4 | AgentContext, |
| 5 | Artifact, |
| 6 | Skill, |
| 7 | register_skill, |
| 8 | ) |
| 9 | |
| 10 | _DOC_PROMPTS = { |
| 11 | "technical_doc": ( |
| 12 | "Generate technical documentation with:\n" |
| 13 | "1. Overview\n2. Architecture\n3. Components & Interfaces\n" |
| 14 | "4. Data Flow\n5. Deployment & Configuration\n" |
| 15 | "6. API Reference (if applicable)" |
| 16 | ), |
| 17 | "adr": ( |
| 18 | "Generate an Architecture Decision Record (ADR) with:\n" |
| 19 | "1. Title\n2. Status (Proposed)\n3. Context\n" |
| 20 | "4. Decision\n5. Consequences\n6. Alternatives Considered" |
| 21 | ), |
| 22 | "meeting_notes": ( |
| 23 | "Generate structured meeting notes with:\n" |
| 24 | "1. Meeting Summary\n2. Key Discussion Points\n" |
| 25 | "3. Decisions Made\n4. Action Items (with owners)\n" |
| 26 | "5. Open Questions\n6. Next Steps" |
| 27 | ), |
| 28 | } |
| 29 | |
| 30 | |
| 31 | class DocGeneratorSkill(Skill): |
| 32 | name = "doc_generator" |
| 33 | description = "Generate technical documentation, ADRs, or meeting notes" |
| 34 | |
| 35 | def execute(self, context: AgentContext, **kwargs) -> Artifact: |
| 36 | doc_type = kwargs.get("doc_type", "technical_doc") |
| 37 | stats = context.query_engine.stats() |
| 38 | entities = context.query_engine.entities() |
| 39 | relationships = context.query_engine.relationships() |
| 40 | |
| 41 | doc_instructions = _DOC_PROMPTS.get(doc_type, _DOC_PROMPTS["technical_doc"]) |
| 42 | doc_label = doc_type.replace("_", " ") |
| 43 | |
| 44 | parts = [ |
| 45 | f"You are a technical writer. Generate a {doc_label} " |
| 46 | "from the following knowledge graph context.", |
| 47 | "", |
| 48 | "## Knowledge Graph Overview", |
| 49 | stats.to_text(), |
| 50 | "", |
| 51 | "## Entities", |
| 52 | entities.to_text(), |
| 53 | "", |
| 54 | "## Relationships", |
| 55 | relationships.to_text(), |
| 56 | "", |
| 57 | "## Planning Entities", |
| 58 | ] |
| 59 | for e in context.planning_entities: |
| 60 | parts.append(f"- {e}") |
| 61 | |
| 62 | parts.append(f"\n{doc_instructions}\n\nReturn ONLY the markdown.") |
| 63 | |
| 64 | prompt = "\n".join(parts) |
| 65 | response = context.provider_manager.chat(messages=[{"role": "user", "content": prompt}]) |
| 66 | |
| 67 | return Artifact( |
| 68 | name=doc_label.title(), |
| 69 | content=response, |
| 70 | artifact_type="document", |
| 71 | format="markdown", |
| 72 | metadata={"doc_type": doc_type}, |
| 73 | ) |
| 74 | |
| 75 | |
| 76 | register_skill(DocGeneratorSkill()) |
| --- a/video_processor/agent/skills/github_integration.py | ||
| +++ b/video_processor/agent/skills/github_integration.py | ||
| @@ -0,0 +1,93 @@ | ||
| 1 | +"""Skill: Generate GitHub issues from task breakdown artifacts.""" | |
| 2 | + | |
| 3 | +import json | |
| 4 | +import shutil | |
| 5 | +import subprocess | |
| 6 | +from typing import List, Optional | |
| 7 | + | |
| 8 | +from video_processor.agent.skills.base import AgentContext, Artifact, Skill, register_skill | |
| 9 | + | |
| 10 | + | |
| 11 | +def _task_to_issue(task: dict) -> dict: | |
| 12 | + """Convert a task dict to a GitHub issue object.""" | |
| 13 | + deps = task.get("dependencies", []) | |
| 14 | + body_parts = [ | |
| 15 | + f"## Description\n{task.get('description', task.get('title', ''))}", | |
| 16 | + f"**Priority:** {task.get('priority', 'medium')}", | |
| 17 | + f"**Estimate:** {task.get('estimate', 'unknown')}", | |
| 18 | + ] | |
| 19 | + if deps: | |
| 20 | + body_parts.append(f"**Dependencies:** {', '.join(str(d) for d in deps)}") | |
| 21 | + labels = [task.get("priority", "medium")] | |
| 22 | + if task.get("labels"): | |
| 23 | + labels.extend(task["labels"]) | |
| 24 | + return { | |
| 25 | + "title": task.get("title", "Untitled task"), | |
| 26 | + "body": "\n\n".join(body_parts), | |
| 27 | + "labels": labels, | |
| 28 | + } | |
| 29 | + | |
| 30 | + | |
def push_to_github(issues_json: str, repo: str) -> Optional[List[dict]]:
    """Create each issue in *repo* by shelling out to ``gh issue create``.

    Args:
        issues_json: JSON array of issue payloads (title/body/labels).
        repo: target repository in ``owner/name`` form.

    Returns:
        One result record per issue (title, returncode, stdout, stderr),
        or ``None`` when the ``gh`` CLI is not on PATH.
    """
    if shutil.which("gh") is None:
        return None

    outcomes: List[dict] = []
    for issue in json.loads(issues_json):
        command = [
            "gh", "issue", "create",
            "--repo", repo,
            "--title", issue["title"],
            "--body", issue["body"],
        ]
        for label in issue.get("labels", []):
            command += ["--label", label]
        # List-form argv (shell=False) — no shell-injection surface.
        completed = subprocess.run(command, capture_output=True, text=True)
        outcomes.append(
            {
                "title": issue["title"],
                "returncode": completed.returncode,
                "stdout": completed.stdout.strip(),
                "stderr": completed.stderr.strip(),
            }
        )
    return outcomes
| 61 | + | |
| 62 | + | |
class GitHubIssuesSkill(Skill):
    """Turn a task-breakdown artifact (or, failing that, the planning
    entities) into a JSON list of GitHub issue payloads.
    """

    name = "github_issues"
    description = "Generate GitHub issues from task breakdown"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Prefer an existing ``task_list`` artifact; otherwise synthesize
        one stub task per planning entity.
        """
        task_artifact = None
        for candidate in context.artifacts:
            if candidate.artifact_type == "task_list":
                task_artifact = candidate
                break

        if task_artifact:
            tasks = json.loads(task_artifact.content)
        else:
            # No breakdown yet: fall back to a placeholder task per entity.
            tasks = [
                {
                    "title": str(entity),
                    "description": str(entity),
                    "priority": "medium",
                    "estimate": "unknown",
                }
                for entity in context.planning_entities
            ]

        payload = [_task_to_issue(task) for task in tasks]
        return Artifact(
            name="GitHub Issues",
            content=json.dumps(payload, indent=2),
            artifact_type="issues",
            format="json",
        )
| 91 | + | |
| 92 | + | |
# Module import side effect: register the skill in the global registry.
register_skill(GitHubIssuesSkill())
--- a/video_processor/agent/skills/github_integration.py
+++ b/video_processor/agent/skills/github_integration.py
@@ -0,0 +1,93 @@
| 1 | """Skill: Generate GitHub issues from task breakdown artifacts.""" |
| 2 | |
| 3 | import json |
| 4 | import shutil |
| 5 | import subprocess |
| 6 | from typing import List, Optional |
| 7 | |
| 8 | from video_processor.agent.skills.base import AgentContext, Artifact, Skill, register_skill |
| 9 | |
| 10 | |
| 11 | def _task_to_issue(task: dict) -> dict: |
| 12 | """Convert a task dict to a GitHub issue object.""" |
| 13 | deps = task.get("dependencies", []) |
| 14 | body_parts = [ |
| 15 | f"## Description\n{task.get('description', task.get('title', ''))}", |
| 16 | f"**Priority:** {task.get('priority', 'medium')}", |
| 17 | f"**Estimate:** {task.get('estimate', 'unknown')}", |
| 18 | ] |
| 19 | if deps: |
| 20 | body_parts.append(f"**Dependencies:** {', '.join(str(d) for d in deps)}") |
| 21 | labels = [task.get("priority", "medium")] |
| 22 | if task.get("labels"): |
| 23 | labels.extend(task["labels"]) |
| 24 | return { |
| 25 | "title": task.get("title", "Untitled task"), |
| 26 | "body": "\n\n".join(body_parts), |
| 27 | "labels": labels, |
| 28 | } |
| 29 | |
| 30 | |
def push_to_github(issues_json: str, repo: str) -> Optional[List[dict]]:
    """Create each issue in *repo* by shelling out to ``gh issue create``.

    Args:
        issues_json: JSON array of issue payloads (title/body/labels).
        repo: target repository in ``owner/name`` form.

    Returns:
        One result record per issue (title, returncode, stdout, stderr),
        or ``None`` when the ``gh`` CLI is not on PATH.
    """
    if shutil.which("gh") is None:
        return None

    outcomes: List[dict] = []
    for issue in json.loads(issues_json):
        command = [
            "gh", "issue", "create",
            "--repo", repo,
            "--title", issue["title"],
            "--body", issue["body"],
        ]
        for label in issue.get("labels", []):
            command += ["--label", label]
        # List-form argv (shell=False) — no shell-injection surface.
        completed = subprocess.run(command, capture_output=True, text=True)
        outcomes.append(
            {
                "title": issue["title"],
                "returncode": completed.returncode,
                "stdout": completed.stdout.strip(),
                "stderr": completed.stderr.strip(),
            }
        )
    return outcomes
| 61 | |
| 62 | |
class GitHubIssuesSkill(Skill):
    """Turn a task-breakdown artifact (or, failing that, the planning
    entities) into a JSON list of GitHub issue payloads.
    """

    name = "github_issues"
    description = "Generate GitHub issues from task breakdown"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Prefer an existing ``task_list`` artifact; otherwise synthesize
        one stub task per planning entity.
        """
        task_artifact = None
        for candidate in context.artifacts:
            if candidate.artifact_type == "task_list":
                task_artifact = candidate
                break

        if task_artifact:
            tasks = json.loads(task_artifact.content)
        else:
            # No breakdown yet: fall back to a placeholder task per entity.
            tasks = [
                {
                    "title": str(entity),
                    "description": str(entity),
                    "priority": "medium",
                    "estimate": "unknown",
                }
                for entity in context.planning_entities
            ]

        payload = [_task_to_issue(task) for task in tasks]
        return Artifact(
            name="GitHub Issues",
            content=json.dumps(payload, indent=2),
            artifact_type="issues",
            format="json",
        )
| 91 | |
| 92 | |
# Module import side effect: register the skill in the global registry.
register_skill(GitHubIssuesSkill())
| --- a/video_processor/agent/skills/notes_export.py | ||
| +++ b/video_processor/agent/skills/notes_export.py | ||
| @@ -0,0 +1,420 @@ | ||
| 1 | +"""Skill: Export knowledge graph as structured notes (Obsidian, Notion).""" | |
| 2 | + | |
| 3 | +import csv | |
| 4 | +import io | |
| 5 | +import logging | |
| 6 | +from datetime import date | |
| 7 | +from pathlib import Path | |
| 8 | +from typing import Dict, List, Optional | |
| 9 | + | |
| 10 | +from video_processor.agent.skills.base import ( | |
| 11 | + AgentContext, | |
| 12 | + Artifact, | |
| 13 | + Skill, | |
| 14 | + register_skill, | |
| 15 | +) | |
| 16 | + | |
| 17 | +logger = logging.getLogger(__name__) | |
| 18 | + | |
| 19 | + | |
| 20 | +def _sanitize_filename(name: str) -> str: | |
| 21 | + """Convert a name to a filesystem-safe filename.""" | |
| 22 | + return ( | |
| 23 | + name.replace("/", "-") | |
| 24 | + .replace("\\", "-") | |
| 25 | + .replace(":", "-") | |
| 26 | + .replace('"', "") | |
| 27 | + .replace("?", "") | |
| 28 | + .replace("*", "") | |
| 29 | + .replace("<", "") | |
| 30 | + .replace(">", "") | |
| 31 | + .replace("|", "") | |
| 32 | + ) | |
| 33 | + | |
| 34 | + | |
| 35 | +def _build_indexes(kg_data: dict): | |
| 36 | + """Build lookup structures from knowledge graph data. | |
| 37 | + | |
| 38 | + Returns (nodes, by_type, node_lookup, outgoing, incoming). | |
| 39 | + """ | |
| 40 | + nodes = kg_data.get("nodes", []) | |
| 41 | + relationships = kg_data.get("relationships", []) | |
| 42 | + | |
| 43 | + by_type: Dict[str, list] = {} | |
| 44 | + node_lookup: Dict[str, dict] = {} | |
| 45 | + for node in nodes: | |
| 46 | + name = node.get("name", node.get("id", "")) | |
| 47 | + ntype = node.get("type", "concept") | |
| 48 | + by_type.setdefault(ntype, []).append(node) | |
| 49 | + node_lookup[name] = node | |
| 50 | + | |
| 51 | + outgoing: Dict[str, list] = {} | |
| 52 | + incoming: Dict[str, list] = {} | |
| 53 | + for rel in relationships: | |
| 54 | + src = rel.get("source", "") | |
| 55 | + tgt = rel.get("target", "") | |
| 56 | + rtype = rel.get("type", "related_to") | |
| 57 | + outgoing.setdefault(src, []).append((tgt, rtype)) | |
| 58 | + incoming.setdefault(tgt, []).append((src, rtype)) | |
| 59 | + | |
| 60 | + return nodes, by_type, node_lookup, outgoing, incoming | |
| 61 | + | |
| 62 | + | |
| 63 | +# --------------------------------------------------------------------------- | |
| 64 | +# Obsidian export | |
| 65 | +# --------------------------------------------------------------------------- | |
| 66 | + | |
| 67 | + | |
def export_to_obsidian(
    kg_data: dict,
    output_dir: Path,
    artifacts: Optional[List[Artifact]] = None,
) -> List[Path]:
    """Export knowledge graph as an Obsidian vault.

    Creates one ``.md`` file per entity with YAML frontmatter and
    ``[[wiki-links]]``, an ``_Index.md`` Map of Content, tag pages per
    entity type, and optional artifact notes.

    Args:
        kg_data: Mapping with optional ``"nodes"`` and ``"relationships"``
            lists, as consumed by ``_build_indexes``.
        output_dir: Vault directory; created (with parents) if missing.
        artifacts: Optional artifacts written as additional notes.

    Returns:
        Every path written, in creation order.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    artifacts = artifacts or []
    created: List[Path] = []
    today = date.today().isoformat()

    # node_lookup is unused here; this function only needs the other four.
    nodes, by_type, node_lookup, outgoing, incoming = _build_indexes(kg_data)

    # --- Individual entity notes ---
    for node in nodes:
        name = node.get("name", node.get("id", ""))
        if not name:
            # Unnamed nodes have no usable filename; skip them.
            continue
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        aliases = node.get("aliases", [])

        # YAML frontmatter
        tags_yaml = f" - {ntype}"
        aliases_yaml = ""
        if aliases:
            alias_lines = "\n".join(f" - {a}" for a in aliases)
            # Trailing newline keeps "date:" on its own line below.
            aliases_yaml = f"aliases:\n{alias_lines}\n"

        frontmatter = f"---\ntype: {ntype}\ntags:\n{tags_yaml}\n{aliases_yaml}date: {today}\n---\n"

        parts = [frontmatter, f"# {name}", ""]

        # Descriptions
        if descs:
            for d in descs:
                parts.append(f"{d}")
            parts.append("")

        # Outgoing relationships
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            for tgt, rtype in outs:
                # [[target]] renders as an Obsidian wiki-link.
                parts.append(f"- **{rtype}**: [[{tgt}]]")
            parts.append("")

        # Incoming relationships
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced by")
            parts.append("")
            for src, rtype in ins:
                parts.append(f"- **{rtype}** from [[{src}]]")
            parts.append("")

        # NOTE(review): distinct entity names can sanitize to the same
        # filename and silently overwrite each other — confirm acceptable.
        filename = _sanitize_filename(name) + ".md"
        path = output_dir / filename
        path.write_text("\n".join(parts), encoding="utf-8")
        created.append(path)

    # --- Index note (Map of Content) ---
    index_parts = [
        "---",
        "type: index",
        "tags:",
        " - MOC",
        f"date: {today}",
        "---",
        "",
        "# Index",
        "",
        f"**{len(nodes)}** entities | **{len(kg_data.get('relationships', []))}** relationships",
        "",
    ]

    for etype in sorted(by_type.keys()):
        elist = by_type[etype]
        index_parts.append(f"## {etype.title()}")
        index_parts.append("")
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            index_parts.append(f"- [[{name}]]")
        index_parts.append("")

    if artifacts:
        index_parts.append("## Artifacts")
        index_parts.append("")
        for art in artifacts:
            index_parts.append(f"- [[{art.name}]]")
        index_parts.append("")

    index_path = output_dir / "_Index.md"
    index_path.write_text("\n".join(index_parts), encoding="utf-8")
    created.append(index_path)

    # --- Tag pages (one per entity type) ---
    for etype, elist in sorted(by_type.items()):
        tag_parts = [
            "---",
            "type: tag",
            "tags:",
            f" - {etype}",
            f"date: {today}",
            "---",
            "",
            f"# {etype.title()}",
            "",
            f"All entities of type **{etype}** ({len(elist)}).",
            "",
        ]
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            descs = node.get("descriptions", [])
            # First description doubles as the one-line summary.
            summary = descs[0] if descs else ""
            tag_parts.append(f"- [[{name}]]" + (f" - {summary}" if summary else ""))
        tag_parts.append("")

        tag_filename = f"Tag - {etype.title()}.md"
        tag_path = output_dir / _sanitize_filename(tag_filename)
        tag_path.write_text("\n".join(tag_parts), encoding="utf-8")
        created.append(tag_path)

    # --- Artifact notes ---
    for art in artifacts:
        art_parts = [
            "---",
            "type: artifact",
            f"artifact_type: {art.artifact_type}",
            "tags:",
            " - artifact",
            f" - {art.artifact_type}",
            f"date: {today}",
            "---",
            "",
            f"# {art.name}",
            "",
            art.content,
            "",
        ]
        art_filename = _sanitize_filename(art.name) + ".md"
        art_path = output_dir / art_filename
        art_path.write_text("\n".join(art_parts), encoding="utf-8")
        created.append(art_path)

    logger.info("Exported %d Obsidian notes to %s", len(created), output_dir)
    return created
| 221 | + | |
| 222 | + | |
| 223 | +# --------------------------------------------------------------------------- | |
| 224 | +# Notion-compatible markdown export | |
| 225 | +# --------------------------------------------------------------------------- | |
| 226 | + | |
| 227 | + | |
def export_to_notion_md(
    kg_data: dict,
    output_dir: Path,
    artifacts: Optional[List[Artifact]] = None,
) -> List[Path]:
    """Export knowledge graph as Notion-compatible markdown.

    Creates ``.md`` files with Notion-style callout blocks and a
    database-style CSV for bulk import.

    Args:
        kg_data: Mapping with optional ``"nodes"`` and ``"relationships"``
            lists, as consumed by ``_build_indexes``.
        output_dir: Target directory; created (with parents) if missing.
        artifacts: Optional artifacts written as additional pages.

    Returns:
        Every path written, in creation order.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    artifacts = artifacts or []
    created: List[Path] = []

    # node_lookup is unused here; this function only needs the other four.
    nodes, by_type, node_lookup, outgoing, incoming = _build_indexes(kg_data)

    # --- Database CSV ---
    csv_buffer = io.StringIO()
    # lineterminator="\n": csv defaults to "\r\n", which Path.write_text
    # (newline=None) would re-translate — producing "\r\r\n" on Windows.
    writer = csv.writer(csv_buffer, lineterminator="\n")
    writer.writerow(["Name", "Type", "Description", "Related To"])

    for node in nodes:
        name = node.get("name", node.get("id", ""))
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        # Keep the database cell short: at most the first two descriptions.
        desc_text = "; ".join(descs[:2]) if descs else ""
        outs = outgoing.get(name, [])
        related = ", ".join(tgt for tgt, _ in outs) if outs else ""
        writer.writerow([name, ntype, desc_text, related])

    csv_path = output_dir / "entities_database.csv"
    csv_path.write_text(csv_buffer.getvalue(), encoding="utf-8")
    created.append(csv_path)

    # Emoji shortcode per entity type — hoisted out of the entity loop
    # (it is loop-invariant; the original rebuilt it once per node).
    type_emoji = {
        "person": "person",
        "technology": "computer",
        "organization": "building",
        "concept": "bulb",
        "event": "calendar",
        "location": "round_pushpin",
    }

    # --- Individual entity pages ---
    for node in nodes:
        name = node.get("name", node.get("id", ""))
        if not name:
            # Unnamed nodes have no usable filename; skip them.
            continue
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        emoji = type_emoji.get(ntype, "bulb")

        parts = [
            f"# {name}",
            "",
            f"> :{emoji}: **Type:** {ntype}",
            "",
        ]

        if descs:
            parts.append("## Description")
            parts.append("")
            for d in descs:
                parts.append(f"{d}")
            parts.append("")

        # Properties callout
        properties = node.get("properties", {})
        if properties:
            parts.append("> :memo: **Properties**")
            for k, v in properties.items():
                parts.append(f"> - **{k}:** {v}")
            parts.append("")

        # Outgoing relationships
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            parts.append("| Target | Relationship |")
            parts.append("|--------|-------------|")
            for tgt, rtype in outs:
                parts.append(f"| {tgt} | {rtype} |")
            parts.append("")

        # Incoming relationships
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced by")
            parts.append("")
            parts.append("| Source | Relationship |")
            parts.append("|--------|-------------|")
            for src, rtype in ins:
                parts.append(f"| {src} | {rtype} |")
            parts.append("")

        filename = _sanitize_filename(name) + ".md"
        path = output_dir / filename
        path.write_text("\n".join(parts), encoding="utf-8")
        created.append(path)

    # --- Overview page ---
    overview_parts = [
        "# Knowledge Graph Overview",
        "",
        f"> :bar_chart: **Stats:** {len(nodes)} entities, "
        f"{len(kg_data.get('relationships', []))} relationships",
        "",
        "## Entity Types",
        "",
    ]
    for etype in sorted(by_type.keys()):
        elist = by_type[etype]
        overview_parts.append(f"### {etype.title()} ({len(elist)})")
        overview_parts.append("")
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            overview_parts.append(f"- {name}")
        overview_parts.append("")

    if artifacts:
        overview_parts.append("## Artifacts")
        overview_parts.append("")
        for art in artifacts:
            overview_parts.append(f"- **{art.name}** ({art.artifact_type})")
        overview_parts.append("")

    overview_path = output_dir / "Overview.md"
    overview_path.write_text("\n".join(overview_parts), encoding="utf-8")
    created.append(overview_path)

    # --- Artifact pages ---
    for art in artifacts:
        art_parts = [
            f"# {art.name}",
            "",
            f"> :page_facing_up: **Type:** {art.artifact_type} | **Format:** {art.format}",
            "",
            art.content,
            "",
        ]
        art_filename = _sanitize_filename(art.name) + ".md"
        art_path = output_dir / art_filename
        art_path.write_text("\n".join(art_parts), encoding="utf-8")
        created.append(art_path)

    logger.info("Exported %d Notion markdown files to %s", len(created), output_dir)
    return created
| 376 | + | |
| 377 | + | |
| 378 | +# --------------------------------------------------------------------------- | |
| 379 | +# Skill class | |
| 380 | +# --------------------------------------------------------------------------- | |
| 381 | + | |
| 382 | + | |
class NotesExportSkill(Skill):
    """Export knowledge graph as structured notes (Obsidian, Notion).

    For GitHub wiki export, see the ``wiki_generator`` skill.
    """

    name = "notes_export"
    description = "Export knowledge graph as structured notes (Obsidian, Notion)"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Write note files for the requested format and summarize them.

        Kwargs:
            format: "obsidian" (default) or "notion".
            output_dir: destination directory; defaults to
                ``notes_export_<format>``.
        """
        fmt = kwargs.get("format", "obsidian")
        target = Path(kwargs.get("output_dir", f"notes_export_{fmt}"))
        graph = context.knowledge_graph.to_dict()
        artifacts = context.artifacts or []

        # Anything other than "notion" (including typos) falls back to Obsidian.
        exporter = export_to_notion_md if fmt == "notion" else export_to_obsidian
        written = exporter(graph, target, artifacts=artifacts)

        listing = "\n".join(f"- {path.name}" for path in written)
        return Artifact(
            name=f"Notes Export ({fmt.title()})",
            content=f"Exported {len(written)} {fmt} notes to `{target}`:\n\n{listing}",
            artifact_type="notes_export",
            format="markdown",
            metadata={
                "output_dir": str(target),
                "format": fmt,
                "file_count": len(written),
                "files": [str(path) for path in written],
            },
        )
| 418 | + | |
| 419 | + | |
# Module import side effect: register the skill in the global registry.
register_skill(NotesExportSkill())
--- a/video_processor/agent/skills/notes_export.py
+++ b/video_processor/agent/skills/notes_export.py
@@ -0,0 +1,420 @@
| 1 | """Skill: Export knowledge graph as structured notes (Obsidian, Notion).""" |
| 2 | |
| 3 | import csv |
| 4 | import io |
| 5 | import logging |
| 6 | from datetime import date |
| 7 | from pathlib import Path |
| 8 | from typing import Dict, List, Optional |
| 9 | |
| 10 | from video_processor.agent.skills.base import ( |
| 11 | AgentContext, |
| 12 | Artifact, |
| 13 | Skill, |
| 14 | register_skill, |
| 15 | ) |
| 16 | |
| 17 | logger = logging.getLogger(__name__) |
| 18 | |
| 19 | |
| 20 | def _sanitize_filename(name: str) -> str: |
| 21 | """Convert a name to a filesystem-safe filename.""" |
| 22 | return ( |
| 23 | name.replace("/", "-") |
| 24 | .replace("\\", "-") |
| 25 | .replace(":", "-") |
| 26 | .replace('"', "") |
| 27 | .replace("?", "") |
| 28 | .replace("*", "") |
| 29 | .replace("<", "") |
| 30 | .replace(">", "") |
| 31 | .replace("|", "") |
| 32 | ) |
| 33 | |
| 34 | |
| 35 | def _build_indexes(kg_data: dict): |
| 36 | """Build lookup structures from knowledge graph data. |
| 37 | |
| 38 | Returns (nodes, by_type, node_lookup, outgoing, incoming). |
| 39 | """ |
| 40 | nodes = kg_data.get("nodes", []) |
| 41 | relationships = kg_data.get("relationships", []) |
| 42 | |
| 43 | by_type: Dict[str, list] = {} |
| 44 | node_lookup: Dict[str, dict] = {} |
| 45 | for node in nodes: |
| 46 | name = node.get("name", node.get("id", "")) |
| 47 | ntype = node.get("type", "concept") |
| 48 | by_type.setdefault(ntype, []).append(node) |
| 49 | node_lookup[name] = node |
| 50 | |
| 51 | outgoing: Dict[str, list] = {} |
| 52 | incoming: Dict[str, list] = {} |
| 53 | for rel in relationships: |
| 54 | src = rel.get("source", "") |
| 55 | tgt = rel.get("target", "") |
| 56 | rtype = rel.get("type", "related_to") |
| 57 | outgoing.setdefault(src, []).append((tgt, rtype)) |
| 58 | incoming.setdefault(tgt, []).append((src, rtype)) |
| 59 | |
| 60 | return nodes, by_type, node_lookup, outgoing, incoming |
| 61 | |
| 62 | |
| 63 | # --------------------------------------------------------------------------- |
| 64 | # Obsidian export |
| 65 | # --------------------------------------------------------------------------- |
| 66 | |
| 67 | |
def export_to_obsidian(
    kg_data: dict,
    output_dir: Path,
    artifacts: Optional[List[Artifact]] = None,
) -> List[Path]:
    """Export knowledge graph as an Obsidian vault.

    Creates one ``.md`` file per entity with YAML frontmatter and
    ``[[wiki-links]]``, an ``_Index.md`` Map of Content, tag pages per
    entity type, and optional artifact notes.

    Args:
        kg_data: Mapping with optional ``"nodes"`` and ``"relationships"``
            lists, as consumed by ``_build_indexes``.
        output_dir: Vault directory; created (with parents) if missing.
        artifacts: Optional artifacts written as additional notes.

    Returns:
        Every path written, in creation order.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    artifacts = artifacts or []
    created: List[Path] = []
    today = date.today().isoformat()

    # node_lookup is unused here; this function only needs the other four.
    nodes, by_type, node_lookup, outgoing, incoming = _build_indexes(kg_data)

    # --- Individual entity notes ---
    for node in nodes:
        name = node.get("name", node.get("id", ""))
        if not name:
            # Unnamed nodes have no usable filename; skip them.
            continue
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        aliases = node.get("aliases", [])

        # YAML frontmatter
        tags_yaml = f" - {ntype}"
        aliases_yaml = ""
        if aliases:
            alias_lines = "\n".join(f" - {a}" for a in aliases)
            # Trailing newline keeps "date:" on its own line below.
            aliases_yaml = f"aliases:\n{alias_lines}\n"

        frontmatter = f"---\ntype: {ntype}\ntags:\n{tags_yaml}\n{aliases_yaml}date: {today}\n---\n"

        parts = [frontmatter, f"# {name}", ""]

        # Descriptions
        if descs:
            for d in descs:
                parts.append(f"{d}")
            parts.append("")

        # Outgoing relationships
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            for tgt, rtype in outs:
                # [[target]] renders as an Obsidian wiki-link.
                parts.append(f"- **{rtype}**: [[{tgt}]]")
            parts.append("")

        # Incoming relationships
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced by")
            parts.append("")
            for src, rtype in ins:
                parts.append(f"- **{rtype}** from [[{src}]]")
            parts.append("")

        # NOTE(review): distinct entity names can sanitize to the same
        # filename and silently overwrite each other — confirm acceptable.
        filename = _sanitize_filename(name) + ".md"
        path = output_dir / filename
        path.write_text("\n".join(parts), encoding="utf-8")
        created.append(path)

    # --- Index note (Map of Content) ---
    index_parts = [
        "---",
        "type: index",
        "tags:",
        " - MOC",
        f"date: {today}",
        "---",
        "",
        "# Index",
        "",
        f"**{len(nodes)}** entities | **{len(kg_data.get('relationships', []))}** relationships",
        "",
    ]

    for etype in sorted(by_type.keys()):
        elist = by_type[etype]
        index_parts.append(f"## {etype.title()}")
        index_parts.append("")
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            index_parts.append(f"- [[{name}]]")
        index_parts.append("")

    if artifacts:
        index_parts.append("## Artifacts")
        index_parts.append("")
        for art in artifacts:
            index_parts.append(f"- [[{art.name}]]")
        index_parts.append("")

    index_path = output_dir / "_Index.md"
    index_path.write_text("\n".join(index_parts), encoding="utf-8")
    created.append(index_path)

    # --- Tag pages (one per entity type) ---
    for etype, elist in sorted(by_type.items()):
        tag_parts = [
            "---",
            "type: tag",
            "tags:",
            f" - {etype}",
            f"date: {today}",
            "---",
            "",
            f"# {etype.title()}",
            "",
            f"All entities of type **{etype}** ({len(elist)}).",
            "",
        ]
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            descs = node.get("descriptions", [])
            # First description doubles as the one-line summary.
            summary = descs[0] if descs else ""
            tag_parts.append(f"- [[{name}]]" + (f" - {summary}" if summary else ""))
        tag_parts.append("")

        tag_filename = f"Tag - {etype.title()}.md"
        tag_path = output_dir / _sanitize_filename(tag_filename)
        tag_path.write_text("\n".join(tag_parts), encoding="utf-8")
        created.append(tag_path)

    # --- Artifact notes ---
    for art in artifacts:
        art_parts = [
            "---",
            "type: artifact",
            f"artifact_type: {art.artifact_type}",
            "tags:",
            " - artifact",
            f" - {art.artifact_type}",
            f"date: {today}",
            "---",
            "",
            f"# {art.name}",
            "",
            art.content,
            "",
        ]
        art_filename = _sanitize_filename(art.name) + ".md"
        art_path = output_dir / art_filename
        art_path.write_text("\n".join(art_parts), encoding="utf-8")
        created.append(art_path)

    logger.info("Exported %d Obsidian notes to %s", len(created), output_dir)
    return created
| 221 | |
| 222 | |
| 223 | # --------------------------------------------------------------------------- |
| 224 | # Notion-compatible markdown export |
| 225 | # --------------------------------------------------------------------------- |
| 226 | |
| 227 | |
def export_to_notion_md(
    kg_data: dict,
    output_dir: Path,
    artifacts: Optional[List[Artifact]] = None,
) -> List[Path]:
    """Export knowledge graph as Notion-compatible markdown.

    Creates ``.md`` files with Notion-style callout blocks and a
    database-style CSV (``entities_database.csv``) for bulk import.

    Args:
        kg_data: Knowledge graph data; indexed via ``_build_indexes``.
        output_dir: Target directory, created if it does not exist.
        artifacts: Optional artifacts rendered as standalone pages.

    Returns:
        Paths of every file written, in creation order.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    artifacts = artifacts or []
    created: List[Path] = []

    # _node_lookup is not needed by this exporter; kept in the unpack for clarity.
    nodes, by_type, _node_lookup, outgoing, incoming = _build_indexes(kg_data)

    # Emoji shortcode used in each entity page's callout header.
    # Hoisted out of the per-node loop below (it is loop-invariant).
    type_emoji = {
        "person": "person",
        "technology": "computer",
        "organization": "building",
        "concept": "bulb",
        "event": "calendar",
        "location": "round_pushpin",
    }

    # --- Database CSV ---
    csv_buffer = io.StringIO()
    writer = csv.writer(csv_buffer)
    writer.writerow(["Name", "Type", "Description", "Related To"])

    for node in nodes:
        name = node.get("name", node.get("id", ""))
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        # Keep the CSV cell short: at most the first two descriptions.
        desc_text = "; ".join(descs[:2]) if descs else ""
        outs = outgoing.get(name, [])
        related = ", ".join(tgt for tgt, _ in outs) if outs else ""
        writer.writerow([name, ntype, desc_text, related])

    csv_path = output_dir / "entities_database.csv"
    csv_path.write_text(csv_buffer.getvalue(), encoding="utf-8")
    created.append(csv_path)

    # --- Individual entity pages ---
    for node in nodes:
        name = node.get("name", node.get("id", ""))
        if not name:
            continue
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        emoji = type_emoji.get(ntype, "bulb")

        parts = [
            f"# {name}",
            "",
            f"> :{emoji}: **Type:** {ntype}",
            "",
        ]

        if descs:
            parts.append("## Description")
            parts.append("")
            for d in descs:
                parts.append(f"{d}")
            parts.append("")

        # Properties callout
        properties = node.get("properties", {})
        if properties:
            parts.append("> :memo: **Properties**")
            for k, v in properties.items():
                parts.append(f"> - **{k}:** {v}")
            parts.append("")

        # Outgoing relationships
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            parts.append("| Target | Relationship |")
            parts.append("|--------|-------------|")
            for tgt, rtype in outs:
                parts.append(f"| {tgt} | {rtype} |")
            parts.append("")

        # Incoming relationships
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced by")
            parts.append("")
            parts.append("| Source | Relationship |")
            parts.append("|--------|-------------|")
            for src, rtype in ins:
                parts.append(f"| {src} | {rtype} |")
            parts.append("")

        filename = _sanitize_filename(name) + ".md"
        path = output_dir / filename
        path.write_text("\n".join(parts), encoding="utf-8")
        created.append(path)

    # --- Overview page ---
    overview_parts = [
        "# Knowledge Graph Overview",
        "",
        f"> :bar_chart: **Stats:** {len(nodes)} entities, "
        f"{len(kg_data.get('relationships', []))} relationships",
        "",
        "## Entity Types",
        "",
    ]
    for etype in sorted(by_type.keys()):
        elist = by_type[etype]
        overview_parts.append(f"### {etype.title()} ({len(elist)})")
        overview_parts.append("")
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            overview_parts.append(f"- {name}")
        overview_parts.append("")

    if artifacts:
        overview_parts.append("## Artifacts")
        overview_parts.append("")
        for art in artifacts:
            overview_parts.append(f"- **{art.name}** ({art.artifact_type})")
        overview_parts.append("")

    overview_path = output_dir / "Overview.md"
    overview_path.write_text("\n".join(overview_parts), encoding="utf-8")
    created.append(overview_path)

    # --- Artifact pages ---
    # NOTE(review): artifact pages share the same filename namespace as entity
    # pages and Overview.md, so a sanitized-name collision silently overwrites
    # an earlier file — confirm whether that is acceptable upstream.
    for art in artifacts:
        art_parts = [
            f"# {art.name}",
            "",
            f"> :page_facing_up: **Type:** {art.artifact_type} | **Format:** {art.format}",
            "",
            art.content,
            "",
        ]
        art_filename = _sanitize_filename(art.name) + ".md"
        art_path = output_dir / art_filename
        art_path.write_text("\n".join(art_parts), encoding="utf-8")
        created.append(art_path)

    logger.info("Exported %d Notion markdown files to %s", len(created), output_dir)
    return created
| 376 | |
| 377 | |
| 378 | # --------------------------------------------------------------------------- |
| 379 | # Skill class |
| 380 | # --------------------------------------------------------------------------- |
| 381 | |
| 382 | |
class NotesExportSkill(Skill):
    """Export knowledge graph as structured notes (Obsidian, Notion).

    For GitHub wiki export, see the ``wiki_generator`` skill.
    """

    name = "notes_export"
    description = "Export knowledge graph as structured notes (Obsidian, Notion)"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Write the graph (plus prior artifacts) as notes and summarize the result."""
        fmt = kwargs.get("format", "obsidian")
        target_dir = Path(kwargs.get("output_dir", f"notes_export_{fmt}"))
        graph_dict = context.knowledge_graph.to_dict()
        prior_artifacts = context.artifacts or []

        # Anything other than "notion" falls back to the Obsidian exporter.
        exporter = export_to_notion_md if fmt == "notion" else export_to_obsidian
        written = exporter(graph_dict, target_dir, artifacts=prior_artifacts)

        listing = "\n".join(f"- {path.name}" for path in written)
        summary = f"Exported {len(written)} {fmt} notes to `{target_dir}`:\n\n{listing}"

        return Artifact(
            name=f"Notes Export ({fmt.title()})",
            content=summary,
            artifact_type="notes_export",
            format="markdown",
            metadata={
                "output_dir": str(target_dir),
                "format": fmt,
                "file_count": len(written),
                "files": [str(path) for path in written],
            },
        )


register_skill(NotesExportSkill())
| --- a/video_processor/agent/skills/prd.py | ||
| +++ b/video_processor/agent/skills/prd.py | ||
| @@ -0,0 +1,70 @@ | ||
| 1 | +"""Skill: Generate a product requirements document (PRD) / feature spec.""" | |
| 2 | + | |
| 3 | +from video_processor.agent.skills.base import ( | |
| 4 | + AgentContext, | |
| 5 | + Artifact, | |
| 6 | + Skill, | |
| 7 | + register_skill, | |
| 8 | +) | |
| 9 | + | |
| 10 | + | |
| 11 | +class PRDSkill(Skill): | |
| 12 | + name = "prd" | |
| 13 | + description = "Generate a product requirements document (PRD) / feature spec" | |
| 14 | + | |
| 15 | + def execute(self, context: AgentContext, **kwargs) -> Artifact: | |
| 16 | + stats = context.query_engine.stats() | |
| 17 | + entities = context.query_engine.entities() | |
| 18 | + relationships = context.query_engine.relationships() | |
| 19 | + | |
| 20 | + relevant_types = {"requirement", "feature", "constraint"} | |
| 21 | + filtered = [ | |
| 22 | + e for e in context.planning_entities if getattr(e, "type", "").lower() in relevant_types | |
| 23 | + ] | |
| 24 | + | |
| 25 | + parts = [ | |
| 26 | + "You are a product manager. Using the following knowledge " | |
| 27 | + "graph context, generate a product requirements document.", | |
| 28 | + "", | |
| 29 | + "## Knowledge Graph Overview", | |
| 30 | + stats.to_text(), | |
| 31 | + "", | |
| 32 | + "## Entities", | |
| 33 | + entities.to_text(), | |
| 34 | + "", | |
| 35 | + "## Relationships", | |
| 36 | + relationships.to_text(), | |
| 37 | + "", | |
| 38 | + "## Relevant Planning Entities", | |
| 39 | + ] | |
| 40 | + for e in filtered: | |
| 41 | + parts.append(f"- [{getattr(e, 'type', 'unknown')}] {e}") | |
| 42 | + | |
| 43 | + if not filtered: | |
| 44 | + parts.append( | |
| 45 | + "(No pre-filtered entities; derive requirements from the full context above.)" | |
| 46 | + ) | |
| 47 | + | |
| 48 | + parts.append( | |
| 49 | + "\nGenerate a PRD with:\n" | |
| 50 | + "1. Problem Statement\n" | |
| 51 | + "2. User Stories\n" | |
| 52 | + "3. Functional Requirements\n" | |
| 53 | + "4. Non-Functional Requirements\n" | |
| 54 | + "5. Acceptance Criteria\n" | |
| 55 | + "6. Out of Scope\n\n" | |
| 56 | + "Return ONLY the markdown." | |
| 57 | + ) | |
| 58 | + | |
| 59 | + prompt = "\n".join(parts) | |
| 60 | + response = context.provider_manager.chat(messages=[{"role": "user", "content": prompt}]) | |
| 61 | + | |
| 62 | + return Artifact( | |
| 63 | + name="Product Requirements Document", | |
| 64 | + content=response, | |
| 65 | + artifact_type="prd", | |
| 66 | + format="markdown", | |
| 67 | + ) | |
| 68 | + | |
| 69 | + | |
| 70 | +register_skill(PRDSkill()) |
| --- a/video_processor/agent/skills/prd.py | |
| +++ b/video_processor/agent/skills/prd.py | |
| @@ -0,0 +1,70 @@ | |
| --- a/video_processor/agent/skills/prd.py | |
| +++ b/video_processor/agent/skills/prd.py | |
| @@ -0,0 +1,70 @@ | |
| 1 | """Skill: Generate a product requirements document (PRD) / feature spec.""" |
| 2 | |
| 3 | from video_processor.agent.skills.base import ( |
| 4 | AgentContext, |
| 5 | Artifact, |
| 6 | Skill, |
| 7 | register_skill, |
| 8 | ) |
| 9 | |
| 10 | |
class PRDSkill(Skill):
    """Generate a PRD from knowledge-graph context and planning entities."""

    name = "prd"
    description = "Generate a product requirements document (PRD) / feature spec"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Build a PRD prompt from graph context and return the model's markdown."""
        # Query-engine views are rendered up front, in a fixed order.
        overview_text = context.query_engine.stats().to_text()
        entities_text = context.query_engine.entities().to_text()
        relationships_text = context.query_engine.relationships().to_text()

        relevant_types = {"requirement", "feature", "constraint"}
        matched = [
            entity
            for entity in context.planning_entities
            if getattr(entity, "type", "").lower() in relevant_types
        ]

        sections = [
            "You are a product manager. Using the following knowledge "
            "graph context, generate a product requirements document.",
            "",
            "## Knowledge Graph Overview",
            overview_text,
            "",
            "## Entities",
            entities_text,
            "",
            "## Relationships",
            relationships_text,
            "",
            "## Relevant Planning Entities",
        ]
        sections.extend(
            f"- [{getattr(entity, 'type', 'unknown')}] {entity}" for entity in matched
        )
        if not matched:
            sections.append(
                "(No pre-filtered entities; derive requirements from the full context above.)"
            )

        sections.append(
            "\nGenerate a PRD with:\n"
            "1. Problem Statement\n"
            "2. User Stories\n"
            "3. Functional Requirements\n"
            "4. Non-Functional Requirements\n"
            "5. Acceptance Criteria\n"
            "6. Out of Scope\n\n"
            "Return ONLY the markdown."
        )

        reply = context.provider_manager.chat(
            messages=[{"role": "user", "content": "\n".join(sections)}]
        )

        return Artifact(
            name="Product Requirements Document",
            content=reply,
            artifact_type="prd",
            format="markdown",
        )


register_skill(PRDSkill())
| --- a/video_processor/agent/skills/project_plan.py | ||
| +++ b/video_processor/agent/skills/project_plan.py | ||
| @@ -0,0 +1,74 @@ | ||
| 1 | +"""Skill: Generate a structured project plan from knowledge graph.""" | |
| 2 | + | |
| 3 | +from video_processor.agent.skills.base import ( | |
| 4 | + AgentContext, | |
| 5 | + Artifact, | |
| 6 | + Skill, | |
| 7 | + register_skill, | |
| 8 | +) | |
| 9 | + | |
| 10 | + | |
| 11 | +def _group_entities_by_type(entities): | |
| 12 | + """Group planning entities by their type.""" | |
| 13 | + grouped = {} | |
| 14 | + for e in entities: | |
| 15 | + etype = getattr(e, "type", "unknown") | |
| 16 | + grouped.setdefault(etype, []).append(e) | |
| 17 | + return grouped | |
| 18 | + | |
| 19 | + | |
| 20 | +class ProjectPlanSkill(Skill): | |
| 21 | + name = "project_plan" | |
| 22 | + description = "Generate a structured project plan from knowledge graph" | |
| 23 | + | |
| 24 | + def execute(self, context: AgentContext, **kwargs) -> Artifact: | |
| 25 | + stats = context.query_engine.stats() | |
| 26 | + entities = context.query_engine.entities() | |
| 27 | + relationships = context.query_engine.relationships() | |
| 28 | + grouped = _group_entities_by_type(context.planning_entities) | |
| 29 | + | |
| 30 | + parts = [ | |
| 31 | + "You are a project planning expert. Using the following " | |
| 32 | + "knowledge graph context, generate a comprehensive " | |
| 33 | + "project plan in markdown.", | |
| 34 | + "", | |
| 35 | + "## Knowledge Graph Overview", | |
| 36 | + stats.to_text(), | |
| 37 | + "", | |
| 38 | + "## Entities", | |
| 39 | + entities.to_text(), | |
| 40 | + "", | |
| 41 | + "## Relationships", | |
| 42 | + relationships.to_text(), | |
| 43 | + "", | |
| 44 | + "## Planning Entities (by type)", | |
| 45 | + ] | |
| 46 | + for etype, elist in grouped.items(): | |
| 47 | + parts.append(f"\n### {etype}") | |
| 48 | + for e in elist: | |
| 49 | + parts.append(f"- {e}") | |
| 50 | + | |
| 51 | + parts.append( | |
| 52 | + "\nGenerate a markdown project plan with:\n" | |
| 53 | + "1. Executive Summary\n" | |
| 54 | + "2. Goals & Objectives\n" | |
| 55 | + "3. Scope\n" | |
| 56 | + "4. Phases & Milestones\n" | |
| 57 | + "5. Resource Requirements\n" | |
| 58 | + "6. Risks & Mitigations\n" | |
| 59 | + "7. Success Criteria\n\n" | |
| 60 | + "Return ONLY the markdown." | |
| 61 | + ) | |
| 62 | + | |
| 63 | + prompt = "\n".join(parts) | |
| 64 | + response = context.provider_manager.chat(messages=[{"role": "user", "content": prompt}]) | |
| 65 | + | |
| 66 | + return Artifact( | |
| 67 | + name="Project Plan", | |
| 68 | + content=response, | |
| 69 | + artifact_type="project_plan", | |
| 70 | + format="markdown", | |
| 71 | + ) | |
| 72 | + | |
| 73 | + | |
| 74 | +register_skill(ProjectPlanSkill()) |
| --- a/video_processor/agent/skills/project_plan.py | |
| +++ b/video_processor/agent/skills/project_plan.py | |
| @@ -0,0 +1,74 @@ | |
| --- a/video_processor/agent/skills/project_plan.py | |
| +++ b/video_processor/agent/skills/project_plan.py | |
| @@ -0,0 +1,74 @@ | |
| 1 | """Skill: Generate a structured project plan from knowledge graph.""" |
| 2 | |
| 3 | from video_processor.agent.skills.base import ( |
| 4 | AgentContext, |
| 5 | Artifact, |
| 6 | Skill, |
| 7 | register_skill, |
| 8 | ) |
| 9 | |
| 10 | |
| 11 | def _group_entities_by_type(entities): |
| 12 | """Group planning entities by their type.""" |
| 13 | grouped = {} |
| 14 | for e in entities: |
| 15 | etype = getattr(e, "type", "unknown") |
| 16 | grouped.setdefault(etype, []).append(e) |
| 17 | return grouped |
| 18 | |
| 19 | |
class ProjectPlanSkill(Skill):
    """Turn knowledge-graph context into a comprehensive markdown project plan."""

    name = "project_plan"
    description = "Generate a structured project plan from knowledge graph"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Assemble a planning prompt and return the model's markdown plan."""
        # Render the query-engine views first, in a fixed order.
        overview_text = context.query_engine.stats().to_text()
        entities_text = context.query_engine.entities().to_text()
        relationships_text = context.query_engine.relationships().to_text()
        by_type = _group_entities_by_type(context.planning_entities)

        sections = [
            "You are a project planning expert. Using the following "
            "knowledge graph context, generate a comprehensive "
            "project plan in markdown.",
            "",
            "## Knowledge Graph Overview",
            overview_text,
            "",
            "## Entities",
            entities_text,
            "",
            "## Relationships",
            relationships_text,
            "",
            "## Planning Entities (by type)",
        ]
        for entity_type, members in by_type.items():
            sections.append(f"\n### {entity_type}")
            sections.extend(f"- {member}" for member in members)

        sections.append(
            "\nGenerate a markdown project plan with:\n"
            "1. Executive Summary\n"
            "2. Goals & Objectives\n"
            "3. Scope\n"
            "4. Phases & Milestones\n"
            "5. Resource Requirements\n"
            "6. Risks & Mitigations\n"
            "7. Success Criteria\n\n"
            "Return ONLY the markdown."
        )

        reply = context.provider_manager.chat(
            messages=[{"role": "user", "content": "\n".join(sections)}]
        )

        return Artifact(
            name="Project Plan",
            content=reply,
            artifact_type="project_plan",
            format="markdown",
        )


register_skill(ProjectPlanSkill())
| --- a/video_processor/agent/skills/requirements_chat.py | ||
| +++ b/video_processor/agent/skills/requirements_chat.py | ||
| @@ -0,0 +1,95 @@ | ||
| 1 | +"""Skill: Interactive requirements gathering via guided questions.""" | |
| 2 | + | |
| 3 | +import json | |
| 4 | + | |
| 5 | +from video_processor.agent.skills.base import ( | |
| 6 | + AgentContext, | |
| 7 | + Artifact, | |
| 8 | + Skill, | |
| 9 | + register_skill, | |
| 10 | +) | |
| 11 | +from video_processor.utils.json_parsing import parse_json_from_response | |
| 12 | + | |
| 13 | + | |
| 14 | +class RequirementsChatSkill(Skill): | |
| 15 | + name = "requirements_chat" | |
| 16 | + description = "Interactive requirements gathering via guided questions" | |
| 17 | + | |
| 18 | + def execute(self, context: AgentContext, **kwargs) -> Artifact: | |
| 19 | + """Generate a structured requirements questionnaire.""" | |
| 20 | + stats = context.query_engine.stats() | |
| 21 | + entities = context.query_engine.entities() | |
| 22 | + | |
| 23 | + parts = [ | |
| 24 | + "You are a requirements analyst. Based on the following " | |
| 25 | + "knowledge graph context, generate a requirements " | |
| 26 | + "gathering questionnaire.", | |
| 27 | + "", | |
| 28 | + "## Knowledge Graph Overview", | |
| 29 | + stats.to_text(), | |
| 30 | + "", | |
| 31 | + "## Entities", | |
| 32 | + entities.to_text(), | |
| 33 | + "", | |
| 34 | + "## Planning Entities", | |
| 35 | + ] | |
| 36 | + for e in context.planning_entities: | |
| 37 | + parts.append(f"- {e}") | |
| 38 | + | |
| 39 | + parts.append( | |
| 40 | + '\nGenerate a JSON object with a "questions" array. ' | |
| 41 | + "Each question should have:\n" | |
| 42 | + '- "id": string (e.g. "Q1")\n' | |
| 43 | + '- "category": "goals"|"constraints"|"priorities"|"scope"\n' | |
| 44 | + '- "question": string\n' | |
| 45 | + '- "context": string (why this matters)\n\n' | |
| 46 | + "Include 8-12 targeted questions.\n\n" | |
| 47 | + "Return ONLY the JSON." | |
| 48 | + ) | |
| 49 | + | |
| 50 | + prompt = "\n".join(parts) | |
| 51 | + response = context.provider_manager.chat(messages=[{"role": "user", "content": prompt}]) | |
| 52 | + parsed = parse_json_from_response(response) | |
| 53 | + content = json.dumps(parsed, indent=2) if not isinstance(parsed, str) else parsed | |
| 54 | + | |
| 55 | + return Artifact( | |
| 56 | + name="Requirements Questionnaire", | |
| 57 | + content=content, | |
| 58 | + artifact_type="requirements", | |
| 59 | + format="json", | |
| 60 | + metadata={"stage": "questionnaire"}, | |
| 61 | + ) | |
| 62 | + | |
| 63 | + def gather_requirements(self, context: AgentContext, answers: dict) -> dict: | |
| 64 | + """Take Q&A pairs and synthesize structured requirements.""" | |
| 65 | + stats = context.query_engine.stats() | |
| 66 | + | |
| 67 | + qa_text = "" | |
| 68 | + for qid, answer in answers.items(): | |
| 69 | + qa_text += f"- {qid}: {answer}\n" | |
| 70 | + | |
| 71 | + parts = [ | |
| 72 | + "You are a requirements analyst. Based on the knowledge " | |
| 73 | + "graph context and the answered questions, synthesize " | |
| 74 | + "structured requirements.", | |
| 75 | + "", | |
| 76 | + "## Knowledge Graph Overview", | |
| 77 | + stats.to_text(), | |
| 78 | + "", | |
| 79 | + "## Answers", | |
| 80 | + qa_text, | |
| 81 | + "Return a JSON object with:\n" | |
| 82 | + '- "goals": list of goal strings\n' | |
| 83 | + '- "constraints": list of constraint strings\n' | |
| 84 | + '- "priorities": list (ordered high to low)\n' | |
| 85 | + '- "scope": {"in_scope": [...], "out_of_scope": [...]}\n\n' | |
| 86 | + "Return ONLY the JSON.", | |
| 87 | + ] | |
| 88 | + | |
| 89 | + prompt = "\n".join(parts) | |
| 90 | + response = context.provider_manager.chat(messages=[{"role": "user", "content": prompt}]) | |
| 91 | + result = parse_json_from_response(response) | |
| 92 | + return result if isinstance(result, dict) else {"raw": result} | |
| 93 | + | |
| 94 | + | |
| 95 | +register_skill(RequirementsChatSkill()) |
| --- a/video_processor/agent/skills/requirements_chat.py | |
| +++ b/video_processor/agent/skills/requirements_chat.py | |
| @@ -0,0 +1,95 @@ | |
| --- a/video_processor/agent/skills/requirements_chat.py | |
| +++ b/video_processor/agent/skills/requirements_chat.py | |
| @@ -0,0 +1,95 @@ | |
| 1 | """Skill: Interactive requirements gathering via guided questions.""" |
| 2 | |
| 3 | import json |
| 4 | |
| 5 | from video_processor.agent.skills.base import ( |
| 6 | AgentContext, |
| 7 | Artifact, |
| 8 | Skill, |
| 9 | register_skill, |
| 10 | ) |
| 11 | from video_processor.utils.json_parsing import parse_json_from_response |
| 12 | |
| 13 | |
class RequirementsChatSkill(Skill):
    """Guide requirements gathering: emit a questionnaire, then synthesize answers."""

    name = "requirements_chat"
    description = "Interactive requirements gathering via guided questions"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Generate a structured requirements questionnaire."""
        # Render the query-engine views first, in a fixed order.
        overview_text = context.query_engine.stats().to_text()
        entities_text = context.query_engine.entities().to_text()

        prompt_lines = [
            "You are a requirements analyst. Based on the following "
            "knowledge graph context, generate a requirements "
            "gathering questionnaire.",
            "",
            "## Knowledge Graph Overview",
            overview_text,
            "",
            "## Entities",
            entities_text,
            "",
            "## Planning Entities",
        ]
        prompt_lines.extend(f"- {entity}" for entity in context.planning_entities)

        prompt_lines.append(
            '\nGenerate a JSON object with a "questions" array. '
            "Each question should have:\n"
            '- "id": string (e.g. "Q1")\n'
            '- "category": "goals"|"constraints"|"priorities"|"scope"\n'
            '- "question": string\n'
            '- "context": string (why this matters)\n\n'
            "Include 8-12 targeted questions.\n\n"
            "Return ONLY the JSON."
        )

        reply = context.provider_manager.chat(
            messages=[{"role": "user", "content": "\n".join(prompt_lines)}]
        )
        parsed = parse_json_from_response(reply)
        # Non-string results (dict/list) are re-serialized for storage.
        body = parsed if isinstance(parsed, str) else json.dumps(parsed, indent=2)

        return Artifact(
            name="Requirements Questionnaire",
            content=body,
            artifact_type="requirements",
            format="json",
            metadata={"stage": "questionnaire"},
        )

    def gather_requirements(self, context: AgentContext, answers: dict) -> dict:
        """Take Q&A pairs and synthesize structured requirements."""
        overview_text = context.query_engine.stats().to_text()

        qa_text = "".join(f"- {qid}: {answer}\n" for qid, answer in answers.items())

        prompt_lines = [
            "You are a requirements analyst. Based on the knowledge "
            "graph context and the answered questions, synthesize "
            "structured requirements.",
            "",
            "## Knowledge Graph Overview",
            overview_text,
            "",
            "## Answers",
            qa_text,
            "Return a JSON object with:\n"
            '- "goals": list of goal strings\n'
            '- "constraints": list of constraint strings\n'
            '- "priorities": list (ordered high to low)\n'
            '- "scope": {"in_scope": [...], "out_of_scope": [...]}\n\n'
            "Return ONLY the JSON.",
        ]

        reply = context.provider_manager.chat(
            messages=[{"role": "user", "content": "\n".join(prompt_lines)}]
        )
        synthesized = parse_json_from_response(reply)
        # Fall back to wrapping non-dict results so callers always get a dict.
        return synthesized if isinstance(synthesized, dict) else {"raw": synthesized}


register_skill(RequirementsChatSkill())
| --- a/video_processor/agent/skills/roadmap.py | ||
| +++ b/video_processor/agent/skills/roadmap.py | ||
| @@ -0,0 +1,68 @@ | ||
| 1 | +"""Skill: Generate a product/project roadmap.""" | |
| 2 | + | |
| 3 | +from video_processor.agent.skills.base import ( | |
| 4 | + AgentContext, | |
| 5 | + Artifact, | |
| 6 | + Skill, | |
| 7 | + register_skill, | |
| 8 | +) | |
| 9 | + | |
| 10 | + | |
| 11 | +class RoadmapSkill(Skill): | |
| 12 | + name = "roadmap" | |
| 13 | + description = "Generate a product/project roadmap" | |
| 14 | + | |
| 15 | + def execute(self, context: AgentContext, **kwargs) -> Artifact: | |
| 16 | + stats = context.query_engine.stats() | |
| 17 | + entities = context.query_engine.entities() | |
| 18 | + relationships = context.query_engine.relationships() | |
| 19 | + | |
| 20 | + roadmap_types = {"milestone", "feature", "dependency"} | |
| 21 | + relevant = [ | |
| 22 | + e for e in context.planning_entities if getattr(e, "type", "").lower() in roadmap_types | |
| 23 | + ] | |
| 24 | + | |
| 25 | + parts = [ | |
| 26 | + "You are a product strategist. Using the following " | |
| 27 | + "knowledge graph context, generate a product roadmap.", | |
| 28 | + "", | |
| 29 | + "## Knowledge Graph Overview", | |
| 30 | + stats.to_text(), | |
| 31 | + "", | |
| 32 | + "## Entities", | |
| 33 | + entities.to_text(), | |
| 34 | + "", | |
| 35 | + "## Relationships", | |
| 36 | + relationships.to_text(), | |
| 37 | + "", | |
| 38 | + "## Milestones, Features & Dependencies", | |
| 39 | + ] | |
| 40 | + for e in relevant: | |
| 41 | + parts.append(f"- [{getattr(e, 'type', 'unknown')}] {e}") | |
| 42 | + | |
| 43 | + if not relevant: | |
| 44 | + parts.append( | |
| 45 | + "(No pre-filtered entities; derive roadmap items from the full context above.)" | |
| 46 | + ) | |
| 47 | + | |
| 48 | + parts.append( | |
| 49 | + "\nGenerate a markdown roadmap with:\n" | |
| 50 | + "1. Vision & Strategy\n" | |
| 51 | + "2. Phases (with timeline estimates)\n" | |
| 52 | + "3. Key Dependencies\n" | |
| 53 | + "4. A Mermaid Gantt chart summarizing the timeline\n\n" | |
| 54 | + "Return ONLY the markdown." | |
| 55 | + ) | |
| 56 | + | |
| 57 | + prompt = "\n".join(parts) | |
| 58 | + response = context.provider_manager.chat(messages=[{"role": "user", "content": prompt}]) | |
| 59 | + | |
| 60 | + return Artifact( | |
| 61 | + name="Roadmap", | |
| 62 | + content=response, | |
| 63 | + artifact_type="roadmap", | |
| 64 | + format="markdown", | |
| 65 | + ) | |
| 66 | + | |
| 67 | + | |
| 68 | +register_skill(RoadmapSkill()) |
| --- a/video_processor/agent/skills/roadmap.py | |
| +++ b/video_processor/agent/skills/roadmap.py | |
| @@ -0,0 +1,68 @@ | |
| --- a/video_processor/agent/skills/roadmap.py | |
| +++ b/video_processor/agent/skills/roadmap.py | |
| @@ -0,0 +1,68 @@ | |
| 1 | """Skill: Generate a product/project roadmap.""" |
| 2 | |
| 3 | from video_processor.agent.skills.base import ( |
| 4 | AgentContext, |
| 5 | Artifact, |
| 6 | Skill, |
| 7 | register_skill, |
| 8 | ) |
| 9 | |
| 10 | |
class RoadmapSkill(Skill):
    """Produce a product roadmap (phases, dependencies, Gantt) from graph context."""

    name = "roadmap"
    description = "Generate a product/project roadmap"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Assemble a roadmap prompt and return the model's markdown roadmap."""
        # Render the query-engine views first, in a fixed order.
        overview_text = context.query_engine.stats().to_text()
        entities_text = context.query_engine.entities().to_text()
        relationships_text = context.query_engine.relationships().to_text()

        roadmap_types = {"milestone", "feature", "dependency"}
        matched = [
            entity
            for entity in context.planning_entities
            if getattr(entity, "type", "").lower() in roadmap_types
        ]

        sections = [
            "You are a product strategist. Using the following "
            "knowledge graph context, generate a product roadmap.",
            "",
            "## Knowledge Graph Overview",
            overview_text,
            "",
            "## Entities",
            entities_text,
            "",
            "## Relationships",
            relationships_text,
            "",
            "## Milestones, Features & Dependencies",
        ]
        sections.extend(
            f"- [{getattr(entity, 'type', 'unknown')}] {entity}" for entity in matched
        )
        if not matched:
            sections.append(
                "(No pre-filtered entities; derive roadmap items from the full context above.)"
            )

        sections.append(
            "\nGenerate a markdown roadmap with:\n"
            "1. Vision & Strategy\n"
            "2. Phases (with timeline estimates)\n"
            "3. Key Dependencies\n"
            "4. A Mermaid Gantt chart summarizing the timeline\n\n"
            "Return ONLY the markdown."
        )

        reply = context.provider_manager.chat(
            messages=[{"role": "user", "content": "\n".join(sections)}]
        )

        return Artifact(
            name="Roadmap",
            content=reply,
            artifact_type="roadmap",
            format="markdown",
        )


register_skill(RoadmapSkill())
| --- a/video_processor/agent/skills/task_breakdown.py | ||
| +++ b/video_processor/agent/skills/task_breakdown.py | ||
| @@ -0,0 +1,76 @@ | ||
"""Skill: Break down goals into tasks with dependencies."""

import json

from video_processor.agent.skills.base import (
    AgentContext,
    Artifact,
    Skill,
    register_skill,
)
from video_processor.utils.json_parsing import parse_json_from_response
| 10 | + | |
| 11 | + | |
class TaskBreakdownSkill(Skill):
    """Decompose goals, features and milestones into dependency-ordered tasks.

    Returns a JSON task list; each task carries id, title, description,
    ``depends_on`` links, priority, estimate and assignee role.
    """

    name = "task_breakdown"
    description = "Break down goals into tasks with dependencies"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Build a decomposition prompt, query the provider and parse the JSON reply.

        Returns:
            Artifact with JSON content; ``metadata["tasks"]`` holds the parsed
            task list ([] when the reply could not be parsed as a list).
        """
        stats = context.query_engine.stats()
        entities = context.query_engine.entities()
        relationships = context.query_engine.relationships()

        task_types = {"goal", "feature", "milestone"}
        relevant = [
            e for e in context.planning_entities if getattr(e, "type", "").lower() in task_types
        ]

        parts = [
            "You are a project manager. Using the following knowledge "
            "graph context, decompose goals and features into tasks.",
            "",
            "## Knowledge Graph Overview",
            stats.to_text(),
            "",
            "## Entities",
            entities.to_text(),
            "",
            "## Relationships",
            relationships.to_text(),
            "",
            "## Goals, Features & Milestones",
        ]
        for e in relevant:
            parts.append(f"- [{getattr(e, 'type', 'unknown')}] {e}")

        if not relevant:
            parts.append("(No pre-filtered entities; derive tasks from the full context above.)")

        parts.append(
            "\nReturn a JSON array of task objects with:\n"
            '- "id": string (e.g. "T1", "T2")\n'
            '- "title": string\n'
            '- "description": string\n'
            '- "depends_on": list of task id strings\n'
            '- "priority": "high" | "medium" | "low"\n'
            '- "estimate": string (e.g. "2d", "1w")\n'
            '- "assignee_role": string\n\n'
            "Return ONLY the JSON."
        )

        prompt = "\n".join(parts)
        response = context.provider_manager.chat(messages=[{"role": "user", "content": prompt}])
        parsed = parse_json_from_response(response)

        # `json` is imported at module level (was a mid-function import).
        # NOTE(review): when parsing fails, the raw response is stored even
        # though format still claims "json" — confirm consumers tolerate that.
        content = json.dumps(parsed, indent=2) if isinstance(parsed, list) else response

        return Artifact(
            name="Task Breakdown",
            content=content,
            artifact_type="task_list",
            format="json",
            metadata={"tasks": parsed if isinstance(parsed, list) else []},
        )


register_skill(TaskBreakdownSkill())
| --- a/video_processor/agent/skills/task_breakdown.py | |
| +++ b/video_processor/agent/skills/task_breakdown.py | |
| @@ -0,0 +1,76 @@ | |
| --- a/video_processor/agent/skills/task_breakdown.py | |
| +++ b/video_processor/agent/skills/task_breakdown.py | |
| @@ -0,0 +1,76 @@ | |
| 1 | """Skill: Break down goals into tasks with dependencies.""" |
| 2 | |
| 3 | from video_processor.agent.skills.base import ( |
| 4 | AgentContext, |
| 5 | Artifact, |
| 6 | Skill, |
| 7 | register_skill, |
| 8 | ) |
| 9 | from video_processor.utils.json_parsing import parse_json_from_response |
| 10 | |
| 11 | |
class TaskBreakdownSkill(Skill):
    """Decompose goals, features, and milestones into a dependency-aware task list.

    Builds a project-manager prompt from the knowledge-graph context, asks the
    LLM provider for a JSON array of task objects, and wraps the result in an
    Artifact.
    """

    name = "task_breakdown"
    description = "Break down goals into tasks with dependencies"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Run the breakdown.

        Args:
            context: Agent context supplying the knowledge-graph query engine,
                pre-filtered planning entities, and the LLM provider manager.

        Returns:
            Artifact (type "task_list", format "json"). ``metadata["tasks"]``
            holds the parsed task list, or ``[]`` if the LLM response was not
            a JSON array (in which case ``content`` is the raw response text).
        """
        import json  # hoisted to the top of the function (was mid-function, after the LLM call)

        stats = context.query_engine.stats()
        entities = context.query_engine.entities()
        relationships = context.query_engine.relationships()

        # Only goal-like entity types are surfaced explicitly in the prompt.
        task_types = {"goal", "feature", "milestone"}
        relevant = [
            e for e in context.planning_entities if getattr(e, "type", "").lower() in task_types
        ]

        parts = [
            "You are a project manager. Using the following knowledge "
            "graph context, decompose goals and features into tasks.",
            "",
            "## Knowledge Graph Overview",
            stats.to_text(),
            "",
            "## Entities",
            entities.to_text(),
            "",
            "## Relationships",
            relationships.to_text(),
            "",
            "## Goals, Features & Milestones",
        ]
        for e in relevant:
            parts.append(f"- [{getattr(e, 'type', 'unknown')}] {e}")

        if not relevant:
            parts.append("(No pre-filtered entities; derive tasks from the full context above.)")

        # Spell out the exact JSON schema so the response is machine-parseable.
        parts.append(
            "\nReturn a JSON array of task objects with:\n"
            '- "id": string (e.g. "T1", "T2")\n'
            '- "title": string\n'
            '- "description": string\n'
            '- "depends_on": list of task id strings\n'
            '- "priority": "high" | "medium" | "low"\n'
            '- "estimate": string (e.g. "2d", "1w")\n'
            '- "assignee_role": string\n\n'
            "Return ONLY the JSON."
        )

        prompt = "\n".join(parts)
        response = context.provider_manager.chat(messages=[{"role": "user", "content": prompt}])
        parsed = parse_json_from_response(response)

        # Re-serialize the parsed list for stable pretty-printed output; fall
        # back to the raw response when parsing did not yield a list.
        content = json.dumps(parsed, indent=2) if isinstance(parsed, list) else response

        return Artifact(
            name="Task Breakdown",
            content=content,
            artifact_type="task_list",
            format="json",
            metadata={"tasks": parsed if isinstance(parsed, list) else []},
        )
| 74 | |
| 75 | |
# Module import side effect: register the skill so the global registry can discover it.
register_skill(TaskBreakdownSkill())
| --- a/video_processor/agent/skills/wiki_generator.py | ||
| +++ b/video_processor/agent/skills/wiki_generator.py | ||
| @@ -0,0 +1,315 @@ | ||
| 1 | +"""Skill: Generate a GitHub wiki from knowledge graph and artifacts.""" | |
| 2 | + | |
| 3 | +import json | |
| 4 | +import logging | |
| 5 | +import subprocess | |
| 6 | +from pathlib import Path | |
| 7 | +from typing import Dict, List, Optional | |
| 8 | + | |
| 9 | +from video_processor.agent.skills.base import ( | |
| 10 | + AgentContext, | |
| 11 | + Artifact, | |
| 12 | + Skill, | |
| 13 | + register_skill, | |
| 14 | +) | |
| 15 | + | |
| 16 | +logger = logging.getLogger(__name__) | |
| 17 | + | |
| 18 | + | |
| 19 | +def _sanitize_filename(name: str) -> str: | |
| 20 | + """Convert entity name to a wiki-safe filename.""" | |
| 21 | + return name.replace("/", "-").replace("\\", "-").replace(" ", "-").replace(".", "-") | |
| 22 | + | |
| 23 | + | |
| 24 | +def _wiki_link(name: str) -> str: | |
| 25 | + """Create a GitHub wiki-style markdown link.""" | |
| 26 | + safe = _sanitize_filename(name) | |
| 27 | + return f"[{name}]({safe})" | |
| 28 | + | |
| 29 | + | |
| 30 | +def generate_wiki( | |
| 31 | + kg_data: dict, | |
| 32 | + artifacts: Optional[List[Artifact]] = None, | |
| 33 | + title: str = "Knowledge Base", | |
| 34 | +) -> Dict[str, str]: | |
| 35 | + """Generate a dict of {filename: markdown_content} for a GitHub wiki. | |
| 36 | + | |
| 37 | + Returns pages for: Home, _Sidebar, entity type indexes, individual | |
| 38 | + entity pages, and any planning artifacts. | |
| 39 | + """ | |
| 40 | + pages: Dict[str, str] = {} | |
| 41 | + artifacts = artifacts or [] | |
| 42 | + | |
| 43 | + nodes = kg_data.get("nodes", []) | |
| 44 | + relationships = kg_data.get("relationships", []) | |
| 45 | + | |
| 46 | + # Group entities by type | |
| 47 | + by_type: Dict[str, list] = {} | |
| 48 | + node_lookup: Dict[str, dict] = {} | |
| 49 | + for node in nodes: | |
| 50 | + name = node.get("name", node.get("id", "")) | |
| 51 | + ntype = node.get("type", "concept") | |
| 52 | + by_type.setdefault(ntype, []).append(node) | |
| 53 | + node_lookup[name.lower()] = node | |
| 54 | + | |
| 55 | + # Build relationship index (outgoing and incoming per entity) | |
| 56 | + outgoing: Dict[str, list] = {} | |
| 57 | + incoming: Dict[str, list] = {} | |
| 58 | + for rel in relationships: | |
| 59 | + src = rel.get("source", "") | |
| 60 | + tgt = rel.get("target", "") | |
| 61 | + rtype = rel.get("type", "related_to") | |
| 62 | + outgoing.setdefault(src, []).append((tgt, rtype)) | |
| 63 | + incoming.setdefault(tgt, []).append((src, rtype)) | |
| 64 | + | |
| 65 | + # --- Home page --- | |
| 66 | + home_parts = [ | |
| 67 | + f"# {title}", | |
| 68 | + "", | |
| 69 | + f"**{len(nodes)}** entities | **{len(relationships)}** relationships", | |
| 70 | + "", | |
| 71 | + "## Entity Types", | |
| 72 | + "", | |
| 73 | + ] | |
| 74 | + for etype, elist in sorted(by_type.items()): | |
| 75 | + home_parts.append(f"- {_wiki_link(etype.title())} ({len(elist)})") | |
| 76 | + | |
| 77 | + if artifacts: | |
| 78 | + home_parts.append("") | |
| 79 | + home_parts.append("## Planning Artifacts") | |
| 80 | + home_parts.append("") | |
| 81 | + for art in artifacts: | |
| 82 | + safe = _sanitize_filename(art.name) | |
| 83 | + home_parts.append(f"- [{art.name}]({safe})") | |
| 84 | + | |
| 85 | + pages["Home"] = "\n".join(home_parts) | |
| 86 | + | |
| 87 | + # --- Sidebar --- | |
| 88 | + sidebar_parts = [f"**{title}**", "", "**Navigation**", "", "- [Home](Home)", ""] | |
| 89 | + sidebar_parts.append("**Entity Types**") | |
| 90 | + sidebar_parts.append("") | |
| 91 | + for etype in sorted(by_type.keys()): | |
| 92 | + sidebar_parts.append(f"- {_wiki_link(etype.title())}") | |
| 93 | + | |
| 94 | + if artifacts: | |
| 95 | + sidebar_parts.append("") | |
| 96 | + sidebar_parts.append("**Artifacts**") | |
| 97 | + sidebar_parts.append("") | |
| 98 | + for art in artifacts: | |
| 99 | + safe = _sanitize_filename(art.name) | |
| 100 | + sidebar_parts.append(f"- [{art.name}]({safe})") | |
| 101 | + | |
| 102 | + pages["_Sidebar"] = "\n".join(sidebar_parts) | |
| 103 | + | |
| 104 | + # --- Type index pages --- | |
| 105 | + for etype, elist in sorted(by_type.items()): | |
| 106 | + page_name = _sanitize_filename(etype.title()) | |
| 107 | + parts = [ | |
| 108 | + f"# {etype.title()}", | |
| 109 | + "", | |
| 110 | + f"{len(elist)} entities of type **{etype}**.", | |
| 111 | + "", | |
| 112 | + "| Entity | Descriptions |", | |
| 113 | + "|--------|-------------|", | |
| 114 | + ] | |
| 115 | + for node in sorted(elist, key=lambda n: n.get("name", "")): | |
| 116 | + name = node.get("name", "") | |
| 117 | + descs = node.get("descriptions", []) | |
| 118 | + desc_text = "; ".join(descs[:2]) if descs else "—" | |
| 119 | + parts.append(f"| {_wiki_link(name)} | {desc_text} |") | |
| 120 | + | |
| 121 | + pages[page_name] = "\n".join(parts) | |
| 122 | + | |
| 123 | + # --- Individual entity pages --- | |
| 124 | + for node in nodes: | |
| 125 | + name = node.get("name", "") | |
| 126 | + if not name: | |
| 127 | + continue | |
| 128 | + ntype = node.get("type", "concept") | |
| 129 | + descs = node.get("descriptions", []) | |
| 130 | + page_name = _sanitize_filename(name) | |
| 131 | + | |
| 132 | + parts = [ | |
| 133 | + f"# {name}", | |
| 134 | + "", | |
| 135 | + f"**Type:** {ntype}", | |
| 136 | + "", | |
| 137 | + ] | |
| 138 | + | |
| 139 | + if descs: | |
| 140 | + parts.append("## Descriptions") | |
| 141 | + parts.append("") | |
| 142 | + for d in descs: | |
| 143 | + parts.append(f"- {d}") | |
| 144 | + parts.append("") | |
| 145 | + | |
| 146 | + # Outgoing relationships | |
| 147 | + outs = outgoing.get(name, []) | |
| 148 | + if outs: | |
| 149 | + parts.append("## Relationships") | |
| 150 | + parts.append("") | |
| 151 | + parts.append("| Target | Type |") | |
| 152 | + parts.append("|--------|------|") | |
| 153 | + for tgt, rtype in outs: | |
| 154 | + parts.append(f"| {_wiki_link(tgt)} | {rtype} |") | |
| 155 | + parts.append("") | |
| 156 | + | |
| 157 | + # Incoming relationships | |
| 158 | + ins = incoming.get(name, []) | |
| 159 | + if ins: | |
| 160 | + parts.append("## Referenced By") | |
| 161 | + parts.append("") | |
| 162 | + parts.append("| Source | Type |") | |
| 163 | + parts.append("|--------|------|") | |
| 164 | + for src, rtype in ins: | |
| 165 | + parts.append(f"| {_wiki_link(src)} | {rtype} |") | |
| 166 | + parts.append("") | |
| 167 | + | |
| 168 | + # Occurrences / sources | |
| 169 | + occs = node.get("occurrences", []) | |
| 170 | + if occs: | |
| 171 | + parts.append("## Sources") | |
| 172 | + parts.append("") | |
| 173 | + for occ in occs: | |
| 174 | + src = occ.get("source", "unknown") | |
| 175 | + ts = occ.get("timestamp", "") | |
| 176 | + text = occ.get("text", "") | |
| 177 | + line = f"- **{src}**" | |
| 178 | + if ts: | |
| 179 | + line += f" @ {ts}" | |
| 180 | + if text: | |
| 181 | + line += f": _{text}_" | |
| 182 | + parts.append(line) | |
| 183 | + parts.append("") | |
| 184 | + | |
| 185 | + pages[page_name] = "\n".join(parts) | |
| 186 | + | |
| 187 | + # --- Artifact pages --- | |
| 188 | + for art in artifacts: | |
| 189 | + page_name = _sanitize_filename(art.name) | |
| 190 | + if art.format == "json": | |
| 191 | + try: | |
| 192 | + data = json.loads(art.content) | |
| 193 | + content = f"```json\n{json.dumps(data, indent=2)}\n```" | |
| 194 | + except json.JSONDecodeError: | |
| 195 | + content = art.content | |
| 196 | + else: | |
| 197 | + content = art.content | |
| 198 | + | |
| 199 | + pages[page_name] = f"# {art.name}\n\n{content}" | |
| 200 | + | |
| 201 | + return pages | |
| 202 | + | |
| 203 | + | |
| 204 | +def write_wiki(pages: Dict[str, str], output_dir: Path) -> List[Path]: | |
| 205 | + """Write wiki pages to a directory as .md files.""" | |
| 206 | + output_dir.mkdir(parents=True, exist_ok=True) | |
| 207 | + paths = [] | |
| 208 | + for name, content in pages.items(): | |
| 209 | + path = output_dir / f"{name}.md" | |
| 210 | + path.write_text(content, encoding="utf-8") | |
| 211 | + paths.append(path) | |
| 212 | + return paths | |
| 213 | + | |
| 214 | + | |
| 215 | +def push_wiki(wiki_dir: Path, repo: str, message: str = "Update wiki") -> bool: | |
| 216 | + """Push wiki pages to a GitHub wiki repo. | |
| 217 | + | |
| 218 | + Clones the wiki repo, copies pages, commits and pushes. | |
| 219 | + The repo should be in 'owner/repo' format. | |
| 220 | + """ | |
| 221 | + wiki_url = f"https://github.com/{repo}.wiki.git" | |
| 222 | + | |
| 223 | + # Clone existing wiki (or init if empty) | |
| 224 | + clone_dir = wiki_dir / ".wiki_clone" | |
| 225 | + if clone_dir.exists(): | |
| 226 | + subprocess.run(["rm", "-rf", str(clone_dir)], check=True) | |
| 227 | + | |
| 228 | + result = subprocess.run( | |
| 229 | + ["git", "clone", wiki_url, str(clone_dir)], | |
| 230 | + capture_output=True, | |
| 231 | + text=True, | |
| 232 | + ) | |
| 233 | + | |
| 234 | + if result.returncode != 0: | |
| 235 | + # Wiki might not exist yet — init a new repo | |
| 236 | + clone_dir.mkdir(parents=True, exist_ok=True) | |
| 237 | + subprocess.run(["git", "init"], cwd=clone_dir, capture_output=True) | |
| 238 | + subprocess.run( | |
| 239 | + ["git", "remote", "add", "origin", wiki_url], | |
| 240 | + cwd=clone_dir, | |
| 241 | + capture_output=True, | |
| 242 | + ) | |
| 243 | + | |
| 244 | + # Copy wiki pages into clone | |
| 245 | + for md_file in wiki_dir.glob("*.md"): | |
| 246 | + if md_file.parent == wiki_dir: | |
| 247 | + dest = clone_dir / md_file.name | |
| 248 | + dest.write_text(md_file.read_text(encoding="utf-8"), encoding="utf-8") | |
| 249 | + | |
| 250 | + # Commit and push | |
| 251 | + subprocess.run(["git", "add", "-A"], cwd=clone_dir, capture_output=True) | |
| 252 | + commit_result = subprocess.run( | |
| 253 | + ["git", "commit", "-m", message], | |
| 254 | + cwd=clone_dir, | |
| 255 | + capture_output=True, | |
| 256 | + text=True, | |
| 257 | + ) | |
| 258 | + if commit_result.returncode != 0: | |
| 259 | + logger.info("No wiki changes to commit") | |
| 260 | + return True | |
| 261 | + | |
| 262 | + push_result = subprocess.run( | |
| 263 | + ["git", "push", "origin", "master"], | |
| 264 | + cwd=clone_dir, | |
| 265 | + capture_output=True, | |
| 266 | + text=True, | |
| 267 | + ) | |
| 268 | + if push_result.returncode != 0: | |
| 269 | + # Try main branch | |
| 270 | + push_result = subprocess.run( | |
| 271 | + ["git", "push", "origin", "main"], | |
| 272 | + cwd=clone_dir, | |
| 273 | + capture_output=True, | |
| 274 | + text=True, | |
| 275 | + ) | |
| 276 | + | |
| 277 | + if push_result.returncode == 0: | |
| 278 | + logger.info(f"Wiki pushed to {wiki_url}") | |
| 279 | + return True | |
| 280 | + else: | |
| 281 | + logger.error(f"Wiki push failed: {push_result.stderr}") | |
| 282 | + return False | |
| 283 | + | |
| 284 | + | |
| 285 | +class WikiGeneratorSkill(Skill): | |
| 286 | + name = "wiki_generator" | |
| 287 | + description = "Generate a GitHub wiki from knowledge graph and artifacts" | |
| 288 | + | |
| 289 | + def execute(self, context: AgentContext, **kwargs) -> Artifact: | |
| 290 | + kg_data = context.knowledge_graph.to_dict() | |
| 291 | + pages = generate_wiki( | |
| 292 | + kg_data, | |
| 293 | + artifacts=context.artifacts, | |
| 294 | + title=kwargs.get("title", "Knowledge Base"), | |
| 295 | + ) | |
| 296 | + | |
| 297 | + # Return a summary artifact; actual pages are written via write_wiki() | |
| 298 | + page_list = sorted(pages.keys()) | |
| 299 | + summary_parts = [ | |
| 300 | + f"Generated {len(pages)} wiki pages:", | |
| 301 | + "", | |
| 302 | + ] | |
| 303 | + for name in page_list: | |
| 304 | + summary_parts.append(f"- {name}.md") | |
| 305 | + | |
| 306 | + return Artifact( | |
| 307 | + name="Wiki", | |
| 308 | + content="\n".join(summary_parts), | |
| 309 | + artifact_type="wiki", | |
| 310 | + format="markdown", | |
| 311 | + metadata={"pages": pages}, | |
| 312 | + ) | |
| 313 | + | |
| 314 | + | |
| 315 | +register_skill(WikiGeneratorSkill()) |
| --- a/video_processor/agent/skills/wiki_generator.py | |
| +++ b/video_processor/agent/skills/wiki_generator.py | |
| @@ -0,0 +1,315 @@ | |
| --- a/video_processor/agent/skills/wiki_generator.py | |
| +++ b/video_processor/agent/skills/wiki_generator.py | |
| @@ -0,0 +1,315 @@ | |
| 1 | """Skill: Generate a GitHub wiki from knowledge graph and artifacts.""" |
| 2 | |
import json
import logging
import shutil
import subprocess
from pathlib import Path
from typing import Dict, List, Optional
| 8 | |
| 9 | from video_processor.agent.skills.base import ( |
| 10 | AgentContext, |
| 11 | Artifact, |
| 12 | Skill, |
| 13 | register_skill, |
| 14 | ) |
| 15 | |
| 16 | logger = logging.getLogger(__name__) |
| 17 | |
| 18 | |
| 19 | def _sanitize_filename(name: str) -> str: |
| 20 | """Convert entity name to a wiki-safe filename.""" |
| 21 | return name.replace("/", "-").replace("\\", "-").replace(" ", "-").replace(".", "-") |
| 22 | |
| 23 | |
def _wiki_link(name: str) -> str:
    """Render *name* as a GitHub-wiki markdown link to its sanitized page."""
    return f"[{name}]({_sanitize_filename(name)})"
| 28 | |
| 29 | |
def generate_wiki(
    kg_data: dict,
    artifacts: Optional[List[Artifact]] = None,
    title: str = "Knowledge Base",
) -> Dict[str, str]:
    """Generate a dict of {filename: markdown_content} for a GitHub wiki.

    Args:
        kg_data: Knowledge-graph dict with "nodes" and "relationships" lists
            (shape as produced by the knowledge graph's ``to_dict()``).
        artifacts: Optional planning artifacts rendered as extra pages.
        title: Heading used on the Home page and in the sidebar.

    Returns:
        Pages for: Home, _Sidebar, entity type indexes, individual entity
        pages, and any planning artifacts. Keys are page names without the
        ".md" extension.
    """
    pages: Dict[str, str] = {}
    artifacts = artifacts or []

    nodes = kg_data.get("nodes", [])
    relationships = kg_data.get("relationships", [])

    # Group entities by type. (An unused lowercase name->node lookup dict was
    # previously built in this loop as well; it was dead code and is removed.)
    by_type: Dict[str, list] = {}
    for node in nodes:
        ntype = node.get("type", "concept")
        by_type.setdefault(ntype, []).append(node)

    # Build relationship index (outgoing and incoming per entity)
    outgoing: Dict[str, list] = {}
    incoming: Dict[str, list] = {}
    for rel in relationships:
        src = rel.get("source", "")
        tgt = rel.get("target", "")
        rtype = rel.get("type", "related_to")
        outgoing.setdefault(src, []).append((tgt, rtype))
        incoming.setdefault(tgt, []).append((src, rtype))

    # --- Home page ---
    home_parts = [
        f"# {title}",
        "",
        f"**{len(nodes)}** entities | **{len(relationships)}** relationships",
        "",
        "## Entity Types",
        "",
    ]
    for etype, elist in sorted(by_type.items()):
        home_parts.append(f"- {_wiki_link(etype.title())} ({len(elist)})")

    if artifacts:
        home_parts.append("")
        home_parts.append("## Planning Artifacts")
        home_parts.append("")
        for art in artifacts:
            safe = _sanitize_filename(art.name)
            home_parts.append(f"- [{art.name}]({safe})")

    pages["Home"] = "\n".join(home_parts)

    # --- Sidebar ---
    sidebar_parts = [f"**{title}**", "", "**Navigation**", "", "- [Home](Home)", ""]
    sidebar_parts.append("**Entity Types**")
    sidebar_parts.append("")
    for etype in sorted(by_type.keys()):
        sidebar_parts.append(f"- {_wiki_link(etype.title())}")

    if artifacts:
        sidebar_parts.append("")
        sidebar_parts.append("**Artifacts**")
        sidebar_parts.append("")
        for art in artifacts:
            safe = _sanitize_filename(art.name)
            sidebar_parts.append(f"- [{art.name}]({safe})")

    pages["_Sidebar"] = "\n".join(sidebar_parts)

    # --- Type index pages ---
    for etype, elist in sorted(by_type.items()):
        page_name = _sanitize_filename(etype.title())
        parts = [
            f"# {etype.title()}",
            "",
            f"{len(elist)} entities of type **{etype}**.",
            "",
            "| Entity | Descriptions |",
            "|--------|-------------|",
        ]
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            descs = node.get("descriptions", [])
            # Show at most two descriptions to keep the table compact.
            desc_text = "; ".join(descs[:2]) if descs else "—"
            parts.append(f"| {_wiki_link(name)} | {desc_text} |")

        pages[page_name] = "\n".join(parts)

    # --- Individual entity pages ---
    for node in nodes:
        name = node.get("name", "")
        if not name:
            continue  # unnamed nodes get no page
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        page_name = _sanitize_filename(name)

        parts = [
            f"# {name}",
            "",
            f"**Type:** {ntype}",
            "",
        ]

        if descs:
            parts.append("## Descriptions")
            parts.append("")
            for d in descs:
                parts.append(f"- {d}")
            parts.append("")

        # Outgoing relationships
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            parts.append("| Target | Type |")
            parts.append("|--------|------|")
            for tgt, rtype in outs:
                parts.append(f"| {_wiki_link(tgt)} | {rtype} |")
            parts.append("")

        # Incoming relationships
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced By")
            parts.append("")
            parts.append("| Source | Type |")
            parts.append("|--------|------|")
            for src, rtype in ins:
                parts.append(f"| {_wiki_link(src)} | {rtype} |")
            parts.append("")

        # Occurrences / sources
        occs = node.get("occurrences", [])
        if occs:
            parts.append("## Sources")
            parts.append("")
            for occ in occs:
                src = occ.get("source", "unknown")
                ts = occ.get("timestamp", "")
                text = occ.get("text", "")
                line = f"- **{src}**"
                if ts:
                    line += f" @ {ts}"
                if text:
                    line += f": _{text}_"
                parts.append(line)
            parts.append("")

        pages[page_name] = "\n".join(parts)

    # --- Artifact pages ---
    for art in artifacts:
        page_name = _sanitize_filename(art.name)
        if art.format == "json":
            # Pretty-print valid JSON inside a fenced block; fall back to the
            # raw content if it does not parse.
            try:
                data = json.loads(art.content)
                content = f"```json\n{json.dumps(data, indent=2)}\n```"
            except json.JSONDecodeError:
                content = art.content
        else:
            content = art.content

        pages[page_name] = f"# {art.name}\n\n{content}"

    return pages
| 202 | |
| 203 | |
def write_wiki(pages: Dict[str, str], output_dir: Path) -> List[Path]:
    """Persist wiki *pages* into *output_dir*, one ``<name>.md`` file per page.

    Creates the directory (and parents) if needed; returns the written paths.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    def _write(page_name: str, markdown: str) -> Path:
        # One UTF-8 file per page, named after the (already sanitized) key.
        target = output_dir / f"{page_name}.md"
        target.write_text(markdown, encoding="utf-8")
        return target

    return [_write(page_name, markdown) for page_name, markdown in pages.items()]
| 213 | |
| 214 | |
def push_wiki(wiki_dir: Path, repo: str, message: str = "Update wiki") -> bool:
    """Push wiki pages to a GitHub wiki repo.

    Clones the ``<owner>/<repo>.wiki.git`` repository into a scratch dir,
    copies the top-level ``.md`` pages from *wiki_dir* into it, commits, and
    pushes (trying ``master`` first, then ``main``).

    Args:
        wiki_dir: Directory containing the generated ``.md`` pages.
        repo: GitHub repository in 'owner/repo' format.
        message: Commit message for the wiki update.

    Returns:
        True on success (including "nothing to commit"); False if push failed.
    """
    wiki_url = f"https://github.com/{repo}.wiki.git"

    # Start from a fresh scratch clone every time. shutil.rmtree is portable,
    # unlike the previous subprocess call to the Unix-only `rm -rf`.
    clone_dir = wiki_dir / ".wiki_clone"
    if clone_dir.exists():
        shutil.rmtree(clone_dir)

    result = subprocess.run(
        ["git", "clone", wiki_url, str(clone_dir)],
        capture_output=True,
        text=True,
    )

    if result.returncode != 0:
        # Wiki might not exist yet — init a new repo
        clone_dir.mkdir(parents=True, exist_ok=True)
        subprocess.run(["git", "init"], cwd=clone_dir, capture_output=True)
        subprocess.run(
            ["git", "remote", "add", "origin", wiki_url],
            cwd=clone_dir,
            capture_output=True,
        )

    # Copy top-level wiki pages into the clone. glob("*.md") is non-recursive,
    # so nothing inside .wiki_clone is ever copied back into itself.
    for md_file in wiki_dir.glob("*.md"):
        if md_file.parent == wiki_dir:
            dest = clone_dir / md_file.name
            dest.write_text(md_file.read_text(encoding="utf-8"), encoding="utf-8")

    # Commit; a non-zero exit from `git commit` means nothing changed.
    subprocess.run(["git", "add", "-A"], cwd=clone_dir, capture_output=True)
    commit_result = subprocess.run(
        ["git", "commit", "-m", message],
        cwd=clone_dir,
        capture_output=True,
        text=True,
    )
    if commit_result.returncode != 0:
        logger.info("No wiki changes to commit")
        return True

    # GitHub wikis historically default to `master`; fall back to `main`.
    push_result = subprocess.run(
        ["git", "push", "origin", "master"],
        cwd=clone_dir,
        capture_output=True,
        text=True,
    )
    if push_result.returncode != 0:
        push_result = subprocess.run(
            ["git", "push", "origin", "main"],
            cwd=clone_dir,
            capture_output=True,
            text=True,
        )

    if push_result.returncode == 0:
        # Lazy %-style args: the message is only formatted if the level is on.
        logger.info("Wiki pushed to %s", wiki_url)
        return True
    logger.error("Wiki push failed: %s", push_result.stderr)
    return False
| 283 | |
| 284 | |
class WikiGeneratorSkill(Skill):
    """Skill wrapper that renders the knowledge graph as GitHub wiki pages."""

    name = "wiki_generator"
    description = "Generate a GitHub wiki from knowledge graph and artifacts"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Generate wiki pages and return a summary Artifact.

        The rendered pages themselves are carried in ``metadata["pages"]``;
        callers persist them separately via write_wiki().
        """
        pages = generate_wiki(
            context.knowledge_graph.to_dict(),
            artifacts=context.artifacts,
            title=kwargs.get("title", "Knowledge Base"),
        )

        # Summary lists each generated page, alphabetically.
        summary = "\n".join(
            [f"Generated {len(pages)} wiki pages:", ""]
            + [f"- {page}.md" for page in sorted(pages)]
        )

        return Artifact(
            name="Wiki",
            content=summary,
            artifact_type="wiki",
            format="markdown",
            metadata={"pages": pages},
        )
| 313 | |
| 314 | |
# Module import side effect: register the skill so the global registry can discover it.
register_skill(WikiGeneratorSkill())
| --- a/video_processor/api/openapi_spec.py | ||
| +++ b/video_processor/api/openapi_spec.py | ||
| @@ -0,0 +1,183 @@ | ||
| 1 | +"""OpenAPI 3.0 specification stub for the PlanOpticon REST API.""" | |
| 2 | + | |
| 3 | + | |
def get_openapi_spec() -> dict:
    """Return an OpenAPI 3.0 spec dict for the planned PlanOpticon API."""

    def _path_id_param() -> dict:
        # Fresh dict per call so path operations never share mutable state.
        return {"name": "id", "in": "path", "required": True, "schema": {"type": "string"}}

    # --- Reusable component schemas ---
    job_schema = {
        "type": "object",
        "properties": {
            "id": {"type": "string"},
            "status": {
                "type": "string",
                "enum": ["pending", "processing", "completed", "failed"],
            },
            "progress": {"type": "number", "format": "float"},
            "created_at": {"type": "string", "format": "date-time"},
            "completed_at": {"type": "string", "format": "date-time"},
            "result_url": {"type": "string", "format": "uri"},
        },
    }
    entity_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "type": {"type": "string"},
            "descriptions": {"type": "array", "items": {"type": "string"}},
        },
    }
    relationship_schema = {
        "type": "object",
        "properties": {
            "source": {"type": "string"},
            "target": {"type": "string"},
            "type": {"type": "string"},
        },
    }

    # --- Request body for POST /analyze ---
    analyze_request_schema = {
        "type": "object",
        "required": ["video_url"],
        "properties": {
            "video_url": {"type": "string", "format": "uri"},
            "depth": {
                "type": "string",
                "enum": ["basic", "standard", "comprehensive"],
            },
            "focus_areas": {
                "type": "array",
                "items": {"type": "string"},
            },
            "webhook_url": {"type": "string", "format": "uri"},
            "speaker_hints": {
                "type": "array",
                "items": {"type": "string"},
            },
        },
    }

    paths = {
        "/analyze": {
            "post": {
                "summary": "Submit a video for analysis",
                "operationId": "createAnalysis",
                "requestBody": {
                    "required": True,
                    "content": {"application/json": {"schema": analyze_request_schema}},
                },
                "responses": {
                    "202": {
                        "description": "Analysis job accepted",
                        "content": {
                            "application/json": {"schema": {"$ref": "#/components/schemas/Job"}}
                        },
                    }
                },
            }
        },
        "/jobs/{id}": {
            "get": {
                "summary": "Get analysis job status",
                "operationId": "getJob",
                "parameters": [_path_id_param()],
                "responses": {
                    "200": {
                        "description": "Job status",
                        "content": {
                            "application/json": {"schema": {"$ref": "#/components/schemas/Job"}}
                        },
                    }
                },
            }
        },
        "/knowledge-graph/{id}/entities": {
            "get": {
                "summary": "List entities in a knowledge graph",
                "operationId": "listEntities",
                "parameters": [
                    _path_id_param(),
                    {"name": "type", "in": "query", "schema": {"type": "string"}},
                ],
                "responses": {
                    "200": {
                        "description": "Entity list",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "type": "array",
                                    "items": {"$ref": "#/components/schemas/Entity"},
                                }
                            }
                        },
                    }
                },
            }
        },
        "/knowledge-graph/{id}/relationships": {
            "get": {
                "summary": "List relationships in a knowledge graph",
                "operationId": "listRelationships",
                "parameters": [_path_id_param()],
                "responses": {
                    "200": {
                        "description": "Relationship list",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "type": "array",
                                    "items": {"$ref": "#/components/schemas/Relationship"},
                                }
                            }
                        },
                    }
                },
            }
        },
        "/knowledge-graph/{id}/query": {
            "get": {
                "summary": "Query the knowledge graph with natural language",
                "operationId": "queryKnowledgeGraph",
                "parameters": [
                    _path_id_param(),
                    {"name": "q", "in": "query", "required": True, "schema": {"type": "string"}},
                ],
                "responses": {
                    "200": {
                        "description": "Query results",
                        "content": {"application/json": {"schema": {"type": "object"}}},
                    }
                },
            }
        },
    }

    return {
        "openapi": "3.0.3",
        "info": {
            "title": "PlanOpticon API",
            "version": "0.1.0",
            "description": "Video analysis and knowledge extraction REST API.",
        },
        "paths": paths,
        "components": {
            "schemas": {
                "Job": job_schema,
                "Entity": entity_schema,
                "Relationship": relationship_schema,
            }
        },
    }
| --- a/video_processor/api/openapi_spec.py | |
| +++ b/video_processor/api/openapi_spec.py | |
| @@ -0,0 +1,183 @@ | |
| --- a/video_processor/api/openapi_spec.py | |
| +++ b/video_processor/api/openapi_spec.py | |
| @@ -0,0 +1,183 @@ | |
| 1 | """OpenAPI 3.0 specification stub for the PlanOpticon REST API.""" |
| 2 | |
| 3 | |
| 4 | def get_openapi_spec() -> dict: |
| 5 | """Return an OpenAPI 3.0 spec dict for the planned PlanOpticon API.""" |
| 6 | return { |
| 7 | "openapi": "3.0.3", |
| 8 | "info": { |
| 9 | "title": "PlanOpticon API", |
| 10 | "version": "0.1.0", |
| 11 | "description": "Video analysis and knowledge extraction REST API.", |
| 12 | }, |
| 13 | "paths": { |
| 14 | "/analyze": { |
| 15 | "post": { |
| 16 | "summary": "Submit a video for analysis", |
| 17 | "operationId": "createAnalysis", |
| 18 | "requestBody": { |
| 19 | "required": True, |
| 20 | "content": { |
| 21 | "application/json": { |
| 22 | "schema": { |
| 23 | "type": "object", |
| 24 | "required": ["video_url"], |
| 25 | "properties": { |
| 26 | "video_url": {"type": "string", "format": "uri"}, |
| 27 | "depth": { |
| 28 | "type": "string", |
| 29 | "enum": ["basic", "standard", "comprehensive"], |
| 30 | }, |
| 31 | "focus_areas": { |
| 32 | "type": "array", |
| 33 | "items": {"type": "string"}, |
| 34 | }, |
| 35 | "webhook_url": {"type": "string", "format": "uri"}, |
| 36 | "speaker_hints": { |
| 37 | "type": "array", |
| 38 | "items": {"type": "string"}, |
| 39 | }, |
| 40 | }, |
| 41 | } |
| 42 | } |
| 43 | }, |
| 44 | }, |
| 45 | "responses": { |
| 46 | "202": { |
| 47 | "description": "Analysis job accepted", |
| 48 | "content": { |
| 49 | "application/json": {"schema": {"$ref": "#/components/schemas/Job"}} |
| 50 | }, |
| 51 | } |
| 52 | }, |
| 53 | } |
| 54 | }, |
| 55 | "/jobs/{id}": { |
| 56 | "get": { |
| 57 | "summary": "Get analysis job status", |
| 58 | "operationId": "getJob", |
| 59 | "parameters": [ |
| 60 | {"name": "id", "in": "path", "required": True, "schema": {"type": "string"}} |
| 61 | ], |
| 62 | "responses": { |
| 63 | "200": { |
| 64 | "description": "Job status", |
| 65 | "content": { |
| 66 | "application/json": {"schema": {"$ref": "#/components/schemas/Job"}} |
| 67 | }, |
| 68 | } |
| 69 | }, |
| 70 | } |
| 71 | }, |
| 72 | "/knowledge-graph/{id}/entities": { |
| 73 | "get": { |
| 74 | "summary": "List entities in a knowledge graph", |
| 75 | "operationId": "listEntities", |
| 76 | "parameters": [ |
| 77 | { |
| 78 | "name": "id", |
| 79 | "in": "path", |
| 80 | "required": True, |
| 81 | "schema": {"type": "string"}, |
| 82 | }, |
| 83 | {"name": "type", "in": "query", "schema": {"type": "string"}}, |
| 84 | ], |
| 85 | "responses": { |
| 86 | "200": { |
| 87 | "description": "Entity list", |
| 88 | "content": { |
| 89 | "application/json": { |
| 90 | "schema": { |
| 91 | "type": "array", |
| 92 | "items": {"$ref": "#/components/schemas/Entity"}, |
| 93 | } |
| 94 | } |
| 95 | }, |
| 96 | } |
| 97 | }, |
| 98 | } |
| 99 | }, |
| 100 | "/knowledge-graph/{id}/relationships": { |
| 101 | "get": { |
| 102 | "summary": "List relationships in a knowledge graph", |
| 103 | "operationId": "listRelationships", |
| 104 | "parameters": [ |
| 105 | {"name": "id", "in": "path", "required": True, "schema": {"type": "string"}} |
| 106 | ], |
| 107 | "responses": { |
| 108 | "200": { |
| 109 | "description": "Relationship list", |
| 110 | "content": { |
| 111 | "application/json": { |
| 112 | "schema": { |
| 113 | "type": "array", |
| 114 | "items": {"$ref": "#/components/schemas/Relationship"}, |
| 115 | } |
| 116 | } |
| 117 | }, |
| 118 | } |
| 119 | }, |
| 120 | } |
| 121 | }, |
| 122 | "/knowledge-graph/{id}/query": { |
| 123 | "get": { |
| 124 | "summary": "Query the knowledge graph with natural language", |
| 125 | "operationId": "queryKnowledgeGraph", |
| 126 | "parameters": [ |
| 127 | { |
| 128 | "name": "id", |
| 129 | "in": "path", |
| 130 | "required": True, |
| 131 | "schema": {"type": "string"}, |
| 132 | }, |
| 133 | { |
| 134 | "name": "q", |
| 135 | "in": "query", |
| 136 | "required": True, |
| 137 | "schema": {"type": "string"}, |
| 138 | }, |
| 139 | ], |
| 140 | "responses": { |
| 141 | "200": { |
| 142 | "description": "Query results", |
| 143 | "content": {"application/json": {"schema": {"type": "object"}}}, |
| 144 | } |
| 145 | }, |
| 146 | } |
| 147 | }, |
| 148 | }, |
| 149 | "components": { |
| 150 | "schemas": { |
| 151 | "Job": { |
| 152 | "type": "object", |
| 153 | "properties": { |
| 154 | "id": {"type": "string"}, |
| 155 | "status": { |
| 156 | "type": "string", |
| 157 | "enum": ["pending", "processing", "completed", "failed"], |
| 158 | }, |
| 159 | "progress": {"type": "number", "format": "float"}, |
| 160 | "created_at": {"type": "string", "format": "date-time"}, |
| 161 | "completed_at": {"type": "string", "format": "date-time"}, |
| 162 | "result_url": {"type": "string", "format": "uri"}, |
| 163 | }, |
| 164 | }, |
| 165 | "Entity": { |
| 166 | "type": "object", |
| 167 | "properties": { |
| 168 | "name": {"type": "string"}, |
| 169 | "type": {"type": "string"}, |
| 170 | "descriptions": {"type": "array", "items": {"type": "string"}}, |
| 171 | }, |
| 172 | }, |
| 173 | "Relationship": { |
| 174 | "type": "object", |
| 175 | "properties": { |
| 176 | "source": {"type": "string"}, |
| 177 | "target": {"type": "string"}, |
| 178 | "type": {"type": "string"}, |
| 179 | }, |
| 180 | }, |
| 181 | } |
| 182 | }, |
| 183 | } |
+485
| --- a/video_processor/auth.py | ||
| +++ b/video_processor/auth.py | ||
| @@ -0,0 +1,485 @@ | ||
| 1 | +"""Unified OAuth and authentication strategy for PlanOpticon connectors. | |
| 2 | + | |
| 3 | +Provides a consistent auth pattern across all source connectors: | |
| 4 | +1. Saved token (auto-refresh if expired) | |
| 5 | +2. OAuth 2.0 (Authorization Code with PKCE, or Client Credentials) | |
| 6 | +3. API key fallback (environment variable) | |
| 7 | + | |
| 8 | +Usage in a connector: | |
| 9 | + | |
| 10 | + from video_processor.auth import OAuthManager, AuthConfig | |
| 11 | + | |
| 12 | + config = AuthConfig( | |
| 13 | + service="notion", | |
| 14 | + oauth_authorize_url="https://api.notion.com/v1/oauth/authorize", | |
| 15 | + oauth_token_url="https://api.notion.com/v1/oauth/token", | |
| 16 | + client_id_env="NOTION_CLIENT_ID", | |
| 17 | + client_secret_env="NOTION_CLIENT_SECRET", | |
| 18 | + api_key_env="NOTION_API_KEY", | |
| 19 | + scopes=["read_content"], | |
| 20 | + ) | |
| 21 | + manager = OAuthManager(config) | |
| 22 | +    token = manager.get_token()  # Returns the access token string, or None on failure | |
| 23 | +""" | |
| 24 | + | |
| 25 | +import base64 | |
| 26 | +import hashlib | |
| 27 | +import json | |
| 28 | +import logging | |
| 29 | +import os | |
| 30 | +import secrets | |
| 31 | +import time | |
| 32 | +import webbrowser | |
| 33 | +from dataclasses import dataclass, field | |
| 34 | +from pathlib import Path | |
| 35 | +from typing import Dict, List, Optional | |
| 36 | + | |
| 37 | +logger = logging.getLogger(__name__) | |
| 38 | + | |
| 39 | +TOKEN_DIR = Path.home() / ".planopticon" | |
| 40 | + | |
| 41 | + | |
| 42 | +@dataclass | |
| 43 | +class AuthConfig: | |
| 44 | + """Configuration for a service's authentication.""" | |
| 45 | + | |
| 46 | + service: str | |
| 47 | + | |
| 48 | + # OAuth endpoints (set both for OAuth support) | |
| 49 | + oauth_authorize_url: Optional[str] = None | |
| 50 | + oauth_token_url: Optional[str] = None | |
| 51 | + | |
| 52 | + # Client credentials (checked from env if not provided) | |
| 53 | + client_id: Optional[str] = None | |
| 54 | + client_secret: Optional[str] = None | |
| 55 | + client_id_env: Optional[str] = None | |
| 56 | + client_secret_env: Optional[str] = None | |
| 57 | + | |
| 58 | + # API key fallback | |
| 59 | + api_key_env: Optional[str] = None | |
| 60 | + | |
| 61 | + # OAuth scopes | |
| 62 | + scopes: List[str] = field(default_factory=list) | |
| 63 | + | |
| 64 | + # Redirect URI for auth code flow | |
| 65 | + redirect_uri: str = "urn:ietf:wg:oauth:2.0:oob" | |
| 66 | + | |
| 67 | + # Server-to-Server (client credentials grant) | |
| 68 | + account_id: Optional[str] = None | |
| 69 | + account_id_env: Optional[str] = None | |
| 70 | + | |
| 71 | + # Token storage | |
| 72 | + token_path: Optional[Path] = None | |
| 73 | + | |
| 74 | + @property | |
| 75 | + def resolved_client_id(self) -> Optional[str]: | |
| 76 | + return ( | |
| 77 | + self.client_id | |
| 78 | + or (os.environ.get(self.client_id_env, "") if self.client_id_env else None) | |
| 79 | + or None | |
| 80 | + ) | |
| 81 | + | |
| 82 | + @property | |
| 83 | + def resolved_client_secret(self) -> Optional[str]: | |
| 84 | + return ( | |
| 85 | + self.client_secret | |
| 86 | + or (os.environ.get(self.client_secret_env, "") if self.client_secret_env else None) | |
| 87 | + or None | |
| 88 | + ) | |
| 89 | + | |
| 90 | + @property | |
| 91 | + def resolved_api_key(self) -> Optional[str]: | |
| 92 | + if self.api_key_env: | |
| 93 | + val = os.environ.get(self.api_key_env, "") | |
| 94 | + return val if val else None | |
| 95 | + return None | |
| 96 | + | |
| 97 | + @property | |
| 98 | + def resolved_account_id(self) -> Optional[str]: | |
| 99 | + return ( | |
| 100 | + self.account_id | |
| 101 | + or (os.environ.get(self.account_id_env, "") if self.account_id_env else None) | |
| 102 | + or None | |
| 103 | + ) | |
| 104 | + | |
| 105 | + @property | |
| 106 | + def resolved_token_path(self) -> Path: | |
| 107 | + return self.token_path or TOKEN_DIR / f"{self.service}_token.json" | |
| 108 | + | |
| 109 | + @property | |
| 110 | + def supports_oauth(self) -> bool: | |
| 111 | + return bool(self.oauth_authorize_url and self.oauth_token_url) | |
| 112 | + | |
| 113 | + | |
| 114 | +@dataclass | |
| 115 | +class AuthResult: | |
| 116 | + """Result of an authentication attempt.""" | |
| 117 | + | |
| 118 | + success: bool | |
| 119 | + access_token: Optional[str] = None | |
| 120 | + method: Optional[str] = None # "saved_token", "oauth_pkce", "client_credentials", "api_key" | |
| 121 | + expires_at: Optional[float] = None | |
| 122 | + refresh_token: Optional[str] = None | |
| 123 | + error: Optional[str] = None | |
| 124 | + | |
| 125 | + | |
| 126 | +class OAuthManager: | |
| 127 | + """Manages OAuth and API key authentication for a service. | |
| 128 | + | |
| 129 | + Tries auth methods in order: | |
| 130 | + 1. Load saved token (refresh if expired) | |
| 131 | + 2. Client Credentials grant (if account_id is set) | |
| 132 | + 3. OAuth2 Authorization Code with PKCE (interactive) | |
| 133 | + 4. API key fallback | |
| 134 | + """ | |
| 135 | + | |
| 136 | + def __init__(self, config: AuthConfig): | |
| 137 | + self.config = config | |
| 138 | + self._token_data: Optional[Dict] = None | |
| 139 | + | |
| 140 | + def authenticate(self) -> AuthResult: | |
| 141 | + """Run the auth chain and return the result.""" | |
| 142 | + # 1. Saved token | |
| 143 | + result = self._try_saved_token() | |
| 144 | + if result.success: | |
| 145 | + return result | |
| 146 | + | |
| 147 | + # 2. Client Credentials (Server-to-Server) | |
| 148 | + if self.config.resolved_account_id and self.config.supports_oauth: | |
| 149 | + result = self._try_client_credentials() | |
| 150 | + if result.success: | |
| 151 | + return result | |
| 152 | + | |
| 153 | + # 3. OAuth PKCE (interactive) | |
| 154 | + if self.config.supports_oauth and self.config.resolved_client_id: | |
| 155 | + result = self._try_oauth_pkce() | |
| 156 | + if result.success: | |
| 157 | + return result | |
| 158 | + | |
| 159 | + # 4. API key fallback | |
| 160 | + api_key = self.config.resolved_api_key | |
| 161 | + if api_key: | |
| 162 | + return AuthResult( | |
| 163 | + success=True, | |
| 164 | + access_token=api_key, | |
| 165 | + method="api_key", | |
| 166 | + ) | |
| 167 | + | |
| 168 | + # Build a helpful error message | |
| 169 | + hints = [] | |
| 170 | + if self.config.supports_oauth and self.config.client_id_env: | |
| 171 | + hints.append(f"Set {self.config.client_id_env} for OAuth") | |
| 172 | + if self.config.client_secret_env: | |
| 173 | + hints.append(f"and {self.config.client_secret_env}") | |
| 174 | + if self.config.api_key_env: | |
| 175 | + hints.append(f"or set {self.config.api_key_env} for API key access") | |
| 176 | + hint_str = (" (" + " ".join(hints) + ")") if hints else "" | |
| 177 | + | |
| 178 | + return AuthResult( | |
| 179 | + success=False, | |
| 180 | + error=f"No auth method available for {self.config.service}.{hint_str}", | |
| 181 | + ) | |
| 182 | + | |
| 183 | + def get_token(self) -> Optional[str]: | |
| 184 | + """Convenience: authenticate and return just the token.""" | |
| 185 | + result = self.authenticate() | |
| 186 | + return result.access_token if result.success else None | |
| 187 | + | |
| 188 | + def _try_saved_token(self) -> AuthResult: | |
| 189 | + """Load and validate a saved token.""" | |
| 190 | + token_path = self.config.resolved_token_path | |
| 191 | + if not token_path.exists(): | |
| 192 | + return AuthResult(success=False) | |
| 193 | + | |
| 194 | + try: | |
| 195 | + data = json.loads(token_path.read_text()) | |
| 196 | + expires_at = data.get("expires_at", 0) | |
| 197 | + | |
| 198 | + if time.time() < expires_at: | |
| 199 | + self._token_data = data | |
| 200 | + return AuthResult( | |
| 201 | + success=True, | |
| 202 | + access_token=data["access_token"], | |
| 203 | + method="saved_token", | |
| 204 | + expires_at=expires_at, | |
| 205 | + ) | |
| 206 | + | |
| 207 | + # Expired — try refresh | |
| 208 | + if data.get("refresh_token"): | |
| 209 | + return self._refresh_token(data) | |
| 210 | + | |
| 211 | + return AuthResult(success=False) | |
| 212 | + except Exception as exc: | |
| 213 | + logger.debug("Failed to load saved token for %s: %s", self.config.service, exc) | |
| 214 | + return AuthResult(success=False) | |
| 215 | + | |
| 216 | + def _refresh_token(self, data: Dict) -> AuthResult: | |
| 217 | + """Refresh an expired OAuth token.""" | |
| 218 | + try: | |
| 219 | + import requests | |
| 220 | + except ImportError: | |
| 221 | + return AuthResult(success=False, error="requests not installed") | |
| 222 | + | |
| 223 | + client_id = data.get("client_id") or self.config.resolved_client_id | |
| 224 | + client_secret = data.get("client_secret") or self.config.resolved_client_secret | |
| 225 | + | |
| 226 | + if not client_id or not data.get("refresh_token"): | |
| 227 | + return AuthResult(success=False) | |
| 228 | + | |
| 229 | + try: | |
| 230 | + resp = requests.post( | |
| 231 | + self.config.oauth_token_url, | |
| 232 | + data={ | |
| 233 | + "grant_type": "refresh_token", | |
| 234 | + "refresh_token": data["refresh_token"], | |
| 235 | + }, | |
| 236 | + auth=(client_id, client_secret or ""), | |
| 237 | + timeout=30, | |
| 238 | + ) | |
| 239 | + resp.raise_for_status() | |
| 240 | + token_data = resp.json() | |
| 241 | + | |
| 242 | + new_data = { | |
| 243 | + "access_token": token_data["access_token"], | |
| 244 | + "refresh_token": token_data.get("refresh_token", data["refresh_token"]), | |
| 245 | + "expires_at": time.time() + token_data.get("expires_in", 3600) - 60, | |
| 246 | + "client_id": client_id, | |
| 247 | + "client_secret": client_secret or "", | |
| 248 | + } | |
| 249 | + self._save_token(new_data) | |
| 250 | + self._token_data = new_data | |
| 251 | + | |
| 252 | + logger.info("Refreshed OAuth token for %s", self.config.service) | |
| 253 | + return AuthResult( | |
| 254 | + success=True, | |
| 255 | + access_token=new_data["access_token"], | |
| 256 | + method="saved_token", | |
| 257 | + expires_at=new_data["expires_at"], | |
| 258 | + refresh_token=new_data["refresh_token"], | |
| 259 | + ) | |
| 260 | + except Exception as exc: | |
| 261 | + logger.debug("Token refresh failed for %s: %s", self.config.service, exc) | |
| 262 | + return AuthResult(success=False) | |
| 263 | + | |
| 264 | + def _try_client_credentials(self) -> AuthResult: | |
| 265 | + """Server-to-Server OAuth using client credentials grant.""" | |
| 266 | + try: | |
| 267 | + import requests | |
| 268 | + except ImportError: | |
| 269 | + return AuthResult(success=False, error="requests not installed") | |
| 270 | + | |
| 271 | + client_id = self.config.resolved_client_id | |
| 272 | + client_secret = self.config.resolved_client_secret | |
| 273 | + account_id = self.config.resolved_account_id | |
| 274 | + | |
| 275 | + if not client_id or not client_secret: | |
| 276 | + return AuthResult(success=False) | |
| 277 | + | |
| 278 | + try: | |
| 279 | + resp = requests.post( | |
| 280 | + self.config.oauth_token_url, | |
| 281 | + params={ | |
| 282 | + "grant_type": "account_credentials", | |
| 283 | + "account_id": account_id, | |
| 284 | + }, | |
| 285 | + auth=(client_id, client_secret), | |
| 286 | + timeout=30, | |
| 287 | + ) | |
| 288 | + resp.raise_for_status() | |
| 289 | + token_data = resp.json() | |
| 290 | + | |
| 291 | + data = { | |
| 292 | + "access_token": token_data["access_token"], | |
| 293 | + "expires_at": time.time() + token_data.get("expires_in", 3600) - 60, | |
| 294 | + } | |
| 295 | + self._save_token(data) | |
| 296 | + self._token_data = data | |
| 297 | + | |
| 298 | + logger.info("Authenticated %s via client credentials", self.config.service) | |
| 299 | + return AuthResult( | |
| 300 | + success=True, | |
| 301 | + access_token=data["access_token"], | |
| 302 | + method="client_credentials", | |
| 303 | + expires_at=data["expires_at"], | |
| 304 | + ) | |
| 305 | + except Exception as exc: | |
| 306 | + logger.debug("Client credentials failed for %s: %s", self.config.service, exc) | |
| 307 | + return AuthResult(success=False) | |
| 308 | + | |
| 309 | + def _try_oauth_pkce(self) -> AuthResult: | |
| 310 | + """Interactive OAuth2 Authorization Code flow with PKCE.""" | |
| 311 | + try: | |
| 312 | + import requests | |
| 313 | + except ImportError: | |
| 314 | + return AuthResult(success=False, error="requests not installed") | |
| 315 | + | |
| 316 | + client_id = self.config.resolved_client_id | |
| 317 | + if not client_id: | |
| 318 | + return AuthResult(success=False) | |
| 319 | + | |
| 320 | + # Generate PKCE verifier and challenge | |
| 321 | + code_verifier = secrets.token_urlsafe(64) | |
| 322 | + code_challenge = ( | |
| 323 | + base64.urlsafe_b64encode(hashlib.sha256(code_verifier.encode("ascii")).digest()) | |
| 324 | + .rstrip(b"=") | |
| 325 | + .decode("ascii") | |
| 326 | + ) | |
| 327 | + | |
| 328 | + # Build authorize URL | |
| 329 | + params = ( | |
| 330 | + f"?response_type=code" | |
| 331 | + f"&client_id={client_id}" | |
| 332 | + f"&redirect_uri={self.config.redirect_uri}" | |
| 333 | + f"&code_challenge={code_challenge}" | |
| 334 | + f"&code_challenge_method=S256" | |
| 335 | + ) | |
| 336 | + if self.config.scopes: | |
| 337 | + params += f"&scope={'+'.join(self.config.scopes)}" | |
| 338 | + | |
| 339 | + authorize_url = f"{self.config.oauth_authorize_url}{params}" | |
| 340 | + | |
| 341 | + print(f"\nOpen this URL to authorize PlanOpticon ({self.config.service}):") | |
| 342 | + print(f"{authorize_url}\n") | |
| 343 | + | |
| 344 | + try: | |
| 345 | + webbrowser.open(authorize_url) | |
| 346 | + except Exception: | |
| 347 | + pass | |
| 348 | + | |
| 349 | + try: | |
| 350 | + auth_code = input("Enter the authorization code: ").strip() | |
| 351 | + except (KeyboardInterrupt, EOFError): | |
| 352 | + return AuthResult(success=False, error="Auth cancelled by user") | |
| 353 | + | |
| 354 | + if not auth_code: | |
| 355 | + return AuthResult(success=False, error="No auth code provided") | |
| 356 | + | |
| 357 | + # Exchange code for tokens | |
| 358 | + client_secret = self.config.resolved_client_secret | |
| 359 | + try: | |
| 360 | + resp = requests.post( | |
| 361 | + self.config.oauth_token_url, | |
| 362 | + data={ | |
| 363 | + "grant_type": "authorization_code", | |
| 364 | + "code": auth_code, | |
| 365 | + "redirect_uri": self.config.redirect_uri, | |
| 366 | + "code_verifier": code_verifier, | |
| 367 | + }, | |
| 368 | + auth=(client_id, client_secret or ""), | |
| 369 | + timeout=30, | |
| 370 | + ) | |
| 371 | + resp.raise_for_status() | |
| 372 | + token_data = resp.json() | |
| 373 | + | |
| 374 | + data = { | |
| 375 | + "access_token": token_data["access_token"], | |
| 376 | + "refresh_token": token_data.get("refresh_token"), | |
| 377 | + "expires_at": time.time() + token_data.get("expires_in", 3600) - 60, | |
| 378 | + "client_id": client_id, | |
| 379 | + "client_secret": client_secret or "", | |
| 380 | + } | |
| 381 | + self._save_token(data) | |
| 382 | + self._token_data = data | |
| 383 | + | |
| 384 | + logger.info("Authenticated %s via OAuth PKCE", self.config.service) | |
| 385 | + return AuthResult( | |
| 386 | + success=True, | |
| 387 | + access_token=data["access_token"], | |
| 388 | + method="oauth_pkce", | |
| 389 | + expires_at=data["expires_at"], | |
| 390 | + refresh_token=data.get("refresh_token"), | |
| 391 | + ) | |
| 392 | + except Exception as exc: | |
| 393 | + logger.debug("OAuth PKCE failed for %s: %s", self.config.service, exc) | |
| 394 | + return AuthResult(success=False, error=str(exc)) | |
| 395 | + | |
| 396 | + def _save_token(self, data: Dict) -> None: | |
| 397 | + """Persist token data to disk.""" | |
| 398 | + token_path = self.config.resolved_token_path | |
| 399 | + token_path.parent.mkdir(parents=True, exist_ok=True) | |
| 400 | + token_path.write_text(json.dumps(data)) | |
| 401 | + logger.info("Saved %s token to %s", self.config.service, token_path) | |
| 402 | + | |
| 403 | + def clear_token(self) -> None: | |
| 404 | + """Remove saved token (logout).""" | |
| 405 | + token_path = self.config.resolved_token_path | |
| 406 | + if token_path.exists(): | |
| 407 | + token_path.unlink() | |
| 408 | + logger.info("Cleared %s token", self.config.service) | |
| 409 | + | |
| 410 | + | |
| 411 | +# ----------------------------------------------------------------------- | |
| 412 | +# Pre-built configs for known services | |
| 413 | +# ----------------------------------------------------------------------- | |
| 414 | + | |
| 415 | +KNOWN_CONFIGS: Dict[str, AuthConfig] = { | |
| 416 | + "zoom": AuthConfig( | |
| 417 | + service="zoom", | |
| 418 | + oauth_authorize_url="https://zoom.us/oauth/authorize", | |
| 419 | + oauth_token_url="https://zoom.us/oauth/token", | |
| 420 | + client_id_env="ZOOM_CLIENT_ID", | |
| 421 | + client_secret_env="ZOOM_CLIENT_SECRET", | |
| 422 | + account_id_env="ZOOM_ACCOUNT_ID", | |
| 423 | + ), | |
| 424 | + "notion": AuthConfig( | |
| 425 | + service="notion", | |
| 426 | + oauth_authorize_url="https://api.notion.com/v1/oauth/authorize", | |
| 427 | + oauth_token_url="https://api.notion.com/v1/oauth/token", | |
| 428 | + client_id_env="NOTION_CLIENT_ID", | |
| 429 | + client_secret_env="NOTION_CLIENT_SECRET", | |
| 430 | + api_key_env="NOTION_API_KEY", | |
| 431 | + ), | |
| 432 | + "dropbox": AuthConfig( | |
| 433 | + service="dropbox", | |
| 434 | + oauth_authorize_url="https://www.dropbox.com/oauth2/authorize", | |
| 435 | + oauth_token_url="https://api.dropboxapi.com/oauth2/token", | |
| 436 | + client_id_env="DROPBOX_APP_KEY", | |
| 437 | + client_secret_env="DROPBOX_APP_SECRET", | |
| 438 | + api_key_env="DROPBOX_ACCESS_TOKEN", | |
| 439 | + ), | |
| 440 | + "github": AuthConfig( | |
| 441 | + service="github", | |
| 442 | + oauth_authorize_url="https://github.com/login/oauth/authorize", | |
| 443 | + oauth_token_url="https://github.com/login/oauth/access_token", | |
| 444 | + client_id_env="GITHUB_CLIENT_ID", | |
| 445 | + client_secret_env="GITHUB_CLIENT_SECRET", | |
| 446 | + api_key_env="GITHUB_TOKEN", | |
| 447 | + scopes=["repo", "read:org"], | |
| 448 | + ), | |
| 449 | + "google": AuthConfig( | |
| 450 | + service="google", | |
| 451 | + oauth_authorize_url="https://accounts.google.com/o/oauth2/v2/auth", | |
| 452 | + oauth_token_url="https://oauth2.googleapis.com/token", | |
| 453 | + client_id_env="GOOGLE_CLIENT_ID", | |
| 454 | + client_secret_env="GOOGLE_CLIENT_SECRET", | |
| 455 | + api_key_env="GOOGLE_API_KEY", | |
| 456 | + scopes=[ | |
| 457 | + "https://www.googleapis.com/auth/drive.readonly", | |
| 458 | + "https://www.googleapis.com/auth/documents.readonly", | |
| 459 | + ], | |
| 460 | + ), | |
| 461 | + "microsoft": AuthConfig( | |
| 462 | + service="microsoft", | |
| 463 | + oauth_authorize_url=("https://login.microsoftonline.com/common/oauth2/v2.0/authorize"), | |
| 464 | + oauth_token_url=("https://login.microsoftonline.com/common/oauth2/v2.0/token"), | |
| 465 | + client_id_env="MICROSOFT_CLIENT_ID", | |
| 466 | + client_secret_env="MICROSOFT_CLIENT_SECRET", | |
| 467 | + scopes=[ | |
| 468 | + "https://graph.microsoft.com/OnlineMeetings.Read", | |
| 469 | + "https://graph.microsoft.com/Files.Read", | |
| 470 | + ], | |
| 471 | + ), | |
| 472 | +} | |
| 473 | + | |
| 474 | + | |
| 475 | +def get_auth_config(service: str) -> Optional[AuthConfig]: | |
| 476 | + """Get a pre-built AuthConfig for a known service.""" | |
| 477 | + return KNOWN_CONFIGS.get(service) | |
| 478 | + | |
| 479 | + | |
| 480 | +def get_auth_manager(service: str) -> Optional[OAuthManager]: | |
| 481 | + """Get an OAuthManager for a known service.""" | |
| 482 | + config = get_auth_config(service) | |
| 483 | + if config: | |
| 484 | + return OAuthManager(config) | |
| 485 | + return None |
| --- a/video_processor/auth.py | |
| +++ b/video_processor/auth.py | |
| @@ -0,0 +1,485 @@ | |
| --- a/video_processor/auth.py | |
| +++ b/video_processor/auth.py | |
| @@ -0,0 +1,485 @@ | |
| 1 | """Unified OAuth and authentication strategy for PlanOpticon connectors. |
| 2 | |
| 3 | Provides a consistent auth pattern across all source connectors: |
| 4 | 1. Saved token (auto-refresh if expired) |
| 5 | 2. OAuth 2.0 (Authorization Code with PKCE, or Client Credentials) |
| 6 | 3. API key fallback (environment variable) |
| 7 | |
| 8 | Usage in a connector: |
| 9 | |
| 10 | from video_processor.auth import OAuthManager, AuthConfig |
| 11 | |
| 12 | config = AuthConfig( |
| 13 | service="notion", |
| 14 | oauth_authorize_url="https://api.notion.com/v1/oauth/authorize", |
| 15 | oauth_token_url="https://api.notion.com/v1/oauth/token", |
| 16 | client_id_env="NOTION_CLIENT_ID", |
| 17 | client_secret_env="NOTION_CLIENT_SECRET", |
| 18 | api_key_env="NOTION_API_KEY", |
| 19 | scopes=["read_content"], |
| 20 | ) |
| 21 | manager = OAuthManager(config) |
| 22 |     token = manager.get_token()  # Returns the access token string, or None on failure |
| 23 | """ |
| 24 | |
| 25 | import base64 |
| 26 | import hashlib |
| 27 | import json |
| 28 | import logging |
| 29 | import os |
| 30 | import secrets |
| 31 | import time |
| 32 | import webbrowser |
| 33 | from dataclasses import dataclass, field |
| 34 | from pathlib import Path |
| 35 | from typing import Dict, List, Optional |
| 36 | |
| 37 | logger = logging.getLogger(__name__) |
| 38 | |
| 39 | TOKEN_DIR = Path.home() / ".planopticon" |
| 40 | |
| 41 | |
| 42 | @dataclass |
| 43 | class AuthConfig: |
| 44 | """Configuration for a service's authentication.""" |
| 45 | |
| 46 | service: str |
| 47 | |
| 48 | # OAuth endpoints (set both for OAuth support) |
| 49 | oauth_authorize_url: Optional[str] = None |
| 50 | oauth_token_url: Optional[str] = None |
| 51 | |
| 52 | # Client credentials (checked from env if not provided) |
| 53 | client_id: Optional[str] = None |
| 54 | client_secret: Optional[str] = None |
| 55 | client_id_env: Optional[str] = None |
| 56 | client_secret_env: Optional[str] = None |
| 57 | |
| 58 | # API key fallback |
| 59 | api_key_env: Optional[str] = None |
| 60 | |
| 61 | # OAuth scopes |
| 62 | scopes: List[str] = field(default_factory=list) |
| 63 | |
| 64 | # Redirect URI for auth code flow |
| 65 | redirect_uri: str = "urn:ietf:wg:oauth:2.0:oob" |
| 66 | |
| 67 | # Server-to-Server (client credentials grant) |
| 68 | account_id: Optional[str] = None |
| 69 | account_id_env: Optional[str] = None |
| 70 | |
| 71 | # Token storage |
| 72 | token_path: Optional[Path] = None |
| 73 | |
| 74 | @property |
| 75 | def resolved_client_id(self) -> Optional[str]: |
| 76 | return ( |
| 77 | self.client_id |
| 78 | or (os.environ.get(self.client_id_env, "") if self.client_id_env else None) |
| 79 | or None |
| 80 | ) |
| 81 | |
| 82 | @property |
| 83 | def resolved_client_secret(self) -> Optional[str]: |
| 84 | return ( |
| 85 | self.client_secret |
| 86 | or (os.environ.get(self.client_secret_env, "") if self.client_secret_env else None) |
| 87 | or None |
| 88 | ) |
| 89 | |
| 90 | @property |
| 91 | def resolved_api_key(self) -> Optional[str]: |
| 92 | if self.api_key_env: |
| 93 | val = os.environ.get(self.api_key_env, "") |
| 94 | return val if val else None |
| 95 | return None |
| 96 | |
| 97 | @property |
| 98 | def resolved_account_id(self) -> Optional[str]: |
| 99 | return ( |
| 100 | self.account_id |
| 101 | or (os.environ.get(self.account_id_env, "") if self.account_id_env else None) |
| 102 | or None |
| 103 | ) |
| 104 | |
| 105 | @property |
| 106 | def resolved_token_path(self) -> Path: |
| 107 | return self.token_path or TOKEN_DIR / f"{self.service}_token.json" |
| 108 | |
| 109 | @property |
| 110 | def supports_oauth(self) -> bool: |
| 111 | return bool(self.oauth_authorize_url and self.oauth_token_url) |
| 112 | |
| 113 | |
| 114 | @dataclass |
| 115 | class AuthResult: |
| 116 | """Result of an authentication attempt.""" |
| 117 | |
| 118 | success: bool |
| 119 | access_token: Optional[str] = None |
| 120 | method: Optional[str] = None # "saved_token", "oauth_pkce", "client_credentials", "api_key" |
| 121 | expires_at: Optional[float] = None |
| 122 | refresh_token: Optional[str] = None |
| 123 | error: Optional[str] = None |
| 124 | |
| 125 | |
class OAuthManager:
    """Manages OAuth and API key authentication for a service.

    Tries auth methods in order:
    1. Load saved token (refresh if expired)
    2. Client Credentials grant (if account_id is set)
    3. OAuth2 Authorization Code with PKCE (interactive)
    4. API key fallback
    """

    def __init__(self, config: AuthConfig):
        self.config = config
        # Most recently loaded/obtained token payload (dict form), if any.
        self._token_data: Optional[Dict] = None

    def authenticate(self) -> AuthResult:
        """Run the auth chain and return the first successful result.

        Returns:
            AuthResult: on success, carries the token and the method used;
            on failure, carries an error message hinting at which
            environment variables to set.
        """
        # 1. Saved token (transparently refreshed if expired)
        result = self._try_saved_token()
        if result.success:
            return result

        # 2. Client Credentials (Server-to-Server; needs an account id)
        if self.config.resolved_account_id and self.config.supports_oauth:
            result = self._try_client_credentials()
            if result.success:
                return result

        # 3. OAuth PKCE (interactive: opens a browser, prompts for a code)
        if self.config.supports_oauth and self.config.resolved_client_id:
            result = self._try_oauth_pkce()
            if result.success:
                return result

        # 4. API key fallback (static credential; no expiry tracking)
        api_key = self.config.resolved_api_key
        if api_key:
            return AuthResult(
                success=True,
                access_token=api_key,
                method="api_key",
            )

        # Nothing worked — build a helpful error message.
        hints = []
        if self.config.supports_oauth and self.config.client_id_env:
            hints.append(f"Set {self.config.client_id_env} for OAuth")
            # The secret hint only makes sense alongside the client-id hint,
            # so it is nested under it.
            if self.config.client_secret_env:
                hints.append(f"and {self.config.client_secret_env}")
        if self.config.api_key_env:
            hints.append(f"or set {self.config.api_key_env} for API key access")
        hint_str = (" (" + " ".join(hints) + ")") if hints else ""

        return AuthResult(
            success=False,
            error=f"No auth method available for {self.config.service}.{hint_str}",
        )

    def get_token(self) -> Optional[str]:
        """Convenience: authenticate and return just the access token (or None)."""
        result = self.authenticate()
        return result.access_token if result.success else None

    def _try_saved_token(self) -> AuthResult:
        """Load a previously saved token, refreshing it if it has expired."""
        token_path = self.config.resolved_token_path
        if not token_path.exists():
            return AuthResult(success=False)

        try:
            data = json.loads(token_path.read_text())
            # A missing expires_at is treated as already expired.
            expires_at = data.get("expires_at", 0)

            if time.time() < expires_at:
                self._token_data = data
                return AuthResult(
                    success=True,
                    access_token=data["access_token"],
                    method="saved_token",
                    expires_at=expires_at,
                )

            # Expired — try refresh
            if data.get("refresh_token"):
                return self._refresh_token(data)

            return AuthResult(success=False)
        except Exception as exc:
            # A corrupt/unreadable token file is non-fatal: log and let the
            # rest of the auth chain run.
            logger.debug("Failed to load saved token for %s: %s", self.config.service, exc)
            return AuthResult(success=False)

    def _refresh_token(self, data: Dict) -> AuthResult:
        """Refresh an expired OAuth token using its refresh_token.

        Args:
            data: previously saved token payload (must contain
                "refresh_token"; may carry the client_id/secret it was
                issued with).
        """
        try:
            import requests
        except ImportError:
            return AuthResult(success=False, error="requests not installed")

        # Prefer the credentials the token was issued with; fall back to env.
        client_id = data.get("client_id") or self.config.resolved_client_id
        client_secret = data.get("client_secret") or self.config.resolved_client_secret

        if not client_id or not data.get("refresh_token"):
            return AuthResult(success=False)

        try:
            resp = requests.post(
                self.config.oauth_token_url,
                data={
                    "grant_type": "refresh_token",
                    "refresh_token": data["refresh_token"],
                },
                auth=(client_id, client_secret or ""),
                timeout=30,
            )
            resp.raise_for_status()
            token_data = resp.json()

            new_data = {
                "access_token": token_data["access_token"],
                # Some providers rotate the refresh token; keep the old one
                # if the response omits it.
                "refresh_token": token_data.get("refresh_token", data["refresh_token"]),
                # 60s safety margin so we refresh slightly before true expiry.
                "expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
                "client_id": client_id,
                "client_secret": client_secret or "",
            }
            self._save_token(new_data)
            self._token_data = new_data

            logger.info("Refreshed OAuth token for %s", self.config.service)
            return AuthResult(
                success=True,
                access_token=new_data["access_token"],
                method="saved_token",
                expires_at=new_data["expires_at"],
                refresh_token=new_data["refresh_token"],
            )
        except Exception as exc:
            logger.debug("Token refresh failed for %s: %s", self.config.service, exc)
            return AuthResult(success=False)

    def _try_client_credentials(self) -> AuthResult:
        """Server-to-Server OAuth using the account_credentials grant.

        NOTE(review): the grant parameters are sent as query params
        (``params=``), matching Zoom's S2S OAuth convention — confirm for any
        newly added service that uses this path.
        """
        try:
            import requests
        except ImportError:
            return AuthResult(success=False, error="requests not installed")

        client_id = self.config.resolved_client_id
        client_secret = self.config.resolved_client_secret
        account_id = self.config.resolved_account_id

        if not client_id or not client_secret:
            return AuthResult(success=False)

        try:
            resp = requests.post(
                self.config.oauth_token_url,
                params={
                    "grant_type": "account_credentials",
                    "account_id": account_id,
                },
                auth=(client_id, client_secret),
                timeout=30,
            )
            resp.raise_for_status()
            token_data = resp.json()

            data = {
                "access_token": token_data["access_token"],
                # 60s safety margin before the reported expiry.
                "expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
            }
            self._save_token(data)
            self._token_data = data

            logger.info("Authenticated %s via client credentials", self.config.service)
            return AuthResult(
                success=True,
                access_token=data["access_token"],
                method="client_credentials",
                expires_at=data["expires_at"],
            )
        except Exception as exc:
            logger.debug("Client credentials failed for %s: %s", self.config.service, exc)
            return AuthResult(success=False)

    def _try_oauth_pkce(self) -> AuthResult:
        """Interactive OAuth2 Authorization Code flow with PKCE (RFC 7636).

        Opens the provider's consent page in a browser and prompts the user
        to paste the authorization code back in, then exchanges it for tokens.
        """
        try:
            import requests
        except ImportError:
            return AuthResult(success=False, error="requests not installed")

        from urllib.parse import urlencode

        client_id = self.config.resolved_client_id
        if not client_id:
            return AuthResult(success=False)

        # Generate PKCE verifier and S256 challenge (base64url, '=' padding
        # stripped per RFC 7636).
        code_verifier = secrets.token_urlsafe(64)
        code_challenge = (
            base64.urlsafe_b64encode(hashlib.sha256(code_verifier.encode("ascii")).digest())
            .rstrip(b"=")
            .decode("ascii")
        )

        # Build the authorize URL with urlencode so client_id, redirect_uri
        # and the challenge are properly percent-encoded (raw f-string
        # concatenation broke on any special characters). Spaces in the
        # scope list encode as '+', the standard query-string form.
        query = {
            "response_type": "code",
            "client_id": client_id,
            "redirect_uri": self.config.redirect_uri,
            "code_challenge": code_challenge,
            "code_challenge_method": "S256",
        }
        if self.config.scopes:
            query["scope"] = " ".join(self.config.scopes)

        authorize_url = f"{self.config.oauth_authorize_url}?{urlencode(query)}"

        print(f"\nOpen this URL to authorize PlanOpticon ({self.config.service}):")
        print(f"{authorize_url}\n")

        # Best-effort browser launch; the URL is printed above regardless.
        try:
            webbrowser.open(authorize_url)
        except Exception:
            pass

        try:
            auth_code = input("Enter the authorization code: ").strip()
        except (KeyboardInterrupt, EOFError):
            return AuthResult(success=False, error="Auth cancelled by user")

        if not auth_code:
            return AuthResult(success=False, error="No auth code provided")

        # Exchange code for tokens
        client_secret = self.config.resolved_client_secret
        try:
            resp = requests.post(
                self.config.oauth_token_url,
                data={
                    "grant_type": "authorization_code",
                    "code": auth_code,
                    "redirect_uri": self.config.redirect_uri,
                    "code_verifier": code_verifier,
                },
                auth=(client_id, client_secret or ""),
                timeout=30,
            )
            resp.raise_for_status()
            token_data = resp.json()

            data = {
                "access_token": token_data["access_token"],
                "refresh_token": token_data.get("refresh_token"),
                # 60s safety margin before the reported expiry.
                "expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
                "client_id": client_id,
                "client_secret": client_secret or "",
            }
            self._save_token(data)
            self._token_data = data

            logger.info("Authenticated %s via OAuth PKCE", self.config.service)
            return AuthResult(
                success=True,
                access_token=data["access_token"],
                method="oauth_pkce",
                expires_at=data["expires_at"],
                refresh_token=data.get("refresh_token"),
            )
        except Exception as exc:
            logger.debug("OAuth PKCE failed for %s: %s", self.config.service, exc)
            return AuthResult(success=False, error=str(exc))

    def _save_token(self, data: Dict) -> None:
        """Persist token data to disk with owner-only permissions."""
        token_path = self.config.resolved_token_path
        token_path.parent.mkdir(parents=True, exist_ok=True)
        token_path.write_text(json.dumps(data))
        # Tokens (and possibly client secrets) are credentials: restrict the
        # file to the owner. Best-effort on platforms without POSIX modes.
        try:
            token_path.chmod(0o600)
        except OSError:
            pass
        logger.info("Saved %s token to %s", self.config.service, token_path)

    def clear_token(self) -> None:
        """Remove saved token (logout)."""
        token_path = self.config.resolved_token_path
        if token_path.exists():
            token_path.unlink()
            logger.info("Cleared %s token", self.config.service)
| 409 | |
| 410 | |
# -----------------------------------------------------------------------
# Pre-built configs for known services
# -----------------------------------------------------------------------

KNOWN_CONFIGS: Dict[str, AuthConfig] = {
    # Zoom: has an account_id env, so the Server-to-Server
    # (client-credentials) path is available; no API-key fallback.
    "zoom": AuthConfig(
        service="zoom",
        oauth_authorize_url="https://zoom.us/oauth/authorize",
        oauth_token_url="https://zoom.us/oauth/token",
        client_id_env="ZOOM_CLIENT_ID",
        client_secret_env="ZOOM_CLIENT_SECRET",
        account_id_env="ZOOM_ACCOUNT_ID",
    ),
    # Notion: OAuth with an internal-integration API key fallback.
    "notion": AuthConfig(
        service="notion",
        oauth_authorize_url="https://api.notion.com/v1/oauth/authorize",
        oauth_token_url="https://api.notion.com/v1/oauth/token",
        client_id_env="NOTION_CLIENT_ID",
        client_secret_env="NOTION_CLIENT_SECRET",
        api_key_env="NOTION_API_KEY",
    ),
    # Dropbox: "app key/secret" are its names for client id/secret.
    "dropbox": AuthConfig(
        service="dropbox",
        oauth_authorize_url="https://www.dropbox.com/oauth2/authorize",
        oauth_token_url="https://api.dropboxapi.com/oauth2/token",
        client_id_env="DROPBOX_APP_KEY",
        client_secret_env="DROPBOX_APP_SECRET",
        api_key_env="DROPBOX_ACCESS_TOKEN",
    ),
    # GitHub: PAT fallback via GITHUB_TOKEN.
    # NOTE(review): GitHub's token endpoint returns form-encoded data unless
    # an "Accept: application/json" header is sent — confirm the token
    # exchange (which calls resp.json()) handles this service.
    "github": AuthConfig(
        service="github",
        oauth_authorize_url="https://github.com/login/oauth/authorize",
        oauth_token_url="https://github.com/login/oauth/access_token",
        client_id_env="GITHUB_CLIENT_ID",
        client_secret_env="GITHUB_CLIENT_SECRET",
        api_key_env="GITHUB_TOKEN",
        scopes=["repo", "read:org"],
    ),
    # Google: read-only Drive/Docs scopes.
    "google": AuthConfig(
        service="google",
        oauth_authorize_url="https://accounts.google.com/o/oauth2/v2/auth",
        oauth_token_url="https://oauth2.googleapis.com/token",
        client_id_env="GOOGLE_CLIENT_ID",
        client_secret_env="GOOGLE_CLIENT_SECRET",
        api_key_env="GOOGLE_API_KEY",
        scopes=[
            "https://www.googleapis.com/auth/drive.readonly",
            "https://www.googleapis.com/auth/documents.readonly",
        ],
    ),
    # Microsoft (Graph): OAuth only — no API-key fallback configured.
    "microsoft": AuthConfig(
        service="microsoft",
        oauth_authorize_url=("https://login.microsoftonline.com/common/oauth2/v2.0/authorize"),
        oauth_token_url=("https://login.microsoftonline.com/common/oauth2/v2.0/token"),
        client_id_env="MICROSOFT_CLIENT_ID",
        client_secret_env="MICROSOFT_CLIENT_SECRET",
        scopes=[
            "https://graph.microsoft.com/OnlineMeetings.Read",
            "https://graph.microsoft.com/Files.Read",
        ],
    ),
}
| 473 | |
| 474 | |
def get_auth_config(service: str) -> Optional[AuthConfig]:
    """Look up the pre-built AuthConfig for *service*; None if unknown."""
    try:
        return KNOWN_CONFIGS[service]
    except KeyError:
        return None
| 478 | |
| 479 | |
def get_auth_manager(service: str) -> Optional[OAuthManager]:
    """Build an OAuthManager for a known service; None if unrecognized."""
    config = get_auth_config(service)
    return OAuthManager(config) if config else None
+1480
-47
| --- video_processor/cli/commands.py | ||
| +++ video_processor/cli/commands.py | ||
| @@ -34,19 +34,38 @@ | ||
| 34 | 34 | root_logger.addHandler(console_handler) |
| 35 | 35 | |
| 36 | 36 | |
@click.group(invoke_without_command=True)
@click.option("--verbose", "-v", is_flag=True, help="Enable verbose output")
@click.option(
    "--chat",
    "-C",
    is_flag=True,
    help="Launch interactive companion REPL",
)
@click.option(
    "--interactive",
    "-I",
    "interactive_flag",
    is_flag=True,
    help="Launch interactive companion REPL",
)
@click.version_option("0.4.0", prog_name="PlanOpticon")
@click.pass_context
def cli(ctx, verbose, chat, interactive_flag):
    """PlanOpticon - Comprehensive Video Analysis & Knowledge Extraction Tool."""
    ctx.ensure_object(dict)
    ctx.obj["verbose"] = verbose
    setup_logging(verbose)

    # Subcommands take precedence; the REPL flags and the fallback menu only
    # apply to a bare `planopticon` invocation.
    if ctx.invoked_subcommand is not None:
        return

    if chat or interactive_flag:
        from video_processor.cli.companion import CompanionREPL

        CompanionREPL().run()
        ctx.exit(0)

    _interactive_menu(ctx)
| 49 | 68 | |
| 50 | 69 | |
| 51 | 70 | @cli.command() |
| 52 | 71 | @click.option( |
| @@ -73,16 +92,47 @@ | ||
| 73 | 92 | ) |
| 74 | 93 | @click.option("--title", type=str, help="Title for the analysis report") |
| 75 | 94 | @click.option( |
| 76 | 95 | "--provider", |
| 77 | 96 | "-p", |
| 78 | - type=click.Choice(["auto", "openai", "anthropic", "gemini", "ollama"]), | |
| 97 | + type=click.Choice( | |
| 98 | + [ | |
| 99 | + "auto", | |
| 100 | + "openai", | |
| 101 | + "anthropic", | |
| 102 | + "gemini", | |
| 103 | + "ollama", | |
| 104 | + "azure", | |
| 105 | + "together", | |
| 106 | + "fireworks", | |
| 107 | + "cerebras", | |
| 108 | + "xai", | |
| 109 | + ] | |
| 110 | + ), | |
| 79 | 111 | default="auto", |
| 80 | 112 | help="API provider", |
| 81 | 113 | ) |
| 82 | 114 | @click.option("--vision-model", type=str, default=None, help="Override model for vision tasks") |
| 83 | 115 | @click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks") |
| 116 | +@click.option( | |
| 117 | + "--output-format", | |
| 118 | + type=click.Choice(["default", "json"]), | |
| 119 | + default="default", | |
| 120 | + help="Output format: default (files + summary) or json (structured JSON to stdout)", | |
| 121 | +) | |
| 122 | +@click.option( | |
| 123 | + "--templates-dir", | |
| 124 | + type=click.Path(exists=True), | |
| 125 | + default=None, | |
| 126 | + help="Directory with custom prompt template .txt files", | |
| 127 | +) | |
| 128 | +@click.option( | |
| 129 | + "--speakers", | |
| 130 | + type=str, | |
| 131 | + default=None, | |
| 132 | + help='Comma-separated speaker names for diarization hints (e.g., "Alice,Bob,Carol")', | |
| 133 | +) | |
| 84 | 134 | @click.pass_context |
| 85 | 135 | def analyze( |
| 86 | 136 | ctx, |
| 87 | 137 | input, |
| 88 | 138 | output, |
| @@ -94,26 +144,35 @@ | ||
| 94 | 144 | periodic_capture, |
| 95 | 145 | title, |
| 96 | 146 | provider, |
| 97 | 147 | vision_model, |
| 98 | 148 | chat_model, |
| 149 | + output_format, | |
| 150 | + templates_dir, | |
| 151 | + speakers, | |
| 99 | 152 | ): |
| 100 | 153 | """Analyze a single video and extract structured knowledge.""" |
| 101 | 154 | from video_processor.pipeline import process_single_video |
| 102 | 155 | from video_processor.providers.manager import ProviderManager |
| 103 | 156 | |
| 104 | 157 | focus_areas = [a.strip().lower() for a in focus.split(",")] if focus else [] |
| 158 | + speaker_hints = [s.strip() for s in speakers.split(",")] if speakers else None | |
| 105 | 159 | prov = None if provider == "auto" else provider |
| 106 | 160 | |
| 107 | 161 | pm = ProviderManager( |
| 108 | 162 | vision_model=vision_model, |
| 109 | 163 | chat_model=chat_model, |
| 110 | 164 | provider=prov, |
| 111 | 165 | ) |
| 112 | 166 | |
| 167 | + if templates_dir: | |
| 168 | + from video_processor.utils.prompt_templates import PromptTemplate | |
| 169 | + | |
| 170 | + pm.prompt_templates = PromptTemplate(templates_dir=templates_dir) | |
| 171 | + | |
| 113 | 172 | try: |
| 114 | - process_single_video( | |
| 173 | + manifest = process_single_video( | |
| 115 | 174 | input_path=input, |
| 116 | 175 | output_dir=output, |
| 117 | 176 | provider_manager=pm, |
| 118 | 177 | depth=depth, |
| 119 | 178 | focus_areas=focus_areas, |
| @@ -120,16 +179,23 @@ | ||
| 120 | 179 | sampling_rate=sampling_rate, |
| 121 | 180 | change_threshold=change_threshold, |
| 122 | 181 | periodic_capture_seconds=periodic_capture, |
| 123 | 182 | use_gpu=use_gpu, |
| 124 | 183 | title=title, |
| 184 | + speaker_hints=speaker_hints, | |
| 125 | 185 | ) |
| 126 | - click.echo(pm.usage.format_summary()) | |
| 127 | - click.echo(f"\n Results: {output}/manifest.json") | |
| 186 | + if output_format == "json": | |
| 187 | + click.echo(json.dumps(manifest.model_dump(), indent=2, default=str)) | |
| 188 | + else: | |
| 189 | + click.echo(pm.usage.format_summary()) | |
| 190 | + click.echo(f"\n Results: {output}/manifest.json") | |
| 128 | 191 | except Exception as e: |
| 129 | 192 | logging.error(f"Error: {e}") |
| 130 | - click.echo(pm.usage.format_summary()) | |
| 193 | + if output_format == "json": | |
| 194 | + click.echo(json.dumps({"error": str(e)})) | |
| 195 | + else: | |
| 196 | + click.echo(pm.usage.format_summary()) | |
| 131 | 197 | if ctx.obj["verbose"]: |
| 132 | 198 | import traceback |
| 133 | 199 | |
| 134 | 200 | traceback.print_exc() |
| 135 | 201 | sys.exit(1) |
| @@ -154,11 +220,24 @@ | ||
| 154 | 220 | ) |
| 155 | 221 | @click.option("--title", type=str, default="Batch Processing Results", help="Batch title") |
| 156 | 222 | @click.option( |
| 157 | 223 | "--provider", |
| 158 | 224 | "-p", |
| 159 | - type=click.Choice(["auto", "openai", "anthropic", "gemini", "ollama"]), | |
| 225 | + type=click.Choice( | |
| 226 | + [ | |
| 227 | + "auto", | |
| 228 | + "openai", | |
| 229 | + "anthropic", | |
| 230 | + "gemini", | |
| 231 | + "ollama", | |
| 232 | + "azure", | |
| 233 | + "together", | |
| 234 | + "fireworks", | |
| 235 | + "cerebras", | |
| 236 | + "xai", | |
| 237 | + ] | |
| 238 | + ), | |
| 160 | 239 | default="auto", |
| 161 | 240 | help="API provider", |
| 162 | 241 | ) |
| 163 | 242 | @click.option("--vision-model", type=str, default=None, help="Override model for vision tasks") |
| 164 | 243 | @click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks") |
| @@ -282,14 +361,18 @@ | ||
| 282 | 361 | entry.action_items_count = len(manifest.action_items) |
| 283 | 362 | entry.key_points_count = len(manifest.key_points) |
| 284 | 363 | entry.duration_seconds = manifest.video.duration_seconds |
| 285 | 364 | manifests.append(manifest) |
| 286 | 365 | |
| 287 | - # Merge knowledge graph | |
| 288 | - kg_path = video_output / "results" / "knowledge_graph.json" | |
| 289 | - if kg_path.exists(): | |
| 290 | - kg_data = json.loads(kg_path.read_text()) | |
| 366 | + # Merge knowledge graph (prefer .db, fall back to .json) | |
| 367 | + kg_db = video_output / "results" / "knowledge_graph.db" | |
| 368 | + kg_json = video_output / "results" / "knowledge_graph.json" | |
| 369 | + if kg_db.exists(): | |
| 370 | + video_kg = KnowledgeGraph(db_path=kg_db) | |
| 371 | + merged_kg.merge(video_kg) | |
| 372 | + elif kg_json.exists(): | |
| 373 | + kg_data = json.loads(kg_json.read_text()) | |
| 291 | 374 | video_kg = KnowledgeGraph.from_dict(kg_data) |
| 292 | 375 | merged_kg.merge(video_kg) |
| 293 | 376 | |
| 294 | 377 | except Exception as e: |
| 295 | 378 | logging.error(f"Failed to process {video_path.name}: {e}") |
| @@ -300,13 +383,12 @@ | ||
| 300 | 383 | |
| 301 | 384 | traceback.print_exc() |
| 302 | 385 | |
| 303 | 386 | entries.append(entry) |
| 304 | 387 | |
| 305 | - # Save merged knowledge graph | |
| 306 | - merged_kg_path = Path(output) / "knowledge_graph.json" | |
| 307 | - merged_kg.save(merged_kg_path) | |
| 388 | + # Save merged knowledge graph (SQLite is primary, JSON is export) | |
| 389 | + merged_kg.save(Path(output) / "knowledge_graph.json") | |
| 308 | 390 | |
| 309 | 391 | # Generate batch summary |
| 310 | 392 | plan_gen = PlanGenerator(provider_manager=pm, knowledge_graph=merged_kg) |
| 311 | 393 | summary_path = Path(output) / "batch_summary.md" |
| 312 | 394 | plan_gen.generate_batch_summary( |
| @@ -336,10 +418,120 @@ | ||
| 336 | 418 | f"\n Batch complete: {batch_manifest.completed_videos}" |
| 337 | 419 | f"/{batch_manifest.total_videos} succeeded" |
| 338 | 420 | ) |
| 339 | 421 | click.echo(f" Results: {output}/batch_manifest.json") |
| 340 | 422 | |
| 423 | + | |
@cli.command()
@click.argument("input_path", type=click.Path(exists=True))
@click.option(
    "--output", "-o", type=click.Path(), default=None, help="Output directory for knowledge graph"
)
@click.option(
    "--db-path", type=click.Path(), default=None, help="Existing knowledge_graph.db to add to"
)
@click.option("--recursive/--no-recursive", "-r", default=True, help="Recurse into subdirectories")
@click.option(
    "--provider",
    "-p",
    type=click.Choice(
        [
            "auto",
            "openai",
            "anthropic",
            "gemini",
            "ollama",
            "azure",
            "together",
            "fireworks",
            "cerebras",
            "xai",
        ]
    ),
    default="auto",
    help="API provider",
)
@click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks")
@click.pass_context
def ingest(ctx, input_path, output, db_path, recursive, provider, chat_model):
    """Ingest documents into a knowledge graph.

    Supports: .md, .txt, .pdf (with pymupdf or pdfplumber installed)

    Examples:

        planopticon ingest spec.md

        planopticon ingest ./docs/ -o ./output

        planopticon ingest report.pdf --db-path existing.db
    """
    from video_processor.integrators.knowledge_graph import KnowledgeGraph
    from video_processor.processors import list_supported_extensions
    from video_processor.processors.ingest import ingest_directory, ingest_file
    from video_processor.providers.manager import ProviderManager

    input_path = Path(input_path)
    pm = ProviderManager(
        chat_model=chat_model,
        provider=None if provider == "auto" else provider,
    )

    # Resolve the database location: explicit --db-path wins, then --output,
    # then the current working directory.
    if db_path:
        kg_path = Path(db_path)
    elif output:
        out_dir = Path(output)
        out_dir.mkdir(parents=True, exist_ok=True)
        kg_path = out_dir / "knowledge_graph.db"
    else:
        kg_path = Path.cwd() / "knowledge_graph.db"
    kg_path.parent.mkdir(parents=True, exist_ok=True)

    click.echo(f"Knowledge graph: {kg_path}")
    kg = KnowledgeGraph(provider_manager=pm, db_path=kg_path)

    # Build one source-file -> chunk-count map for both the single-file and
    # the directory case, then report from it uniformly.
    try:
        if input_path.is_file():
            results = {str(input_path): ingest_file(input_path, kg)}
        elif input_path.is_dir():
            results = ingest_directory(input_path, kg, recursive=recursive)
        else:
            click.echo(f"Error: {input_path} is not a file or directory", err=True)
            sys.exit(1)
    except ValueError as e:
        click.echo(f"Error: {e}", err=True)
        click.echo(f"Supported extensions: {', '.join(list_supported_extensions())}")
        sys.exit(1)
    except ImportError as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)

    for fpath, count in results.items():
        click.echo(f" {Path(fpath).name}: {count} chunks")

    total_files = len(results)
    total_chunks = sum(results.values())

    # Save both .db and .json
    kg.save(kg_path)
    kg.save(kg_path.with_suffix(".json"))

    click.echo("\nIngestion complete:")
    click.echo(f" Files processed: {total_files}")
    click.echo(f" Total chunks: {total_chunks}")
    click.echo(f" Entities extracted: {kg._store.get_entity_count()}")
    click.echo(f" Relationships: {kg._store.get_relationship_count()}")
    click.echo(f" Knowledge graph: {kg_path}")
| 532 | + | |
| 341 | 533 | |
| 342 | 534 | @cli.command("list-models") |
| 343 | 535 | @click.pass_context |
| 344 | 536 | def list_models(ctx): |
| 345 | 537 | """Discover and display available models from all configured providers.""" |
| @@ -421,11 +613,24 @@ | ||
| 421 | 613 | ) |
| 422 | 614 | @click.option("--title", type=str, help="Title for the analysis report") |
| 423 | 615 | @click.option( |
| 424 | 616 | "--provider", |
| 425 | 617 | "-p", |
| 426 | - type=click.Choice(["auto", "openai", "anthropic", "gemini", "ollama"]), | |
| 618 | + type=click.Choice( | |
| 619 | + [ | |
| 620 | + "auto", | |
| 621 | + "openai", | |
| 622 | + "anthropic", | |
| 623 | + "gemini", | |
| 624 | + "ollama", | |
| 625 | + "azure", | |
| 626 | + "together", | |
| 627 | + "fireworks", | |
| 628 | + "cerebras", | |
| 629 | + "xai", | |
| 630 | + ] | |
| 631 | + ), | |
| 427 | 632 | default="auto", |
| 428 | 633 | help="API provider", |
| 429 | 634 | ) |
| 430 | 635 | @click.option("--vision-model", type=str, default=None, help="Override model for vision tasks") |
| 431 | 636 | @click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks") |
| @@ -462,10 +667,145 @@ | ||
| 462 | 667 | import traceback |
| 463 | 668 | |
| 464 | 669 | traceback.print_exc() |
| 465 | 670 | sys.exit(1) |
| 466 | 671 | |
| 672 | + | |
def _agent_repl(agent_inst, kb_ctx) -> None:
    """Interactive agent loop: plain lines are chat, /commands run skills.

    Supported slash commands: /plan, /skills, /summary; any other
    /<text> is executed as an agent request.
    """
    click.echo("\nPlanOpticon Agent (interactive mode)")
    click.echo("Type your request, or 'quit' to exit.\n")
    while True:
        try:
            line = click.prompt("agent", prompt_suffix="> ")
        # click.prompt converts Ctrl+C/EOF into click Abort — the original
        # (KeyboardInterrupt, EOFError) tuple never caught it, so Ctrl+C
        # killed the REPL with "Aborted!". Catch all three.
        except (KeyboardInterrupt, EOFError, click.exceptions.Abort):
            click.echo("\nBye.")
            break
        text = line.strip()
        if text.lower() in ("quit", "exit", "q"):
            click.echo("Bye.")
            break

        # Check for slash commands
        if text.startswith("/"):
            body = text[1:]
            if not body:
                # A bare "/" previously raised IndexError on "".split()[0].
                continue
            cmd = body.split()[0]
            # Artifact-producing branches assign `artifacts`; informational
            # branches `continue` so the print loop below never sees an
            # unbound/stale value.
            if cmd == "plan":
                artifacts = agent_inst.execute("Generate a project plan")
            elif cmd == "skills":
                from video_processor.agent.skills.base import list_skills

                for s in list_skills():
                    click.echo(f" {s.name}: {s.description}")
                continue
            elif cmd == "summary":
                if kb_ctx.sources:
                    click.echo(kb_ctx.summary())
                continue
            else:
                artifacts = agent_inst.execute(body)

            for a in artifacts:
                click.echo(f"\n--- {a.name} ({a.artifact_type}) ---\n")
                click.echo(a.content)
        else:
            response = agent_inst.chat(line)
            click.echo(f"\n{response}\n")


@cli.command()
@click.argument("request", required=False, default=None)
@click.option("--kb", multiple=True, type=click.Path(exists=True), help="Knowledge base paths")
@click.option("--interactive", "-I", is_flag=True, help="Interactive chat mode")
@click.option("--export", type=click.Path(), default=None, help="Export artifacts to directory")
@click.option(
    "--provider",
    "-p",
    type=click.Choice(
        [
            "auto",
            "openai",
            "anthropic",
            "gemini",
            "ollama",
            "azure",
            "together",
            "fireworks",
            "cerebras",
            "xai",
        ]
    ),
    default="auto",
    help="API provider",
)
@click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks")
@click.pass_context
def agent(ctx, request, kb, interactive, export, provider, chat_model):
    """AI planning agent. Synthesizes knowledge into project plans and artifacts.

    Examples:

        planopticon agent "Create a project plan" --kb ./results

        planopticon agent -I --kb ./videos --kb ./docs

        planopticon agent "Generate a PRD" --export ./output
    """
    # Ensure all skills are registered
    import video_processor.agent.skills  # noqa: F401
    from video_processor.agent.agent_loop import PlanningAgent
    from video_processor.agent.kb_context import KBContext
    from video_processor.agent.skills.base import AgentContext

    # Build provider manager (best-effort: interactive mode can still run
    # without one; one-shot mode warns).
    pm = None
    try:
        from video_processor.providers.manager import ProviderManager

        prov = None if provider == "auto" else provider
        pm = ProviderManager(chat_model=chat_model, provider=prov)
    except Exception:
        if not interactive:
            click.echo("Warning: could not initialize LLM provider.", err=True)

    # Load knowledge base from explicit --kb paths, or auto-discover.
    kb_ctx = KBContext()
    if kb:
        for path in kb:
            kb_ctx.add_source(Path(path))
        kb_ctx.load(provider_manager=pm)
        click.echo(kb_ctx.summary())
    else:
        # Auto-discover
        kb_ctx = KBContext.auto_discover(provider_manager=pm)
        if kb_ctx.sources:
            click.echo(kb_ctx.summary())
        else:
            click.echo("No knowledge base found. Use --kb to specify paths.")

    agent_inst = PlanningAgent(
        context=AgentContext(
            knowledge_graph=kb_ctx.knowledge_graph if kb_ctx.sources else None,
            query_engine=kb_ctx.query_engine if kb_ctx.sources else None,
            provider_manager=pm,
        )
    )

    if interactive:
        _agent_repl(agent_inst, kb_ctx)
    elif request:
        artifacts = agent_inst.execute(request)
        if not artifacts:
            click.echo("No artifacts generated. Try a more specific request.")
        for artifact in artifacts:
            click.echo(f"\n--- {artifact.name} ({artifact.artifact_type}) ---\n")
            click.echo(artifact.content)

        if export:
            from video_processor.agent.skills.artifact_export import export_artifacts

            export_dir = Path(export)
            export_artifacts(artifacts, export_dir)
            click.echo(f"Exported {len(artifacts)} artifacts to {export_dir}/")
            click.echo(f"Manifest: {export_dir / 'manifest.json'}")
    else:
        click.echo("Provide a request or use -I for interactive mode.")
        click.echo("Example: planopticon agent 'Create a project plan' --kb ./results")
| 806 | + | |
| 467 | 807 | |
| 468 | 808 | @cli.command() |
| 469 | 809 | @click.argument("question", required=False, default=None) |
| 470 | 810 | @click.option( |
| 471 | 811 | "--db-path", |
| @@ -488,28 +828,43 @@ | ||
| 488 | 828 | ) |
| 489 | 829 | @click.option("--interactive", "-I", is_flag=True, help="Enter interactive REPL mode") |
| 490 | 830 | @click.option( |
| 491 | 831 | "--provider", |
| 492 | 832 | "-p", |
| 493 | - type=click.Choice(["auto", "openai", "anthropic", "gemini", "ollama"]), | |
| 833 | + type=click.Choice( | |
| 834 | + [ | |
| 835 | + "auto", | |
| 836 | + "openai", | |
| 837 | + "anthropic", | |
| 838 | + "gemini", | |
| 839 | + "ollama", | |
| 840 | + "azure", | |
| 841 | + "together", | |
| 842 | + "fireworks", | |
| 843 | + "cerebras", | |
| 844 | + "xai", | |
| 845 | + ] | |
| 846 | + ), | |
| 494 | 847 | default="auto", |
| 495 | 848 | help="API provider for agentic mode", |
| 496 | 849 | ) |
| 497 | 850 | @click.option("--chat-model", type=str, default=None, help="Override model for agentic mode") |
| 498 | 851 | @click.pass_context |
| 499 | 852 | def query(ctx, question, db_path, mode, output_format, interactive, provider, chat_model): |
| 500 | 853 | """Query a knowledge graph. Runs stats if no question given. |
| 501 | 854 | |
| 502 | 855 | Direct commands recognized in QUESTION: stats, entities, relationships, |
| 503 | - neighbors, cypher. Natural language questions use agentic mode. | |
| 856 | + neighbors, sources, provenance, sql. Natural language questions use agentic mode. | |
| 504 | 857 | |
| 505 | 858 | Examples: |
| 506 | 859 | |
| 507 | 860 | planopticon query |
| 508 | 861 | planopticon query stats |
| 509 | 862 | planopticon query "entities --type technology" |
| 510 | 863 | planopticon query "neighbors Alice" |
| 864 | + planopticon query sources | |
| 865 | + planopticon query "provenance Alice" | |
| 511 | 866 | planopticon query "What was discussed?" |
| 512 | 867 | planopticon query -I |
| 513 | 868 | """ |
| 514 | 869 | from video_processor.integrators.graph_discovery import find_nearest_graph |
| 515 | 870 | from video_processor.integrators.graph_query import GraphQueryEngine |
| @@ -588,13 +943,20 @@ | ||
| 588 | 943 | |
| 589 | 944 | if cmd == "neighbors": |
| 590 | 945 | entity_name = " ".join(parts[1:]) if len(parts) > 1 else "" |
| 591 | 946 | return engine.neighbors(entity_name) |
| 592 | 947 | |
| 593 | - if cmd == "cypher": | |
| 594 | - cypher_query = " ".join(parts[1:]) | |
| 595 | - return engine.cypher(cypher_query) | |
| 948 | + if cmd == "sources": | |
| 949 | + return engine.sources() | |
| 950 | + | |
| 951 | + if cmd == "provenance": | |
| 952 | + entity_name = " ".join(parts[1:]) if len(parts) > 1 else "" | |
| 953 | + return engine.provenance(entity_name) | |
| 954 | + | |
| 955 | + if cmd == "sql": | |
| 956 | + sql_query = " ".join(parts[1:]) | |
| 957 | + return engine.sql(sql_query) | |
| 596 | 958 | |
| 597 | 959 | # Natural language → agentic (or fallback to entity search in direct mode) |
| 598 | 960 | if mode == "direct": |
| 599 | 961 | return engine.entities(name=question) |
| 600 | 962 | return engine.ask(question) |
| @@ -646,33 +1008,1078 @@ | ||
| 646 | 1008 | _print_result(result, output_format) |
| 647 | 1009 | click.echo() |
| 648 | 1010 | |
| 649 | 1011 | |
| 650 | 1012 | @cli.command() |
| 651 | -@click.argument("service", type=click.Choice(["google", "dropbox"])) | |
| 652 | -@click.pass_context | |
| 653 | -def auth(ctx, service): | |
| 654 | - """Authenticate with a cloud service (google or dropbox).""" | |
| 655 | - if service == "google": | |
| 656 | - from video_processor.sources.google_drive import GoogleDriveSource | |
| 657 | - | |
| 658 | - source = GoogleDriveSource(use_service_account=False) | |
| 659 | - if source.authenticate(): | |
| 660 | - click.echo("Google Drive authentication successful.") | |
| 661 | - else: | |
| 662 | - click.echo("Google Drive authentication failed.", err=True) | |
| 663 | - sys.exit(1) | |
| 664 | - | |
| 665 | - elif service == "dropbox": | |
| 666 | - from video_processor.sources.dropbox_source import DropboxSource | |
| 667 | - | |
| 668 | - source = DropboxSource() | |
| 669 | - if source.authenticate(): | |
| 670 | - click.echo("Dropbox authentication successful.") | |
| 671 | - else: | |
| 672 | - click.echo("Dropbox authentication failed.", err=True) | |
| 673 | - sys.exit(1) | |
@click.argument(
    "service",
    type=click.Choice(
        ["google", "dropbox", "zoom", "notion", "github", "microsoft"]
    ),
)
@click.option("--logout", is_flag=True, help="Clear saved token")
@click.pass_context
def auth(ctx, service, logout):
    """Authenticate with a cloud service via OAuth or API key.

    Uses OAuth when available, falls back to API keys.
    Tokens are saved to ~/.planopticon/ for reuse.

    Examples:

    planopticon auth google

    planopticon auth zoom

    planopticon auth github --logout
    """
    from video_processor.auth import get_auth_manager

    # Each service has its own auth manager; None means we don't know it.
    manager = get_auth_manager(service)
    if not manager:
        click.echo(f"Unknown service: {service}", err=True)
        sys.exit(1)

    # --logout is a pure local operation: drop the cached token and stop.
    if logout:
        manager.clear_token()
        click.echo(f"Cleared saved {service} token.")
        return

    result = manager.authenticate()
    if not result.success:
        click.echo(
            f"{service.title()} authentication failed: {result.error}",
            err=True,
        )
        sys.exit(1)
    click.echo(f"{service.title()} authentication successful ({result.method}).")
| 1063 | + | |
| 1064 | + | |
@cli.group()
def gws():
    """Google Workspace: fetch docs, sheets, and slides via the gws CLI."""
    # Group container only: the docstring is the body, so the previous
    # trailing `pass` was redundant and has been removed.
| 1069 | + | |
| 1070 | + | |
@gws.command("list")
@click.option("--folder-id", type=str, default=None, help="Drive folder ID to list")
@click.option("--query", "-q", type=str, default=None, help="Drive search query")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
def gws_list(folder_id, query, as_json):
    """List documents in Google Drive.

    Examples:

    planopticon gws list

    planopticon gws list --folder-id 1abc...

    planopticon gws list -q "name contains 'PRD'" --json
    """
    from video_processor.sources.gws_source import GWSSource

    source = GWSSource(folder_id=folder_id, query=query)
    if not source.authenticate():
        # Point the user at both the install and the login step.
        for line in (
            "Error: gws CLI not available or not authenticated.",
            "Install: npm install -g @googleworkspace/cli",
            "Auth: gws auth login",
        ):
            click.echo(line, err=True)
        sys.exit(1)

    docs = source.list_videos(folder_id=folder_id)
    if as_json:
        payload = [d.model_dump() for d in docs]
        click.echo(json.dumps(payload, indent=2, default=str))
        return
    if not docs:
        click.echo("No documents found.")
        return
    for doc in docs:
        size_label = f"{doc.size_bytes / 1024:.0f}KB" if doc.size_bytes else "—"
        click.echo(f" {doc.id[:12]}… {size_label:>8s} {doc.mime_type or ''} {doc.name}")
| 1105 | + | |
| 1106 | + | |
@gws.command("fetch")
@click.argument("doc_ids", nargs=-1)
@click.option("--folder-id", type=str, default=None, help="Fetch all docs in a folder")
@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
def gws_fetch(doc_ids, folder_id, output):
    """Fetch Google Docs/Sheets/Slides as text files.

    Examples:

    planopticon gws fetch DOC_ID1 DOC_ID2 -o ./docs

    planopticon gws fetch --folder-id 1abc... -o ./docs
    """
    from video_processor.sources.gws_source import GWSSource

    source = GWSSource(folder_id=folder_id, doc_ids=list(doc_ids))
    if not source.authenticate():
        click.echo("Error: gws CLI not available or not authenticated.", err=True)
        sys.exit(1)

    out_dir = Path(output) if output else Path.cwd() / "gws_docs"
    out_dir.mkdir(parents=True, exist_ok=True)

    files = source.list_videos(folder_id=folder_id)
    if not files:
        click.echo("No documents found.")
        return

    fetched = 0  # count only successful downloads for the summary line
    for f in files:
        # Sanitize path separators so a doc name cannot escape out_dir.
        safe_name = f.name.replace("/", "_").replace("\\", "_")
        dest = out_dir / f"{safe_name}.txt"
        try:
            source.download(f, dest)
        except Exception as e:
            # Best-effort: report and continue with the remaining docs.
            click.echo(f" ✗ {f.name}: {e}", err=True)
        else:
            fetched += 1
            click.echo(f" ✓ {f.name} → {dest}")

    # BUG FIX: previously reported len(files), which over-counted when
    # some downloads failed; report only the documents actually written.
    click.echo(f"\nFetched {fetched} document(s) to {out_dir}")
| 1145 | + | |
| 1146 | + | |
@gws.command("ingest")
@click.option("--folder-id", type=str, default=None, help="Drive folder ID")
@click.option("--doc-id", type=str, multiple=True, help="Specific doc IDs (repeatable)")
@click.option("--query", "-q", type=str, default=None, help="Drive search query")
@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
@click.option("--db-path", type=click.Path(), default=None, help="Existing DB to merge into")
@click.option(
    "-p",
    "--provider",
    type=click.Choice(
        [
            "auto",
            "openai",
            "anthropic",
            "gemini",
            "ollama",
            "azure",
            "together",
            "fireworks",
            "cerebras",
            "xai",
        ]
    ),
    default="auto",
    help="API provider",
)
@click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks")
@click.pass_context
def gws_ingest(ctx, folder_id, doc_id, query, output, db_path, provider, chat_model):
    """Fetch Google Workspace docs and ingest into a knowledge graph.

    Combines gws fetch + planopticon ingest in one step.

    Examples:

    planopticon gws ingest --folder-id 1abc...

    planopticon gws ingest --doc-id DOC1 --doc-id DOC2 -o ./results

    planopticon gws ingest -q "name contains 'spec'" --db-path existing.db
    """
    # Lazy imports keep `planopticon --help` fast; heavy deps load only
    # when the command actually runs.
    import tempfile

    from video_processor.integrators.knowledge_graph import KnowledgeGraph
    from video_processor.processors.ingest import ingest_file
    from video_processor.providers.manager import ProviderManager
    from video_processor.sources.gws_source import GWSSource

    source = GWSSource(folder_id=folder_id, doc_ids=list(doc_id), query=query)
    if not source.authenticate():
        click.echo("Error: gws CLI not available or not authenticated.", err=True)
        click.echo("Install: npm install -g @googleworkspace/cli", err=True)
        click.echo("Auth: gws auth login", err=True)
        sys.exit(1)

    # Fetch docs to temp dir
    files = source.list_videos(folder_id=folder_id)
    if not files:
        click.echo("No documents found.")
        return

    click.echo(f"Found {len(files)} document(s), fetching...")

    # Everything below stays inside the TemporaryDirectory so the fetched
    # text files exist for the full ingestion pass, then vanish on exit.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir)
        local_files = []  # only successfully downloaded docs are ingested
        for f in files:
            # Sanitize path separators so a doc name cannot escape tmp_path.
            safe_name = f.name.replace("/", "_").replace("\\", "_")
            dest = tmp_path / f"{safe_name}.txt"
            try:
                source.download(f, dest)
                local_files.append(dest)
                click.echo(f" ✓ {f.name}")
            except Exception as e:
                # Best-effort: report the failure and continue with the rest.
                click.echo(f" ✗ {f.name}: {e}", err=True)

        if not local_files:
            click.echo("No documents fetched successfully.", err=True)
            sys.exit(1)

        # Set up KG
        # "auto" is translated to None so ProviderManager can pick a
        # provider from whatever credentials are configured.
        prov = None if provider == "auto" else provider
        pm = ProviderManager(chat_model=chat_model, provider=prov)

        # Target DB resolution: explicit --db-path wins, then the --output
        # directory, otherwise the current working directory.
        if db_path:
            kg_path = Path(db_path)
        elif output:
            out_dir = Path(output)
            out_dir.mkdir(parents=True, exist_ok=True)
            kg_path = out_dir / "knowledge_graph.db"
        else:
            kg_path = Path.cwd() / "knowledge_graph.db"

        kg_path.parent.mkdir(parents=True, exist_ok=True)
        kg = KnowledgeGraph(provider_manager=pm, db_path=kg_path)

        total_chunks = 0
        for lf in local_files:
            try:
                count = ingest_file(lf, kg)
                total_chunks += count
                click.echo(f" Ingested {lf.stem}: {count} chunks")
            except Exception as e:
                click.echo(f" Failed to ingest {lf.stem}: {e}", err=True)

        # Persist twice on purpose: the SQLite DB plus a .json sidecar.
        kg.save(kg_path)
        kg.save(kg_path.with_suffix(".json"))

        # NOTE(review): reaches into the private _store attribute for
        # counts — presumably no public count API exists yet; confirm.
        entity_count = kg._store.get_entity_count()
        rel_count = kg._store.get_relationship_count()

        click.echo("\nIngestion complete:")
        click.echo(f" Documents: {len(local_files)}")
        click.echo(f" Chunks: {total_chunks}")
        click.echo(f" Entities: {entity_count}")
        click.echo(f" Relationships: {rel_count}")
        click.echo(f" Knowledge graph: {kg_path}")
| 1264 | + | |
| 1265 | + | |
@cli.group()
def m365():
    """Microsoft 365: fetch docs from SharePoint and OneDrive via the m365 CLI."""
    # Group container only: the docstring is the body, so the previous
    # trailing `pass` was redundant and has been removed.
| 1270 | + | |
| 1271 | + | |
@m365.command("list")
@click.option("--web-url", type=str, required=True, help="SharePoint site URL")
@click.option("--folder-url", type=str, required=True, help="Server-relative folder URL")
@click.option("--recursive", is_flag=True, help="Include subfolders")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
def m365_list(web_url, folder_url, recursive, as_json):
    """List documents in SharePoint or OneDrive.

    Examples:

    planopticon m365 list --web-url https://contoso.sharepoint.com/sites/proj \\
    --folder-url /sites/proj/Shared\\ Documents

    planopticon m365 list --web-url URL --folder-url FOLDER --recursive --json
    """
    from video_processor.sources.m365_source import M365Source

    source = M365Source(web_url=web_url, folder_url=folder_url, recursive=recursive)
    if not source.authenticate():
        # Point the user at both the install and the login step.
        for line in (
            "Error: m365 CLI not available or not logged in.",
            "Install: npm install -g @pnp/cli-microsoft365",
            "Auth: m365 login",
        ):
            click.echo(line, err=True)
        sys.exit(1)

    docs = source.list_videos()
    if as_json:
        click.echo(json.dumps([d.model_dump() for d in docs], indent=2, default=str))
        return
    if not docs:
        click.echo("No documents found.")
        return
    for doc in docs:
        size_label = f"{doc.size_bytes / 1024:.0f}KB" if doc.size_bytes else "—"
        click.echo(f" {doc.id[:12]}… {size_label:>8s} {doc.name}")
| 1306 | + | |
| 1307 | + | |
@m365.command("fetch")
@click.option("--web-url", type=str, required=True, help="SharePoint site URL")
@click.option("--folder-url", type=str, default=None, help="Server-relative folder URL")
@click.option("--file-id", type=str, multiple=True, help="Specific file IDs (repeatable)")
@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
def m365_fetch(web_url, folder_url, file_id, output):
    """Fetch SharePoint/OneDrive documents as local files.

    Examples:

    planopticon m365 fetch --web-url URL --folder-url FOLDER -o ./docs

    planopticon m365 fetch --web-url URL --file-id ID1 --file-id ID2 -o ./docs
    """
    from video_processor.sources.m365_source import M365Source

    source = M365Source(web_url=web_url, folder_url=folder_url, file_ids=list(file_id))
    if not source.authenticate():
        click.echo("Error: m365 CLI not available or not logged in.", err=True)
        sys.exit(1)

    out_dir = Path(output) if output else Path.cwd() / "m365_docs"
    out_dir.mkdir(parents=True, exist_ok=True)

    files = source.list_videos()
    if not files:
        click.echo("No documents found.")
        return

    fetched = 0  # count only successful downloads for the summary line
    for f in files:
        # Sanitize path separators (consistent with `gws fetch`) so a
        # document name cannot escape out_dir.
        safe_name = f.name.replace("/", "_").replace("\\", "_")
        dest = out_dir / safe_name
        try:
            source.download(f, dest)
        except Exception as e:
            # Best-effort: report and continue with the remaining docs.
            click.echo(f" failed {f.name}: {e}", err=True)
        else:
            fetched += 1
            click.echo(f" fetched {f.name}")

    # BUG FIX: previously reported len(files), which over-counted when
    # some downloads failed; report only the documents actually written.
    click.echo(f"\nFetched {fetched} document(s) to {out_dir}")
| 1346 | + | |
| 1347 | + | |
@m365.command("ingest")
@click.option("--web-url", type=str, required=True, help="SharePoint site URL")
@click.option("--folder-url", type=str, default=None, help="Server-relative folder URL")
@click.option("--file-id", type=str, multiple=True, help="Specific file IDs (repeatable)")
@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
@click.option("--db-path", type=click.Path(), default=None, help="Existing DB to merge into")
@click.option(
    "-p",
    "--provider",
    type=click.Choice(
        [
            "auto",
            "openai",
            "anthropic",
            "gemini",
            "ollama",
            "azure",
            "together",
            "fireworks",
            "cerebras",
            "xai",
        ]
    ),
    default="auto",
    help="API provider",
)
@click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks")
@click.pass_context
def m365_ingest(ctx, web_url, folder_url, file_id, output, db_path, provider, chat_model):
    """Fetch SharePoint/OneDrive docs and ingest into a knowledge graph.

    Examples:

    planopticon m365 ingest --web-url URL --folder-url FOLDER

    planopticon m365 ingest --web-url URL --file-id ID1 --file-id ID2 -o ./results
    """
    # Lazy imports keep `planopticon --help` fast; heavy deps load only
    # when the command actually runs.
    import tempfile

    from video_processor.integrators.knowledge_graph import KnowledgeGraph
    from video_processor.processors.ingest import ingest_file
    from video_processor.providers.manager import ProviderManager
    from video_processor.sources.m365_source import M365Source

    source = M365Source(web_url=web_url, folder_url=folder_url, file_ids=list(file_id))
    if not source.authenticate():
        click.echo("Error: m365 CLI not available or not logged in.", err=True)
        click.echo("Install: npm install -g @pnp/cli-microsoft365", err=True)
        click.echo("Auth: m365 login", err=True)
        sys.exit(1)

    files = source.list_videos()
    if not files:
        click.echo("No documents found.")
        return

    click.echo(f"Found {len(files)} document(s), fetching...")

    # Fetched text files live only for the duration of the ingestion pass.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir)
        local_files = []  # only successfully fetched docs are ingested
        for f in files:
            try:
                # BUG FIX: fetch each document exactly once, as extracted
                # text. The old code first downloaded the raw binary to a
                # sibling path that was never read, doubling the network
                # traffic per document.
                text = source.download_as_text(f)
                text_dest = tmp_path / f"{Path(f.name).stem}.txt"
                text_dest.write_text(text, encoding="utf-8")
                local_files.append(text_dest)
                click.echo(f" fetched {f.name}")
            except Exception as e:
                # Best-effort: report the failure and continue with the rest.
                click.echo(f" failed {f.name}: {e}", err=True)

        if not local_files:
            click.echo("No documents fetched successfully.", err=True)
            sys.exit(1)

        # "auto" is translated to None so ProviderManager can pick a
        # provider from whatever credentials are configured.
        prov = None if provider == "auto" else provider
        pm = ProviderManager(chat_model=chat_model, provider=prov)

        # Target DB resolution: explicit --db-path wins, then the --output
        # directory, otherwise the current working directory.
        if db_path:
            kg_path = Path(db_path)
        elif output:
            out_dir = Path(output)
            out_dir.mkdir(parents=True, exist_ok=True)
            kg_path = out_dir / "knowledge_graph.db"
        else:
            kg_path = Path.cwd() / "knowledge_graph.db"

        kg_path.parent.mkdir(parents=True, exist_ok=True)
        kg = KnowledgeGraph(provider_manager=pm, db_path=kg_path)

        total_chunks = 0
        for lf in local_files:
            try:
                count = ingest_file(lf, kg)
                total_chunks += count
                click.echo(f" Ingested {lf.stem}: {count} chunks")
            except Exception as e:
                click.echo(f" Failed to ingest {lf.stem}: {e}", err=True)

        # Persist twice on purpose: the SQLite DB plus a .json sidecar.
        kg.save(kg_path)
        kg.save(kg_path.with_suffix(".json"))

        # NOTE(review): reaches into the private _store attribute for
        # counts — presumably no public count API exists yet; confirm.
        entity_count = kg._store.get_entity_count()
        rel_count = kg._store.get_relationship_count()

        click.echo("\nIngestion complete:")
        click.echo(f" Documents: {len(local_files)}")
        click.echo(f" Chunks: {total_chunks}")
        click.echo(f" Entities: {entity_count}")
        click.echo(f" Relationships: {rel_count}")
        click.echo(f" Knowledge graph: {kg_path}")
| 1462 | + | |
| 1463 | + | |
@cli.group()
def export():
    """Export knowledge graphs as markdown docs, notes, or CSV."""
    # Group container only: the docstring is the body, so the previous
    # trailing `pass` was redundant and has been removed.
| 1468 | + | |
| 1469 | + | |
@export.command("markdown")
@click.argument("db_path", type=click.Path(exists=True))
@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
@click.option(
    "--type",
    "doc_types",
    type=click.Choice(
        [
            "summary",
            "meeting-notes",
            "glossary",
            "relationship-map",
            "status-report",
            "entity-index",
            "csv",
            "all",
        ]
    ),
    multiple=True,
    default=("all",),
    help="Document types to generate (repeatable)",
)
def export_markdown(db_path, output, doc_types):
    """Generate markdown documents from a knowledge graph.

    No API key needed — pure template-based generation.

    Examples:

    planopticon export markdown knowledge_graph.db

    planopticon export markdown kg.db -o ./docs --type summary --type glossary

    planopticon export markdown kg.db --type meeting-notes --type csv
    """
    from video_processor.exporters.markdown import generate_all
    from video_processor.integrators.knowledge_graph import KnowledgeGraph

    db_path = Path(db_path)
    out_dir = Path(output) if output else Path.cwd() / "export"

    kg_data = KnowledgeGraph(db_path=db_path).to_dict()

    # "all" means: let the generator pick every document type.
    selected = None if "all" in doc_types else list(doc_types)
    created = generate_all(kg_data, out_dir, doc_types=selected)

    click.echo(f"Generated {len(created)} files in {out_dir}/")
    # Top-level files are listed by name; entity briefs (in a subdir)
    # are summarized as a single count.
    top_level = [p for p in sorted(created) if p.parent == out_dir]
    for p in top_level:
        click.echo(f" {p.name}")
    brief_count = len(created) - len(top_level)
    if brief_count:
        click.echo(f" entities/ ({brief_count} entity briefs)")
| 1525 | + | |
| 1526 | + | |
@export.command("obsidian")
@click.argument("db_path", type=click.Path(exists=True))
@click.option("-o", "--output", type=click.Path(), default=None, help="Output vault directory")
def export_obsidian(db_path, output):
    """Export knowledge graph as an Obsidian vault with frontmatter and wiki-links.

    Examples:

    planopticon export obsidian knowledge_graph.db -o ./my-vault
    """
    from video_processor.agent.skills.notes_export import export_to_obsidian
    from video_processor.integrators.knowledge_graph import KnowledgeGraph

    vault_dir = Path(output) if output else Path.cwd() / "obsidian-vault"

    graph_data = KnowledgeGraph(db_path=Path(db_path)).to_dict()
    notes = export_to_obsidian(graph_data, vault_dir)

    click.echo(f"Exported Obsidian vault: {len(notes)} notes in {vault_dir}/")
| 1548 | + | |
| 1549 | + | |
@export.command("notion")
@click.argument("db_path", type=click.Path(exists=True))
@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
def export_notion(db_path, output):
    """Export knowledge graph as Notion-compatible markdown + CSV database.

    Examples:

    planopticon export notion knowledge_graph.db -o ./notion-export
    """
    from video_processor.agent.skills.notes_export import export_to_notion_md
    from video_processor.integrators.knowledge_graph import KnowledgeGraph

    target_dir = Path(output) if output else Path.cwd() / "notion-export"

    graph_data = KnowledgeGraph(db_path=Path(db_path)).to_dict()
    written = export_to_notion_md(graph_data, target_dir)

    click.echo(f"Exported Notion markdown: {len(written)} files in {target_dir}/")
| 1571 | + | |
| 1572 | + | |
@export.command("exchange")
@click.argument("db_path", type=click.Path(exists=True))
@click.option(
    "-o",
    "--output",
    type=click.Path(),
    default=None,
    help="Output JSON file path",
)
@click.option(
    "--name",
    "project_name",
    type=str,
    default="Untitled",
    help="Project name for the exchange payload",
)
@click.option(
    "--description",
    "project_desc",
    type=str,
    default="",
    help="Project description",
)
def export_exchange(db_path, output, project_name, project_desc):
    """Export a knowledge graph as a PlanOpticonExchange JSON file.

    Examples:

    planopticon export exchange knowledge_graph.db

    planopticon export exchange kg.db -o exchange.json --name "My Project"
    """
    from video_processor.exchange import PlanOpticonExchange
    from video_processor.integrators.knowledge_graph import KnowledgeGraph

    graph_data = KnowledgeGraph(db_path=Path(db_path)).to_dict()

    exchange = PlanOpticonExchange.from_knowledge_graph(
        graph_data,
        project_name=project_name,
        project_description=project_desc,
    )

    destination = Path(output) if output else Path.cwd() / "exchange.json"
    exchange.to_file(destination)

    click.echo(
        f"Exported PlanOpticonExchange to {destination} "
        f"({len(exchange.entities)} entities, "
        f"{len(exchange.relationships)} relationships)"
    )
| 1626 | + | |
| 1627 | + | |
@cli.group()
def wiki():
    """Generate and push GitHub wikis from knowledge graphs."""
    # Group container only: the docstring is the body, so the previous
    # trailing `pass` was redundant and has been removed.
| 1632 | + | |
| 1633 | + | |
@wiki.command("generate")
@click.argument("db_path", type=click.Path(exists=True))
@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory for wiki")
@click.option("--title", type=str, default="Knowledge Base", help="Wiki title")
def wiki_generate(db_path, output, title):
    """Generate a GitHub wiki from a knowledge graph.

    Examples:

        planopticon wiki generate knowledge_graph.db -o ./wiki

        planopticon wiki generate results/kg.db --title "Project Wiki"
    """
    from video_processor.agent.skills.wiki_generator import generate_wiki, write_wiki
    from video_processor.integrators.knowledge_graph import KnowledgeGraph

    # Output directory defaults to ./wiki under the current working directory.
    target_dir = Path(output) if output else Path.cwd() / "wiki"

    graph = KnowledgeGraph(db_path=Path(db_path))
    pages = generate_wiki(graph.to_dict(), title=title)
    written = write_wiki(pages, target_dir)

    click.echo(f"Generated {len(written)} wiki pages in {target_dir}")
    for page in sorted(written):
        click.echo(f" {page.name}")
| 1661 | + | |
| 1662 | + | |
@wiki.command("push")
@click.argument("wiki_dir", type=click.Path(exists=True))
@click.argument("repo", type=str)
@click.option("--message", "-m", type=str, default="Update wiki", help="Commit message")
def wiki_push(wiki_dir, repo, message):
    """Push generated wiki pages to a GitHub wiki repo.

    REPO should be in 'owner/repo' format.

    Examples:

        planopticon wiki push ./wiki ConflictHQ/PlanOpticon

        planopticon wiki push ./wiki owner/repo -m "Add entity pages"
    """
    from video_processor.agent.skills.wiki_generator import push_wiki

    # push_wiki returns a plain success flag; exit non-zero on failure so
    # scripts can react.
    if push_wiki(Path(wiki_dir), repo, message=message):
        click.echo(f"Wiki pushed to https://github.com/{repo}/wiki")
        return
    click.echo("Wiki push failed. Check auth and repo permissions.", err=True)
    sys.exit(1)
| 1687 | + | |
| 1688 | + | |
@cli.group()
def recordings():
    """Fetch meeting recordings from Zoom, Teams, and Google Meet."""
| 1693 | + | |
| 1694 | + | |
@recordings.command("zoom-list")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
def recordings_zoom_list(as_json):
    """List Zoom cloud recordings.

    Requires ZOOM_CLIENT_ID (and optionally ZOOM_CLIENT_SECRET,
    ZOOM_ACCOUNT_ID) environment variables.

    Examples:

        planopticon recordings zoom-list

        planopticon recordings zoom-list --json
    """
    from video_processor.sources.zoom_source import ZoomSource

    src = ZoomSource()
    if not src.authenticate():
        click.echo("Zoom authentication failed.", err=True)
        sys.exit(1)

    found = src.list_videos()
    if as_json:
        # Serialize the dataclass-like records directly; default=str covers
        # datetimes and paths.
        click.echo(json.dumps([item.__dict__ for item in found], indent=2, default=str))
        return

    click.echo(f"Found {len(found)} recording(s):")
    for item in found:
        size = f"{item.size_bytes // 1_000_000} MB" if item.size_bytes else "unknown"
        click.echo(f" {item.name} ({size}) {item.modified_at or ''}")
| 1724 | + | |
| 1725 | + | |
@recordings.command("teams-list")
@click.option("--user-id", default="me", help="Microsoft user ID")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
def recordings_teams_list(user_id, as_json):
    """List Teams meeting recordings via the m365 CLI.

    Requires: npm install -g @pnp/cli-microsoft365 && m365 login

    Examples:

        planopticon recordings teams-list

        planopticon recordings teams-list --json
    """
    from video_processor.sources.teams_recording_source import TeamsRecordingSource

    src = TeamsRecordingSource(user_id=user_id)
    if not src.authenticate():
        click.echo("Teams authentication failed.", err=True)
        sys.exit(1)

    found = src.list_videos()
    if as_json:
        click.echo(json.dumps([item.__dict__ for item in found], indent=2, default=str))
        return

    click.echo(f"Found {len(found)} recording(s):")
    for item in found:
        click.echo(f" {item.name} {item.modified_at or ''}")
| 1756 | + | |
| 1757 | + | |
@recordings.command("meet-list")
@click.option("--folder-id", default=None, help="Drive folder ID")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
def recordings_meet_list(folder_id, as_json):
    """List Google Meet recordings in Drive via the gws CLI.

    Requires: npm install -g @googleworkspace/cli && gws auth login

    Examples:

        planopticon recordings meet-list

        planopticon recordings meet-list --folder-id abc123
    """
    from video_processor.sources.meet_recording_source import MeetRecordingSource

    src = MeetRecordingSource(drive_folder_id=folder_id)
    if not src.authenticate():
        click.echo("Google Meet authentication failed.", err=True)
        sys.exit(1)

    found = src.list_videos()
    if as_json:
        click.echo(json.dumps([item.__dict__ for item in found], indent=2, default=str))
        return

    click.echo(f"Found {len(found)} recording(s):")
    for item in found:
        size = f"{item.size_bytes // 1_000_000} MB" if item.size_bytes else "unknown"
        click.echo(f" {item.name} ({size}) {item.modified_at or ''}")
| 1789 | + | |
| 1790 | + | |
@cli.group()
def kg():
    """Knowledge graph utilities: convert, sync, and inspect."""
| 1795 | + | |
| 1796 | + | |
@kg.command()
@click.argument("source_path", type=click.Path(exists=True))
@click.argument("dest_path", type=click.Path())
def convert(source_path, dest_path):
    """Convert a knowledge graph between formats.

    Supports .db (SQLite) and .json. The output format is inferred from DEST_PATH extension.

    Examples:

        planopticon kg convert results/knowledge_graph.db output.json
        planopticon kg convert knowledge_graph.json knowledge_graph.db
    """
    from video_processor.integrators.graph_store import InMemoryStore, SQLiteStore
    from video_processor.integrators.knowledge_graph import KnowledgeGraph

    source_path = Path(source_path)
    dest_path = Path(dest_path)

    # A same-suffix "conversion" would be a no-op copy; reject it up front.
    if source_path.suffix == dest_path.suffix:
        click.echo(f"Source and destination are the same format ({source_path.suffix}).", err=True)
        sys.exit(1)

    # Load the source into a store. A JSON payload is replayed into an
    # in-memory store entity-by-entity so both branches expose the same API.
    if source_path.suffix == ".db":
        src_store = SQLiteStore(source_path)
    elif source_path.suffix == ".json":
        data = json.loads(source_path.read_text())
        src_store = InMemoryStore()
        for node in data.get("nodes", []):
            descs = node.get("descriptions", [])
            if isinstance(descs, set):  # defensive: json.loads never yields sets
                descs = list(descs)
            src_store.merge_entity(node.get("name", ""), node.get("type", "concept"), descs)
            for occ in node.get("occurrences", []):
                src_store.add_occurrence(
                    node.get("name", ""),
                    occ.get("source", ""),
                    occ.get("timestamp"),
                    occ.get("text"),
                )
        for rel in data.get("relationships", []):
            src_store.add_relationship(
                rel.get("source", ""),
                rel.get("target", ""),
                rel.get("type", "related_to"),
                content_source=rel.get("content_source"),
                timestamp=rel.get("timestamp"),
            )
    else:
        click.echo(f"Unsupported source format: {source_path.suffix}", err=True)
        sys.exit(1)

    # BUG FIX: the store was previously closed only when save() succeeded;
    # try/finally guarantees the underlying handle is released on any exit.
    try:
        kg_obj = KnowledgeGraph(store=src_store)
        kg_obj.save(dest_path)

        e_count = src_store.get_entity_count()
        r_count = src_store.get_relationship_count()
        click.echo(
            f"Converted {source_path} → {dest_path} ({e_count} entities, {r_count} relationships)"
        )
    finally:
        if hasattr(src_store, "close"):
            src_store.close()
| 1863 | + | |
| 1864 | + | |
@kg.command()
@click.argument("db_path", type=click.Path(exists=True))
@click.argument("json_path", type=click.Path(), required=False, default=None)
@click.option(
    "--direction",
    type=click.Choice(["db-to-json", "json-to-db", "auto"]),
    default="auto",
    help="Sync direction. 'auto' picks the newer file as source.",
)
def sync(db_path, json_path, direction):
    """Sync a .db and .json knowledge graph, updating the stale one.

    If JSON_PATH is omitted, uses the same name with .json extension.

    Examples:

        planopticon kg sync results/knowledge_graph.db
        planopticon kg sync knowledge_graph.db knowledge_graph.json --direction db-to-json
    """
    db_path = Path(db_path)
    json_path = db_path.with_suffix(".json") if json_path is None else Path(json_path)

    if direction == "auto":
        # Pick the newer file as the source of truth. The .db always exists
        # here (click enforces exists=True), so only the JSON side can be
        # missing — the old `not db_path.exists()` branch was dead code.
        if not json_path.exists():
            direction = "db-to-json"
        else:
            newer_db = db_path.stat().st_mtime >= json_path.stat().st_mtime
            direction = "db-to-json" if newer_db else "json-to-db"

    from video_processor.integrators.knowledge_graph import KnowledgeGraph

    if direction == "db-to-json":
        kg_obj = KnowledgeGraph(db_path=db_path)
        kg_obj.save(json_path)
        click.echo(f"Synced {db_path} → {json_path}")
    else:
        # BUG FIX: a forced --direction json-to-db with a missing JSON file
        # previously crashed with an unhandled traceback from read_text().
        if not json_path.exists():
            click.echo(f"JSON file not found: {json_path}", err=True)
            sys.exit(1)
        data = json.loads(json_path.read_text())
        kg_obj = KnowledgeGraph.from_dict(data, db_path=db_path)
        # Force write to db by saving
        kg_obj.save(db_path)
        click.echo(f"Synced {json_path} → {db_path}")

    # NOTE(review): reaches into the private _store attribute; a public
    # count accessor on KnowledgeGraph would be cleaner — confirm API.
    click.echo(
        f" {kg_obj._store.get_entity_count()} entities, "
        f"{kg_obj._store.get_relationship_count()} relationships"
    )
| 1917 | + | |
| 1918 | + | |
@kg.command()
@click.argument("path", type=click.Path(exists=True))
def inspect(path):
    """Show summary stats for a knowledge graph file (.db or .json)."""
    from video_processor.integrators.graph_discovery import describe_graph

    path = Path(path)
    summary = describe_graph(path)

    click.echo(f"File: {path}")
    click.echo(f"Store: {summary['store_type']}")
    click.echo(f"Entities: {summary['entity_count']}")
    click.echo(f"Relationships: {summary['relationship_count']}")

    if summary["entity_types"]:
        click.echo("Entity types:")
        # Most frequent entity types first.
        for etype, count in sorted(summary["entity_types"].items(), key=lambda kv: -kv[1]):
            click.echo(f" {etype}: {count}")
| 1935 | + | |
| 1936 | + | |
@kg.command()
@click.argument("db_path", type=click.Path(exists=True))
@click.option("--provider", "-p", type=str, default="auto")
@click.option("--chat-model", type=str, default=None)
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["text", "json"]),
    default="text",
)
@click.pass_context
def classify(ctx, db_path, provider, chat_model, output_format):
    """Classify knowledge graph entities into planning taxonomy types.

    Examples:\n
        planopticon kg classify results/knowledge_graph.db\n
        planopticon kg classify results/knowledge_graph.db --format json
    """
    from video_processor.integrators.graph_store import create_store
    from video_processor.integrators.taxonomy import TaxonomyClassifier

    store = create_store(Path(db_path))
    # BUG FIX: the early return on "no entities matched" previously skipped
    # store.close(); try/finally guarantees the store is always released.
    try:
        entities = store.get_all_entities()
        relationships = store.get_all_relationships()

        # Optional LLM assist: any failure to build a provider falls back to
        # heuristic-only classification instead of aborting the command.
        pm = None
        if provider != "none":
            try:
                from video_processor.providers.manager import ProviderManager

                pm = ProviderManager(provider=provider if provider != "auto" else None)
                if chat_model:
                    pm.chat_model = chat_model
            except Exception:
                pm = None  # fall back to heuristic-only

        classifier = TaxonomyClassifier(provider_manager=pm)
        planning_entities = classifier.classify_entities(entities, relationships)

        if output_format == "json":
            click.echo(
                json.dumps(
                    [pe.model_dump() for pe in planning_entities],
                    indent=2,
                )
            )
        elif not planning_entities:
            click.echo("No entities matched planning taxonomy types.")
        else:
            workstreams = classifier.organize_by_workstream(planning_entities)
            for group_name, items in sorted(workstreams.items()):
                click.echo(f"\n{group_name.upper()} ({len(items)})")
                for pe in items:
                    priority_str = f" [{pe.priority}]" if pe.priority else ""
                    click.echo(f" - {pe.name}{priority_str}")
                    if pe.description:
                        click.echo(f" {pe.description}")
    finally:
        store.close()
| 1998 | + | |
| 1999 | + | |
@kg.command("from-exchange")
@click.argument("exchange_path", type=click.Path(exists=True))
@click.option(
    "-o",
    "--output",
    "db_path",
    type=click.Path(),
    default=None,
    help="Output .db file path",
)
def kg_from_exchange(exchange_path, db_path):
    """Import a PlanOpticonExchange JSON file into a knowledge graph .db.

    Examples:

        planopticon kg from-exchange exchange.json

        planopticon kg from-exchange exchange.json -o project.db
    """
    from video_processor.exchange import PlanOpticonExchange
    from video_processor.integrators.knowledge_graph import KnowledgeGraph

    exchange = PlanOpticonExchange.from_file(exchange_path)

    # Re-shape the exchange payload into the dict layout KnowledgeGraph expects.
    graph_payload = {
        "nodes": [entity.model_dump() for entity in exchange.entities],
        "relationships": [rel.model_dump() for rel in exchange.relationships],
        "sources": [src.model_dump() for src in exchange.sources],
    }

    destination = Path(db_path) if db_path else Path.cwd() / "knowledge_graph.db"
    graph = KnowledgeGraph.from_dict(graph_payload, db_path=destination)
    graph.save(destination)

    click.echo(
        f"Imported exchange into {destination} "
        f"({len(exchange.entities)} entities, "
        f"{len(exchange.relationships)} relationships)"
    )
| 2039 | + | |
| 2040 | + | |
@cli.command()
@click.option(
    "--kb",
    multiple=True,
    type=click.Path(exists=True),
    help="Knowledge base paths",
)
@click.option(
    "--provider",
    "-p",
    type=str,
    default="auto",
    help="LLM provider (auto, openai, anthropic, ...)",
)
@click.option(
    "--chat-model",
    type=str,
    default=None,
    help="Chat model override",
)
@click.pass_context
def companion(ctx, kb, provider, chat_model):
    """Interactive planning companion with workspace awareness.

    Examples:

        planopticon companion

        planopticon companion --kb ./results

        planopticon companion -p anthropic
    """
    from video_processor.cli.companion import CompanionREPL

    # Hand every option straight to the REPL; it owns the interaction loop.
    CompanionREPL(
        kb_paths=list(kb),
        provider=provider,
        chat_model=chat_model,
    ).run()
| 674 | 2081 | |
| 675 | 2082 | |
| 676 | 2083 | def _interactive_menu(ctx): |
| 677 | 2084 | """Show an interactive menu when planopticon is run with no arguments.""" |
| 678 | 2085 | click.echo() |
| @@ -698,11 +2105,24 @@ | ||
| 698 | 2105 | type=click.Choice(["basic", "standard", "comprehensive"]), |
| 699 | 2106 | default="standard", |
| 700 | 2107 | ) |
| 701 | 2108 | provider = click.prompt( |
| 702 | 2109 | " Provider", |
| 703 | - type=click.Choice(["auto", "openai", "anthropic", "gemini", "ollama"]), | |
| 2110 | + type=click.Choice( | |
| 2111 | + [ | |
| 2112 | + "auto", | |
| 2113 | + "openai", | |
| 2114 | + "anthropic", | |
| 2115 | + "gemini", | |
| 2116 | + "ollama", | |
| 2117 | + "azure", | |
| 2118 | + "together", | |
| 2119 | + "fireworks", | |
| 2120 | + "cerebras", | |
| 2121 | + "xai", | |
| 2122 | + ] | |
| 2123 | + ), | |
| 704 | 2124 | default="auto", |
| 705 | 2125 | ) |
| 706 | 2126 | ctx.invoke( |
| 707 | 2127 | analyze, |
| 708 | 2128 | input=input_path, |
| @@ -727,11 +2147,24 @@ | ||
| 727 | 2147 | type=click.Choice(["basic", "standard", "comprehensive"]), |
| 728 | 2148 | default="standard", |
| 729 | 2149 | ) |
| 730 | 2150 | provider = click.prompt( |
| 731 | 2151 | " Provider", |
| 732 | - type=click.Choice(["auto", "openai", "anthropic", "gemini", "ollama"]), | |
| 2152 | + type=click.Choice( | |
| 2153 | + [ | |
| 2154 | + "auto", | |
| 2155 | + "openai", | |
| 2156 | + "anthropic", | |
| 2157 | + "gemini", | |
| 2158 | + "ollama", | |
| 2159 | + "azure", | |
| 2160 | + "together", | |
| 2161 | + "fireworks", | |
| 2162 | + "cerebras", | |
| 2163 | + "xai", | |
| 2164 | + ] | |
| 2165 | + ), | |
| 733 | 2166 | default="auto", |
| 734 | 2167 | ) |
| 735 | 2168 | ctx.invoke( |
| 736 | 2169 | batch, |
| 737 | 2170 | input_dir=input_dir, |
| 738 | 2171 | |
| 739 | 2172 | ADDED video_processor/cli/companion.py |
| 740 | 2173 | ADDED video_processor/exchange.py |
| 741 | 2174 | ADDED video_processor/exporters/__init__.py |
| 742 | 2175 | ADDED video_processor/exporters/markdown.py |
| --- video_processor/cli/commands.py | |
| +++ video_processor/cli/commands.py | |
| @@ -34,19 +34,38 @@ | |
| 34 | root_logger.addHandler(console_handler) |
| 35 | |
| 36 | |
| 37 | @click.group(invoke_without_command=True) |
| 38 | @click.option("--verbose", "-v", is_flag=True, help="Enable verbose output") |
| 39 | @click.version_option("0.2.0", prog_name="PlanOpticon") |
| 40 | @click.pass_context |
| 41 | def cli(ctx, verbose): |
| 42 | """PlanOpticon - Comprehensive Video Analysis & Knowledge Extraction Tool.""" |
| 43 | ctx.ensure_object(dict) |
| 44 | ctx.obj["verbose"] = verbose |
| 45 | setup_logging(verbose) |
| 46 | |
| 47 | if ctx.invoked_subcommand is None: |
| 48 | _interactive_menu(ctx) |
| 49 | |
| 50 | |
| 51 | @cli.command() |
| 52 | @click.option( |
| @@ -73,16 +92,47 @@ | |
| 73 | ) |
| 74 | @click.option("--title", type=str, help="Title for the analysis report") |
| 75 | @click.option( |
| 76 | "--provider", |
| 77 | "-p", |
| 78 | type=click.Choice(["auto", "openai", "anthropic", "gemini", "ollama"]), |
| 79 | default="auto", |
| 80 | help="API provider", |
| 81 | ) |
| 82 | @click.option("--vision-model", type=str, default=None, help="Override model for vision tasks") |
| 83 | @click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks") |
| 84 | @click.pass_context |
| 85 | def analyze( |
| 86 | ctx, |
| 87 | input, |
| 88 | output, |
| @@ -94,26 +144,35 @@ | |
| 94 | periodic_capture, |
| 95 | title, |
| 96 | provider, |
| 97 | vision_model, |
| 98 | chat_model, |
| 99 | ): |
| 100 | """Analyze a single video and extract structured knowledge.""" |
| 101 | from video_processor.pipeline import process_single_video |
| 102 | from video_processor.providers.manager import ProviderManager |
| 103 | |
| 104 | focus_areas = [a.strip().lower() for a in focus.split(",")] if focus else [] |
| 105 | prov = None if provider == "auto" else provider |
| 106 | |
| 107 | pm = ProviderManager( |
| 108 | vision_model=vision_model, |
| 109 | chat_model=chat_model, |
| 110 | provider=prov, |
| 111 | ) |
| 112 | |
| 113 | try: |
| 114 | process_single_video( |
| 115 | input_path=input, |
| 116 | output_dir=output, |
| 117 | provider_manager=pm, |
| 118 | depth=depth, |
| 119 | focus_areas=focus_areas, |
| @@ -120,16 +179,23 @@ | |
| 120 | sampling_rate=sampling_rate, |
| 121 | change_threshold=change_threshold, |
| 122 | periodic_capture_seconds=periodic_capture, |
| 123 | use_gpu=use_gpu, |
| 124 | title=title, |
| 125 | ) |
| 126 | click.echo(pm.usage.format_summary()) |
| 127 | click.echo(f"\n Results: {output}/manifest.json") |
| 128 | except Exception as e: |
| 129 | logging.error(f"Error: {e}") |
| 130 | click.echo(pm.usage.format_summary()) |
| 131 | if ctx.obj["verbose"]: |
| 132 | import traceback |
| 133 | |
| 134 | traceback.print_exc() |
| 135 | sys.exit(1) |
| @@ -154,11 +220,24 @@ | |
| 154 | ) |
| 155 | @click.option("--title", type=str, default="Batch Processing Results", help="Batch title") |
| 156 | @click.option( |
| 157 | "--provider", |
| 158 | "-p", |
| 159 | type=click.Choice(["auto", "openai", "anthropic", "gemini", "ollama"]), |
| 160 | default="auto", |
| 161 | help="API provider", |
| 162 | ) |
| 163 | @click.option("--vision-model", type=str, default=None, help="Override model for vision tasks") |
| 164 | @click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks") |
| @@ -282,14 +361,18 @@ | |
| 282 | entry.action_items_count = len(manifest.action_items) |
| 283 | entry.key_points_count = len(manifest.key_points) |
| 284 | entry.duration_seconds = manifest.video.duration_seconds |
| 285 | manifests.append(manifest) |
| 286 | |
| 287 | # Merge knowledge graph |
| 288 | kg_path = video_output / "results" / "knowledge_graph.json" |
| 289 | if kg_path.exists(): |
| 290 | kg_data = json.loads(kg_path.read_text()) |
| 291 | video_kg = KnowledgeGraph.from_dict(kg_data) |
| 292 | merged_kg.merge(video_kg) |
| 293 | |
| 294 | except Exception as e: |
| 295 | logging.error(f"Failed to process {video_path.name}: {e}") |
| @@ -300,13 +383,12 @@ | |
| 300 | |
| 301 | traceback.print_exc() |
| 302 | |
| 303 | entries.append(entry) |
| 304 | |
| 305 | # Save merged knowledge graph |
| 306 | merged_kg_path = Path(output) / "knowledge_graph.json" |
| 307 | merged_kg.save(merged_kg_path) |
| 308 | |
| 309 | # Generate batch summary |
| 310 | plan_gen = PlanGenerator(provider_manager=pm, knowledge_graph=merged_kg) |
| 311 | summary_path = Path(output) / "batch_summary.md" |
| 312 | plan_gen.generate_batch_summary( |
| @@ -336,10 +418,120 @@ | |
| 336 | f"\n Batch complete: {batch_manifest.completed_videos}" |
| 337 | f"/{batch_manifest.total_videos} succeeded" |
| 338 | ) |
| 339 | click.echo(f" Results: {output}/batch_manifest.json") |
| 340 | |
| 341 | |
| 342 | @cli.command("list-models") |
| 343 | @click.pass_context |
| 344 | def list_models(ctx): |
| 345 | """Discover and display available models from all configured providers.""" |
| @@ -421,11 +613,24 @@ | |
| 421 | ) |
| 422 | @click.option("--title", type=str, help="Title for the analysis report") |
| 423 | @click.option( |
| 424 | "--provider", |
| 425 | "-p", |
| 426 | type=click.Choice(["auto", "openai", "anthropic", "gemini", "ollama"]), |
| 427 | default="auto", |
| 428 | help="API provider", |
| 429 | ) |
| 430 | @click.option("--vision-model", type=str, default=None, help="Override model for vision tasks") |
| 431 | @click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks") |
| @@ -462,10 +667,145 @@ | |
| 462 | import traceback |
| 463 | |
| 464 | traceback.print_exc() |
| 465 | sys.exit(1) |
| 466 | |
| 467 | |
| 468 | @cli.command() |
| 469 | @click.argument("question", required=False, default=None) |
| 470 | @click.option( |
| 471 | "--db-path", |
| @@ -488,28 +828,43 @@ | |
| 488 | ) |
| 489 | @click.option("--interactive", "-I", is_flag=True, help="Enter interactive REPL mode") |
| 490 | @click.option( |
| 491 | "--provider", |
| 492 | "-p", |
| 493 | type=click.Choice(["auto", "openai", "anthropic", "gemini", "ollama"]), |
| 494 | default="auto", |
| 495 | help="API provider for agentic mode", |
| 496 | ) |
| 497 | @click.option("--chat-model", type=str, default=None, help="Override model for agentic mode") |
| 498 | @click.pass_context |
| 499 | def query(ctx, question, db_path, mode, output_format, interactive, provider, chat_model): |
| 500 | """Query a knowledge graph. Runs stats if no question given. |
| 501 | |
| 502 | Direct commands recognized in QUESTION: stats, entities, relationships, |
| 503 | neighbors, cypher. Natural language questions use agentic mode. |
| 504 | |
| 505 | Examples: |
| 506 | |
| 507 | planopticon query |
| 508 | planopticon query stats |
| 509 | planopticon query "entities --type technology" |
| 510 | planopticon query "neighbors Alice" |
| 511 | planopticon query "What was discussed?" |
| 512 | planopticon query -I |
| 513 | """ |
| 514 | from video_processor.integrators.graph_discovery import find_nearest_graph |
| 515 | from video_processor.integrators.graph_query import GraphQueryEngine |
| @@ -588,13 +943,20 @@ | |
| 588 | |
| 589 | if cmd == "neighbors": |
| 590 | entity_name = " ".join(parts[1:]) if len(parts) > 1 else "" |
| 591 | return engine.neighbors(entity_name) |
| 592 | |
| 593 | if cmd == "cypher": |
| 594 | cypher_query = " ".join(parts[1:]) |
| 595 | return engine.cypher(cypher_query) |
| 596 | |
| 597 | # Natural language → agentic (or fallback to entity search in direct mode) |
| 598 | if mode == "direct": |
| 599 | return engine.entities(name=question) |
| 600 | return engine.ask(question) |
| @@ -646,33 +1008,1078 @@ | |
| 646 | _print_result(result, output_format) |
| 647 | click.echo() |
| 648 | |
| 649 | |
| 650 | @cli.command() |
| 651 | @click.argument("service", type=click.Choice(["google", "dropbox"])) |
| 652 | @click.pass_context |
| 653 | def auth(ctx, service): |
| 654 | """Authenticate with a cloud service (google or dropbox).""" |
| 655 | if service == "google": |
| 656 | from video_processor.sources.google_drive import GoogleDriveSource |
| 657 | |
| 658 | source = GoogleDriveSource(use_service_account=False) |
| 659 | if source.authenticate(): |
| 660 | click.echo("Google Drive authentication successful.") |
| 661 | else: |
| 662 | click.echo("Google Drive authentication failed.", err=True) |
| 663 | sys.exit(1) |
| 664 | |
| 665 | elif service == "dropbox": |
| 666 | from video_processor.sources.dropbox_source import DropboxSource |
| 667 | |
| 668 | source = DropboxSource() |
| 669 | if source.authenticate(): |
| 670 | click.echo("Dropbox authentication successful.") |
| 671 | else: |
| 672 | click.echo("Dropbox authentication failed.", err=True) |
| 673 | sys.exit(1) |
| 674 | |
| 675 | |
| 676 | def _interactive_menu(ctx): |
| 677 | """Show an interactive menu when planopticon is run with no arguments.""" |
| 678 | click.echo() |
| @@ -698,11 +2105,24 @@ | |
| 698 | type=click.Choice(["basic", "standard", "comprehensive"]), |
| 699 | default="standard", |
| 700 | ) |
| 701 | provider = click.prompt( |
| 702 | " Provider", |
| 703 | type=click.Choice(["auto", "openai", "anthropic", "gemini", "ollama"]), |
| 704 | default="auto", |
| 705 | ) |
| 706 | ctx.invoke( |
| 707 | analyze, |
| 708 | input=input_path, |
| @@ -727,11 +2147,24 @@ | |
| 727 | type=click.Choice(["basic", "standard", "comprehensive"]), |
| 728 | default="standard", |
| 729 | ) |
| 730 | provider = click.prompt( |
| 731 | " Provider", |
| 732 | type=click.Choice(["auto", "openai", "anthropic", "gemini", "ollama"]), |
| 733 | default="auto", |
| 734 | ) |
| 735 | ctx.invoke( |
| 736 | batch, |
| 737 | input_dir=input_dir, |
| 738 | |
| 739 | ADDED video_processor/cli/companion.py |
| 740 | ADDED video_processor/exchange.py |
| 741 | ADDED video_processor/exporters/__init__.py |
| 742 | ADDED video_processor/exporters/markdown.py |
| --- video_processor/cli/commands.py | |
| +++ video_processor/cli/commands.py | |
| @@ -34,19 +34,38 @@ | |
| 34 | root_logger.addHandler(console_handler) |
| 35 | |
| 36 | |
| 37 | @click.group(invoke_without_command=True) |
| 38 | @click.option("--verbose", "-v", is_flag=True, help="Enable verbose output") |
| 39 | @click.option( |
| 40 | "--chat", |
| 41 | "-C", |
| 42 | is_flag=True, |
| 43 | help="Launch interactive companion REPL", |
| 44 | ) |
| 45 | @click.option( |
| 46 | "--interactive", |
| 47 | "-I", |
| 48 | "interactive_flag", |
| 49 | is_flag=True, |
| 50 | help="Launch interactive companion REPL", |
| 51 | ) |
| 52 | @click.version_option("0.4.0", prog_name="PlanOpticon") |
| 53 | @click.pass_context |
| 54 | def cli(ctx, verbose, chat, interactive_flag): |
| 55 | """PlanOpticon - Comprehensive Video Analysis & Knowledge Extraction Tool.""" |
| 56 | ctx.ensure_object(dict) |
| 57 | ctx.obj["verbose"] = verbose |
| 58 | setup_logging(verbose) |
| 59 | |
| 60 | if (chat or interactive_flag) and ctx.invoked_subcommand is None: |
| 61 | from video_processor.cli.companion import CompanionREPL |
| 62 | |
| 63 | repl = CompanionREPL() |
| 64 | repl.run() |
| 65 | ctx.exit(0) |
| 66 | elif ctx.invoked_subcommand is None: |
| 67 | _interactive_menu(ctx) |
| 68 | |
| 69 | |
| 70 | @cli.command() |
| 71 | @click.option( |
| @@ -73,16 +92,47 @@ | |
| 92 | ) |
| 93 | @click.option("--title", type=str, help="Title for the analysis report") |
| 94 | @click.option( |
| 95 | "--provider", |
| 96 | "-p", |
| 97 | type=click.Choice( |
| 98 | [ |
| 99 | "auto", |
| 100 | "openai", |
| 101 | "anthropic", |
| 102 | "gemini", |
| 103 | "ollama", |
| 104 | "azure", |
| 105 | "together", |
| 106 | "fireworks", |
| 107 | "cerebras", |
| 108 | "xai", |
| 109 | ] |
| 110 | ), |
| 111 | default="auto", |
| 112 | help="API provider", |
| 113 | ) |
| 114 | @click.option("--vision-model", type=str, default=None, help="Override model for vision tasks") |
| 115 | @click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks") |
| 116 | @click.option( |
| 117 | "--output-format", |
| 118 | type=click.Choice(["default", "json"]), |
| 119 | default="default", |
| 120 | help="Output format: default (files + summary) or json (structured JSON to stdout)", |
| 121 | ) |
| 122 | @click.option( |
| 123 | "--templates-dir", |
| 124 | type=click.Path(exists=True), |
| 125 | default=None, |
| 126 | help="Directory with custom prompt template .txt files", |
| 127 | ) |
| 128 | @click.option( |
| 129 | "--speakers", |
| 130 | type=str, |
| 131 | default=None, |
| 132 | help='Comma-separated speaker names for diarization hints (e.g., "Alice,Bob,Carol")', |
| 133 | ) |
| 134 | @click.pass_context |
| 135 | def analyze( |
| 136 | ctx, |
| 137 | input, |
| 138 | output, |
| @@ -94,26 +144,35 @@ | |
| 144 | periodic_capture, |
| 145 | title, |
| 146 | provider, |
| 147 | vision_model, |
| 148 | chat_model, |
| 149 | output_format, |
| 150 | templates_dir, |
| 151 | speakers, |
| 152 | ): |
| 153 | """Analyze a single video and extract structured knowledge.""" |
| 154 | from video_processor.pipeline import process_single_video |
| 155 | from video_processor.providers.manager import ProviderManager |
| 156 | |
| 157 | focus_areas = [a.strip().lower() for a in focus.split(",")] if focus else [] |
| 158 | speaker_hints = [s.strip() for s in speakers.split(",")] if speakers else None |
| 159 | prov = None if provider == "auto" else provider |
| 160 | |
| 161 | pm = ProviderManager( |
| 162 | vision_model=vision_model, |
| 163 | chat_model=chat_model, |
| 164 | provider=prov, |
| 165 | ) |
| 166 | |
| 167 | if templates_dir: |
| 168 | from video_processor.utils.prompt_templates import PromptTemplate |
| 169 | |
| 170 | pm.prompt_templates = PromptTemplate(templates_dir=templates_dir) |
| 171 | |
| 172 | try: |
| 173 | manifest = process_single_video( |
| 174 | input_path=input, |
| 175 | output_dir=output, |
| 176 | provider_manager=pm, |
| 177 | depth=depth, |
| 178 | focus_areas=focus_areas, |
| @@ -120,16 +179,23 @@ | |
| 179 | sampling_rate=sampling_rate, |
| 180 | change_threshold=change_threshold, |
| 181 | periodic_capture_seconds=periodic_capture, |
| 182 | use_gpu=use_gpu, |
| 183 | title=title, |
| 184 | speaker_hints=speaker_hints, |
| 185 | ) |
| 186 | if output_format == "json": |
| 187 | click.echo(json.dumps(manifest.model_dump(), indent=2, default=str)) |
| 188 | else: |
| 189 | click.echo(pm.usage.format_summary()) |
| 190 | click.echo(f"\n Results: {output}/manifest.json") |
| 191 | except Exception as e: |
| 192 | logging.error(f"Error: {e}") |
| 193 | if output_format == "json": |
| 194 | click.echo(json.dumps({"error": str(e)})) |
| 195 | else: |
| 196 | click.echo(pm.usage.format_summary()) |
| 197 | if ctx.obj["verbose"]: |
| 198 | import traceback |
| 199 | |
| 200 | traceback.print_exc() |
| 201 | sys.exit(1) |
| @@ -154,11 +220,24 @@ | |
| 220 | ) |
| 221 | @click.option("--title", type=str, default="Batch Processing Results", help="Batch title") |
| 222 | @click.option( |
| 223 | "--provider", |
| 224 | "-p", |
| 225 | type=click.Choice( |
| 226 | [ |
| 227 | "auto", |
| 228 | "openai", |
| 229 | "anthropic", |
| 230 | "gemini", |
| 231 | "ollama", |
| 232 | "azure", |
| 233 | "together", |
| 234 | "fireworks", |
| 235 | "cerebras", |
| 236 | "xai", |
| 237 | ] |
| 238 | ), |
| 239 | default="auto", |
| 240 | help="API provider", |
| 241 | ) |
| 242 | @click.option("--vision-model", type=str, default=None, help="Override model for vision tasks") |
| 243 | @click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks") |
| @@ -282,14 +361,18 @@ | |
| 361 | entry.action_items_count = len(manifest.action_items) |
| 362 | entry.key_points_count = len(manifest.key_points) |
| 363 | entry.duration_seconds = manifest.video.duration_seconds |
| 364 | manifests.append(manifest) |
| 365 | |
| 366 | # Merge knowledge graph (prefer .db, fall back to .json) |
| 367 | kg_db = video_output / "results" / "knowledge_graph.db" |
| 368 | kg_json = video_output / "results" / "knowledge_graph.json" |
| 369 | if kg_db.exists(): |
| 370 | video_kg = KnowledgeGraph(db_path=kg_db) |
| 371 | merged_kg.merge(video_kg) |
| 372 | elif kg_json.exists(): |
| 373 | kg_data = json.loads(kg_json.read_text()) |
| 374 | video_kg = KnowledgeGraph.from_dict(kg_data) |
| 375 | merged_kg.merge(video_kg) |
| 376 | |
| 377 | except Exception as e: |
| 378 | logging.error(f"Failed to process {video_path.name}: {e}") |
| @@ -300,13 +383,12 @@ | |
| 383 | |
| 384 | traceback.print_exc() |
| 385 | |
| 386 | entries.append(entry) |
| 387 | |
| 388 | # Save merged knowledge graph (SQLite is primary, JSON is export) |
| 389 | merged_kg.save(Path(output) / "knowledge_graph.json") |
| 390 | |
| 391 | # Generate batch summary |
| 392 | plan_gen = PlanGenerator(provider_manager=pm, knowledge_graph=merged_kg) |
| 393 | summary_path = Path(output) / "batch_summary.md" |
| 394 | plan_gen.generate_batch_summary( |
| @@ -336,10 +418,120 @@ | |
| 418 | f"\n Batch complete: {batch_manifest.completed_videos}" |
| 419 | f"/{batch_manifest.total_videos} succeeded" |
| 420 | ) |
| 421 | click.echo(f" Results: {output}/batch_manifest.json") |
| 422 | |
| 423 | |
| 424 | @cli.command() |
| 425 | @click.argument("input_path", type=click.Path(exists=True)) |
| 426 | @click.option( |
| 427 | "--output", "-o", type=click.Path(), default=None, help="Output directory for knowledge graph" |
| 428 | ) |
| 429 | @click.option( |
| 430 | "--db-path", type=click.Path(), default=None, help="Existing knowledge_graph.db to add to" |
| 431 | ) |
| 432 | @click.option("--recursive/--no-recursive", "-r", default=True, help="Recurse into subdirectories") |
| 433 | @click.option( |
| 434 | "--provider", |
| 435 | "-p", |
| 436 | type=click.Choice( |
| 437 | [ |
| 438 | "auto", |
| 439 | "openai", |
| 440 | "anthropic", |
| 441 | "gemini", |
| 442 | "ollama", |
| 443 | "azure", |
| 444 | "together", |
| 445 | "fireworks", |
| 446 | "cerebras", |
| 447 | "xai", |
| 448 | ] |
| 449 | ), |
| 450 | default="auto", |
| 451 | help="API provider", |
| 452 | ) |
| 453 | @click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks") |
| 454 | @click.pass_context |
| 455 | def ingest(ctx, input_path, output, db_path, recursive, provider, chat_model): |
| 456 | """Ingest documents into a knowledge graph. |
| 457 | |
| 458 | Supports: .md, .txt, .pdf (with pymupdf or pdfplumber installed) |
| 459 | |
| 460 | Examples: |
| 461 | |
| 462 | planopticon ingest spec.md |
| 463 | |
| 464 | planopticon ingest ./docs/ -o ./output |
| 465 | |
| 466 | planopticon ingest report.pdf --db-path existing.db |
| 467 | """ |
| 468 | from video_processor.integrators.knowledge_graph import KnowledgeGraph |
| 469 | from video_processor.processors import list_supported_extensions |
| 470 | from video_processor.processors.ingest import ingest_directory, ingest_file |
| 471 | from video_processor.providers.manager import ProviderManager |
| 472 | |
| 473 | input_path = Path(input_path) |
| 474 | prov = None if provider == "auto" else provider |
| 475 | pm = ProviderManager(chat_model=chat_model, provider=prov) |
| 476 | |
| 477 | # Determine DB path |
| 478 | if db_path: |
| 479 | kg_path = Path(db_path) |
| 480 | elif output: |
| 481 | out_dir = Path(output) |
| 482 | out_dir.mkdir(parents=True, exist_ok=True) |
| 483 | kg_path = out_dir / "knowledge_graph.db" |
| 484 | else: |
| 485 | kg_path = Path.cwd() / "knowledge_graph.db" |
| 486 | |
| 487 | kg_path.parent.mkdir(parents=True, exist_ok=True) |
| 488 | |
| 489 | click.echo(f"Knowledge graph: {kg_path}") |
| 490 | kg = KnowledgeGraph(provider_manager=pm, db_path=kg_path) |
| 491 | |
| 492 | total_files = 0 |
| 493 | total_chunks = 0 |
| 494 | |
| 495 | try: |
| 496 | if input_path.is_file(): |
| 497 | count = ingest_file(input_path, kg) |
| 498 | total_files = 1 |
| 499 | total_chunks = count |
| 500 | click.echo(f" {input_path.name}: {count} chunks") |
| 501 | elif input_path.is_dir(): |
| 502 | results = ingest_directory(input_path, kg, recursive=recursive) |
| 503 | total_files = len(results) |
| 504 | total_chunks = sum(results.values()) |
| 505 | for fpath, count in results.items(): |
| 506 | click.echo(f" {Path(fpath).name}: {count} chunks") |
| 507 | else: |
| 508 | click.echo(f"Error: {input_path} is not a file or directory", err=True) |
| 509 | sys.exit(1) |
| 510 | except ValueError as e: |
| 511 | click.echo(f"Error: {e}", err=True) |
| 512 | click.echo(f"Supported extensions: {', '.join(list_supported_extensions())}") |
| 513 | sys.exit(1) |
| 514 | except ImportError as e: |
| 515 | click.echo(f"Error: {e}", err=True) |
| 516 | sys.exit(1) |
| 517 | |
| 518 | # Save both .db and .json |
| 519 | kg.save(kg_path) |
| 520 | json_path = kg_path.with_suffix(".json") |
| 521 | kg.save(json_path) |
| 522 | |
| 523 | entity_count = kg._store.get_entity_count() |
| 524 | rel_count = kg._store.get_relationship_count() |
| 525 | |
| 526 | click.echo("\nIngestion complete:") |
| 527 | click.echo(f" Files processed: {total_files}") |
| 528 | click.echo(f" Total chunks: {total_chunks}") |
| 529 | click.echo(f" Entities extracted: {entity_count}") |
| 530 | click.echo(f" Relationships: {rel_count}") |
| 531 | click.echo(f" Knowledge graph: {kg_path}") |
| 532 | |
| 533 | |
| 534 | @cli.command("list-models") |
| 535 | @click.pass_context |
| 536 | def list_models(ctx): |
| 537 | """Discover and display available models from all configured providers.""" |
| @@ -421,11 +613,24 @@ | |
| 613 | ) |
| 614 | @click.option("--title", type=str, help="Title for the analysis report") |
| 615 | @click.option( |
| 616 | "--provider", |
| 617 | "-p", |
| 618 | type=click.Choice( |
| 619 | [ |
| 620 | "auto", |
| 621 | "openai", |
| 622 | "anthropic", |
| 623 | "gemini", |
| 624 | "ollama", |
| 625 | "azure", |
| 626 | "together", |
| 627 | "fireworks", |
| 628 | "cerebras", |
| 629 | "xai", |
| 630 | ] |
| 631 | ), |
| 632 | default="auto", |
| 633 | help="API provider", |
| 634 | ) |
| 635 | @click.option("--vision-model", type=str, default=None, help="Override model for vision tasks") |
| 636 | @click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks") |
| @@ -462,10 +667,145 @@ | |
| 667 | import traceback |
| 668 | |
| 669 | traceback.print_exc() |
| 670 | sys.exit(1) |
| 671 | |
| 672 | |
| 673 | @cli.command() |
| 674 | @click.argument("request", required=False, default=None) |
| 675 | @click.option("--kb", multiple=True, type=click.Path(exists=True), help="Knowledge base paths") |
| 676 | @click.option("--interactive", "-I", is_flag=True, help="Interactive chat mode") |
| 677 | @click.option("--export", type=click.Path(), default=None, help="Export artifacts to directory") |
| 678 | @click.option( |
| 679 | "--provider", |
| 680 | "-p", |
| 681 | type=click.Choice( |
| 682 | [ |
| 683 | "auto", |
| 684 | "openai", |
| 685 | "anthropic", |
| 686 | "gemini", |
| 687 | "ollama", |
| 688 | "azure", |
| 689 | "together", |
| 690 | "fireworks", |
| 691 | "cerebras", |
| 692 | "xai", |
| 693 | ] |
| 694 | ), |
| 695 | default="auto", |
| 696 | help="API provider", |
| 697 | ) |
| 698 | @click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks") |
| 699 | @click.pass_context |
| 700 | def agent(ctx, request, kb, interactive, export, provider, chat_model): |
| 701 | """AI planning agent. Synthesizes knowledge into project plans and artifacts. |
| 702 | |
| 703 | Examples: |
| 704 | |
| 705 | planopticon agent "Create a project plan" --kb ./results |
| 706 | |
| 707 | planopticon agent -I --kb ./videos --kb ./docs |
| 708 | |
| 709 | planopticon agent "Generate a PRD" --export ./output |
| 710 | """ |
| 711 | # Ensure all skills are registered |
| 712 | import video_processor.agent.skills # noqa: F401 |
| 713 | from video_processor.agent.agent_loop import PlanningAgent |
| 714 | from video_processor.agent.kb_context import KBContext |
| 715 | from video_processor.agent.skills.base import AgentContext |
| 716 | |
| 717 | # Build provider manager |
| 718 | pm = None |
| 719 | try: |
| 720 | from video_processor.providers.manager import ProviderManager |
| 721 | |
| 722 | prov = None if provider == "auto" else provider |
| 723 | pm = ProviderManager(chat_model=chat_model, provider=prov) |
| 724 | except Exception: |
| 725 | if not interactive: |
| 726 | click.echo("Warning: could not initialize LLM provider.", err=True) |
| 727 | |
| 728 | # Load knowledge base |
| 729 | kb_ctx = KBContext() |
| 730 | if kb: |
| 731 | for path in kb: |
| 732 | kb_ctx.add_source(Path(path)) |
| 733 | kb_ctx.load(provider_manager=pm) |
| 734 | click.echo(kb_ctx.summary()) |
| 735 | else: |
| 736 | # Auto-discover |
| 737 | kb_ctx = KBContext.auto_discover(provider_manager=pm) |
| 738 | if kb_ctx.sources: |
| 739 | click.echo(kb_ctx.summary()) |
| 740 | else: |
| 741 | click.echo("No knowledge base found. Use --kb to specify paths.") |
| 742 | |
| 743 | agent_inst = PlanningAgent( |
| 744 | context=AgentContext( |
| 745 | knowledge_graph=kb_ctx.knowledge_graph if kb_ctx.sources else None, |
| 746 | query_engine=kb_ctx.query_engine if kb_ctx.sources else None, |
| 747 | provider_manager=pm, |
| 748 | ) |
| 749 | ) |
| 750 | |
| 751 | if interactive: |
| 752 | click.echo("\nPlanOpticon Agent (interactive mode)") |
| 753 | click.echo("Type your request, or 'quit' to exit.\n") |
| 754 | while True: |
| 755 | try: |
| 756 | line = click.prompt("agent", prompt_suffix="> ") |
| 757 | except (KeyboardInterrupt, EOFError): |
| 758 | click.echo("\nBye.") |
| 759 | break |
| 760 | if line.strip().lower() in ("quit", "exit", "q"): |
| 761 | click.echo("Bye.") |
| 762 | break |
| 763 | |
| 764 | # Check for slash commands |
| 765 | if line.strip().startswith("/"): |
| 766 | cmd = line.strip()[1:].split()[0] |
| 767 | if cmd == "plan": |
| 768 | artifacts = agent_inst.execute("Generate a project plan") |
| 769 | elif cmd == "skills": |
| 770 | from video_processor.agent.skills.base import list_skills |
| 771 | |
| 772 | for s in list_skills(): |
| 773 | click.echo(f" {s.name}: {s.description}") |
| 774 | continue |
| 775 | elif cmd == "summary": |
| 776 | if kb_ctx.sources: |
| 777 | click.echo(kb_ctx.summary()) |
| 778 | continue |
| 779 | else: |
| 780 | artifacts = agent_inst.execute(line.strip()[1:]) |
| 781 | |
| 782 | for a in artifacts: |
| 783 | click.echo(f"\n--- {a.name} ({a.artifact_type}) ---\n") |
| 784 | click.echo(a.content) |
| 785 | else: |
| 786 | response = agent_inst.chat(line) |
| 787 | click.echo(f"\n{response}\n") |
| 788 | elif request: |
| 789 | artifacts = agent_inst.execute(request) |
| 790 | if not artifacts: |
| 791 | click.echo("No artifacts generated. Try a more specific request.") |
| 792 | for artifact in artifacts: |
| 793 | click.echo(f"\n--- {artifact.name} ({artifact.artifact_type}) ---\n") |
| 794 | click.echo(artifact.content) |
| 795 | |
| 796 | if export: |
| 797 | from video_processor.agent.skills.artifact_export import export_artifacts |
| 798 | |
| 799 | export_dir = Path(export) |
| 800 | export_artifacts(artifacts, export_dir) |
| 801 | click.echo(f"Exported {len(artifacts)} artifacts to {export_dir}/") |
| 802 | click.echo(f"Manifest: {export_dir / 'manifest.json'}") |
| 803 | else: |
| 804 | click.echo("Provide a request or use -I for interactive mode.") |
| 805 | click.echo("Example: planopticon agent 'Create a project plan' --kb ./results") |
| 806 | |
| 807 | |
| 808 | @cli.command() |
| 809 | @click.argument("question", required=False, default=None) |
| 810 | @click.option( |
| 811 | "--db-path", |
| @@ -488,28 +828,43 @@ | |
| 828 | ) |
| 829 | @click.option("--interactive", "-I", is_flag=True, help="Enter interactive REPL mode") |
| 830 | @click.option( |
| 831 | "--provider", |
| 832 | "-p", |
| 833 | type=click.Choice( |
| 834 | [ |
| 835 | "auto", |
| 836 | "openai", |
| 837 | "anthropic", |
| 838 | "gemini", |
| 839 | "ollama", |
| 840 | "azure", |
| 841 | "together", |
| 842 | "fireworks", |
| 843 | "cerebras", |
| 844 | "xai", |
| 845 | ] |
| 846 | ), |
| 847 | default="auto", |
| 848 | help="API provider for agentic mode", |
| 849 | ) |
| 850 | @click.option("--chat-model", type=str, default=None, help="Override model for agentic mode") |
| 851 | @click.pass_context |
| 852 | def query(ctx, question, db_path, mode, output_format, interactive, provider, chat_model): |
| 853 | """Query a knowledge graph. Runs stats if no question given. |
| 854 | |
| 855 | Direct commands recognized in QUESTION: stats, entities, relationships, |
| 856 | neighbors, sources, provenance, sql. Natural language questions use agentic mode. |
| 857 | |
| 858 | Examples: |
| 859 | |
| 860 | planopticon query |
| 861 | planopticon query stats |
| 862 | planopticon query "entities --type technology" |
| 863 | planopticon query "neighbors Alice" |
| 864 | planopticon query sources |
| 865 | planopticon query "provenance Alice" |
| 866 | planopticon query "What was discussed?" |
| 867 | planopticon query -I |
| 868 | """ |
| 869 | from video_processor.integrators.graph_discovery import find_nearest_graph |
| 870 | from video_processor.integrators.graph_query import GraphQueryEngine |
| @@ -588,13 +943,20 @@ | |
| 943 | |
| 944 | if cmd == "neighbors": |
| 945 | entity_name = " ".join(parts[1:]) if len(parts) > 1 else "" |
| 946 | return engine.neighbors(entity_name) |
| 947 | |
| 948 | if cmd == "sources": |
| 949 | return engine.sources() |
| 950 | |
| 951 | if cmd == "provenance": |
| 952 | entity_name = " ".join(parts[1:]) if len(parts) > 1 else "" |
| 953 | return engine.provenance(entity_name) |
| 954 | |
| 955 | if cmd == "sql": |
| 956 | sql_query = " ".join(parts[1:]) |
| 957 | return engine.sql(sql_query) |
| 958 | |
| 959 | # Natural language → agentic (or fallback to entity search in direct mode) |
| 960 | if mode == "direct": |
| 961 | return engine.entities(name=question) |
| 962 | return engine.ask(question) |
| @@ -646,33 +1008,1078 @@ | |
| 1008 | _print_result(result, output_format) |
| 1009 | click.echo() |
| 1010 | |
| 1011 | |
| 1012 | @cli.command() |
| 1013 | @click.argument( |
| 1014 | "service", |
| 1015 | type=click.Choice( |
| 1016 | [ |
| 1017 | "google", |
| 1018 | "dropbox", |
| 1019 | "zoom", |
| 1020 | "notion", |
| 1021 | "github", |
| 1022 | "microsoft", |
| 1023 | ] |
| 1024 | ), |
| 1025 | ) |
| 1026 | @click.option("--logout", is_flag=True, help="Clear saved token") |
| 1027 | @click.pass_context |
| 1028 | def auth(ctx, service, logout): |
| 1029 | """Authenticate with a cloud service via OAuth or API key. |
| 1030 | |
| 1031 | Uses OAuth when available, falls back to API keys. |
| 1032 | Tokens are saved to ~/.planopticon/ for reuse. |
| 1033 | |
| 1034 | Examples: |
| 1035 | |
| 1036 | planopticon auth google |
| 1037 | |
| 1038 | planopticon auth zoom |
| 1039 | |
| 1040 | planopticon auth github --logout |
| 1041 | """ |
| 1042 | from video_processor.auth import get_auth_manager |
| 1043 | |
| 1044 | manager = get_auth_manager(service) |
| 1045 | if not manager: |
| 1046 | click.echo(f"Unknown service: {service}", err=True) |
| 1047 | sys.exit(1) |
| 1048 | |
| 1049 | if logout: |
| 1050 | manager.clear_token() |
| 1051 | click.echo(f"Cleared saved {service} token.") |
| 1052 | return |
| 1053 | |
| 1054 | result = manager.authenticate() |
| 1055 | if result.success: |
| 1056 | click.echo(f"{service.title()} authentication successful ({result.method}).") |
| 1057 | else: |
| 1058 | click.echo( |
| 1059 | f"{service.title()} authentication failed: {result.error}", |
| 1060 | err=True, |
| 1061 | ) |
| 1062 | sys.exit(1) |
| 1063 | |
| 1064 | |
| 1065 | @cli.group() |
| 1066 | def gws(): |
| 1067 | """Google Workspace: fetch docs, sheets, and slides via the gws CLI.""" |
| 1068 | pass |
| 1069 | |
| 1070 | |
| 1071 | @gws.command("list") |
| 1072 | @click.option("--folder-id", type=str, default=None, help="Drive folder ID to list") |
| 1073 | @click.option("--query", "-q", type=str, default=None, help="Drive search query") |
| 1074 | @click.option("--json", "as_json", is_flag=True, help="Output as JSON") |
| 1075 | def gws_list(folder_id, query, as_json): |
| 1076 | """List documents in Google Drive. |
| 1077 | |
| 1078 | Examples: |
| 1079 | |
| 1080 | planopticon gws list |
| 1081 | |
| 1082 | planopticon gws list --folder-id 1abc... |
| 1083 | |
| 1084 | planopticon gws list -q "name contains 'PRD'" --json |
| 1085 | """ |
| 1086 | from video_processor.sources.gws_source import GWSSource |
| 1087 | |
| 1088 | source = GWSSource(folder_id=folder_id, query=query) |
| 1089 | if not source.authenticate(): |
| 1090 | click.echo("Error: gws CLI not available or not authenticated.", err=True) |
| 1091 | click.echo("Install: npm install -g @googleworkspace/cli", err=True) |
| 1092 | click.echo("Auth: gws auth login", err=True) |
| 1093 | sys.exit(1) |
| 1094 | |
| 1095 | files = source.list_videos(folder_id=folder_id) |
| 1096 | if as_json: |
| 1097 | click.echo(json.dumps([f.model_dump() for f in files], indent=2, default=str)) |
| 1098 | else: |
| 1099 | if not files: |
| 1100 | click.echo("No documents found.") |
| 1101 | return |
| 1102 | for f in files: |
| 1103 | size = f"{f.size_bytes / 1024:.0f}KB" if f.size_bytes else "—" |
| 1104 | click.echo(f" {f.id[:12]}… {size:>8s} {f.mime_type or ''} {f.name}") |
| 1105 | |
| 1106 | |
| 1107 | @gws.command("fetch") |
| 1108 | @click.argument("doc_ids", nargs=-1) |
| 1109 | @click.option("--folder-id", type=str, default=None, help="Fetch all docs in a folder") |
| 1110 | @click.option("-o", "--output", type=click.Path(), default=None, help="Output directory") |
| 1111 | def gws_fetch(doc_ids, folder_id, output): |
| 1112 | """Fetch Google Docs/Sheets/Slides as text files. |
| 1113 | |
| 1114 | Examples: |
| 1115 | |
| 1116 | planopticon gws fetch DOC_ID1 DOC_ID2 -o ./docs |
| 1117 | |
| 1118 | planopticon gws fetch --folder-id 1abc... -o ./docs |
| 1119 | """ |
| 1120 | from video_processor.sources.gws_source import GWSSource |
| 1121 | |
| 1122 | source = GWSSource(folder_id=folder_id, doc_ids=list(doc_ids)) |
| 1123 | if not source.authenticate(): |
| 1124 | click.echo("Error: gws CLI not available or not authenticated.", err=True) |
| 1125 | sys.exit(1) |
| 1126 | |
| 1127 | out_dir = Path(output) if output else Path.cwd() / "gws_docs" |
| 1128 | out_dir.mkdir(parents=True, exist_ok=True) |
| 1129 | |
| 1130 | files = source.list_videos(folder_id=folder_id) |
| 1131 | if not files: |
| 1132 | click.echo("No documents found.") |
| 1133 | return |
| 1134 | |
| 1135 | for f in files: |
| 1136 | safe_name = f.name.replace("/", "_").replace("\\", "_") |
| 1137 | dest = out_dir / f"{safe_name}.txt" |
| 1138 | try: |
| 1139 | source.download(f, dest) |
| 1140 | click.echo(f" ✓ {f.name} → {dest}") |
| 1141 | except Exception as e: |
| 1142 | click.echo(f" ✗ {f.name}: {e}", err=True) |
| 1143 | |
| 1144 | click.echo(f"\nFetched {len(files)} document(s) to {out_dir}") |
| 1145 | |
| 1146 | |
| 1147 | @gws.command("ingest") |
| 1148 | @click.option("--folder-id", type=str, default=None, help="Drive folder ID") |
| 1149 | @click.option("--doc-id", type=str, multiple=True, help="Specific doc IDs (repeatable)") |
| 1150 | @click.option("--query", "-q", type=str, default=None, help="Drive search query") |
| 1151 | @click.option("-o", "--output", type=click.Path(), default=None, help="Output directory") |
| 1152 | @click.option("--db-path", type=click.Path(), default=None, help="Existing DB to merge into") |
| 1153 | @click.option( |
| 1154 | "-p", |
| 1155 | "--provider", |
| 1156 | type=click.Choice( |
| 1157 | [ |
| 1158 | "auto", |
| 1159 | "openai", |
| 1160 | "anthropic", |
| 1161 | "gemini", |
| 1162 | "ollama", |
| 1163 | "azure", |
| 1164 | "together", |
| 1165 | "fireworks", |
| 1166 | "cerebras", |
| 1167 | "xai", |
| 1168 | ] |
| 1169 | ), |
| 1170 | default="auto", |
| 1171 | help="API provider", |
| 1172 | ) |
| 1173 | @click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks") |
| 1174 | @click.pass_context |
| 1175 | def gws_ingest(ctx, folder_id, doc_id, query, output, db_path, provider, chat_model): |
| 1176 | """Fetch Google Workspace docs and ingest into a knowledge graph. |
| 1177 | |
| 1178 | Combines gws fetch + planopticon ingest in one step. |
| 1179 | |
| 1180 | Examples: |
| 1181 | |
| 1182 | planopticon gws ingest --folder-id 1abc... |
| 1183 | |
| 1184 | planopticon gws ingest --doc-id DOC1 --doc-id DOC2 -o ./results |
| 1185 | |
| 1186 | planopticon gws ingest -q "name contains 'spec'" --db-path existing.db |
| 1187 | """ |
| 1188 | import tempfile |
| 1189 | |
| 1190 | from video_processor.integrators.knowledge_graph import KnowledgeGraph |
| 1191 | from video_processor.processors.ingest import ingest_file |
| 1192 | from video_processor.providers.manager import ProviderManager |
| 1193 | from video_processor.sources.gws_source import GWSSource |
| 1194 | |
| 1195 | source = GWSSource(folder_id=folder_id, doc_ids=list(doc_id), query=query) |
| 1196 | if not source.authenticate(): |
| 1197 | click.echo("Error: gws CLI not available or not authenticated.", err=True) |
| 1198 | click.echo("Install: npm install -g @googleworkspace/cli", err=True) |
| 1199 | click.echo("Auth: gws auth login", err=True) |
| 1200 | sys.exit(1) |
| 1201 | |
| 1202 | # Fetch docs to temp dir |
| 1203 | files = source.list_videos(folder_id=folder_id) |
| 1204 | if not files: |
| 1205 | click.echo("No documents found.") |
| 1206 | return |
| 1207 | |
| 1208 | click.echo(f"Found {len(files)} document(s), fetching...") |
| 1209 | |
| 1210 | with tempfile.TemporaryDirectory() as tmp_dir: |
| 1211 | tmp_path = Path(tmp_dir) |
| 1212 | local_files = [] |
| 1213 | for f in files: |
| 1214 | safe_name = f.name.replace("/", "_").replace("\\", "_") |
| 1215 | dest = tmp_path / f"{safe_name}.txt" |
| 1216 | try: |
| 1217 | source.download(f, dest) |
| 1218 | local_files.append(dest) |
| 1219 | click.echo(f" ✓ {f.name}") |
| 1220 | except Exception as e: |
| 1221 | click.echo(f" ✗ {f.name}: {e}", err=True) |
| 1222 | |
| 1223 | if not local_files: |
| 1224 | click.echo("No documents fetched successfully.", err=True) |
| 1225 | sys.exit(1) |
| 1226 | |
| 1227 | # Set up KG |
| 1228 | prov = None if provider == "auto" else provider |
| 1229 | pm = ProviderManager(chat_model=chat_model, provider=prov) |
| 1230 | |
| 1231 | if db_path: |
| 1232 | kg_path = Path(db_path) |
| 1233 | elif output: |
| 1234 | out_dir = Path(output) |
| 1235 | out_dir.mkdir(parents=True, exist_ok=True) |
| 1236 | kg_path = out_dir / "knowledge_graph.db" |
| 1237 | else: |
| 1238 | kg_path = Path.cwd() / "knowledge_graph.db" |
| 1239 | |
| 1240 | kg_path.parent.mkdir(parents=True, exist_ok=True) |
| 1241 | kg = KnowledgeGraph(provider_manager=pm, db_path=kg_path) |
| 1242 | |
| 1243 | total_chunks = 0 |
| 1244 | for lf in local_files: |
| 1245 | try: |
| 1246 | count = ingest_file(lf, kg) |
| 1247 | total_chunks += count |
| 1248 | click.echo(f" Ingested {lf.stem}: {count} chunks") |
| 1249 | except Exception as e: |
| 1250 | click.echo(f" Failed to ingest {lf.stem}: {e}", err=True) |
| 1251 | |
| 1252 | kg.save(kg_path) |
| 1253 | kg.save(kg_path.with_suffix(".json")) |
| 1254 | |
| 1255 | entity_count = kg._store.get_entity_count() |
| 1256 | rel_count = kg._store.get_relationship_count() |
| 1257 | |
| 1258 | click.echo("\nIngestion complete:") |
| 1259 | click.echo(f" Documents: {len(local_files)}") |
| 1260 | click.echo(f" Chunks: {total_chunks}") |
| 1261 | click.echo(f" Entities: {entity_count}") |
| 1262 | click.echo(f" Relationships: {rel_count}") |
| 1263 | click.echo(f" Knowledge graph: {kg_path}") |
| 1264 | |
| 1265 | |
| 1266 | @cli.group() |
| 1267 | def m365(): |
| 1268 | """Microsoft 365: fetch docs from SharePoint and OneDrive via the m365 CLI.""" |
| 1269 | pass |
| 1270 | |
| 1271 | |
| 1272 | @m365.command("list") |
| 1273 | @click.option("--web-url", type=str, required=True, help="SharePoint site URL") |
| 1274 | @click.option("--folder-url", type=str, required=True, help="Server-relative folder URL") |
| 1275 | @click.option("--recursive", is_flag=True, help="Include subfolders") |
| 1276 | @click.option("--json", "as_json", is_flag=True, help="Output as JSON") |
| 1277 | def m365_list(web_url, folder_url, recursive, as_json): |
| 1278 | """List documents in SharePoint or OneDrive. |
| 1279 | |
| 1280 | Examples: |
| 1281 | |
| 1282 | planopticon m365 list --web-url https://contoso.sharepoint.com/sites/proj \\ |
| 1283 | --folder-url /sites/proj/Shared\\ Documents |
| 1284 | |
| 1285 | planopticon m365 list --web-url URL --folder-url FOLDER --recursive --json |
| 1286 | """ |
| 1287 | from video_processor.sources.m365_source import M365Source |
| 1288 | |
| 1289 | source = M365Source(web_url=web_url, folder_url=folder_url, recursive=recursive) |
| 1290 | if not source.authenticate(): |
| 1291 | click.echo("Error: m365 CLI not available or not logged in.", err=True) |
| 1292 | click.echo("Install: npm install -g @pnp/cli-microsoft365", err=True) |
| 1293 | click.echo("Auth: m365 login", err=True) |
| 1294 | sys.exit(1) |
| 1295 | |
| 1296 | files = source.list_videos() |
| 1297 | if as_json: |
| 1298 | click.echo(json.dumps([f.model_dump() for f in files], indent=2, default=str)) |
| 1299 | else: |
| 1300 | if not files: |
| 1301 | click.echo("No documents found.") |
| 1302 | return |
| 1303 | for f in files: |
| 1304 | size = f"{f.size_bytes / 1024:.0f}KB" if f.size_bytes else "—" |
| 1305 | click.echo(f" {f.id[:12]}… {size:>8s} {f.name}") |
| 1306 | |
| 1307 | |
| 1308 | @m365.command("fetch") |
| 1309 | @click.option("--web-url", type=str, required=True, help="SharePoint site URL") |
| 1310 | @click.option("--folder-url", type=str, default=None, help="Server-relative folder URL") |
| 1311 | @click.option("--file-id", type=str, multiple=True, help="Specific file IDs (repeatable)") |
| 1312 | @click.option("-o", "--output", type=click.Path(), default=None, help="Output directory") |
| 1313 | def m365_fetch(web_url, folder_url, file_id, output): |
| 1314 | """Fetch SharePoint/OneDrive documents as local files. |
| 1315 | |
| 1316 | Examples: |
| 1317 | |
| 1318 | planopticon m365 fetch --web-url URL --folder-url FOLDER -o ./docs |
| 1319 | |
| 1320 | planopticon m365 fetch --web-url URL --file-id ID1 --file-id ID2 -o ./docs |
| 1321 | """ |
| 1322 | from video_processor.sources.m365_source import M365Source |
| 1323 | |
| 1324 | source = M365Source(web_url=web_url, folder_url=folder_url, file_ids=list(file_id)) |
| 1325 | if not source.authenticate(): |
| 1326 | click.echo("Error: m365 CLI not available or not logged in.", err=True) |
| 1327 | sys.exit(1) |
| 1328 | |
| 1329 | out_dir = Path(output) if output else Path.cwd() / "m365_docs" |
| 1330 | out_dir.mkdir(parents=True, exist_ok=True) |
| 1331 | |
| 1332 | files = source.list_videos() |
| 1333 | if not files: |
| 1334 | click.echo("No documents found.") |
| 1335 | return |
| 1336 | |
| 1337 | for f in files: |
| 1338 | dest = out_dir / f.name |
| 1339 | try: |
| 1340 | source.download(f, dest) |
| 1341 | click.echo(f" fetched {f.name}") |
| 1342 | except Exception as e: |
| 1343 | click.echo(f" failed {f.name}: {e}", err=True) |
| 1344 | |
| 1345 | click.echo(f"\nFetched {len(files)} document(s) to {out_dir}") |
| 1346 | |
| 1347 | |
| 1348 | @m365.command("ingest") |
| 1349 | @click.option("--web-url", type=str, required=True, help="SharePoint site URL") |
| 1350 | @click.option("--folder-url", type=str, default=None, help="Server-relative folder URL") |
| 1351 | @click.option("--file-id", type=str, multiple=True, help="Specific file IDs (repeatable)") |
| 1352 | @click.option("-o", "--output", type=click.Path(), default=None, help="Output directory") |
| 1353 | @click.option("--db-path", type=click.Path(), default=None, help="Existing DB to merge into") |
| 1354 | @click.option( |
| 1355 | "-p", |
| 1356 | "--provider", |
| 1357 | type=click.Choice( |
| 1358 | [ |
| 1359 | "auto", |
| 1360 | "openai", |
| 1361 | "anthropic", |
| 1362 | "gemini", |
| 1363 | "ollama", |
| 1364 | "azure", |
| 1365 | "together", |
| 1366 | "fireworks", |
| 1367 | "cerebras", |
| 1368 | "xai", |
| 1369 | ] |
| 1370 | ), |
| 1371 | default="auto", |
| 1372 | help="API provider", |
| 1373 | ) |
| 1374 | @click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks") |
| 1375 | @click.pass_context |
| 1376 | def m365_ingest(ctx, web_url, folder_url, file_id, output, db_path, provider, chat_model): |
| 1377 | """Fetch SharePoint/OneDrive docs and ingest into a knowledge graph. |
| 1378 | |
| 1379 | Examples: |
| 1380 | |
| 1381 | planopticon m365 ingest --web-url URL --folder-url FOLDER |
| 1382 | |
| 1383 | planopticon m365 ingest --web-url URL --file-id ID1 --file-id ID2 -o ./results |
| 1384 | """ |
| 1385 | import tempfile |
| 1386 | |
| 1387 | from video_processor.integrators.knowledge_graph import KnowledgeGraph |
| 1388 | from video_processor.processors.ingest import ingest_file |
| 1389 | from video_processor.providers.manager import ProviderManager |
| 1390 | from video_processor.sources.m365_source import M365Source |
| 1391 | |
| 1392 | source = M365Source(web_url=web_url, folder_url=folder_url, file_ids=list(file_id)) |
| 1393 | if not source.authenticate(): |
| 1394 | click.echo("Error: m365 CLI not available or not logged in.", err=True) |
| 1395 | click.echo("Install: npm install -g @pnp/cli-microsoft365", err=True) |
| 1396 | click.echo("Auth: m365 login", err=True) |
| 1397 | sys.exit(1) |
| 1398 | |
| 1399 | files = source.list_videos() |
| 1400 | if not files: |
| 1401 | click.echo("No documents found.") |
| 1402 | return |
| 1403 | |
| 1404 | click.echo(f"Found {len(files)} document(s), fetching...") |
| 1405 | |
| 1406 | with tempfile.TemporaryDirectory() as tmp_dir: |
| 1407 | tmp_path = Path(tmp_dir) |
| 1408 | local_files = [] |
| 1409 | for f in files: |
| 1410 | dest = tmp_path / f.name |
| 1411 | try: |
| 1412 | source.download(f, dest) |
| 1413 | # Extract text for non-text formats |
| 1414 | text_dest = tmp_path / f"{Path(f.name).stem}.txt" |
| 1415 | text = source.download_as_text(f) |
| 1416 | text_dest.write_text(text, encoding="utf-8") |
| 1417 | local_files.append(text_dest) |
| 1418 | click.echo(f" fetched {f.name}") |
| 1419 | except Exception as e: |
| 1420 | click.echo(f" failed {f.name}: {e}", err=True) |
| 1421 | |
| 1422 | if not local_files: |
| 1423 | click.echo("No documents fetched successfully.", err=True) |
| 1424 | sys.exit(1) |
| 1425 | |
| 1426 | prov = None if provider == "auto" else provider |
| 1427 | pm = ProviderManager(chat_model=chat_model, provider=prov) |
| 1428 | |
| 1429 | if db_path: |
| 1430 | kg_path = Path(db_path) |
| 1431 | elif output: |
| 1432 | out_dir = Path(output) |
| 1433 | out_dir.mkdir(parents=True, exist_ok=True) |
| 1434 | kg_path = out_dir / "knowledge_graph.db" |
| 1435 | else: |
| 1436 | kg_path = Path.cwd() / "knowledge_graph.db" |
| 1437 | |
| 1438 | kg_path.parent.mkdir(parents=True, exist_ok=True) |
| 1439 | kg = KnowledgeGraph(provider_manager=pm, db_path=kg_path) |
| 1440 | |
| 1441 | total_chunks = 0 |
| 1442 | for lf in local_files: |
| 1443 | try: |
| 1444 | count = ingest_file(lf, kg) |
| 1445 | total_chunks += count |
| 1446 | click.echo(f" Ingested {lf.stem}: {count} chunks") |
| 1447 | except Exception as e: |
| 1448 | click.echo(f" Failed to ingest {lf.stem}: {e}", err=True) |
| 1449 | |
| 1450 | kg.save(kg_path) |
| 1451 | kg.save(kg_path.with_suffix(".json")) |
| 1452 | |
| 1453 | entity_count = kg._store.get_entity_count() |
| 1454 | rel_count = kg._store.get_relationship_count() |
| 1455 | |
| 1456 | click.echo("\nIngestion complete:") |
| 1457 | click.echo(f" Documents: {len(local_files)}") |
| 1458 | click.echo(f" Chunks: {total_chunks}") |
| 1459 | click.echo(f" Entities: {entity_count}") |
| 1460 | click.echo(f" Relationships: {rel_count}") |
| 1461 | click.echo(f" Knowledge graph: {kg_path}") |
| 1462 | |
| 1463 | |
| 1464 | @cli.group() |
| 1465 | def export(): |
| 1466 | """Export knowledge graphs as markdown docs, notes, or CSV.""" |
| 1467 | pass |
| 1468 | |
| 1469 | |
| 1470 | @export.command("markdown") |
| 1471 | @click.argument("db_path", type=click.Path(exists=True)) |
| 1472 | @click.option("-o", "--output", type=click.Path(), default=None, help="Output directory") |
| 1473 | @click.option( |
| 1474 | "--type", |
| 1475 | "doc_types", |
| 1476 | type=click.Choice( |
| 1477 | [ |
| 1478 | "summary", |
| 1479 | "meeting-notes", |
| 1480 | "glossary", |
| 1481 | "relationship-map", |
| 1482 | "status-report", |
| 1483 | "entity-index", |
| 1484 | "csv", |
| 1485 | "all", |
| 1486 | ] |
| 1487 | ), |
| 1488 | multiple=True, |
| 1489 | default=("all",), |
| 1490 | help="Document types to generate (repeatable)", |
| 1491 | ) |
| 1492 | def export_markdown(db_path, output, doc_types): |
| 1493 | """Generate markdown documents from a knowledge graph. |
| 1494 | |
| 1495 | No API key needed — pure template-based generation. |
| 1496 | |
| 1497 | Examples: |
| 1498 | |
| 1499 | planopticon export markdown knowledge_graph.db |
| 1500 | |
| 1501 | planopticon export markdown kg.db -o ./docs --type summary --type glossary |
| 1502 | |
| 1503 | planopticon export markdown kg.db --type meeting-notes --type csv |
| 1504 | """ |
| 1505 | from video_processor.exporters.markdown import generate_all |
| 1506 | from video_processor.integrators.knowledge_graph import KnowledgeGraph |
| 1507 | |
| 1508 | db_path = Path(db_path) |
| 1509 | out_dir = Path(output) if output else Path.cwd() / "export" |
| 1510 | |
| 1511 | kg = KnowledgeGraph(db_path=db_path) |
| 1512 | kg_data = kg.to_dict() |
| 1513 | |
| 1514 | types = None if "all" in doc_types else list(doc_types) |
| 1515 | created = generate_all(kg_data, out_dir, doc_types=types) |
| 1516 | |
| 1517 | click.echo(f"Generated {len(created)} files in {out_dir}/") |
| 1518 | # Show top-level files (not entity briefs) |
| 1519 | for p in sorted(created): |
| 1520 | if p.parent == out_dir: |
| 1521 | click.echo(f" {p.name}") |
| 1522 | entity_count = len([p for p in created if p.parent != out_dir]) |
| 1523 | if entity_count: |
| 1524 | click.echo(f" entities/ ({entity_count} entity briefs)") |
| 1525 | |
| 1526 | |
| 1527 | @export.command("obsidian") |
| 1528 | @click.argument("db_path", type=click.Path(exists=True)) |
| 1529 | @click.option("-o", "--output", type=click.Path(), default=None, help="Output vault directory") |
| 1530 | def export_obsidian(db_path, output): |
| 1531 | """Export knowledge graph as an Obsidian vault with frontmatter and wiki-links. |
| 1532 | |
| 1533 | Examples: |
| 1534 | |
| 1535 | planopticon export obsidian knowledge_graph.db -o ./my-vault |
| 1536 | """ |
| 1537 | from video_processor.agent.skills.notes_export import export_to_obsidian |
| 1538 | from video_processor.integrators.knowledge_graph import KnowledgeGraph |
| 1539 | |
| 1540 | db_path = Path(db_path) |
| 1541 | out_dir = Path(output) if output else Path.cwd() / "obsidian-vault" |
| 1542 | |
| 1543 | kg = KnowledgeGraph(db_path=db_path) |
| 1544 | kg_data = kg.to_dict() |
| 1545 | created = export_to_obsidian(kg_data, out_dir) |
| 1546 | |
| 1547 | click.echo(f"Exported Obsidian vault: {len(created)} notes in {out_dir}/") |
| 1548 | |
| 1549 | |
| 1550 | @export.command("notion") |
| 1551 | @click.argument("db_path", type=click.Path(exists=True)) |
| 1552 | @click.option("-o", "--output", type=click.Path(), default=None, help="Output directory") |
| 1553 | def export_notion(db_path, output): |
| 1554 | """Export knowledge graph as Notion-compatible markdown + CSV database. |
| 1555 | |
| 1556 | Examples: |
| 1557 | |
| 1558 | planopticon export notion knowledge_graph.db -o ./notion-export |
| 1559 | """ |
| 1560 | from video_processor.agent.skills.notes_export import export_to_notion_md |
| 1561 | from video_processor.integrators.knowledge_graph import KnowledgeGraph |
| 1562 | |
| 1563 | db_path = Path(db_path) |
| 1564 | out_dir = Path(output) if output else Path.cwd() / "notion-export" |
| 1565 | |
| 1566 | kg = KnowledgeGraph(db_path=db_path) |
| 1567 | kg_data = kg.to_dict() |
| 1568 | created = export_to_notion_md(kg_data, out_dir) |
| 1569 | |
| 1570 | click.echo(f"Exported Notion markdown: {len(created)} files in {out_dir}/") |
| 1571 | |
| 1572 | |
| 1573 | @export.command("exchange") |
| 1574 | @click.argument("db_path", type=click.Path(exists=True)) |
| 1575 | @click.option( |
| 1576 | "-o", |
| 1577 | "--output", |
| 1578 | type=click.Path(), |
| 1579 | default=None, |
| 1580 | help="Output JSON file path", |
| 1581 | ) |
| 1582 | @click.option( |
| 1583 | "--name", |
| 1584 | "project_name", |
| 1585 | type=str, |
| 1586 | default="Untitled", |
| 1587 | help="Project name for the exchange payload", |
| 1588 | ) |
| 1589 | @click.option( |
| 1590 | "--description", |
| 1591 | "project_desc", |
| 1592 | type=str, |
| 1593 | default="", |
| 1594 | help="Project description", |
| 1595 | ) |
| 1596 | def export_exchange(db_path, output, project_name, project_desc): |
| 1597 | """Export a knowledge graph as a PlanOpticonExchange JSON file. |
| 1598 | |
| 1599 | Examples: |
| 1600 | |
| 1601 | planopticon export exchange knowledge_graph.db |
| 1602 | |
| 1603 | planopticon export exchange kg.db -o exchange.json --name "My Project" |
| 1604 | """ |
| 1605 | from video_processor.exchange import PlanOpticonExchange |
| 1606 | from video_processor.integrators.knowledge_graph import KnowledgeGraph |
| 1607 | |
| 1608 | db_path = Path(db_path) |
| 1609 | kg = KnowledgeGraph(db_path=db_path) |
| 1610 | kg_data = kg.to_dict() |
| 1611 | |
| 1612 | ex = PlanOpticonExchange.from_knowledge_graph( |
| 1613 | kg_data, |
| 1614 | project_name=project_name, |
| 1615 | project_description=project_desc, |
| 1616 | ) |
| 1617 | |
| 1618 | out_path = Path(output) if output else Path.cwd() / "exchange.json" |
| 1619 | ex.to_file(out_path) |
| 1620 | |
| 1621 | click.echo( |
| 1622 | f"Exported PlanOpticonExchange to {out_path} " |
| 1623 | f"({len(ex.entities)} entities, " |
| 1624 | f"{len(ex.relationships)} relationships)" |
| 1625 | ) |
| 1626 | |
| 1627 | |
| 1628 | @cli.group() |
| 1629 | def wiki(): |
| 1630 | """Generate and push GitHub wikis from knowledge graphs.""" |
| 1631 | pass |
| 1632 | |
| 1633 | |
| 1634 | @wiki.command("generate") |
| 1635 | @click.argument("db_path", type=click.Path(exists=True)) |
| 1636 | @click.option("-o", "--output", type=click.Path(), default=None, help="Output directory for wiki") |
| 1637 | @click.option("--title", type=str, default="Knowledge Base", help="Wiki title") |
| 1638 | def wiki_generate(db_path, output, title): |
| 1639 | """Generate a GitHub wiki from a knowledge graph. |
| 1640 | |
| 1641 | Examples: |
| 1642 | |
| 1643 | planopticon wiki generate knowledge_graph.db -o ./wiki |
| 1644 | |
| 1645 | planopticon wiki generate results/kg.db --title "Project Wiki" |
| 1646 | """ |
| 1647 | from video_processor.agent.skills.wiki_generator import generate_wiki, write_wiki |
| 1648 | from video_processor.integrators.knowledge_graph import KnowledgeGraph |
| 1649 | |
| 1650 | db_path = Path(db_path) |
| 1651 | out_dir = Path(output) if output else Path.cwd() / "wiki" |
| 1652 | |
| 1653 | kg = KnowledgeGraph(db_path=db_path) |
| 1654 | kg_data = kg.to_dict() |
| 1655 | pages = generate_wiki(kg_data, title=title) |
| 1656 | written = write_wiki(pages, out_dir) |
| 1657 | |
| 1658 | click.echo(f"Generated {len(written)} wiki pages in {out_dir}") |
| 1659 | for p in sorted(written): |
| 1660 | click.echo(f" {p.name}") |
| 1661 | |
| 1662 | |
| 1663 | @wiki.command("push") |
| 1664 | @click.argument("wiki_dir", type=click.Path(exists=True)) |
| 1665 | @click.argument("repo", type=str) |
| 1666 | @click.option("--message", "-m", type=str, default="Update wiki", help="Commit message") |
| 1667 | def wiki_push(wiki_dir, repo, message): |
| 1668 | """Push generated wiki pages to a GitHub wiki repo. |
| 1669 | |
| 1670 | REPO should be in 'owner/repo' format. |
| 1671 | |
| 1672 | Examples: |
| 1673 | |
| 1674 | planopticon wiki push ./wiki ConflictHQ/PlanOpticon |
| 1675 | |
| 1676 | planopticon wiki push ./wiki owner/repo -m "Add entity pages" |
| 1677 | """ |
| 1678 | from video_processor.agent.skills.wiki_generator import push_wiki |
| 1679 | |
| 1680 | wiki_dir = Path(wiki_dir) |
| 1681 | success = push_wiki(wiki_dir, repo, message=message) |
| 1682 | if success: |
| 1683 | click.echo(f"Wiki pushed to https://github.com/{repo}/wiki") |
| 1684 | else: |
| 1685 | click.echo("Wiki push failed. Check auth and repo permissions.", err=True) |
| 1686 | sys.exit(1) |
| 1687 | |
| 1688 | |
| 1689 | @cli.group() |
| 1690 | def recordings(): |
| 1691 | """Fetch meeting recordings from Zoom, Teams, and Google Meet.""" |
| 1692 | pass |
| 1693 | |
| 1694 | |
| 1695 | @recordings.command("zoom-list") |
| 1696 | @click.option("--json", "as_json", is_flag=True, help="Output as JSON") |
| 1697 | def recordings_zoom_list(as_json): |
| 1698 | """List Zoom cloud recordings. |
| 1699 | |
| 1700 | Requires ZOOM_CLIENT_ID (and optionally ZOOM_CLIENT_SECRET, |
| 1701 | ZOOM_ACCOUNT_ID) environment variables. |
| 1702 | |
| 1703 | Examples: |
| 1704 | |
| 1705 | planopticon recordings zoom-list |
| 1706 | |
| 1707 | planopticon recordings zoom-list --json |
| 1708 | """ |
| 1709 | from video_processor.sources.zoom_source import ZoomSource |
| 1710 | |
| 1711 | source = ZoomSource() |
| 1712 | if not source.authenticate(): |
| 1713 | click.echo("Zoom authentication failed.", err=True) |
| 1714 | sys.exit(1) |
| 1715 | |
| 1716 | files = source.list_videos() |
| 1717 | if as_json: |
| 1718 | click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str)) |
| 1719 | else: |
| 1720 | click.echo(f"Found {len(files)} recording(s):") |
| 1721 | for f in files: |
| 1722 | size = f"{f.size_bytes // 1_000_000} MB" if f.size_bytes else "unknown" |
| 1723 | click.echo(f" {f.name} ({size}) {f.modified_at or ''}") |
| 1724 | |
| 1725 | |
| 1726 | @recordings.command("teams-list") |
| 1727 | @click.option("--user-id", default="me", help="Microsoft user ID") |
| 1728 | @click.option("--json", "as_json", is_flag=True, help="Output as JSON") |
| 1729 | def recordings_teams_list(user_id, as_json): |
| 1730 | """List Teams meeting recordings via the m365 CLI. |
| 1731 | |
| 1732 | Requires: npm install -g @pnp/cli-microsoft365 && m365 login |
| 1733 | |
| 1734 | Examples: |
| 1735 | |
| 1736 | planopticon recordings teams-list |
| 1737 | |
| 1738 | planopticon recordings teams-list --json |
| 1739 | """ |
| 1740 | from video_processor.sources.teams_recording_source import ( |
| 1741 | TeamsRecordingSource, |
| 1742 | ) |
| 1743 | |
| 1744 | source = TeamsRecordingSource(user_id=user_id) |
| 1745 | if not source.authenticate(): |
| 1746 | click.echo("Teams authentication failed.", err=True) |
| 1747 | sys.exit(1) |
| 1748 | |
| 1749 | files = source.list_videos() |
| 1750 | if as_json: |
| 1751 | click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str)) |
| 1752 | else: |
| 1753 | click.echo(f"Found {len(files)} recording(s):") |
| 1754 | for f in files: |
| 1755 | click.echo(f" {f.name} {f.modified_at or ''}") |
| 1756 | |
| 1757 | |
| 1758 | @recordings.command("meet-list") |
| 1759 | @click.option("--folder-id", default=None, help="Drive folder ID") |
| 1760 | @click.option("--json", "as_json", is_flag=True, help="Output as JSON") |
| 1761 | def recordings_meet_list(folder_id, as_json): |
| 1762 | """List Google Meet recordings in Drive via the gws CLI. |
| 1763 | |
| 1764 | Requires: npm install -g @googleworkspace/cli && gws auth login |
| 1765 | |
| 1766 | Examples: |
| 1767 | |
| 1768 | planopticon recordings meet-list |
| 1769 | |
| 1770 | planopticon recordings meet-list --folder-id abc123 |
| 1771 | """ |
| 1772 | from video_processor.sources.meet_recording_source import ( |
| 1773 | MeetRecordingSource, |
| 1774 | ) |
| 1775 | |
| 1776 | source = MeetRecordingSource(drive_folder_id=folder_id) |
| 1777 | if not source.authenticate(): |
| 1778 | click.echo("Google Meet authentication failed.", err=True) |
| 1779 | sys.exit(1) |
| 1780 | |
| 1781 | files = source.list_videos() |
| 1782 | if as_json: |
| 1783 | click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str)) |
| 1784 | else: |
| 1785 | click.echo(f"Found {len(files)} recording(s):") |
| 1786 | for f in files: |
| 1787 | size = f"{f.size_bytes // 1_000_000} MB" if f.size_bytes else "unknown" |
| 1788 | click.echo(f" {f.name} ({size}) {f.modified_at or ''}") |
| 1789 | |
| 1790 | |
| 1791 | @cli.group() |
| 1792 | def kg(): |
| 1793 | """Knowledge graph utilities: convert, sync, and inspect.""" |
| 1794 | pass |
| 1795 | |
| 1796 | |
| 1797 | @kg.command() |
| 1798 | @click.argument("source_path", type=click.Path(exists=True)) |
| 1799 | @click.argument("dest_path", type=click.Path()) |
| 1800 | def convert(source_path, dest_path): |
| 1801 | """Convert a knowledge graph between formats. |
| 1802 | |
| 1803 | Supports .db (SQLite) and .json. The output format is inferred from DEST_PATH extension. |
| 1804 | |
| 1805 | Examples: |
| 1806 | |
| 1807 | planopticon kg convert results/knowledge_graph.db output.json |
| 1808 | planopticon kg convert knowledge_graph.json knowledge_graph.db |
| 1809 | """ |
| 1810 | from video_processor.integrators.graph_store import InMemoryStore, SQLiteStore |
| 1811 | |
| 1812 | source_path = Path(source_path) |
| 1813 | dest_path = Path(dest_path) |
| 1814 | |
| 1815 | if source_path.suffix == dest_path.suffix: |
| 1816 | click.echo(f"Source and destination are the same format ({source_path.suffix}).", err=True) |
| 1817 | sys.exit(1) |
| 1818 | |
| 1819 | # Load source |
| 1820 | if source_path.suffix == ".db": |
| 1821 | src_store = SQLiteStore(source_path) |
| 1822 | elif source_path.suffix == ".json": |
| 1823 | data = json.loads(source_path.read_text()) |
| 1824 | src_store = InMemoryStore() |
| 1825 | for node in data.get("nodes", []): |
| 1826 | descs = node.get("descriptions", []) |
| 1827 | if isinstance(descs, set): |
| 1828 | descs = list(descs) |
| 1829 | src_store.merge_entity(node.get("name", ""), node.get("type", "concept"), descs) |
| 1830 | for occ in node.get("occurrences", []): |
| 1831 | src_store.add_occurrence( |
| 1832 | node.get("name", ""), |
| 1833 | occ.get("source", ""), |
| 1834 | occ.get("timestamp"), |
| 1835 | occ.get("text"), |
| 1836 | ) |
| 1837 | for rel in data.get("relationships", []): |
| 1838 | src_store.add_relationship( |
| 1839 | rel.get("source", ""), |
| 1840 | rel.get("target", ""), |
| 1841 | rel.get("type", "related_to"), |
| 1842 | content_source=rel.get("content_source"), |
| 1843 | timestamp=rel.get("timestamp"), |
| 1844 | ) |
| 1845 | else: |
| 1846 | click.echo(f"Unsupported source format: {source_path.suffix}", err=True) |
| 1847 | sys.exit(1) |
| 1848 | |
| 1849 | # Write destination |
| 1850 | from video_processor.integrators.knowledge_graph import KnowledgeGraph |
| 1851 | |
| 1852 | kg_obj = KnowledgeGraph(store=src_store) |
| 1853 | kg_obj.save(dest_path) |
| 1854 | |
| 1855 | e_count = src_store.get_entity_count() |
| 1856 | r_count = src_store.get_relationship_count() |
| 1857 | click.echo( |
| 1858 | f"Converted {source_path} → {dest_path} ({e_count} entities, {r_count} relationships)" |
| 1859 | ) |
| 1860 | |
| 1861 | if hasattr(src_store, "close"): |
| 1862 | src_store.close() |
| 1863 | |
| 1864 | |
| 1865 | @kg.command() |
| 1866 | @click.argument("db_path", type=click.Path(exists=True)) |
| 1867 | @click.argument("json_path", type=click.Path(), required=False, default=None) |
| 1868 | @click.option( |
| 1869 | "--direction", |
| 1870 | type=click.Choice(["db-to-json", "json-to-db", "auto"]), |
| 1871 | default="auto", |
| 1872 | help="Sync direction. 'auto' picks the newer file as source.", |
| 1873 | ) |
| 1874 | def sync(db_path, json_path, direction): |
| 1875 | """Sync a .db and .json knowledge graph, updating the stale one. |
| 1876 | |
| 1877 | If JSON_PATH is omitted, uses the same name with .json extension. |
| 1878 | |
| 1879 | Examples: |
| 1880 | |
| 1881 | planopticon kg sync results/knowledge_graph.db |
| 1882 | planopticon kg sync knowledge_graph.db knowledge_graph.json --direction db-to-json |
| 1883 | """ |
| 1884 | db_path = Path(db_path) |
| 1885 | if json_path is None: |
| 1886 | json_path = db_path.with_suffix(".json") |
| 1887 | else: |
| 1888 | json_path = Path(json_path) |
| 1889 | |
| 1890 | if direction == "auto": |
| 1891 | if not json_path.exists(): |
| 1892 | direction = "db-to-json" |
| 1893 | elif not db_path.exists(): |
| 1894 | direction = "json-to-db" |
| 1895 | else: |
| 1896 | db_mtime = db_path.stat().st_mtime |
| 1897 | json_mtime = json_path.stat().st_mtime |
| 1898 | direction = "db-to-json" if db_mtime >= json_mtime else "json-to-db" |
| 1899 | |
| 1900 | from video_processor.integrators.knowledge_graph import KnowledgeGraph |
| 1901 | |
| 1902 | if direction == "db-to-json": |
| 1903 | kg_obj = KnowledgeGraph(db_path=db_path) |
| 1904 | kg_obj.save(json_path) |
| 1905 | click.echo(f"Synced {db_path} → {json_path}") |
| 1906 | else: |
| 1907 | data = json.loads(json_path.read_text()) |
| 1908 | kg_obj = KnowledgeGraph.from_dict(data, db_path=db_path) |
| 1909 | # Force write to db by saving |
| 1910 | kg_obj.save(db_path) |
| 1911 | click.echo(f"Synced {json_path} → {db_path}") |
| 1912 | |
| 1913 | click.echo( |
| 1914 | f" {kg_obj._store.get_entity_count()} entities, " |
| 1915 | f"{kg_obj._store.get_relationship_count()} relationships" |
| 1916 | ) |
| 1917 | |
| 1918 | |
| 1919 | @kg.command() |
| 1920 | @click.argument("path", type=click.Path(exists=True)) |
| 1921 | def inspect(path): |
| 1922 | """Show summary stats for a knowledge graph file (.db or .json).""" |
| 1923 | from video_processor.integrators.graph_discovery import describe_graph |
| 1924 | |
| 1925 | path = Path(path) |
| 1926 | info = describe_graph(path) |
| 1927 | click.echo(f"File: {path}") |
| 1928 | click.echo(f"Store: {info['store_type']}") |
| 1929 | click.echo(f"Entities: {info['entity_count']}") |
| 1930 | click.echo(f"Relationships: {info['relationship_count']}") |
| 1931 | if info["entity_types"]: |
| 1932 | click.echo("Entity types:") |
| 1933 | for t, count in sorted(info["entity_types"].items(), key=lambda x: -x[1]): |
| 1934 | click.echo(f" {t}: {count}") |
| 1935 | |
| 1936 | |
| 1937 | @kg.command() |
| 1938 | @click.argument("db_path", type=click.Path(exists=True)) |
| 1939 | @click.option("--provider", "-p", type=str, default="auto") |
| 1940 | @click.option("--chat-model", type=str, default=None) |
| 1941 | @click.option( |
| 1942 | "--format", |
| 1943 | "output_format", |
| 1944 | type=click.Choice(["text", "json"]), |
| 1945 | default="text", |
| 1946 | ) |
| 1947 | @click.pass_context |
| 1948 | def classify(ctx, db_path, provider, chat_model, output_format): |
| 1949 | """Classify knowledge graph entities into planning taxonomy types. |
| 1950 | |
| 1951 | Examples:\n |
| 1952 | planopticon kg classify results/knowledge_graph.db\n |
| 1953 | planopticon kg classify results/knowledge_graph.db --format json |
| 1954 | """ |
| 1955 | from video_processor.integrators.graph_store import create_store |
| 1956 | from video_processor.integrators.taxonomy import TaxonomyClassifier |
| 1957 | |
| 1958 | db_path = Path(db_path) |
| 1959 | store = create_store(db_path) |
| 1960 | entities = store.get_all_entities() |
| 1961 | relationships = store.get_all_relationships() |
| 1962 | |
| 1963 | pm = None |
| 1964 | if provider != "none": |
| 1965 | try: |
| 1966 | from video_processor.providers.manager import ProviderManager |
| 1967 | |
| 1968 | pm = ProviderManager(provider=provider if provider != "auto" else None) |
| 1969 | if chat_model: |
| 1970 | pm.chat_model = chat_model |
| 1971 | except Exception: |
| 1972 | pm = None # fall back to heuristic-only |
| 1973 | |
| 1974 | classifier = TaxonomyClassifier(provider_manager=pm) |
| 1975 | planning_entities = classifier.classify_entities(entities, relationships) |
| 1976 | |
| 1977 | if output_format == "json": |
| 1978 | click.echo( |
| 1979 | json.dumps( |
| 1980 | [pe.model_dump() for pe in planning_entities], |
| 1981 | indent=2, |
| 1982 | ) |
| 1983 | ) |
| 1984 | else: |
| 1985 | if not planning_entities: |
| 1986 | click.echo("No entities matched planning taxonomy types.") |
| 1987 | return |
| 1988 | workstreams = classifier.organize_by_workstream(planning_entities) |
| 1989 | for group_name, items in sorted(workstreams.items()): |
| 1990 | click.echo(f"\n{group_name.upper()} ({len(items)})") |
| 1991 | for pe in items: |
| 1992 | priority_str = f" [{pe.priority}]" if pe.priority else "" |
| 1993 | click.echo(f" - {pe.name}{priority_str}") |
| 1994 | if pe.description: |
| 1995 | click.echo(f" {pe.description}") |
| 1996 | |
| 1997 | store.close() |
| 1998 | |
| 1999 | |
| 2000 | @kg.command("from-exchange") |
| 2001 | @click.argument("exchange_path", type=click.Path(exists=True)) |
| 2002 | @click.option( |
| 2003 | "-o", |
| 2004 | "--output", |
| 2005 | "db_path", |
| 2006 | type=click.Path(), |
| 2007 | default=None, |
| 2008 | help="Output .db file path", |
| 2009 | ) |
| 2010 | def kg_from_exchange(exchange_path, db_path): |
| 2011 | """Import a PlanOpticonExchange JSON file into a knowledge graph .db. |
| 2012 | |
| 2013 | Examples: |
| 2014 | |
| 2015 | planopticon kg from-exchange exchange.json |
| 2016 | |
| 2017 | planopticon kg from-exchange exchange.json -o project.db |
| 2018 | """ |
| 2019 | from video_processor.exchange import PlanOpticonExchange |
| 2020 | from video_processor.integrators.knowledge_graph import KnowledgeGraph |
| 2021 | |
| 2022 | ex = PlanOpticonExchange.from_file(exchange_path) |
| 2023 | |
| 2024 | kg_dict = { |
| 2025 | "nodes": [e.model_dump() for e in ex.entities], |
| 2026 | "relationships": [r.model_dump() for r in ex.relationships], |
| 2027 | "sources": [s.model_dump() for s in ex.sources], |
| 2028 | } |
| 2029 | |
| 2030 | out = Path(db_path) if db_path else Path.cwd() / "knowledge_graph.db" |
| 2031 | kg_obj = KnowledgeGraph.from_dict(kg_dict, db_path=out) |
| 2032 | kg_obj.save(out) |
| 2033 | |
| 2034 | click.echo( |
| 2035 | f"Imported exchange into {out} " |
| 2036 | f"({len(ex.entities)} entities, " |
| 2037 | f"{len(ex.relationships)} relationships)" |
| 2038 | ) |
| 2039 | |
| 2040 | |
| 2041 | @cli.command() |
| 2042 | @click.option( |
| 2043 | "--kb", |
| 2044 | multiple=True, |
| 2045 | type=click.Path(exists=True), |
| 2046 | help="Knowledge base paths", |
| 2047 | ) |
| 2048 | @click.option( |
| 2049 | "--provider", |
| 2050 | "-p", |
| 2051 | type=str, |
| 2052 | default="auto", |
| 2053 | help="LLM provider (auto, openai, anthropic, ...)", |
| 2054 | ) |
| 2055 | @click.option( |
| 2056 | "--chat-model", |
| 2057 | type=str, |
| 2058 | default=None, |
| 2059 | help="Chat model override", |
| 2060 | ) |
| 2061 | @click.pass_context |
| 2062 | def companion(ctx, kb, provider, chat_model): |
| 2063 | """Interactive planning companion with workspace awareness. |
| 2064 | |
| 2065 | Examples: |
| 2066 | |
| 2067 | planopticon companion |
| 2068 | |
| 2069 | planopticon companion --kb ./results |
| 2070 | |
| 2071 | planopticon companion -p anthropic |
| 2072 | """ |
| 2073 | from video_processor.cli.companion import CompanionREPL |
| 2074 | |
| 2075 | repl = CompanionREPL( |
| 2076 | kb_paths=list(kb), |
| 2077 | provider=provider, |
| 2078 | chat_model=chat_model, |
| 2079 | ) |
| 2080 | repl.run() |
| 2081 | |
| 2082 | |
| 2083 | def _interactive_menu(ctx): |
| 2084 | """Show an interactive menu when planopticon is run with no arguments.""" |
| 2085 | click.echo() |
| @@ -698,11 +2105,24 @@ | |
| 2105 | type=click.Choice(["basic", "standard", "comprehensive"]), |
| 2106 | default="standard", |
| 2107 | ) |
| 2108 | provider = click.prompt( |
| 2109 | " Provider", |
| 2110 | type=click.Choice( |
| 2111 | [ |
| 2112 | "auto", |
| 2113 | "openai", |
| 2114 | "anthropic", |
| 2115 | "gemini", |
| 2116 | "ollama", |
| 2117 | "azure", |
| 2118 | "together", |
| 2119 | "fireworks", |
| 2120 | "cerebras", |
| 2121 | "xai", |
| 2122 | ] |
| 2123 | ), |
| 2124 | default="auto", |
| 2125 | ) |
| 2126 | ctx.invoke( |
| 2127 | analyze, |
| 2128 | input=input_path, |
| @@ -727,11 +2147,24 @@ | |
| 2147 | type=click.Choice(["basic", "standard", "comprehensive"]), |
| 2148 | default="standard", |
| 2149 | ) |
| 2150 | provider = click.prompt( |
| 2151 | " Provider", |
| 2152 | type=click.Choice( |
| 2153 | [ |
| 2154 | "auto", |
| 2155 | "openai", |
| 2156 | "anthropic", |
| 2157 | "gemini", |
| 2158 | "ollama", |
| 2159 | "azure", |
| 2160 | "together", |
| 2161 | "fireworks", |
| 2162 | "cerebras", |
| 2163 | "xai", |
| 2164 | ] |
| 2165 | ), |
| 2166 | default="auto", |
| 2167 | ) |
| 2168 | ctx.invoke( |
| 2169 | batch, |
| 2170 | input_dir=input_dir, |
| 2171 | |
| 2172 | DDED video_processor/cli/companion.py |
| 2173 | DDED video_processor/exchange.py |
| 2174 | DDED video_processor/exporters/__init__.py |
| 2175 | DDED video_processor/exporters/markdown.py |
| --- a/video_processor/cli/companion.py | ||
| +++ b/video_processor/cli/companion.py | ||
| @@ -0,0 +1,414 @@ | ||
| 1 | +"""Interactive planning companion REPL for PlanOpticon.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +from pathlib import Path | |
| 5 | +from typing import List, Optional | |
| 6 | + | |
| 7 | +logger = logging.getLogger(__name__) | |
| 8 | + | |
| 9 | +VIDEO_EXTS = {".mp4", ".mkv", ".webm"} | |
| 10 | +DOC_EXTS = {".md", ".pdf", ".docx"} | |
| 11 | + | |
| 12 | + | |
| 13 | +class CompanionREPL: | |
| 14 | + """Smart REPL with workspace awareness and KG querying.""" | |
| 15 | + | |
| 16 | + def __init__( | |
| 17 | + self, | |
| 18 | + kb_paths: Optional[List[str]] = None, | |
| 19 | + provider: str = "auto", | |
| 20 | + chat_model: Optional[str] = None, | |
| 21 | + ): | |
| 22 | + self.kg = None | |
| 23 | + self.query_engine = None | |
| 24 | + self.agent = None | |
| 25 | + self.provider_manager = None | |
| 26 | + self._kb_paths = kb_paths or [] | |
| 27 | + self._provider_name = provider | |
| 28 | + self._chat_model = chat_model | |
| 29 | + self._videos: List[Path] = [] | |
| 30 | + self._docs: List[Path] = [] | |
| 31 | + self._kg_path: Optional[Path] = None | |
| 32 | + | |
| 33 | + def _discover(self) -> None: | |
| 34 | + """Auto-discover workspace context.""" | |
| 35 | + # Discover knowledge graphs | |
| 36 | + from video_processor.integrators.graph_discovery import ( | |
| 37 | + find_nearest_graph, | |
| 38 | + ) | |
| 39 | + | |
| 40 | + if self._kb_paths: | |
| 41 | + # Use explicit paths | |
| 42 | + self._kg_path = Path(self._kb_paths[0]) | |
| 43 | + else: | |
| 44 | + self._kg_path = find_nearest_graph() | |
| 45 | + | |
| 46 | + if self._kg_path and self._kg_path.exists(): | |
| 47 | + self._load_kg(self._kg_path) | |
| 48 | + | |
| 49 | + # Scan for media and doc files in cwd | |
| 50 | + cwd = Path.cwd() | |
| 51 | + try: | |
| 52 | + for f in sorted(cwd.iterdir()): | |
| 53 | + if f.suffix.lower() in VIDEO_EXTS: | |
| 54 | + self._videos.append(f) | |
| 55 | + elif f.suffix.lower() in DOC_EXTS: | |
| 56 | + self._docs.append(f) | |
| 57 | + except PermissionError: | |
| 58 | + pass | |
| 59 | + | |
| 60 | + def _load_kg(self, path: Path) -> None: | |
| 61 | + """Load a knowledge graph from a file path.""" | |
| 62 | + from video_processor.integrators.graph_query import ( | |
| 63 | + GraphQueryEngine, | |
| 64 | + ) | |
| 65 | + | |
| 66 | + try: | |
| 67 | + if path.suffix == ".json": | |
| 68 | + self.query_engine = GraphQueryEngine.from_json_path(path) | |
| 69 | + else: | |
| 70 | + self.query_engine = GraphQueryEngine.from_db_path(path) | |
| 71 | + self.kg = self.query_engine.store | |
| 72 | + except Exception as exc: | |
| 73 | + logger.debug("Failed to load KG at %s: %s", path, exc) | |
| 74 | + | |
| 75 | + def _init_provider(self) -> None: | |
| 76 | + """Try to initialise an LLM provider.""" | |
| 77 | + try: | |
| 78 | + from video_processor.providers.manager import ( | |
| 79 | + ProviderManager, | |
| 80 | + ) | |
| 81 | + | |
| 82 | + prov = None if self._provider_name == "auto" else self._provider_name | |
| 83 | + self.provider_manager = ProviderManager( | |
| 84 | + chat_model=self._chat_model, | |
| 85 | + provider=prov, | |
| 86 | + ) | |
| 87 | + except Exception: | |
| 88 | + self.provider_manager = None | |
| 89 | + | |
| 90 | + def _init_agent(self) -> None: | |
| 91 | + """Create a PlanningAgent if possible.""" | |
| 92 | + try: | |
| 93 | + from video_processor.agent.agent_loop import ( | |
| 94 | + PlanningAgent, | |
| 95 | + ) | |
| 96 | + from video_processor.agent.skills.base import ( | |
| 97 | + AgentContext, | |
| 98 | + ) | |
| 99 | + | |
| 100 | + ctx = AgentContext( | |
| 101 | + knowledge_graph=self.kg, | |
| 102 | + query_engine=self.query_engine, | |
| 103 | + provider_manager=self.provider_manager, | |
| 104 | + ) | |
| 105 | + self.agent = PlanningAgent(context=ctx) | |
| 106 | + except Exception: | |
| 107 | + self.agent = None | |
| 108 | + | |
| 109 | + def _welcome_banner(self) -> str: | |
| 110 | + """Build the welcome banner text.""" | |
| 111 | + lines = [ | |
| 112 | + "", | |
| 113 | + " PlanOpticon Companion", | |
| 114 | + " Interactive planning REPL", | |
| 115 | + "", | |
| 116 | + ] | |
| 117 | + | |
| 118 | + if self._kg_path and self.query_engine: | |
| 119 | + stats = self.query_engine.stats().data | |
| 120 | + lines.append( | |
| 121 | + f" Knowledge graph: {self._kg_path.name}" | |
| 122 | + f" ({stats['entity_count']} entities," | |
| 123 | + f" {stats['relationship_count']} relationships)" | |
| 124 | + ) | |
| 125 | + else: | |
| 126 | + lines.append(" No knowledge graph loaded.") | |
| 127 | + | |
| 128 | + if self._videos: | |
| 129 | + names = ", ".join(v.name for v in self._videos[:3]) | |
| 130 | + suffix = f" (+{len(self._videos) - 3} more)" if len(self._videos) > 3 else "" | |
| 131 | + lines.append(f" Videos: {names}{suffix}") | |
| 132 | + | |
| 133 | + if self._docs: | |
| 134 | + names = ", ".join(d.name for d in self._docs[:3]) | |
| 135 | + suffix = f" (+{len(self._docs) - 3} more)" if len(self._docs) > 3 else "" | |
| 136 | + lines.append(f" Docs: {names}{suffix}") | |
| 137 | + | |
| 138 | + if self.provider_manager: | |
| 139 | + prov = getattr(self.provider_manager, "provider", self._provider_name) | |
| 140 | + model = self._chat_model or "default" | |
| 141 | + lines.append(f" LLM provider: {prov} (model: {model})") | |
| 142 | + else: | |
| 143 | + lines.append(" LLM provider: none") | |
| 144 | + lines.append("") | |
| 145 | + lines.append(" Type /help for commands, or ask a question.") | |
| 146 | + lines.append("") | |
| 147 | + return "\n".join(lines) | |
| 148 | + | |
| 149 | + # ── Command handlers ── | |
| 150 | + | |
| 151 | + def _cmd_help(self) -> str: | |
| 152 | + lines = [ | |
| 153 | + "Available commands:", | |
| 154 | + " /help Show this help", | |
| 155 | + " /status Workspace status", | |
| 156 | + " /skills List available skills", | |
| 157 | + " /entities [--type T] List KG entities", | |
| 158 | + " /search TERM Search entities by name", | |
| 159 | + " /neighbors ENTITY Show entity relationships", | |
| 160 | + " /export FORMAT Export KG (markdown, obsidian, notion, csv)", | |
| 161 | + " /analyze PATH Analyze a video/doc", | |
| 162 | + " /ingest PATH Ingest a file into the KG", | |
| 163 | + " /auth SERVICE Authenticate with a cloud service", | |
| 164 | + " /provider [NAME] List or switch LLM provider", | |
| 165 | + " /model [NAME] Show or switch chat model", | |
| 166 | + " /run SKILL Run a skill by name", | |
| 167 | + " /plan Run project_plan skill", | |
| 168 | + " /prd Run PRD skill", | |
| 169 | + " /tasks Run task_breakdown skill", | |
| 170 | + " /quit, /exit Exit companion", | |
| 171 | + "", | |
| 172 | + "Any other input is sent to the chat agent (requires LLM).", | |
| 173 | + ] | |
| 174 | + return "\n".join(lines) | |
| 175 | + | |
| 176 | + def _cmd_status(self) -> str: | |
| 177 | + lines = ["Workspace status:"] | |
| 178 | + if self._kg_path and self.query_engine: | |
| 179 | + stats = self.query_engine.stats().data | |
| 180 | + lines.append( | |
| 181 | + f" KG: {self._kg_path}" | |
| 182 | + f" ({stats['entity_count']} entities," | |
| 183 | + f" {stats['relationship_count']} relationships)" | |
| 184 | + ) | |
| 185 | + if stats.get("entity_types"): | |
| 186 | + for t, c in sorted( | |
| 187 | + stats["entity_types"].items(), | |
| 188 | + key=lambda x: -x[1], | |
| 189 | + ): | |
| 190 | + lines.append(f" {t}: {c}") | |
| 191 | + else: | |
| 192 | + lines.append(" KG: not loaded") | |
| 193 | + | |
| 194 | + lines.append(f" Videos: {len(self._videos)} found") | |
| 195 | + lines.append(f" Docs: {len(self._docs)} found") | |
| 196 | + lines.append(f" Provider: {'active' if self.provider_manager else 'none'}") | |
| 197 | + return "\n".join(lines) | |
| 198 | + | |
| 199 | + def _cmd_skills(self) -> str: | |
| 200 | + from video_processor.agent.skills.base import ( | |
| 201 | + list_skills, | |
| 202 | + ) | |
| 203 | + | |
| 204 | + skills = list_skills() | |
| 205 | + if not skills: | |
| 206 | + return "No skills registered." | |
| 207 | + lines = ["Available skills:"] | |
| 208 | + for s in skills: | |
| 209 | + lines.append(f" {s.name}: {s.description}") | |
| 210 | + return "\n".join(lines) | |
| 211 | + | |
| 212 | + def _cmd_entities(self, args: str) -> str: | |
| 213 | + if not self.query_engine: | |
| 214 | + return "No knowledge graph loaded." | |
| 215 | + entity_type = None | |
| 216 | + parts = args.split() | |
| 217 | + for i, part in enumerate(parts): | |
| 218 | + if part == "--type" and i + 1 < len(parts): | |
| 219 | + entity_type = parts[i + 1] | |
| 220 | + result = self.query_engine.entities( | |
| 221 | + entity_type=entity_type, | |
| 222 | + ) | |
| 223 | + return result.to_text() | |
| 224 | + | |
| 225 | + def _cmd_search(self, term: str) -> str: | |
| 226 | + if not self.query_engine: | |
| 227 | + return "No knowledge graph loaded." | |
| 228 | + term = term.strip() | |
| 229 | + if not term: | |
| 230 | + return "Usage: /search TERM" | |
| 231 | + result = self.query_engine.entities(name=term) | |
| 232 | + return result.to_text() | |
| 233 | + | |
| 234 | + def _cmd_neighbors(self, entity: str) -> str: | |
| 235 | + if not self.query_engine: | |
| 236 | + return "No knowledge graph loaded." | |
| 237 | + entity = entity.strip() | |
| 238 | + if not entity: | |
| 239 | + return "Usage: /neighbors ENTITY" | |
| 240 | + result = self.query_engine.neighbors(entity) | |
| 241 | + return result.to_text() | |
| 242 | + | |
| 243 | + def _cmd_export(self, fmt: str) -> str: | |
| 244 | + fmt = fmt.strip().lower() | |
| 245 | + if not fmt: | |
| 246 | + return "Usage: /export FORMAT (markdown, obsidian, notion, csv)" | |
| 247 | + if not self._kg_path: | |
| 248 | + return "No knowledge graph loaded." | |
| 249 | + return ( | |
| 250 | + f"Export '{fmt}' requested. Use the CLI command:\n" | |
| 251 | + f" planopticon export {fmt} {self._kg_path}" | |
| 252 | + ) | |
| 253 | + | |
| 254 | + def _cmd_analyze(self, path_str: str) -> str: | |
| 255 | + path_str = path_str.strip() | |
| 256 | + if not path_str: | |
| 257 | + return "Usage: /analyze PATH" | |
| 258 | + p = Path(path_str) | |
| 259 | + if not p.exists(): | |
| 260 | + return f"File not found: {path_str}" | |
| 261 | + return f"Analyze requested for {p.name}. Use the CLI:\n planopticon analyze -i {p}" | |
| 262 | + | |
| 263 | + def _cmd_ingest(self, path_str: str) -> str: | |
| 264 | + path_str = path_str.strip() | |
| 265 | + if not path_str: | |
| 266 | + return "Usage: /ingest PATH" | |
| 267 | + p = Path(path_str) | |
| 268 | + if not p.exists(): | |
| 269 | + return f"File not found: {path_str}" | |
| 270 | + return f"Ingest requested for {p.name}. Use the CLI:\n planopticon ingest {p}" | |
| 271 | + | |
| 272 | + def _cmd_run_skill(self, skill_name: str) -> str: | |
| 273 | + skill_name = skill_name.strip() | |
| 274 | + if not skill_name: | |
| 275 | + return "Usage: /run SKILL_NAME" | |
| 276 | + from video_processor.agent.skills.base import ( | |
| 277 | + get_skill, | |
| 278 | + ) | |
| 279 | + | |
| 280 | + skill = get_skill(skill_name) | |
| 281 | + if not skill: | |
| 282 | + return f"Unknown skill: {skill_name}" | |
| 283 | + if not self.agent: | |
| 284 | + return "Agent not initialised (no LLM provider?)." | |
| 285 | + if not skill.can_execute(self.agent.context): | |
| 286 | + return f"Skill '{skill_name}' cannot execute in current context." | |
| 287 | + try: | |
| 288 | + artifact = skill.execute(self.agent.context) | |
| 289 | + return f"--- {artifact.name} ({artifact.artifact_type}) ---\n{artifact.content}" | |
| 290 | + except Exception as exc: | |
| 291 | + return f"Skill execution failed: {exc}" | |
| 292 | + | |
| 293 | + def _cmd_auth(self, args: str) -> str: | |
| 294 | + """Authenticate with a cloud service.""" | |
| 295 | + service = args.strip().lower() | |
| 296 | + if not service: | |
| 297 | + from video_processor.auth import KNOWN_CONFIGS | |
| 298 | + | |
| 299 | + services = ", ".join(sorted(KNOWN_CONFIGS.keys())) | |
| 300 | + return f"Usage: /auth SERVICE\nAvailable: {services}" | |
| 301 | + | |
| 302 | + from video_processor.auth import get_auth_manager | |
| 303 | + | |
| 304 | + manager = get_auth_manager(service) | |
| 305 | + if not manager: | |
| 306 | + return f"Unknown service: {service}" | |
| 307 | + | |
| 308 | + result = manager.authenticate() | |
| 309 | + if result.success: | |
| 310 | + return f"{service.title()} authenticated ({result.method})" | |
| 311 | + return f"{service.title()} auth failed: {result.error}" | |
| 312 | + | |
| 313 | + def _cmd_provider(self, args: str) -> str: | |
| 314 | + """List available providers or switch to a specific one.""" | |
| 315 | + args = args.strip().lower() | |
| 316 | + if not args or args == "list": | |
| 317 | + lines = ["Available providers:"] | |
| 318 | + known = [ | |
| 319 | + "openai", | |
| 320 | + "anthropic", | |
| 321 | + "gemini", | |
| 322 | + "ollama", | |
| 323 | + "azure", | |
| 324 | + "together", | |
| 325 | + "fireworks", | |
| 326 | + "cerebras", | |
| 327 | + "xai", | |
| 328 | + ] | |
| 329 | + import os | |
| 330 | + | |
| 331 | + key_map = { | |
| 332 | + "openai": "OPENAI_API_KEY", | |
| 333 | + "anthropic": "ANTHROPIC_API_KEY", | |
| 334 | + "gemini": "GEMINI_API_KEY", | |
| 335 | + "azure": "AZURE_OPENAI_API_KEY", | |
| 336 | + "together": "TOGETHER_API_KEY", | |
| 337 | + "fireworks": "FIREWORKS_API_KEY", | |
| 338 | + "cerebras": "CEREBRAS_API_KEY", | |
| 339 | + "xai": "XAI_API_KEY", | |
| 340 | + } | |
| 341 | + current = getattr(self.provider_manager, "provider", self._provider_name) | |
| 342 | + for name in known: | |
| 343 | + env = key_map.get(name) | |
| 344 | + has_key = bool(os.environ.get(env, "")) if env else None | |
| 345 | + if name == "ollama": | |
| 346 | + status = "local" | |
| 347 | + elif has_key: | |
| 348 | + status = "ready" | |
| 349 | + else: | |
| 350 | + status = "no key" | |
| 351 | + active = " (active)" if name == current else "" | |
| 352 | + lines.append(f" {name}: {status}{active}") | |
| 353 | + lines.append(f"\nCurrent: {current or 'none'}") | |
| 354 | + return "\n".join(lines) | |
| 355 | + | |
| 356 | + # Switch provider | |
| 357 | + self._provider_name = args | |
| 358 | + self._chat_model = None | |
| 359 | + self._init_provider() | |
| 360 | + self._init_agent() | |
| 361 | + if self.provider_manager: | |
| 362 | + return f"Switched to provider: {args}" | |
| 363 | + return f"Failed to initialise provider: {args}" | |
| 364 | + | |
| 365 | + def _cmd_model(self, args: str) -> str: | |
| 366 | + """Switch the chat model.""" | |
| 367 | + args = args.strip() | |
| 368 | + if not args: | |
| 369 | + current = self._chat_model or "default" | |
| 370 | + return f"Current model: {current}\nUsage: /model MODEL_NAME" | |
| 371 | + self._chat_model = args | |
| 372 | + self._init_provider() | |
| 373 | + self._init_agent() | |
| 374 | + if self.provider_manager: | |
| 375 | + return f"Switched to model: {args}" | |
| 376 | + return f"Failed to initialise with model: {args}" | |
| 377 | + | |
| 378 | + def _cmd_chat(self, message: str) -> str: | |
| 379 | + if not self.provider_manager or not self.agent: | |
| 380 | + return ( | |
| 381 | + "Chat requires an LLM provider. Set one of:\n" | |
| 382 | + " OPENAI_API_KEY\n" | |
| 383 | + " ANTHROPIC_API_KEY\n" | |
| 384 | + " GEMINI_API_KEY\n" | |
| 385 | + "Or pass --provider / --chat-model." | |
| 386 | + ) | |
| 387 | + try: | |
| 388 | + return self.agent.chat(message) | |
| 389 | + except Exception as exc: | |
| 390 | + return f"Chat error: {exc}" | |
| 391 | + | |
| 392 | + # ── Main dispatch ── | |
| 393 | + | |
| 394 | + def handle_input(self, line: str) -> str: | |
| 395 | + """Process a single def run find_nearest_graph, | |
| 396 | + Main REPL loop.""" | |
| 397 | + self._discover() | |
| 398 | + self._init_provider() | |
| 399 | + | |
| 400 | + print(self._welcome_banner()) | |
| 401 | + | |
| 402 | + while True: | |
| 403 | + try: | |
| 404 | + line = input("planopticon> ") | |
| 405 | + except (KeyboardInterrupt, EOFError): | |
| 406 | + print("\nBye.") | |
| 407 | + break | |
| 408 | + | |
| 409 | + output = self.handle_input(line) | |
| 410 | + if output == "__QUIT__": | |
| 411 | + print("Bye.") | |
| 412 | + break | |
| 413 | + if output: | |
| 414 | + print(output) |
| --- a/video_processor/cli/companion.py | |
| +++ b/video_processor/cli/companion.py | |
| @@ -0,0 +1,414 @@ | |
| --- a/video_processor/cli/companion.py | |
| +++ b/video_processor/cli/companion.py | |
| @@ -0,0 +1,414 @@ | |
| 1 | """Interactive planning companion REPL for PlanOpticon.""" |
| 2 | |
| 3 | import logging |
| 4 | from pathlib import Path |
| 5 | from typing import List, Optional |
| 6 | |
| 7 | logger = logging.getLogger(__name__) |
| 8 | |
| 9 | VIDEO_EXTS = {".mp4", ".mkv", ".webm"} |
| 10 | DOC_EXTS = {".md", ".pdf", ".docx"} |
| 11 | |
| 12 | |
| 13 | class CompanionREPL: |
| 14 | """Smart REPL with workspace awareness and KG querying.""" |
| 15 | |
| 16 | def __init__( |
| 17 | self, |
| 18 | kb_paths: Optional[List[str]] = None, |
| 19 | provider: str = "auto", |
| 20 | chat_model: Optional[str] = None, |
| 21 | ): |
| 22 | self.kg = None |
| 23 | self.query_engine = None |
| 24 | self.agent = None |
| 25 | self.provider_manager = None |
| 26 | self._kb_paths = kb_paths or [] |
| 27 | self._provider_name = provider |
| 28 | self._chat_model = chat_model |
| 29 | self._videos: List[Path] = [] |
| 30 | self._docs: List[Path] = [] |
| 31 | self._kg_path: Optional[Path] = None |
| 32 | |
| 33 | def _discover(self) -> None: |
| 34 | """Auto-discover workspace context.""" |
| 35 | # Discover knowledge graphs |
| 36 | from video_processor.integrators.graph_discovery import ( |
| 37 | find_nearest_graph, |
| 38 | ) |
| 39 | |
| 40 | if self._kb_paths: |
| 41 | # Use explicit paths |
| 42 | self._kg_path = Path(self._kb_paths[0]) |
| 43 | else: |
| 44 | self._kg_path = find_nearest_graph() |
| 45 | |
| 46 | if self._kg_path and self._kg_path.exists(): |
| 47 | self._load_kg(self._kg_path) |
| 48 | |
| 49 | # Scan for media and doc files in cwd |
| 50 | cwd = Path.cwd() |
| 51 | try: |
| 52 | for f in sorted(cwd.iterdir()): |
| 53 | if f.suffix.lower() in VIDEO_EXTS: |
| 54 | self._videos.append(f) |
| 55 | elif f.suffix.lower() in DOC_EXTS: |
| 56 | self._docs.append(f) |
| 57 | except PermissionError: |
| 58 | pass |
| 59 | |
| 60 | def _load_kg(self, path: Path) -> None: |
| 61 | """Load a knowledge graph from a file path.""" |
| 62 | from video_processor.integrators.graph_query import ( |
| 63 | GraphQueryEngine, |
| 64 | ) |
| 65 | |
| 66 | try: |
| 67 | if path.suffix == ".json": |
| 68 | self.query_engine = GraphQueryEngine.from_json_path(path) |
| 69 | else: |
| 70 | self.query_engine = GraphQueryEngine.from_db_path(path) |
| 71 | self.kg = self.query_engine.store |
| 72 | except Exception as exc: |
| 73 | logger.debug("Failed to load KG at %s: %s", path, exc) |
| 74 | |
| 75 | def _init_provider(self) -> None: |
| 76 | """Try to initialise an LLM provider.""" |
| 77 | try: |
| 78 | from video_processor.providers.manager import ( |
| 79 | ProviderManager, |
| 80 | ) |
| 81 | |
| 82 | prov = None if self._provider_name == "auto" else self._provider_name |
| 83 | self.provider_manager = ProviderManager( |
| 84 | chat_model=self._chat_model, |
| 85 | provider=prov, |
| 86 | ) |
| 87 | except Exception: |
| 88 | self.provider_manager = None |
| 89 | |
| 90 | def _init_agent(self) -> None: |
| 91 | """Create a PlanningAgent if possible.""" |
| 92 | try: |
| 93 | from video_processor.agent.agent_loop import ( |
| 94 | PlanningAgent, |
| 95 | ) |
| 96 | from video_processor.agent.skills.base import ( |
| 97 | AgentContext, |
| 98 | ) |
| 99 | |
| 100 | ctx = AgentContext( |
| 101 | knowledge_graph=self.kg, |
| 102 | query_engine=self.query_engine, |
| 103 | provider_manager=self.provider_manager, |
| 104 | ) |
| 105 | self.agent = PlanningAgent(context=ctx) |
| 106 | except Exception: |
| 107 | self.agent = None |
| 108 | |
| 109 | def _welcome_banner(self) -> str: |
| 110 | """Build the welcome banner text.""" |
| 111 | lines = [ |
| 112 | "", |
| 113 | " PlanOpticon Companion", |
| 114 | " Interactive planning REPL", |
| 115 | "", |
| 116 | ] |
| 117 | |
| 118 | if self._kg_path and self.query_engine: |
| 119 | stats = self.query_engine.stats().data |
| 120 | lines.append( |
| 121 | f" Knowledge graph: {self._kg_path.name}" |
| 122 | f" ({stats['entity_count']} entities," |
| 123 | f" {stats['relationship_count']} relationships)" |
| 124 | ) |
| 125 | else: |
| 126 | lines.append(" No knowledge graph loaded.") |
| 127 | |
| 128 | if self._videos: |
| 129 | names = ", ".join(v.name for v in self._videos[:3]) |
| 130 | suffix = f" (+{len(self._videos) - 3} more)" if len(self._videos) > 3 else "" |
| 131 | lines.append(f" Videos: {names}{suffix}") |
| 132 | |
| 133 | if self._docs: |
| 134 | names = ", ".join(d.name for d in self._docs[:3]) |
| 135 | suffix = f" (+{len(self._docs) - 3} more)" if len(self._docs) > 3 else "" |
| 136 | lines.append(f" Docs: {names}{suffix}") |
| 137 | |
| 138 | if self.provider_manager: |
| 139 | prov = getattr(self.provider_manager, "provider", self._provider_name) |
| 140 | model = self._chat_model or "default" |
| 141 | lines.append(f" LLM provider: {prov} (model: {model})") |
| 142 | else: |
| 143 | lines.append(" LLM provider: none") |
| 144 | lines.append("") |
| 145 | lines.append(" Type /help for commands, or ask a question.") |
| 146 | lines.append("") |
| 147 | return "\n".join(lines) |
| 148 | |
| 149 | # ── Command handlers ── |
| 150 | |
| 151 | def _cmd_help(self) -> str: |
| 152 | lines = [ |
| 153 | "Available commands:", |
| 154 | " /help Show this help", |
| 155 | " /status Workspace status", |
| 156 | " /skills List available skills", |
| 157 | " /entities [--type T] List KG entities", |
| 158 | " /search TERM Search entities by name", |
| 159 | " /neighbors ENTITY Show entity relationships", |
| 160 | " /export FORMAT Export KG (markdown, obsidian, notion, csv)", |
| 161 | " /analyze PATH Analyze a video/doc", |
| 162 | " /ingest PATH Ingest a file into the KG", |
| 163 | " /auth SERVICE Authenticate with a cloud service", |
| 164 | " /provider [NAME] List or switch LLM provider", |
| 165 | " /model [NAME] Show or switch chat model", |
| 166 | " /run SKILL Run a skill by name", |
| 167 | " /plan Run project_plan skill", |
| 168 | " /prd Run PRD skill", |
| 169 | " /tasks Run task_breakdown skill", |
| 170 | " /quit, /exit Exit companion", |
| 171 | "", |
| 172 | "Any other input is sent to the chat agent (requires LLM).", |
| 173 | ] |
| 174 | return "\n".join(lines) |
| 175 | |
| 176 | def _cmd_status(self) -> str: |
| 177 | lines = ["Workspace status:"] |
| 178 | if self._kg_path and self.query_engine: |
| 179 | stats = self.query_engine.stats().data |
| 180 | lines.append( |
| 181 | f" KG: {self._kg_path}" |
| 182 | f" ({stats['entity_count']} entities," |
| 183 | f" {stats['relationship_count']} relationships)" |
| 184 | ) |
| 185 | if stats.get("entity_types"): |
| 186 | for t, c in sorted( |
| 187 | stats["entity_types"].items(), |
| 188 | key=lambda x: -x[1], |
| 189 | ): |
| 190 | lines.append(f" {t}: {c}") |
| 191 | else: |
| 192 | lines.append(" KG: not loaded") |
| 193 | |
| 194 | lines.append(f" Videos: {len(self._videos)} found") |
| 195 | lines.append(f" Docs: {len(self._docs)} found") |
| 196 | lines.append(f" Provider: {'active' if self.provider_manager else 'none'}") |
| 197 | return "\n".join(lines) |
| 198 | |
| 199 | def _cmd_skills(self) -> str: |
| 200 | from video_processor.agent.skills.base import ( |
| 201 | list_skills, |
| 202 | ) |
| 203 | |
| 204 | skills = list_skills() |
| 205 | if not skills: |
| 206 | return "No skills registered." |
| 207 | lines = ["Available skills:"] |
| 208 | for s in skills: |
| 209 | lines.append(f" {s.name}: {s.description}") |
| 210 | return "\n".join(lines) |
| 211 | |
| 212 | def _cmd_entities(self, args: str) -> str: |
| 213 | if not self.query_engine: |
| 214 | return "No knowledge graph loaded." |
| 215 | entity_type = None |
| 216 | parts = args.split() |
| 217 | for i, part in enumerate(parts): |
| 218 | if part == "--type" and i + 1 < len(parts): |
| 219 | entity_type = parts[i + 1] |
| 220 | result = self.query_engine.entities( |
| 221 | entity_type=entity_type, |
| 222 | ) |
| 223 | return result.to_text() |
| 224 | |
| 225 | def _cmd_search(self, term: str) -> str: |
| 226 | if not self.query_engine: |
| 227 | return "No knowledge graph loaded." |
| 228 | term = term.strip() |
| 229 | if not term: |
| 230 | return "Usage: /search TERM" |
| 231 | result = self.query_engine.entities(name=term) |
| 232 | return result.to_text() |
| 233 | |
| 234 | def _cmd_neighbors(self, entity: str) -> str: |
| 235 | if not self.query_engine: |
| 236 | return "No knowledge graph loaded." |
| 237 | entity = entity.strip() |
| 238 | if not entity: |
| 239 | return "Usage: /neighbors ENTITY" |
| 240 | result = self.query_engine.neighbors(entity) |
| 241 | return result.to_text() |
| 242 | |
| 243 | def _cmd_export(self, fmt: str) -> str: |
| 244 | fmt = fmt.strip().lower() |
| 245 | if not fmt: |
| 246 | return "Usage: /export FORMAT (markdown, obsidian, notion, csv)" |
| 247 | if not self._kg_path: |
| 248 | return "No knowledge graph loaded." |
| 249 | return ( |
| 250 | f"Export '{fmt}' requested. Use the CLI command:\n" |
| 251 | f" planopticon export {fmt} {self._kg_path}" |
| 252 | ) |
| 253 | |
| 254 | def _cmd_analyze(self, path_str: str) -> str: |
| 255 | path_str = path_str.strip() |
| 256 | if not path_str: |
| 257 | return "Usage: /analyze PATH" |
| 258 | p = Path(path_str) |
| 259 | if not p.exists(): |
| 260 | return f"File not found: {path_str}" |
| 261 | return f"Analyze requested for {p.name}. Use the CLI:\n planopticon analyze -i {p}" |
| 262 | |
| 263 | def _cmd_ingest(self, path_str: str) -> str: |
| 264 | path_str = path_str.strip() |
| 265 | if not path_str: |
| 266 | return "Usage: /ingest PATH" |
| 267 | p = Path(path_str) |
| 268 | if not p.exists(): |
| 269 | return f"File not found: {path_str}" |
| 270 | return f"Ingest requested for {p.name}. Use the CLI:\n planopticon ingest {p}" |
| 271 | |
| 272 | def _cmd_run_skill(self, skill_name: str) -> str: |
| 273 | skill_name = skill_name.strip() |
| 274 | if not skill_name: |
| 275 | return "Usage: /run SKILL_NAME" |
| 276 | from video_processor.agent.skills.base import ( |
| 277 | get_skill, |
| 278 | ) |
| 279 | |
| 280 | skill = get_skill(skill_name) |
| 281 | if not skill: |
| 282 | return f"Unknown skill: {skill_name}" |
| 283 | if not self.agent: |
| 284 | return "Agent not initialised (no LLM provider?)." |
| 285 | if not skill.can_execute(self.agent.context): |
| 286 | return f"Skill '{skill_name}' cannot execute in current context." |
| 287 | try: |
| 288 | artifact = skill.execute(self.agent.context) |
| 289 | return f"--- {artifact.name} ({artifact.artifact_type}) ---\n{artifact.content}" |
| 290 | except Exception as exc: |
| 291 | return f"Skill execution failed: {exc}" |
| 292 | |
| 293 | def _cmd_auth(self, args: str) -> str: |
| 294 | """Authenticate with a cloud service.""" |
| 295 | service = args.strip().lower() |
| 296 | if not service: |
| 297 | from video_processor.auth import KNOWN_CONFIGS |
| 298 | |
| 299 | services = ", ".join(sorted(KNOWN_CONFIGS.keys())) |
| 300 | return f"Usage: /auth SERVICE\nAvailable: {services}" |
| 301 | |
| 302 | from video_processor.auth import get_auth_manager |
| 303 | |
| 304 | manager = get_auth_manager(service) |
| 305 | if not manager: |
| 306 | return f"Unknown service: {service}" |
| 307 | |
| 308 | result = manager.authenticate() |
| 309 | if result.success: |
| 310 | return f"{service.title()} authenticated ({result.method})" |
| 311 | return f"{service.title()} auth failed: {result.error}" |
| 312 | |
| 313 | def _cmd_provider(self, args: str) -> str: |
| 314 | """List available providers or switch to a specific one.""" |
| 315 | args = args.strip().lower() |
| 316 | if not args or args == "list": |
| 317 | lines = ["Available providers:"] |
| 318 | known = [ |
| 319 | "openai", |
| 320 | "anthropic", |
| 321 | "gemini", |
| 322 | "ollama", |
| 323 | "azure", |
| 324 | "together", |
| 325 | "fireworks", |
| 326 | "cerebras", |
| 327 | "xai", |
| 328 | ] |
| 329 | import os |
| 330 | |
| 331 | key_map = { |
| 332 | "openai": "OPENAI_API_KEY", |
| 333 | "anthropic": "ANTHROPIC_API_KEY", |
| 334 | "gemini": "GEMINI_API_KEY", |
| 335 | "azure": "AZURE_OPENAI_API_KEY", |
| 336 | "together": "TOGETHER_API_KEY", |
| 337 | "fireworks": "FIREWORKS_API_KEY", |
| 338 | "cerebras": "CEREBRAS_API_KEY", |
| 339 | "xai": "XAI_API_KEY", |
| 340 | } |
| 341 | current = getattr(self.provider_manager, "provider", self._provider_name) |
| 342 | for name in known: |
| 343 | env = key_map.get(name) |
| 344 | has_key = bool(os.environ.get(env, "")) if env else None |
| 345 | if name == "ollama": |
| 346 | status = "local" |
| 347 | elif has_key: |
| 348 | status = "ready" |
| 349 | else: |
| 350 | status = "no key" |
| 351 | active = " (active)" if name == current else "" |
| 352 | lines.append(f" {name}: {status}{active}") |
| 353 | lines.append(f"\nCurrent: {current or 'none'}") |
| 354 | return "\n".join(lines) |
| 355 | |
| 356 | # Switch provider |
| 357 | self._provider_name = args |
| 358 | self._chat_model = None |
| 359 | self._init_provider() |
| 360 | self._init_agent() |
| 361 | if self.provider_manager: |
| 362 | return f"Switched to provider: {args}" |
| 363 | return f"Failed to initialise provider: {args}" |
| 364 | |
| 365 | def _cmd_model(self, args: str) -> str: |
| 366 | """Switch the chat model.""" |
| 367 | args = args.strip() |
| 368 | if not args: |
| 369 | current = self._chat_model or "default" |
| 370 | return f"Current model: {current}\nUsage: /model MODEL_NAME" |
| 371 | self._chat_model = args |
| 372 | self._init_provider() |
| 373 | self._init_agent() |
| 374 | if self.provider_manager: |
| 375 | return f"Switched to model: {args}" |
| 376 | return f"Failed to initialise with model: {args}" |
| 377 | |
| 378 | def _cmd_chat(self, message: str) -> str: |
| 379 | if not self.provider_manager or not self.agent: |
| 380 | return ( |
| 381 | "Chat requires an LLM provider. Set one of:\n" |
| 382 | " OPENAI_API_KEY\n" |
| 383 | " ANTHROPIC_API_KEY\n" |
| 384 | " GEMINI_API_KEY\n" |
| 385 | "Or pass --provider / --chat-model." |
| 386 | ) |
| 387 | try: |
| 388 | return self.agent.chat(message) |
| 389 | except Exception as exc: |
| 390 | return f"Chat error: {exc}" |
| 391 | |
| 392 | # ── Main dispatch ── |
| 393 | |
| 394 | def handle_input(self, line: str) -> str: |
| 395 | """Process a single def run find_nearest_graph, |
| 396 | Main REPL loop.""" |
| 397 | self._discover() |
| 398 | self._init_provider() |
| 399 | |
| 400 | print(self._welcome_banner()) |
| 401 | |
| 402 | while True: |
| 403 | try: |
| 404 | line = input("planopticon> ") |
| 405 | except (KeyboardInterrupt, EOFError): |
| 406 | print("\nBye.") |
| 407 | break |
| 408 | |
| 409 | output = self.handle_input(line) |
| 410 | if output == "__QUIT__": |
| 411 | print("Bye.") |
| 412 | break |
| 413 | if output: |
| 414 | print(output) |
+209
| --- a/video_processor/exchange.py | ||
| +++ b/video_processor/exchange.py | ||
| @@ -0,0 +1,209 @@ | ||
| 1 | +"""PlanOpticonExchange -- canonical interchange format. | |
| 2 | + | |
| 3 | +Every command produces it, every export adapter consumes it. | |
| 4 | +""" | |
| 5 | + | |
| 6 | +from __future__ import annotations | |
| 7 | + | |
| 8 | +import json | |
| 9 | +from datetime import datetime | |
| 10 | +from pathlib import Path | |
| 11 | +from typing import Any, Dict, List, Optional | |
| 12 | + | |
| 13 | +from pydantic import BaseModel, Field | |
| 14 | + | |
| 15 | +from video_processor.models import Entity, Relationship, SourceRecord | |
| 16 | + | |
| 17 | + | |
class ArtifactMeta(BaseModel):
    """Pydantic mirror of the Artifact dataclass for serialisation.

    Carries a generated artifact (plan, PRD, roadmap, ...) inside a
    PlanOpticonExchange payload so it can be serialised with the rest of
    the exchange document.
    """

    name: str = Field(description="Artifact name")
    content: str = Field(description="Generated content (markdown, json, etc.)")
    # NOTE(review): artifact_type and format are plain strings, not enums —
    # values outside the documented sets are not rejected here.
    artifact_type: str = Field(
        description="Artifact kind: project_plan, prd, roadmap, task_list, document, issues"
    )
    format: str = Field(
        default="markdown",
        description="Content format: markdown, json, mermaid",
    )
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Arbitrary key-value metadata",
    )
| 35 | + | |
class ProjectMeta(BaseModel):
    """Lightweight project descriptor embedded in an exchange payload."""

    name: str = Field(description="Project name")
    description: str = Field(
        default="",
        description="Short project description",
    )
    # NOTE(review): datetime.now() produces a naive local-time timestamp;
    # confirm whether UTC (datetime.now(timezone.utc)) is intended for
    # interchange documents.
    created_at: str = Field(
        default_factory=lambda: datetime.now().isoformat(),
        description="ISO-8601 creation timestamp",
    )
    updated_at: str = Field(
        default_factory=lambda: datetime.now().isoformat(),
        description="ISO-8601 last-updated timestamp",
    )
    tags: List[str] = Field(
        default_factory=list,
        description="Freeform tags for categorisation",
    )
| 56 | + | |
| 57 | + | |
class PlanOpticonExchange(BaseModel):
    """Wire format for PlanOpticon data interchange.

    Produced by every command, consumed by every export adapter.
    """

    version: str = Field(
        default="1.0",
        description="Schema version of this exchange payload",
    )
    project: ProjectMeta = Field(
        description="Project-level metadata",
    )
    entities: List[Entity] = Field(
        default_factory=list,
        description="Knowledge-graph entities",
    )
    relationships: List[Relationship] = Field(
        default_factory=list,
        description="Knowledge-graph relationships",
    )
    artifacts: List[ArtifactMeta] = Field(
        default_factory=list,
        description="Generated artifacts (plans, PRDs, etc.)",
    )
    sources: List[SourceRecord] = Field(
        default_factory=list,
        description="Content-source provenance records",
    )

    # ------------------------------------------------------------------
    # Convenience helpers
    # ------------------------------------------------------------------

    @classmethod
    def json_schema(cls) -> Dict[str, Any]:
        """Return the JSON Schema for validation / documentation."""
        return cls.model_json_schema()

    @classmethod
    def from_knowledge_graph(
        cls,
        kg_data: Dict[str, Any],
        *,
        project_name: str = "Untitled",
        project_description: str = "",
        tags: Optional[List[str]] = None,
    ) -> "PlanOpticonExchange":
        """Build an exchange payload from a ``KnowledgeGraph.to_dict()`` dict.

        The dict is expected to have ``nodes`` and ``relationships`` keys,
        with an optional ``sources`` key; missing keys yield empty lists.
        """
        entities = [Entity(**_normalise_entity(n)) for n in kg_data.get("nodes", [])]
        relationships = [
            Relationship(**_normalise_relationship(r)) for r in kg_data.get("relationships", [])
        ]
        sources = [SourceRecord(**s) for s in kg_data.get("sources", [])]

        # Stamp both timestamps with the same instant so a fresh payload
        # reads as "never updated since creation".
        now = datetime.now().isoformat()
        project = ProjectMeta(
            name=project_name,
            description=project_description,
            created_at=now,
            updated_at=now,
            tags=tags or [],
        )

        return cls(
            project=project,
            entities=entities,
            relationships=relationships,
            sources=sources,
        )

    # ------------------------------------------------------------------
    # File I/O
    # ------------------------------------------------------------------

    def to_file(self, path: str | Path) -> Path:
        """Serialise this exchange to a JSON file, creating parent dirs.

        Returns the ``Path`` that was written.
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8: the platform default encoding (e.g. cp1252 on
        # Windows) could corrupt non-ASCII entity names in the payload.
        path.write_text(self.model_dump_json(indent=2), encoding="utf-8")
        return path

    @classmethod
    def from_file(cls, path: str | Path) -> "PlanOpticonExchange":
        """Deserialise an exchange from a JSON file.

        Raises ``FileNotFoundError`` if *path* is missing and pydantic's
        ``ValidationError`` if the payload does not match the schema.
        """
        path = Path(path)
        raw = json.loads(path.read_text(encoding="utf-8"))
        return cls.model_validate(raw)

    # ------------------------------------------------------------------
    # Merge
    # ------------------------------------------------------------------

    def merge(self, other: "PlanOpticonExchange") -> None:
        """Merge *other* into this exchange in place.

        Dedup keys: entities and artifacts by ``name``, relationships by
        ``(source, target, type)``, sources by ``source_id``. Existing
        items win; ``project.updated_at`` is refreshed afterwards.
        """
        existing_names = {e.name for e in self.entities}
        for entity in other.entities:
            if entity.name not in existing_names:
                self.entities.append(entity)
                existing_names.add(entity.name)

        existing_rels = {(r.source, r.target, r.type) for r in self.relationships}
        for rel in other.relationships:
            key = (rel.source, rel.target, rel.type)
            if key not in existing_rels:
                self.relationships.append(rel)
                existing_rels.add(key)

        existing_artifact_names = {a.name for a in self.artifacts}
        for artifact in other.artifacts:
            if artifact.name not in existing_artifact_names:
                self.artifacts.append(artifact)
                existing_artifact_names.add(artifact.name)

        existing_source_ids = {s.source_id for s in self.sources}
        for source in other.sources:
            if source.source_id not in existing_source_ids:
                self.sources.append(source)
                existing_source_ids.add(source.source_id)

        self.project.updated_at = datetime.now().isoformat()
| 183 | + | |
| 184 | + | |
| 185 | +# ------------------------------------------------------------------ | |
| 186 | +# Internal helpers | |
| 187 | +# ------------------------------------------------------------------ | |
| 188 | + | |
| 189 | + | |
| 190 | +def _normalise_entity(raw: Dict[str, Any]) -> Dict[str, Any]: | |
| 191 | + """Coerce a KG node dict into Entity-compatible kwargs.""" | |
| 192 | + return { | |
| 193 | + "name": raw.get("name", raw.get("id", "")), | |
| 194 | + "type": raw.get("type", "concept"), | |
| 195 | + "descriptions": list(raw.get("descriptions", [])), | |
| 196 | + "source": raw.get("source"), | |
| 197 | + "occurrences": raw.get("occurrences", []), | |
| 198 | + } | |
| 199 | + | |
| 200 | + | |
| 201 | +def _normalise_relationship(raw: Dict[str, Any]) -> Dict[str, Any]: | |
| 202 | + """Coerce a KG relationship dict into Relationship-compatible kwargs.""" | |
| 203 | + return { | |
| 204 | + "source": raw.get("source", ""), | |
| 205 | + "target": raw.get("target", ""), | |
| 206 | + "type": raw.get("type", "related_to"), | |
| 207 | + "content_source": raw.get("content_source"), | |
| 208 | + "timestamp": raw.get("timestamp"), | |
| 209 | + } |
| --- a/video_processor/exchange.py | |
| +++ b/video_processor/exchange.py | |
| @@ -0,0 +1,209 @@ | |
| --- a/video_processor/exchange.py | |
| +++ b/video_processor/exchange.py | |
| @@ -0,0 +1,209 @@ | |
| 1 | """PlanOpticonExchange -- canonical interchange format. |
| 2 | |
| 3 | Every command produces it, every export adapter consumes it. |
| 4 | """ |
| 5 | |
| 6 | from __future__ import annotations |
| 7 | |
| 8 | import json |
| 9 | from datetime import datetime |
| 10 | from pathlib import Path |
| 11 | from typing import Any, Dict, List, Optional |
| 12 | |
| 13 | from pydantic import BaseModel, Field |
| 14 | |
| 15 | from video_processor.models import Entity, Relationship, SourceRecord |
| 16 | |
| 17 | |
class ArtifactMeta(BaseModel):
    """Pydantic mirror of the Artifact dataclass for serialisation."""

    # `name` also serves as the dedup key in PlanOpticonExchange.merge().
    name: str = Field(description="Artifact name")
    content: str = Field(description="Generated content (markdown, json, etc.)")
    artifact_type: str = Field(
        description="Artifact kind: project_plan, prd, roadmap, task_list, document, issues"
    )
    # NOTE: shadows the `format` builtin, but renaming it would change the
    # JSON wire format -- leave as-is.
    format: str = Field(
        default="markdown",
        description="Content format: markdown, json, mermaid",
    )
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Arbitrary key-value metadata",
    )
| 34 | |
| 35 | |
class ProjectMeta(BaseModel):
    """Lightweight project descriptor embedded in an exchange payload."""

    name: str = Field(description="Project name")
    description: str = Field(
        default="",
        description="Short project description",
    )
    # NOTE(review): timestamps are naive local time (datetime.now()), not
    # UTC-aware -- fine for display, but confirm before relying on them for
    # cross-machine ordering.
    created_at: str = Field(
        default_factory=lambda: datetime.now().isoformat(),
        description="ISO-8601 creation timestamp",
    )
    updated_at: str = Field(
        default_factory=lambda: datetime.now().isoformat(),
        description="ISO-8601 last-updated timestamp",
    )
    tags: List[str] = Field(
        default_factory=list,
        description="Freeform tags for categorisation",
    )
| 56 | |
| 57 | |
class PlanOpticonExchange(BaseModel):
    """Wire format for PlanOpticon data interchange.

    Produced by every command, consumed by every export adapter.
    """

    version: str = Field(
        default="1.0",
        description="Schema version of this exchange payload",
    )
    project: ProjectMeta = Field(
        description="Project-level metadata",
    )
    entities: List[Entity] = Field(
        default_factory=list,
        description="Knowledge-graph entities",
    )
    relationships: List[Relationship] = Field(
        default_factory=list,
        description="Knowledge-graph relationships",
    )
    artifacts: List[ArtifactMeta] = Field(
        default_factory=list,
        description="Generated artifacts (plans, PRDs, etc.)",
    )
    sources: List[SourceRecord] = Field(
        default_factory=list,
        description="Content-source provenance records",
    )

    # ------------------------------------------------------------------
    # Convenience helpers
    # ------------------------------------------------------------------

    @classmethod
    def json_schema(cls) -> Dict[str, Any]:
        """Return the JSON Schema for validation / documentation."""
        return cls.model_json_schema()

    @classmethod
    def from_knowledge_graph(
        cls,
        kg_data: Dict[str, Any],
        *,
        project_name: str = "Untitled",
        project_description: str = "",
        tags: Optional[List[str]] = None,
    ) -> "PlanOpticonExchange":
        """Build an exchange payload from a ``KnowledgeGraph.to_dict()`` dict.

        The dict is expected to have ``nodes`` and ``relationships`` keys,
        with an optional ``sources`` key; missing keys yield empty lists.
        """
        entities = [Entity(**_normalise_entity(n)) for n in kg_data.get("nodes", [])]
        relationships = [
            Relationship(**_normalise_relationship(r)) for r in kg_data.get("relationships", [])
        ]
        sources = [SourceRecord(**s) for s in kg_data.get("sources", [])]

        # Stamp both timestamps with the same instant so a fresh payload
        # reads as "never updated since creation".
        now = datetime.now().isoformat()
        project = ProjectMeta(
            name=project_name,
            description=project_description,
            created_at=now,
            updated_at=now,
            tags=tags or [],
        )

        return cls(
            project=project,
            entities=entities,
            relationships=relationships,
            sources=sources,
        )

    # ------------------------------------------------------------------
    # File I/O
    # ------------------------------------------------------------------

    def to_file(self, path: str | Path) -> Path:
        """Serialise this exchange to a JSON file, creating parent dirs.

        Returns the ``Path`` that was written.
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8: the platform default encoding (e.g. cp1252 on
        # Windows) could corrupt non-ASCII entity names in the payload.
        path.write_text(self.model_dump_json(indent=2), encoding="utf-8")
        return path

    @classmethod
    def from_file(cls, path: str | Path) -> "PlanOpticonExchange":
        """Deserialise an exchange from a JSON file.

        Raises ``FileNotFoundError`` if *path* is missing and pydantic's
        ``ValidationError`` if the payload does not match the schema.
        """
        path = Path(path)
        raw = json.loads(path.read_text(encoding="utf-8"))
        return cls.model_validate(raw)

    # ------------------------------------------------------------------
    # Merge
    # ------------------------------------------------------------------

    def merge(self, other: "PlanOpticonExchange") -> None:
        """Merge *other* into this exchange in place.

        Dedup keys: entities and artifacts by ``name``, relationships by
        ``(source, target, type)``, sources by ``source_id``. Existing
        items win; ``project.updated_at`` is refreshed afterwards.
        """
        existing_names = {e.name for e in self.entities}
        for entity in other.entities:
            if entity.name not in existing_names:
                self.entities.append(entity)
                existing_names.add(entity.name)

        existing_rels = {(r.source, r.target, r.type) for r in self.relationships}
        for rel in other.relationships:
            key = (rel.source, rel.target, rel.type)
            if key not in existing_rels:
                self.relationships.append(rel)
                existing_rels.add(key)

        existing_artifact_names = {a.name for a in self.artifacts}
        for artifact in other.artifacts:
            if artifact.name not in existing_artifact_names:
                self.artifacts.append(artifact)
                existing_artifact_names.add(artifact.name)

        existing_source_ids = {s.source_id for s in self.sources}
        for source in other.sources:
            if source.source_id not in existing_source_ids:
                self.sources.append(source)
                existing_source_ids.add(source.source_id)

        self.project.updated_at = datetime.now().isoformat()
| 183 | |
| 184 | |
| 185 | # ------------------------------------------------------------------ |
| 186 | # Internal helpers |
| 187 | # ------------------------------------------------------------------ |
| 188 | |
| 189 | |
| 190 | def _normalise_entity(raw: Dict[str, Any]) -> Dict[str, Any]: |
| 191 | """Coerce a KG node dict into Entity-compatible kwargs.""" |
| 192 | return { |
| 193 | "name": raw.get("name", raw.get("id", "")), |
| 194 | "type": raw.get("type", "concept"), |
| 195 | "descriptions": list(raw.get("descriptions", [])), |
| 196 | "source": raw.get("source"), |
| 197 | "occurrences": raw.get("occurrences", []), |
| 198 | } |
| 199 | |
| 200 | |
| 201 | def _normalise_relationship(raw: Dict[str, Any]) -> Dict[str, Any]: |
| 202 | """Coerce a KG relationship dict into Relationship-compatible kwargs.""" |
| 203 | return { |
| 204 | "source": raw.get("source", ""), |
| 205 | "target": raw.get("target", ""), |
| 206 | "type": raw.get("type", "related_to"), |
| 207 | "content_source": raw.get("content_source"), |
| 208 | "timestamp": raw.get("timestamp"), |
| 209 | } |
| --- a/video_processor/exporters/__init__.py | ||
| +++ b/video_processor/exporters/__init__.py | ||
| @@ -0,0 +1 @@ | ||
"""Document exporters for PlanOpticon knowledge graphs."""
| --- a/video_processor/exporters/__init__.py | |
| +++ b/video_processor/exporters/__init__.py | |
| @@ -0,0 +1 @@ | |
| --- a/video_processor/exporters/__init__.py | |
| +++ b/video_processor/exporters/__init__.py | |
| @@ -0,0 +1 @@ | |
"""Document exporters for PlanOpticon knowledge graphs."""
| --- a/video_processor/exporters/markdown.py | ||
| +++ b/video_processor/exporters/markdown.py | ||
| @@ -0,0 +1,500 @@ | ||
| 1 | +"""Generate structured markdown documents from knowledge graphs. | |
| 2 | + | |
| 3 | +No LLM required — pure template-based generation from KG data. | |
| 4 | +Produces federated, curated notes suitable for Obsidian, Notion, | |
| 5 | +GitHub, or any markdown-based workflow. | |
| 6 | +""" | |
| 7 | + | |
| 8 | +import csv | |
| 9 | +import io | |
| 10 | +import logging | |
| 11 | +from datetime import datetime | |
| 12 | +from pathlib import Path | |
| 13 | +from typing import Dict, List, Optional | |
| 14 | + | |
# Module-level logger keyed by module name (standard logging convention).
logger = logging.getLogger(__name__)
| 16 | + | |
| 17 | + | |
| 18 | +def _heading(text: str, level: int = 1) -> str: | |
| 19 | + return f"{'#' * level} {text}" | |
| 20 | + | |
| 21 | + | |
| 22 | +def _table(headers: List[str], rows: List[List[str]]) -> str: | |
| 23 | + lines = ["| " + " | ".join(headers) + " |"] | |
| 24 | + lines.append("| " + " | ".join("---" for _ in headers) + " |") | |
| 25 | + for row in rows: | |
| 26 | + lines.append("| " + " | ".join(str(c) for c in row) + " |") | |
| 27 | + return "\n".join(lines) | |
| 28 | + | |
| 29 | + | |
| 30 | +def _badge(label: str, value: str) -> str: | |
| 31 | + return f"**{label}:** {value}" | |
| 32 | + | |
| 33 | + | |
| 34 | +# --------------------------------------------------------------------------- | |
| 35 | +# Individual document generators | |
| 36 | +# --------------------------------------------------------------------------- | |
| 37 | + | |
| 38 | + | |
def generate_entity_brief(entity: dict, relationships: list) -> str:
    """Generate a one-pager markdown brief for a single entity.

    Args:
        entity: KG node dict (``name``, ``type``, ``descriptions``,
            ``occurrences``).
        relationships: all KG relationship dicts; only those touching
            *entity* are rendered.

    Returns:
        Markdown with Summary, Relates To, Referenced By and Sources
        sections, each omitted when empty.
    """
    name = entity.get("name", "Untitled")
    etype = entity.get("type", "concept")
    descs = entity.get("descriptions", [])
    occs = entity.get("occurrences", [])

    # Use .get throughout (consistent with the rest of this module) so a
    # partially-formed relationship dict cannot raise KeyError.
    outgoing = [
        (r.get("target", ""), r.get("type", ""))
        for r in relationships
        if r.get("source") == name
    ]
    incoming = [
        (r.get("source", ""), r.get("type", ""))
        for r in relationships
        if r.get("target") == name
    ]

    parts = [
        _heading(name),
        "",
        _badge("Type", etype),
        "",
    ]

    if descs:
        parts.append(_heading("Summary", 2))
        parts.append("")
        for d in descs:
            parts.append(f"- {d}")
        parts.append("")

    if outgoing:
        parts.append(_heading("Relates To", 2))
        parts.append("")
        parts.append(_table(["Entity", "Relationship"], [[t, r] for t, r in outgoing]))
        parts.append("")

    if incoming:
        parts.append(_heading("Referenced By", 2))
        parts.append("")
        parts.append(_table(["Entity", "Relationship"], [[s, r] for s, r in incoming]))
        parts.append("")

    if occs:
        parts.append(_heading("Sources", 2))
        parts.append("")
        for occ in occs:
            src = occ.get("source", "unknown")
            ts = occ.get("timestamp", "")
            text = occ.get("text", "")
            line = f"- **{src}**"
            if ts:
                line += f" ({ts})"
            if text:
                line += f" — {text}"
            parts.append(line)
        parts.append("")

    return "\n".join(parts)
| 91 | + | |
| 92 | + | |
def generate_executive_summary(kg_data: dict) -> str:
    """Generate a high-level executive summary from the KG.

    Sections: overall counts, entity breakdown by type, top-10 entities
    by connection count, and relationship-type frequencies.
    """
    nodes = kg_data.get("nodes", [])
    rels = kg_data.get("relationships", [])

    # Bucket entities by their type for the breakdown table.
    by_type: Dict[str, list] = {}
    for n in nodes:
        t = n.get("type", "concept")
        by_type.setdefault(t, []).append(n)

    parts = [
        _heading("Executive Summary"),
        "",
        f"Knowledge base contains **{len(nodes)} entities** "
        f"and **{len(rels)} relationships** across "
        f"**{len(by_type)} categories**.",
        "",
        _heading("Entity Breakdown", 2),
        "",
        _table(
            ["Type", "Count", "Examples"],
            [
                [
                    etype,
                    str(len(elist)),
                    # At most three example names per type.
                    ", ".join(e.get("name", "") for e in elist[:3]),
                ]
                # Most populous types first.
                for etype, elist in sorted(by_type.items(), key=lambda x: -len(x[1]))
            ],
        ),
        "",
    ]

    # Top connected entities: undirected degree = number of appearances
    # as either source or target across all relationships.
    degree: Dict[str, int] = {}
    for r in rels:
        degree[r.get("source", "")] = degree.get(r.get("source", ""), 0) + 1
        degree[r.get("target", "")] = degree.get(r.get("target", ""), 0) + 1

    top = sorted(degree.items(), key=lambda x: -x[1])[:10]
    if top:
        parts.append(_heading("Key Entities (by connections)", 2))
        parts.append("")
        parts.append(
            _table(
                ["Entity", "Connections"],
                [[name, str(deg)] for name, deg in top],
            )
        )
        parts.append("")

    # Relationship type breakdown, most frequent first.
    rel_types: Dict[str, int] = {}
    for r in rels:
        rt = r.get("type", "related_to")
        rel_types[rt] = rel_types.get(rt, 0) + 1

    if rel_types:
        parts.append(_heading("Relationship Types", 2))
        parts.append("")
        parts.append(
            _table(
                ["Type", "Count"],
                [[rt, str(c)] for rt, c in sorted(rel_types.items(), key=lambda x: -x[1])],
            )
        )
        parts.append("")

    return "\n".join(parts)
| 162 | + | |
| 163 | + | |
def generate_meeting_notes(kg_data: dict, title: Optional[str] = None) -> str:
    """Generate meeting notes format from KG data.

    Maps entity types onto meeting-note sections: topics, participants,
    decisions/constraints, checkbox action items, and "loose end"
    entities that have at most one relationship.
    """
    nodes = kg_data.get("nodes", [])
    rels = kg_data.get("relationships", [])
    title = title or "Meeting Notes"

    # Categorize by planning-relevant types
    decisions = [n for n in nodes if n.get("type") in ("decision", "constraint")]
    actions = [n for n in nodes if n.get("type") in ("goal", "feature", "milestone")]
    people = [n for n in nodes if n.get("type") == "person"]
    topics = [n for n in nodes if n.get("type") in ("concept", "technology", "topic")]

    parts = [
        _heading(title),
        "",
        f"*Generated {datetime.now().strftime('%Y-%m-%d %H:%M')}*",
        "",
    ]

    if topics:
        parts.append(_heading("Discussion Topics", 2))
        parts.append("")
        for t in topics:
            # Only the first description is surfaced per entity.
            descs = t.get("descriptions", [])
            desc = descs[0] if descs else ""
            parts.append(f"- **{t['name']}**: {desc}")
        parts.append("")

    if people:
        parts.append(_heading("Participants", 2))
        parts.append("")
        for p in people:
            parts.append(f"- {p['name']}")
        parts.append("")

    if decisions:
        parts.append(_heading("Decisions & Constraints", 2))
        parts.append("")
        for d in decisions:
            descs = d.get("descriptions", [])
            desc = descs[0] if descs else ""
            parts.append(f"- **{d['name']}**: {desc}")
        parts.append("")

    if actions:
        parts.append(_heading("Action Items", 2))
        parts.append("")
        for a in actions:
            descs = a.get("descriptions", [])
            desc = descs[0] if descs else ""
            # Find who it's related to
            owners = [
                r["target"]
                for r in rels
                if r.get("source") == a["name"] and r.get("type") in ("assigned_to", "owned_by")
            ]
            owner_str = f" (@{', '.join(owners)})" if owners else ""
            parts.append(f"- [ ] **{a['name']}**{owner_str}: {desc}")
        parts.append("")

    # Open questions (entities without many relationships)
    degree_map: Dict[str, int] = {}
    for r in rels:
        degree_map[r.get("source", "")] = degree_map.get(r.get("source", ""), 0) + 1
        degree_map[r.get("target", "")] = degree_map.get(r.get("target", ""), 0) + 1

    # NOTE(review): `n not in people` compares node dicts by equality and is
    # O(len(people)) per node -- quadratic in principle, fine at small scales.
    orphans = [n for n in nodes if degree_map.get(n.get("name", ""), 0) <= 1 and n not in people]
    if orphans:
        parts.append(_heading("Open Questions / Loose Ends", 2))
        parts.append("")
        # Cap the section at ten loose ends.
        for o in orphans[:10]:
            parts.append(f"- {o['name']}")
        parts.append("")

    return "\n".join(parts)
| 239 | + | |
| 240 | + | |
def generate_glossary(kg_data: dict) -> str:
    """Generate a glossary/dictionary of all entities.

    Entries are sorted case-insensitively by name and rendered in a
    definition-list style: term line followed by ``: description``.
    """
    lines = [_heading("Glossary"), ""]
    ordered = sorted(kg_data.get("nodes", []), key=lambda n: n.get("name", "").lower())

    for entry in ordered:
        term = entry.get("name", "")
        kind = entry.get("type", "concept")
        summaries = entry.get("descriptions", [])
        if summaries:
            definition = summaries[0]
        else:
            definition = "No description available."
        lines.append(f"**{term}** *({kind})*")
        lines.append(f": {definition}")
        lines.append("")

    return "\n".join(lines)
| 260 | + | |
| 261 | + | |
def generate_relationship_map(kg_data: dict) -> str:
    """Generate a relationship map as a markdown document with Mermaid diagram.

    Tables group edges by relationship type; the Mermaid graph is limited
    to edges between the 20 highest-degree nodes.
    """
    rels = kg_data.get("relationships", [])
    nodes = kg_data.get("nodes", [])

    parts = [
        _heading("Relationship Map"),
        "",
        f"*{len(nodes)} entities, {len(rels)} relationships*",
        "",
    ]

    # Group by relationship type
    by_type: Dict[str, list] = {}
    for r in rels:
        rt = r.get("type", "related_to")
        by_type.setdefault(rt, []).append(r)

    for rt, rlist in sorted(by_type.items()):
        parts.append(_heading(rt.replace("_", " ").title(), 2))
        parts.append("")
        parts.append(
            _table(
                ["Source", "Target"],
                [[r.get("source", ""), r.get("target", "")] for r in rlist],
            )
        )
        parts.append("")

    # Mermaid diagram (top 20 nodes by degree)
    degree: Dict[str, int] = {}
    for r in rels:
        degree[r.get("source", "")] = degree.get(r.get("source", ""), 0) + 1
        degree[r.get("target", "")] = degree.get(r.get("target", ""), 0) + 1

    top_nodes = {name for name, _ in sorted(degree.items(), key=lambda x: -x[1])[:20]}

    if top_nodes:
        parts.append(_heading("Visual Map", 2))
        parts.append("")
        parts.append("```mermaid")
        parts.append("graph LR")

        def safe(s):
            # Mermaid node ids must stay alphanumeric/underscore; squash
            # everything else to underscores.
            return "".join(c if c.isalnum() or c == "_" else "_" for c in s)

        # Only the first edge between a (source, target) pair is drawn,
        # regardless of relationship type.
        seen = set()
        for r in rels:
            src, tgt = r.get("source", ""), r.get("target", "")
            if src in top_nodes and tgt in top_nodes:
                key = (src, tgt)
                if key not in seen:
                    # NOTE(review): labels are interpolated unescaped; a name
                    # containing '"' would break the diagram -- confirm that
                    # upstream sanitises entity names.
                    parts.append(
                        f'    {safe(src)}["{src}"] -->|{r.get("type", "")}| {safe(tgt)}["{tgt}"]'
                    )
                    seen.add(key)
        parts.append("```")
        parts.append("")

    return "\n".join(parts)
| 322 | + | |
| 323 | + | |
def generate_status_report(kg_data: dict, title: Optional[str] = None) -> str:
    """Generate a project status report from KG data.

    Sections: headline counts, milestone list, feature table (first
    description truncated to 60 chars), and risks/constraints.
    """
    nodes = kg_data.get("nodes", [])
    rels = kg_data.get("relationships", [])

    def of_type(*types):
        # Pick out nodes whose type matches one of *types*.
        return [n for n in nodes if n.get("type") in types]

    milestones = of_type("milestone")
    features = of_type("feature")
    risks = of_type("risk", "constraint")
    requirements = of_type("requirement")

    report = [
        _heading(title or "Status Report"),
        "",
        f"*Generated {datetime.now().strftime('%Y-%m-%d %H:%M')}*",
        "",
        _heading("Overview", 2),
        "",
        f"- **Entities:** {len(nodes)}",
        f"- **Relationships:** {len(rels)}",
        f"- **Features:** {len(features)}",
        f"- **Milestones:** {len(milestones)}",
        f"- **Requirements:** {len(requirements)}",
        f"- **Risks/Constraints:** {len(risks)}",
        "",
    ]

    if milestones:
        report.append(_heading("Milestones", 2))
        report.append("")
        for m in milestones:
            summaries = m.get("descriptions", [])
            report.append(f"- **{m['name']}**: {summaries[0] if summaries else 'TBD'}")
        report.append("")

    if features:
        report.append(_heading("Features", 2))
        report.append("")
        feature_rows = [[f["name"], (f.get("descriptions") or [""])[0][:60]] for f in features]
        report.append(_table(["Feature", "Description"], feature_rows))
        report.append("")

    if risks:
        report.append(_heading("Risks & Constraints", 2))
        report.append("")
        for risk in risks:
            summaries = risk.get("descriptions", [])
            report.append(f"- **{risk['name']}**: {summaries[0] if summaries else ''}")
        report.append("")

    return "\n".join(report)
| 380 | + | |
| 381 | + | |
def generate_entity_index(kg_data: dict) -> str:
    """Generate a master index of all entities grouped by type.

    Types are listed alphabetically; entities within a type are sorted
    by name, each annotated with its first description when available.
    """
    nodes = kg_data.get("nodes", [])

    grouped: Dict[str, list] = {}
    for node in nodes:
        grouped.setdefault(node.get("type", "concept"), []).append(node)

    lines = [
        _heading("Entity Index"),
        "",
        f"*{len(nodes)} entities across {len(grouped)} types*",
        "",
    ]

    for kind in sorted(grouped):
        members = grouped[kind]
        lines.append(_heading(f"{kind.title()} ({len(members)})", 2))
        lines.append("")
        for member in sorted(members, key=lambda x: x.get("name", "")):
            summaries = member.get("descriptions", [])
            suffix = f" — {summaries[0]}" if summaries else ""
            lines.append(f"- **{member['name']}**{suffix}")
        lines.append("")

    return "\n".join(lines)
| 408 | + | |
| 409 | + | |
def generate_csv_export(kg_data: dict) -> str:
    """Generate CSV of entities for spreadsheet import.

    Columns: Name, Type, Description (first only), Related To (outgoing
    targets joined with '; '), Source (first occurrence's source).
    """
    nodes = kg_data.get("nodes", [])

    # Outgoing adjacency: entity name -> list of relationship targets.
    outgoing: Dict[str, list] = {}
    for rel in kg_data.get("relationships", []):
        outgoing.setdefault(rel.get("source", ""), []).append(rel.get("target", ""))

    buffer = io.StringIO()
    writer = csv.writer(buffer)
    writer.writerow(["Name", "Type", "Description", "Related To", "Source"])

    for node in sorted(nodes, key=lambda x: x.get("name", "")):
        name = node.get("name", "")
        descriptions = node.get("descriptions", [])
        occurrences = node.get("occurrences", [])
        writer.writerow(
            [
                name,
                node.get("type", ""),
                descriptions[0] if descriptions else "",
                "; ".join(outgoing.get(name, [])),
                occurrences[0].get("source", "") if occurrences else "",
            ]
        )

    return buffer.getvalue()
| 437 | + | |
| 438 | + | |
| 439 | +# --------------------------------------------------------------------------- | |
| 440 | +# Document types registry | |
| 441 | +# --------------------------------------------------------------------------- | |
| 442 | + | |
| 443 | +DOCUMENT_TYPES = { | |
| 444 | + "summary": ("Executive Summary", generate_executive_summary), | |
| 445 | + "meeting-notes": ("Meeting Notes", generate_meeting_notes), | |
| 446 | + "glossary": ("Glossary", generate_glossary), | |
| 447 | + "relationship-map": ("Relationship Map", generate_relationship_map), | |
| 448 | + "status-report": ("Status Report", generate_status_report), | |
| 449 | + "entity-index": ("Entity Index", generate_entity_index), | |
| 450 | + "csv": ("CSV Export", generate_csv_export), | |
| 451 | +} | |
| 452 | + | |
| 453 | + | |
| 454 | +def generate_all( | |
| 455 | + kg_data: dict, | |
| 456 | + output_dir: Path, | |
| 457 | + doc_types: Optional[List[str]] = None, | |
| 458 | + title: Optional[str] = None, | |
| 459 | +) -> List[Path]: | |
| 460 | + """Generate multiple document types and write to output directory. | |
| 461 | + | |
| 462 | + If doc_types is None, generates all available types. | |
| 463 | + Returns list of created file paths. | |
| 464 | + """ | |
| 465 | + output_dir.mkdir(parents=True, exist_ok=True) | |
| 466 | + types_to_generate = doc_types or list(DOCUMENT_TYPES.keys()) | |
| 467 | + created = [] | |
| 468 | + | |
| 469 | + for dtype in types_to_generate: | |
| 470 | + if dtype not in DOCUMENT_TYPES: | |
| 471 | + logger.warning(f"Unknown document type: {dtype}") | |
| 472 | + continue | |
| 473 | + | |
| 474 | + label, generator = DOCUMENT_TYPES[dtype] | |
| 475 | + try: | |
| 476 | + content = generator(kg_data) | |
| 477 | + ext = ".csv" if dtype == "csv" else ".md" | |
| 478 | + filename = f"{dtype}{ext}" | |
| 479 | + path = output_dir / filename | |
| 480 | + path.write_text(content, encoding="utf-8") | |
| 481 | + created.append(path) | |
| 482 | + logger.info(f"Generated {label} → {path}") | |
| 483 | + except Exception as e: | |
| 484 | + logger.error(f"Failed to generate {label}: {e}") | |
| 485 | + | |
| 486 | + # Also generate individual entity briefs | |
| 487 | + briefs_dir = output_dir / "entities" | |
| 488 | + briefs_dir.mkdir(exist_ok=True) | |
| 489 | + rels = kg_data.get("relationships", []) | |
| 490 | + for node in kg_data.get("nodes", []): | |
| 491 | + name = node.get("name", "") | |
| 492 | + if not name: | |
| 493 | + continue | |
| 494 | + safe = name.replace("/", "-").replace("\\", "-").replace(" ", "-") | |
| 495 | + brief = generate_entity_brief(node, rels) | |
| 496 | + path = briefs_dir / f"{safe}.md" | |
| 497 | + path.write_text(brief, encoding="utf-8") | |
| 498 | + created.append(path) | |
| 499 | + | |
| 500 | + return created |
| --- a/video_processor/exporters/markdown.py | |
| +++ b/video_processor/exporters/markdown.py | |
| @@ -0,0 +1,500 @@ | |
| --- a/video_processor/exporters/markdown.py | |
| +++ b/video_processor/exporters/markdown.py | |
| @@ -0,0 +1,500 @@ | |
| 1 | """Generate structured markdown documents from knowledge graphs. |
| 2 | |
| 3 | No LLM required — pure template-based generation from KG data. |
| 4 | Produces federated, curated notes suitable for Obsidian, Notion, |
| 5 | GitHub, or any markdown-based workflow. |
| 6 | """ |
| 7 | |
| 8 | import csv |
| 9 | import io |
| 10 | import logging |
| 11 | from datetime import datetime |
| 12 | from pathlib import Path |
| 13 | from typing import Dict, List, Optional |
| 14 | |
| 15 | logger = logging.getLogger(__name__) |
| 16 | |
| 17 | |
| 18 | def _heading(text: str, level: int = 1) -> str: |
| 19 | return f"{'#' * level} {text}" |
| 20 | |
| 21 | |
| 22 | def _table(headers: List[str], rows: List[List[str]]) -> str: |
| 23 | lines = ["| " + " | ".join(headers) + " |"] |
| 24 | lines.append("| " + " | ".join("---" for _ in headers) + " |") |
| 25 | for row in rows: |
| 26 | lines.append("| " + " | ".join(str(c) for c in row) + " |") |
| 27 | return "\n".join(lines) |
| 28 | |
| 29 | |
| 30 | def _badge(label: str, value: str) -> str: |
| 31 | return f"**{label}:** {value}" |
| 32 | |
| 33 | |
| 34 | # --------------------------------------------------------------------------- |
| 35 | # Individual document generators |
| 36 | # --------------------------------------------------------------------------- |
| 37 | |
| 38 | |
def generate_entity_brief(entity: dict, relationships: list) -> str:
    """Generate a one-pager markdown brief for a single entity.

    Parameters
    ----------
    entity : dict
        KG node with optional ``name``, ``type``, ``descriptions`` and
        ``occurrences`` keys.
    relationships : list
        KG edges (dicts with ``source``/``target``/``type``); edges touching
        *entity* are rendered as "Relates To" / "Referenced By" tables.

    Returns
    -------
    str
        Markdown document for the entity.
    """
    name = entity.get("name", "Untitled")
    etype = entity.get("type", "concept")
    descs = entity.get("descriptions", [])
    occs = entity.get("occurrences", [])

    # Use .get() for every edge key: the original tolerated a missing
    # "source" but raised KeyError on edges missing "target" or "type".
    outgoing = [
        (r.get("target", ""), r.get("type", ""))
        for r in relationships
        if r.get("source") == name
    ]
    incoming = [
        (r.get("source", ""), r.get("type", ""))
        for r in relationships
        if r.get("target") == name
    ]

    parts = [
        _heading(name),
        "",
        _badge("Type", etype),
        "",
    ]

    if descs:
        parts.append(_heading("Summary", 2))
        parts.append("")
        for d in descs:
            parts.append(f"- {d}")
        parts.append("")

    if outgoing:
        parts.append(_heading("Relates To", 2))
        parts.append("")
        parts.append(_table(["Entity", "Relationship"], [[t, r] for t, r in outgoing]))
        parts.append("")

    if incoming:
        parts.append(_heading("Referenced By", 2))
        parts.append("")
        parts.append(_table(["Entity", "Relationship"], [[s, r] for s, r in incoming]))
        parts.append("")

    if occs:
        parts.append(_heading("Sources", 2))
        parts.append("")
        for occ in occs:
            src = occ.get("source", "unknown")
            ts = occ.get("timestamp", "")
            text = occ.get("text", "")
            line = f"- **{src}**"
            if ts:
                # Timestamp rendered in parentheses only when present.
                line += f" ({ts})"
            if text:
                line += f" — {text}"
            parts.append(line)
        parts.append("")

    return "\n".join(parts)
| 91 | |
| 92 | |
def generate_executive_summary(kg_data: dict) -> str:
    """Generate a high-level executive summary from the KG."""
    entities = kg_data.get("nodes", [])
    edges = kg_data.get("relationships", [])

    # Bucket entities by their declared type (defaulting to "concept").
    grouped: Dict[str, list] = {}
    for entity in entities:
        grouped.setdefault(entity.get("type", "concept"), []).append(entity)

    # One row per type, largest groups first, with up to three examples.
    breakdown_rows = []
    for etype, members in sorted(grouped.items(), key=lambda kv: -len(kv[1])):
        examples = ", ".join(m.get("name", "") for m in members[:3])
        breakdown_rows.append([etype, str(len(members)), examples])

    parts = [
        _heading("Executive Summary"),
        "",
        f"Knowledge base contains **{len(entities)} entities** "
        f"and **{len(edges)} relationships** across "
        f"**{len(grouped)} categories**.",
        "",
        _heading("Entity Breakdown", 2),
        "",
        _table(["Type", "Count", "Examples"], breakdown_rows),
        "",
    ]

    # Connection count per entity name (both endpoints of every edge).
    connection_count: Dict[str, int] = {}
    for edge in edges:
        for endpoint in (edge.get("source", ""), edge.get("target", "")):
            connection_count[endpoint] = connection_count.get(endpoint, 0) + 1

    most_connected = sorted(connection_count.items(), key=lambda kv: -kv[1])[:10]
    if most_connected:
        parts += [
            _heading("Key Entities (by connections)", 2),
            "",
            _table(
                ["Entity", "Connections"],
                [[entity_name, str(count)] for entity_name, count in most_connected],
            ),
            "",
        ]

    # Frequency of each relationship type, most common first.
    type_freq: Dict[str, int] = {}
    for edge in edges:
        label = edge.get("type", "related_to")
        type_freq[label] = type_freq.get(label, 0) + 1

    if type_freq:
        parts += [
            _heading("Relationship Types", 2),
            "",
            _table(
                ["Type", "Count"],
                [
                    [label, str(count)]
                    for label, count in sorted(type_freq.items(), key=lambda kv: -kv[1])
                ],
            ),
            "",
        ]

    return "\n".join(parts)
| 162 | |
| 163 | |
def generate_meeting_notes(kg_data: dict, title: Optional[str] = None) -> str:
    """Generate meeting-notes-style markdown from KG data.

    Parameters
    ----------
    kg_data : dict
        Knowledge graph with ``nodes`` and ``relationships`` lists.
    title : str, optional
        Document title; defaults to "Meeting Notes".

    Returns
    -------
    str
        Markdown with discussion topics, participants, decisions, action
        items, and loosely-connected "open question" entities.
    """
    nodes = kg_data.get("nodes", [])
    rels = kg_data.get("relationships", [])
    title = title or "Meeting Notes"

    # Categorize by planning-relevant types
    decisions = [n for n in nodes if n.get("type") in ("decision", "constraint")]
    actions = [n for n in nodes if n.get("type") in ("goal", "feature", "milestone")]
    people = [n for n in nodes if n.get("type") == "person"]
    topics = [n for n in nodes if n.get("type") in ("concept", "technology", "topic")]

    parts = [
        _heading(title),
        "",
        f"*Generated {datetime.now().strftime('%Y-%m-%d %H:%M')}*",
        "",
    ]

    if topics:
        parts.append(_heading("Discussion Topics", 2))
        parts.append("")
        for t in topics:
            descs = t.get("descriptions", [])
            desc = descs[0] if descs else ""
            # .get() instead of t["name"]: unnamed nodes must not crash.
            parts.append(f"- **{t.get('name', '')}**: {desc}")
        parts.append("")

    if people:
        parts.append(_heading("Participants", 2))
        parts.append("")
        for p in people:
            parts.append(f"- {p.get('name', '')}")
        parts.append("")

    if decisions:
        parts.append(_heading("Decisions & Constraints", 2))
        parts.append("")
        for d in decisions:
            descs = d.get("descriptions", [])
            desc = descs[0] if descs else ""
            parts.append(f"- **{d.get('name', '')}**: {desc}")
        parts.append("")

    if actions:
        parts.append(_heading("Action Items", 2))
        parts.append("")
        for a in actions:
            descs = a.get("descriptions", [])
            desc = descs[0] if descs else ""
            # Owners are found via assignment/ownership edges out of the action.
            owners = [
                r.get("target", "")
                for r in rels
                if r.get("source") == a.get("name")
                and r.get("type") in ("assigned_to", "owned_by")
            ]
            owner_str = f" (@{', '.join(owners)})" if owners else ""
            parts.append(f"- [ ] **{a.get('name', '')}**{owner_str}: {desc}")
        parts.append("")

    # Open questions: entities with at most one relationship endpoint.
    degree_map: Dict[str, int] = {}
    for r in rels:
        degree_map[r.get("source", "")] = degree_map.get(r.get("source", ""), 0) + 1
        degree_map[r.get("target", "")] = degree_map.get(r.get("target", ""), 0) + 1

    # Type test replaces the original `n not in people`, which re-scanned the
    # whole people list (dict equality) for every node.
    orphans = [
        n
        for n in nodes
        if degree_map.get(n.get("name", ""), 0) <= 1 and n.get("type") != "person"
    ]
    if orphans:
        parts.append(_heading("Open Questions / Loose Ends", 2))
        parts.append("")
        for o in orphans[:10]:
            parts.append(f"- {o.get('name', '')}")
        parts.append("")

    return "\n".join(parts)
| 239 | |
| 240 | |
def generate_glossary(kg_data: dict) -> str:
    """Generate a glossary/dictionary of all entities."""
    # Alphabetical, case-insensitive ordering by entity name.
    entries = sorted(
        kg_data.get("nodes", []),
        key=lambda entry: entry.get("name", "").lower(),
    )

    lines = [_heading("Glossary"), ""]
    for entry in entries:
        term = entry.get("name", "")
        category = entry.get("type", "concept")
        definitions = entry.get("descriptions", [])
        definition = definitions[0] if definitions else "No description available."
        # Definition-list style: bold term with italic type, then the body.
        lines += [f"**{term}** *({category})*", f": {definition}", ""]

    return "\n".join(lines)
| 260 | |
| 261 | |
def generate_relationship_map(kg_data: dict) -> str:
    """Generate a relationship map as a markdown document with Mermaid diagram.

    Relationships are grouped by type into tables, followed by a Mermaid
    ``graph LR`` diagram restricted to the 20 best-connected entities.
    """
    rels = kg_data.get("relationships", [])
    nodes = kg_data.get("nodes", [])

    parts = [
        _heading("Relationship Map"),
        "",
        f"*{len(nodes)} entities, {len(rels)} relationships*",
        "",
    ]

    # Group by relationship type
    by_type: Dict[str, list] = {}
    for r in rels:
        rt = r.get("type", "related_to")
        by_type.setdefault(rt, []).append(r)

    for rt, rlist in sorted(by_type.items()):
        parts.append(_heading(rt.replace("_", " ").title(), 2))
        parts.append("")
        parts.append(
            _table(
                ["Source", "Target"],
                [[r.get("source", ""), r.get("target", "")] for r in rlist],
            )
        )
        parts.append("")

    # Mermaid diagram (top 20 nodes by degree)
    degree: Dict[str, int] = {}
    for r in rels:
        degree[r.get("source", "")] = degree.get(r.get("source", ""), 0) + 1
        degree[r.get("target", "")] = degree.get(r.get("target", ""), 0) + 1

    top_nodes = {name for name, _ in sorted(degree.items(), key=lambda x: -x[1])[:20]}

    if top_nodes:
        parts.append(_heading("Visual Map", 2))
        parts.append("")
        parts.append("```mermaid")
        parts.append("graph LR")

        def safe(s):
            # Mermaid node IDs may contain only alphanumerics/underscores.
            return "".join(c if c.isalnum() or c == "_" else "_" for c in s)

        def label(s):
            # BUG FIX: a literal '"' inside a quoted Mermaid label terminates
            # it early and corrupts the diagram; use Mermaid's entity escape.
            return s.replace('"', "#quot;")

        seen = set()
        for r in rels:
            src, tgt = r.get("source", ""), r.get("target", "")
            if src in top_nodes and tgt in top_nodes:
                key = (src, tgt)
                # Emit each (src, tgt) pair once, keeping the first edge type.
                if key not in seen:
                    parts.append(
                        f'    {safe(src)}["{label(src)}"]'
                        f' -->|{r.get("type", "")}| {safe(tgt)}["{label(tgt)}"]'
                    )
                    seen.add(key)
        parts.append("```")
        parts.append("")

    return "\n".join(parts)
| 322 | |
| 323 | |
def generate_status_report(kg_data: dict, title: Optional[str] = None) -> str:
    """Generate a project status report from KG data.

    Parameters
    ----------
    kg_data : dict
        Knowledge graph with ``nodes`` and ``relationships`` lists.
    title : str, optional
        Report title; defaults to "Status Report".

    Returns
    -------
    str
        Markdown with overview counts, milestones, features, and risks.
    """
    nodes = kg_data.get("nodes", [])
    rels = kg_data.get("relationships", [])
    title = title or "Status Report"

    milestones = [n for n in nodes if n.get("type") == "milestone"]
    features = [n for n in nodes if n.get("type") == "feature"]
    risks = [n for n in nodes if n.get("type") in ("risk", "constraint")]
    requirements = [n for n in nodes if n.get("type") == "requirement"]

    parts = [
        _heading(title),
        "",
        f"*Generated {datetime.now().strftime('%Y-%m-%d %H:%M')}*",
        "",
    ]

    parts.append(_heading("Overview", 2))
    parts.append("")
    parts.append(f"- **Entities:** {len(nodes)}")
    parts.append(f"- **Relationships:** {len(rels)}")
    parts.append(f"- **Features:** {len(features)}")
    parts.append(f"- **Milestones:** {len(milestones)}")
    parts.append(f"- **Requirements:** {len(requirements)}")
    parts.append(f"- **Risks/Constraints:** {len(risks)}")
    parts.append("")

    if milestones:
        parts.append(_heading("Milestones", 2))
        parts.append("")
        for m in milestones:
            descs = m.get("descriptions", [])
            # .get() instead of m["name"]: unnamed nodes must not crash.
            parts.append(f"- **{m.get('name', '')}**: {descs[0] if descs else 'TBD'}")
        parts.append("")

    if features:
        parts.append(_heading("Features", 2))
        parts.append("")
        parts.append(
            _table(
                ["Feature", "Description"],
                # Truncate descriptions to 60 chars to keep the table readable.
                [
                    [f.get("name", ""), (f.get("descriptions") or [""])[0][:60]]
                    for f in features
                ],
            )
        )
        parts.append("")

    if risks:
        parts.append(_heading("Risks & Constraints", 2))
        parts.append("")
        for r in risks:
            descs = r.get("descriptions", [])
            parts.append(f"- **{r.get('name', '')}**: {descs[0] if descs else ''}")
        parts.append("")

    return "\n".join(parts)
| 380 | |
| 381 | |
def generate_entity_index(kg_data: dict) -> str:
    """Generate a master index of all entities grouped by type."""
    entities = kg_data.get("nodes", [])

    # Bucket entities by type (defaulting to "concept").
    grouped: Dict[str, list] = {}
    for entity in entities:
        grouped.setdefault(entity.get("type", "concept"), []).append(entity)

    lines = [
        _heading("Entity Index"),
        "",
        f"*{len(entities)} entities across {len(grouped)} types*",
        "",
    ]

    # One section per type (alphabetical), entries sorted by name.
    for category, members in sorted(grouped.items()):
        lines += [_heading(f"{category.title()} ({len(members)})", 2), ""]
        for member in sorted(members, key=lambda m: m.get("name", "")):
            summaries = member.get("descriptions", [])
            suffix = f" — {summaries[0]}" if summaries else ""
            lines.append(f"- **{member['name']}**{suffix}")
        lines.append("")

    return "\n".join(lines)
| 408 | |
| 409 | |
def generate_csv_export(kg_data: dict) -> str:
    """Generate CSV of entities for spreadsheet import."""
    entities = kg_data.get("nodes", [])
    edges = kg_data.get("relationships", [])

    # Map each source entity name to the names of its direct targets.
    neighbors: Dict[str, list] = {}
    for edge in edges:
        neighbors.setdefault(edge.get("source", ""), []).append(edge.get("target", ""))

    buffer = io.StringIO()
    csv_out = csv.writer(buffer)
    csv_out.writerow(["Name", "Type", "Description", "Related To", "Source"])

    # One row per entity, ordered alphabetically by name.
    for entity in sorted(entities, key=lambda item: item.get("name", "")):
        entity_name = entity.get("name", "")
        descriptions = entity.get("descriptions", [])
        occurrences = entity.get("occurrences", [])
        csv_out.writerow(
            [
                entity_name,
                entity.get("type", ""),
                descriptions[0] if descriptions else "",
                "; ".join(neighbors.get(entity_name, [])),
                occurrences[0].get("source", "") if occurrences else "",
            ]
        )

    return buffer.getvalue()
| 437 | |
| 438 | |
| 439 | # --------------------------------------------------------------------------- |
| 440 | # Document types registry |
| 441 | # --------------------------------------------------------------------------- |
| 442 | |
# Registry of generatable document types.
# Maps the CLI/document-type key to a (human-readable label, generator) pair.
# Each generator takes kg_data (some also accept an optional title) and
# returns the full document text; all emit markdown except "csv".
DOCUMENT_TYPES: Dict[str, tuple] = {
    "summary": ("Executive Summary", generate_executive_summary),
    "meeting-notes": ("Meeting Notes", generate_meeting_notes),
    "glossary": ("Glossary", generate_glossary),
    "relationship-map": ("Relationship Map", generate_relationship_map),
    "status-report": ("Status Report", generate_status_report),
    "entity-index": ("Entity Index", generate_entity_index),
    "csv": ("CSV Export", generate_csv_export),
}
| 452 | |
| 453 | |
def generate_all(
    kg_data: dict,
    output_dir: Path,
    doc_types: Optional[List[str]] = None,
    title: Optional[str] = None,
) -> List[Path]:
    """Generate multiple document types and write to output directory.

    Parameters
    ----------
    kg_data : dict
        Knowledge graph with ``nodes`` and ``relationships`` lists.
    output_dir : Path
        Directory to write documents into (created if missing).
    doc_types : list of str, optional
        Subset of DOCUMENT_TYPES keys; None generates all available types.
    title : str, optional
        Title forwarded to the generators that accept one
        (meeting-notes, status-report).

    Returns
    -------
    list of Path
        Paths of all files created, including per-entity briefs.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    types_to_generate = doc_types or list(DOCUMENT_TYPES.keys())
    created: List[Path] = []

    # Document types whose generators accept an optional title keyword.
    _TITLED = {"meeting-notes", "status-report"}

    for dtype in types_to_generate:
        if dtype not in DOCUMENT_TYPES:
            logger.warning(f"Unknown document type: {dtype}")
            continue

        label, generator = DOCUMENT_TYPES[dtype]
        try:
            # BUG FIX: `title` was accepted but never used; forward it to the
            # generators that support a custom title.
            if title and dtype in _TITLED:
                content = generator(kg_data, title=title)
            else:
                content = generator(kg_data)
            ext = ".csv" if dtype == "csv" else ".md"
            path = output_dir / f"{dtype}{ext}"
            path.write_text(content, encoding="utf-8")
            created.append(path)
            logger.info(f"Generated {label} → {path}")
        except Exception as e:
            # Best-effort: one failing document must not abort the rest.
            logger.error(f"Failed to generate {label}: {e}")

    # Also generate individual entity briefs
    briefs_dir = output_dir / "entities"
    briefs_dir.mkdir(exist_ok=True)
    rels = kg_data.get("relationships", [])
    for node in kg_data.get("nodes", []):
        name = node.get("name", "")
        if not name:
            continue
        # Sanitize the name so it forms a safe single-segment filename.
        safe = name.replace("/", "-").replace("\\", "-").replace(" ", "-")
        brief = generate_entity_brief(node, rels)
        path = briefs_dir / f"{safe}.md"
        path.write_text(brief, encoding="utf-8")
        created.append(path)

    return created
| --- video_processor/extractors/frame_extractor.py | ||
| +++ video_processor/extractors/frame_extractor.py | ||
| @@ -1,9 +1,11 @@ | ||
| 1 | 1 | """Frame extraction module for video processing.""" |
| 2 | 2 | |
| 3 | 3 | import functools |
| 4 | 4 | import logging |
| 5 | +import sys | |
| 6 | +import tempfile | |
| 5 | 7 | from pathlib import Path |
| 6 | 8 | from typing import List, Optional, Tuple, Union |
| 7 | 9 | |
| 8 | 10 | import cv2 |
| 9 | 11 | import numpy as np |
| @@ -183,10 +185,11 @@ | ||
| 183 | 185 | sampling_rate: float = 1.0, |
| 184 | 186 | change_threshold: float = 0.15, |
| 185 | 187 | periodic_capture_seconds: float = 30.0, |
| 186 | 188 | max_frames: Optional[int] = None, |
| 187 | 189 | resize_to: Optional[Tuple[int, int]] = None, |
| 190 | + max_memory_mb: int = 1024, | |
| 188 | 191 | ) -> List[np.ndarray]: |
| 189 | 192 | """ |
| 190 | 193 | Extract frames from video based on visual change detection + periodic capture. |
| 191 | 194 | |
| 192 | 195 | Two capture strategies work together: |
| @@ -209,10 +212,15 @@ | ||
| 209 | 212 | Capture a frame every N seconds regardless of change (0 to disable) |
| 210 | 213 | max_frames : int, optional |
| 211 | 214 | Maximum number of frames to extract |
| 212 | 215 | resize_to : tuple of (width, height), optional |
| 213 | 216 | Resize frames to this dimension |
| 217 | + max_memory_mb : int | |
| 218 | + Approximate memory limit in MB for held frames. When approaching this | |
| 219 | + limit, frames are flushed to disk early and only paths are retained | |
| 220 | + internally. The returned list still contains numpy arrays (reloaded | |
| 221 | + from the temp files at the end). Default 1024 MB. | |
| 214 | 222 | |
| 215 | 223 | Returns |
| 216 | 224 | ------- |
| 217 | 225 | list |
| 218 | 226 | List of extracted frames as numpy arrays |
| @@ -247,10 +255,16 @@ | ||
| 247 | 255 | extracted_frames = [] |
| 248 | 256 | prev_frame = None |
| 249 | 257 | frame_idx = 0 |
| 250 | 258 | last_capture_frame = -periodic_interval # allow first periodic capture immediately |
| 251 | 259 | |
| 260 | + # Memory safety valve | |
| 261 | + max_memory_bytes = max_memory_mb * 1024 * 1024 | |
| 262 | + approx_memory_used = 0 | |
| 263 | + _flush_dir = None # lazily created temp dir for flushed frames | |
| 264 | + _flushed_paths: List[Path] = [] # paths of frames flushed to disk | |
| 265 | + | |
| 252 | 266 | pbar = tqdm( |
| 253 | 267 | total=frame_count, |
| 254 | 268 | desc="Extracting frames", |
| 255 | 269 | unit="frame", |
| 256 | 270 | bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]", |
| @@ -288,13 +302,31 @@ | ||
| 288 | 302 | should_capture = True |
| 289 | 303 | reason = "periodic" |
| 290 | 304 | |
| 291 | 305 | if should_capture: |
| 292 | 306 | extracted_frames.append(frame) |
| 307 | + approx_memory_used += sys.getsizeof(frame) + ( | |
| 308 | + frame.nbytes if hasattr(frame, "nbytes") else 0 | |
| 309 | + ) | |
| 293 | 310 | prev_frame = frame |
| 294 | 311 | last_capture_frame = frame_idx |
| 295 | 312 | logger.debug(f"Frame {frame_idx} extracted ({reason})") |
| 313 | + | |
| 314 | + # Memory safety valve: flush frames to disk when approaching limit | |
| 315 | + if approx_memory_used >= max_memory_bytes * 0.9: | |
| 316 | + if _flush_dir is None: | |
| 317 | + _flush_dir = tempfile.mkdtemp(prefix="planopticon_frames_") | |
| 318 | + logger.info( | |
| 319 | + f"Memory limit ~{max_memory_mb}MB approaching, " | |
| 320 | + f"flushing frames to {_flush_dir}" | |
| 321 | + ) | |
| 322 | + for fi, f in enumerate(extracted_frames): | |
| 323 | + flush_path = Path(_flush_dir) / f"flush_{len(_flushed_paths) + fi:06d}.jpg" | |
| 324 | + cv2.imwrite(str(flush_path), f) | |
| 325 | + _flushed_paths.append(flush_path) | |
| 326 | + extracted_frames.clear() | |
| 327 | + approx_memory_used = 0 | |
| 296 | 328 | |
| 297 | 329 | pbar.set_postfix(extracted=len(extracted_frames)) |
| 298 | 330 | |
| 299 | 331 | # Check if we've reached the maximum |
| 300 | 332 | if max_frames is not None and len(extracted_frames) >= max_frames: |
| @@ -306,10 +338,27 @@ | ||
| 306 | 338 | frame_idx += 1 |
| 307 | 339 | pbar.update(frame_interval) |
| 308 | 340 | |
| 309 | 341 | pbar.close() |
| 310 | 342 | cap.release() |
| 343 | + | |
| 344 | + # If frames were flushed to disk, reload them | |
| 345 | + if _flushed_paths: | |
| 346 | + reloaded = [] | |
| 347 | + for fp in _flushed_paths: | |
| 348 | + img = cv2.imread(str(fp)) | |
| 349 | + if img is not None: | |
| 350 | + reloaded.append(img) | |
| 351 | + reloaded.extend(extracted_frames) | |
| 352 | + extracted_frames = reloaded | |
| 353 | + logger.info(f"Reloaded {len(_flushed_paths)} flushed frames from disk") | |
| 354 | + # Clean up temp files | |
| 355 | + import shutil | |
| 356 | + | |
| 357 | + if _flush_dir: | |
| 358 | + shutil.rmtree(_flush_dir, ignore_errors=True) | |
| 359 | + | |
| 311 | 360 | logger.info(f"Extracted {len(extracted_frames)} frames from {frame_count} total frames") |
| 312 | 361 | return extracted_frames |
| 313 | 362 | |
| 314 | 363 | |
| 315 | 364 | def func_gpu(*args, **kwargs): |
| 316 | 365 |
| --- video_processor/extractors/frame_extractor.py | |
| +++ video_processor/extractors/frame_extractor.py | |
| @@ -1,9 +1,11 @@ | |
| 1 | """Frame extraction module for video processing.""" |
| 2 | |
| 3 | import functools |
| 4 | import logging |
| 5 | from pathlib import Path |
| 6 | from typing import List, Optional, Tuple, Union |
| 7 | |
| 8 | import cv2 |
| 9 | import numpy as np |
| @@ -183,10 +185,11 @@ | |
| 183 | sampling_rate: float = 1.0, |
| 184 | change_threshold: float = 0.15, |
| 185 | periodic_capture_seconds: float = 30.0, |
| 186 | max_frames: Optional[int] = None, |
| 187 | resize_to: Optional[Tuple[int, int]] = None, |
| 188 | ) -> List[np.ndarray]: |
| 189 | """ |
| 190 | Extract frames from video based on visual change detection + periodic capture. |
| 191 | |
| 192 | Two capture strategies work together: |
| @@ -209,10 +212,15 @@ | |
| 209 | Capture a frame every N seconds regardless of change (0 to disable) |
| 210 | max_frames : int, optional |
| 211 | Maximum number of frames to extract |
| 212 | resize_to : tuple of (width, height), optional |
| 213 | Resize frames to this dimension |
| 214 | |
| 215 | Returns |
| 216 | ------- |
| 217 | list |
| 218 | List of extracted frames as numpy arrays |
| @@ -247,10 +255,16 @@ | |
| 247 | extracted_frames = [] |
| 248 | prev_frame = None |
| 249 | frame_idx = 0 |
| 250 | last_capture_frame = -periodic_interval # allow first periodic capture immediately |
| 251 | |
| 252 | pbar = tqdm( |
| 253 | total=frame_count, |
| 254 | desc="Extracting frames", |
| 255 | unit="frame", |
| 256 | bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]", |
| @@ -288,13 +302,31 @@ | |
| 288 | should_capture = True |
| 289 | reason = "periodic" |
| 290 | |
| 291 | if should_capture: |
| 292 | extracted_frames.append(frame) |
| 293 | prev_frame = frame |
| 294 | last_capture_frame = frame_idx |
| 295 | logger.debug(f"Frame {frame_idx} extracted ({reason})") |
| 296 | |
| 297 | pbar.set_postfix(extracted=len(extracted_frames)) |
| 298 | |
| 299 | # Check if we've reached the maximum |
| 300 | if max_frames is not None and len(extracted_frames) >= max_frames: |
| @@ -306,10 +338,27 @@ | |
| 306 | frame_idx += 1 |
| 307 | pbar.update(frame_interval) |
| 308 | |
| 309 | pbar.close() |
| 310 | cap.release() |
| 311 | logger.info(f"Extracted {len(extracted_frames)} frames from {frame_count} total frames") |
| 312 | return extracted_frames |
| 313 | |
| 314 | |
| 315 | def func_gpu(*args, **kwargs): |
| 316 |
| --- video_processor/extractors/frame_extractor.py | |
| +++ video_processor/extractors/frame_extractor.py | |
| @@ -1,9 +1,11 @@ | |
| 1 | """Frame extraction module for video processing.""" |
| 2 | |
| 3 | import functools |
| 4 | import logging |
| 5 | import sys |
| 6 | import tempfile |
| 7 | from pathlib import Path |
| 8 | from typing import List, Optional, Tuple, Union |
| 9 | |
| 10 | import cv2 |
| 11 | import numpy as np |
| @@ -183,10 +185,11 @@ | |
| 185 | sampling_rate: float = 1.0, |
| 186 | change_threshold: float = 0.15, |
| 187 | periodic_capture_seconds: float = 30.0, |
| 188 | max_frames: Optional[int] = None, |
| 189 | resize_to: Optional[Tuple[int, int]] = None, |
| 190 | max_memory_mb: int = 1024, |
| 191 | ) -> List[np.ndarray]: |
| 192 | """ |
| 193 | Extract frames from video based on visual change detection + periodic capture. |
| 194 | |
| 195 | Two capture strategies work together: |
| @@ -209,10 +212,15 @@ | |
| 212 | Capture a frame every N seconds regardless of change (0 to disable) |
| 213 | max_frames : int, optional |
| 214 | Maximum number of frames to extract |
| 215 | resize_to : tuple of (width, height), optional |
| 216 | Resize frames to this dimension |
| 217 | max_memory_mb : int |
| 218 | Approximate memory limit in MB for held frames. When approaching this |
| 219 | limit, frames are flushed to disk early and only paths are retained |
| 220 | internally. The returned list still contains numpy arrays (reloaded |
| 221 | from the temp files at the end). Default 1024 MB. |
| 222 | |
| 223 | Returns |
| 224 | ------- |
| 225 | list |
| 226 | List of extracted frames as numpy arrays |
| @@ -247,10 +255,16 @@ | |
| 255 | extracted_frames = [] |
| 256 | prev_frame = None |
| 257 | frame_idx = 0 |
| 258 | last_capture_frame = -periodic_interval # allow first periodic capture immediately |
| 259 | |
| 260 | # Memory safety valve |
| 261 | max_memory_bytes = max_memory_mb * 1024 * 1024 |
| 262 | approx_memory_used = 0 |
| 263 | _flush_dir = None # lazily created temp dir for flushed frames |
| 264 | _flushed_paths: List[Path] = [] # paths of frames flushed to disk |
| 265 | |
| 266 | pbar = tqdm( |
| 267 | total=frame_count, |
| 268 | desc="Extracting frames", |
| 269 | unit="frame", |
| 270 | bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]", |
| @@ -288,13 +302,31 @@ | |
| 302 | should_capture = True |
| 303 | reason = "periodic" |
| 304 | |
| 305 | if should_capture: |
| 306 | extracted_frames.append(frame) |
| 307 | approx_memory_used += sys.getsizeof(frame) + ( |
| 308 | frame.nbytes if hasattr(frame, "nbytes") else 0 |
| 309 | ) |
| 310 | prev_frame = frame |
| 311 | last_capture_frame = frame_idx |
| 312 | logger.debug(f"Frame {frame_idx} extracted ({reason})") |
| 313 | |
| 314 | # Memory safety valve: flush frames to disk when approaching limit |
| 315 | if approx_memory_used >= max_memory_bytes * 0.9: |
| 316 | if _flush_dir is None: |
| 317 | _flush_dir = tempfile.mkdtemp(prefix="planopticon_frames_") |
| 318 | logger.info( |
| 319 | f"Memory limit ~{max_memory_mb}MB approaching, " |
| 320 | f"flushing frames to {_flush_dir}" |
| 321 | ) |
| 322 | for fi, f in enumerate(extracted_frames): |
| 323 | flush_path = Path(_flush_dir) / f"flush_{len(_flushed_paths) + fi:06d}.jpg" |
| 324 | cv2.imwrite(str(flush_path), f) |
| 325 | _flushed_paths.append(flush_path) |
| 326 | extracted_frames.clear() |
| 327 | approx_memory_used = 0 |
| 328 | |
| 329 | pbar.set_postfix(extracted=len(extracted_frames)) |
| 330 | |
| 331 | # Check if we've reached the maximum |
| 332 | if max_frames is not None and len(extracted_frames) >= max_frames: |
| @@ -306,10 +338,27 @@ | |
| 338 | frame_idx += 1 |
| 339 | pbar.update(frame_interval) |
| 340 | |
| 341 | pbar.close() |
| 342 | cap.release() |
| 343 | |
| 344 | # If frames were flushed to disk, reload them |
| 345 | if _flushed_paths: |
| 346 | reloaded = [] |
| 347 | for fp in _flushed_paths: |
| 348 | img = cv2.imread(str(fp)) |
| 349 | if img is not None: |
| 350 | reloaded.append(img) |
| 351 | reloaded.extend(extracted_frames) |
| 352 | extracted_frames = reloaded |
| 353 | logger.info(f"Reloaded {len(_flushed_paths)} flushed frames from disk") |
| 354 | # Clean up temp files |
| 355 | import shutil |
| 356 | |
| 357 | if _flush_dir: |
| 358 | shutil.rmtree(_flush_dir, ignore_errors=True) |
| 359 | |
| 360 | logger.info(f"Extracted {len(extracted_frames)} frames from {frame_count} total frames") |
| 361 | return extracted_frames |
| 362 | |
| 363 | |
| 364 | def func_gpu(*args, **kwargs): |
| 365 |
| --- video_processor/integrators/graph_discovery.py | ||
| +++ video_processor/integrators/graph_discovery.py | ||
| @@ -95,12 +95,12 @@ | ||
| 95 | 95 | """Return summary stats for a knowledge graph file. |
| 96 | 96 | |
| 97 | 97 | Returns dict with: entity_count, relationship_count, entity_types, store_type. |
| 98 | 98 | """ |
| 99 | 99 | from video_processor.integrators.graph_store import ( |
| 100 | - FalkorDBStore, | |
| 101 | 100 | InMemoryStore, |
| 101 | + SQLiteStore, | |
| 102 | 102 | create_store, |
| 103 | 103 | ) |
| 104 | 104 | |
| 105 | 105 | db_path = Path(db_path) |
| 106 | 106 | |
| @@ -122,11 +122,11 @@ | ||
| 122 | 122 | rel.get("type", "related_to"), |
| 123 | 123 | ) |
| 124 | 124 | store_type = "json" |
| 125 | 125 | else: |
| 126 | 126 | store = create_store(db_path) |
| 127 | - store_type = "falkordb" if isinstance(store, FalkorDBStore) else "inmemory" | |
| 127 | + store_type = "sqlite" if isinstance(store, SQLiteStore) else "inmemory" | |
| 128 | 128 | |
| 129 | 129 | entities = store.get_all_entities() |
| 130 | 130 | entity_types = {} |
| 131 | 131 | for e in entities: |
| 132 | 132 | t = e.get("type", "concept") |
| 133 | 133 |
| --- video_processor/integrators/graph_discovery.py | |
| +++ video_processor/integrators/graph_discovery.py | |
| @@ -95,12 +95,12 @@ | |
| 95 | """Return summary stats for a knowledge graph file. |
| 96 | |
| 97 | Returns dict with: entity_count, relationship_count, entity_types, store_type. |
| 98 | """ |
| 99 | from video_processor.integrators.graph_store import ( |
| 100 | FalkorDBStore, |
| 101 | InMemoryStore, |
| 102 | create_store, |
| 103 | ) |
| 104 | |
| 105 | db_path = Path(db_path) |
| 106 | |
| @@ -122,11 +122,11 @@ | |
| 122 | rel.get("type", "related_to"), |
| 123 | ) |
| 124 | store_type = "json" |
| 125 | else: |
| 126 | store = create_store(db_path) |
| 127 | store_type = "falkordb" if isinstance(store, FalkorDBStore) else "inmemory" |
| 128 | |
| 129 | entities = store.get_all_entities() |
| 130 | entity_types = {} |
| 131 | for e in entities: |
| 132 | t = e.get("type", "concept") |
| 133 |
| --- video_processor/integrators/graph_discovery.py | |
| +++ video_processor/integrators/graph_discovery.py | |
| @@ -95,12 +95,12 @@ | |
| 95 | """Return summary stats for a knowledge graph file. |
| 96 | |
| 97 | Returns dict with: entity_count, relationship_count, entity_types, store_type. |
| 98 | """ |
| 99 | from video_processor.integrators.graph_store import ( |
| 100 | InMemoryStore, |
| 101 | SQLiteStore, |
| 102 | create_store, |
| 103 | ) |
| 104 | |
| 105 | db_path = Path(db_path) |
| 106 | |
| @@ -122,11 +122,11 @@ | |
| 122 | rel.get("type", "related_to"), |
| 123 | ) |
| 124 | store_type = "json" |
| 125 | else: |
| 126 | store = create_store(db_path) |
| 127 | store_type = "sqlite" if isinstance(store, SQLiteStore) else "inmemory" |
| 128 | |
| 129 | entities = store.get_all_entities() |
| 130 | entity_types = {} |
| 131 | for e in entities: |
| 132 | t = e.get("type", "concept") |
| 133 |
| --- video_processor/integrators/graph_query.py | ||
| +++ video_processor/integrators/graph_query.py | ||
| @@ -284,19 +284,46 @@ | ||
| 284 | 284 | query_type="filter", |
| 285 | 285 | raw_query="stats()", |
| 286 | 286 | explanation="Knowledge graph statistics", |
| 287 | 287 | ) |
| 288 | 288 | |
| 289 | - def cypher(self, query: str) -> QueryResult: | |
| 290 | - """Execute a raw Cypher query (FalkorDB only).""" | |
| 289 | + def sources(self) -> QueryResult: | |
| 290 | + """Return all registered content sources.""" | |
| 291 | + all_sources = self.store.get_sources() | |
| 292 | + return QueryResult( | |
| 293 | + data=all_sources, | |
| 294 | + query_type="filter", | |
| 295 | + raw_query="sources()", | |
| 296 | + explanation=f"Found {len(all_sources)} registered sources", | |
| 297 | + ) | |
| 298 | + | |
| 299 | + def provenance(self, entity_name: str) -> QueryResult: | |
| 300 | + """Return source locations for a given entity.""" | |
| 301 | + locations = self.store.get_entity_provenance(entity_name) | |
| 302 | + if not locations: | |
| 303 | + return QueryResult( | |
| 304 | + data=[], | |
| 305 | + query_type="filter", | |
| 306 | + raw_query=f"provenance({entity_name!r})", | |
| 307 | + explanation=f"No provenance records found for '{entity_name}'", | |
| 308 | + ) | |
| 309 | + return QueryResult( | |
| 310 | + data=locations, | |
| 311 | + query_type="filter", | |
| 312 | + raw_query=f"provenance({entity_name!r})", | |
| 313 | + explanation=f"Found {len(locations)} provenance records for '{entity_name}'", | |
| 314 | + ) | |
| 315 | + | |
| 316 | + def sql(self, query: str) -> QueryResult: | |
| 317 | + """Execute a raw SQL query (SQLite only).""" | |
| 291 | 318 | result = self.store.raw_query(query) |
| 292 | 319 | return QueryResult( |
| 293 | 320 | data=result, |
| 294 | - query_type="cypher", | |
| 321 | + query_type="sql", | |
| 295 | 322 | raw_query=query, |
| 296 | 323 | explanation=( |
| 297 | - f"Cypher query returned {len(result) if isinstance(result, list) else 1} rows" | |
| 324 | + f"SQL query returned {len(result) if isinstance(result, list) else 1} rows" | |
| 298 | 325 | ), |
| 299 | 326 | ) |
| 300 | 327 | |
| 301 | 328 | # ── Agentic mode (requires LLM) ── |
| 302 | 329 | |
| 303 | 330 |
| --- video_processor/integrators/graph_query.py | |
| +++ video_processor/integrators/graph_query.py | |
| @@ -284,19 +284,46 @@ | |
| 284 | query_type="filter", |
| 285 | raw_query="stats()", |
| 286 | explanation="Knowledge graph statistics", |
| 287 | ) |
| 288 | |
| 289 | def cypher(self, query: str) -> QueryResult: |
| 290 | """Execute a raw Cypher query (FalkorDB only).""" |
| 291 | result = self.store.raw_query(query) |
| 292 | return QueryResult( |
| 293 | data=result, |
| 294 | query_type="cypher", |
| 295 | raw_query=query, |
| 296 | explanation=( |
| 297 | f"Cypher query returned {len(result) if isinstance(result, list) else 1} rows" |
| 298 | ), |
| 299 | ) |
| 300 | |
| 301 | # ── Agentic mode (requires LLM) ── |
| 302 | |
| 303 |
| --- video_processor/integrators/graph_query.py | |
| +++ video_processor/integrators/graph_query.py | |
| @@ -284,19 +284,46 @@ | |
| 284 | query_type="filter", |
| 285 | raw_query="stats()", |
| 286 | explanation="Knowledge graph statistics", |
| 287 | ) |
| 288 | |
| 289 | def sources(self) -> QueryResult: |
| 290 | """Return all registered content sources.""" |
| 291 | all_sources = self.store.get_sources() |
| 292 | return QueryResult( |
| 293 | data=all_sources, |
| 294 | query_type="filter", |
| 295 | raw_query="sources()", |
| 296 | explanation=f"Found {len(all_sources)} registered sources", |
| 297 | ) |
| 298 | |
| 299 | def provenance(self, entity_name: str) -> QueryResult: |
| 300 | """Return source locations for a given entity.""" |
| 301 | locations = self.store.get_entity_provenance(entity_name) |
| 302 | if not locations: |
| 303 | return QueryResult( |
| 304 | data=[], |
| 305 | query_type="filter", |
| 306 | raw_query=f"provenance({entity_name!r})", |
| 307 | explanation=f"No provenance records found for '{entity_name}'", |
| 308 | ) |
| 309 | return QueryResult( |
| 310 | data=locations, |
| 311 | query_type="filter", |
| 312 | raw_query=f"provenance({entity_name!r})", |
| 313 | explanation=f"Found {len(locations)} provenance records for '{entity_name}'", |
| 314 | ) |
| 315 | |
| 316 | def sql(self, query: str) -> QueryResult: |
| 317 | """Execute a raw SQL query (SQLite only).""" |
| 318 | result = self.store.raw_query(query) |
| 319 | return QueryResult( |
| 320 | data=result, |
| 321 | query_type="sql", |
| 322 | raw_query=query, |
| 323 | explanation=( |
| 324 | f"SQL query returned {len(result) if isinstance(result, list) else 1} rows" |
| 325 | ), |
| 326 | ) |
| 327 | |
| 328 | # ── Agentic mode (requires LLM) ── |
| 329 | |
| 330 |
+381
-201
| --- video_processor/integrators/graph_store.py | ||
| +++ video_processor/integrators/graph_store.py | ||
| @@ -1,8 +1,10 @@ | ||
| 1 | 1 | """Graph storage backends for PlanOpticon knowledge graphs.""" |
| 2 | 2 | |
| 3 | +import json | |
| 3 | 4 | import logging |
| 5 | +import sqlite3 | |
| 4 | 6 | from abc import ABC, abstractmethod |
| 5 | 7 | from pathlib import Path |
| 6 | 8 | from typing import Any, Dict, List, Optional, Union |
| 7 | 9 | |
| 8 | 10 | logger = logging.getLogger(__name__) |
| @@ -108,13 +110,39 @@ | ||
| 108 | 110 | """Check if a relationship exists between two entities. |
| 109 | 111 | |
| 110 | 112 | If edge_label is None, checks for any relationship type. |
| 111 | 113 | """ |
| 112 | 114 | ... |
| 115 | + | |
| 116 | + def register_source(self, source: Dict[str, Any]) -> None: | |
| 117 | + """Register a content source. Default no-op for backends that don't support it.""" | |
| 118 | + pass | |
| 119 | + | |
| 120 | + def get_sources(self) -> List[Dict[str, Any]]: | |
| 121 | + """Return all registered sources.""" | |
| 122 | + return [] | |
| 123 | + | |
| 124 | + def get_source(self, source_id: str) -> Optional[Dict[str, Any]]: | |
| 125 | + """Get a source by ID.""" | |
| 126 | + return None | |
| 127 | + | |
| 128 | + def add_source_location( | |
| 129 | + self, | |
| 130 | + source_id: str, | |
| 131 | + entity_name_lower: Optional[str] = None, | |
| 132 | + relationship_id: Optional[int] = None, | |
| 133 | + **kwargs, | |
| 134 | + ) -> None: | |
| 135 | + """Link a source to an entity or relationship with location details.""" | |
| 136 | + pass | |
| 137 | + | |
| 138 | + def get_entity_provenance(self, name: str) -> List[Dict[str, Any]]: | |
| 139 | + """Get all source locations for an entity.""" | |
| 140 | + return [] | |
| 113 | 141 | |
| 114 | 142 | def raw_query(self, query_string: str) -> Any: |
| 115 | - """Execute a raw query against the backend (e.g. Cypher for FalkorDB). | |
| 143 | + """Execute a raw query against the backend (e.g. SQL for SQLite). | |
| 116 | 144 | |
| 117 | 145 | Not supported by all backends — raises NotImplementedError by default. |
| 118 | 146 | """ |
| 119 | 147 | raise NotImplementedError(f"{type(self).__name__} does not support raw queries") |
| 120 | 148 | |
| @@ -133,19 +161,25 @@ | ||
| 133 | 161 | "type": e.get("type", "concept"), |
| 134 | 162 | "descriptions": descs, |
| 135 | 163 | "occurrences": e.get("occurrences", []), |
| 136 | 164 | } |
| 137 | 165 | ) |
| 138 | - return {"nodes": nodes, "relationships": self.get_all_relationships()} | |
| 166 | + result = {"nodes": nodes, "relationships": self.get_all_relationships()} | |
| 167 | + sources = self.get_sources() | |
| 168 | + if sources: | |
| 169 | + result["sources"] = sources | |
| 170 | + return result | |
| 139 | 171 | |
| 140 | 172 | |
| 141 | 173 | class InMemoryStore(GraphStore): |
| 142 | 174 | """In-memory graph store using Python dicts. Default fallback.""" |
| 143 | 175 | |
| 144 | 176 | def __init__(self) -> None: |
| 145 | 177 | self._nodes: Dict[str, Dict[str, Any]] = {} # keyed by name.lower() |
| 146 | 178 | self._relationships: List[Dict[str, Any]] = [] |
| 179 | + self._sources: Dict[str, Dict[str, Any]] = {} # keyed by source_id | |
| 180 | + self._source_locations: List[Dict[str, Any]] = [] | |
| 147 | 181 | |
| 148 | 182 | def merge_entity( |
| 149 | 183 | self, |
| 150 | 184 | name: str, |
| 151 | 185 | entity_type: str, |
| @@ -154,10 +188,12 @@ | ||
| 154 | 188 | ) -> None: |
| 155 | 189 | key = name.lower() |
| 156 | 190 | if key in self._nodes: |
| 157 | 191 | if descriptions: |
| 158 | 192 | self._nodes[key]["descriptions"].update(descriptions) |
| 193 | + if entity_type and entity_type != self._nodes[key]["type"]: | |
| 194 | + self._nodes[key]["type"] = entity_type | |
| 159 | 195 | else: |
| 160 | 196 | self._nodes[key] = { |
| 161 | 197 | "id": name, |
| 162 | 198 | "name": name, |
| 163 | 199 | "type": entity_type, |
| @@ -239,10 +275,47 @@ | ||
| 239 | 275 | key = name.lower() |
| 240 | 276 | if key not in self._nodes: |
| 241 | 277 | return False |
| 242 | 278 | self._nodes[key].update(properties) |
| 243 | 279 | return True |
| 280 | + | |
| 281 | + def register_source(self, source: Dict[str, Any]) -> None: | |
| 282 | + source_id = source.get("source_id", "") | |
| 283 | + self._sources[source_id] = dict(source) | |
| 284 | + | |
| 285 | + def get_sources(self) -> List[Dict[str, Any]]: | |
| 286 | + return list(self._sources.values()) | |
| 287 | + | |
| 288 | + def get_source(self, source_id: str) -> Optional[Dict[str, Any]]: | |
| 289 | + return self._sources.get(source_id) | |
| 290 | + | |
| 291 | + def add_source_location( | |
| 292 | + self, | |
| 293 | + source_id: str, | |
| 294 | + entity_name_lower: Optional[str] = None, | |
| 295 | + relationship_id: Optional[int] = None, | |
| 296 | + **kwargs, | |
| 297 | + ) -> None: | |
| 298 | + entry: Dict[str, Any] = { | |
| 299 | + "source_id": source_id, | |
| 300 | + "entity_name_lower": entity_name_lower, | |
| 301 | + "relationship_id": relationship_id, | |
| 302 | + } | |
| 303 | + entry.update(kwargs) | |
| 304 | + self._source_locations.append(entry) | |
| 305 | + | |
| 306 | + def get_entity_provenance(self, name: str) -> List[Dict[str, Any]]: | |
| 307 | + name_lower = name.lower() | |
| 308 | + results = [] | |
| 309 | + for loc in self._source_locations: | |
| 310 | + if loc.get("entity_name_lower") == name_lower: | |
| 311 | + entry = dict(loc) | |
| 312 | + src = self._sources.get(loc.get("source_id", "")) | |
| 313 | + if src: | |
| 314 | + entry["source"] = src | |
| 315 | + results.append(entry) | |
| 316 | + return results | |
| 244 | 317 | |
| 245 | 318 | def has_relationship( |
| 246 | 319 | self, |
| 247 | 320 | source: str, |
| 248 | 321 | target: str, |
| @@ -255,323 +328,430 @@ | ||
| 255 | 328 | if edge_label is None or rel.get("type") == edge_label: |
| 256 | 329 | return True |
| 257 | 330 | return False |
| 258 | 331 | |
| 259 | 332 | |
| 260 | -class FalkorDBStore(GraphStore): | |
| 261 | - """FalkorDB Lite-backed graph store. Requires falkordblite package.""" | |
| 333 | +class SQLiteStore(GraphStore): | |
| 334 | + """SQLite-backed graph store. Uses Python's built-in sqlite3 module.""" | |
| 335 | + | |
| 336 | + _SCHEMA = """ | |
| 337 | + CREATE TABLE IF NOT EXISTS entities ( | |
| 338 | + name TEXT NOT NULL, | |
| 339 | + name_lower TEXT NOT NULL UNIQUE, | |
| 340 | + type TEXT NOT NULL DEFAULT 'concept', | |
| 341 | + descriptions TEXT NOT NULL DEFAULT '[]', | |
| 342 | + source TEXT, | |
| 343 | + properties TEXT NOT NULL DEFAULT '{}' | |
| 344 | + ); | |
| 345 | + CREATE TABLE IF NOT EXISTS occurrences ( | |
| 346 | + entity_name_lower TEXT NOT NULL, | |
| 347 | + source TEXT NOT NULL, | |
| 348 | + timestamp REAL, | |
| 349 | + text TEXT, | |
| 350 | + FOREIGN KEY (entity_name_lower) REFERENCES entities(name_lower) | |
| 351 | + ); | |
| 352 | + CREATE TABLE IF NOT EXISTS relationships ( | |
| 353 | + source TEXT NOT NULL, | |
| 354 | + target TEXT NOT NULL, | |
| 355 | + type TEXT NOT NULL DEFAULT 'related_to', | |
| 356 | + content_source TEXT, | |
| 357 | + timestamp REAL, | |
| 358 | + properties TEXT NOT NULL DEFAULT '{}' | |
| 359 | + ); | |
| 360 | + CREATE INDEX IF NOT EXISTS idx_entities_name_lower ON entities(name_lower); | |
| 361 | + CREATE INDEX IF NOT EXISTS idx_entities_type ON entities(type); | |
| 362 | + CREATE INDEX IF NOT EXISTS idx_occurrences_entity ON occurrences(entity_name_lower); | |
| 363 | + CREATE INDEX IF NOT EXISTS idx_relationships_source ON relationships(source); | |
| 364 | + CREATE INDEX IF NOT EXISTS idx_relationships_target ON relationships(target); | |
| 365 | + | |
| 366 | + CREATE TABLE IF NOT EXISTS sources ( | |
| 367 | + source_id TEXT PRIMARY KEY, | |
| 368 | + source_type TEXT NOT NULL, | |
| 369 | + title TEXT NOT NULL, | |
| 370 | + path TEXT, | |
| 371 | + url TEXT, | |
| 372 | + mime_type TEXT, | |
| 373 | + ingested_at TEXT NOT NULL, | |
| 374 | + metadata TEXT NOT NULL DEFAULT '{}' | |
| 375 | + ); | |
| 376 | + CREATE TABLE IF NOT EXISTS source_locations ( | |
| 377 | + id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| 378 | + source_id TEXT NOT NULL REFERENCES sources(source_id), | |
| 379 | + entity_name_lower TEXT, | |
| 380 | + relationship_id INTEGER, | |
| 381 | + timestamp REAL, | |
| 382 | + page INTEGER, | |
| 383 | + section TEXT, | |
| 384 | + line_start INTEGER, | |
| 385 | + line_end INTEGER, | |
| 386 | + text_snippet TEXT | |
| 387 | + ); | |
| 388 | + CREATE INDEX IF NOT EXISTS idx_source_locations_source ON source_locations(source_id); | |
| 389 | + CREATE INDEX IF NOT EXISTS idx_source_locations_entity | |
| 390 | + ON source_locations(entity_name_lower); | |
| 391 | + """ | |
| 262 | 392 | |
| 263 | 393 | def __init__(self, db_path: Union[str, Path]) -> None: |
| 264 | - # Patch redis 7.x compat: UnixDomainSocketConnection missing 'port' | |
| 265 | - import redis.connection | |
| 266 | - | |
| 267 | - if not hasattr(redis.connection.UnixDomainSocketConnection, "port"): | |
| 268 | - redis.connection.UnixDomainSocketConnection.port = 0 | |
| 269 | - | |
| 270 | - from redislite import FalkorDB | |
| 271 | - | |
| 272 | 394 | self._db_path = str(db_path) |
| 273 | - self._db = FalkorDB(self._db_path) | |
| 274 | - self._graph = self._db.select_graph("knowledge") | |
| 275 | - self._ensure_indexes() | |
| 276 | - | |
| 277 | - def _ensure_indexes(self) -> None: | |
| 278 | - for query in [ | |
| 279 | - "CREATE INDEX FOR (e:Entity) ON (e.name_lower)", | |
| 280 | - "CREATE INDEX FOR (e:Entity) ON (e.type)", | |
| 281 | - "CREATE INDEX FOR (e:Entity) ON (e.dag_id)", | |
| 282 | - ]: | |
| 283 | - try: | |
| 284 | - self._graph.query(query) | |
| 285 | - except Exception: | |
| 286 | - pass # index already exists | |
| 395 | + self._conn = sqlite3.connect(self._db_path) | |
| 396 | + self._conn.execute("PRAGMA journal_mode=WAL") | |
| 397 | + self._conn.execute("PRAGMA foreign_keys=ON") | |
| 398 | + self._conn.executescript(self._SCHEMA) | |
| 399 | + self._conn.commit() | |
| 287 | 400 | |
| 288 | 401 | def merge_entity( |
| 289 | 402 | self, |
| 290 | 403 | name: str, |
| 291 | 404 | entity_type: str, |
| 292 | 405 | descriptions: List[str], |
| 293 | 406 | source: Optional[str] = None, |
| 294 | 407 | ) -> None: |
| 295 | 408 | name_lower = name.lower() |
| 296 | - | |
| 297 | - # Check if entity exists | |
| 298 | - result = self._graph.query( | |
| 299 | - "MATCH (e:Entity {name_lower: $name_lower}) RETURN e.descriptions", | |
| 300 | - params={"name_lower": name_lower}, | |
| 301 | - ) | |
| 302 | - | |
| 303 | - if result.result_set: | |
| 304 | - # Entity exists — merge descriptions | |
| 305 | - existing_descs = result.result_set[0][0] or [] | |
| 306 | - merged = list(set(existing_descs + descriptions)) | |
| 307 | - self._graph.query( | |
| 308 | - "MATCH (e:Entity {name_lower: $name_lower}) SET e.descriptions = $descs", | |
| 309 | - params={"name_lower": name_lower, "descs": merged}, | |
| 409 | + row = self._conn.execute( | |
| 410 | + "SELECT descriptions FROM entities WHERE name_lower = ?", | |
| 411 | + (name_lower,), | |
| 412 | + ).fetchone() | |
| 413 | + | |
| 414 | + if row: | |
| 415 | + existing = json.loads(row[0]) | |
| 416 | + merged = list(set(existing + descriptions)) | |
| 417 | + self._conn.execute( | |
| 418 | + "UPDATE entities SET descriptions = ?, type = ? WHERE name_lower = ?", | |
| 419 | + (json.dumps(merged), entity_type, name_lower), | |
| 310 | 420 | ) |
| 311 | 421 | else: |
| 312 | - # Create new entity | |
| 313 | - self._graph.query( | |
| 314 | - "CREATE (e:Entity {" | |
| 315 | - "name: $name, name_lower: $name_lower, type: $type, " | |
| 316 | - "descriptions: $descs, source: $source" | |
| 317 | - "})", | |
| 318 | - params={ | |
| 319 | - "name": name, | |
| 320 | - "name_lower": name_lower, | |
| 321 | - "type": entity_type, | |
| 322 | - "descs": descriptions, | |
| 323 | - "source": source, | |
| 324 | - }, | |
| 325 | - ) | |
| 422 | + self._conn.execute( | |
| 423 | + "INSERT INTO entities (name, name_lower, type, descriptions, source) " | |
| 424 | + "VALUES (?, ?, ?, ?, ?)", | |
| 425 | + (name, name_lower, entity_type, json.dumps(descriptions), source), | |
| 426 | + ) | |
| 427 | + self._conn.commit() | |
| 326 | 428 | |
| 327 | 429 | def add_occurrence( |
| 328 | 430 | self, |
| 329 | 431 | entity_name: str, |
| 330 | 432 | source: str, |
| 331 | 433 | timestamp: Optional[float] = None, |
| 332 | 434 | text: Optional[str] = None, |
| 333 | 435 | ) -> None: |
| 334 | 436 | name_lower = entity_name.lower() |
| 335 | - self._graph.query( | |
| 336 | - "MATCH (e:Entity {name_lower: $name_lower}) " | |
| 337 | - "CREATE (o:Occurrence {source: $source, timestamp: $timestamp, text: $text}) " | |
| 338 | - "CREATE (e)-[:OCCURRED_IN]->(o)", | |
| 339 | - params={ | |
| 340 | - "name_lower": name_lower, | |
| 341 | - "source": source, | |
| 342 | - "timestamp": timestamp, | |
| 343 | - "text": text, | |
| 344 | - }, | |
| 345 | - ) | |
| 437 | + exists = self._conn.execute( | |
| 438 | + "SELECT 1 FROM entities WHERE name_lower = ?", (name_lower,) | |
| 439 | + ).fetchone() | |
| 440 | + if not exists: | |
| 441 | + return | |
| 442 | + self._conn.execute( | |
| 443 | + "INSERT INTO occurrences (entity_name_lower, source, timestamp, text) " | |
| 444 | + "VALUES (?, ?, ?, ?)", | |
| 445 | + (name_lower, source, timestamp, text), | |
| 446 | + ) | |
| 447 | + self._conn.commit() | |
| 346 | 448 | |
| 347 | 449 | def add_relationship( |
| 348 | 450 | self, |
| 349 | 451 | source: str, |
| 350 | 452 | target: str, |
| 351 | 453 | rel_type: str, |
| 352 | 454 | content_source: Optional[str] = None, |
| 353 | 455 | timestamp: Optional[float] = None, |
| 354 | 456 | ) -> None: |
| 355 | - self._graph.query( | |
| 356 | - "MATCH (a:Entity {name_lower: $src_lower}) " | |
| 357 | - "MATCH (b:Entity {name_lower: $tgt_lower}) " | |
| 358 | - "CREATE (a)-[:RELATED_TO {" | |
| 359 | - "rel_type: $rel_type, content_source: $content_source, timestamp: $timestamp" | |
| 360 | - "}]->(b)", | |
| 361 | - params={ | |
| 362 | - "src_lower": source.lower(), | |
| 363 | - "tgt_lower": target.lower(), | |
| 364 | - "rel_type": rel_type, | |
| 365 | - "content_source": content_source, | |
| 366 | - "timestamp": timestamp, | |
| 367 | - }, | |
| 368 | - ) | |
| 457 | + self._conn.execute( | |
| 458 | + "INSERT INTO relationships (source, target, type, content_source, timestamp) " | |
| 459 | + "VALUES (?, ?, ?, ?, ?)", | |
| 460 | + (source, target, rel_type, content_source, timestamp), | |
| 461 | + ) | |
| 462 | + self._conn.commit() | |
| 369 | 463 | |
| 370 | 464 | def get_entity(self, name: str) -> Optional[Dict[str, Any]]: |
| 371 | - result = self._graph.query( | |
| 372 | - "MATCH (e:Entity {name_lower: $name_lower}) " | |
| 373 | - "RETURN e.name, e.type, e.descriptions, e.source", | |
| 374 | - params={"name_lower": name.lower()}, | |
| 375 | - ) | |
| 376 | - if not result.result_set: | |
| 465 | + row = self._conn.execute( | |
| 466 | + "SELECT name, type, descriptions, source FROM entities WHERE name_lower = ?", | |
| 467 | + (name.lower(),), | |
| 468 | + ).fetchone() | |
| 469 | + if not row: | |
| 377 | 470 | return None |
| 378 | 471 | |
| 379 | - row = result.result_set[0] | |
| 380 | 472 | entity_name = row[0] |
| 381 | - | |
| 382 | - # Fetch occurrences | |
| 383 | - occ_result = self._graph.query( | |
| 384 | - "MATCH (e:Entity {name_lower: $name_lower})-[:OCCURRED_IN]->(o:Occurrence) " | |
| 385 | - "RETURN o.source, o.timestamp, o.text", | |
| 386 | - params={"name_lower": name.lower()}, | |
| 387 | - ) | |
| 388 | - occurrences = [ | |
| 389 | - {"source": o[0], "timestamp": o[1], "text": o[2]} for o in occ_result.result_set | |
| 390 | - ] | |
| 473 | + occ_rows = self._conn.execute( | |
| 474 | + "SELECT source, timestamp, text FROM occurrences WHERE entity_name_lower = ?", | |
| 475 | + (name.lower(),), | |
| 476 | + ).fetchall() | |
| 477 | + occurrences = [{"source": o[0], "timestamp": o[1], "text": o[2]} for o in occ_rows] | |
| 391 | 478 | |
| 392 | 479 | return { |
| 393 | 480 | "id": entity_name, |
| 394 | 481 | "name": entity_name, |
| 395 | 482 | "type": row[1] or "concept", |
| 396 | - "descriptions": row[2] or [], | |
| 483 | + "descriptions": json.loads(row[2]) if row[2] else [], | |
| 397 | 484 | "occurrences": occurrences, |
| 398 | 485 | "source": row[3], |
| 399 | 486 | } |
| 400 | 487 | |
| 401 | 488 | def get_all_entities(self) -> List[Dict[str, Any]]: |
| 402 | - result = self._graph.query( | |
| 403 | - "MATCH (e:Entity) RETURN e.name, e.name_lower, e.type, e.descriptions, e.source" | |
| 404 | - ) | |
| 489 | + rows = self._conn.execute( | |
| 490 | + "SELECT name, name_lower, type, descriptions, source FROM entities" | |
| 491 | + ).fetchall() | |
| 405 | 492 | entities = [] |
| 406 | - for row in result.result_set: | |
| 493 | + for row in rows: | |
| 407 | 494 | name_lower = row[1] |
| 408 | - # Fetch occurrences for this entity | |
| 409 | - occ_result = self._graph.query( | |
| 410 | - "MATCH (e:Entity {name_lower: $name_lower})-[:OCCURRED_IN]->(o:Occurrence) " | |
| 411 | - "RETURN o.source, o.timestamp, o.text", | |
| 412 | - params={"name_lower": name_lower}, | |
| 413 | - ) | |
| 414 | - occurrences = [ | |
| 415 | - {"source": o[0], "timestamp": o[1], "text": o[2]} for o in occ_result.result_set | |
| 416 | - ] | |
| 495 | + occ_rows = self._conn.execute( | |
| 496 | + "SELECT source, timestamp, text FROM occurrences WHERE entity_name_lower = ?", | |
| 497 | + (name_lower,), | |
| 498 | + ).fetchall() | |
| 499 | + occurrences = [{"source": o[0], "timestamp": o[1], "text": o[2]} for o in occ_rows] | |
| 417 | 500 | entities.append( |
| 418 | 501 | { |
| 419 | 502 | "id": row[0], |
| 420 | 503 | "name": row[0], |
| 421 | 504 | "type": row[2] or "concept", |
| 422 | - "descriptions": row[3] or [], | |
| 505 | + "descriptions": json.loads(row[3]) if row[3] else [], | |
| 423 | 506 | "occurrences": occurrences, |
| 424 | 507 | "source": row[4], |
| 425 | 508 | } |
| 426 | 509 | ) |
| 427 | 510 | return entities |
| 428 | 511 | |
| 429 | 512 | def get_all_relationships(self) -> List[Dict[str, Any]]: |
| 430 | - result = self._graph.query( | |
| 431 | - "MATCH (a:Entity)-[r:RELATED_TO]->(b:Entity) " | |
| 432 | - "RETURN a.name, b.name, r.rel_type, r.content_source, r.timestamp" | |
| 433 | - ) | |
| 513 | + rows = self._conn.execute( | |
| 514 | + "SELECT source, target, type, content_source, timestamp FROM relationships" | |
| 515 | + ).fetchall() | |
| 434 | 516 | return [ |
| 435 | 517 | { |
| 436 | 518 | "source": row[0], |
| 437 | 519 | "target": row[1], |
| 438 | 520 | "type": row[2] or "related_to", |
| 439 | 521 | "content_source": row[3], |
| 440 | 522 | "timestamp": row[4], |
| 441 | 523 | } |
| 442 | - for row in result.result_set | |
| 524 | + for row in rows | |
| 443 | 525 | ] |
| 444 | 526 | |
| 445 | 527 | def get_entity_count(self) -> int: |
| 446 | - result = self._graph.query("MATCH (e:Entity) RETURN count(e)") | |
| 447 | - return result.result_set[0][0] if result.result_set else 0 | |
| 528 | + row = self._conn.execute("SELECT COUNT(*) FROM entities").fetchone() | |
| 529 | + return row[0] if row else 0 | |
| 448 | 530 | |
| 449 | 531 | def get_relationship_count(self) -> int: |
| 450 | - result = self._graph.query("MATCH ()-[r]->() RETURN count(r)") | |
| 451 | - count = result.result_set[0][0] if result.result_set else 0 | |
| 452 | - # Subtract occurrence edges which are internal bookkeeping | |
| 453 | - occ_result = self._graph.query("MATCH ()-[r:OCCURRED_IN]->() RETURN count(r)") | |
| 454 | - occ_count = occ_result.result_set[0][0] if occ_result.result_set else 0 | |
| 455 | - return count - occ_count | |
| 532 | + row = self._conn.execute("SELECT COUNT(*) FROM relationships").fetchone() | |
| 533 | + return row[0] if row else 0 | |
| 456 | 534 | |
| 457 | 535 | def has_entity(self, name: str) -> bool: |
| 458 | - result = self._graph.query( | |
| 459 | - "MATCH (e:Entity {name_lower: $name_lower}) RETURN count(e)", | |
| 460 | - params={"name_lower": name.lower()}, | |
| 461 | - ) | |
| 462 | - return result.result_set[0][0] > 0 if result.result_set else False | |
| 536 | + row = self._conn.execute( | |
| 537 | + "SELECT 1 FROM entities WHERE name_lower = ?", (name.lower(),) | |
| 538 | + ).fetchone() | |
| 539 | + return row is not None | |
| 463 | 540 | |
| 464 | 541 | def raw_query(self, query_string: str) -> Any: |
| 465 | - """Execute a raw Cypher query and return the result set.""" | |
| 466 | - result = self._graph.query(query_string) | |
| 467 | - return result.result_set | |
| 542 | + """Execute a raw SQL query and return all rows.""" | |
| 543 | + cursor = self._conn.execute(query_string) | |
| 544 | + return cursor.fetchall() | |
| 468 | 545 | |
| 469 | 546 | def add_typed_relationship( |
| 470 | 547 | self, |
| 471 | 548 | source: str, |
| 472 | 549 | target: str, |
| 473 | 550 | edge_label: str, |
| 474 | 551 | properties: Optional[Dict[str, Any]] = None, |
| 475 | 552 | ) -> None: |
| 476 | - props = properties or {} | |
| 477 | - # Build property string for Cypher SET clause | |
| 478 | - prop_assignments = [] | |
| 479 | - params: Dict[str, Any] = { | |
| 480 | - "src_lower": source.lower(), | |
| 481 | - "tgt_lower": target.lower(), | |
| 482 | - } | |
| 483 | - for i, (k, v) in enumerate(props.items()): | |
| 484 | - param_name = f"prop_{i}" | |
| 485 | - prop_assignments.append(f"r.{k} = ${param_name}") | |
| 486 | - params[param_name] = v | |
| 487 | - | |
| 488 | - set_clause = "" | |
| 489 | - if prop_assignments: | |
| 490 | - set_clause = " SET " + ", ".join(prop_assignments) | |
| 491 | - | |
| 492 | - # FalkorDB requires static relationship types in CREATE, so we use | |
| 493 | - # a parameterized approach with specific known labels | |
| 494 | - query = ( | |
| 495 | - f"MATCH (a:Entity {{name_lower: $src_lower}}) " | |
| 496 | - f"MATCH (b:Entity {{name_lower: $tgt_lower}}) " | |
| 497 | - f"CREATE (a)-[r:{edge_label}]->(b)" | |
| 498 | - f"{set_clause}" | |
| 499 | - ) | |
| 500 | - self._graph.query(query, params=params) | |
| 553 | + self._conn.execute( | |
| 554 | + "INSERT INTO relationships (source, target, type, properties) VALUES (?, ?, ?, ?)", | |
| 555 | + (source, target, edge_label, json.dumps(properties or {})), | |
| 556 | + ) | |
| 557 | + self._conn.commit() | |
| 501 | 558 | |
| 502 | 559 | def set_entity_properties( |
| 503 | 560 | self, |
| 504 | 561 | name: str, |
| 505 | 562 | properties: Dict[str, Any], |
| 506 | 563 | ) -> bool: |
| 507 | 564 | name_lower = name.lower() |
| 508 | - # Check entity exists | |
| 509 | 565 | if not self.has_entity(name): |
| 510 | 566 | return False |
| 511 | - | |
| 512 | - params: Dict[str, Any] = {"name_lower": name_lower} | |
| 513 | - set_parts = [] | |
| 514 | - for i, (k, v) in enumerate(properties.items()): | |
| 515 | - param_name = f"prop_{i}" | |
| 516 | - set_parts.append(f"e.{k} = ${param_name}") | |
| 517 | - params[param_name] = v | |
| 518 | - | |
| 519 | - if not set_parts: | |
| 567 | + if not properties: | |
| 520 | 568 | return True |
| 521 | - | |
| 522 | - query = f"MATCH (e:Entity {{name_lower: $name_lower}}) SET {', '.join(set_parts)}" | |
| 523 | - self._graph.query(query, params=params) | |
| 569 | + row = self._conn.execute( | |
| 570 | + "SELECT properties FROM entities WHERE name_lower = ?", (name_lower,) | |
| 571 | + ).fetchone() | |
| 572 | + existing = json.loads(row[0]) if row and row[0] else {} | |
| 573 | + existing.update(properties) | |
| 574 | + self._conn.execute( | |
| 575 | + "UPDATE entities SET properties = ? WHERE name_lower = ?", | |
| 576 | + (json.dumps(existing), name_lower), | |
| 577 | + ) | |
| 578 | + self._conn.commit() | |
| 524 | 579 | return True |
| 525 | 580 | |
| 526 | 581 | def has_relationship( |
| 527 | 582 | self, |
| 528 | 583 | source: str, |
| 529 | 584 | target: str, |
| 530 | 585 | edge_label: Optional[str] = None, |
| 531 | 586 | ) -> bool: |
| 532 | - params = { | |
| 533 | - "src_lower": source.lower(), | |
| 534 | - "tgt_lower": target.lower(), | |
| 535 | - } | |
| 536 | 587 | if edge_label: |
| 537 | - query = ( | |
| 538 | - f"MATCH (a:Entity {{name_lower: $src_lower}})" | |
| 539 | - f"-[:{edge_label}]->" | |
| 540 | - f"(b:Entity {{name_lower: $tgt_lower}}) " | |
| 541 | - f"RETURN count(*)" | |
| 588 | + row = self._conn.execute( | |
| 589 | + "SELECT 1 FROM relationships " | |
| 590 | + "WHERE LOWER(source) = ? AND LOWER(target) = ? AND type = ?", | |
| 591 | + (source.lower(), target.lower(), edge_label), | |
| 592 | + ).fetchone() | |
| 593 | + else: | |
| 594 | + row = self._conn.execute( | |
| 595 | + "SELECT 1 FROM relationships WHERE LOWER(source) = ? AND LOWER(target) = ?", | |
| 596 | + (source.lower(), target.lower()), | |
| 597 | + ).fetchone() | |
| 598 | + return row is not None | |
| 599 | + | |
| 600 | + def register_source(self, source: Dict[str, Any]) -> None: | |
| 601 | + source_id = source.get("source_id", "") | |
| 602 | + existing = self._conn.execute( | |
| 603 | + "SELECT 1 FROM sources WHERE source_id = ?", (source_id,) | |
| 604 | + ).fetchone() | |
| 605 | + if existing: | |
| 606 | + self._conn.execute( | |
| 607 | + "UPDATE sources SET source_type = ?, title = ?, path = ?, url = ?, " | |
| 608 | + "mime_type = ?, ingested_at = ?, metadata = ? WHERE source_id = ?", | |
| 609 | + ( | |
| 610 | + source.get("source_type", ""), | |
| 611 | + source.get("title", ""), | |
| 612 | + source.get("path"), | |
| 613 | + source.get("url"), | |
| 614 | + source.get("mime_type"), | |
| 615 | + source.get("ingested_at", ""), | |
| 616 | + json.dumps(source.get("metadata", {})), | |
| 617 | + source_id, | |
| 618 | + ), | |
| 542 | 619 | ) |
| 543 | 620 | else: |
| 544 | - query = ( | |
| 545 | - "MATCH (a:Entity {name_lower: $src_lower})" | |
| 546 | - "-[]->" | |
| 547 | - "(b:Entity {name_lower: $tgt_lower}) " | |
| 548 | - "RETURN count(*)" | |
| 549 | - ) | |
| 550 | - result = self._graph.query(query, params=params) | |
| 551 | - return result.result_set[0][0] > 0 if result.result_set else False | |
| 621 | + self._conn.execute( | |
| 622 | + "INSERT INTO sources (source_id, source_type, title, path, url, " | |
| 623 | + "mime_type, ingested_at, metadata) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", | |
| 624 | + ( | |
| 625 | + source_id, | |
| 626 | + source.get("source_type", ""), | |
| 627 | + source.get("title", ""), | |
| 628 | + source.get("path"), | |
| 629 | + source.get("url"), | |
| 630 | + source.get("mime_type"), | |
| 631 | + source.get("ingested_at", ""), | |
| 632 | + json.dumps(source.get("metadata", {})), | |
| 633 | + ), | |
| 634 | + ) | |
| 635 | + self._conn.commit() | |
| 636 | + | |
| 637 | + def get_sources(self) -> List[Dict[str, Any]]: | |
| 638 | + rows = self._conn.execute( | |
| 639 | + "SELECT source_id, source_type, title, path, url, mime_type, " | |
| 640 | + "ingested_at, metadata FROM sources" | |
| 641 | + ).fetchall() | |
| 642 | + return [ | |
| 643 | + { | |
| 644 | + "source_id": r[0], | |
| 645 | + "source_type": r[1], | |
| 646 | + "title": r[2], | |
| 647 | + "path": r[3], | |
| 648 | + "url": r[4], | |
| 649 | + "mime_type": r[5], | |
| 650 | + "ingested_at": r[6], | |
| 651 | + "metadata": json.loads(r[7]) if r[7] else {}, | |
| 652 | + } | |
| 653 | + for r in rows | |
| 654 | + ] | |
| 655 | + | |
| 656 | + def get_source(self, source_id: str) -> Optional[Dict[str, Any]]: | |
| 657 | + row = self._conn.execute( | |
| 658 | + "SELECT source_id, source_type, title, path, url, mime_type, " | |
| 659 | + "ingested_at, metadata FROM sources WHERE source_id = ?", | |
| 660 | + (source_id,), | |
| 661 | + ).fetchone() | |
| 662 | + if not row: | |
| 663 | + return None | |
| 664 | + return { | |
| 665 | + "source_id": row[0], | |
| 666 | + "source_type": row[1], | |
| 667 | + "title": row[2], | |
| 668 | + "path": row[3], | |
| 669 | + "url": row[4], | |
| 670 | + "mime_type": row[5], | |
| 671 | + "ingested_at": row[6], | |
| 672 | + "metadata": json.loads(row[7]) if row[7] else {}, | |
| 673 | + } | |
| 674 | + | |
| 675 | + def add_source_location( | |
| 676 | + self, | |
| 677 | + source_id: str, | |
| 678 | + entity_name_lower: Optional[str] = None, | |
| 679 | + relationship_id: Optional[int] = None, | |
| 680 | + **kwargs, | |
| 681 | + ) -> None: | |
| 682 | + self._conn.execute( | |
| 683 | + "INSERT INTO source_locations (source_id, entity_name_lower, relationship_id, " | |
| 684 | + "timestamp, page, section, line_start, line_end, text_snippet) " | |
| 685 | + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", | |
| 686 | + ( | |
| 687 | + source_id, | |
| 688 | + entity_name_lower, | |
| 689 | + relationship_id, | |
| 690 | + kwargs.get("timestamp"), | |
| 691 | + kwargs.get("page"), | |
| 692 | + kwargs.get("section"), | |
| 693 | + kwargs.get("line_start"), | |
| 694 | + kwargs.get("line_end"), | |
| 695 | + kwargs.get("text_snippet"), | |
| 696 | + ), | |
| 697 | + ) | |
| 698 | + self._conn.commit() | |
| 699 | + | |
| 700 | + def get_entity_provenance(self, name: str) -> List[Dict[str, Any]]: | |
| 701 | + name_lower = name.lower() | |
| 702 | + rows = self._conn.execute( | |
| 703 | + "SELECT sl.source_id, sl.entity_name_lower, sl.relationship_id, " | |
| 704 | + "sl.timestamp, sl.page, sl.section, sl.line_start, sl.line_end, " | |
| 705 | + "sl.text_snippet, s.source_type, s.title, s.path, s.url, s.mime_type, " | |
| 706 | + "s.ingested_at, s.metadata " | |
| 707 | + "FROM source_locations sl " | |
| 708 | + "JOIN sources s ON sl.source_id = s.source_id " | |
| 709 | + "WHERE sl.entity_name_lower = ?", | |
| 710 | + (name_lower,), | |
| 711 | + ).fetchall() | |
| 712 | + results = [] | |
| 713 | + for r in rows: | |
| 714 | + results.append( | |
| 715 | + { | |
| 716 | + "source_id": r[0], | |
| 717 | + "entity_name_lower": r[1], | |
| 718 | + "relationship_id": r[2], | |
| 719 | + "timestamp": r[3], | |
| 720 | + "page": r[4], | |
| 721 | + "section": r[5], | |
| 722 | + "line_start": r[6], | |
| 723 | + "line_end": r[7], | |
| 724 | + "text_snippet": r[8], | |
| 725 | + "source": { | |
| 726 | + "source_id": r[0], | |
| 727 | + "source_type": r[9], | |
| 728 | + "title": r[10], | |
| 729 | + "path": r[11], | |
| 730 | + "url": r[12], | |
| 731 | + "mime_type": r[13], | |
| 732 | + "ingested_at": r[14], | |
| 733 | + "metadata": json.loads(r[15]) if r[15] else {}, | |
| 734 | + }, | |
| 735 | + } | |
| 736 | + ) | |
| 737 | + return results | |
| 552 | 738 | |
| 553 | 739 | def close(self) -> None: |
| 554 | - """Release references. FalkorDB Lite handles persistence automatically.""" | |
| 555 | - self._graph = None | |
| 556 | - self._db = None | |
| 740 | + """Close the SQLite connection.""" | |
| 741 | + if self._conn: | |
| 742 | + self._conn.close() | |
| 743 | + self._conn = None | |
| 557 | 744 | |
| 558 | 745 | |
def create_store(db_path: Optional[Union[str, Path]] = None) -> GraphStore:
    """Create the best available graph store.

    If db_path is provided, uses SQLiteStore for persistent storage.
    Otherwise returns an InMemoryStore.
    """
    if db_path is not None:
        try:
            return SQLiteStore(db_path)
        except Exception as exc:
            # Broad catch is deliberate: any backend init failure degrades
            # gracefully to the in-memory store instead of crashing startup.
            # Lazy %-args defer string formatting until the record is emitted.
            logger.warning(
                "Failed to initialize SQLite at %s: %s. Using in-memory store.",
                db_path,
                exc,
            )
    return InMemoryStore()
| 578 | 758 |
| --- video_processor/integrators/graph_store.py | |
| +++ video_processor/integrators/graph_store.py | |
| @@ -1,8 +1,10 @@ | |
| 1 | """Graph storage backends for PlanOpticon knowledge graphs.""" |
| 2 | |
| 3 | import logging |
| 4 | from abc import ABC, abstractmethod |
| 5 | from pathlib import Path |
| 6 | from typing import Any, Dict, List, Optional, Union |
| 7 | |
| 8 | logger = logging.getLogger(__name__) |
| @@ -108,13 +110,39 @@ | |
| 108 | """Check if a relationship exists between two entities. |
| 109 | |
| 110 | If edge_label is None, checks for any relationship type. |
| 111 | """ |
| 112 | ... |
| 113 | |
| 114 | def raw_query(self, query_string: str) -> Any: |
| 115 | """Execute a raw query against the backend (e.g. Cypher for FalkorDB). |
| 116 | |
| 117 | Not supported by all backends — raises NotImplementedError by default. |
| 118 | """ |
| 119 | raise NotImplementedError(f"{type(self).__name__} does not support raw queries") |
| 120 | |
| @@ -133,19 +161,25 @@ | |
| 133 | "type": e.get("type", "concept"), |
| 134 | "descriptions": descs, |
| 135 | "occurrences": e.get("occurrences", []), |
| 136 | } |
| 137 | ) |
| 138 | return {"nodes": nodes, "relationships": self.get_all_relationships()} |
| 139 | |
| 140 | |
| 141 | class InMemoryStore(GraphStore): |
| 142 | """In-memory graph store using Python dicts. Default fallback.""" |
| 143 | |
| 144 | def __init__(self) -> None: |
| 145 | self._nodes: Dict[str, Dict[str, Any]] = {} # keyed by name.lower() |
| 146 | self._relationships: List[Dict[str, Any]] = [] |
| 147 | |
| 148 | def merge_entity( |
| 149 | self, |
| 150 | name: str, |
| 151 | entity_type: str, |
| @@ -154,10 +188,12 @@ | |
| 154 | ) -> None: |
| 155 | key = name.lower() |
| 156 | if key in self._nodes: |
| 157 | if descriptions: |
| 158 | self._nodes[key]["descriptions"].update(descriptions) |
| 159 | else: |
| 160 | self._nodes[key] = { |
| 161 | "id": name, |
| 162 | "name": name, |
| 163 | "type": entity_type, |
| @@ -239,10 +275,47 @@ | |
| 239 | key = name.lower() |
| 240 | if key not in self._nodes: |
| 241 | return False |
| 242 | self._nodes[key].update(properties) |
| 243 | return True |
| 244 | |
| 245 | def has_relationship( |
| 246 | self, |
| 247 | source: str, |
| 248 | target: str, |
| @@ -255,323 +328,430 @@ | |
| 255 | if edge_label is None or rel.get("type") == edge_label: |
| 256 | return True |
| 257 | return False |
| 258 | |
| 259 | |
| 260 | class FalkorDBStore(GraphStore): |
| 261 | """FalkorDB Lite-backed graph store. Requires falkordblite package.""" |
| 262 | |
| 263 | def __init__(self, db_path: Union[str, Path]) -> None: |
| 264 | # Patch redis 7.x compat: UnixDomainSocketConnection missing 'port' |
| 265 | import redis.connection |
| 266 | |
| 267 | if not hasattr(redis.connection.UnixDomainSocketConnection, "port"): |
| 268 | redis.connection.UnixDomainSocketConnection.port = 0 |
| 269 | |
| 270 | from redislite import FalkorDB |
| 271 | |
| 272 | self._db_path = str(db_path) |
| 273 | self._db = FalkorDB(self._db_path) |
| 274 | self._graph = self._db.select_graph("knowledge") |
| 275 | self._ensure_indexes() |
| 276 | |
| 277 | def _ensure_indexes(self) -> None: |
| 278 | for query in [ |
| 279 | "CREATE INDEX FOR (e:Entity) ON (e.name_lower)", |
| 280 | "CREATE INDEX FOR (e:Entity) ON (e.type)", |
| 281 | "CREATE INDEX FOR (e:Entity) ON (e.dag_id)", |
| 282 | ]: |
| 283 | try: |
| 284 | self._graph.query(query) |
| 285 | except Exception: |
| 286 | pass # index already exists |
| 287 | |
| 288 | def merge_entity( |
| 289 | self, |
| 290 | name: str, |
| 291 | entity_type: str, |
| 292 | descriptions: List[str], |
| 293 | source: Optional[str] = None, |
| 294 | ) -> None: |
| 295 | name_lower = name.lower() |
| 296 | |
| 297 | # Check if entity exists |
| 298 | result = self._graph.query( |
| 299 | "MATCH (e:Entity {name_lower: $name_lower}) RETURN e.descriptions", |
| 300 | params={"name_lower": name_lower}, |
| 301 | ) |
| 302 | |
| 303 | if result.result_set: |
| 304 | # Entity exists — merge descriptions |
| 305 | existing_descs = result.result_set[0][0] or [] |
| 306 | merged = list(set(existing_descs + descriptions)) |
| 307 | self._graph.query( |
| 308 | "MATCH (e:Entity {name_lower: $name_lower}) SET e.descriptions = $descs", |
| 309 | params={"name_lower": name_lower, "descs": merged}, |
| 310 | ) |
| 311 | else: |
| 312 | # Create new entity |
| 313 | self._graph.query( |
| 314 | "CREATE (e:Entity {" |
| 315 | "name: $name, name_lower: $name_lower, type: $type, " |
| 316 | "descriptions: $descs, source: $source" |
| 317 | "})", |
| 318 | params={ |
| 319 | "name": name, |
| 320 | "name_lower": name_lower, |
| 321 | "type": entity_type, |
| 322 | "descs": descriptions, |
| 323 | "source": source, |
| 324 | }, |
| 325 | ) |
| 326 | |
| 327 | def add_occurrence( |
| 328 | self, |
| 329 | entity_name: str, |
| 330 | source: str, |
| 331 | timestamp: Optional[float] = None, |
| 332 | text: Optional[str] = None, |
| 333 | ) -> None: |
| 334 | name_lower = entity_name.lower() |
| 335 | self._graph.query( |
| 336 | "MATCH (e:Entity {name_lower: $name_lower}) " |
| 337 | "CREATE (o:Occurrence {source: $source, timestamp: $timestamp, text: $text}) " |
| 338 | "CREATE (e)-[:OCCURRED_IN]->(o)", |
| 339 | params={ |
| 340 | "name_lower": name_lower, |
| 341 | "source": source, |
| 342 | "timestamp": timestamp, |
| 343 | "text": text, |
| 344 | }, |
| 345 | ) |
| 346 | |
| 347 | def add_relationship( |
| 348 | self, |
| 349 | source: str, |
| 350 | target: str, |
| 351 | rel_type: str, |
| 352 | content_source: Optional[str] = None, |
| 353 | timestamp: Optional[float] = None, |
| 354 | ) -> None: |
| 355 | self._graph.query( |
| 356 | "MATCH (a:Entity {name_lower: $src_lower}) " |
| 357 | "MATCH (b:Entity {name_lower: $tgt_lower}) " |
| 358 | "CREATE (a)-[:RELATED_TO {" |
| 359 | "rel_type: $rel_type, content_source: $content_source, timestamp: $timestamp" |
| 360 | "}]->(b)", |
| 361 | params={ |
| 362 | "src_lower": source.lower(), |
| 363 | "tgt_lower": target.lower(), |
| 364 | "rel_type": rel_type, |
| 365 | "content_source": content_source, |
| 366 | "timestamp": timestamp, |
| 367 | }, |
| 368 | ) |
| 369 | |
| 370 | def get_entity(self, name: str) -> Optional[Dict[str, Any]]: |
| 371 | result = self._graph.query( |
| 372 | "MATCH (e:Entity {name_lower: $name_lower}) " |
| 373 | "RETURN e.name, e.type, e.descriptions, e.source", |
| 374 | params={"name_lower": name.lower()}, |
| 375 | ) |
| 376 | if not result.result_set: |
| 377 | return None |
| 378 | |
| 379 | row = result.result_set[0] |
| 380 | entity_name = row[0] |
| 381 | |
| 382 | # Fetch occurrences |
| 383 | occ_result = self._graph.query( |
| 384 | "MATCH (e:Entity {name_lower: $name_lower})-[:OCCURRED_IN]->(o:Occurrence) " |
| 385 | "RETURN o.source, o.timestamp, o.text", |
| 386 | params={"name_lower": name.lower()}, |
| 387 | ) |
| 388 | occurrences = [ |
| 389 | {"source": o[0], "timestamp": o[1], "text": o[2]} for o in occ_result.result_set |
| 390 | ] |
| 391 | |
| 392 | return { |
| 393 | "id": entity_name, |
| 394 | "name": entity_name, |
| 395 | "type": row[1] or "concept", |
| 396 | "descriptions": row[2] or [], |
| 397 | "occurrences": occurrences, |
| 398 | "source": row[3], |
| 399 | } |
| 400 | |
| 401 | def get_all_entities(self) -> List[Dict[str, Any]]: |
| 402 | result = self._graph.query( |
| 403 | "MATCH (e:Entity) RETURN e.name, e.name_lower, e.type, e.descriptions, e.source" |
| 404 | ) |
| 405 | entities = [] |
| 406 | for row in result.result_set: |
| 407 | name_lower = row[1] |
| 408 | # Fetch occurrences for this entity |
| 409 | occ_result = self._graph.query( |
| 410 | "MATCH (e:Entity {name_lower: $name_lower})-[:OCCURRED_IN]->(o:Occurrence) " |
| 411 | "RETURN o.source, o.timestamp, o.text", |
| 412 | params={"name_lower": name_lower}, |
| 413 | ) |
| 414 | occurrences = [ |
| 415 | {"source": o[0], "timestamp": o[1], "text": o[2]} for o in occ_result.result_set |
| 416 | ] |
| 417 | entities.append( |
| 418 | { |
| 419 | "id": row[0], |
| 420 | "name": row[0], |
| 421 | "type": row[2] or "concept", |
| 422 | "descriptions": row[3] or [], |
| 423 | "occurrences": occurrences, |
| 424 | "source": row[4], |
| 425 | } |
| 426 | ) |
| 427 | return entities |
| 428 | |
| 429 | def get_all_relationships(self) -> List[Dict[str, Any]]: |
| 430 | result = self._graph.query( |
| 431 | "MATCH (a:Entity)-[r:RELATED_TO]->(b:Entity) " |
| 432 | "RETURN a.name, b.name, r.rel_type, r.content_source, r.timestamp" |
| 433 | ) |
| 434 | return [ |
| 435 | { |
| 436 | "source": row[0], |
| 437 | "target": row[1], |
| 438 | "type": row[2] or "related_to", |
| 439 | "content_source": row[3], |
| 440 | "timestamp": row[4], |
| 441 | } |
| 442 | for row in result.result_set |
| 443 | ] |
| 444 | |
| 445 | def get_entity_count(self) -> int: |
| 446 | result = self._graph.query("MATCH (e:Entity) RETURN count(e)") |
| 447 | return result.result_set[0][0] if result.result_set else 0 |
| 448 | |
| 449 | def get_relationship_count(self) -> int: |
| 450 | result = self._graph.query("MATCH ()-[r]->() RETURN count(r)") |
| 451 | count = result.result_set[0][0] if result.result_set else 0 |
| 452 | # Subtract occurrence edges which are internal bookkeeping |
| 453 | occ_result = self._graph.query("MATCH ()-[r:OCCURRED_IN]->() RETURN count(r)") |
| 454 | occ_count = occ_result.result_set[0][0] if occ_result.result_set else 0 |
| 455 | return count - occ_count |
| 456 | |
| 457 | def has_entity(self, name: str) -> bool: |
| 458 | result = self._graph.query( |
| 459 | "MATCH (e:Entity {name_lower: $name_lower}) RETURN count(e)", |
| 460 | params={"name_lower": name.lower()}, |
| 461 | ) |
| 462 | return result.result_set[0][0] > 0 if result.result_set else False |
| 463 | |
| 464 | def raw_query(self, query_string: str) -> Any: |
| 465 | """Execute a raw Cypher query and return the result set.""" |
| 466 | result = self._graph.query(query_string) |
| 467 | return result.result_set |
| 468 | |
| 469 | def add_typed_relationship( |
| 470 | self, |
| 471 | source: str, |
| 472 | target: str, |
| 473 | edge_label: str, |
| 474 | properties: Optional[Dict[str, Any]] = None, |
| 475 | ) -> None: |
| 476 | props = properties or {} |
| 477 | # Build property string for Cypher SET clause |
| 478 | prop_assignments = [] |
| 479 | params: Dict[str, Any] = { |
| 480 | "src_lower": source.lower(), |
| 481 | "tgt_lower": target.lower(), |
| 482 | } |
| 483 | for i, (k, v) in enumerate(props.items()): |
| 484 | param_name = f"prop_{i}" |
| 485 | prop_assignments.append(f"r.{k} = ${param_name}") |
| 486 | params[param_name] = v |
| 487 | |
| 488 | set_clause = "" |
| 489 | if prop_assignments: |
| 490 | set_clause = " SET " + ", ".join(prop_assignments) |
| 491 | |
| 492 | # FalkorDB requires static relationship types in CREATE, so we use |
| 493 | # a parameterized approach with specific known labels |
| 494 | query = ( |
| 495 | f"MATCH (a:Entity {{name_lower: $src_lower}}) " |
| 496 | f"MATCH (b:Entity {{name_lower: $tgt_lower}}) " |
| 497 | f"CREATE (a)-[r:{edge_label}]->(b)" |
| 498 | f"{set_clause}" |
| 499 | ) |
| 500 | self._graph.query(query, params=params) |
| 501 | |
| 502 | def set_entity_properties( |
| 503 | self, |
| 504 | name: str, |
| 505 | properties: Dict[str, Any], |
| 506 | ) -> bool: |
| 507 | name_lower = name.lower() |
| 508 | # Check entity exists |
| 509 | if not self.has_entity(name): |
| 510 | return False |
| 511 | |
| 512 | params: Dict[str, Any] = {"name_lower": name_lower} |
| 513 | set_parts = [] |
| 514 | for i, (k, v) in enumerate(properties.items()): |
| 515 | param_name = f"prop_{i}" |
| 516 | set_parts.append(f"e.{k} = ${param_name}") |
| 517 | params[param_name] = v |
| 518 | |
| 519 | if not set_parts: |
| 520 | return True |
| 521 | |
| 522 | query = f"MATCH (e:Entity {{name_lower: $name_lower}}) SET {', '.join(set_parts)}" |
| 523 | self._graph.query(query, params=params) |
| 524 | return True |
| 525 | |
| 526 | def has_relationship( |
| 527 | self, |
| 528 | source: str, |
| 529 | target: str, |
| 530 | edge_label: Optional[str] = None, |
| 531 | ) -> bool: |
| 532 | params = { |
| 533 | "src_lower": source.lower(), |
| 534 | "tgt_lower": target.lower(), |
| 535 | } |
| 536 | if edge_label: |
| 537 | query = ( |
| 538 | f"MATCH (a:Entity {{name_lower: $src_lower}})" |
| 539 | f"-[:{edge_label}]->" |
| 540 | f"(b:Entity {{name_lower: $tgt_lower}}) " |
| 541 | f"RETURN count(*)" |
| 542 | ) |
| 543 | else: |
| 544 | query = ( |
| 545 | "MATCH (a:Entity {name_lower: $src_lower})" |
| 546 | "-[]->" |
| 547 | "(b:Entity {name_lower: $tgt_lower}) " |
| 548 | "RETURN count(*)" |
| 549 | ) |
| 550 | result = self._graph.query(query, params=params) |
| 551 | return result.result_set[0][0] > 0 if result.result_set else False |
| 552 | |
| 553 | def close(self) -> None: |
| 554 | """Release references. FalkorDB Lite handles persistence automatically.""" |
| 555 | self._graph = None |
| 556 | self._db = None |
| 557 | |
| 558 | |
| 559 | def create_store(db_path: Optional[Union[str, Path]] = None) -> GraphStore: |
| 560 | """Create the best available graph store. |
| 561 | |
| 562 | If db_path is provided and falkordblite is installed, uses FalkorDBStore. |
| 563 | Otherwise falls back to InMemoryStore. |
| 564 | """ |
| 565 | if db_path is not None: |
| 566 | try: |
| 567 | return FalkorDBStore(db_path) |
| 568 | except ImportError: |
| 569 | logger.info( |
| 570 | "falkordblite not installed, falling back to in-memory store. " |
| 571 | "Install with: pip install planopticon[graph]" |
| 572 | ) |
| 573 | except Exception as e: |
| 574 | logger.warning( |
| 575 | f"Failed to initialize FalkorDB at {db_path}: {e}. Using in-memory store." |
| 576 | ) |
| 577 | return InMemoryStore() |
| 578 |
| --- video_processor/integrators/graph_store.py | |
| +++ video_processor/integrators/graph_store.py | |
| @@ -1,8 +1,10 @@ | |
| 1 | """Graph storage backends for PlanOpticon knowledge graphs.""" |
| 2 | |
| 3 | import json |
| 4 | import logging |
| 5 | import sqlite3 |
| 6 | from abc import ABC, abstractmethod |
| 7 | from pathlib import Path |
| 8 | from typing import Any, Dict, List, Optional, Union |
| 9 | |
| 10 | logger = logging.getLogger(__name__) |
| @@ -108,13 +110,39 @@ | |
| 110 | """Check if a relationship exists between two entities. |
| 111 | |
| 112 | If edge_label is None, checks for any relationship type. |
| 113 | """ |
| 114 | ... |
| 115 | |
| 116 | def register_source(self, source: Dict[str, Any]) -> None: |
| 117 | """Register a content source. Default no-op for backends that don't support it.""" |
| 118 | pass |
| 119 | |
| 120 | def get_sources(self) -> List[Dict[str, Any]]: |
| 121 | """Return all registered sources.""" |
| 122 | return [] |
| 123 | |
| 124 | def get_source(self, source_id: str) -> Optional[Dict[str, Any]]: |
| 125 | """Get a source by ID.""" |
| 126 | return None |
| 127 | |
| 128 | def add_source_location( |
| 129 | self, |
| 130 | source_id: str, |
| 131 | entity_name_lower: Optional[str] = None, |
| 132 | relationship_id: Optional[int] = None, |
| 133 | **kwargs, |
| 134 | ) -> None: |
| 135 | """Link a source to an entity or relationship with location details.""" |
| 136 | pass |
| 137 | |
| 138 | def get_entity_provenance(self, name: str) -> List[Dict[str, Any]]: |
| 139 | """Get all source locations for an entity.""" |
| 140 | return [] |
| 141 | |
| 142 | def raw_query(self, query_string: str) -> Any: |
| 143 | """Execute a raw query against the backend (e.g. SQL for SQLite). |
| 144 | |
| 145 | Not supported by all backends — raises NotImplementedError by default. |
| 146 | """ |
| 147 | raise NotImplementedError(f"{type(self).__name__} does not support raw queries") |
| 148 | |
| @@ -133,19 +161,25 @@ | |
| 161 | "type": e.get("type", "concept"), |
| 162 | "descriptions": descs, |
| 163 | "occurrences": e.get("occurrences", []), |
| 164 | } |
| 165 | ) |
| 166 | result = {"nodes": nodes, "relationships": self.get_all_relationships()} |
| 167 | sources = self.get_sources() |
| 168 | if sources: |
| 169 | result["sources"] = sources |
| 170 | return result |
| 171 | |
| 172 | |
| 173 | class InMemoryStore(GraphStore): |
| 174 | """In-memory graph store using Python dicts. Default fallback.""" |
| 175 | |
| 176 | def __init__(self) -> None: |
| 177 | self._nodes: Dict[str, Dict[str, Any]] = {} # keyed by name.lower() |
| 178 | self._relationships: List[Dict[str, Any]] = [] |
| 179 | self._sources: Dict[str, Dict[str, Any]] = {} # keyed by source_id |
| 180 | self._source_locations: List[Dict[str, Any]] = [] |
| 181 | |
| 182 | def merge_entity( |
| 183 | self, |
| 184 | name: str, |
| 185 | entity_type: str, |
| @@ -154,10 +188,12 @@ | |
| 188 | ) -> None: |
| 189 | key = name.lower() |
| 190 | if key in self._nodes: |
| 191 | if descriptions: |
| 192 | self._nodes[key]["descriptions"].update(descriptions) |
| 193 | if entity_type and entity_type != self._nodes[key]["type"]: |
| 194 | self._nodes[key]["type"] = entity_type |
| 195 | else: |
| 196 | self._nodes[key] = { |
| 197 | "id": name, |
| 198 | "name": name, |
| 199 | "type": entity_type, |
| @@ -239,10 +275,47 @@ | |
| 275 | key = name.lower() |
| 276 | if key not in self._nodes: |
| 277 | return False |
| 278 | self._nodes[key].update(properties) |
| 279 | return True |
| 280 | |
| 281 | def register_source(self, source: Dict[str, Any]) -> None: |
| 282 | source_id = source.get("source_id", "") |
| 283 | self._sources[source_id] = dict(source) |
| 284 | |
| 285 | def get_sources(self) -> List[Dict[str, Any]]: |
| 286 | return list(self._sources.values()) |
| 287 | |
| 288 | def get_source(self, source_id: str) -> Optional[Dict[str, Any]]: |
| 289 | return self._sources.get(source_id) |
| 290 | |
| 291 | def add_source_location( |
| 292 | self, |
| 293 | source_id: str, |
| 294 | entity_name_lower: Optional[str] = None, |
| 295 | relationship_id: Optional[int] = None, |
| 296 | **kwargs, |
| 297 | ) -> None: |
| 298 | entry: Dict[str, Any] = { |
| 299 | "source_id": source_id, |
| 300 | "entity_name_lower": entity_name_lower, |
| 301 | "relationship_id": relationship_id, |
| 302 | } |
| 303 | entry.update(kwargs) |
| 304 | self._source_locations.append(entry) |
| 305 | |
| 306 | def get_entity_provenance(self, name: str) -> List[Dict[str, Any]]: |
| 307 | name_lower = name.lower() |
| 308 | results = [] |
| 309 | for loc in self._source_locations: |
| 310 | if loc.get("entity_name_lower") == name_lower: |
| 311 | entry = dict(loc) |
| 312 | src = self._sources.get(loc.get("source_id", "")) |
| 313 | if src: |
| 314 | entry["source"] = src |
| 315 | results.append(entry) |
| 316 | return results |
| 317 | |
| 318 | def has_relationship( |
| 319 | self, |
| 320 | source: str, |
| 321 | target: str, |
| @@ -255,323 +328,430 @@ | |
| 328 | if edge_label is None or rel.get("type") == edge_label: |
| 329 | return True |
| 330 | return False |
| 331 | |
| 332 | |
class SQLiteStore(GraphStore):
    """SQLite-backed graph store. Uses Python's built-in sqlite3 module.

    Entities, occurrences, relationships, sources and source locations live
    in plain relational tables; list/dict-valued fields (descriptions,
    properties, metadata) are stored as JSON-encoded TEXT columns.
    """

    # Idempotent DDL: every statement uses IF NOT EXISTS, so the script can
    # be re-run safely against an existing database file on each startup.
    _SCHEMA = """
    CREATE TABLE IF NOT EXISTS entities (
        name TEXT NOT NULL,
        name_lower TEXT NOT NULL UNIQUE,
        type TEXT NOT NULL DEFAULT 'concept',
        descriptions TEXT NOT NULL DEFAULT '[]',
        source TEXT,
        properties TEXT NOT NULL DEFAULT '{}'
    );
    CREATE TABLE IF NOT EXISTS occurrences (
        entity_name_lower TEXT NOT NULL,
        source TEXT NOT NULL,
        timestamp REAL,
        text TEXT,
        FOREIGN KEY (entity_name_lower) REFERENCES entities(name_lower)
    );
    CREATE TABLE IF NOT EXISTS relationships (
        source TEXT NOT NULL,
        target TEXT NOT NULL,
        type TEXT NOT NULL DEFAULT 'related_to',
        content_source TEXT,
        timestamp REAL,
        properties TEXT NOT NULL DEFAULT '{}'
    );
    CREATE INDEX IF NOT EXISTS idx_entities_name_lower ON entities(name_lower);
    CREATE INDEX IF NOT EXISTS idx_entities_type ON entities(type);
    CREATE INDEX IF NOT EXISTS idx_occurrences_entity ON occurrences(entity_name_lower);
    CREATE INDEX IF NOT EXISTS idx_relationships_source ON relationships(source);
    CREATE INDEX IF NOT EXISTS idx_relationships_target ON relationships(target);

    CREATE TABLE IF NOT EXISTS sources (
        source_id TEXT PRIMARY KEY,
        source_type TEXT NOT NULL,
        title TEXT NOT NULL,
        path TEXT,
        url TEXT,
        mime_type TEXT,
        ingested_at TEXT NOT NULL,
        metadata TEXT NOT NULL DEFAULT '{}'
    );
    CREATE TABLE IF NOT EXISTS source_locations (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        source_id TEXT NOT NULL REFERENCES sources(source_id),
        entity_name_lower TEXT,
        relationship_id INTEGER,
        timestamp REAL,
        page INTEGER,
        section TEXT,
        line_start INTEGER,
        line_end INTEGER,
        text_snippet TEXT
    );
    CREATE INDEX IF NOT EXISTS idx_source_locations_source ON source_locations(source_id);
    CREATE INDEX IF NOT EXISTS idx_source_locations_entity
        ON source_locations(entity_name_lower);
    """
| 392 | |
| 393 | def __init__(self, db_path: Union[str, Path]) -> None: |
| 394 | self._db_path = str(db_path) |
| 395 | self._conn = sqlite3.connect(self._db_path) |
| 396 | self._conn.execute("PRAGMA journal_mode=WAL") |
| 397 | self._conn.execute("PRAGMA foreign_keys=ON") |
| 398 | self._conn.executescript(self._SCHEMA) |
| 399 | self._conn.commit() |
| 400 | |
| 401 | def merge_entity( |
| 402 | self, |
| 403 | name: str, |
| 404 | entity_type: str, |
| 405 | descriptions: List[str], |
| 406 | source: Optional[str] = None, |
| 407 | ) -> None: |
| 408 | name_lower = name.lower() |
| 409 | row = self._conn.execute( |
| 410 | "SELECT descriptions FROM entities WHERE name_lower = ?", |
| 411 | (name_lower,), |
| 412 | ).fetchone() |
| 413 | |
| 414 | if row: |
| 415 | existing = json.loads(row[0]) |
| 416 | merged = list(set(existing + descriptions)) |
| 417 | self._conn.execute( |
| 418 | "UPDATE entities SET descriptions = ?, type = ? WHERE name_lower = ?", |
| 419 | (json.dumps(merged), entity_type, name_lower), |
| 420 | ) |
| 421 | else: |
| 422 | self._conn.execute( |
| 423 | "INSERT INTO entities (name, name_lower, type, descriptions, source) " |
| 424 | "VALUES (?, ?, ?, ?, ?)", |
| 425 | (name, name_lower, entity_type, json.dumps(descriptions), source), |
| 426 | ) |
| 427 | self._conn.commit() |
| 428 | |
| 429 | def add_occurrence( |
| 430 | self, |
| 431 | entity_name: str, |
| 432 | source: str, |
| 433 | timestamp: Optional[float] = None, |
| 434 | text: Optional[str] = None, |
| 435 | ) -> None: |
| 436 | name_lower = entity_name.lower() |
| 437 | exists = self._conn.execute( |
| 438 | "SELECT 1 FROM entities WHERE name_lower = ?", (name_lower,) |
| 439 | ).fetchone() |
| 440 | if not exists: |
| 441 | return |
| 442 | self._conn.execute( |
| 443 | "INSERT INTO occurrences (entity_name_lower, source, timestamp, text) " |
| 444 | "VALUES (?, ?, ?, ?)", |
| 445 | (name_lower, source, timestamp, text), |
| 446 | ) |
| 447 | self._conn.commit() |
| 448 | |
| 449 | def add_relationship( |
| 450 | self, |
| 451 | source: str, |
| 452 | target: str, |
| 453 | rel_type: str, |
| 454 | content_source: Optional[str] = None, |
| 455 | timestamp: Optional[float] = None, |
| 456 | ) -> None: |
| 457 | self._conn.execute( |
| 458 | "INSERT INTO relationships (source, target, type, content_source, timestamp) " |
| 459 | "VALUES (?, ?, ?, ?, ?)", |
| 460 | (source, target, rel_type, content_source, timestamp), |
| 461 | ) |
| 462 | self._conn.commit() |
| 463 | |
| 464 | def get_entity(self, name: str) -> Optional[Dict[str, Any]]: |
| 465 | row = self._conn.execute( |
| 466 | "SELECT name, type, descriptions, source FROM entities WHERE name_lower = ?", |
| 467 | (name.lower(),), |
| 468 | ).fetchone() |
| 469 | if not row: |
| 470 | return None |
| 471 | |
| 472 | entity_name = row[0] |
| 473 | occ_rows = self._conn.execute( |
| 474 | "SELECT source, timestamp, text FROM occurrences WHERE entity_name_lower = ?", |
| 475 | (name.lower(),), |
| 476 | ).fetchall() |
| 477 | occurrences = [{"source": o[0], "timestamp": o[1], "text": o[2]} for o in occ_rows] |
| 478 | |
| 479 | return { |
| 480 | "id": entity_name, |
| 481 | "name": entity_name, |
| 482 | "type": row[1] or "concept", |
| 483 | "descriptions": json.loads(row[2]) if row[2] else [], |
| 484 | "occurrences": occurrences, |
| 485 | "source": row[3], |
| 486 | } |
| 487 | |
| 488 | def get_all_entities(self) -> List[Dict[str, Any]]: |
| 489 | rows = self._conn.execute( |
| 490 | "SELECT name, name_lower, type, descriptions, source FROM entities" |
| 491 | ).fetchall() |
| 492 | entities = [] |
| 493 | for row in rows: |
| 494 | name_lower = row[1] |
| 495 | occ_rows = self._conn.execute( |
| 496 | "SELECT source, timestamp, text FROM occurrences WHERE entity_name_lower = ?", |
| 497 | (name_lower,), |
| 498 | ).fetchall() |
| 499 | occurrences = [{"source": o[0], "timestamp": o[1], "text": o[2]} for o in occ_rows] |
| 500 | entities.append( |
| 501 | { |
| 502 | "id": row[0], |
| 503 | "name": row[0], |
| 504 | "type": row[2] or "concept", |
| 505 | "descriptions": json.loads(row[3]) if row[3] else [], |
| 506 | "occurrences": occurrences, |
| 507 | "source": row[4], |
| 508 | } |
| 509 | ) |
| 510 | return entities |
| 511 | |
| 512 | def get_all_relationships(self) -> List[Dict[str, Any]]: |
| 513 | rows = self._conn.execute( |
| 514 | "SELECT source, target, type, content_source, timestamp FROM relationships" |
| 515 | ).fetchall() |
| 516 | return [ |
| 517 | { |
| 518 | "source": row[0], |
| 519 | "target": row[1], |
| 520 | "type": row[2] or "related_to", |
| 521 | "content_source": row[3], |
| 522 | "timestamp": row[4], |
| 523 | } |
| 524 | for row in rows |
| 525 | ] |
| 526 | |
| 527 | def get_entity_count(self) -> int: |
| 528 | row = self._conn.execute("SELECT COUNT(*) FROM entities").fetchone() |
| 529 | return row[0] if row else 0 |
| 530 | |
| 531 | def get_relationship_count(self) -> int: |
| 532 | row = self._conn.execute("SELECT COUNT(*) FROM relationships").fetchone() |
| 533 | return row[0] if row else 0 |
| 534 | |
| 535 | def has_entity(self, name: str) -> bool: |
| 536 | row = self._conn.execute( |
| 537 | "SELECT 1 FROM entities WHERE name_lower = ?", (name.lower(),) |
| 538 | ).fetchone() |
| 539 | return row is not None |
| 540 | |
| 541 | def raw_query(self, query_string: str) -> Any: |
| 542 | """Execute a raw SQL query and return all rows.""" |
| 543 | cursor = self._conn.execute(query_string) |
| 544 | return cursor.fetchall() |
| 545 | |
| 546 | def add_typed_relationship( |
| 547 | self, |
| 548 | source: str, |
| 549 | target: str, |
| 550 | edge_label: str, |
| 551 | properties: Optional[Dict[str, Any]] = None, |
| 552 | ) -> None: |
| 553 | self._conn.execute( |
| 554 | "INSERT INTO relationships (source, target, type, properties) VALUES (?, ?, ?, ?)", |
| 555 | (source, target, edge_label, json.dumps(properties or {})), |
| 556 | ) |
| 557 | self._conn.commit() |
| 558 | |
| 559 | def set_entity_properties( |
| 560 | self, |
| 561 | name: str, |
| 562 | properties: Dict[str, Any], |
| 563 | ) -> bool: |
| 564 | name_lower = name.lower() |
| 565 | if not self.has_entity(name): |
| 566 | return False |
| 567 | if not properties: |
| 568 | return True |
| 569 | row = self._conn.execute( |
| 570 | "SELECT properties FROM entities WHERE name_lower = ?", (name_lower,) |
| 571 | ).fetchone() |
| 572 | existing = json.loads(row[0]) if row and row[0] else {} |
| 573 | existing.update(properties) |
| 574 | self._conn.execute( |
| 575 | "UPDATE entities SET properties = ? WHERE name_lower = ?", |
| 576 | (json.dumps(existing), name_lower), |
| 577 | ) |
| 578 | self._conn.commit() |
| 579 | return True |
| 580 | |
| 581 | def has_relationship( |
| 582 | self, |
| 583 | source: str, |
| 584 | target: str, |
| 585 | edge_label: Optional[str] = None, |
| 586 | ) -> bool: |
| 587 | if edge_label: |
| 588 | row = self._conn.execute( |
| 589 | "SELECT 1 FROM relationships " |
| 590 | "WHERE LOWER(source) = ? AND LOWER(target) = ? AND type = ?", |
| 591 | (source.lower(), target.lower(), edge_label), |
| 592 | ).fetchone() |
| 593 | else: |
| 594 | row = self._conn.execute( |
| 595 | "SELECT 1 FROM relationships WHERE LOWER(source) = ? AND LOWER(target) = ?", |
| 596 | (source.lower(), target.lower()), |
| 597 | ).fetchone() |
| 598 | return row is not None |
| 599 | |
| 600 | def register_source(self, source: Dict[str, Any]) -> None: |
| 601 | source_id = source.get("source_id", "") |
| 602 | existing = self._conn.execute( |
| 603 | "SELECT 1 FROM sources WHERE source_id = ?", (source_id,) |
| 604 | ).fetchone() |
| 605 | if existing: |
| 606 | self._conn.execute( |
| 607 | "UPDATE sources SET source_type = ?, title = ?, path = ?, url = ?, " |
| 608 | "mime_type = ?, ingested_at = ?, metadata = ? WHERE source_id = ?", |
| 609 | ( |
| 610 | source.get("source_type", ""), |
| 611 | source.get("title", ""), |
| 612 | source.get("path"), |
| 613 | source.get("url"), |
| 614 | source.get("mime_type"), |
| 615 | source.get("ingested_at", ""), |
| 616 | json.dumps(source.get("metadata", {})), |
| 617 | source_id, |
| 618 | ), |
| 619 | ) |
| 620 | else: |
| 621 | self._conn.execute( |
| 622 | "INSERT INTO sources (source_id, source_type, title, path, url, " |
| 623 | "mime_type, ingested_at, metadata) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", |
| 624 | ( |
| 625 | source_id, |
| 626 | source.get("source_type", ""), |
| 627 | source.get("title", ""), |
| 628 | source.get("path"), |
| 629 | source.get("url"), |
| 630 | source.get("mime_type"), |
| 631 | source.get("ingested_at", ""), |
| 632 | json.dumps(source.get("metadata", {})), |
| 633 | ), |
| 634 | ) |
| 635 | self._conn.commit() |
| 636 | |
| 637 | def get_sources(self) -> List[Dict[str, Any]]: |
| 638 | rows = self._conn.execute( |
| 639 | "SELECT source_id, source_type, title, path, url, mime_type, " |
| 640 | "ingested_at, metadata FROM sources" |
| 641 | ).fetchall() |
| 642 | return [ |
| 643 | { |
| 644 | "source_id": r[0], |
| 645 | "source_type": r[1], |
| 646 | "title": r[2], |
| 647 | "path": r[3], |
| 648 | "url": r[4], |
| 649 | "mime_type": r[5], |
| 650 | "ingested_at": r[6], |
| 651 | "metadata": json.loads(r[7]) if r[7] else {}, |
| 652 | } |
| 653 | for r in rows |
| 654 | ] |
| 655 | |
| 656 | def get_source(self, source_id: str) -> Optional[Dict[str, Any]]: |
| 657 | row = self._conn.execute( |
| 658 | "SELECT source_id, source_type, title, path, url, mime_type, " |
| 659 | "ingested_at, metadata FROM sources WHERE source_id = ?", |
| 660 | (source_id,), |
| 661 | ).fetchone() |
| 662 | if not row: |
| 663 | return None |
| 664 | return { |
| 665 | "source_id": row[0], |
| 666 | "source_type": row[1], |
| 667 | "title": row[2], |
| 668 | "path": row[3], |
| 669 | "url": row[4], |
| 670 | "mime_type": row[5], |
| 671 | "ingested_at": row[6], |
| 672 | "metadata": json.loads(row[7]) if row[7] else {}, |
| 673 | } |
| 674 | |
| 675 | def add_source_location( |
| 676 | self, |
| 677 | source_id: str, |
| 678 | entity_name_lower: Optional[str] = None, |
| 679 | relationship_id: Optional[int] = None, |
| 680 | **kwargs, |
| 681 | ) -> None: |
| 682 | self._conn.execute( |
| 683 | "INSERT INTO source_locations (source_id, entity_name_lower, relationship_id, " |
| 684 | "timestamp, page, section, line_start, line_end, text_snippet) " |
| 685 | "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", |
| 686 | ( |
| 687 | source_id, |
| 688 | entity_name_lower, |
| 689 | relationship_id, |
| 690 | kwargs.get("timestamp"), |
| 691 | kwargs.get("page"), |
| 692 | kwargs.get("section"), |
| 693 | kwargs.get("line_start"), |
| 694 | kwargs.get("line_end"), |
| 695 | kwargs.get("text_snippet"), |
| 696 | ), |
| 697 | ) |
| 698 | self._conn.commit() |
| 699 | |
| 700 | def get_entity_provenance(self, name: str) -> List[Dict[str, Any]]: |
| 701 | name_lower = name.lower() |
| 702 | rows = self._conn.execute( |
| 703 | "SELECT sl.source_id, sl.entity_name_lower, sl.relationship_id, " |
| 704 | "sl.timestamp, sl.page, sl.section, sl.line_start, sl.line_end, " |
| 705 | "sl.text_snippet, s.source_type, s.title, s.path, s.url, s.mime_type, " |
| 706 | "s.ingested_at, s.metadata " |
| 707 | "FROM source_locations sl " |
| 708 | "JOIN sources s ON sl.source_id = s.source_id " |
| 709 | "WHERE sl.entity_name_lower = ?", |
| 710 | (name_lower,), |
| 711 | ).fetchall() |
| 712 | results = [] |
| 713 | for r in rows: |
| 714 | results.append( |
| 715 | { |
| 716 | "source_id": r[0], |
| 717 | "entity_name_lower": r[1], |
| 718 | "relationship_id": r[2], |
| 719 | "timestamp": r[3], |
| 720 | "page": r[4], |
| 721 | "section": r[5], |
| 722 | "line_start": r[6], |
| 723 | "line_end": r[7], |
| 724 | "text_snippet": r[8], |
| 725 | "source": { |
| 726 | "source_id": r[0], |
| 727 | "source_type": r[9], |
| 728 | "title": r[10], |
| 729 | "path": r[11], |
| 730 | "url": r[12], |
| 731 | "mime_type": r[13], |
| 732 | "ingested_at": r[14], |
| 733 | "metadata": json.loads(r[15]) if r[15] else {}, |
| 734 | }, |
| 735 | } |
| 736 | ) |
| 737 | return results |
| 738 | |
| 739 | def close(self) -> None: |
| 740 | """Close the SQLite connection.""" |
| 741 | if self._conn: |
| 742 | self._conn.close() |
| 743 | self._conn = None |
| 744 | |
| 745 | |
def create_store(db_path: Optional[Union[str, Path]] = None) -> GraphStore:
    """Create the best available graph store.

    If db_path is provided, uses SQLiteStore for persistent storage.
    Otherwise returns an InMemoryStore.

    Improvements: guard clause for the no-path case, and lazy %-style
    logging arguments so the message is only built when WARNING is enabled.
    """
    if db_path is None:
        return InMemoryStore()
    try:
        return SQLiteStore(db_path)
    except Exception as e:
        # Fall back rather than fail hard: an unwritable or locked path
        # should not prevent in-memory operation.
        logger.warning(
            "Failed to initialize SQLite at %s: %s. Using in-memory store.",
            db_path,
            e,
        )
        return InMemoryStore()
| 758 |
| --- video_processor/integrators/knowledge_graph.py | ||
| +++ video_processor/integrators/knowledge_graph.py | ||
| @@ -5,11 +5,11 @@ | ||
| 5 | 5 | from typing import Dict, List, Optional, Union |
| 6 | 6 | |
| 7 | 7 | from tqdm import tqdm |
| 8 | 8 | |
| 9 | 9 | from video_processor.integrators.graph_store import GraphStore, create_store |
| 10 | -from video_processor.models import Entity, KnowledgeGraphData, Relationship | |
| 10 | +from video_processor.models import Entity, KnowledgeGraphData, Relationship, SourceRecord | |
| 11 | 11 | from video_processor.providers.manager import ProviderManager |
| 12 | 12 | from video_processor.utils.json_parsing import parse_json_from_response |
| 13 | 13 | |
| 14 | 14 | logger = logging.getLogger(__name__) |
| 15 | 15 | |
| @@ -23,10 +23,14 @@ | ||
| 23 | 23 | db_path: Optional[Path] = None, |
| 24 | 24 | store: Optional[GraphStore] = None, |
| 25 | 25 | ): |
| 26 | 26 | self.pm = provider_manager |
| 27 | 27 | self._store = store or create_store(db_path) |
| 28 | + | |
| 29 | + def register_source(self, source: Dict) -> None: | |
| 30 | + """Register a content source for provenance tracking.""" | |
| 31 | + self._store.register_source(source) | |
| 28 | 32 | |
| 29 | 33 | @property |
| 30 | 34 | def nodes(self) -> Dict[str, dict]: |
| 31 | 35 | """Backward-compatible read access to nodes as a dict keyed by entity name.""" |
| 32 | 36 | result = {} |
| @@ -111,19 +115,32 @@ | ||
| 111 | 115 | ) |
| 112 | 116 | ) |
| 113 | 117 | |
| 114 | 118 | return entities, rels |
| 115 | 119 | |
| 116 | - def add_content(self, text: str, source: str, timestamp: Optional[float] = None) -> None: | |
| 120 | + def add_content( | |
| 121 | + self, | |
| 122 | + text: str, | |
| 123 | + source: str, | |
| 124 | + timestamp: Optional[float] = None, | |
| 125 | + source_id: Optional[str] = None, | |
| 126 | + ) -> None: | |
| 117 | 127 | """Add content to knowledge graph by extracting entities and relationships.""" |
| 118 | 128 | entities, relationships = self.extract_entities_and_relationships(text) |
| 119 | 129 | |
| 120 | 130 | snippet = text[:100] + "..." if len(text) > 100 else text |
| 121 | 131 | |
| 122 | 132 | for entity in entities: |
| 123 | 133 | self._store.merge_entity(entity.name, entity.type, entity.descriptions, source=source) |
| 124 | 134 | self._store.add_occurrence(entity.name, source, timestamp, snippet) |
| 135 | + if source_id: | |
| 136 | + self._store.add_source_location( | |
| 137 | + source_id, | |
| 138 | + entity_name_lower=entity.name.lower(), | |
| 139 | + timestamp=timestamp, | |
| 140 | + text_snippet=snippet, | |
| 141 | + ) | |
| 125 | 142 | |
| 126 | 143 | for rel in relationships: |
| 127 | 144 | if self._store.has_entity(rel.source) and self._store.has_entity(rel.target): |
| 128 | 145 | self._store.add_relationship( |
| 129 | 146 | rel.source, |
| @@ -206,25 +223,69 @@ | ||
| 206 | 223 | content_source=r.get("content_source"), |
| 207 | 224 | timestamp=r.get("timestamp"), |
| 208 | 225 | ) |
| 209 | 226 | for r in self._store.get_all_relationships() |
| 210 | 227 | ] |
| 211 | - return KnowledgeGraphData(nodes=nodes, relationships=rels) | |
| 228 | + | |
| 229 | + sources = [SourceRecord(**s) for s in self._store.get_sources()] | |
| 230 | + | |
| 231 | + return KnowledgeGraphData(nodes=nodes, relationships=rels, sources=sources) | |
| 212 | 232 | |
| 213 | 233 | def to_dict(self) -> Dict: |
| 214 | 234 | """Convert knowledge graph to dictionary (backward-compatible).""" |
| 215 | 235 | return self._store.to_dict() |
| 216 | 236 | |
| 217 | 237 | def save(self, output_path: Union[str, Path]) -> Path: |
| 218 | - """Save knowledge graph to JSON file.""" | |
| 238 | + """Save knowledge graph. Defaults to .db (SQLite), also supports .json.""" | |
| 219 | 239 | output_path = Path(output_path) |
| 220 | 240 | if not output_path.suffix: |
| 221 | - output_path = output_path.with_suffix(".json") | |
| 241 | + output_path = output_path.with_suffix(".db") | |
| 222 | 242 | output_path.parent.mkdir(parents=True, exist_ok=True) |
| 223 | 243 | |
| 224 | - data = self.to_data() | |
| 225 | - output_path.write_text(data.model_dump_json(indent=2)) | |
| 244 | + if output_path.suffix == ".json": | |
| 245 | + data = self.to_data() | |
| 246 | + output_path.write_text(data.model_dump_json(indent=2)) | |
| 247 | + elif output_path.suffix == ".db": | |
| 248 | + # If the backing store is already SQLite at this path, it's already persisted. | |
| 249 | + # Otherwise, create a new SQLite store and copy data into it. | |
| 250 | + from video_processor.integrators.graph_store import SQLiteStore | |
| 251 | + | |
| 252 | + if not isinstance(self._store, SQLiteStore) or self._store._db_path != str(output_path): | |
| 253 | + target = SQLiteStore(output_path) | |
| 254 | + for source in self._store.get_sources(): | |
| 255 | + target.register_source(source) | |
| 256 | + for entity in self._store.get_all_entities(): | |
| 257 | + descs = entity.get("descriptions", []) | |
| 258 | + if isinstance(descs, set): | |
| 259 | + descs = list(descs) | |
| 260 | + target.merge_entity( | |
| 261 | + entity["name"], | |
| 262 | + entity.get("type", "concept"), | |
| 263 | + descs, | |
| 264 | + source=entity.get("source"), | |
| 265 | + ) | |
| 266 | + for occ in entity.get("occurrences", []): | |
| 267 | + target.add_occurrence( | |
| 268 | + entity["name"], | |
| 269 | + occ.get("source", ""), | |
| 270 | + occ.get("timestamp"), | |
| 271 | + occ.get("text"), | |
| 272 | + ) | |
| 273 | + for rel in self._store.get_all_relationships(): | |
| 274 | + target.add_relationship( | |
| 275 | + rel.get("source", ""), | |
| 276 | + rel.get("target", ""), | |
| 277 | + rel.get("type", "related_to"), | |
| 278 | + content_source=rel.get("content_source"), | |
| 279 | + timestamp=rel.get("timestamp"), | |
| 280 | + ) | |
| 281 | + target.close() | |
| 282 | + else: | |
| 283 | + # Unknown suffix — fall back to JSON | |
| 284 | + data = self.to_data() | |
| 285 | + output_path.write_text(data.model_dump_json(indent=2)) | |
| 286 | + | |
| 226 | 287 | logger.info( |
| 227 | 288 | f"Saved knowledge graph with {self._store.get_entity_count()} nodes " |
| 228 | 289 | f"and {self._store.get_relationship_count()} relationships to {output_path}" |
| 229 | 290 | ) |
| 230 | 291 | return output_path |
| @@ -231,10 +292,12 @@ | ||
| 231 | 292 | |
| 232 | 293 | @classmethod |
| 233 | 294 | def from_dict(cls, data: Dict, db_path: Optional[Path] = None) -> "KnowledgeGraph": |
| 234 | 295 | """Reconstruct a KnowledgeGraph from saved JSON dict.""" |
| 235 | 296 | kg = cls(db_path=db_path) |
| 297 | + for source in data.get("sources", []): | |
| 298 | + kg._store.register_source(source) | |
| 236 | 299 | for node in data.get("nodes", []): |
| 237 | 300 | name = node.get("name", node.get("id", "")) |
| 238 | 301 | descs = node.get("descriptions", []) |
| 239 | 302 | if isinstance(descs, set): |
| 240 | 303 | descs = list(descs) |
| @@ -256,23 +319,93 @@ | ||
| 256 | 319 | content_source=rel.get("content_source"), |
| 257 | 320 | timestamp=rel.get("timestamp"), |
| 258 | 321 | ) |
| 259 | 322 | return kg |
| 260 | 323 | |
| 324 | + # Type specificity ranking for conflict resolution during merge. | |
| 325 | + # Higher rank = more specific type wins when two entities match. | |
| 326 | + _TYPE_SPECIFICITY = { | |
| 327 | + "concept": 0, | |
| 328 | + "time": 1, | |
| 329 | + "diagram": 1, | |
| 330 | + "organization": 2, | |
| 331 | + "person": 3, | |
| 332 | + "technology": 3, | |
| 333 | + } | |
| 334 | + | |
| 335 | + @staticmethod | |
| 336 | + def _fuzzy_match(name_a: str, name_b: str, threshold: float = 0.85) -> bool: | |
| 337 | + """Return True if two names are similar enough to be considered the same entity.""" | |
| 338 | + from difflib import SequenceMatcher | |
| 339 | + | |
| 340 | + return SequenceMatcher(None, name_a.lower(), name_b.lower()).ratio() >= threshold | |
| 341 | + | |
| 342 | + def _more_specific_type(self, type_a: str, type_b: str) -> str: | |
| 343 | + """Return the more specific of two entity types.""" | |
| 344 | + rank_a = self._TYPE_SPECIFICITY.get(type_a, 1) | |
| 345 | + rank_b = self._TYPE_SPECIFICITY.get(type_b, 1) | |
| 346 | + return type_a if rank_a >= rank_b else type_b | |
| 347 | + | |
| 261 | 348 | def merge(self, other: "KnowledgeGraph") -> None: |
| 262 | - """Merge another KnowledgeGraph into this one.""" | |
| 349 | + """Merge another KnowledgeGraph into this one. | |
| 350 | + | |
| 351 | + Improvements over naive merge: | |
| 352 | + - Fuzzy name matching (SequenceMatcher >= 0.85) to unify near-duplicate entities | |
| 353 | + - Type conflict resolution: prefer more specific types (e.g. technology > concept) | |
| 354 | + - Provenance: merged entities get a ``merged_from`` description entry | |
| 355 | + """ | |
| 356 | + for source in other._store.get_sources(): | |
| 357 | + self._store.register_source(source) | |
| 358 | + | |
| 359 | + # Build a lookup of existing entity names for fuzzy matching | |
| 360 | + existing_entities = self._store.get_all_entities() | |
| 361 | + existing_names = {e["name"]: e for e in existing_entities} | |
| 362 | + # Cache lowercase -> canonical name for fast lookup | |
| 363 | + name_index: dict[str, str] = {n.lower(): n for n in existing_names} | |
| 364 | + | |
| 263 | 365 | for entity in other._store.get_all_entities(): |
| 264 | - name = entity["name"] | |
| 366 | + incoming_name = entity["name"] | |
| 265 | 367 | descs = entity.get("descriptions", []) |
| 266 | 368 | if isinstance(descs, set): |
| 267 | 369 | descs = list(descs) |
| 268 | - self._store.merge_entity( | |
| 269 | - name, entity.get("type", "concept"), descs, source=entity.get("source") | |
| 270 | - ) | |
| 370 | + incoming_type = entity.get("type", "concept") | |
| 371 | + | |
| 372 | + # Try exact match first (case-insensitive), then fuzzy | |
| 373 | + matched_name: Optional[str] = None | |
| 374 | + if incoming_name.lower() in name_index: | |
| 375 | + matched_name = name_index[incoming_name.lower()] | |
| 376 | + else: | |
| 377 | + for existing_name in existing_names: | |
| 378 | + if self._fuzzy_match(incoming_name, existing_name): | |
| 379 | + matched_name = existing_name | |
| 380 | + break | |
| 381 | + | |
| 382 | + if matched_name is not None: | |
| 383 | + # Resolve type conflict | |
| 384 | + existing_type = existing_names[matched_name].get("type", "concept") | |
| 385 | + resolved_type = self._more_specific_type(existing_type, incoming_type) | |
| 386 | + | |
| 387 | + # Add merge provenance | |
| 388 | + merge_note = f"merged_from:{incoming_name}" | |
| 389 | + merged_descs = descs if incoming_name == matched_name else descs + [merge_note] | |
| 390 | + | |
| 391 | + self._store.merge_entity( | |
| 392 | + matched_name, resolved_type, merged_descs, source=entity.get("source") | |
| 393 | + ) | |
| 394 | + target_name = matched_name | |
| 395 | + else: | |
| 396 | + self._store.merge_entity( | |
| 397 | + incoming_name, incoming_type, descs, source=entity.get("source") | |
| 398 | + ) | |
| 399 | + # Update indexes for subsequent fuzzy matches within this merge | |
| 400 | + existing_names[incoming_name] = entity | |
| 401 | + name_index[incoming_name.lower()] = incoming_name | |
| 402 | + target_name = incoming_name | |
| 403 | + | |
| 271 | 404 | for occ in entity.get("occurrences", []): |
| 272 | 405 | self._store.add_occurrence( |
| 273 | - name, | |
| 406 | + target_name, | |
| 274 | 407 | occ.get("source", ""), |
| 275 | 408 | occ.get("timestamp"), |
| 276 | 409 | occ.get("text"), |
| 277 | 410 | ) |
| 278 | 411 | |
| @@ -283,10 +416,19 @@ | ||
| 283 | 416 | rel.get("type", "related_to"), |
| 284 | 417 | content_source=rel.get("content_source"), |
| 285 | 418 | timestamp=rel.get("timestamp"), |
| 286 | 419 | ) |
| 287 | 420 | |
| 421 | + def classify_for_planning(self): | |
| 422 | + """Classify entities in this knowledge graph into planning taxonomy types.""" | |
| 423 | + from video_processor.integrators.taxonomy import TaxonomyClassifier | |
| 424 | + | |
| 425 | + classifier = TaxonomyClassifier(provider_manager=self.pm) | |
| 426 | + entities = self._store.get_all_entities() | |
| 427 | + relationships = self._store.get_all_relationships() | |
| 428 | + return classifier.classify_entities(entities, relationships) | |
| 429 | + | |
| 288 | 430 | def generate_mermaid(self, max_nodes: int = 30) -> str: |
| 289 | 431 | """Generate Mermaid visualization code.""" |
| 290 | 432 | nodes = self.nodes |
| 291 | 433 | rels = self.relationships |
| 292 | 434 | |
| 293 | 435 | |
| 294 | 436 | ADDED video_processor/integrators/taxonomy.py |
| --- video_processor/integrators/knowledge_graph.py | |
| +++ video_processor/integrators/knowledge_graph.py | |
| @@ -5,11 +5,11 @@ | |
| 5 | from typing import Dict, List, Optional, Union |
| 6 | |
| 7 | from tqdm import tqdm |
| 8 | |
| 9 | from video_processor.integrators.graph_store import GraphStore, create_store |
| 10 | from video_processor.models import Entity, KnowledgeGraphData, Relationship |
| 11 | from video_processor.providers.manager import ProviderManager |
| 12 | from video_processor.utils.json_parsing import parse_json_from_response |
| 13 | |
| 14 | logger = logging.getLogger(__name__) |
| 15 | |
| @@ -23,10 +23,14 @@ | |
| 23 | db_path: Optional[Path] = None, |
| 24 | store: Optional[GraphStore] = None, |
| 25 | ): |
| 26 | self.pm = provider_manager |
| 27 | self._store = store or create_store(db_path) |
| 28 | |
| 29 | @property |
| 30 | def nodes(self) -> Dict[str, dict]: |
| 31 | """Backward-compatible read access to nodes as a dict keyed by entity name.""" |
| 32 | result = {} |
| @@ -111,19 +115,32 @@ | |
| 111 | ) |
| 112 | ) |
| 113 | |
| 114 | return entities, rels |
| 115 | |
| 116 | def add_content(self, text: str, source: str, timestamp: Optional[float] = None) -> None: |
| 117 | """Add content to knowledge graph by extracting entities and relationships.""" |
| 118 | entities, relationships = self.extract_entities_and_relationships(text) |
| 119 | |
| 120 | snippet = text[:100] + "..." if len(text) > 100 else text |
| 121 | |
| 122 | for entity in entities: |
| 123 | self._store.merge_entity(entity.name, entity.type, entity.descriptions, source=source) |
| 124 | self._store.add_occurrence(entity.name, source, timestamp, snippet) |
| 125 | |
| 126 | for rel in relationships: |
| 127 | if self._store.has_entity(rel.source) and self._store.has_entity(rel.target): |
| 128 | self._store.add_relationship( |
| 129 | rel.source, |
| @@ -206,25 +223,69 @@ | |
| 206 | content_source=r.get("content_source"), |
| 207 | timestamp=r.get("timestamp"), |
| 208 | ) |
| 209 | for r in self._store.get_all_relationships() |
| 210 | ] |
| 211 | return KnowledgeGraphData(nodes=nodes, relationships=rels) |
| 212 | |
| 213 | def to_dict(self) -> Dict: |
| 214 | """Convert knowledge graph to dictionary (backward-compatible).""" |
| 215 | return self._store.to_dict() |
| 216 | |
| 217 | def save(self, output_path: Union[str, Path]) -> Path: |
| 218 | """Save knowledge graph to JSON file.""" |
| 219 | output_path = Path(output_path) |
| 220 | if not output_path.suffix: |
| 221 | output_path = output_path.with_suffix(".json") |
| 222 | output_path.parent.mkdir(parents=True, exist_ok=True) |
| 223 | |
| 224 | data = self.to_data() |
| 225 | output_path.write_text(data.model_dump_json(indent=2)) |
| 226 | logger.info( |
| 227 | f"Saved knowledge graph with {self._store.get_entity_count()} nodes " |
| 228 | f"and {self._store.get_relationship_count()} relationships to {output_path}" |
| 229 | ) |
| 230 | return output_path |
| @@ -231,10 +292,12 @@ | |
| 231 | |
| 232 | @classmethod |
| 233 | def from_dict(cls, data: Dict, db_path: Optional[Path] = None) -> "KnowledgeGraph": |
| 234 | """Reconstruct a KnowledgeGraph from saved JSON dict.""" |
| 235 | kg = cls(db_path=db_path) |
| 236 | for node in data.get("nodes", []): |
| 237 | name = node.get("name", node.get("id", "")) |
| 238 | descs = node.get("descriptions", []) |
| 239 | if isinstance(descs, set): |
| 240 | descs = list(descs) |
| @@ -256,23 +319,93 @@ | |
| 256 | content_source=rel.get("content_source"), |
| 257 | timestamp=rel.get("timestamp"), |
| 258 | ) |
| 259 | return kg |
| 260 | |
| 261 | def merge(self, other: "KnowledgeGraph") -> None: |
| 262 | """Merge another KnowledgeGraph into this one.""" |
| 263 | for entity in other._store.get_all_entities(): |
| 264 | name = entity["name"] |
| 265 | descs = entity.get("descriptions", []) |
| 266 | if isinstance(descs, set): |
| 267 | descs = list(descs) |
| 268 | self._store.merge_entity( |
| 269 | name, entity.get("type", "concept"), descs, source=entity.get("source") |
| 270 | ) |
| 271 | for occ in entity.get("occurrences", []): |
| 272 | self._store.add_occurrence( |
| 273 | name, |
| 274 | occ.get("source", ""), |
| 275 | occ.get("timestamp"), |
| 276 | occ.get("text"), |
| 277 | ) |
| 278 | |
| @@ -283,10 +416,19 @@ | |
| 283 | rel.get("type", "related_to"), |
| 284 | content_source=rel.get("content_source"), |
| 285 | timestamp=rel.get("timestamp"), |
| 286 | ) |
| 287 | |
| 288 | def generate_mermaid(self, max_nodes: int = 30) -> str: |
| 289 | """Generate Mermaid visualization code.""" |
| 290 | nodes = self.nodes |
| 291 | rels = self.relationships |
| 292 | |
| 293 | |
| 294 | 436 | ADDED video_processor/integrators/taxonomy.py |
| --- video_processor/integrators/knowledge_graph.py | |
| +++ video_processor/integrators/knowledge_graph.py | |
| @@ -5,11 +5,11 @@ | |
| 5 | from typing import Dict, List, Optional, Union |
| 6 | |
| 7 | from tqdm import tqdm |
| 8 | |
| 9 | from video_processor.integrators.graph_store import GraphStore, create_store |
| 10 | from video_processor.models import Entity, KnowledgeGraphData, Relationship, SourceRecord |
| 11 | from video_processor.providers.manager import ProviderManager |
| 12 | from video_processor.utils.json_parsing import parse_json_from_response |
| 13 | |
| 14 | logger = logging.getLogger(__name__) |
| 15 | |
| @@ -23,10 +23,14 @@ | |
| 23 | db_path: Optional[Path] = None, |
| 24 | store: Optional[GraphStore] = None, |
| 25 | ): |
| 26 | self.pm = provider_manager |
| 27 | self._store = store or create_store(db_path) |
| 28 | |
| 29 | def register_source(self, source: Dict) -> None: |
| 30 | """Register a content source for provenance tracking.""" |
| 31 | self._store.register_source(source) |
| 32 | |
| 33 | @property |
| 34 | def nodes(self) -> Dict[str, dict]: |
| 35 | """Backward-compatible read access to nodes as a dict keyed by entity name.""" |
| 36 | result = {} |
| @@ -111,19 +115,32 @@ | |
| 115 | ) |
| 116 | ) |
| 117 | |
| 118 | return entities, rels |
| 119 | |
| 120 | def add_content( |
| 121 | self, |
| 122 | text: str, |
| 123 | source: str, |
| 124 | timestamp: Optional[float] = None, |
| 125 | source_id: Optional[str] = None, |
| 126 | ) -> None: |
| 127 | """Add content to knowledge graph by extracting entities and relationships.""" |
| 128 | entities, relationships = self.extract_entities_and_relationships(text) |
| 129 | |
| 130 | snippet = text[:100] + "..." if len(text) > 100 else text |
| 131 | |
| 132 | for entity in entities: |
| 133 | self._store.merge_entity(entity.name, entity.type, entity.descriptions, source=source) |
| 134 | self._store.add_occurrence(entity.name, source, timestamp, snippet) |
| 135 | if source_id: |
| 136 | self._store.add_source_location( |
| 137 | source_id, |
| 138 | entity_name_lower=entity.name.lower(), |
| 139 | timestamp=timestamp, |
| 140 | text_snippet=snippet, |
| 141 | ) |
| 142 | |
| 143 | for rel in relationships: |
| 144 | if self._store.has_entity(rel.source) and self._store.has_entity(rel.target): |
| 145 | self._store.add_relationship( |
| 146 | rel.source, |
| @@ -206,25 +223,69 @@ | |
| 223 | content_source=r.get("content_source"), |
| 224 | timestamp=r.get("timestamp"), |
| 225 | ) |
| 226 | for r in self._store.get_all_relationships() |
| 227 | ] |
| 228 | |
| 229 | sources = [SourceRecord(**s) for s in self._store.get_sources()] |
| 230 | |
| 231 | return KnowledgeGraphData(nodes=nodes, relationships=rels, sources=sources) |
| 232 | |
    def to_dict(self) -> Dict:
        """Convert knowledge graph to dictionary (backward-compatible).

        Pure delegation: the backing store owns the serialization format.
        """
        return self._store.to_dict()
| 236 | |
    def save(self, output_path: Union[str, Path]) -> Path:
        """Save knowledge graph. Defaults to .db (SQLite), also supports .json.

        Args:
            output_path: Target file. A path with no suffix gets ".db";
                parent directories are created as needed.

        Returns:
            The (possibly suffix-adjusted) path that was written.
        """
        output_path = Path(output_path)
        if not output_path.suffix:
            output_path = output_path.with_suffix(".db")
        output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.suffix == ".json":
            data = self.to_data()
            output_path.write_text(data.model_dump_json(indent=2))
        elif output_path.suffix == ".db":
            # If the backing store is already SQLite at this path, it's already persisted.
            # Otherwise, create a new SQLite store and copy data into it.
            from video_processor.integrators.graph_store import SQLiteStore

            # NOTE(review): reaches into SQLiteStore._db_path (private) and
            # compares raw strings — a relative vs. absolute spelling of the
            # same file would trigger a needless copy. Confirm _db_path is
            # always stored in a normalized form.
            if not isinstance(self._store, SQLiteStore) or self._store._db_path != str(output_path):
                target = SQLiteStore(output_path)
                # Copy sources first so entity/relationship provenance resolves.
                for source in self._store.get_sources():
                    target.register_source(source)
                for entity in self._store.get_all_entities():
                    descs = entity.get("descriptions", [])
                    # In-memory stores may expose descriptions as a set.
                    if isinstance(descs, set):
                        descs = list(descs)
                    target.merge_entity(
                        entity["name"],
                        entity.get("type", "concept"),
                        descs,
                        source=entity.get("source"),
                    )
                    for occ in entity.get("occurrences", []):
                        target.add_occurrence(
                            entity["name"],
                            occ.get("source", ""),
                            occ.get("timestamp"),
                            occ.get("text"),
                        )
                for rel in self._store.get_all_relationships():
                    target.add_relationship(
                        rel.get("source", ""),
                        rel.get("target", ""),
                        rel.get("type", "related_to"),
                        content_source=rel.get("content_source"),
                        timestamp=rel.get("timestamp"),
                    )
                target.close()
        else:
            # Unknown suffix — fall back to JSON
            data = self.to_data()
            output_path.write_text(data.model_dump_json(indent=2))

        logger.info(
            f"Saved knowledge graph with {self._store.get_entity_count()} nodes "
            f"and {self._store.get_relationship_count()} relationships to {output_path}"
        )
        return output_path
| @@ -231,10 +292,12 @@ | |
| 292 | |
| 293 | @classmethod |
| 294 | def from_dict(cls, data: Dict, db_path: Optional[Path] = None) -> "KnowledgeGraph": |
| 295 | """Reconstruct a KnowledgeGraph from saved JSON dict.""" |
| 296 | kg = cls(db_path=db_path) |
| 297 | for source in data.get("sources", []): |
| 298 | kg._store.register_source(source) |
| 299 | for node in data.get("nodes", []): |
| 300 | name = node.get("name", node.get("id", "")) |
| 301 | descs = node.get("descriptions", []) |
| 302 | if isinstance(descs, set): |
| 303 | descs = list(descs) |
| @@ -256,23 +319,93 @@ | |
| 319 | content_source=rel.get("content_source"), |
| 320 | timestamp=rel.get("timestamp"), |
| 321 | ) |
| 322 | return kg |
| 323 | |
    # Type specificity ranking for conflict resolution during merge.
    # Higher rank = more specific type wins when two entities match.
    # Types not listed here default to rank 1 (see _more_specific_type),
    # i.e. more specific than "concept" but less than "person"/"technology".
    _TYPE_SPECIFICITY = {
        "concept": 0,
        "time": 1,
        "diagram": 1,
        "organization": 2,
        "person": 3,
        "technology": 3,
    }
| 334 | |
| 335 | @staticmethod |
| 336 | def _fuzzy_match(name_a: str, name_b: str, threshold: float = 0.85) -> bool: |
| 337 | """Return True if two names are similar enough to be considered the same entity.""" |
| 338 | from difflib import SequenceMatcher |
| 339 | |
| 340 | return SequenceMatcher(None, name_a.lower(), name_b.lower()).ratio() >= threshold |
| 341 | |
| 342 | def _more_specific_type(self, type_a: str, type_b: str) -> str: |
| 343 | """Return the more specific of two entity types.""" |
| 344 | rank_a = self._TYPE_SPECIFICITY.get(type_a, 1) |
| 345 | rank_b = self._TYPE_SPECIFICITY.get(type_b, 1) |
| 346 | return type_a if rank_a >= rank_b else type_b |
| 347 | |
| 348 | def merge(self, other: "KnowledgeGraph") -> None: |
| 349 | """Merge another KnowledgeGraph into this one. |
| 350 | |
| 351 | Improvements over naive merge: |
| 352 | - Fuzzy name matching (SequenceMatcher >= 0.85) to unify near-duplicate entities |
| 353 | - Type conflict resolution: prefer more specific types (e.g. technology > concept) |
| 354 | - Provenance: merged entities get a ``merged_from`` description entry |
| 355 | """ |
| 356 | for source in other._store.get_sources(): |
| 357 | self._store.register_source(source) |
| 358 | |
| 359 | # Build a lookup of existing entity names for fuzzy matching |
| 360 | existing_entities = self._store.get_all_entities() |
| 361 | existing_names = {e["name"]: e for e in existing_entities} |
| 362 | # Cache lowercase -> canonical name for fast lookup |
| 363 | name_index: dict[str, str] = {n.lower(): n for n in existing_names} |
| 364 | |
| 365 | for entity in other._store.get_all_entities(): |
| 366 | incoming_name = entity["name"] |
| 367 | descs = entity.get("descriptions", []) |
| 368 | if isinstance(descs, set): |
| 369 | descs = list(descs) |
| 370 | incoming_type = entity.get("type", "concept") |
| 371 | |
| 372 | # Try exact match first (case-insensitive), then fuzzy |
| 373 | matched_name: Optional[str] = None |
| 374 | if incoming_name.lower() in name_index: |
| 375 | matched_name = name_index[incoming_name.lower()] |
| 376 | else: |
| 377 | for existing_name in existing_names: |
| 378 | if self._fuzzy_match(incoming_name, existing_name): |
| 379 | matched_name = existing_name |
| 380 | break |
| 381 | |
| 382 | if matched_name is not None: |
| 383 | # Resolve type conflict |
| 384 | existing_type = existing_names[matched_name].get("type", "concept") |
| 385 | resolved_type = self._more_specific_type(existing_type, incoming_type) |
| 386 | |
| 387 | # Add merge provenance |
| 388 | merge_note = f"merged_from:{incoming_name}" |
| 389 | merged_descs = descs if incoming_name == matched_name else descs + [merge_note] |
| 390 | |
| 391 | self._store.merge_entity( |
| 392 | matched_name, resolved_type, merged_descs, source=entity.get("source") |
| 393 | ) |
| 394 | target_name = matched_name |
| 395 | else: |
| 396 | self._store.merge_entity( |
| 397 | incoming_name, incoming_type, descs, source=entity.get("source") |
| 398 | ) |
| 399 | # Update indexes for subsequent fuzzy matches within this merge |
| 400 | existing_names[incoming_name] = entity |
| 401 | name_index[incoming_name.lower()] = incoming_name |
| 402 | target_name = incoming_name |
| 403 | |
| 404 | for occ in entity.get("occurrences", []): |
| 405 | self._store.add_occurrence( |
| 406 | target_name, |
| 407 | occ.get("source", ""), |
| 408 | occ.get("timestamp"), |
| 409 | occ.get("text"), |
| 410 | ) |
| 411 | |
| @@ -283,10 +416,19 @@ | |
| 416 | rel.get("type", "related_to"), |
| 417 | content_source=rel.get("content_source"), |
| 418 | timestamp=rel.get("timestamp"), |
| 419 | ) |
| 420 | |
| 421 | def classify_for_planning(self): |
| 422 | """Classify entities in this knowledge graph into planning taxonomy types.""" |
| 423 | from video_processor.integrators.taxonomy import TaxonomyClassifier |
| 424 | |
| 425 | classifier = TaxonomyClassifier(provider_manager=self.pm) |
| 426 | entities = self._store.get_all_entities() |
| 427 | relationships = self._store.get_all_relationships() |
| 428 | return classifier.classify_entities(entities, relationships) |
| 429 | |
| 430 | def generate_mermaid(self, max_nodes: int = 30) -> str: |
| 431 | """Generate Mermaid visualization code.""" |
| 432 | nodes = self.nodes |
| 433 | rels = self.relationships |
| 434 | |
| 435 | |
| 436 | ADDED video_processor/integrators/taxonomy.py |
| --- a/video_processor/integrators/taxonomy.py | ||
| +++ b/video_processor/integrators/taxonomy.py | ||
| @@ -0,0 +1,193 @@ | ||
| 1 | +"""Taxonomy classifier for planning entity extraction. | |
| 2 | + | |
| 3 | +Bridges raw knowledge graph entities (person, technology, concept) into | |
| 4 | +planning-ready structures (goals, requirements, decisions, risks). | |
| 5 | +""" | |
| 6 | + | |
| 7 | +import logging | |
| 8 | +from typing import Any, Dict, List, Optional | |
| 9 | + | |
| 10 | +from video_processor.models import PlanningEntity, PlanningEntityType | |
| 11 | + | |
| 12 | +logger = logging.getLogger(__name__) | |
| 13 | + | |
| 14 | +# Keyword rules for heuristic classification. Each tuple is | |
| 15 | +# (PlanningEntityType, list-of-keywords). Order matters — first match wins. | |
| 16 | +_KEYWORD_RULES: List[tuple] = [ | |
| 17 | + (PlanningEntityType.GOAL, ["goal", "objective", "aim", "target outcome"]), | |
| 18 | + ( | |
| 19 | + PlanningEntityType.REQUIREMENT, | |
| 20 | + ["must", "should", "requirement", "need", "required"], | |
| 21 | + ), | |
| 22 | + ( | |
| 23 | + PlanningEntityType.CONSTRAINT, | |
| 24 | + ["constraint", "limitation", "restrict", "cannot", "must not"], | |
| 25 | + ), | |
| 26 | + ( | |
| 27 | + PlanningEntityType.DECISION, | |
| 28 | + ["decided", "decision", "chose", "selected", "agreed"], | |
| 29 | + ), | |
| 30 | + (PlanningEntityType.RISK, ["risk", "concern", "worry", "danger", "threat"]), | |
| 31 | + ( | |
| 32 | + PlanningEntityType.ASSUMPTION, | |
| 33 | + ["assume", "assumption", "expecting", "presume"], | |
| 34 | + ), | |
| 35 | + ( | |
| 36 | + PlanningEntityType.DEPENDENCY, | |
| 37 | + ["depends", "dependency", "relies on", "prerequisite", "blocked"], | |
| 38 | + ), | |
| 39 | + ( | |
| 40 | + PlanningEntityType.MILESTONE, | |
| 41 | + ["milestone", "deadline", "deliverable", "release", "launch"], | |
| 42 | + ), | |
| 43 | + ( | |
| 44 | + PlanningEntityType.TASK, | |
| 45 | + ["task", "todo", "action item", "work item", "implement"], | |
| 46 | + ), | |
| 47 | + (PlanningEntityType.FEATURE, ["feature", "capability", "functionality"]), | |
| 48 | +] | |
| 49 | + | |
| 50 | + | |
| 51 | +class TaxonomyClassifier: | |
| 52 | + """Classifies raw knowledge graph entities into planning taxonomy types.""" | |
| 53 | + | |
| 54 | + def __init__(self, provider_manager: Optional[Any] = None): | |
| 55 | + self.pm = provider_manager | |
| 56 | + | |
| 57 | + # ------------------------------------------------------------------ | |
| 58 | + # Public API | |
| 59 | + # ------------------------------------------------------------------ | |
| 60 | + | |
| 61 | + def classify_entities( | |
| 62 | + self, | |
| 63 | + entities: List[Dict], | |
| 64 | + relationships: List[Dict], | |
| 65 | + ) -> List[PlanningEntity]: | |
| 66 | + """Classify extracted entities into planning entity types. | |
| 67 | + | |
| 68 | + Uses heuristic classification first, then LLM refinement if a | |
| 69 | + provider manager is available. | |
| 70 | + """ | |
| 71 | + planning_entities: List[PlanningEntity] = [] | |
| 72 | + | |
| 73 | + # Step 1: heuristic classification | |
| 74 | + for entity in entities: | |
| 75 | + planning_type = self._heuristic_classify(entity, relationships) | |
| 76 | + if planning_type: | |
| 77 | + descs = entity.get("descriptions", []) | |
| 78 | + planning_entities.append( | |
| 79 | + PlanningEntity( | |
| 80 | + name=entity["name"], | |
| 81 | + planning_type=planning_type, | |
| 82 | + description="; ".join(descs[:2]), | |
| 83 | + source_entities=[entity["name"]], | |
| 84 | + ) | |
| 85 | + ) | |
| 86 | + | |
| 87 | + # Step 2: LLM refinement (if provider available) | |
| 88 | + if self.pm and entities: | |
| 89 | + llm_classified = self._llm_classify(entities, relationships) | |
| 90 | + planning_entities = self._merge_classifications(planning_entities, llm_classified) | |
| 91 | + | |
| 92 | + return planning_entities | |
| 93 | + | |
| 94 | + def organize_by_workstream( | |
| 95 | + self, planning_entities: List[PlanningEntity] | |
| 96 | + ) -> Dict[str, List[PlanningEntity]]: | |
| 97 | + """Group planning entities into logical workstreams by type.""" | |
| 98 | + workstreams: Dict[str, List[PlanningEntity]] = {} | |
| 99 | + for pe in planning_entities: | |
| 100 | + group = pe.planning_type.value + "s" | |
| 101 | + workstreams.setdefault(group, []).append(pe) | |
| 102 | + return workstreams | |
| 103 | + | |
| 104 | + # ------------------------------------------------------------------ | |
| 105 | + # Heuristic classification | |
| 106 | + # ------------------------------------------------------------------ | |
| 107 | + | |
| 108 | + def _heuristic_classify( | |
| 109 | + self, | |
| 110 | + entity: Dict, | |
| 111 | + relationships: List[Dict], # noqa: ARG002 — reserved for future rules | |
| 112 | + ) -> Optional[PlanningEntityType]: | |
| 113 | + """Rule-based classification from entity type and description keywords.""" | |
| 114 | + desc_lower = " ".join(entity.get("descriptions", [])).lower() | |
| 115 | + | |
| 116 | + for planning_type, keywords in _KEYWORD_RULES: | |
| 117 | + if any(kw in desc_lower for kw in keywords): | |
| 118 | + return planning_type | |
| 119 | + | |
| 120 | + return None | |
| 121 | + | |
| 122 | + # ------------------------------------------------------------------ | |
| 123 | + # LLM classification | |
| 124 | + # ------------------------------------------------------------------ | |
| 125 | + | |
| 126 | + def _llm_classify( | |
| 127 | + self, entities: List[Dict], relationships: List[Dict] | |
| 128 | + ) -> List[PlanningEntity]: | |
| 129 | + """Use LLM to classify entities into planning types.""" | |
| 130 | + entity_summaries = [] | |
| 131 | + for e in entities[:50]: # limit to avoid token overflow | |
| 132 | + descs = e.get("descriptions", []) | |
| 133 | + desc_str = "; ".join(descs[:2]) if descs else "no description" | |
| 134 | + entity_summaries.append(f"- {e['name']} ({e.get('type', 'concept')}): {desc_str}") | |
| 135 | + | |
| 136 | + prompt = ( | |
| 137 | + "Classify these entities from a knowledge graph into planning categories.\n\n" | |
| 138 | + "Entities:\n" + "\n".join(entity_summaries) + "\n\n" | |
| 139 | + "Categories: goal, requirement, constraint, decision, risk, assumption, " | |
| 140 | + "dependency, milestone, task, feature\n\n" | |
| 141 | + "For each entity that fits a planning category, return JSON:\n" | |
| 142 | + '[{"name": "...", "planning_type": "...", "priority": "high|medium|low"}]\n\n' | |
| 143 | + "Only include entities that clearly fit a planning category. " | |
| 144 | + "Skip entities that are just people, technologies, or general concepts. " | |
| 145 | + "Return ONLY the JSON array." | |
| 146 | + ) | |
| 147 | + | |
| 148 | + try: | |
| 149 | + raw = self.pm.chat( | |
| 150 | + [{"role": "user", "content": prompt}], | |
| 151 | + max_tokens=2048, | |
| 152 | + temperature=0.2, | |
| 153 | + ) | |
| 154 | + except Exception: | |
| 155 | + logger.warning("LLM classification failed, using heuristic only") | |
| 156 | + return [] | |
| 157 | + | |
| 158 | + from video_processor.utils.json_parsing import parse_json_from_response | |
| 159 | + | |
| 160 | + parsed = parse_json_from_response(raw) | |
| 161 | + | |
| 162 | + results: List[PlanningEntity] = [] | |
| 163 | + if isinstance(parsed, list): | |
| 164 | + for item in parsed: | |
| 165 | + if isinstance(item, dict) and "name" in item and "planning_type" in item: | |
| 166 | + try: | |
| 167 | + ptype = PlanningEntityType(item["planning_type"]) | |
| 168 | + results.append( | |
| 169 | + PlanningEntity( | |
| 170 | + name=item["name"], | |
| 171 | + planning_type=ptype, | |
| 172 | + priority=item.get("priority"), | |
| 173 | + source_entities=[item["name"]], | |
| 174 | + ) | |
| 175 | + ) | |
| 176 | + except ValueError: | |
| 177 | + pass | |
| 178 | + return results | |
| 179 | + | |
| 180 | + # ------------------------------------------------------------------ | |
| 181 | + # Merge | |
| 182 | + # ------------------------------------------------------------------ | |
| 183 | + | |
| 184 | + @staticmethod | |
| 185 | + def _merge_classifications( | |
| 186 | + heuristic: List[PlanningEntity], | |
| 187 | + llm: List[PlanningEntity], | |
| 188 | + ) -> List[PlanningEntity]: | |
| 189 | + """Merge heuristic and LLM classifications. LLM wins on conflicts.""" | |
| 190 | + by_name = {pe.name.lower(): pe for pe in heuristic} | |
| 191 | + for pe in llm: | |
| 192 | + by_name[pe.name.lower()] = pe # LLM overrides | |
| 193 | + return list(by_name.values()) |
| --- a/video_processor/integrators/taxonomy.py | |
| +++ b/video_processor/integrators/taxonomy.py | |
| @@ -0,0 +1,193 @@ | |
| --- a/video_processor/integrators/taxonomy.py | |
| +++ b/video_processor/integrators/taxonomy.py | |
| @@ -0,0 +1,193 @@ | |
| 1 | """Taxonomy classifier for planning entity extraction. |
| 2 | |
| 3 | Bridges raw knowledge graph entities (person, technology, concept) into |
| 4 | planning-ready structures (goals, requirements, decisions, risks). |
| 5 | """ |
| 6 | |
| 7 | import logging |
| 8 | from typing import Any, Dict, List, Optional |
| 9 | |
| 10 | from video_processor.models import PlanningEntity, PlanningEntityType |
| 11 | |
| 12 | logger = logging.getLogger(__name__) |
| 13 | |
# Keyword rules for heuristic classification. Each tuple is
# (PlanningEntityType, list-of-keywords). Order matters — first match wins.
# NOTE(review): matching is plain substring search over the joined
# descriptions (see TaxonomyClassifier._heuristic_classify), so short
# keywords like "must"/"need"/"task" can fire inside longer words.
_KEYWORD_RULES: List[tuple] = [
    (PlanningEntityType.GOAL, ["goal", "objective", "aim", "target outcome"]),
    (
        PlanningEntityType.REQUIREMENT,
        ["must", "should", "requirement", "need", "required"],
    ),
    (
        PlanningEntityType.CONSTRAINT,
        ["constraint", "limitation", "restrict", "cannot", "must not"],
    ),
    (
        PlanningEntityType.DECISION,
        ["decided", "decision", "chose", "selected", "agreed"],
    ),
    (PlanningEntityType.RISK, ["risk", "concern", "worry", "danger", "threat"]),
    (
        PlanningEntityType.ASSUMPTION,
        ["assume", "assumption", "expecting", "presume"],
    ),
    (
        PlanningEntityType.DEPENDENCY,
        ["depends", "dependency", "relies on", "prerequisite", "blocked"],
    ),
    (
        PlanningEntityType.MILESTONE,
        ["milestone", "deadline", "deliverable", "release", "launch"],
    ),
    (
        PlanningEntityType.TASK,
        ["task", "todo", "action item", "work item", "implement"],
    ),
    (PlanningEntityType.FEATURE, ["feature", "capability", "functionality"]),
]
| 49 | |
| 50 | |
class TaxonomyClassifier:
    """Classifies raw knowledge graph entities into planning taxonomy types.

    Two-stage pipeline: keyword heuristics over entity descriptions first,
    then optional LLM refinement when a provider manager is supplied. On
    name collisions the LLM result overrides the heuristic one.
    """

    def __init__(self, provider_manager: Optional[Any] = None):
        # Provider manager exposing .chat(messages, ...); optional —
        # without it classification is heuristic-only.
        self.pm = provider_manager

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def classify_entities(
        self,
        entities: List[Dict],
        relationships: List[Dict],
    ) -> List[PlanningEntity]:
        """Classify extracted entities into planning entity types.

        Uses heuristic classification first, then LLM refinement if a
        provider manager is available.

        Args:
            entities: Raw graph entities (dicts with "name", "type",
                "descriptions", ...).
            relationships: Raw graph relationships; unused by the current
                heuristics, forwarded for future rules and the LLM step.

        Returns:
            One PlanningEntity per input entity that matched a planning
            category (entities with no match are dropped).
        """
        planning_entities: List[PlanningEntity] = []

        # Step 1: heuristic classification
        for entity in entities:
            planning_type = self._heuristic_classify(entity, relationships)
            if planning_type:
                descs = entity.get("descriptions", [])
                # Stores may hand descriptions back as a set (the rest of
                # the codebase normalizes the same way); a set cannot be
                # sliced, so convert before descs[:2] below.
                if isinstance(descs, set):
                    descs = list(descs)
                planning_entities.append(
                    PlanningEntity(
                        name=entity["name"],
                        planning_type=planning_type,
                        description="; ".join(descs[:2]),
                        source_entities=[entity["name"]],
                    )
                )

        # Step 2: LLM refinement (if provider available)
        if self.pm and entities:
            llm_classified = self._llm_classify(entities, relationships)
            planning_entities = self._merge_classifications(planning_entities, llm_classified)

        return planning_entities

    def organize_by_workstream(
        self, planning_entities: List[PlanningEntity]
    ) -> Dict[str, List[PlanningEntity]]:
        """Group planning entities into logical workstreams by type.

        Workstream keys are the naive plural of the type value
        ("goals", "risks", ... — note "dependencys" for dependency).
        """
        workstreams: Dict[str, List[PlanningEntity]] = {}
        for pe in planning_entities:
            group = pe.planning_type.value + "s"
            workstreams.setdefault(group, []).append(pe)
        return workstreams

    # ------------------------------------------------------------------
    # Heuristic classification
    # ------------------------------------------------------------------

    def _heuristic_classify(
        self,
        entity: Dict,
        relationships: List[Dict],  # noqa: ARG002 — reserved for future rules
    ) -> Optional[PlanningEntityType]:
        """Rule-based classification from entity description keywords.

        Joins all descriptions, lowercases, and scans for the keyword
        lists in _KEYWORD_RULES; the first matching rule wins. Returns
        None when nothing matches (entity is not planning-relevant).
        """
        desc_lower = " ".join(entity.get("descriptions", [])).lower()

        for planning_type, keywords in _KEYWORD_RULES:
            if any(kw in desc_lower for kw in keywords):
                return planning_type

        return None

    # ------------------------------------------------------------------
    # LLM classification
    # ------------------------------------------------------------------

    def _llm_classify(
        self, entities: List[Dict], relationships: List[Dict]
    ) -> List[PlanningEntity]:
        """Use LLM to classify entities into planning types.

        Builds a compact summary prompt (first 50 entities, first two
        descriptions each), asks the provider for a JSON array, and
        converts valid items into PlanningEntity records. Returns [] on
        any provider failure so the caller keeps heuristic-only results.
        """
        entity_summaries = []
        for e in entities[:50]:  # limit to avoid token overflow
            descs = e.get("descriptions", [])
            # Same set-normalization as classify_entities: sets cannot be
            # sliced with descs[:2].
            if isinstance(descs, set):
                descs = list(descs)
            desc_str = "; ".join(descs[:2]) if descs else "no description"
            entity_summaries.append(f"- {e['name']} ({e.get('type', 'concept')}): {desc_str}")

        prompt = (
            "Classify these entities from a knowledge graph into planning categories.\n\n"
            "Entities:\n" + "\n".join(entity_summaries) + "\n\n"
            "Categories: goal, requirement, constraint, decision, risk, assumption, "
            "dependency, milestone, task, feature\n\n"
            "For each entity that fits a planning category, return JSON:\n"
            '[{"name": "...", "planning_type": "...", "priority": "high|medium|low"}]\n\n'
            "Only include entities that clearly fit a planning category. "
            "Skip entities that are just people, technologies, or general concepts. "
            "Return ONLY the JSON array."
        )

        try:
            raw = self.pm.chat(
                [{"role": "user", "content": prompt}],
                max_tokens=2048,
                temperature=0.2,
            )
        except Exception:
            # Best-effort: LLM refinement is optional, never fatal.
            logger.warning("LLM classification failed, using heuristic only")
            return []

        from video_processor.utils.json_parsing import parse_json_from_response

        parsed = parse_json_from_response(raw)

        results: List[PlanningEntity] = []
        if isinstance(parsed, list):
            for item in parsed:
                if isinstance(item, dict) and "name" in item and "planning_type" in item:
                    try:
                        ptype = PlanningEntityType(item["planning_type"])
                        results.append(
                            PlanningEntity(
                                name=item["name"],
                                planning_type=ptype,
                                priority=item.get("priority"),
                                source_entities=[item["name"]],
                            )
                        )
                    except ValueError:
                        # Model returned a category outside the enum — skip it.
                        pass
        return results

    # ------------------------------------------------------------------
    # Merge
    # ------------------------------------------------------------------

    @staticmethod
    def _merge_classifications(
        heuristic: List[PlanningEntity],
        llm: List[PlanningEntity],
    ) -> List[PlanningEntity]:
        """Merge heuristic and LLM classifications. LLM wins on conflicts.

        Keyed by lowercased entity name; order follows the heuristic list
        with LLM-only entities appended.
        """
        by_name = {pe.name.lower(): pe for pe in heuristic}
        for pe in llm:
            by_name[pe.name.lower()] = pe  # LLM overrides
        return list(by_name.values())
+70
-1
| --- video_processor/models.py | ||
| +++ video_processor/models.py | ||
| @@ -1,13 +1,22 @@ | ||
| 1 | 1 | """Pydantic data models for PlanOpticon output.""" |
| 2 | 2 | |
| 3 | 3 | from datetime import datetime |
| 4 | 4 | from enum import Enum |
| 5 | -from typing import Any, Dict, List, Optional | |
| 5 | +from typing import Any, Dict, List, Optional, Protocol, runtime_checkable | |
| 6 | 6 | |
| 7 | 7 | from pydantic import BaseModel, Field |
| 8 | 8 | |
| 9 | + | |
| 10 | +@runtime_checkable | |
| 11 | +class ProgressCallback(Protocol): | |
| 12 | + """Optional callback for pipeline progress updates.""" | |
| 13 | + | |
| 14 | + def on_step_start(self, step: str, index: int, total: int) -> None: ... | |
| 15 | + def on_step_complete(self, step: str, index: int, total: int) -> None: ... | |
| 16 | + def on_progress(self, step: str, percent: float, message: str = "") -> None: ... | |
| 17 | + | |
| 9 | 18 | |
| 10 | 19 | class DiagramType(str, Enum): |
| 11 | 20 | """Types of visual content detected in video frames.""" |
| 12 | 21 | |
| 13 | 22 | flowchart = "flowchart" |
| @@ -98,10 +107,26 @@ | ||
| 98 | 107 | image_path: Optional[str] = Field(default=None, description="Relative path to screenshot") |
| 99 | 108 | confidence: float = Field( |
| 100 | 109 | default=0.0, description="Detection confidence that triggered fallback" |
| 101 | 110 | ) |
| 102 | 111 | |
| 112 | + | |
| 113 | +class SourceRecord(BaseModel): | |
| 114 | + """A content source registered in the knowledge graph for provenance tracking.""" | |
| 115 | + | |
| 116 | + source_id: str = Field(description="Unique identifier for this source") | |
| 117 | + source_type: str = Field(description="Source type: video, document, url, api, manual") | |
| 118 | + title: str = Field(description="Human-readable title") | |
| 119 | + path: Optional[str] = Field(default=None, description="Local file path") | |
| 120 | + url: Optional[str] = Field(default=None, description="URL if applicable") | |
| 121 | + mime_type: Optional[str] = Field(default=None, description="MIME type of the source") | |
| 122 | + ingested_at: str = Field( | |
| 123 | + default_factory=lambda: datetime.now().isoformat(), | |
| 124 | + description="ISO format ingestion timestamp", | |
| 125 | + ) | |
| 126 | + metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional source metadata") | |
| 127 | + | |
| 103 | 128 | |
| 104 | 129 | class Entity(BaseModel): |
| 105 | 130 | """An entity in the knowledge graph.""" |
| 106 | 131 | |
| 107 | 132 | name: str = Field(description="Entity name") |
| @@ -130,10 +155,54 @@ | ||
| 130 | 155 | |
| 131 | 156 | nodes: List[Entity] = Field(default_factory=list, description="Graph nodes/entities") |
| 132 | 157 | relationships: List[Relationship] = Field( |
| 133 | 158 | default_factory=list, description="Graph relationships" |
| 134 | 159 | ) |
| 160 | + sources: List[SourceRecord] = Field( | |
| 161 | + default_factory=list, description="Content sources for provenance tracking" | |
| 162 | + ) | |
| 163 | + | |
| 164 | + | |
class PlanningEntityType(str, Enum):
    """Types of entities in a planning taxonomy.

    str-valued so members serialize and compare as plain strings
    (e.g. in JSON output and when parsing LLM responses).
    """

    GOAL = "goal"
    REQUIREMENT = "requirement"
    CONSTRAINT = "constraint"
    DECISION = "decision"
    RISK = "risk"
    ASSUMPTION = "assumption"
    DEPENDENCY = "dependency"
    MILESTONE = "milestone"
    TASK = "task"
    FEATURE = "feature"
| 178 | + | |
| 179 | + | |
class PlanningEntity(BaseModel):
    """An entity classified for planning purposes."""

    name: str
    planning_type: PlanningEntityType
    description: str = ""
    priority: Optional[str] = None  # "high", "medium", "low"
    status: Optional[str] = None  # "identified", "confirmed", "resolved"
    # Names of the raw knowledge-graph entities this was derived from.
    source_entities: List[str] = Field(default_factory=list)
    # Free-form extra data; no fixed schema visible here.
    metadata: Dict[str, Any] = Field(default_factory=dict)
| 191 | + | |
class PlanningRelationshipType(str, Enum):
    """Relationship types within a planning taxonomy.

    str-valued for direct JSON serialization.
    """

    REQUIRES = "requires"
    BLOCKED_BY = "blocked_by"
    HAS_RISK = "has_risk"
    DEPENDS_ON = "depends_on"
    ADDRESSES = "addresses"
    HAS_TRADEOFF = "has_tradeoff"
    DELIVERS = "delivers"
    IMPLEMENTS = "implements"
    PARENT_OF = "parent_of"
| 135 | 204 | |
| 136 | 205 | |
| 137 | 206 | class ProcessingStats(BaseModel): |
| 138 | 207 | """Statistics about a processing run.""" |
| 139 | 208 | |
| 140 | 209 |
| --- video_processor/models.py | |
| +++ video_processor/models.py | |
| @@ -1,13 +1,22 @@ | |
| 1 | """Pydantic data models for PlanOpticon output.""" |
| 2 | |
| 3 | from datetime import datetime |
| 4 | from enum import Enum |
| 5 | from typing import Any, Dict, List, Optional |
| 6 | |
| 7 | from pydantic import BaseModel, Field |
| 8 | |
| 9 | |
| 10 | class DiagramType(str, Enum): |
| 11 | """Types of visual content detected in video frames.""" |
| 12 | |
| 13 | flowchart = "flowchart" |
| @@ -98,10 +107,26 @@ | |
| 98 | image_path: Optional[str] = Field(default=None, description="Relative path to screenshot") |
| 99 | confidence: float = Field( |
| 100 | default=0.0, description="Detection confidence that triggered fallback" |
| 101 | ) |
| 102 | |
| 103 | |
| 104 | class Entity(BaseModel): |
| 105 | """An entity in the knowledge graph.""" |
| 106 | |
| 107 | name: str = Field(description="Entity name") |
| @@ -130,10 +155,54 @@ | |
| 130 | |
| 131 | nodes: List[Entity] = Field(default_factory=list, description="Graph nodes/entities") |
| 132 | relationships: List[Relationship] = Field( |
| 133 | default_factory=list, description="Graph relationships" |
| 134 | ) |
| 135 | |
| 136 | |
| 137 | class ProcessingStats(BaseModel): |
| 138 | """Statistics about a processing run.""" |
| 139 | |
| 140 |
| --- video_processor/models.py | |
| +++ video_processor/models.py | |
| @@ -1,13 +1,22 @@ | |
| 1 | """Pydantic data models for PlanOpticon output.""" |
| 2 | |
| 3 | from datetime import datetime |
| 4 | from enum import Enum |
| 5 | from typing import Any, Dict, List, Optional, Protocol, runtime_checkable |
| 6 | |
| 7 | from pydantic import BaseModel, Field |
| 8 | |
| 9 | |
@runtime_checkable
class ProgressCallback(Protocol):
    """Optional callback for pipeline progress updates.

    Structural (duck-typed) interface: any object implementing these three
    methods satisfies it. @runtime_checkable enables isinstance() checks,
    which verify method presence only, not signatures.
    """

    # Invoked when pipeline step `index` of `total` begins.
    def on_step_start(self, step: str, index: int, total: int) -> None: ...
    # Invoked when that step finishes.
    def on_step_complete(self, step: str, index: int, total: int) -> None: ...
    # Fine-grained progress within a step; `percent` presumably 0-100 —
    # TODO confirm against the emitting pipeline code.
    def on_progress(self, step: str, percent: float, message: str = "") -> None: ...
| 18 | |
| 19 | class DiagramType(str, Enum): |
| 20 | """Types of visual content detected in video frames.""" |
| 21 | |
| 22 | flowchart = "flowchart" |
| @@ -98,10 +107,26 @@ | |
| 107 | image_path: Optional[str] = Field(default=None, description="Relative path to screenshot") |
| 108 | confidence: float = Field( |
| 109 | default=0.0, description="Detection confidence that triggered fallback" |
| 110 | ) |
| 111 | |
| 112 | |
| 113 | class SourceRecord(BaseModel): |
| 114 | """A content source registered in the knowledge graph for provenance tracking.""" |
| 115 | |
| 116 | source_id: str = Field(description="Unique identifier for this source") |
| 117 | source_type: str = Field(description="Source type: video, document, url, api, manual") |
| 118 | title: str = Field(description="Human-readable title") |
| 119 | path: Optional[str] = Field(default=None, description="Local file path") |
| 120 | url: Optional[str] = Field(default=None, description="URL if applicable") |
| 121 | mime_type: Optional[str] = Field(default=None, description="MIME type of the source") |
| 122 | ingested_at: str = Field( |
| 123 | default_factory=lambda: datetime.now().isoformat(), |
| 124 | description="ISO format ingestion timestamp", |
| 125 | ) |
| 126 | metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional source metadata") |
| 127 | |
| 128 | |
| 129 | class Entity(BaseModel): |
| 130 | """An entity in the knowledge graph.""" |
| 131 | |
| 132 | name: str = Field(description="Entity name") |
| @@ -130,10 +155,54 @@ | |
| 155 | |
| 156 | nodes: List[Entity] = Field(default_factory=list, description="Graph nodes/entities") |
| 157 | relationships: List[Relationship] = Field( |
| 158 | default_factory=list, description="Graph relationships" |
| 159 | ) |
| 160 | sources: List[SourceRecord] = Field( |
| 161 | default_factory=list, description="Content sources for provenance tracking" |
| 162 | ) |
| 163 | |
| 164 | |
| 165 | class PlanningEntityType(str, Enum): |
| 166 | """Types of entities in a planning taxonomy.""" |
| 167 | |
| 168 | GOAL = "goal" |
| 169 | REQUIREMENT = "requirement" |
| 170 | CONSTRAINT = "constraint" |
| 171 | DECISION = "decision" |
| 172 | RISK = "risk" |
| 173 | ASSUMPTION = "assumption" |
| 174 | DEPENDENCY = "dependency" |
| 175 | MILESTONE = "milestone" |
| 176 | TASK = "task" |
| 177 | FEATURE = "feature" |
| 178 | |
| 179 | |
| 180 | class PlanningEntity(BaseModel): |
| 181 | """An entity classified for planning purposes.""" |
| 182 | |
| 183 | name: str |
| 184 | planning_type: PlanningEntityType |
| 185 | description: str = "" |
| 186 | priority: Optional[str] = None # "high", "medium", "low" |
| 187 | status: Optional[str] = None # "identified", "confirmed", "resolved" |
| 188 | source_entities: List[str] = Field(default_factory=list) |
| 189 | metadata: Dict[str, Any] = Field(default_factory=dict) |
| 190 | |
| 191 | |
| 192 | class PlanningRelationshipType(str, Enum): |
| 193 | """Relationship types within a planning taxonomy.""" |
| 194 | |
| 195 | REQUIRES = "requires" |
| 196 | BLOCKED_BY = "blocked_by" |
| 197 | HAS_RISK = "has_risk" |
| 198 | DEPENDS_ON = "depends_on" |
| 199 | ADDRESSES = "addresses" |
| 200 | HAS_TRADEOFF = "has_tradeoff" |
| 201 | DELIVERS = "delivers" |
| 202 | IMPLEMENTS = "implements" |
| 203 | PARENT_OF = "parent_of" |
| 204 | |
| 205 | |
| 206 | class ProcessingStats(BaseModel): |
| 207 | """Statistics about a processing run.""" |
| 208 | |
| 209 |
| --- video_processor/output_structure.py | ||
| +++ video_processor/output_structure.py | ||
| @@ -24,12 +24,12 @@ | ||
| 24 | 24 | diagram_0.json, .jpg, .mermaid, .svg, .png |
| 25 | 25 | captures/ |
| 26 | 26 | capture_0.jpg, capture_0.json |
| 27 | 27 | results/ |
| 28 | 28 | analysis.md, .html, .pdf |
| 29 | - knowledge_graph.json | |
| 30 | - knowledge_graph.db (when falkordblite installed) | |
| 29 | + knowledge_graph.db (primary, SQLite) | |
| 30 | + knowledge_graph.json (export copy) | |
| 31 | 31 | key_points.json |
| 32 | 32 | action_items.json |
| 33 | 33 | cache/ |
| 34 | 34 | |
| 35 | 35 | Returns dict mapping directory names to Path objects. |
| @@ -56,12 +56,12 @@ | ||
| 56 | 56 | |
| 57 | 57 | Layout: |
| 58 | 58 | output_dir/ |
| 59 | 59 | manifest.json |
| 60 | 60 | batch_summary.md |
| 61 | - knowledge_graph.json | |
| 62 | - knowledge_graph.db (when falkordblite installed) | |
| 61 | + knowledge_graph.db (primary, SQLite) | |
| 62 | + knowledge_graph.json (export copy) (when falkordblite installed) | |
| 63 | 63 | videos/ |
| 64 | 64 | video_1/manifest.json |
| 65 | 65 | video_2/manifest.json |
| 66 | 66 | ... |
| 67 | 67 | |
| 68 | 68 |
| --- video_processor/output_structure.py | |
| +++ video_processor/output_structure.py | |
| @@ -24,12 +24,12 @@ | |
| 24 | diagram_0.json, .jpg, .mermaid, .svg, .png |
| 25 | captures/ |
| 26 | capture_0.jpg, capture_0.json |
| 27 | results/ |
| 28 | analysis.md, .html, .pdf |
| 29 | knowledge_graph.json |
| 30 | knowledge_graph.db (when falkordblite installed) |
| 31 | key_points.json |
| 32 | action_items.json |
| 33 | cache/ |
| 34 | |
| 35 | Returns dict mapping directory names to Path objects. |
| @@ -56,12 +56,12 @@ | |
| 56 | |
| 57 | Layout: |
| 58 | output_dir/ |
| 59 | manifest.json |
| 60 | batch_summary.md |
| 61 | knowledge_graph.json |
| 62 | knowledge_graph.db (when falkordblite installed) |
| 63 | videos/ |
| 64 | video_1/manifest.json |
| 65 | video_2/manifest.json |
| 66 | ... |
| 67 | |
| 68 |
| --- video_processor/output_structure.py | |
| +++ video_processor/output_structure.py | |
| @@ -24,12 +24,12 @@ | |
| 24 | diagram_0.json, .jpg, .mermaid, .svg, .png |
| 25 | captures/ |
| 26 | capture_0.jpg, capture_0.json |
| 27 | results/ |
| 28 | analysis.md, .html, .pdf |
| 29 | knowledge_graph.db (primary, SQLite) |
| 30 | knowledge_graph.json (export copy) |
| 31 | key_points.json |
| 32 | action_items.json |
| 33 | cache/ |
| 34 | |
| 35 | Returns dict mapping directory names to Path objects. |
| @@ -56,12 +56,12 @@ | |
| 56 | |
| 57 | Layout: |
| 58 | output_dir/ |
| 59 | manifest.json |
| 60 | batch_summary.md |
| 61 | knowledge_graph.db (primary, SQLite) |
| 62 | knowledge_graph.json (export copy) (when falkordblite installed) |
| 63 | videos/ |
| 64 | video_1/manifest.json |
| 65 | video_2/manifest.json |
| 66 | ... |
| 67 | |
| 68 |
+54
-6
| --- video_processor/pipeline.py | ||
| +++ video_processor/pipeline.py | ||
| @@ -1,9 +1,11 @@ | ||
| 1 | 1 | """Core video processing pipeline — the reusable function both CLI commands call.""" |
| 2 | 2 | |
| 3 | +import hashlib | |
| 3 | 4 | import json |
| 4 | 5 | import logging |
| 6 | +import mimetypes | |
| 5 | 7 | import time |
| 6 | 8 | from datetime import datetime |
| 7 | 9 | from pathlib import Path |
| 8 | 10 | from typing import Optional |
| 9 | 11 | |
| @@ -20,19 +22,30 @@ | ||
| 20 | 22 | from video_processor.integrators.plan_generator import PlanGenerator |
| 21 | 23 | from video_processor.models import ( |
| 22 | 24 | ActionItem, |
| 23 | 25 | KeyPoint, |
| 24 | 26 | ProcessingStats, |
| 27 | + ProgressCallback, | |
| 25 | 28 | VideoManifest, |
| 26 | 29 | VideoMetadata, |
| 27 | 30 | ) |
| 28 | 31 | from video_processor.output_structure import create_video_output_dirs, write_video_manifest |
| 29 | 32 | from video_processor.providers.manager import ProviderManager |
| 30 | 33 | from video_processor.utils.export import export_all_formats |
| 31 | 34 | |
| 32 | 35 | logger = logging.getLogger(__name__) |
| 33 | 36 | |
| 37 | + | |
| 38 | +def _notify(cb: Optional[ProgressCallback], method: str, *args, **kwargs) -> None: | |
| 39 | + """Safely invoke a callback method, logging any errors.""" | |
| 40 | + if cb is None: | |
| 41 | + return | |
| 42 | + try: | |
| 43 | + getattr(cb, method)(*args, **kwargs) | |
| 44 | + except Exception as e: | |
| 45 | + logger.warning(f"Progress callback {method} failed: {e}") | |
| 46 | + | |
| 34 | 47 | |
| 35 | 48 | def process_single_video( |
| 36 | 49 | input_path: str | Path, |
| 37 | 50 | output_dir: str | Path, |
| 38 | 51 | provider_manager: Optional[ProviderManager] = None, |
| @@ -41,10 +54,12 @@ | ||
| 41 | 54 | sampling_rate: float = 0.5, |
| 42 | 55 | change_threshold: float = 0.15, |
| 43 | 56 | periodic_capture_seconds: float = 30.0, |
| 44 | 57 | use_gpu: bool = False, |
| 45 | 58 | title: Optional[str] = None, |
| 59 | + progress_callback: Optional[ProgressCallback] = None, | |
| 60 | + speaker_hints: Optional[list[str]] = None, | |
| 46 | 61 | ) -> VideoManifest: |
| 47 | 62 | """ |
| 48 | 63 | Full pipeline: frames -> audio -> transcription -> diagrams -> KG -> report -> export. |
| 49 | 64 | |
| 50 | 65 | Returns a populated VideoManifest. |
| @@ -74,12 +89,15 @@ | ||
| 74 | 89 | "Extract key points", |
| 75 | 90 | "Generate report", |
| 76 | 91 | "Export formats", |
| 77 | 92 | ] |
| 78 | 93 | pipeline_bar = tqdm(steps, desc="Pipeline", unit="step", position=0) |
| 94 | + | |
| 95 | + total_steps = len(steps) | |
| 79 | 96 | |
| 80 | 97 | # --- Step 1: Extract frames --- |
| 98 | + _notify(progress_callback, "on_step_start", steps[0], 1, total_steps) | |
| 81 | 99 | pm.usage.start_step("Frame extraction") |
| 82 | 100 | pipeline_bar.set_description("Pipeline: extracting frames") |
| 83 | 101 | existing_frames = sorted(dirs["frames"].glob("frame_*.jpg")) |
| 84 | 102 | people_removed = 0 |
| 85 | 103 | if existing_frames: |
| @@ -99,12 +117,14 @@ | ||
| 99 | 117 | # Filter out people/webcam frames before saving |
| 100 | 118 | frames, people_removed = filter_people_frames(frames) |
| 101 | 119 | frame_paths = save_frames(frames, dirs["frames"], "frame") |
| 102 | 120 | logger.info(f"Saved {len(frames)} content frames ({people_removed} people frames filtered)") |
| 103 | 121 | pipeline_bar.update(1) |
| 122 | + _notify(progress_callback, "on_step_complete", steps[0], 1, total_steps) | |
| 104 | 123 | |
| 105 | 124 | # --- Step 2: Extract audio --- |
| 125 | + _notify(progress_callback, "on_step_start", steps[1], 2, total_steps) | |
| 106 | 126 | pm.usage.start_step("Audio extraction") |
| 107 | 127 | pipeline_bar.set_description("Pipeline: extracting audio") |
| 108 | 128 | audio_path = dirs["root"] / "audio" / f"{video_name}.wav" |
| 109 | 129 | audio_extractor = AudioExtractor() |
| 110 | 130 | if audio_path.exists(): |
| @@ -112,12 +132,14 @@ | ||
| 112 | 132 | else: |
| 113 | 133 | logger.info("Extracting audio...") |
| 114 | 134 | audio_path = audio_extractor.extract_audio(input_path, output_path=audio_path) |
| 115 | 135 | audio_props = audio_extractor.get_audio_properties(audio_path) |
| 116 | 136 | pipeline_bar.update(1) |
| 137 | + _notify(progress_callback, "on_step_complete", steps[1], 2, total_steps) | |
| 117 | 138 | |
| 118 | 139 | # --- Step 3: Transcribe --- |
| 140 | + _notify(progress_callback, "on_step_start", steps[2], 3, total_steps) | |
| 119 | 141 | pm.usage.start_step("Transcription") |
| 120 | 142 | pipeline_bar.set_description("Pipeline: transcribing audio") |
| 121 | 143 | transcript_json = dirs["transcript"] / "transcript.json" |
| 122 | 144 | if transcript_json.exists(): |
| 123 | 145 | logger.info("Resuming: found transcript on disk, skipping transcription") |
| @@ -124,11 +146,11 @@ | ||
| 124 | 146 | transcript_data = json.loads(transcript_json.read_text()) |
| 125 | 147 | transcript_text = transcript_data.get("text", "") |
| 126 | 148 | segments = transcript_data.get("segments", []) |
| 127 | 149 | else: |
| 128 | 150 | logger.info("Transcribing audio...") |
| 129 | - transcription = pm.transcribe_audio(audio_path) | |
| 151 | + transcription = pm.transcribe_audio(audio_path, speaker_hints=speaker_hints) | |
| 130 | 152 | transcript_text = transcription.get("text", "") |
| 131 | 153 | segments = transcription.get("segments", []) |
| 132 | 154 | |
| 133 | 155 | # Save transcript files |
| 134 | 156 | transcript_data = { |
| @@ -154,12 +176,14 @@ | ||
| 154 | 176 | srt_lines.append(f"{_format_srt_time(start)} --> {_format_srt_time(end)}") |
| 155 | 177 | srt_lines.append(seg.get("text", "").strip()) |
| 156 | 178 | srt_lines.append("") |
| 157 | 179 | transcript_srt.write_text("\n".join(srt_lines)) |
| 158 | 180 | pipeline_bar.update(1) |
| 181 | + _notify(progress_callback, "on_step_complete", steps[2], 3, total_steps) | |
| 159 | 182 | |
| 160 | 183 | # --- Step 4: Diagram extraction --- |
| 184 | + _notify(progress_callback, "on_step_start", steps[3], 4, total_steps) | |
| 161 | 185 | pm.usage.start_step("Visual analysis") |
| 162 | 186 | pipeline_bar.set_description("Pipeline: analyzing visuals") |
| 163 | 187 | diagrams = [] |
| 164 | 188 | screen_captures = [] |
| 165 | 189 | existing_diagrams = ( |
| @@ -186,31 +210,50 @@ | ||
| 186 | 210 | subset = [frame_paths[int(i * step)] for i in range(max_frames)] |
| 187 | 211 | diagrams, screen_captures = analyzer.process_frames( |
| 188 | 212 | subset, diagrams_dir=dirs["diagrams"], captures_dir=dirs["captures"] |
| 189 | 213 | ) |
| 190 | 214 | pipeline_bar.update(1) |
| 215 | + _notify(progress_callback, "on_step_complete", steps[3], 4, total_steps) | |
| 191 | 216 | |
| 192 | 217 | # --- Step 5: Knowledge graph --- |
| 218 | + _notify(progress_callback, "on_step_start", steps[4], 5, total_steps) | |
| 193 | 219 | pm.usage.start_step("Knowledge graph") |
| 194 | 220 | pipeline_bar.set_description("Pipeline: building knowledge graph") |
| 195 | - kg_json_path = dirs["results"] / "knowledge_graph.json" | |
| 196 | 221 | kg_db_path = dirs["results"] / "knowledge_graph.db" |
| 197 | - if kg_json_path.exists(): | |
| 222 | + kg_json_path = dirs["results"] / "knowledge_graph.json" | |
| 223 | + # Generate a stable source ID from the input path | |
| 224 | + source_id = hashlib.sha256(str(input_path).encode()).hexdigest()[:12] | |
| 225 | + mime_type = mimetypes.guess_type(str(input_path))[0] or "video/mp4" | |
| 226 | + | |
| 227 | + if kg_db_path.exists(): | |
| 198 | 228 | logger.info("Resuming: found knowledge graph on disk, loading") |
| 199 | - kg_data = json.loads(kg_json_path.read_text()) | |
| 200 | - kg = KnowledgeGraph.from_dict(kg_data, db_path=kg_db_path) | |
| 229 | + kg = KnowledgeGraph(provider_manager=pm, db_path=kg_db_path) | |
| 201 | 230 | else: |
| 202 | 231 | logger.info("Building knowledge graph...") |
| 203 | 232 | kg = KnowledgeGraph(provider_manager=pm, db_path=kg_db_path) |
| 233 | + kg.register_source( | |
| 234 | + { | |
| 235 | + "source_id": source_id, | |
| 236 | + "source_type": "video", | |
| 237 | + "title": title, | |
| 238 | + "path": str(input_path), | |
| 239 | + "mime_type": mime_type, | |
| 240 | + "ingested_at": datetime.now().isoformat(), | |
| 241 | + "metadata": {"duration_seconds": audio_props.get("duration")}, | |
| 242 | + } | |
| 243 | + ) | |
| 204 | 244 | kg.process_transcript(transcript_data) |
| 205 | 245 | if diagrams: |
| 206 | 246 | diagram_dicts = [d.model_dump() for d in diagrams] |
| 207 | 247 | kg.process_diagrams(diagram_dicts) |
| 208 | - kg.save(kg_json_path) | |
| 248 | + # Export JSON copy alongside the SQLite db | |
| 249 | + kg.save(kg_json_path) | |
| 209 | 250 | pipeline_bar.update(1) |
| 251 | + _notify(progress_callback, "on_step_complete", steps[4], 5, total_steps) | |
| 210 | 252 | |
| 211 | 253 | # --- Step 6: Extract key points & action items --- |
| 254 | + _notify(progress_callback, "on_step_start", steps[5], 6, total_steps) | |
| 212 | 255 | pm.usage.start_step("Key points & actions") |
| 213 | 256 | pipeline_bar.set_description("Pipeline: extracting key points") |
| 214 | 257 | kp_path = dirs["results"] / "key_points.json" |
| 215 | 258 | ai_path = dirs["results"] / "action_items.json" |
| 216 | 259 | if kp_path.exists() and ai_path.exists(): |
| @@ -222,12 +265,14 @@ | ||
| 222 | 265 | action_items = _extract_action_items(pm, transcript_text) |
| 223 | 266 | |
| 224 | 267 | kp_path.write_text(json.dumps([kp.model_dump() for kp in key_points], indent=2)) |
| 225 | 268 | ai_path.write_text(json.dumps([ai.model_dump() for ai in action_items], indent=2)) |
| 226 | 269 | pipeline_bar.update(1) |
| 270 | + _notify(progress_callback, "on_step_complete", steps[5], 6, total_steps) | |
| 227 | 271 | |
| 228 | 272 | # --- Step 7: Generate markdown report --- |
| 273 | + _notify(progress_callback, "on_step_start", steps[6], 7, total_steps) | |
| 229 | 274 | pm.usage.start_step("Report generation") |
| 230 | 275 | pipeline_bar.set_description("Pipeline: generating report") |
| 231 | 276 | md_path = dirs["results"] / "analysis.md" |
| 232 | 277 | if md_path.exists(): |
| 233 | 278 | logger.info("Resuming: found analysis report on disk, skipping generation") |
| @@ -241,10 +286,11 @@ | ||
| 241 | 286 | knowledge_graph=kg.to_dict(), |
| 242 | 287 | video_title=title, |
| 243 | 288 | output_path=md_path, |
| 244 | 289 | ) |
| 245 | 290 | pipeline_bar.update(1) |
| 291 | + _notify(progress_callback, "on_step_complete", steps[6], 7, total_steps) | |
| 246 | 292 | |
| 247 | 293 | # --- Build manifest --- |
| 248 | 294 | elapsed = time.time() - start_time |
| 249 | 295 | manifest = VideoManifest( |
| 250 | 296 | video=VideoMetadata( |
| @@ -276,16 +322,18 @@ | ||
| 276 | 322 | screen_captures=screen_captures, |
| 277 | 323 | frame_paths=[f"frames/{Path(p).name}" for p in frame_paths], |
| 278 | 324 | ) |
| 279 | 325 | |
| 280 | 326 | # --- Step 8: Export all formats --- |
| 327 | + _notify(progress_callback, "on_step_start", steps[7], 8, total_steps) | |
| 281 | 328 | pm.usage.start_step("Export formats") |
| 282 | 329 | pipeline_bar.set_description("Pipeline: exporting formats") |
| 283 | 330 | manifest = export_all_formats(output_dir, manifest) |
| 284 | 331 | |
| 285 | 332 | pm.usage.end_step() |
| 286 | 333 | pipeline_bar.update(1) |
| 334 | + _notify(progress_callback, "on_step_complete", steps[7], 8, total_steps) | |
| 287 | 335 | pipeline_bar.set_description("Pipeline: complete") |
| 288 | 336 | pipeline_bar.close() |
| 289 | 337 | |
| 290 | 338 | # Write manifest |
| 291 | 339 | write_video_manifest(manifest, output_dir) |
| 292 | 340 | |
| 293 | 341 | ADDED video_processor/processors/__init__.py |
| 294 | 342 | ADDED video_processor/processors/base.py |
| 295 | 343 | ADDED video_processor/processors/ingest.py |
| 296 | 344 | ADDED video_processor/processors/markdown_processor.py |
| 297 | 345 | ADDED video_processor/processors/pdf_processor.py |
| --- video_processor/pipeline.py | |
| +++ video_processor/pipeline.py | |
| @@ -1,9 +1,11 @@ | |
| 1 | """Core video processing pipeline — the reusable function both CLI commands call.""" |
| 2 | |
| 3 | import json |
| 4 | import logging |
| 5 | import time |
| 6 | from datetime import datetime |
| 7 | from pathlib import Path |
| 8 | from typing import Optional |
| 9 | |
| @@ -20,19 +22,30 @@ | |
| 20 | from video_processor.integrators.plan_generator import PlanGenerator |
| 21 | from video_processor.models import ( |
| 22 | ActionItem, |
| 23 | KeyPoint, |
| 24 | ProcessingStats, |
| 25 | VideoManifest, |
| 26 | VideoMetadata, |
| 27 | ) |
| 28 | from video_processor.output_structure import create_video_output_dirs, write_video_manifest |
| 29 | from video_processor.providers.manager import ProviderManager |
| 30 | from video_processor.utils.export import export_all_formats |
| 31 | |
| 32 | logger = logging.getLogger(__name__) |
| 33 | |
| 34 | |
| 35 | def process_single_video( |
| 36 | input_path: str | Path, |
| 37 | output_dir: str | Path, |
| 38 | provider_manager: Optional[ProviderManager] = None, |
| @@ -41,10 +54,12 @@ | |
| 41 | sampling_rate: float = 0.5, |
| 42 | change_threshold: float = 0.15, |
| 43 | periodic_capture_seconds: float = 30.0, |
| 44 | use_gpu: bool = False, |
| 45 | title: Optional[str] = None, |
| 46 | ) -> VideoManifest: |
| 47 | """ |
| 48 | Full pipeline: frames -> audio -> transcription -> diagrams -> KG -> report -> export. |
| 49 | |
| 50 | Returns a populated VideoManifest. |
| @@ -74,12 +89,15 @@ | |
| 74 | "Extract key points", |
| 75 | "Generate report", |
| 76 | "Export formats", |
| 77 | ] |
| 78 | pipeline_bar = tqdm(steps, desc="Pipeline", unit="step", position=0) |
| 79 | |
| 80 | # --- Step 1: Extract frames --- |
| 81 | pm.usage.start_step("Frame extraction") |
| 82 | pipeline_bar.set_description("Pipeline: extracting frames") |
| 83 | existing_frames = sorted(dirs["frames"].glob("frame_*.jpg")) |
| 84 | people_removed = 0 |
| 85 | if existing_frames: |
| @@ -99,12 +117,14 @@ | |
| 99 | # Filter out people/webcam frames before saving |
| 100 | frames, people_removed = filter_people_frames(frames) |
| 101 | frame_paths = save_frames(frames, dirs["frames"], "frame") |
| 102 | logger.info(f"Saved {len(frames)} content frames ({people_removed} people frames filtered)") |
| 103 | pipeline_bar.update(1) |
| 104 | |
| 105 | # --- Step 2: Extract audio --- |
| 106 | pm.usage.start_step("Audio extraction") |
| 107 | pipeline_bar.set_description("Pipeline: extracting audio") |
| 108 | audio_path = dirs["root"] / "audio" / f"{video_name}.wav" |
| 109 | audio_extractor = AudioExtractor() |
| 110 | if audio_path.exists(): |
| @@ -112,12 +132,14 @@ | |
| 112 | else: |
| 113 | logger.info("Extracting audio...") |
| 114 | audio_path = audio_extractor.extract_audio(input_path, output_path=audio_path) |
| 115 | audio_props = audio_extractor.get_audio_properties(audio_path) |
| 116 | pipeline_bar.update(1) |
| 117 | |
| 118 | # --- Step 3: Transcribe --- |
| 119 | pm.usage.start_step("Transcription") |
| 120 | pipeline_bar.set_description("Pipeline: transcribing audio") |
| 121 | transcript_json = dirs["transcript"] / "transcript.json" |
| 122 | if transcript_json.exists(): |
| 123 | logger.info("Resuming: found transcript on disk, skipping transcription") |
| @@ -124,11 +146,11 @@ | |
| 124 | transcript_data = json.loads(transcript_json.read_text()) |
| 125 | transcript_text = transcript_data.get("text", "") |
| 126 | segments = transcript_data.get("segments", []) |
| 127 | else: |
| 128 | logger.info("Transcribing audio...") |
| 129 | transcription = pm.transcribe_audio(audio_path) |
| 130 | transcript_text = transcription.get("text", "") |
| 131 | segments = transcription.get("segments", []) |
| 132 | |
| 133 | # Save transcript files |
| 134 | transcript_data = { |
| @@ -154,12 +176,14 @@ | |
| 154 | srt_lines.append(f"{_format_srt_time(start)} --> {_format_srt_time(end)}") |
| 155 | srt_lines.append(seg.get("text", "").strip()) |
| 156 | srt_lines.append("") |
| 157 | transcript_srt.write_text("\n".join(srt_lines)) |
| 158 | pipeline_bar.update(1) |
| 159 | |
| 160 | # --- Step 4: Diagram extraction --- |
| 161 | pm.usage.start_step("Visual analysis") |
| 162 | pipeline_bar.set_description("Pipeline: analyzing visuals") |
| 163 | diagrams = [] |
| 164 | screen_captures = [] |
| 165 | existing_diagrams = ( |
| @@ -186,31 +210,50 @@ | |
| 186 | subset = [frame_paths[int(i * step)] for i in range(max_frames)] |
| 187 | diagrams, screen_captures = analyzer.process_frames( |
| 188 | subset, diagrams_dir=dirs["diagrams"], captures_dir=dirs["captures"] |
| 189 | ) |
| 190 | pipeline_bar.update(1) |
| 191 | |
| 192 | # --- Step 5: Knowledge graph --- |
| 193 | pm.usage.start_step("Knowledge graph") |
| 194 | pipeline_bar.set_description("Pipeline: building knowledge graph") |
| 195 | kg_json_path = dirs["results"] / "knowledge_graph.json" |
| 196 | kg_db_path = dirs["results"] / "knowledge_graph.db" |
| 197 | if kg_json_path.exists(): |
| 198 | logger.info("Resuming: found knowledge graph on disk, loading") |
| 199 | kg_data = json.loads(kg_json_path.read_text()) |
| 200 | kg = KnowledgeGraph.from_dict(kg_data, db_path=kg_db_path) |
| 201 | else: |
| 202 | logger.info("Building knowledge graph...") |
| 203 | kg = KnowledgeGraph(provider_manager=pm, db_path=kg_db_path) |
| 204 | kg.process_transcript(transcript_data) |
| 205 | if diagrams: |
| 206 | diagram_dicts = [d.model_dump() for d in diagrams] |
| 207 | kg.process_diagrams(diagram_dicts) |
| 208 | kg.save(kg_json_path) |
| 209 | pipeline_bar.update(1) |
| 210 | |
| 211 | # --- Step 6: Extract key points & action items --- |
| 212 | pm.usage.start_step("Key points & actions") |
| 213 | pipeline_bar.set_description("Pipeline: extracting key points") |
| 214 | kp_path = dirs["results"] / "key_points.json" |
| 215 | ai_path = dirs["results"] / "action_items.json" |
| 216 | if kp_path.exists() and ai_path.exists(): |
| @@ -222,12 +265,14 @@ | |
| 222 | action_items = _extract_action_items(pm, transcript_text) |
| 223 | |
| 224 | kp_path.write_text(json.dumps([kp.model_dump() for kp in key_points], indent=2)) |
| 225 | ai_path.write_text(json.dumps([ai.model_dump() for ai in action_items], indent=2)) |
| 226 | pipeline_bar.update(1) |
| 227 | |
| 228 | # --- Step 7: Generate markdown report --- |
| 229 | pm.usage.start_step("Report generation") |
| 230 | pipeline_bar.set_description("Pipeline: generating report") |
| 231 | md_path = dirs["results"] / "analysis.md" |
| 232 | if md_path.exists(): |
| 233 | logger.info("Resuming: found analysis report on disk, skipping generation") |
| @@ -241,10 +286,11 @@ | |
| 241 | knowledge_graph=kg.to_dict(), |
| 242 | video_title=title, |
| 243 | output_path=md_path, |
| 244 | ) |
| 245 | pipeline_bar.update(1) |
| 246 | |
| 247 | # --- Build manifest --- |
| 248 | elapsed = time.time() - start_time |
| 249 | manifest = VideoManifest( |
| 250 | video=VideoMetadata( |
| @@ -276,16 +322,18 @@ | |
| 276 | screen_captures=screen_captures, |
| 277 | frame_paths=[f"frames/{Path(p).name}" for p in frame_paths], |
| 278 | ) |
| 279 | |
| 280 | # --- Step 8: Export all formats --- |
| 281 | pm.usage.start_step("Export formats") |
| 282 | pipeline_bar.set_description("Pipeline: exporting formats") |
| 283 | manifest = export_all_formats(output_dir, manifest) |
| 284 | |
| 285 | pm.usage.end_step() |
| 286 | pipeline_bar.update(1) |
| 287 | pipeline_bar.set_description("Pipeline: complete") |
| 288 | pipeline_bar.close() |
| 289 | |
| 290 | # Write manifest |
| 291 | write_video_manifest(manifest, output_dir) |
| 292 | |
| 293 | ADDED video_processor/processors/__init__.py |
| 294 | ADDED video_processor/processors/base.py |
| 295 | ADDED video_processor/processors/ingest.py |
| 296 | ADDED video_processor/processors/markdown_processor.py |
| 297 | ADDED video_processor/processors/pdf_processor.py |
| --- video_processor/pipeline.py | |
| +++ video_processor/pipeline.py | |
| @@ -1,9 +1,11 @@ | |
| 1 | """Core video processing pipeline — the reusable function both CLI commands call.""" |
| 2 | |
| 3 | import hashlib |
| 4 | import json |
| 5 | import logging |
| 6 | import mimetypes |
| 7 | import time |
| 8 | from datetime import datetime |
| 9 | from pathlib import Path |
| 10 | from typing import Optional |
| 11 | |
| @@ -20,19 +22,30 @@ | |
| 22 | from video_processor.integrators.plan_generator import PlanGenerator |
| 23 | from video_processor.models import ( |
| 24 | ActionItem, |
| 25 | KeyPoint, |
| 26 | ProcessingStats, |
| 27 | ProgressCallback, |
| 28 | VideoManifest, |
| 29 | VideoMetadata, |
| 30 | ) |
| 31 | from video_processor.output_structure import create_video_output_dirs, write_video_manifest |
| 32 | from video_processor.providers.manager import ProviderManager |
| 33 | from video_processor.utils.export import export_all_formats |
| 34 | |
| 35 | logger = logging.getLogger(__name__) |
| 36 | |
| 37 | |
| 38 | def _notify(cb: Optional[ProgressCallback], method: str, *args, **kwargs) -> None: |
| 39 | """Safely invoke a callback method, logging any errors.""" |
| 40 | if cb is None: |
| 41 | return |
| 42 | try: |
| 43 | getattr(cb, method)(*args, **kwargs) |
| 44 | except Exception as e: |
| 45 | logger.warning(f"Progress callback {method} failed: {e}") |
| 46 | |
| 47 | |
| 48 | def process_single_video( |
| 49 | input_path: str | Path, |
| 50 | output_dir: str | Path, |
| 51 | provider_manager: Optional[ProviderManager] = None, |
| @@ -41,10 +54,12 @@ | |
| 54 | sampling_rate: float = 0.5, |
| 55 | change_threshold: float = 0.15, |
| 56 | periodic_capture_seconds: float = 30.0, |
| 57 | use_gpu: bool = False, |
| 58 | title: Optional[str] = None, |
| 59 | progress_callback: Optional[ProgressCallback] = None, |
| 60 | speaker_hints: Optional[list[str]] = None, |
| 61 | ) -> VideoManifest: |
| 62 | """ |
| 63 | Full pipeline: frames -> audio -> transcription -> diagrams -> KG -> report -> export. |
| 64 | |
| 65 | Returns a populated VideoManifest. |
| @@ -74,12 +89,15 @@ | |
| 89 | "Extract key points", |
| 90 | "Generate report", |
| 91 | "Export formats", |
| 92 | ] |
| 93 | pipeline_bar = tqdm(steps, desc="Pipeline", unit="step", position=0) |
| 94 | |
| 95 | total_steps = len(steps) |
| 96 | |
| 97 | # --- Step 1: Extract frames --- |
| 98 | _notify(progress_callback, "on_step_start", steps[0], 1, total_steps) |
| 99 | pm.usage.start_step("Frame extraction") |
| 100 | pipeline_bar.set_description("Pipeline: extracting frames") |
| 101 | existing_frames = sorted(dirs["frames"].glob("frame_*.jpg")) |
| 102 | people_removed = 0 |
| 103 | if existing_frames: |
| @@ -99,12 +117,14 @@ | |
| 117 | # Filter out people/webcam frames before saving |
| 118 | frames, people_removed = filter_people_frames(frames) |
| 119 | frame_paths = save_frames(frames, dirs["frames"], "frame") |
| 120 | logger.info(f"Saved {len(frames)} content frames ({people_removed} people frames filtered)") |
| 121 | pipeline_bar.update(1) |
| 122 | _notify(progress_callback, "on_step_complete", steps[0], 1, total_steps) |
| 123 | |
| 124 | # --- Step 2: Extract audio --- |
| 125 | _notify(progress_callback, "on_step_start", steps[1], 2, total_steps) |
| 126 | pm.usage.start_step("Audio extraction") |
| 127 | pipeline_bar.set_description("Pipeline: extracting audio") |
| 128 | audio_path = dirs["root"] / "audio" / f"{video_name}.wav" |
| 129 | audio_extractor = AudioExtractor() |
| 130 | if audio_path.exists(): |
| @@ -112,12 +132,14 @@ | |
| 132 | else: |
| 133 | logger.info("Extracting audio...") |
| 134 | audio_path = audio_extractor.extract_audio(input_path, output_path=audio_path) |
| 135 | audio_props = audio_extractor.get_audio_properties(audio_path) |
| 136 | pipeline_bar.update(1) |
| 137 | _notify(progress_callback, "on_step_complete", steps[1], 2, total_steps) |
| 138 | |
| 139 | # --- Step 3: Transcribe --- |
| 140 | _notify(progress_callback, "on_step_start", steps[2], 3, total_steps) |
| 141 | pm.usage.start_step("Transcription") |
| 142 | pipeline_bar.set_description("Pipeline: transcribing audio") |
| 143 | transcript_json = dirs["transcript"] / "transcript.json" |
| 144 | if transcript_json.exists(): |
| 145 | logger.info("Resuming: found transcript on disk, skipping transcription") |
| @@ -124,11 +146,11 @@ | |
| 146 | transcript_data = json.loads(transcript_json.read_text()) |
| 147 | transcript_text = transcript_data.get("text", "") |
| 148 | segments = transcript_data.get("segments", []) |
| 149 | else: |
| 150 | logger.info("Transcribing audio...") |
| 151 | transcription = pm.transcribe_audio(audio_path, speaker_hints=speaker_hints) |
| 152 | transcript_text = transcription.get("text", "") |
| 153 | segments = transcription.get("segments", []) |
| 154 | |
| 155 | # Save transcript files |
| 156 | transcript_data = { |
| @@ -154,12 +176,14 @@ | |
| 176 | srt_lines.append(f"{_format_srt_time(start)} --> {_format_srt_time(end)}") |
| 177 | srt_lines.append(seg.get("text", "").strip()) |
| 178 | srt_lines.append("") |
| 179 | transcript_srt.write_text("\n".join(srt_lines)) |
| 180 | pipeline_bar.update(1) |
| 181 | _notify(progress_callback, "on_step_complete", steps[2], 3, total_steps) |
| 182 | |
| 183 | # --- Step 4: Diagram extraction --- |
| 184 | _notify(progress_callback, "on_step_start", steps[3], 4, total_steps) |
| 185 | pm.usage.start_step("Visual analysis") |
| 186 | pipeline_bar.set_description("Pipeline: analyzing visuals") |
| 187 | diagrams = [] |
| 188 | screen_captures = [] |
| 189 | existing_diagrams = ( |
| @@ -186,31 +210,50 @@ | |
| 210 | subset = [frame_paths[int(i * step)] for i in range(max_frames)] |
| 211 | diagrams, screen_captures = analyzer.process_frames( |
| 212 | subset, diagrams_dir=dirs["diagrams"], captures_dir=dirs["captures"] |
| 213 | ) |
| 214 | pipeline_bar.update(1) |
| 215 | _notify(progress_callback, "on_step_complete", steps[3], 4, total_steps) |
| 216 | |
| 217 | # --- Step 5: Knowledge graph --- |
| 218 | _notify(progress_callback, "on_step_start", steps[4], 5, total_steps) |
| 219 | pm.usage.start_step("Knowledge graph") |
| 220 | pipeline_bar.set_description("Pipeline: building knowledge graph") |
| 221 | kg_db_path = dirs["results"] / "knowledge_graph.db" |
| 222 | kg_json_path = dirs["results"] / "knowledge_graph.json" |
| 223 | # Generate a stable source ID from the input path |
| 224 | source_id = hashlib.sha256(str(input_path).encode()).hexdigest()[:12] |
| 225 | mime_type = mimetypes.guess_type(str(input_path))[0] or "video/mp4" |
| 226 | |
| 227 | if kg_db_path.exists(): |
| 228 | logger.info("Resuming: found knowledge graph on disk, loading") |
| 229 | kg = KnowledgeGraph(provider_manager=pm, db_path=kg_db_path) |
| 230 | else: |
| 231 | logger.info("Building knowledge graph...") |
| 232 | kg = KnowledgeGraph(provider_manager=pm, db_path=kg_db_path) |
| 233 | kg.register_source( |
| 234 | { |
| 235 | "source_id": source_id, |
| 236 | "source_type": "video", |
| 237 | "title": title, |
| 238 | "path": str(input_path), |
| 239 | "mime_type": mime_type, |
| 240 | "ingested_at": datetime.now().isoformat(), |
| 241 | "metadata": {"duration_seconds": audio_props.get("duration")}, |
| 242 | } |
| 243 | ) |
| 244 | kg.process_transcript(transcript_data) |
| 245 | if diagrams: |
| 246 | diagram_dicts = [d.model_dump() for d in diagrams] |
| 247 | kg.process_diagrams(diagram_dicts) |
| 248 | # Export JSON copy alongside the SQLite db |
| 249 | kg.save(kg_json_path) |
| 250 | pipeline_bar.update(1) |
| 251 | _notify(progress_callback, "on_step_complete", steps[4], 5, total_steps) |
| 252 | |
| 253 | # --- Step 6: Extract key points & action items --- |
| 254 | _notify(progress_callback, "on_step_start", steps[5], 6, total_steps) |
| 255 | pm.usage.start_step("Key points & actions") |
| 256 | pipeline_bar.set_description("Pipeline: extracting key points") |
| 257 | kp_path = dirs["results"] / "key_points.json" |
| 258 | ai_path = dirs["results"] / "action_items.json" |
| 259 | if kp_path.exists() and ai_path.exists(): |
| @@ -222,12 +265,14 @@ | |
| 265 | action_items = _extract_action_items(pm, transcript_text) |
| 266 | |
| 267 | kp_path.write_text(json.dumps([kp.model_dump() for kp in key_points], indent=2)) |
| 268 | ai_path.write_text(json.dumps([ai.model_dump() for ai in action_items], indent=2)) |
| 269 | pipeline_bar.update(1) |
| 270 | _notify(progress_callback, "on_step_complete", steps[5], 6, total_steps) |
| 271 | |
| 272 | # --- Step 7: Generate markdown report --- |
| 273 | _notify(progress_callback, "on_step_start", steps[6], 7, total_steps) |
| 274 | pm.usage.start_step("Report generation") |
| 275 | pipeline_bar.set_description("Pipeline: generating report") |
| 276 | md_path = dirs["results"] / "analysis.md" |
| 277 | if md_path.exists(): |
| 278 | logger.info("Resuming: found analysis report on disk, skipping generation") |
| @@ -241,10 +286,11 @@ | |
| 286 | knowledge_graph=kg.to_dict(), |
| 287 | video_title=title, |
| 288 | output_path=md_path, |
| 289 | ) |
| 290 | pipeline_bar.update(1) |
| 291 | _notify(progress_callback, "on_step_complete", steps[6], 7, total_steps) |
| 292 | |
| 293 | # --- Build manifest --- |
| 294 | elapsed = time.time() - start_time |
| 295 | manifest = VideoManifest( |
| 296 | video=VideoMetadata( |
| @@ -276,16 +322,18 @@ | |
| 322 | screen_captures=screen_captures, |
| 323 | frame_paths=[f"frames/{Path(p).name}" for p in frame_paths], |
| 324 | ) |
| 325 | |
| 326 | # --- Step 8: Export all formats --- |
| 327 | _notify(progress_callback, "on_step_start", steps[7], 8, total_steps) |
| 328 | pm.usage.start_step("Export formats") |
| 329 | pipeline_bar.set_description("Pipeline: exporting formats") |
| 330 | manifest = export_all_formats(output_dir, manifest) |
| 331 | |
| 332 | pm.usage.end_step() |
| 333 | pipeline_bar.update(1) |
| 334 | _notify(progress_callback, "on_step_complete", steps[7], 8, total_steps) |
| 335 | pipeline_bar.set_description("Pipeline: complete") |
| 336 | pipeline_bar.close() |
| 337 | |
| 338 | # Write manifest |
| 339 | write_video_manifest(manifest, output_dir) |
| 340 | |
| 341 | ADDED video_processor/processors/__init__.py |
| 342 | ADDED video_processor/processors/base.py |
| 343 | ADDED video_processor/processors/ingest.py |
| 344 | ADDED video_processor/processors/markdown_processor.py |
| 345 | ADDED video_processor/processors/pdf_processor.py |
| --- a/video_processor/processors/__init__.py | ||
| +++ b/video_processor/processors/__init__.py | ||
| @@ -0,0 +1,23 @@ | ||
| 1 | +"""Document processors for ingesting files into knowledge graphs.""" | |
| 2 | + | |
| 3 | +from video_processor.processors.base import ( | |
| 4 | + DocumentChunk, | |
| 5 | + DocumentProcessor, | |
| 6 | + get_processor, | |
| 7 | + list_supported_extensions, | |
| 8 | + register_processor, | |
| 9 | +) | |
| 10 | + | |
| 11 | +__all__ = [ | |
| 12 | + "DocumentChunk", | |
| 13 | + "DocumentProcessor", | |
| 14 | + "get_processor", | |
| 15 | + "list_supported_extensions", | |
| 16 | + "register_processor", | |
| 17 | +] | |
| 18 | + | |
| 19 | +# Auto-register built-in processors on import | |
| 20 | +from video_processor.processors import ( | |
| 21 | + markdown_processor, # noqa: F401, E402 | |
| 22 | + pdf_processor, # noqa: F401, E402 | |
| 23 | +) |
| --- a/video_processor/processors/__init__.py | |
| +++ b/video_processor/processors/__init__.py | |
| @@ -0,0 +1,23 @@ | |
| --- a/video_processor/processors/__init__.py | |
| +++ b/video_processor/processors/__init__.py | |
| @@ -0,0 +1,23 @@ | |
| 1 | """Document processors for ingesting files into knowledge graphs.""" |
| 2 | |
| 3 | from video_processor.processors.base import ( |
| 4 | DocumentChunk, |
| 5 | DocumentProcessor, |
| 6 | get_processor, |
| 7 | list_supported_extensions, |
| 8 | register_processor, |
| 9 | ) |
| 10 | |
| 11 | __all__ = [ |
| 12 | "DocumentChunk", |
| 13 | "DocumentProcessor", |
| 14 | "get_processor", |
| 15 | "list_supported_extensions", |
| 16 | "register_processor", |
| 17 | ] |
| 18 | |
| 19 | # Auto-register built-in processors on import |
| 20 | from video_processor.processors import ( |
| 21 | markdown_processor, # noqa: F401, E402 |
| 22 | pdf_processor, # noqa: F401, E402 |
| 23 | ) |
| --- a/video_processor/processors/base.py | ||
| +++ b/video_processor/processors/base.py | ||
| @@ -0,0 +1,56 @@ | ||
| 1 | +"""Base classes and registry for document processors.""" | |
| 2 | + | |
| 3 | +from abc import ABC, abstractmethod | |
| 4 | +from pathlib import Path | |
| 5 | +from typing import Any, Dict, List, Optional | |
| 6 | + | |
| 7 | +from pydantic import BaseModel, Field | |
| 8 | + | |
| 9 | + | |
| 10 | +class DocumentChunk(BaseModel): | |
| 11 | + """A chunk of text from a processed document.""" | |
| 12 | + | |
| 13 | + text: str | |
| 14 | + source_file: str | |
| 15 | + chunk_index: int = 0 | |
| 16 | + page: Optional[int] = None | |
| 17 | + section: Optional[str] = None | |
| 18 | + metadata: Dict[str, Any] = Field(default_factory=dict) | |
| 19 | + | |
| 20 | + | |
| 21 | +class DocumentProcessor(ABC): | |
| 22 | + """Base class for document processors.""" | |
| 23 | + | |
| 24 | + supported_extensions: List[str] = [] | |
| 25 | + | |
| 26 | + @abstractmethod | |
| 27 | + def process(self, path: Path) -> List[DocumentChunk]: | |
| 28 | + """Process a document into chunks.""" | |
| 29 | + ... | |
| 30 | + | |
| 31 | + @abstractmethod | |
| 32 | + def can_process(self, path: Path) -> bool: | |
| 33 | + """Check if this processor can handle the file.""" | |
| 34 | + ... | |
| 35 | + | |
| 36 | + | |
| 37 | +# Registry | |
| 38 | +_processors: Dict[str, type] = {} | |
| 39 | + | |
| 40 | + | |
| 41 | +def register_processor(extensions: List[str], processor_class: type) -> None: | |
| 42 | + """Register a processor class for the given file extensions.""" | |
| 43 | + for ext in extensions: | |
| 44 | + _processors[ext.lower()] = processor_class | |
| 45 | + | |
| 46 | + | |
| 47 | +def get_processor(path: Path) -> Optional[DocumentProcessor]: | |
| 48 | + """Get a processor instance for the given file path, or None if unsupported.""" | |
| 49 | + ext = path.suffix.lower() | |
| 50 | + cls = _processors.get(ext) | |
| 51 | + return cls() if cls else None | |
| 52 | + | |
| 53 | + | |
| 54 | +def list_supported_extensions() -> List[str]: | |
| 55 | + """Return sorted list of all registered file extensions.""" | |
| 56 | + return sorted(_processors.keys()) |
| --- a/video_processor/processors/base.py | |
| +++ b/video_processor/processors/base.py | |
| @@ -0,0 +1,56 @@ | |
| --- a/video_processor/processors/base.py | |
| +++ b/video_processor/processors/base.py | |
| @@ -0,0 +1,56 @@ | |
| 1 | """Base classes and registry for document processors.""" |
| 2 | |
| 3 | from abc import ABC, abstractmethod |
| 4 | from pathlib import Path |
| 5 | from typing import Any, Dict, List, Optional |
| 6 | |
| 7 | from pydantic import BaseModel, Field |
| 8 | |
| 9 | |
class DocumentChunk(BaseModel):
    """A chunk of text from a processed document."""

    # Extracted text content of this chunk.
    text: str
    # Path of the originating file, stored as a string.
    source_file: str
    # Zero-based position of the chunk within its document.
    chunk_index: int = 0
    # 1-based page number for paginated formats (e.g. PDF); None otherwise.
    page: Optional[int] = None
    # Heading/section title for structured formats (e.g. Markdown); None otherwise.
    section: Optional[str] = None
    # Free-form extra data, e.g. {"extraction_method": "pymupdf"}.
    metadata: Dict[str, Any] = Field(default_factory=dict)
| 19 | |
| 20 | |
class DocumentProcessor(ABC):
    """Base class for document processors.

    Subclasses declare the file extensions they handle via
    ``supported_extensions`` and implement ``process`` to turn a file
    into a list of :class:`DocumentChunk` objects.
    """

    # Lowercase file extensions (including the leading dot) this processor handles.
    supported_extensions: List[str] = []

    @abstractmethod
    def process(self, path: Path) -> List[DocumentChunk]:
        """Process a document into chunks.

        Args:
            path: File to process.

        Returns:
            Ordered list of extracted chunks (may be empty).
        """
        ...

    @abstractmethod
    def can_process(self, path: Path) -> bool:
        """Check if this processor can handle the file."""
        ...
| 35 | |
| 36 | |
# Registry mapping a lowercase file extension (with leading dot) to a
# DocumentProcessor subclass.
_processors: Dict[str, type] = {}


def register_processor(extensions: List[str], processor_class: type) -> None:
    """Register a processor class for the given file extensions.

    Extensions are normalized to lowercase with a leading dot, so "md" and
    ".MD" both register under ".md". Without the dot normalization an
    extension registered as "md" could never match ``Path.suffix`` (which
    always includes the dot) in :func:`get_processor`.

    Args:
        extensions: File extensions to associate with the class.
        processor_class: DocumentProcessor subclass to instantiate for them.
    """
    for ext in extensions:
        key = ext.lower()
        if not key.startswith("."):
            key = "." + key  # Path.suffix always carries the leading dot
        _processors[key] = processor_class


def get_processor(path: Path) -> Optional["DocumentProcessor"]:
    """Return a new processor instance for the given file path.

    Returns:
        A fresh instance of the registered processor class, or None if the
        file's extension has no registered processor.
    """
    cls = _processors.get(path.suffix.lower())
    return cls() if cls else None


def list_supported_extensions() -> List[str]:
    """Return a sorted list of all registered file extensions."""
    return sorted(_processors.keys())
| --- a/video_processor/processors/ingest.py | ||
| +++ b/video_processor/processors/ingest.py | ||
| @@ -0,0 +1,88 @@ | ||
| 1 | +"""Document ingestion — process files and add content to a knowledge graph.""" | |
| 2 | + | |
| 3 | +import hashlib | |
| 4 | +import logging | |
| 5 | +import mimetypes | |
| 6 | +from datetime import datetime | |
| 7 | +from pathlib import Path | |
| 8 | +from typing import Dict, List, Optional | |
| 9 | + | |
| 10 | +from video_processor.integrators.knowledge_graph import KnowledgeGraph | |
| 11 | +from video_processor.processors.base import get_processor, list_supported_extensions | |
| 12 | + | |
| 13 | +logger = logging.getLogger(__name__) | |
| 14 | + | |
| 15 | + | |
| 16 | +def ingest_file( | |
| 17 | + path: Path, | |
| 18 | + knowledge_graph: KnowledgeGraph, | |
| 19 | + source_id: Optional[str] = None, | |
| 20 | +) -> int: | |
| 21 | + """Process a single file and add its content to the knowledge graph. | |
| 22 | + | |
| 23 | + Returns the number of chunks processed. | |
| 24 | + """ | |
| 25 | + processor = get_processor(path) | |
| 26 | + if processor is None: | |
| 27 | + raise ValueError( | |
| 28 | + f"No processor for {path.suffix}. Supported: {', '.join(list_supported_extensions())}" | |
| 29 | + ) | |
| 30 | + | |
| 31 | + chunks = processor.process(path) | |
| 32 | + | |
| 33 | + if source_id is None: | |
| 34 | + source_id = hashlib.sha256(str(path.resolve()).encode()).hexdigest()[:12] | |
| 35 | + | |
| 36 | + mime = mimetypes.guess_type(str(path))[0] or "application/octet-stream" | |
| 37 | + knowledge_graph.register_source( | |
| 38 | + { | |
| 39 | + "source_id": source_id, | |
| 40 | + "source_type": "document", | |
| 41 | + "title": path.stem, | |
| 42 | + "path": str(path), | |
| 43 | + "mime_type": mime, | |
| 44 | + "ingested_at": datetime.now().isoformat(), | |
| 45 | + "metadata": {"chunks": len(chunks), "extension": path.suffix}, | |
| 46 | + } | |
| 47 | + ) | |
| 48 | + | |
| 49 | + for chunk in chunks: | |
| 50 | + content_source = f"document:{path.name}" | |
| 51 | + if chunk.page is not None: | |
| 52 | + content_source += f":page:{chunk.page}" | |
| 53 | + elif chunk.section: | |
| 54 | + content_source += f":section:{chunk.section}" | |
| 55 | + knowledge_graph.add_content(chunk.text, content_source) | |
| 56 | + | |
| 57 | + return len(chunks) | |
| 58 | + | |
| 59 | + | |
| 60 | +def ingest_directory( | |
| 61 | + directory: Path, | |
| 62 | + knowledge_graph: KnowledgeGraph, | |
| 63 | + recursive: bool = True, | |
| 64 | + extensions: Optional[List[str]] = None, | |
| 65 | +) -> Dict[str, int]: | |
| 66 | + """Process all supported files in a directory. | |
| 67 | + | |
| 68 | + Returns a dict mapping filename to chunk count. | |
| 69 | + """ | |
| 70 | + if not directory.is_dir(): | |
| 71 | + raise ValueError(f"Not a directory: {directory}") | |
| 72 | + | |
| 73 | + supported = set(extensions) if extensions else set(list_supported_extensions()) | |
| 74 | + results: Dict[str, int] = {} | |
| 75 | + | |
| 76 | + glob_fn = directory.rglob if recursive else directory.glob | |
| 77 | + files = sorted(f for f in glob_fn("*") if f.is_file() and f.suffix.lower() in supported) | |
| 78 | + | |
| 79 | + for file_path in files: | |
| 80 | + try: | |
| 81 | + count = ingest_file(file_path, knowledge_graph) | |
| 82 | + results[str(file_path)] = count | |
| 83 | + logger.info(f"Ingested {file_path.name}: {count} chunks") | |
| 84 | + except Exception as e: | |
| 85 | + logger.warning(f"Failed to ingest {file_path.name}: {e}") | |
| 86 | + results[str(file_path)] = 0 | |
| 87 | + | |
| 88 | + return results |
| --- a/video_processor/processors/ingest.py | |
| +++ b/video_processor/processors/ingest.py | |
| @@ -0,0 +1,88 @@ | |
| --- a/video_processor/processors/ingest.py | |
| +++ b/video_processor/processors/ingest.py | |
| @@ -0,0 +1,88 @@ | |
| 1 | """Document ingestion — process files and add content to a knowledge graph.""" |
| 2 | |
| 3 | import hashlib |
| 4 | import logging |
| 5 | import mimetypes |
| 6 | from datetime import datetime |
| 7 | from pathlib import Path |
| 8 | from typing import Dict, List, Optional |
| 9 | |
| 10 | from video_processor.integrators.knowledge_graph import KnowledgeGraph |
| 11 | from video_processor.processors.base import get_processor, list_supported_extensions |
| 12 | |
| 13 | logger = logging.getLogger(__name__) |
| 14 | |
| 15 | |
def ingest_file(
    path: Path,
    knowledge_graph: KnowledgeGraph,
    source_id: Optional[str] = None,
) -> int:
    """Process a single file and add its content to the knowledge graph.

    Args:
        path: File to ingest.
        knowledge_graph: Graph that receives the registered source and chunks.
        source_id: Stable identifier for the source; derived from the resolved
            path when omitted.

    Returns:
        The number of chunks processed.

    Raises:
        ValueError: If no processor is registered for the file's extension.
    """
    processor = get_processor(path)
    if processor is None:
        supported = ", ".join(list_supported_extensions())
        raise ValueError(f"No processor for {path.suffix}. Supported: {supported}")

    chunks = processor.process(path)

    if source_id is None:
        # Hash the resolved path so the same file yields the same ID.
        digest = hashlib.sha256(str(path.resolve()).encode()).hexdigest()
        source_id = digest[:12]

    guessed_mime = mimetypes.guess_type(str(path))[0]
    knowledge_graph.register_source(
        {
            "source_id": source_id,
            "source_type": "document",
            "title": path.stem,
            "path": str(path),
            "mime_type": guessed_mime or "application/octet-stream",
            "ingested_at": datetime.now().isoformat(),
            "metadata": {"chunks": len(chunks), "extension": path.suffix},
        }
    )

    for chunk in chunks:
        # Tag each chunk with its provenance: file name plus page or section.
        tag_parts = [f"document:{path.name}"]
        if chunk.page is not None:
            tag_parts.append(f"page:{chunk.page}")
        elif chunk.section:
            tag_parts.append(f"section:{chunk.section}")
        knowledge_graph.add_content(chunk.text, ":".join(tag_parts))

    return len(chunks)
| 58 | |
| 59 | |
def ingest_directory(
    directory: Path,
    knowledge_graph: KnowledgeGraph,
    recursive: bool = True,
    extensions: Optional[List[str]] = None,
) -> Dict[str, int]:
    """Process all supported files in a directory.

    Args:
        directory: Directory to scan.
        knowledge_graph: Graph that receives the ingested content.
        recursive: Descend into subdirectories when True.
        extensions: Optional whitelist of file extensions; defaults to all
            registered processor extensions. Entries are normalized to
            lowercase with a leading dot, so "md" and ".MD" both work.

    Returns:
        A dict mapping file path to chunk count; files that failed map to 0.

    Raises:
        ValueError: If *directory* is not an existing directory.
    """
    if not directory.is_dir():
        raise ValueError(f"Not a directory: {directory}")

    if extensions:
        # Normalize caller-supplied extensions so they can match the
        # lowercase Path.suffix comparison below (suffix includes the dot).
        supported = {
            e.lower() if e.startswith(".") else "." + e.lower() for e in extensions
        }
    else:
        supported = set(list_supported_extensions())
    results: Dict[str, int] = {}

    glob_fn = directory.rglob if recursive else directory.glob
    files = sorted(f for f in glob_fn("*") if f.is_file() and f.suffix.lower() in supported)

    for file_path in files:
        try:
            count = ingest_file(file_path, knowledge_graph)
            results[str(file_path)] = count
            # Lazy %-args: the message is only built if INFO is enabled.
            logger.info("Ingested %s: %d chunks", file_path.name, count)
        except Exception as e:
            # Best-effort: one bad file must not abort the whole directory run.
            logger.warning("Failed to ingest %s: %s", file_path.name, e)
            results[str(file_path)] = 0

    return results
| --- a/video_processor/processors/markdown_processor.py | ||
| +++ b/video_processor/processors/markdown_processor.py | ||
| @@ -0,0 +1,133 @@ | ||
| 1 | +"""Markdown and plaintext document processors.""" | |
| 2 | + | |
| 3 | +import re | |
| 4 | +from pathlib import Path | |
| 5 | +from typing import List | |
| 6 | + | |
| 7 | +from video_processor.processors.base import ( | |
| 8 | + DocumentChunk, | |
| 9 | + DocumentProcessor, | |
| 10 | + register_processor, | |
| 11 | +) | |
| 12 | + | |
| 13 | + | |
| 14 | +class MarkdownProcessor(DocumentProcessor): | |
| 15 | + """Process Markdown files by splitting on headings.""" | |
| 16 | + | |
| 17 | + supported_extensions = [".md", ".markdown"] | |
| 18 | + | |
| 19 | + def can_process(self, path: Path) -> bool: | |
| 20 | + return path.suffix.lower() in self.supported_extensions | |
| 21 | + | |
| 22 | + def process(self, path: Path) -> List[DocumentChunk]: | |
| 23 | + text = path.read_text(encoding="utf-8") | |
| 24 | + source = str(path) | |
| 25 | + | |
| 26 | + # Split by headings (lines starting with # or ##) | |
| 27 | + heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE) | |
| 28 | + matches = list(heading_pattern.finditer(text)) | |
| 29 | + | |
| 30 | + if not matches: | |
| 31 | + # No headings — chunk by paragraphs | |
| 32 | + return _chunk_by_paragraphs(text, source) | |
| 33 | + | |
| 34 | + chunks: List[DocumentChunk] = [] | |
| 35 | + | |
| 36 | + # Content before the first heading | |
| 37 | + if matches[0].start() > 0: | |
| 38 | + preamble = text[: matches[0].start()].strip() | |
| 39 | + if preamble: | |
| 40 | + chunks.append( | |
| 41 | + DocumentChunk( | |
| 42 | + text=preamble, | |
| 43 | + source_file=source, | |
| 44 | + chunk_index=0, | |
| 45 | + section="(preamble)", | |
| 46 | + ) | |
| 47 | + ) | |
| 48 | + | |
| 49 | + for i, match in enumerate(matches): | |
| 50 | + section_title = match.group(2).strip() | |
| 51 | + start = match.start() | |
| 52 | + end = matches[i + 1].start() if i + 1 < len(matches) else len(text) | |
| 53 | + section_text = text[start:end].strip() | |
| 54 | + | |
| 55 | + if section_text: | |
| 56 | + chunks.append( | |
| 57 | + DocumentChunk( | |
| 58 | + text=section_text, | |
| 59 | + source_file=source, | |
| 60 | + chunk_index=len(chunks), | |
| 61 | + section=section_title, | |
| 62 | + ) | |
| 63 | + ) | |
| 64 | + | |
| 65 | + return chunks | |
| 66 | + | |
| 67 | + | |
| 68 | +class PlaintextProcessor(DocumentProcessor): | |
| 69 | + """Process plaintext files by splitting on paragraph boundaries.""" | |
| 70 | + | |
| 71 | + supported_extensions = [".txt", ".text", ".log", ".csv"] | |
| 72 | + | |
| 73 | + def can_process(self, path: Path) -> bool: | |
| 74 | + return path.suffix.lower() in self.supported_extensions | |
| 75 | + | |
| 76 | + def process(self, path: Path) -> List[DocumentChunk]: | |
| 77 | + text = path.read_text(encoding="utf-8") | |
| 78 | + return _chunk_by_paragraphs(text, str(path)) | |
| 79 | + | |
| 80 | + | |
| 81 | +def _chunk_by_paragraphs( | |
| 82 | + text: str, | |
| 83 | + source_file: str, | |
| 84 | + max_chunk_size: int = 2000, | |
| 85 | + overlap: int = 200, | |
| 86 | +) -> List[DocumentChunk]: | |
| 87 | + """Split text into chunks by paragraph boundaries with configurable size and overlap.""" | |
| 88 | + # Split on double newlines (paragraph boundaries) | |
| 89 | + paragraphs = re.split(r"\n\s*\n", text) | |
| 90 | + paragraphs = [p.strip() for p in paragraphs if p.strip()] | |
| 91 | + | |
| 92 | + if not paragraphs: | |
| 93 | + return [] | |
| 94 | + | |
| 95 | + chunks: List[DocumentChunk] = [] | |
| 96 | + current_text = "" | |
| 97 | + | |
| 98 | + for para in paragraphs: | |
| 99 | + candidate = (current_text + "\n\n" + para).strip() if current_text else para | |
| 100 | + | |
| 101 | + if len(candidate) > max_chunk_size and current_text: | |
| 102 | + # Flush current chunk | |
| 103 | + chunks.append( | |
| 104 | + DocumentChunk( | |
| 105 | + text=current_text, | |
| 106 | + source_file=source_file, | |
| 107 | + chunk_index=len(chunks), | |
| 108 | + ) | |
| 109 | + ) | |
| 110 | + # Start next chunk with overlap from the end of current | |
| 111 | + if overlap > 0 and len(current_text) > overlap: | |
| 112 | + current_text = current_text[-overlap:] + "\n\n" + para | |
| 113 | + else: | |
| 114 | + current_text = para | |
| 115 | + else: | |
| 116 | + current_text = candidate | |
| 117 | + | |
| 118 | + # Flush remaining | |
| 119 | + if current_text.strip(): | |
| 120 | + chunks.append( | |
| 121 | + DocumentChunk( | |
| 122 | + text=current_text.strip(), | |
| 123 | + source_file=source_file, | |
| 124 | + chunk_index=len(chunks), | |
| 125 | + ) | |
| 126 | + ) | |
| 127 | + | |
| 128 | + return chunks | |
| 129 | + | |
| 130 | + | |
| 131 | +# Register processors | |
| 132 | +register_processor(MarkdownProcessor.supported_extensions, MarkdownProcessor) | |
| 133 | +register_processor(PlaintextProcessor.supported_extensions, PlaintextProcessor) |
| --- a/video_processor/processors/markdown_processor.py | |
| +++ b/video_processor/processors/markdown_processor.py | |
| @@ -0,0 +1,133 @@ | |
| --- a/video_processor/processors/markdown_processor.py | |
| +++ b/video_processor/processors/markdown_processor.py | |
| @@ -0,0 +1,133 @@ | |
| 1 | """Markdown and plaintext document processors.""" |
| 2 | |
| 3 | import re |
| 4 | from pathlib import Path |
| 5 | from typing import List |
| 6 | |
| 7 | from video_processor.processors.base import ( |
| 8 | DocumentChunk, |
| 9 | DocumentProcessor, |
| 10 | register_processor, |
| 11 | ) |
| 12 | |
| 13 | |
class MarkdownProcessor(DocumentProcessor):
    """Process Markdown files by splitting on headings."""

    supported_extensions = [".md", ".markdown"]

    def can_process(self, path: Path) -> bool:
        """Return True for files with a Markdown extension."""
        return path.suffix.lower() in self.supported_extensions

    def process(self, path: Path) -> List[DocumentChunk]:
        """Split the file into one chunk per heading section.

        Falls back to paragraph chunking when the file has no ATX headings.
        Any text appearing before the first heading becomes a "(preamble)"
        chunk.
        """
        content = path.read_text(encoding="utf-8")
        source_name = str(path)

        # ATX headings: one to six '#' characters followed by a title.
        headings = list(re.finditer(r"^(#{1,6})\s+(.+)$", content, re.MULTILINE))
        if not headings:
            return _chunk_by_paragraphs(content, source_name)

        chunks: List[DocumentChunk] = []

        # Capture any content that precedes the first heading.
        if headings[0].start() > 0:
            preamble = content[: headings[0].start()].strip()
            if preamble:
                chunks.append(
                    DocumentChunk(
                        text=preamble,
                        source_file=source_name,
                        chunk_index=0,
                        section="(preamble)",
                    )
                )

        # Section boundaries: each heading's start, with EOF as the sentinel.
        boundaries = [m.start() for m in headings] + [len(content)]
        for idx, heading in enumerate(headings):
            body = content[boundaries[idx] : boundaries[idx + 1]].strip()
            if not body:
                continue
            chunks.append(
                DocumentChunk(
                    text=body,
                    source_file=source_name,
                    chunk_index=len(chunks),
                    section=heading.group(2).strip(),
                )
            )

        return chunks
| 66 | |
| 67 | |
class PlaintextProcessor(DocumentProcessor):
    """Process plaintext files by splitting on paragraph boundaries."""

    supported_extensions = [".txt", ".text", ".log", ".csv"]

    def can_process(self, path: Path) -> bool:
        """Return True for files with a recognized plaintext extension."""
        return path.suffix.lower() in self.supported_extensions

    def process(self, path: Path) -> List[DocumentChunk]:
        """Read the whole file and delegate to paragraph chunking."""
        return _chunk_by_paragraphs(path.read_text(encoding="utf-8"), str(path))
| 79 | |
| 80 | |
def _chunk_by_paragraphs(
    text: str,
    source_file: str,
    max_chunk_size: int = 2000,
    overlap: int = 200,
) -> List[DocumentChunk]:
    """Split text into chunks by paragraph boundaries with size and overlap.

    Paragraphs (separated by blank lines) are accumulated until appending
    the next one would exceed *max_chunk_size*; the accumulated text is then
    flushed as a chunk and its last *overlap* characters are carried over
    to seed the next chunk for context continuity.
    """
    # Blank-line-separated paragraphs, empties dropped.
    paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
    if not paragraphs:
        return []

    chunks: List[DocumentChunk] = []

    def _flush(body: str) -> None:
        # Append one chunk with the next sequential index.
        chunks.append(
            DocumentChunk(text=body, source_file=source_file, chunk_index=len(chunks))
        )

    buffer = ""
    for paragraph in paragraphs:
        if buffer:
            candidate = (buffer + "\n\n" + paragraph).strip()
        else:
            candidate = paragraph

        if len(candidate) > max_chunk_size and buffer:
            _flush(buffer)
            # Seed the next chunk with trailing context from the flushed one.
            if overlap > 0 and len(buffer) > overlap:
                buffer = buffer[-overlap:] + "\n\n" + paragraph
            else:
                buffer = paragraph
        else:
            buffer = candidate

    # Flush whatever remains after the last paragraph.
    if buffer.strip():
        _flush(buffer.strip())

    return chunks
| 129 | |
| 130 | |
# Register processors
# Module import side effect: makes these processors discoverable via
# get_processor() for their declared extensions.
register_processor(MarkdownProcessor.supported_extensions, MarkdownProcessor)
register_processor(PlaintextProcessor.supported_extensions, PlaintextProcessor)
| --- a/video_processor/processors/pdf_processor.py | ||
| +++ b/video_processor/processors/pdf_processor.py | ||
| @@ -0,0 +1,77 @@ | ||
| 1 | +"""PDF document processor with graceful fallback between extraction libraries.""" | |
| 2 | + | |
| 3 | +from pathlib import Path | |
| 4 | +from typing import List | |
| 5 | + | |
| 6 | +from video_processor.processors.base import ( | |
| 7 | + DocumentChunk, | |
| 8 | + DocumentProcessor, | |
| 9 | + register_processor, | |
| 10 | +) | |
| 11 | + | |
| 12 | + | |
| 13 | +class PdfProcessor(DocumentProcessor): | |
| 14 | + """Process PDF files using pymupdf or pdfplumber.""" | |
| 15 | + | |
| 16 | + supported_extensions = [".pdf"] | |
| 17 | + | |
| 18 | + def can_process(self, path: Path) -> bool: | |
| 19 | + return path.suffix.lower() in self.supported_extensions | |
| 20 | + | |
| 21 | + def process(self, path: Path) -> List[DocumentChunk]: | |
| 22 | + """Process a PDF, trying pymupdf first, then pdfplumber.""" | |
| 23 | + try: | |
| 24 | + return self._process_pymupdf(path) | |
| 25 | + except ImportError: | |
| 26 | + pass | |
| 27 | + | |
| 28 | + try: | |
| 29 | + return self._process_pdfplumber(path) | |
| 30 | + except ImportError: | |
| 31 | + raise ImportError( | |
| 32 | + "PDF processing requires pymupdf or pdfplumber. " | |
| 33 | + "Install with: pip install 'planopticon[pdf]' OR pip install pdfplumber" | |
| 34 | + ) | |
| 35 | + | |
| 36 | + def _process_pymupdf(self, path: Path) -> List[DocumentChunk]: | |
| 37 | + import pymupdf | |
| 38 | + | |
| 39 | + doc = pymupdf.open(str(path)) | |
| 40 | + chunks: List[DocumentChunk] = [] | |
| 41 | + for page_num, page in enumerate(doc): | |
| 42 | + text = page.get_text() | |
| 43 | + if text.strip(): | |
| 44 | + chunks.append( | |
| 45 | + DocumentChunk( | |
| 46 | + text=text, | |
| 47 | + source_file=str(path), | |
| 48 | + chunk_index=page_num, | |
| 49 | + page=page_num + 1, | |
| 50 | + metadata={"extraction_method": "pymupdf"}, | |
| 51 | + ) | |
| 52 | + ) | |
| 53 | + doc.close() | |
| 54 | + return chunks | |
| 55 | + | |
| 56 | + def _process_pdfplumber(self, path: Path) -> List[DocumentChunk]: | |
| 57 | + import pdfplumber | |
| 58 | + | |
| 59 | + chunks: List[DocumentChunk] = [] | |
| 60 | + with pdfplumber.open(str(path)) as pdf: | |
| 61 | + for page_num, page in enumerate(pdf.pages): | |
| 62 | + text = page.extract_text() or "" | |
| 63 | + if text.strip(): | |
| 64 | + chunks.append( | |
| 65 | + DocumentChunk( | |
| 66 | + text=text, | |
| 67 | + source_file=str(path), | |
| 68 | + chunk_index=page_num, | |
| 69 | + page=page_num + 1, | |
| 70 | + metadata={"extraction_method": "pdfplumber"}, | |
| 71 | + ) | |
| 72 | + ) | |
| 73 | + return chunks | |
| 74 | + | |
| 75 | + | |
| 76 | +# Register processor | |
| 77 | +register_processor(PdfProcessor.supported_extensions, PdfProcessor) |
| --- a/video_processor/processors/pdf_processor.py | |
| +++ b/video_processor/processors/pdf_processor.py | |
| @@ -0,0 +1,77 @@ | |
| --- a/video_processor/processors/pdf_processor.py | |
| +++ b/video_processor/processors/pdf_processor.py | |
| @@ -0,0 +1,77 @@ | |
| 1 | """PDF document processor with graceful fallback between extraction libraries.""" |
| 2 | |
| 3 | from pathlib import Path |
| 4 | from typing import List |
| 5 | |
| 6 | from video_processor.processors.base import ( |
| 7 | DocumentChunk, |
| 8 | DocumentProcessor, |
| 9 | register_processor, |
| 10 | ) |
| 11 | |
| 12 | |
class PdfProcessor(DocumentProcessor):
    """Process PDF files using pymupdf or pdfplumber.

    The extraction libraries are imported lazily inside the helpers so the
    rest of the package works with neither installed.
    """

    supported_extensions = [".pdf"]

    def can_process(self, path: Path) -> bool:
        """Return True for files with a .pdf extension."""
        return path.suffix.lower() in self.supported_extensions

    def process(self, path: Path) -> List[DocumentChunk]:
        """Extract one chunk per non-empty page, trying pymupdf then pdfplumber.

        Raises:
            ImportError: If neither pymupdf nor pdfplumber is installed.
        """
        try:
            return self._process_pymupdf(path)
        except ImportError:
            pass  # pymupdf not installed — fall through to pdfplumber

        try:
            return self._process_pdfplumber(path)
        except ImportError:
            # Suppress the chained traceback: the actionable information is
            # the install hint, not the underlying ModuleNotFoundError.
            raise ImportError(
                "PDF processing requires pymupdf or pdfplumber. "
                "Install with: pip install 'planopticon[pdf]' OR pip install pdfplumber"
            ) from None

    def _process_pymupdf(self, path: Path) -> List[DocumentChunk]:
        """Extract text per page via pymupdf."""
        import pymupdf

        doc = pymupdf.open(str(path))
        chunks: List[DocumentChunk] = []
        try:
            for page_num, page in enumerate(doc):
                text = page.get_text()
                if text.strip():
                    chunks.append(
                        DocumentChunk(
                            text=text,
                            source_file=str(path),
                            chunk_index=page_num,
                            page=page_num + 1,  # 1-based for human-facing refs
                            metadata={"extraction_method": "pymupdf"},
                        )
                    )
        finally:
            # Always release the document handle, even if a page raises;
            # the original only closed on the success path.
            doc.close()
        return chunks

    def _process_pdfplumber(self, path: Path) -> List[DocumentChunk]:
        """Extract text per page via pdfplumber."""
        import pdfplumber

        chunks: List[DocumentChunk] = []
        with pdfplumber.open(str(path)) as pdf:
            for page_num, page in enumerate(pdf.pages):
                # extract_text() can return None for image-only pages.
                text = page.extract_text() or ""
                if text.strip():
                    chunks.append(
                        DocumentChunk(
                            text=text,
                            source_file=str(path),
                            chunk_index=page_num,
                            page=page_num + 1,
                            metadata={"extraction_method": "pdfplumber"},
                        )
                    )
        return chunks
| 75 | |
| 76 | # Register processor |
| 77 | register_processor(PdfProcessor.supported_extensions, PdfProcessor) |
| --- video_processor/providers/__init__.py | ||
| +++ video_processor/providers/__init__.py | ||
| @@ -1,6 +1,40 @@ | ||
| 1 | 1 | """Provider abstraction layer for LLM, vision, and transcription APIs.""" |
| 2 | 2 | |
| 3 | -from video_processor.providers.base import BaseProvider, ModelInfo | |
| 3 | +from video_processor.providers.base import ( | |
| 4 | + BaseProvider, | |
| 5 | + ModelInfo, | |
| 6 | + OpenAICompatibleProvider, | |
| 7 | + ProviderRegistry, | |
| 8 | +) | |
| 4 | 9 | from video_processor.providers.manager import ProviderManager |
| 5 | 10 | |
| 6 | -__all__ = ["BaseProvider", "ModelInfo", "ProviderManager"] | |
| 11 | +__all__ = [ | |
| 12 | + "BaseProvider", | |
| 13 | + "ModelInfo", | |
| 14 | + "OpenAICompatibleProvider", | |
| 15 | + "ProviderManager", | |
| 16 | + "ProviderRegistry", | |
| 17 | + # OpenAI-compatible providers (lazy-loaded via manager) | |
| 18 | + "AzureOpenAIProvider", | |
| 19 | + "CerebrasProvider", | |
| 20 | + "FireworksProvider", | |
| 21 | + "TogetherProvider", | |
| 22 | + "XAIProvider", | |
| 23 | +] | |
| 24 | + | |
| 25 | + | |
| 26 | +def __getattr__(name: str): | |
| 27 | + """Lazy import provider classes to avoid import-time side effects.""" | |
| 28 | + _lazy_imports = { | |
| 29 | + "AzureOpenAIProvider": "video_processor.providers.azure_provider", | |
| 30 | + "CerebrasProvider": "video_processor.providers.cerebras_provider", | |
| 31 | + "FireworksProvider": "video_processor.providers.fireworks_provider", | |
| 32 | + "TogetherProvider": "video_processor.providers.together_provider", | |
| 33 | + "XAIProvider": "video_processor.providers.xai_provider", | |
| 34 | + } | |
| 35 | + if name in _lazy_imports: | |
| 36 | + import importlib | |
| 37 | + | |
| 38 | + mod = importlib.import_module(_lazy_imports[name]) | |
| 39 | + return getattr(mod, name) | |
| 40 | + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") | |
| 7 | 41 | |
| 8 | 42 | ADDED video_processor/providers/ai21_provider.py |
| --- video_processor/providers/__init__.py | |
| +++ video_processor/providers/__init__.py | |
| @@ -1,6 +1,40 @@ | |
| 1 | """Provider abstraction layer for LLM, vision, and transcription APIs.""" |
| 2 | |
| 3 | from video_processor.providers.base import BaseProvider, ModelInfo |
| 4 | from video_processor.providers.manager import ProviderManager |
| 5 | |
| 6 | __all__ = ["BaseProvider", "ModelInfo", "ProviderManager"] |
| 7 | |
| 8 | ADDED video_processor/providers/ai21_provider.py
| --- video_processor/providers/__init__.py | |
| +++ video_processor/providers/__init__.py | |
| @@ -1,6 +1,40 @@ | |
| 1 | """Provider abstraction layer for LLM, vision, and transcription APIs.""" |
| 2 | |
| 3 | from video_processor.providers.base import ( |
| 4 | BaseProvider, |
| 5 | ModelInfo, |
| 6 | OpenAICompatibleProvider, |
| 7 | ProviderRegistry, |
| 8 | ) |
| 9 | from video_processor.providers.manager import ProviderManager |
| 10 | |
| 11 | __all__ = [ |
| 12 | "BaseProvider", |
| 13 | "ModelInfo", |
| 14 | "OpenAICompatibleProvider", |
| 15 | "ProviderManager", |
| 16 | "ProviderRegistry", |
| 17 | # OpenAI-compatible providers (lazy-loaded via manager) |
| 18 | "AzureOpenAIProvider", |
| 19 | "CerebrasProvider", |
| 20 | "FireworksProvider", |
| 21 | "TogetherProvider", |
| 22 | "XAIProvider", |
| 23 | ] |
| 24 | |
| 25 | |
| 26 | def __getattr__(name: str): |
| 27 | """Lazy import provider classes to avoid import-time side effects.""" |
| 28 | _lazy_imports = { |
| 29 | "AzureOpenAIProvider": "video_processor.providers.azure_provider", |
| 30 | "CerebrasProvider": "video_processor.providers.cerebras_provider", |
| 31 | "FireworksProvider": "video_processor.providers.fireworks_provider", |
| 32 | "TogetherProvider": "video_processor.providers.together_provider", |
| 33 | "XAIProvider": "video_processor.providers.xai_provider", |
| 34 | } |
| 35 | if name in _lazy_imports: |
| 36 | import importlib |
| 37 | |
| 38 | mod = importlib.import_module(_lazy_imports[name]) |
| 39 | return getattr(mod, name) |
| 40 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") |
| 41 | |
| 42 | ADDED video_processor/providers/ai21_provider.py
| --- a/video_processor/providers/ai21_provider.py | ||
| +++ b/video_processor/providers/ai21_provider.py | ||
| @@ -0,0 +1,98 @@ | ||
| 1 | +"""AI21 Labs provider implementation.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import os | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import Optional | |
| 7 | + | |
| 8 | +from dotenv import load_dotenv | |
| 9 | + | |
| 10 | +from video_processor.providers.base import ModelInfo, OpenAICompatibleProvider, ProviderRegistry | |
| 11 | + | |
| 12 | +load_dotenv() | |
| 13 | +logger = logging.getLogger(__name__) | |
| 14 | + | |
| 15 | +# Curated list of AI21 models | |
| 16 | +_AI21_MODELS = [ | |
| 17 | + ModelInfo( | |
| 18 | + id="jamba-1.5-large", | |
| 19 | + provider="ai21", | |
| 20 | + display_name="Jamba 1.5 Large", | |
| 21 | + capabilities=["chat"], | |
| 22 | + ), | |
| 23 | + ModelInfo( | |
| 24 | + id="jamba-1.5-mini", | |
| 25 | + provider="ai21", | |
| 26 | + display_name="Jamba 1.5 Mini", | |
| 27 | + capabilities=["chat"], | |
| 28 | + ), | |
| 29 | + ModelInfo( | |
| 30 | + id="jamba-instruct", | |
| 31 | + provider="ai21", | |
| 32 | + display_name="Jamba Instruct", | |
| 33 | + capabilities=["chat"], | |
| 34 | + ), | |
| 35 | +] | |
| 36 | + | |
| 37 | + | |
| 38 | +class AI21Provider(OpenAICompatibleProvider): | |
| 39 | + """AI21 Labs provider using OpenAI-compatible API.""" | |
| 40 | + | |
| 41 | + provider_name = "ai21" | |
| 42 | + base_url = "https://api.ai21.com/studio/v1" | |
| 43 | + env_var = "AI21_API_KEY" | |
| 44 | + | |
| 45 | + def __init__(self, api_key: Optional[str] = None): | |
| 46 | + api_key = api_key or os.getenv("AI21_API_KEY") | |
| 47 | + if not api_key: | |
| 48 | + raise ValueError("AI21_API_KEY not set") | |
| 49 | + super().__init__(api_key=api_key, base_url=self.base_url) | |
| 50 | + | |
| 51 | + def chat( | |
| 52 | + self, | |
| 53 | + messages: list[dict], | |
| 54 | + max_tokens: int = 4096, | |
| 55 | + temperature: float = 0.7, | |
| 56 | + model: Optional[str] = None, | |
| 57 | + ) -> str: | |
| 58 | + model = model or "jamba-1.5-large" | |
| 59 | + return super().chat(messages, max_tokens, temperature, model) | |
| 60 | + | |
| 61 | + def analyze_image( | |
| 62 | + self, | |
| 63 | + image_bytes: bytes, | |
| 64 | + prompt: str, | |
| 65 | + max_tokens: int = 4096, | |
| 66 | + model: Optional[str] = None, | |
| 67 | + ) -> str: | |
| 68 | + raise NotImplementedError( | |
| 69 | + "AI21 does not currently support vision/image analysis. " | |
| 70 | + "Use OpenAI, Anthropic, or Gemini for image analysis." | |
| 71 | + ) | |
| 72 | + | |
| 73 | + def transcribe_audio( | |
| 74 | + self, | |
| 75 | + audio_path: str | Path, | |
| 76 | + language: Optional[str] = None, | |
| 77 | + model: Optional[str] = None, | |
| 78 | + ) -> dict: | |
| 79 | + raise NotImplementedError( | |
| 80 | + "AI21 does not provide a transcription API. " | |
| 81 | + "Use OpenAI Whisper or Gemini for transcription." | |
| 82 | + ) | |
| 83 | + | |
| 84 | + def list_models(self) -> list[ModelInfo]: | |
| 85 | + return list(_AI21_MODELS) | |
| 86 | + | |
| 87 | + | |
| 88 | +ProviderRegistry.register( | |
| 89 | + name="ai21", | |
| 90 | + provider_class=AI21Provider, | |
| 91 | + env_var="AI21_API_KEY", | |
| 92 | + model_prefixes=["jamba-", "j2-"], | |
| 93 | + default_models={ | |
| 94 | + "chat": "jamba-1.5-large", | |
| 95 | + "vision": "", | |
| 96 | + "audio": "", | |
| 97 | + }, | |
| 98 | +) |
| --- a/video_processor/providers/ai21_provider.py | |
| +++ b/video_processor/providers/ai21_provider.py | |
| @@ -0,0 +1,98 @@ | |
| --- a/video_processor/providers/ai21_provider.py | |
| +++ b/video_processor/providers/ai21_provider.py | |
| @@ -0,0 +1,98 @@ | |
| 1 | """AI21 Labs provider implementation.""" |
| 2 | |
| 3 | import logging |
| 4 | import os |
| 5 | from pathlib import Path |
| 6 | from typing import Optional |
| 7 | |
| 8 | from dotenv import load_dotenv |
| 9 | |
| 10 | from video_processor.providers.base import ModelInfo, OpenAICompatibleProvider, ProviderRegistry |
| 11 | |
| 12 | load_dotenv() |
| 13 | logger = logging.getLogger(__name__) |
| 14 | |
| 15 | # Curated list of AI21 models |
| 16 | _AI21_MODELS = [ |
| 17 | ModelInfo( |
| 18 | id="jamba-1.5-large", |
| 19 | provider="ai21", |
| 20 | display_name="Jamba 1.5 Large", |
| 21 | capabilities=["chat"], |
| 22 | ), |
| 23 | ModelInfo( |
| 24 | id="jamba-1.5-mini", |
| 25 | provider="ai21", |
| 26 | display_name="Jamba 1.5 Mini", |
| 27 | capabilities=["chat"], |
| 28 | ), |
| 29 | ModelInfo( |
| 30 | id="jamba-instruct", |
| 31 | provider="ai21", |
| 32 | display_name="Jamba Instruct", |
| 33 | capabilities=["chat"], |
| 34 | ), |
| 35 | ] |
| 36 | |
| 37 | |
class AI21Provider(OpenAICompatibleProvider):
    """AI21 Labs provider speaking the OpenAI-compatible Studio API.

    Chat is supported via the Jamba models; AI21 offers no vision or
    transcription endpoints, so those methods raise NotImplementedError.
    """

    provider_name = "ai21"
    base_url = "https://api.ai21.com/studio/v1"
    env_var = "AI21_API_KEY"

    def __init__(self, api_key: Optional[str] = None):
        """Initialise with *api_key*, falling back to the AI21_API_KEY env var.

        Raises:
            ValueError: when no API key can be found.
        """
        resolved = api_key or os.getenv("AI21_API_KEY")
        if not resolved:
            raise ValueError("AI21_API_KEY not set")
        super().__init__(api_key=resolved, base_url=self.base_url)

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Run a chat completion; defaults to the jamba-1.5-large model."""
        return super().chat(messages, max_tokens, temperature, model or "jamba-1.5-large")

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
        """Unsupported: AI21 has no vision endpoint."""
        raise NotImplementedError(
            "AI21 does not currently support vision/image analysis. "
            "Use OpenAI, Anthropic, or Gemini for image analysis."
        )

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Unsupported: AI21 has no transcription endpoint."""
        raise NotImplementedError(
            "AI21 does not provide a transcription API. "
            "Use OpenAI Whisper or Gemini for transcription."
        )

    def list_models(self) -> list[ModelInfo]:
        """Return a copy of the curated Jamba model list (no API discovery)."""
        return _AI21_MODELS.copy()


ProviderRegistry.register(
    name="ai21",
    provider_class=AI21Provider,
    env_var="AI21_API_KEY",
    model_prefixes=["jamba-", "j2-"],
    default_models={
        "chat": "jamba-1.5-large",
        "vision": "",
        "audio": "",
    },
)
| --- video_processor/providers/anthropic_provider.py | ||
| +++ video_processor/providers/anthropic_provider.py | ||
| @@ -7,11 +7,11 @@ | ||
| 7 | 7 | from typing import Optional |
| 8 | 8 | |
| 9 | 9 | import anthropic |
| 10 | 10 | from dotenv import load_dotenv |
| 11 | 11 | |
| 12 | -from video_processor.providers.base import BaseProvider, ModelInfo | |
| 12 | +from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry | |
| 13 | 13 | |
| 14 | 14 | load_dotenv() |
| 15 | 15 | logger = logging.getLogger(__name__) |
| 16 | 16 | |
| 17 | 17 | |
| @@ -31,17 +31,31 @@ | ||
| 31 | 31 | messages: list[dict], |
| 32 | 32 | max_tokens: int = 4096, |
| 33 | 33 | temperature: float = 0.7, |
| 34 | 34 | model: Optional[str] = None, |
| 35 | 35 | ) -> str: |
| 36 | - model = model or "claude-sonnet-4-5-20250929" | |
| 37 | - response = self.client.messages.create( | |
| 38 | - model=model, | |
| 39 | - messages=messages, | |
| 40 | - max_tokens=max_tokens, | |
| 41 | - temperature=temperature, | |
| 42 | - ) | |
| 36 | + model = model or "claude-haiku-4-5-20251001" | |
| 37 | + | |
| 38 | + # Anthropic requires system messages as a top-level parameter | |
| 39 | + system_parts = [] | |
| 40 | + chat_messages = [] | |
| 41 | + for msg in messages: | |
| 42 | + if msg.get("role") == "system": | |
| 43 | + system_parts.append(msg["content"]) | |
| 44 | + else: | |
| 45 | + chat_messages.append(msg) | |
| 46 | + | |
| 47 | + kwargs = { | |
| 48 | + "model": model, | |
| 49 | + "messages": chat_messages, | |
| 50 | + "max_tokens": max_tokens, | |
| 51 | + "temperature": temperature, | |
| 52 | + } | |
| 53 | + if system_parts: | |
| 54 | + kwargs["system"] = "\n\n".join(system_parts) | |
| 55 | + | |
| 56 | + response = self.client.messages.create(**kwargs) | |
| 43 | 57 | self._last_usage = { |
| 44 | 58 | "input_tokens": getattr(response.usage, "input_tokens", 0), |
| 45 | 59 | "output_tokens": getattr(response.usage, "output_tokens", 0), |
| 46 | 60 | } |
| 47 | 61 | return response.content[0].text |
| @@ -51,11 +65,11 @@ | ||
| 51 | 65 | image_bytes: bytes, |
| 52 | 66 | prompt: str, |
| 53 | 67 | max_tokens: int = 4096, |
| 54 | 68 | model: Optional[str] = None, |
| 55 | 69 | ) -> str: |
| 56 | - model = model or "claude-sonnet-4-5-20250929" | |
| 70 | + model = model or "claude-haiku-4-5-20251001" | |
| 57 | 71 | b64 = base64.b64encode(image_bytes).decode() |
| 58 | 72 | response = self.client.messages.create( |
| 59 | 73 | model=model, |
| 60 | 74 | messages=[ |
| 61 | 75 | { |
| @@ -108,5 +122,18 @@ | ||
| 108 | 122 | ) |
| 109 | 123 | ) |
| 110 | 124 | except Exception as e: |
| 111 | 125 | logger.warning(f"Failed to list Anthropic models: {e}") |
| 112 | 126 | return sorted(models, key=lambda m: m.id) |
| 127 | + | |
| 128 | + | |
| 129 | +ProviderRegistry.register( | |
| 130 | + name="anthropic", | |
| 131 | + provider_class=AnthropicProvider, | |
| 132 | + env_var="ANTHROPIC_API_KEY", | |
| 133 | + model_prefixes=["claude-"], | |
| 134 | + default_models={ | |
| 135 | + "chat": "claude-haiku-4-5-20251001", | |
| 136 | + "vision": "claude-haiku-4-5-20251001", | |
| 137 | + "audio": "", | |
| 138 | + }, | |
| 139 | +) | |
| 113 | 140 | |
| 114 | 141 | ADDED video_processor/providers/azure_provider.py |
| --- video_processor/providers/anthropic_provider.py | |
| +++ video_processor/providers/anthropic_provider.py | |
| @@ -7,11 +7,11 @@ | |
| 7 | from typing import Optional |
| 8 | |
| 9 | import anthropic |
| 10 | from dotenv import load_dotenv |
| 11 | |
| 12 | from video_processor.providers.base import BaseProvider, ModelInfo |
| 13 | |
| 14 | load_dotenv() |
| 15 | logger = logging.getLogger(__name__) |
| 16 | |
| 17 | |
| @@ -31,17 +31,31 @@ | |
| 31 | messages: list[dict], |
| 32 | max_tokens: int = 4096, |
| 33 | temperature: float = 0.7, |
| 34 | model: Optional[str] = None, |
| 35 | ) -> str: |
| 36 | model = model or "claude-sonnet-4-5-20250929" |
| 37 | response = self.client.messages.create( |
| 38 | model=model, |
| 39 | messages=messages, |
| 40 | max_tokens=max_tokens, |
| 41 | temperature=temperature, |
| 42 | ) |
| 43 | self._last_usage = { |
| 44 | "input_tokens": getattr(response.usage, "input_tokens", 0), |
| 45 | "output_tokens": getattr(response.usage, "output_tokens", 0), |
| 46 | } |
| 47 | return response.content[0].text |
| @@ -51,11 +65,11 @@ | |
| 51 | image_bytes: bytes, |
| 52 | prompt: str, |
| 53 | max_tokens: int = 4096, |
| 54 | model: Optional[str] = None, |
| 55 | ) -> str: |
| 56 | model = model or "claude-sonnet-4-5-20250929" |
| 57 | b64 = base64.b64encode(image_bytes).decode() |
| 58 | response = self.client.messages.create( |
| 59 | model=model, |
| 60 | messages=[ |
| 61 | { |
| @@ -108,5 +122,18 @@ | |
| 108 | ) |
| 109 | ) |
| 110 | except Exception as e: |
| 111 | logger.warning(f"Failed to list Anthropic models: {e}") |
| 112 | return sorted(models, key=lambda m: m.id) |
| 113 | |
| 114 | ADDED video_processor/providers/azure_provider.py
| --- video_processor/providers/anthropic_provider.py | |
| +++ video_processor/providers/anthropic_provider.py | |
| @@ -7,11 +7,11 @@ | |
| 7 | from typing import Optional |
| 8 | |
| 9 | import anthropic |
| 10 | from dotenv import load_dotenv |
| 11 | |
| 12 | from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry |
| 13 | |
| 14 | load_dotenv() |
| 15 | logger = logging.getLogger(__name__) |
| 16 | |
| 17 | |
| @@ -31,17 +31,31 @@ | |
| 31 | messages: list[dict], |
| 32 | max_tokens: int = 4096, |
| 33 | temperature: float = 0.7, |
| 34 | model: Optional[str] = None, |
| 35 | ) -> str: |
| 36 | model = model or "claude-haiku-4-5-20251001" |
| 37 | |
| 38 | # Anthropic requires system messages as a top-level parameter |
| 39 | system_parts = [] |
| 40 | chat_messages = [] |
| 41 | for msg in messages: |
| 42 | if msg.get("role") == "system": |
| 43 | system_parts.append(msg["content"]) |
| 44 | else: |
| 45 | chat_messages.append(msg) |
| 46 | |
| 47 | kwargs = { |
| 48 | "model": model, |
| 49 | "messages": chat_messages, |
| 50 | "max_tokens": max_tokens, |
| 51 | "temperature": temperature, |
| 52 | } |
| 53 | if system_parts: |
| 54 | kwargs["system"] = "\n\n".join(system_parts) |
| 55 | |
| 56 | response = self.client.messages.create(**kwargs) |
| 57 | self._last_usage = { |
| 58 | "input_tokens": getattr(response.usage, "input_tokens", 0), |
| 59 | "output_tokens": getattr(response.usage, "output_tokens", 0), |
| 60 | } |
| 61 | return response.content[0].text |
| @@ -51,11 +65,11 @@ | |
| 65 | image_bytes: bytes, |
| 66 | prompt: str, |
| 67 | max_tokens: int = 4096, |
| 68 | model: Optional[str] = None, |
| 69 | ) -> str: |
| 70 | model = model or "claude-haiku-4-5-20251001" |
| 71 | b64 = base64.b64encode(image_bytes).decode() |
| 72 | response = self.client.messages.create( |
| 73 | model=model, |
| 74 | messages=[ |
| 75 | { |
| @@ -108,5 +122,18 @@ | |
| 122 | ) |
| 123 | ) |
| 124 | except Exception as e: |
| 125 | logger.warning(f"Failed to list Anthropic models: {e}") |
| 126 | return sorted(models, key=lambda m: m.id) |
| 127 | |
| 128 | |
| 129 | ProviderRegistry.register( |
| 130 | name="anthropic", |
| 131 | provider_class=AnthropicProvider, |
| 132 | env_var="ANTHROPIC_API_KEY", |
| 133 | model_prefixes=["claude-"], |
| 134 | default_models={ |
| 135 | "chat": "claude-haiku-4-5-20251001", |
| 136 | "vision": "claude-haiku-4-5-20251001", |
| 137 | "audio": "", |
| 138 | }, |
| 139 | ) |
| 140 | |
| 141 | ADDED video_processor/providers/azure_provider.py
| --- a/video_processor/providers/azure_provider.py | ||
| +++ b/video_processor/providers/azure_provider.py | ||
| @@ -0,0 +1,38 @@ | ||
| 1 | +"""Azure OpenAI provider implementation.""" | |
| 2 | + | |
| 3 | +import os | |
| 4 | + | |
| 5 | +from video_processor.providers.base import OpenAICompatibleProvider, ProviderRegistry | |
| 6 | + | |
| 7 | + | |
| 8 | +class AzureOpenAIProvider(OpenAICompatibleProvider): | |
| 9 | + """Azure OpenAI API provider. | |
| 10 | + | |
| 11 | + Uses the AzureOpenAI client which requires an endpoint and API version | |
| 12 | + in addition to the API key. | |
| 13 | + """ | |
| 14 | + | |
| 15 | + provider_name = "azure" | |
| 16 | + env_var = "AZURE_OPENAI_API_KEY" | |
| 17 | + | |
| 18 | + def __init__(self, api_key=None, endpoint=None, api_version=None): | |
| 19 | + from openai import AzureOpenAI | |
| 20 | + | |
| 21 | + self._api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY", "") | |
| 22 | + endpoint = endpoint or os.getenv("AZURE_OPENAI_ENDPOINT", "") | |
| 23 | + api_version = api_version or os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-15-preview") | |
| 24 | + self._client = AzureOpenAI( | |
| 25 | + api_key=self._api_key, | |
| 26 | + azure_endpoint=endpoint, | |
| 27 | + api_version=api_version, | |
| 28 | + ) | |
| 29 | + self._last_usage = None | |
| 30 | + | |
| 31 | + | |
| 32 | +ProviderRegistry.register( | |
| 33 | + name="azure", | |
| 34 | + provider_class=AzureOpenAIProvider, | |
| 35 | + env_var="AZURE_OPENAI_API_KEY", | |
| 36 | + model_prefixes=[], # Azure uses deployment names, not standard prefixes | |
| 37 | + default_models={"chat": "", "vision": "", "audio": ""}, | |
| 38 | +) |
| --- a/video_processor/providers/azure_provider.py | |
| +++ b/video_processor/providers/azure_provider.py | |
| @@ -0,0 +1,38 @@ | |
| --- a/video_processor/providers/azure_provider.py | |
| +++ b/video_processor/providers/azure_provider.py | |
| @@ -0,0 +1,38 @@ | |
| 1 | """Azure OpenAI provider implementation.""" |
| 2 | |
| 3 | import os |
| 4 | |
| 5 | from video_processor.providers.base import OpenAICompatibleProvider, ProviderRegistry |
| 6 | |
| 7 | |
class AzureOpenAIProvider(OpenAICompatibleProvider):
    """Azure OpenAI API provider.

    Uses the AzureOpenAI client, which requires an endpoint and API version
    in addition to the API key. Model names passed to inherited methods are
    Azure *deployment* names rather than public model IDs.
    """

    provider_name = "azure"
    env_var = "AZURE_OPENAI_API_KEY"

    def __init__(self, api_key=None, endpoint=None, api_version=None):
        """Build an AzureOpenAI client from arguments or environment variables.

        Args:
            api_key: Azure OpenAI key; falls back to AZURE_OPENAI_API_KEY.
            endpoint: resource endpoint URL; falls back to AZURE_OPENAI_ENDPOINT.
            api_version: REST API version; falls back to AZURE_OPENAI_API_VERSION,
                then to "2024-02-15-preview".
        """
        from openai import AzureOpenAI

        self._api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY", "")
        endpoint = endpoint or os.getenv("AZURE_OPENAI_ENDPOINT", "")
        api_version = api_version or os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-15-preview")
        # Mirror OpenAICompatibleProvider.__init__, which always sets _base_url;
        # without this, code reading provider._base_url breaks on Azure instances.
        self._base_url = endpoint
        self._client = AzureOpenAI(
            api_key=self._api_key,
            azure_endpoint=endpoint,
            api_version=api_version,
        )
        self._last_usage = None


ProviderRegistry.register(
    name="azure",
    provider_class=AzureOpenAIProvider,
    env_var="AZURE_OPENAI_API_KEY",
    model_prefixes=[],  # Azure uses deployment names, not standard prefixes
    default_models={"chat": "", "vision": "", "audio": ""},
)
+171
-2
| --- video_processor/providers/base.py | ||
| +++ video_processor/providers/base.py | ||
| @@ -1,12 +1,17 @@ | ||
| 1 | -"""Abstract base class and shared types for provider implementations.""" | |
| 1 | +"""Abstract base class, registry, and shared types for provider implementations.""" | |
| 2 | 2 | |
| 3 | +import base64 | |
| 4 | +import logging | |
| 5 | +import os | |
| 3 | 6 | from abc import ABC, abstractmethod |
| 4 | 7 | from pathlib import Path |
| 5 | -from typing import List, Optional | |
| 8 | +from typing import Dict, List, Optional | |
| 6 | 9 | |
| 7 | 10 | from pydantic import BaseModel, Field |
| 11 | + | |
| 12 | +logger = logging.getLogger(__name__) | |
| 8 | 13 | |
| 9 | 14 | |
| 10 | 15 | class ModelInfo(BaseModel): |
| 11 | 16 | """Information about an available model.""" |
| 12 | 17 | |
| @@ -53,5 +58,169 @@ | ||
| 53 | 58 | """Transcribe an audio file. Returns dict with 'text', 'segments', etc.""" |
| 54 | 59 | |
| 55 | 60 | @abstractmethod |
| 56 | 61 | def list_models(self) -> list[ModelInfo]: |
| 57 | 62 | """Discover available models from this provider's API.""" |
| 63 | + | |
| 64 | + | |
| 65 | +class ProviderRegistry: | |
| 66 | + """Registry for provider classes. Providers register themselves with metadata.""" | |
| 67 | + | |
| 68 | + _providers: Dict[str, Dict] = {} | |
| 69 | + | |
| 70 | + @classmethod | |
| 71 | + def register( | |
| 72 | + cls, | |
| 73 | + name: str, | |
| 74 | + provider_class: type, | |
| 75 | + env_var: str = "", | |
| 76 | + model_prefixes: Optional[List[str]] = None, | |
| 77 | + default_models: Optional[Dict[str, str]] = None, | |
| 78 | + ) -> None: | |
| 79 | + """Register a provider class with its metadata.""" | |
| 80 | + cls._providers[name] = { | |
| 81 | + "class": provider_class, | |
| 82 | + "env_var": env_var, | |
| 83 | + "model_prefixes": model_prefixes or [], | |
| 84 | + "default_models": default_models or {}, | |
| 85 | + } | |
| 86 | + | |
| 87 | + @classmethod | |
| 88 | + def get(cls, name: str) -> type: | |
| 89 | + """Return the provider class for a given name.""" | |
| 90 | + if name not in cls._providers: | |
| 91 | + raise ValueError(f"Unknown provider: {name}") | |
| 92 | + return cls._providers[name]["class"] | |
| 93 | + | |
| 94 | + @classmethod | |
| 95 | + def get_by_model(cls, model_id: str) -> Optional[str]: | |
| 96 | + """Return provider name for a model ID based on prefix matching.""" | |
| 97 | + for name, info in cls._providers.items(): | |
| 98 | + for prefix in info["model_prefixes"]: | |
| 99 | + if model_id.startswith(prefix): | |
| 100 | + return name | |
| 101 | + return None | |
| 102 | + | |
| 103 | + @classmethod | |
| 104 | + def get_default_models(cls, name: str) -> Dict[str, str]: | |
| 105 | + """Return the default models dict for a provider.""" | |
| 106 | + if name not in cls._providers: | |
| 107 | + return {} | |
| 108 | + return cls._providers[name].get("default_models", {}) | |
| 109 | + | |
| 110 | + @classmethod | |
| 111 | + def available(cls) -> List[str]: | |
| 112 | + """Return names of providers whose env var is set (or have no env var requirement).""" | |
| 113 | + result = [] | |
| 114 | + for name, info in cls._providers.items(): | |
| 115 | + env_var = info.get("env_var", "") | |
| 116 | + if not env_var: | |
| 117 | + # Providers without an env var (e.g. ollama) need special availability checks | |
| 118 | + result.append(name) | |
| 119 | + elif os.getenv(env_var, ""): | |
| 120 | + result.append(name) | |
| 121 | + return result | |
| 122 | + | |
| 123 | + @classmethod | |
| 124 | + def all_registered(cls) -> Dict[str, Dict]: | |
| 125 | + """Return all registered providers and their metadata.""" | |
| 126 | + return dict(cls._providers) | |
| 127 | + | |
| 128 | + | |
| 129 | +class OpenAICompatibleProvider(BaseProvider): | |
| 130 | + """Base for providers using OpenAI-compatible APIs. | |
| 131 | + | |
| 132 | + Suitable for Together, Fireworks, Cerebras, xAI, Azure, and similar services. | |
| 133 | + """ | |
| 134 | + | |
| 135 | + provider_name: str = "" | |
| 136 | + base_url: str = "" | |
| 137 | + env_var: str = "" | |
| 138 | + | |
| 139 | + def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None): | |
| 140 | + from openai import OpenAI | |
| 141 | + | |
| 142 | + self._api_key = api_key or os.getenv(self.env_var, "") | |
| 143 | + self._base_url = base_url or self.base_url | |
| 144 | + self._client = OpenAI(api_key=self._api_key, base_url=self._base_url) | |
| 145 | + self._last_usage = None | |
| 146 | + | |
| 147 | + def chat( | |
| 148 | + self, | |
| 149 | + messages: list[dict], | |
| 150 | + max_tokens: int = 4096, | |
| 151 | + temperature: float = 0.7, | |
| 152 | + model: Optional[str] = None, | |
| 153 | + ) -> str: | |
| 154 | + model = model or "gpt-4o" | |
| 155 | + response = self._client.chat.completions.create( | |
| 156 | + model=model, | |
| 157 | + messages=messages, | |
| 158 | + max_tokens=max_tokens, | |
| 159 | + temperature=temperature, | |
| 160 | + ) | |
| 161 | + self._last_usage = { | |
| 162 | + "input_tokens": getattr(response.usage, "prompt_tokens", 0) if response.usage else 0, | |
| 163 | + "output_tokens": getattr(response.usage, "completion_tokens", 0) | |
| 164 | + if response.usage | |
| 165 | + else 0, | |
| 166 | + } | |
| 167 | + return response.choices[0].message.content or "" | |
| 168 | + | |
| 169 | + def analyze_image( | |
| 170 | + self, | |
| 171 | + image_bytes: bytes, | |
| 172 | + prompt: str, | |
| 173 | + max_tokens: int = 4096, | |
| 174 | + model: Optional[str] = None, | |
| 175 | + ) -> str: | |
| 176 | + model = model or "gpt-4o" | |
| 177 | + b64 = base64.b64encode(image_bytes).decode() | |
| 178 | + response = self._client.chat.completions.create( | |
| 179 | + model=model, | |
| 180 | + messages=[ | |
| 181 | + { | |
| 182 | + "role": "user", | |
| 183 | + "content": [ | |
| 184 | + {"type": "text", "text": prompt}, | |
| 185 | + { | |
| 186 | + "type": "image_url", | |
| 187 | + "image_url": {"url": f"data:image/jpeg;base64,{b64}"}, | |
| 188 | + }, | |
| 189 | + ], | |
| 190 | + } | |
| 191 | + ], | |
| 192 | + max_tokens=max_tokens, | |
| 193 | + ) | |
| 194 | + self._last_usage = { | |
| 195 | + "input_tokens": getattr(response.usage, "prompt_tokens", 0) if response.usage else 0, | |
| 196 | + "output_tokens": getattr(response.usage, "completion_tokens", 0) | |
| 197 | + if response.usage | |
| 198 | + else 0, | |
| 199 | + } | |
| 200 | + return response.choices[0].message.content or "" | |
| 201 | + | |
| 202 | + def transcribe_audio( | |
| 203 | + self, | |
| 204 | + audio_path: str | Path, | |
| 205 | + language: Optional[str] = None, | |
| 206 | + model: Optional[str] = None, | |
| 207 | + ) -> dict: | |
| 208 | + raise NotImplementedError(f"{self.provider_name} does not support audio transcription") | |
| 209 | + | |
| 210 | + def list_models(self) -> list[ModelInfo]: | |
| 211 | + models = [] | |
| 212 | + try: | |
| 213 | + for m in self._client.models.list(): | |
| 214 | + mid = m.id | |
| 215 | + caps = ["chat"] | |
| 216 | + models.append( | |
| 217 | + ModelInfo( | |
| 218 | + id=mid, | |
| 219 | + provider=self.provider_name, | |
| 220 | + display_name=mid, | |
| 221 | + capabilities=caps, | |
| 222 | + ) | |
| 223 | + ) | |
| 224 | + except Exception as e: | |
| 225 | + logger.warning(f"Failed to list {self.provider_name} models: {e}") | |
| 226 | + return sorted(models, key=lambda m: m.id) | |
| 58 | 227 | |
| 59 | 228 | ADDED video_processor/providers/bedrock_provider.py |
| 60 | 229 | ADDED video_processor/providers/cerebras_provider.py |
| 61 | 230 | ADDED video_processor/providers/cohere_provider.py |
| --- video_processor/providers/base.py | |
| +++ video_processor/providers/base.py | |
| @@ -1,12 +1,17 @@ | |
| 1 | """Abstract base class and shared types for provider implementations.""" |
| 2 | |
| 3 | from abc import ABC, abstractmethod |
| 4 | from pathlib import Path |
| 5 | from typing import List, Optional |
| 6 | |
| 7 | from pydantic import BaseModel, Field |
| 8 | |
| 9 | |
| 10 | class ModelInfo(BaseModel): |
| 11 | """Information about an available model.""" |
| 12 | |
| @@ -53,5 +58,169 @@ | |
| 53 | """Transcribe an audio file. Returns dict with 'text', 'segments', etc.""" |
| 54 | |
| 55 | @abstractmethod |
| 56 | def list_models(self) -> list[ModelInfo]: |
| 57 | """Discover available models from this provider's API.""" |
| 58 | |
| 59 | ADDED video_processor/providers/bedrock_provider.py
| 60 | ADDED video_processor/providers/cerebras_provider.py
| 61 | ADDED video_processor/providers/cohere_provider.py
| --- video_processor/providers/base.py | |
| +++ video_processor/providers/base.py | |
| @@ -1,12 +1,17 @@ | |
| 1 | """Abstract base class, registry, and shared types for provider implementations.""" |
| 2 | |
| 3 | import base64 |
| 4 | import logging |
| 5 | import os |
| 6 | from abc import ABC, abstractmethod |
| 7 | from pathlib import Path |
| 8 | from typing import Dict, List, Optional |
| 9 | |
| 10 | from pydantic import BaseModel, Field |
| 11 | |
| 12 | logger = logging.getLogger(__name__) |
| 13 | |
| 14 | |
| 15 | class ModelInfo(BaseModel): |
| 16 | """Information about an available model.""" |
| 17 | |
| @@ -53,5 +58,169 @@ | |
| 58 | """Transcribe an audio file. Returns dict with 'text', 'segments', etc.""" |
| 59 | |
| 60 | @abstractmethod |
| 61 | def list_models(self) -> list[ModelInfo]: |
| 62 | """Discover available models from this provider's API.""" |
| 63 | |
| 64 | |
class ProviderRegistry:
    """Registry for provider classes. Providers register themselves with metadata."""

    _providers: Dict[str, Dict] = {}

    @classmethod
    def register(
        cls,
        name: str,
        provider_class: type,
        env_var: str = "",
        model_prefixes: Optional[List[str]] = None,
        default_models: Optional[Dict[str, str]] = None,
    ) -> None:
        """Record a provider class together with its lookup metadata."""
        cls._providers[name] = {
            "class": provider_class,
            "env_var": env_var,
            "model_prefixes": model_prefixes or [],
            "default_models": default_models or {},
        }

    @classmethod
    def get(cls, name: str) -> type:
        """Return the provider class registered under *name*.

        Raises:
            ValueError: If no provider with that name is registered.
        """
        try:
            return cls._providers[name]["class"]
        except KeyError:
            raise ValueError(f"Unknown provider: {name}") from None

    @classmethod
    def get_by_model(cls, model_id: str) -> Optional[str]:
        """Resolve a model ID to a provider name via prefix matching (or None)."""
        return next(
            (
                provider_name
                for provider_name, meta in cls._providers.items()
                if any(model_id.startswith(prefix) for prefix in meta["model_prefixes"])
            ),
            None,
        )

    @classmethod
    def get_default_models(cls, name: str) -> Dict[str, str]:
        """Return the provider's default-models mapping; {} if unregistered."""
        meta = cls._providers.get(name)
        return meta.get("default_models", {}) if meta else {}

    @classmethod
    def available(cls) -> List[str]:
        """Names of providers whose env var is set, or that declare no env var.

        Providers without an env var (e.g. ollama) are always listed here and
        need their own availability checks elsewhere.
        """
        names: List[str] = []
        for provider_name, meta in cls._providers.items():
            required = meta.get("env_var", "")
            if not required or os.getenv(required, ""):
                names.append(provider_name)
        return names

    @classmethod
    def all_registered(cls) -> Dict[str, Dict]:
        """Return a shallow copy of the full name -> metadata mapping."""
        return cls._providers.copy()
| 127 | |
| 128 | |
class OpenAICompatibleProvider(BaseProvider):
    """Base for providers using OpenAI-compatible APIs.

    Suitable for Together, Fireworks, Cerebras, xAI, Azure, and similar services.
    Subclasses set ``provider_name``, ``base_url``, and ``env_var``; services
    that do not host OpenAI's models should also override ``default_model``.
    """

    provider_name: str = ""
    base_url: str = ""
    env_var: str = ""
    # Fallback model when callers pass none. "gpt-4o" preserves the historical
    # default, but most non-OpenAI services do not serve it — subclasses should
    # override this with a model their endpoint actually hosts.
    default_model: str = "gpt-4o"

    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None):
        """Create an OpenAI SDK client pointed at this provider's endpoint.

        Args:
            api_key: Explicit key; falls back to the ``env_var`` environment variable.
            base_url: Explicit endpoint; falls back to the class ``base_url``.
        """
        from openai import OpenAI

        self._api_key = api_key or os.getenv(self.env_var, "")
        self._base_url = base_url or self.base_url
        self._client = OpenAI(api_key=self._api_key, base_url=self._base_url)
        self._last_usage = None

    def _record_usage(self, response) -> None:
        """Stash prompt/completion token counts from a chat-completions response.

        Shared by chat() and analyze_image() so the bookkeeping exists in one place.
        """
        usage = response.usage
        self._last_usage = {
            "input_tokens": getattr(usage, "prompt_tokens", 0) if usage else 0,
            "output_tokens": getattr(usage, "completion_tokens", 0) if usage else 0,
        }

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Send a chat-completion request and return the assistant's text ("" if empty)."""
        model = model or self.default_model
        response = self._client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        self._record_usage(response)
        return response.choices[0].message.content or ""

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
        """Describe an image (sent inline as a base64 JPEG data URL) per *prompt*."""
        model = model or self.default_model
        b64 = base64.b64encode(image_bytes).decode()
        response = self._client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                        },
                    ],
                }
            ],
            max_tokens=max_tokens,
        )
        self._record_usage(response)
        return response.choices[0].message.content or ""

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Unsupported for generic OpenAI-compatible endpoints.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(f"{self.provider_name} does not support audio transcription")

    def list_models(self) -> list[ModelInfo]:
        """Discover models from the service's /models endpoint, sorted by ID.

        The listing carries no capability information, so every model is tagged
        "chat". On API failure a warning is logged and whatever models were
        gathered before the failure are still returned.
        """
        models: list[ModelInfo] = []
        try:
            for m in self._client.models.list():
                models.append(
                    ModelInfo(
                        id=m.id,
                        provider=self.provider_name,
                        display_name=m.id,
                        capabilities=["chat"],
                    )
                )
        except Exception as e:
            logger.warning(f"Failed to list {self.provider_name} models: {e}")
        return sorted(models, key=lambda m: m.id)
| 227 | |
| 228 | ADDED video_processor/providers/bedrock_provider.py |
| 229 | ADDED video_processor/providers/cerebras_provider.py |
| 230 | ADDED video_processor/providers/cohere_provider.py |
| --- a/video_processor/providers/bedrock_provider.py | ||
| +++ b/video_processor/providers/bedrock_provider.py | ||
| @@ -0,0 +1,193 @@ | ||
| 1 | +"""AWS Bedrock provider implementation.""" | |
| 2 | + | |
| 3 | +import base64 | |
| 4 | +import json | |
| 5 | +import logging | |
| 6 | +import os | |
| 7 | +from pathlib import Path | |
| 8 | +from typing import Optional | |
| 9 | + | |
| 10 | +from dotenv import load_dotenv | |
| 11 | + | |
| 12 | +from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry | |
| 13 | + | |
| 14 | +load_dotenv() | |
| 15 | +logger = logging.getLogger(__name__) | |
| 16 | + | |
| 17 | +# Curated list of popular Bedrock models | |
| 18 | +_BEDROCK_MODELS = [ | |
| 19 | + ModelInfo( | |
| 20 | + id="anthropic.claude-3-5-sonnet-20241022-v2:0", | |
| 21 | + provider="bedrock", | |
| 22 | + display_name="Claude 3.5 Sonnet v2", | |
| 23 | + capabilities=["chat", "vision"], | |
| 24 | + ), | |
| 25 | + ModelInfo( | |
| 26 | + id="anthropic.claude-3-sonnet-20240229-v1:0", | |
| 27 | + provider="bedrock", | |
| 28 | + display_name="Claude 3 Sonnet", | |
| 29 | + capabilities=["chat", "vision"], | |
| 30 | + ), | |
| 31 | + ModelInfo( | |
| 32 | + id="anthropic.claude-3-haiku-20240307-v1:0", | |
| 33 | + provider="bedrock", | |
| 34 | + display_name="Claude 3 Haiku", | |
| 35 | + capabilities=["chat", "vision"], | |
| 36 | + ), | |
| 37 | + ModelInfo( | |
| 38 | + id="amazon.titan-text-express-v1", | |
| 39 | + provider="bedrock", | |
| 40 | + display_name="Amazon Titan Text Express", | |
| 41 | + capabilities=["chat"], | |
| 42 | + ), | |
| 43 | + ModelInfo( | |
| 44 | + id="meta.llama3-70b-instruct-v1:0", | |
| 45 | + provider="bedrock", | |
| 46 | + display_name="Llama 3 70B Instruct", | |
| 47 | + capabilities=["chat"], | |
| 48 | + ), | |
| 49 | + ModelInfo( | |
| 50 | + id="mistral.mistral-large-2402-v1:0", | |
| 51 | + provider="bedrock", | |
| 52 | + display_name="Mistral Large", | |
| 53 | + capabilities=["chat"], | |
| 54 | + ), | |
| 55 | +] | |
| 56 | + | |
| 57 | + | |
| 58 | +class BedrockProvider(BaseProvider): | |
| 59 | + """AWS Bedrock provider using boto3.""" | |
| 60 | + | |
| 61 | + provider_name = "bedrock" | |
| 62 | + | |
| 63 | + def __init__( | |
| 64 | + self, | |
| 65 | + aws_access_key_id: Optional[str] = None, | |
| 66 | + aws_secret_access_key: Optional[str] = None, | |
| 67 | + region_name: Optional[str] = None, | |
| 68 | + ): | |
| 69 | + try: | |
| 70 | + import boto3 | |
| 71 | + except ImportError: | |
| 72 | + raise ImportError("boto3 package not installed. Install with: pip install boto3") | |
| 73 | + | |
| 74 | + self._boto3 = boto3 | |
| 75 | + self._region = region_name or os.getenv("AWS_DEFAULT_REGION", "us-east-1") | |
| 76 | + self._client = boto3.client( | |
| 77 | + "bedrock-runtime", | |
| 78 | + aws_access_key_id=aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID"), | |
| 79 | + aws_secret_access_key=aws_secret_access_key or os.getenv("AWS_SECRET_ACCESS_KEY"), | |
| 80 | + region_name=self._region, | |
| 81 | + ) | |
| 82 | + self._last_usage = {} | |
| 83 | + | |
| 84 | + def chat( | |
| 85 | + self, | |
| 86 | + messages: list[dict], | |
| 87 | + max_tokens: int = 4096, | |
| 88 | + temperature: float = 0.7, | |
| 89 | + model: Optional[str] = None, | |
| 90 | + ) -> str: | |
| 91 | + model = model or "anthropic.claude-3-sonnet-20240229-v1:0" | |
| 92 | + # Strip bedrock/ prefix if present | |
| 93 | + if model.startswith("bedrock/"): | |
| 94 | + model = model[len("bedrock/") :] | |
| 95 | + | |
| 96 | + body = json.dumps( | |
| 97 | + { | |
| 98 | + "anthropic_version": "bedrock-2023-05-31", | |
| 99 | + "max_tokens": max_tokens, | |
| 100 | + "temperature": temperature, | |
| 101 | + "messages": messages, | |
| 102 | + } | |
| 103 | + ) | |
| 104 | + | |
| 105 | + response = self._client.invoke_model( | |
| 106 | + modelId=model, | |
| 107 | + contentType="application/json", | |
| 108 | + accept="application/json", | |
| 109 | + body=body, | |
| 110 | + ) | |
| 111 | + | |
| 112 | + result = json.loads(response["body"].read()) | |
| 113 | + self._last_usage = { | |
| 114 | + "input_tokens": result.get("usage", {}).get("input_tokens", 0), | |
| 115 | + "output_tokens": result.get("usage", {}).get("output_tokens", 0), | |
| 116 | + } | |
| 117 | + return result.get("content", [{}])[0].get("text", "") | |
| 118 | + | |
| 119 | + def analyze_image( | |
| 120 | + self, | |
| 121 | + image_bytes: bytes, | |
| 122 | + prompt: str, | |
| 123 | + max_tokens: int = 4096, | |
| 124 | + model: Optional[str] = None, | |
| 125 | + ) -> str: | |
| 126 | + model = model or "anthropic.claude-3-sonnet-20240229-v1:0" | |
| 127 | + if model.startswith("bedrock/"): | |
| 128 | + model = model[len("bedrock/") :] | |
| 129 | + | |
| 130 | + b64 = base64.b64encode(image_bytes).decode() | |
| 131 | + body = json.dumps( | |
| 132 | + { | |
| 133 | + "anthropic_version": "bedrock-2023-05-31", | |
| 134 | + "max_tokens": max_tokens, | |
| 135 | + "messages": [ | |
| 136 | + { | |
| 137 | + "role": "user", | |
| 138 | + "content": [ | |
| 139 | + { | |
| 140 | + "type": "image", | |
| 141 | + "source": { | |
| 142 | + "type": "base64", | |
| 143 | + "media_type": "image/jpeg", | |
| 144 | + "data": b64, | |
| 145 | + }, | |
| 146 | + }, | |
| 147 | + {"type": "text", "text": prompt}, | |
| 148 | + ], | |
| 149 | + } | |
| 150 | + ], | |
| 151 | + } | |
| 152 | + ) | |
| 153 | + | |
| 154 | + response = self._client.invoke_model( | |
| 155 | + modelId=model, | |
| 156 | + contentType="application/json", | |
| 157 | + accept="application/json", | |
| 158 | + body=body, | |
| 159 | + ) | |
| 160 | + | |
| 161 | + result = json.loads(response["body"].read()) | |
| 162 | + self._last_usage = { | |
| 163 | + "input_tokens": result.get("usage", {}).get("input_tokens", 0), | |
| 164 | + "output_tokens": result.get("usage", {}).get("output_tokens", 0), | |
| 165 | + } | |
| 166 | + return result.get("content", [{}])[0].get("text", "") | |
| 167 | + | |
| 168 | + def transcribe_audio( | |
| 169 | + self, | |
| 170 | + audio_path: str | Path, | |
| 171 | + language: Optional[str] = None, | |
| 172 | + model: Optional[str] = None, | |
| 173 | + ) -> dict: | |
| 174 | + raise NotImplementedError( | |
| 175 | + "AWS Bedrock does not support audio transcription directly. " | |
| 176 | + "Use Amazon Transcribe or another provider for transcription." | |
| 177 | + ) | |
| 178 | + | |
| 179 | + def list_models(self) -> list[ModelInfo]: | |
| 180 | + return list(_BEDROCK_MODELS) | |
| 181 | + | |
| 182 | + | |
| 183 | +ProviderRegistry.register( | |
| 184 | + name="bedrock", | |
| 185 | + provider_class=BedrockProvider, | |
| 186 | + env_var="AWS_ACCESS_KEY_ID", | |
| 187 | + model_prefixes=["bedrock/"], | |
| 188 | + default_models={ | |
| 189 | + "chat": "anthropic.claude-3-sonnet-20240229-v1:0", | |
| 190 | + "vision": "anthropic.claude-3-sonnet-20240229-v1:0", | |
| 191 | + "audio": "", | |
| 192 | + }, | |
| 193 | +) |
| --- a/video_processor/providers/bedrock_provider.py | |
| +++ b/video_processor/providers/bedrock_provider.py | |
| @@ -0,0 +1,193 @@ | |
| --- a/video_processor/providers/bedrock_provider.py | |
| +++ b/video_processor/providers/bedrock_provider.py | |
| @@ -0,0 +1,193 @@ | |
| 1 | """AWS Bedrock provider implementation.""" |
| 2 | |
| 3 | import base64 |
| 4 | import json |
| 5 | import logging |
| 6 | import os |
| 7 | from pathlib import Path |
| 8 | from typing import Optional |
| 9 | |
| 10 | from dotenv import load_dotenv |
| 11 | |
| 12 | from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry |
| 13 | |
| 14 | load_dotenv() |
| 15 | logger = logging.getLogger(__name__) |
| 16 | |
| 17 | # Curated list of popular Bedrock models |
| 18 | _BEDROCK_MODELS = [ |
| 19 | ModelInfo( |
| 20 | id="anthropic.claude-3-5-sonnet-20241022-v2:0", |
| 21 | provider="bedrock", |
| 22 | display_name="Claude 3.5 Sonnet v2", |
| 23 | capabilities=["chat", "vision"], |
| 24 | ), |
| 25 | ModelInfo( |
| 26 | id="anthropic.claude-3-sonnet-20240229-v1:0", |
| 27 | provider="bedrock", |
| 28 | display_name="Claude 3 Sonnet", |
| 29 | capabilities=["chat", "vision"], |
| 30 | ), |
| 31 | ModelInfo( |
| 32 | id="anthropic.claude-3-haiku-20240307-v1:0", |
| 33 | provider="bedrock", |
| 34 | display_name="Claude 3 Haiku", |
| 35 | capabilities=["chat", "vision"], |
| 36 | ), |
| 37 | ModelInfo( |
| 38 | id="amazon.titan-text-express-v1", |
| 39 | provider="bedrock", |
| 40 | display_name="Amazon Titan Text Express", |
| 41 | capabilities=["chat"], |
| 42 | ), |
| 43 | ModelInfo( |
| 44 | id="meta.llama3-70b-instruct-v1:0", |
| 45 | provider="bedrock", |
| 46 | display_name="Llama 3 70B Instruct", |
| 47 | capabilities=["chat"], |
| 48 | ), |
| 49 | ModelInfo( |
| 50 | id="mistral.mistral-large-2402-v1:0", |
| 51 | provider="bedrock", |
| 52 | display_name="Mistral Large", |
| 53 | capabilities=["chat"], |
| 54 | ), |
| 55 | ] |
| 56 | |
| 57 | |
class BedrockProvider(BaseProvider):
    """AWS Bedrock provider using boto3.

    NOTE(review): both chat() and analyze_image() build Anthropic-Messages
    request bodies (the "anthropic_version" key), so only the anthropic.*
    model IDs in the curated list are expected to work end-to-end; Titan,
    Llama, and Mistral use different request schemas — confirm before relying
    on them here.
    """

    provider_name = "bedrock"

    # Historical default shared by chat() and analyze_image().
    _DEFAULT_MODEL = "anthropic.claude-3-sonnet-20240229-v1:0"

    def __init__(
        self,
        aws_access_key_id: Optional[str] = None,
        aws_secret_access_key: Optional[str] = None,
        region_name: Optional[str] = None,
    ):
        """Create a bedrock-runtime client.

        Args:
            aws_access_key_id: Falls back to the AWS_ACCESS_KEY_ID env var.
            aws_secret_access_key: Falls back to AWS_SECRET_ACCESS_KEY.
            region_name: Falls back to AWS_DEFAULT_REGION, then "us-east-1".

        Raises:
            ImportError: If boto3 is not installed.
        """
        try:
            import boto3
        except ImportError:
            raise ImportError("boto3 package not installed. Install with: pip install boto3")

        self._boto3 = boto3
        self._region = region_name or os.getenv("AWS_DEFAULT_REGION", "us-east-1")
        self._client = boto3.client(
            "bedrock-runtime",
            aws_access_key_id=aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=aws_secret_access_key or os.getenv("AWS_SECRET_ACCESS_KEY"),
            region_name=self._region,
        )
        self._last_usage = {}

    @staticmethod
    def _normalize_model(model: Optional[str]) -> str:
        """Apply the default model, then strip an optional 'bedrock/' routing prefix."""
        return (model or BedrockProvider._DEFAULT_MODEL).removeprefix("bedrock/")

    def _invoke(self, model: str, payload: dict) -> dict:
        """Invoke a model, record token usage, and return the parsed JSON body.

        Shared by chat() and analyze_image() to avoid duplicating the
        request/usage-bookkeeping boilerplate.
        """
        response = self._client.invoke_model(
            modelId=model,
            contentType="application/json",
            accept="application/json",
            body=json.dumps(payload),
        )
        result = json.loads(response["body"].read())
        usage = result.get("usage", {})
        self._last_usage = {
            "input_tokens": usage.get("input_tokens", 0),
            "output_tokens": usage.get("output_tokens", 0),
        }
        return result

    @staticmethod
    def _extract_text(result: dict) -> str:
        """Return the first text block of an Anthropic-style response ("" if none).

        Bug fix: the previous ``result.get("content", [{}])[0]`` raised
        IndexError when "content" was present but an empty list — the dict
        default only applies when the key is missing entirely.
        """
        content = result.get("content") or [{}]
        return content[0].get("text", "")

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Send an Anthropic-Messages chat request and return the reply text."""
        model = self._normalize_model(model)
        result = self._invoke(
            model,
            {
                "anthropic_version": "bedrock-2023-05-31",
                "max_tokens": max_tokens,
                "temperature": temperature,
                "messages": messages,
            },
        )
        return self._extract_text(result)

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
        """Describe an image (sent base64-encoded, declared as JPEG) per *prompt*."""
        model = self._normalize_model(model)
        b64 = base64.b64encode(image_bytes).decode()
        result = self._invoke(
            model,
            {
                "anthropic_version": "bedrock-2023-05-31",
                "max_tokens": max_tokens,
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": "image/jpeg",
                                    "data": b64,
                                },
                            },
                            {"type": "text", "text": prompt},
                        ],
                    }
                ],
            },
        )
        return self._extract_text(result)

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Unsupported: this provider exposes no transcription endpoint.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "AWS Bedrock does not support audio transcription directly. "
            "Use Amazon Transcribe or another provider for transcription."
        )

    def list_models(self) -> list[ModelInfo]:
        """Return a copy of the curated static model catalogue."""
        return list(_BEDROCK_MODELS)
| 181 | |
| 182 | |
# Import-time self-registration: importing this module makes "bedrock"
# resolvable by name in ProviderRegistry and routes "bedrock/..." model IDs
# here. Availability is gated on AWS_ACCESS_KEY_ID being set.
ProviderRegistry.register(
    name="bedrock",
    provider_class=BedrockProvider,
    env_var="AWS_ACCESS_KEY_ID",
    model_prefixes=["bedrock/"],
    default_models={
        "chat": "anthropic.claude-3-sonnet-20240229-v1:0",
        "vision": "anthropic.claude-3-sonnet-20240229-v1:0",
        "audio": "",  # no transcription support (transcribe_audio raises)
    },
)
| --- a/video_processor/providers/cerebras_provider.py | ||
| +++ b/video_processor/providers/cerebras_provider.py | ||
| @@ -0,0 +1,20 @@ | ||
| 1 | +"""Cerebras provider implementation.""" | |
| 2 | + | |
| 3 | +from video_processor.providers.base import OpenAICompatibleProvider, ProviderRegistry | |
| 4 | + | |
| 5 | + | |
| 6 | +class CerebrasProvider(OpenAICompatibleProvider): | |
| 7 | + """Cerebras AI API provider (OpenAI-compatible).""" | |
| 8 | + | |
| 9 | + provider_name = "cerebras" | |
| 10 | + base_url = "https://api.cerebras.ai/v1" | |
| 11 | + env_var = "CEREBRAS_API_KEY" | |
| 12 | + | |
| 13 | + | |
| 14 | +ProviderRegistry.register( | |
| 15 | + name="cerebras", | |
| 16 | + provider_class=CerebrasProvider, | |
| 17 | + env_var="CEREBRAS_API_KEY", | |
| 18 | + model_prefixes=["cerebras/"], | |
| 19 | + default_models={"chat": "llama3.1-70b", "vision": "", "audio": ""}, | |
| 20 | +) |
| --- a/video_processor/providers/cerebras_provider.py | |
| +++ b/video_processor/providers/cerebras_provider.py | |
| @@ -0,0 +1,20 @@ | |
| --- a/video_processor/providers/cerebras_provider.py | |
| +++ b/video_processor/providers/cerebras_provider.py | |
| @@ -0,0 +1,20 @@ | |
| 1 | """Cerebras provider implementation.""" |
| 2 | |
| 3 | from video_processor.providers.base import OpenAICompatibleProvider, ProviderRegistry |
| 4 | |
| 5 | |
class CerebrasProvider(OpenAICompatibleProvider):
    """Provider for the Cerebras inference API.

    Cerebras exposes an OpenAI-compatible endpoint, so the shared base class
    supplies all request logic; only the connection constants live here.
    """

    provider_name = "cerebras"
    env_var = "CEREBRAS_API_KEY"
    base_url = "https://api.cerebras.ai/v1"
| 12 | |
| 13 | |
# Import-time self-registration: "cerebras/..." model IDs route here, and the
# provider is listed as available only when CEREBRAS_API_KEY is set.
ProviderRegistry.register(
    name="cerebras",
    provider_class=CerebrasProvider,
    env_var="CEREBRAS_API_KEY",
    model_prefixes=["cerebras/"],
    default_models={"chat": "llama3.1-70b", "vision": "", "audio": ""},
)
| --- a/video_processor/providers/cohere_provider.py | ||
| +++ b/video_processor/providers/cohere_provider.py | ||
| @@ -0,0 +1,123 @@ | ||
| 1 | +"""Cohere provider implementation.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import os | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import Optional | |
| 7 | + | |
| 8 | +from dotenv import load_dotenv | |
| 9 | + | |
| 10 | +from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry | |
| 11 | + | |
| 12 | +load_dotenv() | |
| 13 | +logger = logging.getLogger(__name__) | |
| 14 | + | |
| 15 | +# Curated list of Cohere models | |
| 16 | +_COHERE_MODELS = [ | |
| 17 | + ModelInfo( | |
| 18 | + id="command-r-plus", | |
| 19 | + provider="cohere", | |
| 20 | + display_name="Command R+", | |
| 21 | + capabilities=["chat"], | |
| 22 | + ), | |
| 23 | + ModelInfo( | |
| 24 | + id="command-r", | |
| 25 | + provider="cohere", | |
| 26 | + display_name="Command R", | |
| 27 | + capabilities=["chat"], | |
| 28 | + ), | |
| 29 | + ModelInfo( | |
| 30 | + id="command-light", | |
| 31 | + provider="cohere", | |
| 32 | + display_name="Command Light", | |
| 33 | + capabilities=["chat"], | |
| 34 | + ), | |
| 35 | + ModelInfo( | |
| 36 | + id="command-nightly", | |
| 37 | + provider="cohere", | |
| 38 | + display_name="Command Nightly", | |
| 39 | + capabilities=["chat"], | |
| 40 | + ), | |
| 41 | +] | |
| 42 | + | |
| 43 | + | |
| 44 | +class CohereProvider(BaseProvider): | |
| 45 | + """Cohere provider using the cohere SDK.""" | |
| 46 | + | |
| 47 | + provider_name = "cohere" | |
| 48 | + | |
| 49 | + def __init__(self, api_key: Optional[str] = None): | |
| 50 | + try: | |
| 51 | + import cohere | |
| 52 | + except ImportError: | |
| 53 | + raise ImportError("cohere package not installed. Install with: pip install cohere") | |
| 54 | + | |
| 55 | + self._api_key = api_key or os.getenv("COHERE_API_KEY") | |
| 56 | + if not self._api_key: | |
| 57 | + raise ValueError("COHERE_API_KEY not set") | |
| 58 | + | |
| 59 | + self._client = cohere.ClientV2(api_key=self._api_key) | |
| 60 | + self._last_usage = {} | |
| 61 | + | |
| 62 | + def chat( | |
| 63 | + self, | |
| 64 | + messages: list[dict], | |
| 65 | + max_tokens: int = 4096, | |
| 66 | + temperature: float = 0.7, | |
| 67 | + model: Optional[str] = None, | |
| 68 | + ) -> str: | |
| 69 | + model = model or "command-r-plus" | |
| 70 | + | |
| 71 | + response = self._client.chat( | |
| 72 | + model=model, | |
| 73 | + messages=messages, | |
| 74 | + max_tokens=max_tokens, | |
| 75 | + temperature=temperature, | |
| 76 | + ) | |
| 77 | + | |
| 78 | + usage = getattr(response, "usage", None) | |
| 79 | + tokens = getattr(usage, "tokens", None) if usage else None | |
| 80 | + self._last_usage = { | |
| 81 | + "input_tokens": getattr(tokens, "input_tokens", 0) if tokens else 0, | |
| 82 | + "output_tokens": getattr(tokens, "output_tokens", 0) if tokens else 0, | |
| 83 | + } | |
| 84 | + return response.message.content[0].text if response.message.content else "" | |
| 85 | + | |
| 86 | + def analyze_image( | |
| 87 | + self, | |
| 88 | + image_bytes: bytes, | |
| 89 | + prompt: str, | |
| 90 | + max_tokens: int = 4096, | |
| 91 | + model: Optional[str] = None, | |
| 92 | + ) -> str: | |
| 93 | + raise NotImplementedError( | |
| 94 | + "Cohere does not currently support vision/image analysis. " | |
| 95 | + "Use OpenAI, Anthropic, or Gemini for image analysis." | |
| 96 | + ) | |
| 97 | + | |
| 98 | + def transcribe_audio( | |
| 99 | + self, | |
| 100 | + audio_path: str | Path, | |
| 101 | + language: Optional[str] = None, | |
| 102 | + model: Optional[str] = None, | |
| 103 | + ) -> dict: | |
| 104 | + raise NotImplementedError( | |
| 105 | + "Cohere does not provide a transcription API. " | |
| 106 | + "Use OpenAI Whisper or Gemini for transcription." | |
| 107 | + ) | |
| 108 | + | |
| 109 | + def list_models(self) -> list[ModelInfo]: | |
| 110 | + return list(_COHERE_MODELS) | |
| 111 | + | |
| 112 | + | |
| 113 | +ProviderRegistry.register( | |
| 114 | + name="cohere", | |
| 115 | + provider_class=CohereProvider, | |
| 116 | + env_var="COHERE_API_KEY", | |
| 117 | + model_prefixes=["command-"], | |
| 118 | + default_models={ | |
| 119 | + "chat": "command-r-plus", | |
| 120 | + "vision": "", | |
| 121 | + "audio": "", | |
| 122 | + }, | |
| 123 | +) |
| --- a/video_processor/providers/cohere_provider.py | |
| +++ b/video_processor/providers/cohere_provider.py | |
| @@ -0,0 +1,123 @@ | |
| --- a/video_processor/providers/cohere_provider.py | |
| +++ b/video_processor/providers/cohere_provider.py | |
| @@ -0,0 +1,123 @@ | |
| 1 | """Cohere provider implementation.""" |
| 2 | |
| 3 | import logging |
| 4 | import os |
| 5 | from pathlib import Path |
| 6 | from typing import Optional |
| 7 | |
| 8 | from dotenv import load_dotenv |
| 9 | |
| 10 | from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry |
| 11 | |
| 12 | load_dotenv() |
| 13 | logger = logging.getLogger(__name__) |
| 14 | |
| 15 | # Curated list of Cohere models |
| 16 | _COHERE_MODELS = [ |
| 17 | ModelInfo( |
| 18 | id="command-r-plus", |
| 19 | provider="cohere", |
| 20 | display_name="Command R+", |
| 21 | capabilities=["chat"], |
| 22 | ), |
| 23 | ModelInfo( |
| 24 | id="command-r", |
| 25 | provider="cohere", |
| 26 | display_name="Command R", |
| 27 | capabilities=["chat"], |
| 28 | ), |
| 29 | ModelInfo( |
| 30 | id="command-light", |
| 31 | provider="cohere", |
| 32 | display_name="Command Light", |
| 33 | capabilities=["chat"], |
| 34 | ), |
| 35 | ModelInfo( |
| 36 | id="command-nightly", |
| 37 | provider="cohere", |
| 38 | display_name="Command Nightly", |
| 39 | capabilities=["chat"], |
| 40 | ), |
| 41 | ] |
| 42 | |
| 43 | |
class CohereProvider(BaseProvider):
    """Cohere provider using the cohere SDK (chat only; no vision or audio)."""

    provider_name = "cohere"

    def __init__(self, api_key: Optional[str] = None):
        """Build a ClientV2; the key comes from the argument or COHERE_API_KEY.

        Raises:
            ImportError: If the cohere SDK is not installed.
            ValueError: If no API key can be resolved.
        """
        try:
            import cohere
        except ImportError:
            raise ImportError("cohere package not installed. Install with: pip install cohere")

        key = api_key or os.getenv("COHERE_API_KEY")
        if not key:
            raise ValueError("COHERE_API_KEY not set")

        self._api_key = key
        self._client = cohere.ClientV2(api_key=key)
        self._last_usage = {}

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Run one chat turn and return the first text block of the reply ("" if none)."""
        chosen = model if model else "command-r-plus"

        response = self._client.chat(
            model=chosen,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )

        # Token accounting is best-effort: usage (and its .tokens) may be absent.
        usage = getattr(response, "usage", None)
        tokens = getattr(usage, "tokens", None) if usage else None
        in_tok = getattr(tokens, "input_tokens", 0) if tokens else 0
        out_tok = getattr(tokens, "output_tokens", 0) if tokens else 0
        self._last_usage = {"input_tokens": in_tok, "output_tokens": out_tok}

        content = response.message.content
        if not content:
            return ""
        return content[0].text

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
        """Unsupported capability.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "Cohere does not currently support vision/image analysis. "
            "Use OpenAI, Anthropic, or Gemini for image analysis."
        )

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Unsupported capability.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "Cohere does not provide a transcription API. "
            "Use OpenAI Whisper or Gemini for transcription."
        )

    def list_models(self) -> list[ModelInfo]:
        """Return a copy of the curated static model catalogue."""
        return _COHERE_MODELS.copy()
| 111 | |
| 112 | |
# Import-time self-registration. Unlike most providers, the prefix is the bare
# model-family string "command-" rather than a "cohere/" namespace, so plain
# Cohere model IDs (e.g. "command-r-plus") resolve to this provider directly.
ProviderRegistry.register(
    name="cohere",
    provider_class=CohereProvider,
    env_var="COHERE_API_KEY",
    model_prefixes=["command-"],
    default_models={
        "chat": "command-r-plus",
        "vision": "",  # analyze_image raises NotImplementedError
        "audio": "",  # transcribe_audio raises NotImplementedError
    },
)
+56
-53
| --- video_processor/providers/discovery.py | ||
| +++ video_processor/providers/discovery.py | ||
| @@ -4,17 +4,27 @@ | ||
| 4 | 4 | import os |
| 5 | 5 | from typing import Optional |
| 6 | 6 | |
| 7 | 7 | from dotenv import load_dotenv |
| 8 | 8 | |
| 9 | -from video_processor.providers.base import ModelInfo | |
| 9 | +from video_processor.providers.base import ModelInfo, ProviderRegistry | |
| 10 | 10 | |
| 11 | 11 | load_dotenv() |
| 12 | 12 | logger = logging.getLogger(__name__) |
| 13 | 13 | |
| 14 | 14 | _cached_models: Optional[list[ModelInfo]] = None |
| 15 | 15 | |
| 16 | + | |
def _ensure_providers_registered() -> None:
    """Import the built-in provider modules so their registration side effects run.

    Each provider module calls ProviderRegistry.register(...) at import time;
    a non-empty registry means that work has already happened.
    """
    if not ProviderRegistry.all_registered():
        import video_processor.providers.anthropic_provider  # noqa: F401
        import video_processor.providers.gemini_provider  # noqa: F401
        import video_processor.providers.ollama_provider  # noqa: F401
        import video_processor.providers.openai_provider  # noqa: F401
| 25 | + | |
| 16 | 26 | |
| 17 | 27 | def discover_available_models( |
| 18 | 28 | api_keys: Optional[dict[str, str]] = None, |
| 19 | 29 | force_refresh: bool = False, |
| 20 | 30 | ) -> list[ModelInfo]: |
| @@ -26,70 +36,63 @@ | ||
| 26 | 36 | """ |
| 27 | 37 | global _cached_models |
| 28 | 38 | if _cached_models is not None and not force_refresh: |
| 29 | 39 | return _cached_models |
| 30 | 40 | |
| 41 | + _ensure_providers_registered() | |
| 42 | + | |
| 31 | 43 | keys = api_keys or { |
| 32 | 44 | "openai": os.getenv("OPENAI_API_KEY", ""), |
| 33 | 45 | "anthropic": os.getenv("ANTHROPIC_API_KEY", ""), |
| 34 | 46 | "gemini": os.getenv("GEMINI_API_KEY", ""), |
| 35 | 47 | } |
| 36 | 48 | |
| 37 | 49 | all_models: list[ModelInfo] = [] |
| 38 | 50 | |
| 39 | - # OpenAI | |
| 40 | - if keys.get("openai"): | |
| 41 | - try: | |
| 42 | - from video_processor.providers.openai_provider import OpenAIProvider | |
| 43 | - | |
| 44 | - provider = OpenAIProvider(api_key=keys["openai"]) | |
| 45 | - models = provider.list_models() | |
| 46 | - logger.info(f"Discovered {len(models)} OpenAI models") | |
| 47 | - all_models.extend(models) | |
| 48 | - except Exception as e: | |
| 49 | - logger.info(f"OpenAI discovery skipped: {e}") | |
| 50 | - | |
| 51 | - # Anthropic | |
| 52 | - if keys.get("anthropic"): | |
| 53 | - try: | |
| 54 | - from video_processor.providers.anthropic_provider import AnthropicProvider | |
| 55 | - | |
| 56 | - provider = AnthropicProvider(api_key=keys["anthropic"]) | |
| 57 | - models = provider.list_models() | |
| 58 | - logger.info(f"Discovered {len(models)} Anthropic models") | |
| 59 | - all_models.extend(models) | |
| 60 | - except Exception as e: | |
| 61 | - logger.info(f"Anthropic discovery skipped: {e}") | |
| 62 | - | |
| 63 | - # Gemini (API key or service account) | |
| 64 | - gemini_key = keys.get("gemini") | |
| 65 | - gemini_creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS", "") | |
| 66 | - if gemini_key or gemini_creds: | |
| 67 | - try: | |
| 68 | - from video_processor.providers.gemini_provider import GeminiProvider | |
| 69 | - | |
| 70 | - provider = GeminiProvider( | |
| 71 | - api_key=gemini_key or None, | |
| 72 | - credentials_path=gemini_creds or None, | |
| 73 | - ) | |
| 74 | - models = provider.list_models() | |
| 75 | - logger.info(f"Discovered {len(models)} Gemini models") | |
| 76 | - all_models.extend(models) | |
| 77 | - except Exception as e: | |
| 78 | - logger.warning(f"Gemini discovery failed: {e}") | |
| 79 | - | |
| 80 | - # Ollama (local, no API key needed) | |
| 81 | - try: | |
| 82 | - from video_processor.providers.ollama_provider import OllamaProvider | |
| 83 | - | |
| 84 | - if OllamaProvider.is_available(): | |
| 85 | - provider = OllamaProvider() | |
| 86 | - models = provider.list_models() | |
| 87 | - logger.info(f"Discovered {len(models)} Ollama models") | |
| 88 | - all_models.extend(models) | |
| 89 | - except Exception as e: | |
| 90 | - logger.info(f"Ollama discovery skipped: {e}") | |
| 51 | + for name, info in ProviderRegistry.all_registered().items(): | |
| 52 | + env_var = info.get("env_var", "") | |
| 53 | + provider_class = info["class"] | |
| 54 | + | |
| 55 | + if name == "ollama": | |
| 56 | + # Ollama: no API key, check server availability | |
| 57 | + try: | |
| 58 | + if provider_class.is_available(): | |
| 59 | + provider = provider_class() | |
| 60 | + models = provider.list_models() | |
| 61 | + logger.info(f"Discovered {len(models)} Ollama models") | |
| 62 | + all_models.extend(models) | |
| 63 | + except Exception as e: | |
| 64 | + logger.info(f"Ollama discovery skipped: {e}") | |
| 65 | + continue | |
| 66 | + | |
| 67 | + # For key-based providers, check the api_keys dict first, then env var | |
| 68 | + key = keys.get(name, "") | |
| 69 | + if not key and env_var: | |
| 70 | + key = os.getenv(env_var, "") | |
| 71 | + | |
| 72 | + # Special case: Gemini also supports service account credentials | |
| 73 | + gemini_creds = "" | |
| 74 | + if name == "gemini": | |
| 75 | + gemini_creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS", "") | |
| 76 | + | |
| 77 | + if not key and not gemini_creds: | |
| 78 | + continue | |
| 79 | + | |
| 80 | + try: | |
| 81 | + # Handle provider-specific constructor args | |
| 82 | + if name == "gemini": | |
| 83 | + provider = provider_class( | |
| 84 | + api_key=key or None, | |
| 85 | + credentials_path=gemini_creds or None, | |
| 86 | + ) | |
| 87 | + else: | |
| 88 | + provider = provider_class(api_key=key) | |
| 89 | + models = provider.list_models() | |
| 90 | + logger.info(f"Discovered {len(models)} {name.capitalize()} models") | |
| 91 | + all_models.extend(models) | |
| 92 | + except Exception as e: | |
| 93 | + logger.info(f"{name.capitalize()} discovery skipped: {e}") | |
| 91 | 94 | |
| 92 | 95 | # Sort by provider then id |
| 93 | 96 | all_models.sort(key=lambda m: (m.provider, m.id)) |
| 94 | 97 | _cached_models = all_models |
| 95 | 98 | logger.info(f"Total discovered models: {len(all_models)}") |
| 96 | 99 | |
| 97 | 100 | ADDED video_processor/providers/fireworks_provider.py |
| --- video_processor/providers/discovery.py | |
| +++ video_processor/providers/discovery.py | |
| @@ -4,17 +4,27 @@ | |
| 4 | import os |
| 5 | from typing import Optional |
| 6 | |
| 7 | from dotenv import load_dotenv |
| 8 | |
| 9 | from video_processor.providers.base import ModelInfo |
| 10 | |
| 11 | load_dotenv() |
| 12 | logger = logging.getLogger(__name__) |
| 13 | |
| 14 | _cached_models: Optional[list[ModelInfo]] = None |
| 15 | |
| 16 | |
| 17 | def discover_available_models( |
| 18 | api_keys: Optional[dict[str, str]] = None, |
| 19 | force_refresh: bool = False, |
| 20 | ) -> list[ModelInfo]: |
| @@ -26,70 +36,63 @@ | |
| 26 | """ |
| 27 | global _cached_models |
| 28 | if _cached_models is not None and not force_refresh: |
| 29 | return _cached_models |
| 30 | |
| 31 | keys = api_keys or { |
| 32 | "openai": os.getenv("OPENAI_API_KEY", ""), |
| 33 | "anthropic": os.getenv("ANTHROPIC_API_KEY", ""), |
| 34 | "gemini": os.getenv("GEMINI_API_KEY", ""), |
| 35 | } |
| 36 | |
| 37 | all_models: list[ModelInfo] = [] |
| 38 | |
| 39 | # OpenAI |
| 40 | if keys.get("openai"): |
| 41 | try: |
| 42 | from video_processor.providers.openai_provider import OpenAIProvider |
| 43 | |
| 44 | provider = OpenAIProvider(api_key=keys["openai"]) |
| 45 | models = provider.list_models() |
| 46 | logger.info(f"Discovered {len(models)} OpenAI models") |
| 47 | all_models.extend(models) |
| 48 | except Exception as e: |
| 49 | logger.info(f"OpenAI discovery skipped: {e}") |
| 50 | |
| 51 | # Anthropic |
| 52 | if keys.get("anthropic"): |
| 53 | try: |
| 54 | from video_processor.providers.anthropic_provider import AnthropicProvider |
| 55 | |
| 56 | provider = AnthropicProvider(api_key=keys["anthropic"]) |
| 57 | models = provider.list_models() |
| 58 | logger.info(f"Discovered {len(models)} Anthropic models") |
| 59 | all_models.extend(models) |
| 60 | except Exception as e: |
| 61 | logger.info(f"Anthropic discovery skipped: {e}") |
| 62 | |
| 63 | # Gemini (API key or service account) |
| 64 | gemini_key = keys.get("gemini") |
| 65 | gemini_creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS", "") |
| 66 | if gemini_key or gemini_creds: |
| 67 | try: |
| 68 | from video_processor.providers.gemini_provider import GeminiProvider |
| 69 | |
| 70 | provider = GeminiProvider( |
| 71 | api_key=gemini_key or None, |
| 72 | credentials_path=gemini_creds or None, |
| 73 | ) |
| 74 | models = provider.list_models() |
| 75 | logger.info(f"Discovered {len(models)} Gemini models") |
| 76 | all_models.extend(models) |
| 77 | except Exception as e: |
| 78 | logger.warning(f"Gemini discovery failed: {e}") |
| 79 | |
| 80 | # Ollama (local, no API key needed) |
| 81 | try: |
| 82 | from video_processor.providers.ollama_provider import OllamaProvider |
| 83 | |
| 84 | if OllamaProvider.is_available(): |
| 85 | provider = OllamaProvider() |
| 86 | models = provider.list_models() |
| 87 | logger.info(f"Discovered {len(models)} Ollama models") |
| 88 | all_models.extend(models) |
| 89 | except Exception as e: |
| 90 | logger.info(f"Ollama discovery skipped: {e}") |
| 91 | |
| 92 | # Sort by provider then id |
| 93 | all_models.sort(key=lambda m: (m.provider, m.id)) |
| 94 | _cached_models = all_models |
| 95 | logger.info(f"Total discovered models: {len(all_models)}") |
| 96 | |
| 97 | ADDED video_processor/providers/fireworks_provider.py |
| --- video_processor/providers/discovery.py | |
| +++ video_processor/providers/discovery.py | |
| @@ -4,17 +4,27 @@ | |
| 4 | import os |
| 5 | from typing import Optional |
| 6 | |
| 7 | from dotenv import load_dotenv |
| 8 | |
| 9 | from video_processor.providers.base import ModelInfo, ProviderRegistry |
| 10 | |
| 11 | load_dotenv() |
| 12 | logger = logging.getLogger(__name__) |
| 13 | |
| 14 | _cached_models: Optional[list[ModelInfo]] = None |
| 15 | |
| 16 | |
| 17 | def _ensure_providers_registered() -> None: |
| 18 | """Import all built-in provider modules so they register themselves.""" |
| 19 | if ProviderRegistry.all_registered(): |
| 20 | return |
| 21 | import video_processor.providers.anthropic_provider # noqa: F401 |
| 22 | import video_processor.providers.gemini_provider # noqa: F401 |
| 23 | import video_processor.providers.ollama_provider # noqa: F401 |
| 24 | import video_processor.providers.openai_provider # noqa: F401 |
| 25 | |
| 26 | |
| 27 | def discover_available_models( |
| 28 | api_keys: Optional[dict[str, str]] = None, |
| 29 | force_refresh: bool = False, |
| 30 | ) -> list[ModelInfo]: |
| @@ -26,70 +36,63 @@ | |
| 36 | """ |
| 37 | global _cached_models |
| 38 | if _cached_models is not None and not force_refresh: |
| 39 | return _cached_models |
| 40 | |
| 41 | _ensure_providers_registered() |
| 42 | |
| 43 | keys = api_keys or { |
| 44 | "openai": os.getenv("OPENAI_API_KEY", ""), |
| 45 | "anthropic": os.getenv("ANTHROPIC_API_KEY", ""), |
| 46 | "gemini": os.getenv("GEMINI_API_KEY", ""), |
| 47 | } |
| 48 | |
| 49 | all_models: list[ModelInfo] = [] |
| 50 | |
| 51 | for name, info in ProviderRegistry.all_registered().items(): |
| 52 | env_var = info.get("env_var", "") |
| 53 | provider_class = info["class"] |
| 54 | |
| 55 | if name == "ollama": |
| 56 | # Ollama: no API key, check server availability |
| 57 | try: |
| 58 | if provider_class.is_available(): |
| 59 | provider = provider_class() |
| 60 | models = provider.list_models() |
| 61 | logger.info(f"Discovered {len(models)} Ollama models") |
| 62 | all_models.extend(models) |
| 63 | except Exception as e: |
| 64 | logger.info(f"Ollama discovery skipped: {e}") |
| 65 | continue |
| 66 | |
| 67 | # For key-based providers, check the api_keys dict first, then env var |
| 68 | key = keys.get(name, "") |
| 69 | if not key and env_var: |
| 70 | key = os.getenv(env_var, "") |
| 71 | |
| 72 | # Special case: Gemini also supports service account credentials |
| 73 | gemini_creds = "" |
| 74 | if name == "gemini": |
| 75 | gemini_creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS", "") |
| 76 | |
| 77 | if not key and not gemini_creds: |
| 78 | continue |
| 79 | |
| 80 | try: |
| 81 | # Handle provider-specific constructor args |
| 82 | if name == "gemini": |
| 83 | provider = provider_class( |
| 84 | api_key=key or None, |
| 85 | credentials_path=gemini_creds or None, |
| 86 | ) |
| 87 | else: |
| 88 | provider = provider_class(api_key=key) |
| 89 | models = provider.list_models() |
| 90 | logger.info(f"Discovered {len(models)} {name.capitalize()} models") |
| 91 | all_models.extend(models) |
| 92 | except Exception as e: |
| 93 | logger.info(f"{name.capitalize()} discovery skipped: {e}") |
| 94 | |
| 95 | # Sort by provider then id |
| 96 | all_models.sort(key=lambda m: (m.provider, m.id)) |
| 97 | _cached_models = all_models |
| 98 | logger.info(f"Total discovered models: {len(all_models)}") |
| 99 | |
| 100 | ADDED video_processor/providers/fireworks_provider.py |
| --- a/video_processor/providers/fireworks_provider.py | ||
| +++ b/video_processor/providers/fireworks_provider.py | ||
| @@ -0,0 +1,24 @@ | ||
| 1 | +"""Fireworks AI provider implementation.""" | |
| 2 | + | |
| 3 | +from video_processor.providers.base import OpenAICompatibleProvider, ProviderRegistry | |
| 4 | + | |
| 5 | + | |
| 6 | +class FireworksProvider(OpenAICompatibleProvider): | |
| 7 | + """Fireworks AI API provider (OpenAI-compatible).""" | |
| 8 | + | |
| 9 | + provider_name = "fireworks" | |
| 10 | + base_url = "https://api.fireworks.ai/inference/v1" | |
| 11 | + env_var = "FIREWORKS_API_KEY" | |
| 12 | + | |
| 13 | + | |
| 14 | +ProviderRegistry.register( | |
| 15 | + name="fireworks", | |
| 16 | + provider_class=FireworksProvider, | |
| 17 | + env_var="FIREWORKS_API_KEY", | |
| 18 | + model_prefixes=["accounts/fireworks/"], | |
| 19 | + default_models={ | |
| 20 | + "chat": "accounts/fireworks/models/llama-v3p1-70b-instruct", | |
| 21 | + "vision": "", | |
| 22 | + "audio": "", | |
| 23 | + }, | |
| 24 | +) |
| --- a/video_processor/providers/fireworks_provider.py | |
| +++ b/video_processor/providers/fireworks_provider.py | |
| @@ -0,0 +1,24 @@ | |
| --- a/video_processor/providers/fireworks_provider.py | |
| +++ b/video_processor/providers/fireworks_provider.py | |
| @@ -0,0 +1,24 @@ | |
| 1 | """Fireworks AI provider implementation.""" |
| 2 | |
| 3 | from video_processor.providers.base import OpenAICompatibleProvider, ProviderRegistry |
| 4 | |
| 5 | |
| 6 | class FireworksProvider(OpenAICompatibleProvider): |
| 7 | """Fireworks AI API provider (OpenAI-compatible).""" |
| 8 | |
| 9 | provider_name = "fireworks" |
| 10 | base_url = "https://api.fireworks.ai/inference/v1" |
| 11 | env_var = "FIREWORKS_API_KEY" |
| 12 | |
| 13 | |
| 14 | ProviderRegistry.register( |
| 15 | name="fireworks", |
| 16 | provider_class=FireworksProvider, |
| 17 | env_var="FIREWORKS_API_KEY", |
| 18 | model_prefixes=["accounts/fireworks/"], |
| 19 | default_models={ |
| 20 | "chat": "accounts/fireworks/models/llama-v3p1-70b-instruct", |
| 21 | "vision": "", |
| 22 | "audio": "", |
| 23 | }, |
| 24 | ) |
| --- video_processor/providers/gemini_provider.py | ||
| +++ video_processor/providers/gemini_provider.py | ||
| @@ -5,11 +5,11 @@ | ||
| 5 | 5 | from pathlib import Path |
| 6 | 6 | from typing import Optional |
| 7 | 7 | |
| 8 | 8 | from dotenv import load_dotenv |
| 9 | 9 | |
| 10 | -from video_processor.providers.base import BaseProvider, ModelInfo | |
| 10 | +from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry | |
| 11 | 11 | |
| 12 | 12 | load_dotenv() |
| 13 | 13 | logger = logging.getLogger(__name__) |
| 14 | 14 | |
| 15 | 15 | # Capabilities inferred from model id patterns |
| @@ -218,5 +218,18 @@ | ||
| 218 | 218 | ) |
| 219 | 219 | ) |
| 220 | 220 | except Exception as e: |
| 221 | 221 | logger.warning(f"Failed to list Gemini models: {e}") |
| 222 | 222 | return sorted(models, key=lambda m: m.id) |
| 223 | + | |
| 224 | + | |
| 225 | +ProviderRegistry.register( | |
| 226 | + name="gemini", | |
| 227 | + provider_class=GeminiProvider, | |
| 228 | + env_var="GEMINI_API_KEY", | |
| 229 | + model_prefixes=["gemini-"], | |
| 230 | + default_models={ | |
| 231 | + "chat": "gemini-2.5-flash", | |
| 232 | + "vision": "gemini-2.5-flash", | |
| 233 | + "audio": "gemini-2.5-flash", | |
| 234 | + }, | |
| 235 | +) | |
| 223 | 236 | |
| 224 | 237 | ADDED video_processor/providers/huggingface_provider.py |
| 225 | 238 | ADDED video_processor/providers/litellm_provider.py |
| --- video_processor/providers/gemini_provider.py | |
| +++ video_processor/providers/gemini_provider.py | |
| @@ -5,11 +5,11 @@ | |
| 5 | from pathlib import Path |
| 6 | from typing import Optional |
| 7 | |
| 8 | from dotenv import load_dotenv |
| 9 | |
| 10 | from video_processor.providers.base import BaseProvider, ModelInfo |
| 11 | |
| 12 | load_dotenv() |
| 13 | logger = logging.getLogger(__name__) |
| 14 | |
| 15 | # Capabilities inferred from model id patterns |
| @@ -218,5 +218,18 @@ | |
| 218 | ) |
| 219 | ) |
| 220 | except Exception as e: |
| 221 | logger.warning(f"Failed to list Gemini models: {e}") |
| 222 | return sorted(models, key=lambda m: m.id) |
| 223 | |
| 224 | ADDED video_processor/providers/huggingface_provider.py |
| 225 | ADDED video_processor/providers/litellm_provider.py |
| --- video_processor/providers/gemini_provider.py | |
| +++ video_processor/providers/gemini_provider.py | |
| @@ -5,11 +5,11 @@ | |
| 5 | from pathlib import Path |
| 6 | from typing import Optional |
| 7 | |
| 8 | from dotenv import load_dotenv |
| 9 | |
| 10 | from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry |
| 11 | |
| 12 | load_dotenv() |
| 13 | logger = logging.getLogger(__name__) |
| 14 | |
| 15 | # Capabilities inferred from model id patterns |
| @@ -218,5 +218,18 @@ | |
| 218 | ) |
| 219 | ) |
| 220 | except Exception as e: |
| 221 | logger.warning(f"Failed to list Gemini models: {e}") |
| 222 | return sorted(models, key=lambda m: m.id) |
| 223 | |
| 224 | |
| 225 | ProviderRegistry.register( |
| 226 | name="gemini", |
| 227 | provider_class=GeminiProvider, |
| 228 | env_var="GEMINI_API_KEY", |
| 229 | model_prefixes=["gemini-"], |
| 230 | default_models={ |
| 231 | "chat": "gemini-2.5-flash", |
| 232 | "vision": "gemini-2.5-flash", |
| 233 | "audio": "gemini-2.5-flash", |
| 234 | }, |
| 235 | ) |
| 236 | |
| 237 | ADDED video_processor/providers/huggingface_provider.py |
| 238 | ADDED video_processor/providers/litellm_provider.py |
| --- a/video_processor/providers/huggingface_provider.py | ||
| +++ b/video_processor/providers/huggingface_provider.py | ||
| @@ -0,0 +1,187 @@ | ||
| 1 | +"""Hugging Face Inference API provider implementation.""" | |
| 2 | + | |
| 3 | +import base64 | |
| 4 | +import logging | |
| 5 | +import os | |
| 6 | +from pathlib import Path | |
| 7 | +from typing import Optional | |
| 8 | + | |
| 9 | +from dotenv import load_dotenv | |
| 10 | + | |
| 11 | +from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry | |
| 12 | + | |
| 13 | +load_dotenv() | |
| 14 | +logger = logging.getLogger(__name__) | |
| 15 | + | |
| 16 | +# Curated list of popular HF Inference models | |
| 17 | +_HF_MODELS = [ | |
| 18 | + ModelInfo( | |
| 19 | + id="meta-llama/Llama-3.1-70B-Instruct", | |
| 20 | + provider="huggingface", | |
| 21 | + display_name="Llama 3.1 70B Instruct", | |
| 22 | + capabilities=["chat"], | |
| 23 | + ), | |
| 24 | + ModelInfo( | |
| 25 | + id="meta-llama/Llama-3.1-8B-Instruct", | |
| 26 | + provider="huggingface", | |
| 27 | + display_name="Llama 3.1 8B Instruct", | |
| 28 | + capabilities=["chat"], | |
| 29 | + ), | |
| 30 | + ModelInfo( | |
| 31 | + id="mistralai/Mixtral-8x7B-Instruct-v0.1", | |
| 32 | + provider="huggingface", | |
| 33 | + display_name="Mixtral 8x7B Instruct", | |
| 34 | + capabilities=["chat"], | |
| 35 | + ), | |
| 36 | + ModelInfo( | |
| 37 | + id="microsoft/Phi-3-mini-4k-instruct", | |
| 38 | + provider="huggingface", | |
| 39 | + display_name="Phi-3 Mini 4K Instruct", | |
| 40 | + capabilities=["chat"], | |
| 41 | + ), | |
| 42 | + ModelInfo( | |
| 43 | + id="llava-hf/llava-v1.6-mistral-7b-hf", | |
| 44 | + provider="huggingface", | |
| 45 | + display_name="LLaVA v1.6 Mistral 7B", | |
| 46 | + capabilities=["chat", "vision"], | |
| 47 | + ), | |
| 48 | + ModelInfo( | |
| 49 | + id="openai/whisper-large-v3", | |
| 50 | + provider="huggingface", | |
| 51 | + display_name="Whisper Large v3", | |
| 52 | + capabilities=["audio"], | |
| 53 | + ), | |
| 54 | +] | |
| 55 | + | |
| 56 | + | |
| 57 | +class HuggingFaceProvider(BaseProvider): | |
| 58 | + """Hugging Face Inference API provider using huggingface_hub.""" | |
| 59 | + | |
| 60 | + provider_name = "huggingface" | |
| 61 | + | |
| 62 | + def __init__(self, token: Optional[str] = None): | |
| 63 | + try: | |
| 64 | + from huggingface_hub import InferenceClient | |
| 65 | + except ImportError: | |
| 66 | + raise ImportError( | |
| 67 | + "huggingface_hub package not installed. Install with: pip install huggingface_hub" | |
| 68 | + ) | |
| 69 | + | |
| 70 | + self._token = token or os.getenv("HF_TOKEN") | |
| 71 | + if not self._token: | |
| 72 | + raise ValueError("HF_TOKEN not set") | |
| 73 | + | |
| 74 | + self._client = InferenceClient(token=self._token) | |
| 75 | + self._last_usage = {} | |
| 76 | + | |
| 77 | + def chat( | |
| 78 | + self, | |
| 79 | + messages: list[dict], | |
| 80 | + max_tokens: int = 4096, | |
| 81 | + temperature: float = 0.7, | |
| 82 | + model: Optional[str] = None, | |
| 83 | + ) -> str: | |
| 84 | + model = model or "meta-llama/Llama-3.1-70B-Instruct" | |
| 85 | + if model.startswith("hf/"): | |
| 86 | + model = model[len("hf/") :] | |
| 87 | + | |
| 88 | + response = self._client.chat_completion( | |
| 89 | + model=model, | |
| 90 | + messages=messages, | |
| 91 | + max_tokens=max_tokens, | |
| 92 | + temperature=temperature, | |
| 93 | + ) | |
| 94 | + | |
| 95 | + usage = getattr(response, "usage", None) | |
| 96 | + self._last_usage = { | |
| 97 | + "input_tokens": getattr(usage, "prompt_tokens", 0) if usage else 0, | |
| 98 | + "output_tokens": getattr(usage, "completion_tokens", 0) if usage else 0, | |
| 99 | + } | |
| 100 | + return response.choices[0].message.content or "" | |
| 101 | + | |
| 102 | + def analyze_image( | |
| 103 | + self, | |
| 104 | + image_bytes: bytes, | |
| 105 | + prompt: str, | |
| 106 | + max_tokens: int = 4096, | |
| 107 | + model: Optional[str] = None, | |
| 108 | + ) -> str: | |
| 109 | + model = model or "llava-hf/llava-v1.6-mistral-7b-hf" | |
| 110 | + if model.startswith("hf/"): | |
| 111 | + model = model[len("hf/") :] | |
| 112 | + | |
| 113 | + b64 = base64.b64encode(image_bytes).decode() | |
| 114 | + | |
| 115 | + response = self._client.chat_completion( | |
| 116 | + model=model, | |
| 117 | + messages=[ | |
| 118 | + { | |
| 119 | + "role": "user", | |
| 120 | + "content": [ | |
| 121 | + {"type": "text", "text": prompt}, | |
| 122 | + { | |
| 123 | + "type": "image_url", | |
| 124 | + "image_url": {"url": f"data:image/jpeg;base64,{b64}"}, | |
| 125 | + }, | |
| 126 | + ], | |
| 127 | + } | |
| 128 | + ], | |
| 129 | + max_tokens=max_tokens, | |
| 130 | + ) | |
| 131 | + | |
| 132 | + usage = getattr(response, "usage", None) | |
| 133 | + self._last_usage = { | |
| 134 | + "input_tokens": getattr(usage, "prompt_tokens", 0) if usage else 0, | |
| 135 | + "output_tokens": getattr(usage, "completion_tokens", 0) if usage else 0, | |
| 136 | + } | |
| 137 | + return response.choices[0].message.content or "" | |
| 138 | + | |
| 139 | + def transcribe_audio( | |
| 140 | + self, | |
| 141 | + audio_path: str | Path, | |
| 142 | + language: Optional[str] = None, | |
| 143 | + model: Optional[str] = None, | |
| 144 | + ) -> dict: | |
| 145 | + model = model or "openai/whisper-large-v3" | |
| 146 | + if model.startswith("hf/"): | |
| 147 | + model = model[len("hf/") :] | |
| 148 | + | |
| 149 | + audio_path = Path(audio_path) | |
| 150 | + audio_bytes = audio_path.read_bytes() | |
| 151 | + | |
| 152 | + result = self._client.automatic_speech_recognition( | |
| 153 | + audio=audio_bytes, | |
| 154 | + model=model, | |
| 155 | + ) | |
| 156 | + | |
| 157 | + text = result.text if hasattr(result, "text") else str(result) | |
| 158 | + | |
| 159 | + self._last_usage = { | |
| 160 | + "input_tokens": 0, | |
| 161 | + "output_tokens": 0, | |
| 162 | + } | |
| 163 | + | |
| 164 | + return { | |
| 165 | + "text": text, | |
| 166 | + "segments": [], | |
| 167 | + "language": language, | |
| 168 | + "duration": None, | |
| 169 | + "provider": "huggingface", | |
| 170 | + "model": model, | |
| 171 | + } | |
| 172 | + | |
| 173 | + def list_models(self) -> list[ModelInfo]: | |
| 174 | + return list(_HF_MODELS) | |
| 175 | + | |
| 176 | + | |
| 177 | +ProviderRegistry.register( | |
| 178 | + name="huggingface", | |
| 179 | + provider_class=HuggingFaceProvider, | |
| 180 | + env_var="HF_TOKEN", | |
| 181 | + model_prefixes=["hf/"], | |
| 182 | + default_models={ | |
| 183 | + "chat": "meta-llama/Llama-3.1-70B-Instruct", | |
| 184 | + "vision": "llava-hf/llava-v1.6-mistral-7b-hf", | |
| 185 | + "audio": "openai/whisper-large-v3", | |
| 186 | + }, | |
| 187 | +) |
| --- a/video_processor/providers/huggingface_provider.py | |
| +++ b/video_processor/providers/huggingface_provider.py | |
| @@ -0,0 +1,187 @@ | |
| --- a/video_processor/providers/huggingface_provider.py | |
| +++ b/video_processor/providers/huggingface_provider.py | |
| @@ -0,0 +1,187 @@ | |
| 1 | """Hugging Face Inference API provider implementation.""" |
| 2 | |
| 3 | import base64 |
| 4 | import logging |
| 5 | import os |
| 6 | from pathlib import Path |
| 7 | from typing import Optional |
| 8 | |
| 9 | from dotenv import load_dotenv |
| 10 | |
| 11 | from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry |
| 12 | |
| 13 | load_dotenv() |
| 14 | logger = logging.getLogger(__name__) |
| 15 | |
| 16 | # Curated list of popular HF Inference models |
| 17 | _HF_MODELS = [ |
| 18 | ModelInfo( |
| 19 | id="meta-llama/Llama-3.1-70B-Instruct", |
| 20 | provider="huggingface", |
| 21 | display_name="Llama 3.1 70B Instruct", |
| 22 | capabilities=["chat"], |
| 23 | ), |
| 24 | ModelInfo( |
| 25 | id="meta-llama/Llama-3.1-8B-Instruct", |
| 26 | provider="huggingface", |
| 27 | display_name="Llama 3.1 8B Instruct", |
| 28 | capabilities=["chat"], |
| 29 | ), |
| 30 | ModelInfo( |
| 31 | id="mistralai/Mixtral-8x7B-Instruct-v0.1", |
| 32 | provider="huggingface", |
| 33 | display_name="Mixtral 8x7B Instruct", |
| 34 | capabilities=["chat"], |
| 35 | ), |
| 36 | ModelInfo( |
| 37 | id="microsoft/Phi-3-mini-4k-instruct", |
| 38 | provider="huggingface", |
| 39 | display_name="Phi-3 Mini 4K Instruct", |
| 40 | capabilities=["chat"], |
| 41 | ), |
| 42 | ModelInfo( |
| 43 | id="llava-hf/llava-v1.6-mistral-7b-hf", |
| 44 | provider="huggingface", |
| 45 | display_name="LLaVA v1.6 Mistral 7B", |
| 46 | capabilities=["chat", "vision"], |
| 47 | ), |
| 48 | ModelInfo( |
| 49 | id="openai/whisper-large-v3", |
| 50 | provider="huggingface", |
| 51 | display_name="Whisper Large v3", |
| 52 | capabilities=["audio"], |
| 53 | ), |
| 54 | ] |
| 55 | |
| 56 | |
| 57 | class HuggingFaceProvider(BaseProvider): |
| 58 | """Hugging Face Inference API provider using huggingface_hub.""" |
| 59 | |
| 60 | provider_name = "huggingface" |
| 61 | |
| 62 | def __init__(self, token: Optional[str] = None): |
| 63 | try: |
| 64 | from huggingface_hub import InferenceClient |
| 65 | except ImportError: |
| 66 | raise ImportError( |
| 67 | "huggingface_hub package not installed. Install with: pip install huggingface_hub" |
| 68 | ) |
| 69 | |
| 70 | self._token = token or os.getenv("HF_TOKEN") |
| 71 | if not self._token: |
| 72 | raise ValueError("HF_TOKEN not set") |
| 73 | |
| 74 | self._client = InferenceClient(token=self._token) |
| 75 | self._last_usage = {} |
| 76 | |
| 77 | def chat( |
| 78 | self, |
| 79 | messages: list[dict], |
| 80 | max_tokens: int = 4096, |
| 81 | temperature: float = 0.7, |
| 82 | model: Optional[str] = None, |
| 83 | ) -> str: |
| 84 | model = model or "meta-llama/Llama-3.1-70B-Instruct" |
| 85 | if model.startswith("hf/"): |
| 86 | model = model[len("hf/") :] |
| 87 | |
| 88 | response = self._client.chat_completion( |
| 89 | model=model, |
| 90 | messages=messages, |
| 91 | max_tokens=max_tokens, |
| 92 | temperature=temperature, |
| 93 | ) |
| 94 | |
| 95 | usage = getattr(response, "usage", None) |
| 96 | self._last_usage = { |
| 97 | "input_tokens": getattr(usage, "prompt_tokens", 0) if usage else 0, |
| 98 | "output_tokens": getattr(usage, "completion_tokens", 0) if usage else 0, |
| 99 | } |
| 100 | return response.choices[0].message.content or "" |
| 101 | |
| 102 | def analyze_image( |
| 103 | self, |
| 104 | image_bytes: bytes, |
| 105 | prompt: str, |
| 106 | max_tokens: int = 4096, |
| 107 | model: Optional[str] = None, |
| 108 | ) -> str: |
| 109 | model = model or "llava-hf/llava-v1.6-mistral-7b-hf" |
| 110 | if model.startswith("hf/"): |
| 111 | model = model[len("hf/") :] |
| 112 | |
| 113 | b64 = base64.b64encode(image_bytes).decode() |
| 114 | |
| 115 | response = self._client.chat_completion( |
| 116 | model=model, |
| 117 | messages=[ |
| 118 | { |
| 119 | "role": "user", |
| 120 | "content": [ |
| 121 | {"type": "text", "text": prompt}, |
| 122 | { |
| 123 | "type": "image_url", |
| 124 | "image_url": {"url": f"data:image/jpeg;base64,{b64}"}, |
| 125 | }, |
| 126 | ], |
| 127 | } |
| 128 | ], |
| 129 | max_tokens=max_tokens, |
| 130 | ) |
| 131 | |
| 132 | usage = getattr(response, "usage", None) |
| 133 | self._last_usage = { |
| 134 | "input_tokens": getattr(usage, "prompt_tokens", 0) if usage else 0, |
| 135 | "output_tokens": getattr(usage, "completion_tokens", 0) if usage else 0, |
| 136 | } |
| 137 | return response.choices[0].message.content or "" |
| 138 | |
| 139 | def transcribe_audio( |
| 140 | self, |
| 141 | audio_path: str | Path, |
| 142 | language: Optional[str] = None, |
| 143 | model: Optional[str] = None, |
| 144 | ) -> dict: |
| 145 | model = model or "openai/whisper-large-v3" |
| 146 | if model.startswith("hf/"): |
| 147 | model = model[len("hf/") :] |
| 148 | |
| 149 | audio_path = Path(audio_path) |
| 150 | audio_bytes = audio_path.read_bytes() |
| 151 | |
| 152 | result = self._client.automatic_speech_recognition( |
| 153 | audio=audio_bytes, |
| 154 | model=model, |
| 155 | ) |
| 156 | |
| 157 | text = result.text if hasattr(result, "text") else str(result) |
| 158 | |
| 159 | self._last_usage = { |
| 160 | "input_tokens": 0, |
| 161 | "output_tokens": 0, |
| 162 | } |
| 163 | |
| 164 | return { |
| 165 | "text": text, |
| 166 | "segments": [], |
| 167 | "language": language, |
| 168 | "duration": None, |
| 169 | "provider": "huggingface", |
| 170 | "model": model, |
| 171 | } |
| 172 | |
| 173 | def list_models(self) -> list[ModelInfo]: |
| 174 | return list(_HF_MODELS) |
| 175 | |
| 176 | |
| 177 | ProviderRegistry.register( |
| 178 | name="huggingface", |
| 179 | provider_class=HuggingFaceProvider, |
| 180 | env_var="HF_TOKEN", |
| 181 | model_prefixes=["hf/"], |
| 182 | default_models={ |
| 183 | "chat": "meta-llama/Llama-3.1-70B-Instruct", |
| 184 | "vision": "llava-hf/llava-v1.6-mistral-7b-hf", |
| 185 | "audio": "openai/whisper-large-v3", |
| 186 | }, |
| 187 | ) |
| --- a/video_processor/providers/litellm_provider.py | ||
| +++ b/video_processor/providers/litellm_provider.py | ||
| @@ -0,0 +1,171 @@ | ||
| 1 | +"""LiteLLM universal proxy provider implementation.""" | |
| 2 | + | |
| 3 | +import base64 | |
| 4 | +import logging | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import Optional | |
| 7 | + | |
| 8 | +from dotenv import load_dotenv | |
| 9 | + | |
| 10 | +from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry | |
| 11 | + | |
| 12 | +load_dotenv() | |
| 13 | +logger = logging.getLogger(__name__) | |
| 14 | + | |
| 15 | + | |
class LiteLLMProvider(BaseProvider):
    """LiteLLM universal proxy provider.

    LiteLLM supports 100+ LLM providers through a unified interface.
    It reads provider API keys from environment variables automatically
    (e.g. OPENAI_API_KEY, ANTHROPIC_API_KEY, etc.).
    """

    provider_name = "litellm"

    def __init__(self):
        """Bind the litellm module.

        Raises:
            ImportError: if the litellm package is not installed; the
                original import error is chained for debuggability.
        """
        try:
            import litellm  # noqa: F401
        except ImportError as err:
            # Chain the underlying error so the real import failure is visible.
            raise ImportError(
                "litellm package not installed. Install with: pip install litellm"
            ) from err

        self._litellm = litellm
        # Token counts from the most recent call, for usage tracking.
        self._last_usage: dict = {}

    def _record_usage(self, response) -> None:
        """Store prompt/completion token counts from a LiteLLM response."""
        usage = getattr(response, "usage", None)
        self._last_usage = {
            "input_tokens": getattr(usage, "prompt_tokens", 0) if usage else 0,
            "output_tokens": getattr(usage, "completion_tokens", 0) if usage else 0,
        }

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Run a chat completion through LiteLLM.

        Args:
            messages: OpenAI-style message dicts.
            max_tokens: completion token budget.
            temperature: sampling temperature.
            model: REQUIRED 'provider/model' identifier.

        Returns:
            The assistant message text ("" if the provider returned None).

        Raises:
            ValueError: if no model was supplied — LiteLLM cannot infer one.
        """
        if not model:
            raise ValueError(
                "LiteLLM requires an explicit model in provider/model format "
                "(e.g. 'openai/gpt-4o', 'anthropic/claude-3-sonnet-20240229')"
            )

        response = self._litellm.completion(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )

        self._record_usage(response)
        return response.choices[0].message.content or ""

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
        """Analyze an image with a vision-capable model via LiteLLM.

        The image is sent inline as a base64 data URL (assumed JPEG).

        Raises:
            ValueError: if no model was supplied.
        """
        if not model:
            raise ValueError(
                "LiteLLM requires an explicit model for image analysis "
                "(e.g. 'openai/gpt-4o', 'anthropic/claude-3-sonnet-20240229')"
            )

        b64 = base64.b64encode(image_bytes).decode()

        response = self._litellm.completion(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                        },
                    ],
                }
            ],
            max_tokens=max_tokens,
        )

        self._record_usage(response)
        return response.choices[0].message.content or ""

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Transcribe an audio file via LiteLLM's transcription endpoint.

        Args:
            audio_path: path to the audio file.
            language: optional ISO language hint, forwarded to the provider.
            model: transcription model; defaults to 'whisper-1'.

        Returns:
            A dict with 'text', 'segments' (always empty here), 'language',
            'duration' (always None here), 'provider' and 'model' keys.

        Raises:
            NotImplementedError: if the underlying provider call fails; the
                original exception is chained and its message included.
        """
        model = model or "whisper-1"

        try:
            with open(audio_path, "rb") as f:
                response = self._litellm.transcription(
                    model=model,
                    file=f,
                    language=language,
                )

            text = getattr(response, "text", str(response))
            # LiteLLM transcription responses carry no token usage.
            self._last_usage = {
                "input_tokens": 0,
                "output_tokens": 0,
            }

            return {
                "text": text,
                "segments": [],
                "language": language,
                "duration": None,
                "provider": "litellm",
                "model": model,
            }
        except Exception as err:
            # Preserve the real failure instead of discarding it: include its
            # message and chain it so the traceback shows the root cause.
            raise NotImplementedError(
                f"Audio transcription failed via LiteLLM: {err}. "
                "Ensure the underlying provider supports transcription."
            ) from err

    def list_models(self) -> list[ModelInfo]:
        """Return models LiteLLM knows about, or [] when unavailable.

        LiteLLM exposes a module-level 'model_list'; entries are coerced to
        strings and tagged with the 'chat' capability only.
        """
        try:
            model_list = getattr(self._litellm, "model_list", None)
            if model_list:
                return [
                    ModelInfo(
                        id=m if isinstance(m, str) else str(m),
                        provider="litellm",
                        display_name=m if isinstance(m, str) else str(m),
                        capabilities=["chat"],
                    )
                    for m in model_list
                ]
        except Exception as e:
            logger.warning(f"Failed to list LiteLLM models: {e}")
        return []
| 153 | + | |
| 154 | + | |
# Only register if litellm is importable
try:
    import litellm  # noqa: F401

    # LiteLLM fronts many providers, so there is no single API-key env var,
    # no model-id prefix to match on, and no sensible default models —
    # callers must always pass an explicit 'provider/model' identifier.
    ProviderRegistry.register(
        name="litellm",
        provider_class=LiteLLMProvider,
        env_var="",
        model_prefixes=[],
        default_models={
            "chat": "",
            "vision": "",
            "audio": "",
        },
    )
except ImportError:
    # litellm not installed — the provider simply stays unregistered.
    pass
| --- a/video_processor/providers/litellm_provider.py | |
| +++ b/video_processor/providers/litellm_provider.py | |
| @@ -0,0 +1,171 @@ | |
| --- a/video_processor/providers/litellm_provider.py | |
| +++ b/video_processor/providers/litellm_provider.py | |
| @@ -0,0 +1,171 @@ | |
| 1 | """LiteLLM universal proxy provider implementation.""" |
| 2 | |
| 3 | import base64 |
| 4 | import logging |
| 5 | from pathlib import Path |
| 6 | from typing import Optional |
| 7 | |
| 8 | from dotenv import load_dotenv |
| 9 | |
| 10 | from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry |
| 11 | |
| 12 | load_dotenv() |
| 13 | logger = logging.getLogger(__name__) |
| 14 | |
| 15 | |
class LiteLLMProvider(BaseProvider):
    """LiteLLM universal proxy provider.

    LiteLLM supports 100+ LLM providers through a unified interface.
    It reads provider API keys from environment variables automatically
    (e.g. OPENAI_API_KEY, ANTHROPIC_API_KEY, etc.).
    """

    provider_name = "litellm"

    def __init__(self):
        """Bind the litellm module.

        Raises:
            ImportError: if the litellm package is not installed; the
                original import error is chained for debuggability.
        """
        try:
            import litellm  # noqa: F401
        except ImportError as err:
            # Chain the underlying error so the real import failure is visible.
            raise ImportError(
                "litellm package not installed. Install with: pip install litellm"
            ) from err

        self._litellm = litellm
        # Token counts from the most recent call, for usage tracking.
        self._last_usage: dict = {}

    def _record_usage(self, response) -> None:
        """Store prompt/completion token counts from a LiteLLM response."""
        usage = getattr(response, "usage", None)
        self._last_usage = {
            "input_tokens": getattr(usage, "prompt_tokens", 0) if usage else 0,
            "output_tokens": getattr(usage, "completion_tokens", 0) if usage else 0,
        }

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Run a chat completion through LiteLLM.

        Args:
            messages: OpenAI-style message dicts.
            max_tokens: completion token budget.
            temperature: sampling temperature.
            model: REQUIRED 'provider/model' identifier.

        Returns:
            The assistant message text ("" if the provider returned None).

        Raises:
            ValueError: if no model was supplied — LiteLLM cannot infer one.
        """
        if not model:
            raise ValueError(
                "LiteLLM requires an explicit model in provider/model format "
                "(e.g. 'openai/gpt-4o', 'anthropic/claude-3-sonnet-20240229')"
            )

        response = self._litellm.completion(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )

        self._record_usage(response)
        return response.choices[0].message.content or ""

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
        """Analyze an image with a vision-capable model via LiteLLM.

        The image is sent inline as a base64 data URL (assumed JPEG).

        Raises:
            ValueError: if no model was supplied.
        """
        if not model:
            raise ValueError(
                "LiteLLM requires an explicit model for image analysis "
                "(e.g. 'openai/gpt-4o', 'anthropic/claude-3-sonnet-20240229')"
            )

        b64 = base64.b64encode(image_bytes).decode()

        response = self._litellm.completion(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                        },
                    ],
                }
            ],
            max_tokens=max_tokens,
        )

        self._record_usage(response)
        return response.choices[0].message.content or ""

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Transcribe an audio file via LiteLLM's transcription endpoint.

        Args:
            audio_path: path to the audio file.
            language: optional ISO language hint, forwarded to the provider.
            model: transcription model; defaults to 'whisper-1'.

        Returns:
            A dict with 'text', 'segments' (always empty here), 'language',
            'duration' (always None here), 'provider' and 'model' keys.

        Raises:
            NotImplementedError: if the underlying provider call fails; the
                original exception is chained and its message included.
        """
        model = model or "whisper-1"

        try:
            with open(audio_path, "rb") as f:
                response = self._litellm.transcription(
                    model=model,
                    file=f,
                    language=language,
                )

            text = getattr(response, "text", str(response))
            # LiteLLM transcription responses carry no token usage.
            self._last_usage = {
                "input_tokens": 0,
                "output_tokens": 0,
            }

            return {
                "text": text,
                "segments": [],
                "language": language,
                "duration": None,
                "provider": "litellm",
                "model": model,
            }
        except Exception as err:
            # Preserve the real failure instead of discarding it: include its
            # message and chain it so the traceback shows the root cause.
            raise NotImplementedError(
                f"Audio transcription failed via LiteLLM: {err}. "
                "Ensure the underlying provider supports transcription."
            ) from err

    def list_models(self) -> list[ModelInfo]:
        """Return models LiteLLM knows about, or [] when unavailable.

        LiteLLM exposes a module-level 'model_list'; entries are coerced to
        strings and tagged with the 'chat' capability only.
        """
        try:
            model_list = getattr(self._litellm, "model_list", None)
            if model_list:
                return [
                    ModelInfo(
                        id=m if isinstance(m, str) else str(m),
                        provider="litellm",
                        display_name=m if isinstance(m, str) else str(m),
                        capabilities=["chat"],
                    )
                    for m in model_list
                ]
        except Exception as e:
            logger.warning(f"Failed to list LiteLLM models: {e}")
        return []
| 153 | |
| 154 | |
# Only register if litellm is importable
try:
    import litellm  # noqa: F401

    # LiteLLM fronts many providers, so there is no single API-key env var,
    # no model-id prefix to match on, and no sensible default models —
    # callers must always pass an explicit 'provider/model' identifier.
    ProviderRegistry.register(
        name="litellm",
        provider_class=LiteLLMProvider,
        env_var="",
        model_prefixes=[],
        default_models={
            "chat": "",
            "vision": "",
            "audio": "",
        },
    )
except ImportError:
    # litellm not installed — the provider simply stays unregistered.
    pass
+53
-56
| --- video_processor/providers/manager.py | ||
| +++ video_processor/providers/manager.py | ||
| @@ -4,27 +4,44 @@ | ||
| 4 | 4 | from pathlib import Path |
| 5 | 5 | from typing import Optional |
| 6 | 6 | |
| 7 | 7 | from dotenv import load_dotenv |
| 8 | 8 | |
| 9 | -from video_processor.providers.base import BaseProvider, ModelInfo | |
| 9 | +from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry | |
| 10 | 10 | from video_processor.providers.discovery import discover_available_models |
| 11 | 11 | from video_processor.utils.usage_tracker import UsageTracker |
| 12 | 12 | |
| 13 | 13 | load_dotenv() |
| 14 | 14 | logger = logging.getLogger(__name__) |
| 15 | + | |
| 16 | + | |
| 17 | +def _ensure_providers_registered() -> None: | |
| 18 | + """Import all built-in provider modules so they register themselves.""" | |
| 19 | + if ProviderRegistry.all_registered(): | |
| 20 | + return | |
| 21 | + # Each module registers itself on import via ProviderRegistry.register() | |
| 22 | + import video_processor.providers.anthropic_provider # noqa: F401 | |
| 23 | + import video_processor.providers.azure_provider # noqa: F401 | |
| 24 | + import video_processor.providers.cerebras_provider # noqa: F401 | |
| 25 | + import video_processor.providers.fireworks_provider # noqa: F401 | |
| 26 | + import video_processor.providers.gemini_provider # noqa: F401 | |
| 27 | + import video_processor.providers.ollama_provider # noqa: F401 | |
| 28 | + import video_processor.providers.openai_provider # noqa: F401 | |
| 29 | + import video_processor.providers.together_provider # noqa: F401 | |
| 30 | + import video_processor.providers.xai_provider # noqa: F401 | |
| 31 | + | |
| 15 | 32 | |
| 16 | 33 | # Default model preference rankings (tried in order) |
| 17 | 34 | _VISION_PREFERENCES = [ |
| 18 | 35 | ("gemini", "gemini-2.5-flash"), |
| 19 | - ("openai", "gpt-4o"), | |
| 20 | - ("anthropic", "claude-sonnet-4-5-20250929"), | |
| 36 | + ("openai", "gpt-4o-mini"), | |
| 37 | + ("anthropic", "claude-haiku-4-5-20251001"), | |
| 21 | 38 | ] |
| 22 | 39 | |
| 23 | 40 | _CHAT_PREFERENCES = [ |
| 24 | - ("anthropic", "claude-sonnet-4-5-20250929"), | |
| 25 | - ("openai", "gpt-4o"), | |
| 41 | + ("anthropic", "claude-haiku-4-5-20251001"), | |
| 42 | + ("openai", "gpt-4o-mini"), | |
| 26 | 43 | ("gemini", "gemini-2.5-flash"), |
| 27 | 44 | ] |
| 28 | 45 | |
| 29 | 46 | _TRANSCRIPTION_PREFERENCES = [ |
| 30 | 47 | ("openai", "whisper-1"), |
| @@ -57,10 +74,11 @@ | ||
| 57 | 74 | chat_model : override model for chat/LLM tasks |
| 58 | 75 | transcription_model : override model for transcription |
| 59 | 76 | provider : force all tasks to a single provider ('openai', 'anthropic', 'gemini') |
| 60 | 77 | auto : if True and no model specified, pick the best available |
| 61 | 78 | """ |
| 79 | + _ensure_providers_registered() | |
| 62 | 80 | self.auto = auto |
| 63 | 81 | self._providers: dict[str, BaseProvider] = {} |
| 64 | 82 | self._available_models: Optional[list[ModelInfo]] = None |
| 65 | 83 | self.usage = UsageTracker() |
| 66 | 84 | |
| @@ -79,67 +97,31 @@ | ||
| 79 | 97 | self._forced_provider = provider |
| 80 | 98 | |
| 81 | 99 | @staticmethod |
| 82 | 100 | def _default_for_provider(provider: str, capability: str) -> str: |
| 83 | 101 | """Return the default model for a provider/capability combo.""" |
| 84 | - defaults = { | |
| 85 | - "openai": {"chat": "gpt-4o", "vision": "gpt-4o", "audio": "whisper-1"}, | |
| 86 | - "anthropic": { | |
| 87 | - "chat": "claude-sonnet-4-5-20250929", | |
| 88 | - "vision": "claude-sonnet-4-5-20250929", | |
| 89 | - "audio": "", | |
| 90 | - }, | |
| 91 | - "gemini": { | |
| 92 | - "chat": "gemini-2.5-flash", | |
| 93 | - "vision": "gemini-2.5-flash", | |
| 94 | - "audio": "gemini-2.5-flash", | |
| 95 | - }, | |
| 96 | - "ollama": { | |
| 97 | - "chat": "", | |
| 98 | - "vision": "", | |
| 99 | - "audio": "", | |
| 100 | - }, | |
| 101 | - } | |
| 102 | - return defaults.get(provider, {}).get(capability, "") | |
| 102 | + defaults = ProviderRegistry.get_default_models(provider) | |
| 103 | + if defaults: | |
| 104 | + return defaults.get(capability, "") | |
| 105 | + # Fallback for unregistered providers | |
| 106 | + return "" | |
| 103 | 107 | |
| 104 | 108 | def _get_provider(self, provider_name: str) -> BaseProvider: |
| 105 | 109 | """Lazily initialize and cache a provider instance.""" |
| 106 | 110 | if provider_name not in self._providers: |
| 107 | - if provider_name == "openai": | |
| 108 | - from video_processor.providers.openai_provider import OpenAIProvider | |
| 109 | - | |
| 110 | - self._providers[provider_name] = OpenAIProvider() | |
| 111 | - elif provider_name == "anthropic": | |
| 112 | - from video_processor.providers.anthropic_provider import AnthropicProvider | |
| 113 | - | |
| 114 | - self._providers[provider_name] = AnthropicProvider() | |
| 115 | - elif provider_name == "gemini": | |
| 116 | - from video_processor.providers.gemini_provider import GeminiProvider | |
| 117 | - | |
| 118 | - self._providers[provider_name] = GeminiProvider() | |
| 119 | - elif provider_name == "ollama": | |
| 120 | - from video_processor.providers.ollama_provider import OllamaProvider | |
| 121 | - | |
| 122 | - self._providers[provider_name] = OllamaProvider() | |
| 123 | - else: | |
| 124 | - raise ValueError(f"Unknown provider: {provider_name}") | |
| 111 | + _ensure_providers_registered() | |
| 112 | + provider_class = ProviderRegistry.get(provider_name) | |
| 113 | + self._providers[provider_name] = provider_class() | |
| 125 | 114 | return self._providers[provider_name] |
| 126 | 115 | |
| 127 | 116 | def _provider_for_model(self, model_id: str) -> str: |
| 128 | 117 | """Infer the provider from a model id.""" |
| 129 | - if ( | |
| 130 | - model_id.startswith("gpt-") | |
| 131 | - or model_id.startswith("o1") | |
| 132 | - or model_id.startswith("o3") | |
| 133 | - or model_id.startswith("o4") | |
| 134 | - or model_id.startswith("whisper") | |
| 135 | - ): | |
| 136 | - return "openai" | |
| 137 | - if model_id.startswith("claude-"): | |
| 138 | - return "anthropic" | |
| 139 | - if model_id.startswith("gemini-"): | |
| 140 | - return "gemini" | |
| 118 | + _ensure_providers_registered() | |
| 119 | + # Check registry prefix matching first | |
| 120 | + provider_name = ProviderRegistry.get_by_model(model_id) | |
| 121 | + if provider_name: | |
| 122 | + return provider_name | |
| 141 | 123 | # Try discovery (exact match, then prefix match for ollama name:tag format) |
| 142 | 124 | models = self._get_available_models() |
| 143 | 125 | for m in models: |
| 144 | 126 | if m.id == model_id: |
| 145 | 127 | return m.provider |
| @@ -238,10 +220,11 @@ | ||
| 238 | 220 | |
| 239 | 221 | def transcribe_audio( |
| 240 | 222 | self, |
| 241 | 223 | audio_path: str | Path, |
| 242 | 224 | language: Optional[str] = None, |
| 225 | + speaker_hints: Optional[list[str]] = None, | |
| 243 | 226 | ) -> dict: |
| 244 | 227 | """Transcribe audio using local Whisper if available, otherwise API.""" |
| 245 | 228 | # Prefer local Whisper — no file size limits, no API costs |
| 246 | 229 | if not self.transcription_model or self.transcription_model.startswith("whisper-local"): |
| 247 | 230 | try: |
| @@ -253,11 +236,17 @@ | ||
| 253 | 236 | if self.transcription_model and ":" in self.transcription_model: |
| 254 | 237 | size = self.transcription_model.split(":", 1)[1] |
| 255 | 238 | if not hasattr(self, "_whisper_local"): |
| 256 | 239 | self._whisper_local = WhisperLocal(model_size=size) |
| 257 | 240 | logger.info(f"Transcription: using local whisper-{size}") |
| 258 | - result = self._whisper_local.transcribe(audio_path, language=language) | |
| 241 | + # Pass speaker names as initial prompt hint for Whisper | |
| 242 | + whisper_kwargs = {"language": language} | |
| 243 | + if speaker_hints: | |
| 244 | + whisper_kwargs["initial_prompt"] = ( | |
| 245 | + "Speakers: " + ", ".join(speaker_hints) + "." | |
| 246 | + ) | |
| 247 | + result = self._whisper_local.transcribe(audio_path, **whisper_kwargs) | |
| 259 | 248 | duration = result.get("duration") or 0 |
| 260 | 249 | self.usage.record( |
| 261 | 250 | provider="local", |
| 262 | 251 | model=f"whisper-{size}", |
| 263 | 252 | audio_minutes=duration / 60 if duration else 0, |
| @@ -270,11 +259,19 @@ | ||
| 270 | 259 | prov_name, model = self._resolve_model( |
| 271 | 260 | self.transcription_model, "audio", _TRANSCRIPTION_PREFERENCES |
| 272 | 261 | ) |
| 273 | 262 | logger.info(f"Transcription: using {prov_name}/{model}") |
| 274 | 263 | provider = self._get_provider(prov_name) |
| 275 | - result = provider.transcribe_audio(audio_path, language=language, model=model) | |
| 264 | + # Build transcription kwargs, passing speaker hints where supported | |
| 265 | + transcribe_kwargs: dict = {"language": language, "model": model} | |
| 266 | + if speaker_hints: | |
| 267 | + if prov_name == "openai": | |
| 268 | + # OpenAI Whisper supports a 'prompt' parameter for hints | |
| 269 | + transcribe_kwargs["prompt"] = "Speakers: " + ", ".join(speaker_hints) + "." | |
| 270 | + else: | |
| 271 | + transcribe_kwargs["speaker_hints"] = speaker_hints | |
| 272 | + result = provider.transcribe_audio(audio_path, **transcribe_kwargs) | |
| 276 | 273 | duration = result.get("duration") or 0 |
| 277 | 274 | self.usage.record( |
| 278 | 275 | provider=prov_name, |
| 279 | 276 | model=model, |
| 280 | 277 | audio_minutes=duration / 60 if duration else 0, |
| 281 | 278 | |
| 282 | 279 | ADDED video_processor/providers/mistral_provider.py |
| --- video_processor/providers/manager.py | |
| +++ video_processor/providers/manager.py | |
| @@ -4,27 +4,44 @@ | |
| 4 | from pathlib import Path |
| 5 | from typing import Optional |
| 6 | |
| 7 | from dotenv import load_dotenv |
| 8 | |
| 9 | from video_processor.providers.base import BaseProvider, ModelInfo |
| 10 | from video_processor.providers.discovery import discover_available_models |
| 11 | from video_processor.utils.usage_tracker import UsageTracker |
| 12 | |
| 13 | load_dotenv() |
| 14 | logger = logging.getLogger(__name__) |
| 15 | |
| 16 | # Default model preference rankings (tried in order) |
| 17 | _VISION_PREFERENCES = [ |
| 18 | ("gemini", "gemini-2.5-flash"), |
| 19 | ("openai", "gpt-4o"), |
| 20 | ("anthropic", "claude-sonnet-4-5-20250929"), |
| 21 | ] |
| 22 | |
| 23 | _CHAT_PREFERENCES = [ |
| 24 | ("anthropic", "claude-sonnet-4-5-20250929"), |
| 25 | ("openai", "gpt-4o"), |
| 26 | ("gemini", "gemini-2.5-flash"), |
| 27 | ] |
| 28 | |
| 29 | _TRANSCRIPTION_PREFERENCES = [ |
| 30 | ("openai", "whisper-1"), |
| @@ -57,10 +74,11 @@ | |
| 57 | chat_model : override model for chat/LLM tasks |
| 58 | transcription_model : override model for transcription |
| 59 | provider : force all tasks to a single provider ('openai', 'anthropic', 'gemini') |
| 60 | auto : if True and no model specified, pick the best available |
| 61 | """ |
| 62 | self.auto = auto |
| 63 | self._providers: dict[str, BaseProvider] = {} |
| 64 | self._available_models: Optional[list[ModelInfo]] = None |
| 65 | self.usage = UsageTracker() |
| 66 | |
| @@ -79,67 +97,31 @@ | |
| 79 | self._forced_provider = provider |
| 80 | |
| 81 | @staticmethod |
| 82 | def _default_for_provider(provider: str, capability: str) -> str: |
| 83 | """Return the default model for a provider/capability combo.""" |
| 84 | defaults = { |
| 85 | "openai": {"chat": "gpt-4o", "vision": "gpt-4o", "audio": "whisper-1"}, |
| 86 | "anthropic": { |
| 87 | "chat": "claude-sonnet-4-5-20250929", |
| 88 | "vision": "claude-sonnet-4-5-20250929", |
| 89 | "audio": "", |
| 90 | }, |
| 91 | "gemini": { |
| 92 | "chat": "gemini-2.5-flash", |
| 93 | "vision": "gemini-2.5-flash", |
| 94 | "audio": "gemini-2.5-flash", |
| 95 | }, |
| 96 | "ollama": { |
| 97 | "chat": "", |
| 98 | "vision": "", |
| 99 | "audio": "", |
| 100 | }, |
| 101 | } |
| 102 | return defaults.get(provider, {}).get(capability, "") |
| 103 | |
| 104 | def _get_provider(self, provider_name: str) -> BaseProvider: |
| 105 | """Lazily initialize and cache a provider instance.""" |
| 106 | if provider_name not in self._providers: |
| 107 | if provider_name == "openai": |
| 108 | from video_processor.providers.openai_provider import OpenAIProvider |
| 109 | |
| 110 | self._providers[provider_name] = OpenAIProvider() |
| 111 | elif provider_name == "anthropic": |
| 112 | from video_processor.providers.anthropic_provider import AnthropicProvider |
| 113 | |
| 114 | self._providers[provider_name] = AnthropicProvider() |
| 115 | elif provider_name == "gemini": |
| 116 | from video_processor.providers.gemini_provider import GeminiProvider |
| 117 | |
| 118 | self._providers[provider_name] = GeminiProvider() |
| 119 | elif provider_name == "ollama": |
| 120 | from video_processor.providers.ollama_provider import OllamaProvider |
| 121 | |
| 122 | self._providers[provider_name] = OllamaProvider() |
| 123 | else: |
| 124 | raise ValueError(f"Unknown provider: {provider_name}") |
| 125 | return self._providers[provider_name] |
| 126 | |
| 127 | def _provider_for_model(self, model_id: str) -> str: |
| 128 | """Infer the provider from a model id.""" |
| 129 | if ( |
| 130 | model_id.startswith("gpt-") |
| 131 | or model_id.startswith("o1") |
| 132 | or model_id.startswith("o3") |
| 133 | or model_id.startswith("o4") |
| 134 | or model_id.startswith("whisper") |
| 135 | ): |
| 136 | return "openai" |
| 137 | if model_id.startswith("claude-"): |
| 138 | return "anthropic" |
| 139 | if model_id.startswith("gemini-"): |
| 140 | return "gemini" |
| 141 | # Try discovery (exact match, then prefix match for ollama name:tag format) |
| 142 | models = self._get_available_models() |
| 143 | for m in models: |
| 144 | if m.id == model_id: |
| 145 | return m.provider |
| @@ -238,10 +220,11 @@ | |
| 238 | |
| 239 | def transcribe_audio( |
| 240 | self, |
| 241 | audio_path: str | Path, |
| 242 | language: Optional[str] = None, |
| 243 | ) -> dict: |
| 244 | """Transcribe audio using local Whisper if available, otherwise API.""" |
| 245 | # Prefer local Whisper — no file size limits, no API costs |
| 246 | if not self.transcription_model or self.transcription_model.startswith("whisper-local"): |
| 247 | try: |
| @@ -253,11 +236,17 @@ | |
| 253 | if self.transcription_model and ":" in self.transcription_model: |
| 254 | size = self.transcription_model.split(":", 1)[1] |
| 255 | if not hasattr(self, "_whisper_local"): |
| 256 | self._whisper_local = WhisperLocal(model_size=size) |
| 257 | logger.info(f"Transcription: using local whisper-{size}") |
| 258 | result = self._whisper_local.transcribe(audio_path, language=language) |
| 259 | duration = result.get("duration") or 0 |
| 260 | self.usage.record( |
| 261 | provider="local", |
| 262 | model=f"whisper-{size}", |
| 263 | audio_minutes=duration / 60 if duration else 0, |
| @@ -270,11 +259,19 @@ | |
| 270 | prov_name, model = self._resolve_model( |
| 271 | self.transcription_model, "audio", _TRANSCRIPTION_PREFERENCES |
| 272 | ) |
| 273 | logger.info(f"Transcription: using {prov_name}/{model}") |
| 274 | provider = self._get_provider(prov_name) |
| 275 | result = provider.transcribe_audio(audio_path, language=language, model=model) |
| 276 | duration = result.get("duration") or 0 |
| 277 | self.usage.record( |
| 278 | provider=prov_name, |
| 279 | model=model, |
| 280 | audio_minutes=duration / 60 if duration else 0, |
| 281 | |
| 282 | DDED video_processor/providers/mistral_provider.py |
| --- video_processor/providers/manager.py | |
| +++ video_processor/providers/manager.py | |
| @@ -4,27 +4,44 @@ | |
| 4 | from pathlib import Path |
| 5 | from typing import Optional |
| 6 | |
| 7 | from dotenv import load_dotenv |
| 8 | |
| 9 | from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry |
| 10 | from video_processor.providers.discovery import discover_available_models |
| 11 | from video_processor.utils.usage_tracker import UsageTracker |
| 12 | |
| 13 | load_dotenv() |
| 14 | logger = logging.getLogger(__name__) |
| 15 | |
| 16 | |
def _ensure_providers_registered() -> None:
    """Import all built-in provider modules so they register themselves.

    Each module calls ProviderRegistry.register() as an import side effect.
    Safe to call repeatedly: returns early once everything is registered.
    """
    if ProviderRegistry.all_registered():
        return
    # Each module registers itself on import via ProviderRegistry.register()
    import video_processor.providers.anthropic_provider  # noqa: F401
    import video_processor.providers.azure_provider  # noqa: F401
    import video_processor.providers.cerebras_provider  # noqa: F401
    import video_processor.providers.fireworks_provider  # noqa: F401
    import video_processor.providers.gemini_provider  # noqa: F401

    # litellm/mistral guard their registration behind optional-package
    # try/except blocks, so importing these modules is always safe even when
    # the SDKs are absent; without these imports they would never register.
    import video_processor.providers.litellm_provider  # noqa: F401
    import video_processor.providers.mistral_provider  # noqa: F401
    import video_processor.providers.ollama_provider  # noqa: F401
    import video_processor.providers.openai_provider  # noqa: F401
    import video_processor.providers.together_provider  # noqa: F401
    import video_processor.providers.xai_provider  # noqa: F401
| 31 | |
| 32 | |
| 33 | # Default model preference rankings (tried in order) |
| 34 | _VISION_PREFERENCES = [ |
| 35 | ("gemini", "gemini-2.5-flash"), |
| 36 | ("openai", "gpt-4o-mini"), |
| 37 | ("anthropic", "claude-haiku-4-5-20251001"), |
| 38 | ] |
| 39 | |
| 40 | _CHAT_PREFERENCES = [ |
| 41 | ("anthropic", "claude-haiku-4-5-20251001"), |
| 42 | ("openai", "gpt-4o-mini"), |
| 43 | ("gemini", "gemini-2.5-flash"), |
| 44 | ] |
| 45 | |
| 46 | _TRANSCRIPTION_PREFERENCES = [ |
| 47 | ("openai", "whisper-1"), |
| @@ -57,10 +74,11 @@ | |
| 74 | chat_model : override model for chat/LLM tasks |
| 75 | transcription_model : override model for transcription |
| 76 | provider : force all tasks to a single provider ('openai', 'anthropic', 'gemini') |
| 77 | auto : if True and no model specified, pick the best available |
| 78 | """ |
| 79 | _ensure_providers_registered() |
| 80 | self.auto = auto |
| 81 | self._providers: dict[str, BaseProvider] = {} |
| 82 | self._available_models: Optional[list[ModelInfo]] = None |
| 83 | self.usage = UsageTracker() |
| 84 | |
| @@ -79,67 +97,31 @@ | |
| 97 | self._forced_provider = provider |
| 98 | |
| 99 | @staticmethod |
| 100 | def _default_for_provider(provider: str, capability: str) -> str: |
| 101 | """Return the default model for a provider/capability combo.""" |
| 102 | defaults = ProviderRegistry.get_default_models(provider) |
| 103 | if defaults: |
| 104 | return defaults.get(capability, "") |
| 105 | # Fallback for unregistered providers |
| 106 | return "" |
| 107 | |
| 108 | def _get_provider(self, provider_name: str) -> BaseProvider: |
| 109 | """Lazily initialize and cache a provider instance.""" |
| 110 | if provider_name not in self._providers: |
| 111 | _ensure_providers_registered() |
| 112 | provider_class = ProviderRegistry.get(provider_name) |
| 113 | self._providers[provider_name] = provider_class() |
| 114 | return self._providers[provider_name] |
| 115 | |
| 116 | def _provider_for_model(self, model_id: str) -> str: |
| 117 | """Infer the provider from a model id.""" |
| 118 | _ensure_providers_registered() |
| 119 | # Check registry prefix matching first |
| 120 | provider_name = ProviderRegistry.get_by_model(model_id) |
| 121 | if provider_name: |
| 122 | return provider_name |
| 123 | # Try discovery (exact match, then prefix match for ollama name:tag format) |
| 124 | models = self._get_available_models() |
| 125 | for m in models: |
| 126 | if m.id == model_id: |
| 127 | return m.provider |
| @@ -238,10 +220,11 @@ | |
| 220 | |
| 221 | def transcribe_audio( |
| 222 | self, |
| 223 | audio_path: str | Path, |
| 224 | language: Optional[str] = None, |
| 225 | speaker_hints: Optional[list[str]] = None, |
| 226 | ) -> dict: |
| 227 | """Transcribe audio using local Whisper if available, otherwise API.""" |
| 228 | # Prefer local Whisper — no file size limits, no API costs |
| 229 | if not self.transcription_model or self.transcription_model.startswith("whisper-local"): |
| 230 | try: |
| @@ -253,11 +236,17 @@ | |
| 236 | if self.transcription_model and ":" in self.transcription_model: |
| 237 | size = self.transcription_model.split(":", 1)[1] |
| 238 | if not hasattr(self, "_whisper_local"): |
| 239 | self._whisper_local = WhisperLocal(model_size=size) |
| 240 | logger.info(f"Transcription: using local whisper-{size}") |
| 241 | # Pass speaker names as initial prompt hint for Whisper |
| 242 | whisper_kwargs = {"language": language} |
| 243 | if speaker_hints: |
| 244 | whisper_kwargs["initial_prompt"] = ( |
| 245 | "Speakers: " + ", ".join(speaker_hints) + "." |
| 246 | ) |
| 247 | result = self._whisper_local.transcribe(audio_path, **whisper_kwargs) |
| 248 | duration = result.get("duration") or 0 |
| 249 | self.usage.record( |
| 250 | provider="local", |
| 251 | model=f"whisper-{size}", |
| 252 | audio_minutes=duration / 60 if duration else 0, |
| @@ -270,11 +259,19 @@ | |
| 259 | prov_name, model = self._resolve_model( |
| 260 | self.transcription_model, "audio", _TRANSCRIPTION_PREFERENCES |
| 261 | ) |
| 262 | logger.info(f"Transcription: using {prov_name}/{model}") |
| 263 | provider = self._get_provider(prov_name) |
| 264 | # Build transcription kwargs, passing speaker hints where supported |
| 265 | transcribe_kwargs: dict = {"language": language, "model": model} |
| 266 | if speaker_hints: |
| 267 | if prov_name == "openai": |
| 268 | # OpenAI Whisper supports a 'prompt' parameter for hints |
| 269 | transcribe_kwargs["prompt"] = "Speakers: " + ", ".join(speaker_hints) + "." |
| 270 | else: |
| 271 | transcribe_kwargs["speaker_hints"] = speaker_hints |
| 272 | result = provider.transcribe_audio(audio_path, **transcribe_kwargs) |
| 273 | duration = result.get("duration") or 0 |
| 274 | self.usage.record( |
| 275 | provider=prov_name, |
| 276 | model=model, |
| 277 | audio_minutes=duration / 60 if duration else 0, |
| 278 | |
| 279 | DDED video_processor/providers/mistral_provider.py |
| --- a/video_processor/providers/mistral_provider.py | ||
| +++ b/video_processor/providers/mistral_provider.py | ||
| @@ -0,0 +1,167 @@ | ||
| 1 | +"""Mistral AI provider implementation.""" | |
| 2 | + | |
| 3 | +import base64 | |
| 4 | +import logging | |
| 5 | +import os | |
| 6 | +from pathlib import Path | |
| 7 | +from typing import Optional | |
| 8 | + | |
| 9 | +from dotenv import load_dotenv | |
| 10 | + | |
| 11 | +from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry | |
| 12 | + | |
| 13 | +load_dotenv() | |
| 14 | +logger = logging.getLogger(__name__) | |
| 15 | + | |
# Curated list of Mistral models.
# "mistral-*" entries are chat-only text models, "pixtral-*" entries add
# vision, and Codestral is the code-focused model. "-latest" alias IDs are
# used where available so new point releases are picked up automatically.
_MISTRAL_MODELS = [
    ModelInfo(
        id="mistral-large-latest",
        provider="mistral",
        display_name="Mistral Large",
        capabilities=["chat"],
    ),
    ModelInfo(
        id="mistral-medium-latest",
        provider="mistral",
        display_name="Mistral Medium",
        capabilities=["chat"],
    ),
    ModelInfo(
        id="mistral-small-latest",
        provider="mistral",
        display_name="Mistral Small",
        capabilities=["chat"],
    ),
    ModelInfo(
        id="open-mistral-nemo",
        provider="mistral",
        display_name="Mistral Nemo",
        capabilities=["chat"],
    ),
    ModelInfo(
        id="pixtral-large-latest",
        provider="mistral",
        display_name="Pixtral Large",
        capabilities=["chat", "vision"],
    ),
    ModelInfo(
        id="pixtral-12b-2409",
        provider="mistral",
        display_name="Pixtral 12B",
        capabilities=["chat", "vision"],
    ),
    ModelInfo(
        id="codestral-latest",
        provider="mistral",
        display_name="Codestral",
        capabilities=["chat"],
    ),
]
| 61 | + | |
| 62 | + | |
class MistralProvider(BaseProvider):
    """Mistral AI provider using the official ``mistralai`` SDK.

    Supports chat completions and, via the Pixtral models, image analysis.
    Mistral exposes no speech-to-text endpoint, so ``transcribe_audio``
    always raises ``NotImplementedError``.
    """

    provider_name = "mistral"

    def __init__(self, api_key: Optional[str] = None):
        """Create a Mistral client.

        Args:
            api_key: Explicit API key; falls back to the ``MISTRAL_API_KEY``
                environment variable.

        Raises:
            ImportError: If the ``mistralai`` package is not installed.
            ValueError: If no API key is available.
        """
        try:
            from mistralai import Mistral
        except ImportError as exc:
            # Chain the original exception so the import-failure detail survives.
            raise ImportError(
                "mistralai package not installed. Install with: pip install mistralai"
            ) from exc

        self._api_key = api_key or os.getenv("MISTRAL_API_KEY")
        if not self._api_key:
            raise ValueError("MISTRAL_API_KEY not set")

        self._client = Mistral(api_key=self._api_key)
        self._last_usage: dict = {}

    def _record_usage(self, response) -> None:
        """Store prompt/completion token counts from *response*.

        Tolerates responses whose ``usage`` object is absent or partial.
        """
        usage = getattr(response, "usage", None)
        self._last_usage = {
            "input_tokens": getattr(usage, "prompt_tokens", 0) if usage else 0,
            "output_tokens": getattr(usage, "completion_tokens", 0) if usage else 0,
        }

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Run a chat completion and return the assistant's reply text.

        Args:
            messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
            max_tokens: Maximum completion tokens.
            temperature: Sampling temperature.
            model: Model id; defaults to ``mistral-large-latest``.
        """
        model = model or "mistral-large-latest"

        response = self._client.chat.complete(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )

        self._record_usage(response)
        return response.choices[0].message.content or ""

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
        """Analyze an image with a Pixtral vision model.

        Args:
            image_bytes: Raw image data (sent to the API as a JPEG data URL).
            prompt: Instruction describing what to extract from the image.
            max_tokens: Maximum completion tokens.
            model: Model id; defaults to ``pixtral-large-latest``.
        """
        model = model or "pixtral-large-latest"
        b64 = base64.b64encode(image_bytes).decode()

        response = self._client.chat.complete(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                        },
                    ],
                }
            ],
            max_tokens=max_tokens,
        )

        self._record_usage(response)
        return response.choices[0].message.content or ""

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Unsupported: Mistral has no transcription API.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "Mistral does not provide a transcription API. "
            "Use OpenAI Whisper or Gemini for transcription."
        )

    def list_models(self) -> list[ModelInfo]:
        """Return a copy of the curated Mistral model catalogue."""
        return list(_MISTRAL_MODELS)
| 155 | + | |
| 156 | + | |
# Register this provider with the global registry at import time so it can be
# looked up by name; the listed prefixes route bare model ids here. The empty
# "audio" default reflects that Mistral has no transcription model.
ProviderRegistry.register(
    name="mistral",
    provider_class=MistralProvider,
    env_var="MISTRAL_API_KEY",
    model_prefixes=["mistral-", "pixtral-", "codestral-", "open-mistral-"],
    default_models={
        "chat": "mistral-large-latest",
        "vision": "pixtral-large-latest",
        "audio": "",  # unsupported
    },
)
| --- a/video_processor/providers/mistral_provider.py | |
| +++ b/video_processor/providers/mistral_provider.py | |
| @@ -0,0 +1,167 @@ | |
| --- a/video_processor/providers/mistral_provider.py | |
| +++ b/video_processor/providers/mistral_provider.py | |
| @@ -0,0 +1,167 @@ | |
| 1 | """Mistral AI provider implementation.""" |
| 2 | |
| 3 | import base64 |
| 4 | import logging |
| 5 | import os |
| 6 | from pathlib import Path |
| 7 | from typing import Optional |
| 8 | |
| 9 | from dotenv import load_dotenv |
| 10 | |
| 11 | from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry |
| 12 | |
| 13 | load_dotenv() |
| 14 | logger = logging.getLogger(__name__) |
| 15 | |
| 16 | # Curated list of Mistral models |
| 17 | _MISTRAL_MODELS = [ |
| 18 | ModelInfo( |
| 19 | id="mistral-large-latest", |
| 20 | provider="mistral", |
| 21 | display_name="Mistral Large", |
| 22 | capabilities=["chat"], |
| 23 | ), |
| 24 | ModelInfo( |
| 25 | id="mistral-medium-latest", |
| 26 | provider="mistral", |
| 27 | display_name="Mistral Medium", |
| 28 | capabilities=["chat"], |
| 29 | ), |
| 30 | ModelInfo( |
| 31 | id="mistral-small-latest", |
| 32 | provider="mistral", |
| 33 | display_name="Mistral Small", |
| 34 | capabilities=["chat"], |
| 35 | ), |
| 36 | ModelInfo( |
| 37 | id="open-mistral-nemo", |
| 38 | provider="mistral", |
| 39 | display_name="Mistral Nemo", |
| 40 | capabilities=["chat"], |
| 41 | ), |
| 42 | ModelInfo( |
| 43 | id="pixtral-large-latest", |
| 44 | provider="mistral", |
| 45 | display_name="Pixtral Large", |
| 46 | capabilities=["chat", "vision"], |
| 47 | ), |
| 48 | ModelInfo( |
| 49 | id="pixtral-12b-2409", |
| 50 | provider="mistral", |
| 51 | display_name="Pixtral 12B", |
| 52 | capabilities=["chat", "vision"], |
| 53 | ), |
| 54 | ModelInfo( |
| 55 | id="codestral-latest", |
| 56 | provider="mistral", |
| 57 | display_name="Codestral", |
| 58 | capabilities=["chat"], |
| 59 | ), |
| 60 | ] |
| 61 | |
| 62 | |
| 63 | class MistralProvider(BaseProvider): |
| 64 | """Mistral AI provider using the mistralai SDK.""" |
| 65 | |
| 66 | provider_name = "mistral" |
| 67 | |
| 68 | def __init__(self, api_key: Optional[str] = None): |
| 69 | try: |
| 70 | from mistralai import Mistral |
| 71 | except ImportError: |
| 72 | raise ImportError( |
| 73 | "mistralai package not installed. Install with: pip install mistralai" |
| 74 | ) |
| 75 | |
| 76 | self._api_key = api_key or os.getenv("MISTRAL_API_KEY") |
| 77 | if not self._api_key: |
| 78 | raise ValueError("MISTRAL_API_KEY not set") |
| 79 | |
| 80 | self._client = Mistral(api_key=self._api_key) |
| 81 | self._last_usage = {} |
| 82 | |
| 83 | def chat( |
| 84 | self, |
| 85 | messages: list[dict], |
| 86 | max_tokens: int = 4096, |
| 87 | temperature: float = 0.7, |
| 88 | model: Optional[str] = None, |
| 89 | ) -> str: |
| 90 | model = model or "mistral-large-latest" |
| 91 | |
| 92 | response = self._client.chat.complete( |
| 93 | model=model, |
| 94 | messages=messages, |
| 95 | max_tokens=max_tokens, |
| 96 | temperature=temperature, |
| 97 | ) |
| 98 | |
| 99 | self._last_usage = { |
| 100 | "input_tokens": getattr(response.usage, "prompt_tokens", 0) if response.usage else 0, |
| 101 | "output_tokens": getattr(response.usage, "completion_tokens", 0) |
| 102 | if response.usage |
| 103 | else 0, |
| 104 | } |
| 105 | return response.choices[0].message.content or "" |
| 106 | |
| 107 | def analyze_image( |
| 108 | self, |
| 109 | image_bytes: bytes, |
| 110 | prompt: str, |
| 111 | max_tokens: int = 4096, |
| 112 | model: Optional[str] = None, |
| 113 | ) -> str: |
| 114 | model = model or "pixtral-large-latest" |
| 115 | b64 = base64.b64encode(image_bytes).decode() |
| 116 | |
| 117 | response = self._client.chat.complete( |
| 118 | model=model, |
| 119 | messages=[ |
| 120 | { |
| 121 | "role": "user", |
| 122 | "content": [ |
| 123 | {"type": "text", "text": prompt}, |
| 124 | { |
| 125 | "type": "image_url", |
| 126 | "image_url": {"url": f"data:image/jpeg;base64,{b64}"}, |
| 127 | }, |
| 128 | ], |
| 129 | } |
| 130 | ], |
| 131 | max_tokens=max_tokens, |
| 132 | ) |
| 133 | |
| 134 | self._last_usage = { |
| 135 | "input_tokens": getattr(response.usage, "prompt_tokens", 0) if response.usage else 0, |
| 136 | "output_tokens": getattr(response.usage, "completion_tokens", 0) |
| 137 | if response.usage |
| 138 | else 0, |
| 139 | } |
| 140 | return response.choices[0].message.content or "" |
| 141 | |
| 142 | def transcribe_audio( |
| 143 | self, |
| 144 | audio_path: str | Path, |
| 145 | language: Optional[str] = None, |
| 146 | model: Optional[str] = None, |
| 147 | ) -> dict: |
| 148 | raise NotImplementedError( |
| 149 | "Mistral does not provide a transcription API. " |
| 150 | "Use OpenAI Whisper or Gemini for transcription." |
| 151 | ) |
| 152 | |
| 153 | def list_models(self) -> list[ModelInfo]: |
| 154 | return list(_MISTRAL_MODELS) |
| 155 | |
| 156 | |
| 157 | ProviderRegistry.register( |
| 158 | name="mistral", |
| 159 | provider_class=MistralProvider, |
| 160 | env_var="MISTRAL_API_KEY", |
| 161 | model_prefixes=["mistral-", "pixtral-", "codestral-", "open-mistral-"], |
| 162 | default_models={ |
| 163 | "chat": "mistral-large-latest", |
| 164 | "vision": "pixtral-large-latest", |
| 165 | "audio": "", |
| 166 | }, |
| 167 | ) |
| --- video_processor/providers/ollama_provider.py | ||
| +++ video_processor/providers/ollama_provider.py | ||
| @@ -7,11 +7,11 @@ | ||
| 7 | 7 | from typing import Optional |
| 8 | 8 | |
| 9 | 9 | import requests |
| 10 | 10 | from openai import OpenAI |
| 11 | 11 | |
| 12 | -from video_processor.providers.base import BaseProvider, ModelInfo | |
| 12 | +from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry | |
| 13 | 13 | |
| 14 | 14 | logger = logging.getLogger(__name__) |
| 15 | 15 | |
| 16 | 16 | # Known vision-capable model families (base name before the colon/tag) |
| 17 | 17 | _VISION_FAMILIES = { |
| @@ -168,5 +168,14 @@ | ||
| 168 | 168 | ) |
| 169 | 169 | ) |
| 170 | 170 | except Exception as e: |
| 171 | 171 | logger.warning(f"Failed to list Ollama models: {e}") |
| 172 | 172 | return sorted(models, key=lambda m: m.id) |
| 173 | + | |
| 174 | + | |
| 175 | +ProviderRegistry.register( | |
| 176 | + name="ollama", | |
| 177 | + provider_class=OllamaProvider, | |
| 178 | + env_var="", | |
| 179 | + model_prefixes=[], | |
| 180 | + default_models={"chat": "", "vision": "", "audio": ""}, | |
| 181 | +) | |
| 173 | 182 |
| --- video_processor/providers/ollama_provider.py | |
| +++ video_processor/providers/ollama_provider.py | |
| @@ -7,11 +7,11 @@ | |
| 7 | from typing import Optional |
| 8 | |
| 9 | import requests |
| 10 | from openai import OpenAI |
| 11 | |
| 12 | from video_processor.providers.base import BaseProvider, ModelInfo |
| 13 | |
| 14 | logger = logging.getLogger(__name__) |
| 15 | |
| 16 | # Known vision-capable model families (base name before the colon/tag) |
| 17 | _VISION_FAMILIES = { |
| @@ -168,5 +168,14 @@ | |
| 168 | ) |
| 169 | ) |
| 170 | except Exception as e: |
| 171 | logger.warning(f"Failed to list Ollama models: {e}") |
| 172 | return sorted(models, key=lambda m: m.id) |
| 173 |
| --- video_processor/providers/ollama_provider.py | |
| +++ video_processor/providers/ollama_provider.py | |
| @@ -7,11 +7,11 @@ | |
| 7 | from typing import Optional |
| 8 | |
| 9 | import requests |
| 10 | from openai import OpenAI |
| 11 | |
| 12 | from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry |
| 13 | |
| 14 | logger = logging.getLogger(__name__) |
| 15 | |
| 16 | # Known vision-capable model families (base name before the colon/tag) |
| 17 | _VISION_FAMILIES = { |
| @@ -168,5 +168,14 @@ | |
| 168 | ) |
| 169 | ) |
| 170 | except Exception as e: |
| 171 | logger.warning(f"Failed to list Ollama models: {e}") |
| 172 | return sorted(models, key=lambda m: m.id) |
| 173 | |
| 174 | |
| 175 | ProviderRegistry.register( |
| 176 | name="ollama", |
| 177 | provider_class=OllamaProvider, |
| 178 | env_var="", |
| 179 | model_prefixes=[], |
| 180 | default_models={"chat": "", "vision": "", "audio": ""}, |
| 181 | ) |
| 182 |
| --- video_processor/providers/openai_provider.py | ||
| +++ video_processor/providers/openai_provider.py | ||
| @@ -7,11 +7,11 @@ | ||
| 7 | 7 | from typing import Optional |
| 8 | 8 | |
| 9 | 9 | from dotenv import load_dotenv |
| 10 | 10 | from openai import OpenAI |
| 11 | 11 | |
| 12 | -from video_processor.providers.base import BaseProvider, ModelInfo | |
| 12 | +from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry | |
| 13 | 13 | |
| 14 | 14 | load_dotenv() |
| 15 | 15 | logger = logging.getLogger(__name__) |
| 16 | 16 | |
| 17 | 17 | # Models known to have vision capability |
| @@ -46,11 +46,11 @@ | ||
| 46 | 46 | messages: list[dict], |
| 47 | 47 | max_tokens: int = 4096, |
| 48 | 48 | temperature: float = 0.7, |
| 49 | 49 | model: Optional[str] = None, |
| 50 | 50 | ) -> str: |
| 51 | - model = model or "gpt-4o" | |
| 51 | + model = model or "gpt-4o-mini" | |
| 52 | 52 | response = self.client.chat.completions.create( |
| 53 | 53 | model=model, |
| 54 | 54 | messages=messages, |
| 55 | 55 | max_tokens=max_tokens, |
| 56 | 56 | temperature=temperature, |
| @@ -68,11 +68,11 @@ | ||
| 68 | 68 | image_bytes: bytes, |
| 69 | 69 | prompt: str, |
| 70 | 70 | max_tokens: int = 4096, |
| 71 | 71 | model: Optional[str] = None, |
| 72 | 72 | ) -> str: |
| 73 | - model = model or "gpt-4o" | |
| 73 | + model = model or "gpt-4o-mini" | |
| 74 | 74 | b64 = base64.b64encode(image_bytes).decode() |
| 75 | 75 | response = self.client.chat.completions.create( |
| 76 | 76 | model=model, |
| 77 | 77 | messages=[ |
| 78 | 78 | { |
| @@ -225,5 +225,14 @@ | ||
| 225 | 225 | ) |
| 226 | 226 | ) |
| 227 | 227 | except Exception as e: |
| 228 | 228 | logger.warning(f"Failed to list OpenAI models: {e}") |
| 229 | 229 | return sorted(models, key=lambda m: m.id) |
| 230 | + | |
| 231 | + | |
| 232 | +ProviderRegistry.register( | |
| 233 | + name="openai", | |
| 234 | + provider_class=OpenAIProvider, | |
| 235 | + env_var="OPENAI_API_KEY", | |
| 236 | + model_prefixes=["gpt-", "o1", "o3", "o4", "whisper"], | |
| 237 | + default_models={"chat": "gpt-4o-mini", "vision": "gpt-4o-mini", "audio": "whisper-1"}, | |
| 238 | +) | |
| 230 | 239 | |
| 231 | 240 | ADDED video_processor/providers/qianfan_provider.py |
| 232 | 241 | ADDED video_processor/providers/together_provider.py |
| 233 | 242 | ADDED video_processor/providers/vertex_provider.py |
| 234 | 243 | ADDED video_processor/providers/xai_provider.py |
| --- video_processor/providers/openai_provider.py | |
| +++ video_processor/providers/openai_provider.py | |
| @@ -7,11 +7,11 @@ | |
| 7 | from typing import Optional |
| 8 | |
| 9 | from dotenv import load_dotenv |
| 10 | from openai import OpenAI |
| 11 | |
| 12 | from video_processor.providers.base import BaseProvider, ModelInfo |
| 13 | |
| 14 | load_dotenv() |
| 15 | logger = logging.getLogger(__name__) |
| 16 | |
| 17 | # Models known to have vision capability |
| @@ -46,11 +46,11 @@ | |
| 46 | messages: list[dict], |
| 47 | max_tokens: int = 4096, |
| 48 | temperature: float = 0.7, |
| 49 | model: Optional[str] = None, |
| 50 | ) -> str: |
| 51 | model = model or "gpt-4o" |
| 52 | response = self.client.chat.completions.create( |
| 53 | model=model, |
| 54 | messages=messages, |
| 55 | max_tokens=max_tokens, |
| 56 | temperature=temperature, |
| @@ -68,11 +68,11 @@ | |
| 68 | image_bytes: bytes, |
| 69 | prompt: str, |
| 70 | max_tokens: int = 4096, |
| 71 | model: Optional[str] = None, |
| 72 | ) -> str: |
| 73 | model = model or "gpt-4o" |
| 74 | b64 = base64.b64encode(image_bytes).decode() |
| 75 | response = self.client.chat.completions.create( |
| 76 | model=model, |
| 77 | messages=[ |
| 78 | { |
| @@ -225,5 +225,14 @@ | |
| 225 | ) |
| 226 | ) |
| 227 | except Exception as e: |
| 228 | logger.warning(f"Failed to list OpenAI models: {e}") |
| 229 | return sorted(models, key=lambda m: m.id) |
| 230 | |
| 231 | ADDED video_processor/providers/qianfan_provider.py |
| 232 | ADDED video_processor/providers/together_provider.py |
| 233 | ADDED video_processor/providers/vertex_provider.py |
| 234 | ADDED video_processor/providers/xai_provider.py |
| --- video_processor/providers/openai_provider.py | |
| +++ video_processor/providers/openai_provider.py | |
| @@ -7,11 +7,11 @@ | |
| 7 | from typing import Optional |
| 8 | |
| 9 | from dotenv import load_dotenv |
| 10 | from openai import OpenAI |
| 11 | |
| 12 | from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry |
| 13 | |
| 14 | load_dotenv() |
| 15 | logger = logging.getLogger(__name__) |
| 16 | |
| 17 | # Models known to have vision capability |
| @@ -46,11 +46,11 @@ | |
| 46 | messages: list[dict], |
| 47 | max_tokens: int = 4096, |
| 48 | temperature: float = 0.7, |
| 49 | model: Optional[str] = None, |
| 50 | ) -> str: |
| 51 | model = model or "gpt-4o-mini" |
| 52 | response = self.client.chat.completions.create( |
| 53 | model=model, |
| 54 | messages=messages, |
| 55 | max_tokens=max_tokens, |
| 56 | temperature=temperature, |
| @@ -68,11 +68,11 @@ | |
| 68 | image_bytes: bytes, |
| 69 | prompt: str, |
| 70 | max_tokens: int = 4096, |
| 71 | model: Optional[str] = None, |
| 72 | ) -> str: |
| 73 | model = model or "gpt-4o-mini" |
| 74 | b64 = base64.b64encode(image_bytes).decode() |
| 75 | response = self.client.chat.completions.create( |
| 76 | model=model, |
| 77 | messages=[ |
| 78 | { |
| @@ -225,5 +225,14 @@ | |
| 225 | ) |
| 226 | ) |
| 227 | except Exception as e: |
| 228 | logger.warning(f"Failed to list OpenAI models: {e}") |
| 229 | return sorted(models, key=lambda m: m.id) |
| 230 | |
| 231 | |
| 232 | ProviderRegistry.register( |
| 233 | name="openai", |
| 234 | provider_class=OpenAIProvider, |
| 235 | env_var="OPENAI_API_KEY", |
| 236 | model_prefixes=["gpt-", "o1", "o3", "o4", "whisper"], |
| 237 | default_models={"chat": "gpt-4o-mini", "vision": "gpt-4o-mini", "audio": "whisper-1"}, |
| 238 | ) |
| 239 | |
| 240 | ADDED video_processor/providers/qianfan_provider.py |
| 241 | ADDED video_processor/providers/together_provider.py |
| 242 | ADDED video_processor/providers/vertex_provider.py |
| 243 | ADDED video_processor/providers/xai_provider.py |
| --- a/video_processor/providers/qianfan_provider.py | ||
| +++ b/video_processor/providers/qianfan_provider.py | ||
| @@ -0,0 +1,138 @@ | ||
| 1 | +"""Baidu Qianfan (ERNIE) provider implementation.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import os | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import Optional | |
| 7 | + | |
| 8 | +from dotenv import load_dotenv | |
| 9 | + | |
| 10 | +from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry | |
| 11 | + | |
| 12 | +load_dotenv() | |
| 13 | +logger = logging.getLogger(__name__) | |
| 14 | + | |
# Curated list of Qianfan models: (id, display name); all are chat-only.
_QIANFAN_MODELS = [
    ModelInfo(id=model_id, provider="qianfan", display_name=label, capabilities=["chat"])
    for model_id, label in (
        ("ernie-4.0-8k", "ERNIE 4.0 8K"),
        ("ernie-3.5-8k", "ERNIE 3.5 8K"),
        ("ernie-speed-8k", "ERNIE Speed 8K"),
        ("ernie-lite-8k", "ERNIE Lite 8K"),
    )
]
| 42 | + | |
| 43 | + | |
class QianfanProvider(BaseProvider):
    """Baidu Qianfan (ERNIE) provider using the ``qianfan`` SDK.

    Only text chat is implemented; image analysis and audio transcription
    are unsupported and raise ``NotImplementedError``.
    """

    provider_name = "qianfan"

    def __init__(
        self,
        access_key: Optional[str] = None,
        secret_key: Optional[str] = None,
    ):
        """Create the provider and validate credentials.

        Args:
            access_key: Qianfan access key; falls back to ``QIANFAN_ACCESS_KEY``.
            secret_key: Qianfan secret key; falls back to ``QIANFAN_SECRET_KEY``.

        Raises:
            ImportError: If the ``qianfan`` package is not installed.
            ValueError: If either credential is missing.
        """
        try:
            import qianfan
        except ImportError as exc:
            # Chain the original exception so the import-failure detail survives.
            raise ImportError(
                "qianfan package not installed. Install with: pip install qianfan"
            ) from exc

        self._access_key = access_key or os.getenv("QIANFAN_ACCESS_KEY")
        self._secret_key = secret_key or os.getenv("QIANFAN_SECRET_KEY")

        if not self._access_key or not self._secret_key:
            raise ValueError("QIANFAN_ACCESS_KEY and QIANFAN_SECRET_KEY must both be set")

        # The SDK reads credentials from the environment, so export them there.
        os.environ["QIANFAN_ACCESS_KEY"] = self._access_key
        os.environ["QIANFAN_SECRET_KEY"] = self._secret_key

        self._qianfan = qianfan
        self._last_usage: dict = {}

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Run an ERNIE chat completion and return the response text.

        Accepts model ids with or without the ``qianfan/`` routing prefix.

        Args:
            messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
            max_tokens: Maximum output tokens (SDK ``max_output_tokens``).
            temperature: Sampling temperature.
            model: Model id; defaults to ``ernie-4.0-8k``.
        """
        model = (model or "ernie-4.0-8k").removeprefix("qianfan/")

        chat_comp = self._qianfan.ChatCompletion()
        response = chat_comp.do(
            model=model,
            messages=messages,
            temperature=temperature,
            max_output_tokens=max_tokens,
        )

        # The SDK response is mapping-like; degrade gracefully if it is not.
        body = response.get("body", response) if hasattr(response, "get") else response
        usage = body.get("usage", {}) if hasattr(body, "get") else {}
        self._last_usage = {
            "input_tokens": usage.get("prompt_tokens", 0),
            "output_tokens": usage.get("completion_tokens", 0),
        }

        return body.get("result", "") if hasattr(body, "get") else str(body)

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
        """Unsupported: this provider does not implement image analysis.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "Qianfan image analysis is not supported in this provider. "
            "Use OpenAI, Anthropic, or Gemini for image analysis."
        )

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Unsupported: this provider does not implement transcription.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "Qianfan does not provide a transcription API through this provider. "
            "Use OpenAI Whisper or Gemini for transcription."
        )

    def list_models(self) -> list[ModelInfo]:
        """Return a copy of the curated Qianfan model catalogue."""
        return list(_QIANFAN_MODELS)
| 126 | + | |
| 127 | + | |
# Register this provider with the global registry at import time. Vision and
# audio defaults are empty because only chat is supported here.
# NOTE(review): QIANFAN_SECRET_KEY is also required but only the access key is
# advertised via env_var — confirm the registry handles multi-var credentials.
ProviderRegistry.register(
    name="qianfan",
    provider_class=QianfanProvider,
    env_var="QIANFAN_ACCESS_KEY",
    model_prefixes=["ernie-", "qianfan/"],
    default_models={
        "chat": "ernie-4.0-8k",
        "vision": "",
        "audio": "",
    },
)
| --- a/video_processor/providers/qianfan_provider.py | |
| +++ b/video_processor/providers/qianfan_provider.py | |
| @@ -0,0 +1,138 @@ | |
| --- a/video_processor/providers/qianfan_provider.py | |
| +++ b/video_processor/providers/qianfan_provider.py | |
| @@ -0,0 +1,138 @@ | |
| 1 | """Baidu Qianfan (ERNIE) provider implementation.""" |
| 2 | |
| 3 | import logging |
| 4 | import os |
| 5 | from pathlib import Path |
| 6 | from typing import Optional |
| 7 | |
| 8 | from dotenv import load_dotenv |
| 9 | |
| 10 | from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry |
| 11 | |
| 12 | load_dotenv() |
| 13 | logger = logging.getLogger(__name__) |
| 14 | |
| 15 | # Curated list of Qianfan models |
| 16 | _QIANFAN_MODELS = [ |
| 17 | ModelInfo( |
| 18 | id="ernie-4.0-8k", |
| 19 | provider="qianfan", |
| 20 | display_name="ERNIE 4.0 8K", |
| 21 | capabilities=["chat"], |
| 22 | ), |
| 23 | ModelInfo( |
| 24 | id="ernie-3.5-8k", |
| 25 | provider="qianfan", |
| 26 | display_name="ERNIE 3.5 8K", |
| 27 | capabilities=["chat"], |
| 28 | ), |
| 29 | ModelInfo( |
| 30 | id="ernie-speed-8k", |
| 31 | provider="qianfan", |
| 32 | display_name="ERNIE Speed 8K", |
| 33 | capabilities=["chat"], |
| 34 | ), |
| 35 | ModelInfo( |
| 36 | id="ernie-lite-8k", |
| 37 | provider="qianfan", |
| 38 | display_name="ERNIE Lite 8K", |
| 39 | capabilities=["chat"], |
| 40 | ), |
| 41 | ] |
| 42 | |
| 43 | |
| 44 | class QianfanProvider(BaseProvider): |
| 45 | """Baidu Qianfan provider using the qianfan SDK.""" |
| 46 | |
| 47 | provider_name = "qianfan" |
| 48 | |
| 49 | def __init__( |
| 50 | self, |
| 51 | access_key: Optional[str] = None, |
| 52 | secret_key: Optional[str] = None, |
| 53 | ): |
| 54 | try: |
| 55 | import qianfan |
| 56 | except ImportError: |
| 57 | raise ImportError("qianfan package not installed. Install with: pip install qianfan") |
| 58 | |
| 59 | self._access_key = access_key or os.getenv("QIANFAN_ACCESS_KEY") |
| 60 | self._secret_key = secret_key or os.getenv("QIANFAN_SECRET_KEY") |
| 61 | |
| 62 | if not self._access_key or not self._secret_key: |
| 63 | raise ValueError("QIANFAN_ACCESS_KEY and QIANFAN_SECRET_KEY must both be set") |
| 64 | |
| 65 | # Set env vars for the SDK to pick up |
| 66 | os.environ["QIANFAN_ACCESS_KEY"] = self._access_key |
| 67 | os.environ["QIANFAN_SECRET_KEY"] = self._secret_key |
| 68 | |
| 69 | self._qianfan = qianfan |
| 70 | self._last_usage = {} |
| 71 | |
| 72 | def chat( |
| 73 | self, |
| 74 | messages: list[dict], |
| 75 | max_tokens: int = 4096, |
| 76 | temperature: float = 0.7, |
| 77 | model: Optional[str] = None, |
| 78 | ) -> str: |
| 79 | model = model or "ernie-4.0-8k" |
| 80 | if model.startswith("qianfan/"): |
| 81 | model = model[len("qianfan/") :] |
| 82 | |
| 83 | chat_comp = self._qianfan.ChatCompletion() |
| 84 | response = chat_comp.do( |
| 85 | model=model, |
| 86 | messages=messages, |
| 87 | temperature=temperature, |
| 88 | max_output_tokens=max_tokens, |
| 89 | ) |
| 90 | |
| 91 | body = response.get("body", response) if hasattr(response, "get") else response |
| 92 | usage = body.get("usage", {}) if hasattr(body, "get") else {} |
| 93 | self._last_usage = { |
| 94 | "input_tokens": usage.get("prompt_tokens", 0), |
| 95 | "output_tokens": usage.get("completion_tokens", 0), |
| 96 | } |
| 97 | |
| 98 | result = body.get("result", "") if hasattr(body, "get") else str(body) |
| 99 | return result |
| 100 | |
| 101 | def analyze_image( |
| 102 | self, |
| 103 | image_bytes: bytes, |
| 104 | prompt: str, |
| 105 | max_tokens: int = 4096, |
| 106 | model: Optional[str] = None, |
| 107 | ) -> str: |
| 108 | raise NotImplementedError( |
| 109 | "Qianfan image analysis is not supported in this provider. " |
| 110 | "Use OpenAI, Anthropic, or Gemini for image analysis." |
| 111 | ) |
| 112 | |
| 113 | def transcribe_audio( |
| 114 | self, |
| 115 | audio_path: str | Path, |
| 116 | language: Optional[str] = None, |
| 117 | model: Optional[str] = None, |
| 118 | ) -> dict: |
| 119 | raise NotImplementedError( |
| 120 | "Qianfan does not provide a transcription API through this provider. " |
| 121 | "Use OpenAI Whisper or Gemini for transcription." |
| 122 | ) |
| 123 | |
| 124 | def list_models(self) -> list[ModelInfo]: |
| 125 | return list(_QIANFAN_MODELS) |
| 126 | |
| 127 | |
| 128 | ProviderRegistry.register( |
| 129 | name="qianfan", |
| 130 | provider_class=QianfanProvider, |
| 131 | env_var="QIANFAN_ACCESS_KEY", |
| 132 | model_prefixes=["ernie-", "qianfan/"], |
| 133 | default_models={ |
| 134 | "chat": "ernie-4.0-8k", |
| 135 | "vision": "", |
| 136 | "audio": "", |
| 137 | }, |
| 138 | ) |
| --- a/video_processor/providers/together_provider.py | ||
| +++ b/video_processor/providers/together_provider.py | ||
| @@ -0,0 +1,20 @@ | ||
| 1 | +"""Together AI provider implementation.""" | |
| 2 | + | |
| 3 | +from video_processor.providers.base import OpenAICompatibleProvider, ProviderRegistry | |
| 4 | + | |
| 5 | + | |
class TogetherProvider(OpenAICompatibleProvider):
    """Together AI API provider (OpenAI-compatible).

    Declarative subclass: all behavior comes from ``OpenAICompatibleProvider``;
    only the endpoint and credential configuration differ.
    """

    # Registry name for this provider.
    provider_name = "together"
    # Root of Together's OpenAI-compatible REST endpoint.
    base_url = "https://api.together.xyz/v1"
    # Environment variable holding the API key.
    env_var = "TOGETHER_API_KEY"
| 12 | + | |
| 13 | + | |
# Register this provider with the global registry at import time; the prefixes
# route fully-qualified model ids (e.g. "meta-llama/...") to Together.
ProviderRegistry.register(
    name="together",
    provider_class=TogetherProvider,
    env_var="TOGETHER_API_KEY",
    model_prefixes=["together/", "meta-llama/", "mistralai/", "Qwen/"],
    default_models={"chat": "meta-llama/Llama-3-70b-chat-hf", "vision": "", "audio": ""},
)
| --- a/video_processor/providers/together_provider.py | |
| +++ b/video_processor/providers/together_provider.py | |
| @@ -0,0 +1,20 @@ | |
| --- a/video_processor/providers/together_provider.py | |
| +++ b/video_processor/providers/together_provider.py | |
| @@ -0,0 +1,20 @@ | |
| 1 | """Together AI provider implementation.""" |
| 2 | |
| 3 | from video_processor.providers.base import OpenAICompatibleProvider, ProviderRegistry |
| 4 | |
| 5 | |
| 6 | class TogetherProvider(OpenAICompatibleProvider): |
| 7 | """Together AI API provider (OpenAI-compatible).""" |
| 8 | |
| 9 | provider_name = "together" |
| 10 | base_url = "https://api.together.xyz/v1" |
| 11 | env_var = "TOGETHER_API_KEY" |
| 12 | |
| 13 | |
| 14 | ProviderRegistry.register( |
| 15 | name="together", |
| 16 | provider_class=TogetherProvider, |
| 17 | env_var="TOGETHER_API_KEY", |
| 18 | model_prefixes=["together/", "meta-llama/", "mistralai/", "Qwen/"], |
| 19 | default_models={"chat": "meta-llama/Llama-3-70b-chat-hf", "vision": "", "audio": ""}, |
| 20 | ) |
| --- a/video_processor/providers/vertex_provider.py | ||
| +++ b/video_processor/providers/vertex_provider.py | ||
| @@ -0,0 +1,226 @@ | ||
| 1 | +"""Google Vertex AI provider implementation.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import os | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import Optional | |
| 7 | + | |
| 8 | +from dotenv import load_dotenv | |
| 9 | + | |
| 10 | +from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry | |
| 11 | + | |
| 12 | +load_dotenv() | |
| 13 | +logger = logging.getLogger(__name__) | |
| 14 | + | |
# Curated list of models available on Vertex AI: (id, display name).
# Every Gemini model listed here carries chat, vision, and audio capabilities.
_VERTEX_MODELS = [
    ModelInfo(
        id=model_id,
        provider="vertex",
        display_name=label,
        capabilities=["chat", "vision", "audio"],
    )
    for model_id, label in (
        ("gemini-2.0-flash", "Gemini 2.0 Flash"),
        ("gemini-2.0-pro", "Gemini 2.0 Pro"),
        ("gemini-1.5-pro", "Gemini 1.5 Pro"),
        ("gemini-1.5-flash", "Gemini 1.5 Flash"),
    )
]
| 42 | + | |
| 43 | + | |
class VertexProvider(BaseProvider):
    """Google Vertex AI provider using google-genai SDK with Vertex config.

    Project and region are resolved from constructor arguments or the
    GOOGLE_CLOUD_PROJECT / GOOGLE_CLOUD_REGION environment variables;
    credentials come from the SDK's usual Application Default Credentials
    flow (not an API key).
    """

    provider_name = "vertex"

    def __init__(
        self,
        project: Optional[str] = None,
        location: Optional[str] = None,
    ):
        """Create a Vertex AI client.

        Args:
            project: GCP project id; defaults to $GOOGLE_CLOUD_PROJECT.
            location: Vertex region; defaults to $GOOGLE_CLOUD_REGION,
                then "us-central1".

        Raises:
            ImportError: if the google-genai SDK is not installed.
            ValueError: if no project id can be resolved.
        """
        try:
            from google import genai
            from google.genai import types  # noqa: F401
        except ImportError:
            raise ImportError(
                "google-cloud-aiplatform or google-genai package not installed. "
                "Install with: pip install google-cloud-aiplatform"
            )

        self._genai = genai
        self._project = project or os.getenv("GOOGLE_CLOUD_PROJECT")
        self._location = location or os.getenv("GOOGLE_CLOUD_REGION", "us-central1")

        if not self._project:
            raise ValueError("GOOGLE_CLOUD_PROJECT not set")

        self.client = genai.Client(
            vertexai=True,
            project=self._project,
            location=self._location,
        )
        # Token counts from the most recent API call; see _record_usage().
        self._last_usage = {}

    @staticmethod
    def _resolve_model(model: Optional[str]) -> str:
        """Default to gemini-2.0-flash and strip any 'vertex/' routing prefix."""
        model = model or "gemini-2.0-flash"
        if model.startswith("vertex/"):
            model = model[len("vertex/") :]
        return model

    def _record_usage(self, response) -> None:
        """Stash prompt/candidate token counts from *response* in self._last_usage."""
        um = getattr(response, "usage_metadata", None)
        self._last_usage = {
            "input_tokens": getattr(um, "prompt_token_count", 0) if um else 0,
            "output_tokens": getattr(um, "candidates_token_count", 0) if um else 0,
        }

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Run a multi-turn chat completion and return the response text."""
        from google.genai import types

        model = self._resolve_model(model)

        # Gemini only knows "user"/"model" roles; anything non-user
        # (assistant, system) is mapped to "model".
        contents = []
        for msg in messages:
            role = "user" if msg["role"] == "user" else "model"
            contents.append(
                types.Content(
                    role=role,
                    parts=[types.Part.from_text(text=msg["content"])],
                )
            )

        response = self.client.models.generate_content(
            model=model,
            contents=contents,
            config=types.GenerateContentConfig(
                max_output_tokens=max_tokens,
                temperature=temperature,
            ),
        )
        self._record_usage(response)
        return response.text or ""

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
        """Analyze a JPEG image according to *prompt*; returns the model's text."""
        from google.genai import types

        model = self._resolve_model(model)

        response = self.client.models.generate_content(
            model=model,
            contents=[
                types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
                prompt,
            ],
            config=types.GenerateContentConfig(
                max_output_tokens=max_tokens,
            ),
        )
        self._record_usage(response)
        return response.text or ""

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Transcribe an audio file via Gemini.

        Returns a dict with keys: text, segments, language, duration,
        provider, model.  `duration` is always None (Gemini does not
        report it).
        """
        import json

        from google.genai import types

        model = self._resolve_model(model)

        audio_path = Path(audio_path)
        mime_map = {
            ".wav": "audio/wav",
            ".mp3": "audio/mpeg",
            ".m4a": "audio/mp4",
            ".flac": "audio/flac",
            ".ogg": "audio/ogg",
            ".webm": "audio/webm",
        }
        mime_type = mime_map.get(audio_path.suffix.lower(), "audio/wav")
        audio_bytes = audio_path.read_bytes()

        lang_hint = f" The audio is in {language}." if language else ""
        # BUG FIX: lang_hint was computed but never interpolated into the
        # prompt (stray text had replaced the placeholder), so the language
        # parameter had no effect on transcription.
        prompt = (
            f"Transcribe this audio accurately.{lang_hint} "
            "Return a JSON object with keys: "
            '"text" (full transcript), '
            '"segments" (array of {start, end, text} objects with timestamps in seconds).'
        )

        response = self.client.models.generate_content(
            model=model,
            contents=[
                types.Part.from_bytes(data=audio_bytes, mime_type=mime_type),
                prompt,
            ],
            config=types.GenerateContentConfig(
                max_output_tokens=8192,
                response_mime_type="application/json",
            ),
        )

        try:
            data = json.loads(response.text)
        except (json.JSONDecodeError, TypeError):
            data = {"text": response.text or "", "segments": []}
        if not isinstance(data, dict):
            # The model may ignore instructions and return e.g. a bare list;
            # fall back to the raw text rather than crashing on .get().
            data = {"text": response.text or "", "segments": []}

        self._record_usage(response)

        return {
            "text": data.get("text", ""),
            "segments": data.get("segments", []),
            "language": language,
            "duration": None,
            "provider": "vertex",
            "model": model,
        }

    def list_models(self) -> list[ModelInfo]:
        """Return the curated set of Vertex models (static list, no API call)."""
        return list(_VERTEX_MODELS)
| 214 | + | |
| 215 | + | |
# Route "vertex/"-prefixed model ids here; Gemini 2.0 Flash serves as the
# default model for every capability.
ProviderRegistry.register(
    name="vertex",
    provider_class=VertexProvider,
    env_var="GOOGLE_CLOUD_PROJECT",
    model_prefixes=["vertex/"],
    default_models={capability: "gemini-2.0-flash" for capability in ("chat", "vision", "audio")},
)
| --- a/video_processor/providers/vertex_provider.py | |
| +++ b/video_processor/providers/vertex_provider.py | |
| @@ -0,0 +1,226 @@ | |
| --- a/video_processor/providers/vertex_provider.py | |
| +++ b/video_processor/providers/vertex_provider.py | |
| @@ -0,0 +1,226 @@ | |
"""Google Vertex AI provider implementation."""

import logging
import os
from pathlib import Path
from typing import Optional

from dotenv import load_dotenv

from video_processor.providers.base import BaseProvider, ModelInfo, ProviderRegistry

load_dotenv()
logger = logging.getLogger(__name__)

# Curated list of models available on Vertex AI.  Every listed Gemini model
# accepts text, image, and audio input, hence the identical capability sets.
_VERTEX_MODELS = [
    ModelInfo(
        id=model_id,
        provider="vertex",
        display_name=display,
        capabilities=["chat", "vision", "audio"],
    )
    for model_id, display in [
        ("gemini-2.0-flash", "Gemini 2.0 Flash"),
        ("gemini-2.0-pro", "Gemini 2.0 Pro"),
        ("gemini-1.5-pro", "Gemini 1.5 Pro"),
        ("gemini-1.5-flash", "Gemini 1.5 Flash"),
    ]
]
| 42 | |
| 43 | |
class VertexProvider(BaseProvider):
    """Google Vertex AI provider using google-genai SDK with Vertex config.

    Project and region are resolved from constructor arguments or the
    GOOGLE_CLOUD_PROJECT / GOOGLE_CLOUD_REGION environment variables;
    credentials come from the SDK's usual Application Default Credentials
    flow (not an API key).
    """

    provider_name = "vertex"

    def __init__(
        self,
        project: Optional[str] = None,
        location: Optional[str] = None,
    ):
        """Create a Vertex AI client.

        Args:
            project: GCP project id; defaults to $GOOGLE_CLOUD_PROJECT.
            location: Vertex region; defaults to $GOOGLE_CLOUD_REGION,
                then "us-central1".

        Raises:
            ImportError: if the google-genai SDK is not installed.
            ValueError: if no project id can be resolved.
        """
        try:
            from google import genai
            from google.genai import types  # noqa: F401
        except ImportError:
            raise ImportError(
                "google-cloud-aiplatform or google-genai package not installed. "
                "Install with: pip install google-cloud-aiplatform"
            )

        self._genai = genai
        self._project = project or os.getenv("GOOGLE_CLOUD_PROJECT")
        self._location = location or os.getenv("GOOGLE_CLOUD_REGION", "us-central1")

        if not self._project:
            raise ValueError("GOOGLE_CLOUD_PROJECT not set")

        self.client = genai.Client(
            vertexai=True,
            project=self._project,
            location=self._location,
        )
        # Token counts from the most recent API call; see _record_usage().
        self._last_usage = {}

    @staticmethod
    def _resolve_model(model: Optional[str]) -> str:
        """Default to gemini-2.0-flash and strip any 'vertex/' routing prefix."""
        model = model or "gemini-2.0-flash"
        if model.startswith("vertex/"):
            model = model[len("vertex/") :]
        return model

    def _record_usage(self, response) -> None:
        """Stash prompt/candidate token counts from *response* in self._last_usage."""
        um = getattr(response, "usage_metadata", None)
        self._last_usage = {
            "input_tokens": getattr(um, "prompt_token_count", 0) if um else 0,
            "output_tokens": getattr(um, "candidates_token_count", 0) if um else 0,
        }

    def chat(
        self,
        messages: list[dict],
        max_tokens: int = 4096,
        temperature: float = 0.7,
        model: Optional[str] = None,
    ) -> str:
        """Run a multi-turn chat completion and return the response text."""
        from google.genai import types

        model = self._resolve_model(model)

        # Gemini only knows "user"/"model" roles; anything non-user
        # (assistant, system) is mapped to "model".
        contents = []
        for msg in messages:
            role = "user" if msg["role"] == "user" else "model"
            contents.append(
                types.Content(
                    role=role,
                    parts=[types.Part.from_text(text=msg["content"])],
                )
            )

        response = self.client.models.generate_content(
            model=model,
            contents=contents,
            config=types.GenerateContentConfig(
                max_output_tokens=max_tokens,
                temperature=temperature,
            ),
        )
        self._record_usage(response)
        return response.text or ""

    def analyze_image(
        self,
        image_bytes: bytes,
        prompt: str,
        max_tokens: int = 4096,
        model: Optional[str] = None,
    ) -> str:
        """Analyze a JPEG image according to *prompt*; returns the model's text."""
        from google.genai import types

        model = self._resolve_model(model)

        response = self.client.models.generate_content(
            model=model,
            contents=[
                types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
                prompt,
            ],
            config=types.GenerateContentConfig(
                max_output_tokens=max_tokens,
            ),
        )
        self._record_usage(response)
        return response.text or ""

    def transcribe_audio(
        self,
        audio_path: str | Path,
        language: Optional[str] = None,
        model: Optional[str] = None,
    ) -> dict:
        """Transcribe an audio file via Gemini.

        Returns a dict with keys: text, segments, language, duration,
        provider, model.  `duration` is always None (Gemini does not
        report it).
        """
        import json

        from google.genai import types

        model = self._resolve_model(model)

        audio_path = Path(audio_path)
        mime_map = {
            ".wav": "audio/wav",
            ".mp3": "audio/mpeg",
            ".m4a": "audio/mp4",
            ".flac": "audio/flac",
            ".ogg": "audio/ogg",
            ".webm": "audio/webm",
        }
        mime_type = mime_map.get(audio_path.suffix.lower(), "audio/wav")
        audio_bytes = audio_path.read_bytes()

        lang_hint = f" The audio is in {language}." if language else ""
        # BUG FIX: lang_hint was computed but never interpolated into the
        # prompt (stray text had replaced the placeholder), so the language
        # parameter had no effect on transcription.
        prompt = (
            f"Transcribe this audio accurately.{lang_hint} "
            "Return a JSON object with keys: "
            '"text" (full transcript), '
            '"segments" (array of {start, end, text} objects with timestamps in seconds).'
        )

        response = self.client.models.generate_content(
            model=model,
            contents=[
                types.Part.from_bytes(data=audio_bytes, mime_type=mime_type),
                prompt,
            ],
            config=types.GenerateContentConfig(
                max_output_tokens=8192,
                response_mime_type="application/json",
            ),
        )

        try:
            data = json.loads(response.text)
        except (json.JSONDecodeError, TypeError):
            data = {"text": response.text or "", "segments": []}
        if not isinstance(data, dict):
            # The model may ignore instructions and return e.g. a bare list;
            # fall back to the raw text rather than crashing on .get().
            data = {"text": response.text or "", "segments": []}

        self._record_usage(response)

        return {
            "text": data.get("text", ""),
            "segments": data.get("segments", []),
            "language": language,
            "duration": None,
            "provider": "vertex",
            "model": model,
        }

    def list_models(self) -> list[ModelInfo]:
        """Return the curated set of Vertex models (static list, no API call)."""
        return list(_VERTEX_MODELS)
| 214 | |
| 215 | |
# Route "vertex/"-prefixed model ids here; Gemini 2.0 Flash serves as the
# default model for every capability.
ProviderRegistry.register(
    name="vertex",
    provider_class=VertexProvider,
    env_var="GOOGLE_CLOUD_PROJECT",
    model_prefixes=["vertex/"],
    default_models={capability: "gemini-2.0-flash" for capability in ("chat", "vision", "audio")},
)
| --- a/video_processor/providers/xai_provider.py | ||
| +++ b/video_processor/providers/xai_provider.py | ||
| @@ -0,0 +1,20 @@ | ||
"""xAI (Grok) provider implementation."""

from video_processor.providers.base import OpenAICompatibleProvider, ProviderRegistry


class XAIProvider(OpenAICompatibleProvider):
    """xAI API provider (OpenAI-compatible).

    All request/response handling lives in OpenAICompatibleProvider; this
    subclass only points it at the x.ai endpoint and API-key variable.
    """

    # Registry key for this provider.
    provider_name = "xai"
    # Environment variable holding the API key.
    env_var = "XAI_API_KEY"
    # OpenAI-compatible endpoint root.
    base_url = "https://api.x.ai/v1"


# Grok model ids ("grok-...") are routed to this provider automatically.
ProviderRegistry.register(
    name="xai",
    provider_class=XAIProvider,
    env_var="XAI_API_KEY",
    model_prefixes=["grok-"],
    default_models={"chat": "grok-2", "vision": "grok-2-vision", "audio": ""},
)
| --- a/video_processor/providers/xai_provider.py | |
| +++ b/video_processor/providers/xai_provider.py | |
| @@ -0,0 +1,20 @@ | |
| --- a/video_processor/providers/xai_provider.py | |
| +++ b/video_processor/providers/xai_provider.py | |
| @@ -0,0 +1,20 @@ | |
"""xAI (Grok) provider implementation."""

from video_processor.providers.base import OpenAICompatibleProvider, ProviderRegistry


class XAIProvider(OpenAICompatibleProvider):
    """xAI API provider (OpenAI-compatible).

    All request/response handling lives in OpenAICompatibleProvider; this
    subclass only points it at the x.ai endpoint and API-key variable.
    """

    # Registry key for this provider.
    provider_name = "xai"
    # Environment variable holding the API key.
    env_var = "XAI_API_KEY"
    # OpenAI-compatible endpoint root.
    base_url = "https://api.x.ai/v1"


# Grok model ids ("grok-...") are routed to this provider automatically.
ProviderRegistry.register(
    name="xai",
    provider_class=XAIProvider,
    env_var="XAI_API_KEY",
    model_prefixes=["grok-"],
    default_models={"chat": "grok-2", "vision": "grok-2-vision", "audio": ""},
)
+59
-2
| --- video_processor/sources/__init__.py | ||
| +++ video_processor/sources/__init__.py | ||
| @@ -1,5 +1,62 @@ | ||
| 1 | -"""Cloud source integrations for fetching videos from remote storage.""" | |
| 1 | +"""Cloud, web, and notes source integrations for fetching content from remote sources.""" | |
| 2 | 2 | |
| 3 | 3 | from video_processor.sources.base import BaseSource, SourceFile |
| 4 | 4 | |
| 5 | -__all__ = ["BaseSource", "SourceFile"] | |
| 5 | +__all__ = [ | |
| 6 | + "BaseSource", | |
| 7 | + "SourceFile", | |
| 8 | + "AppleNotesSource", | |
| 9 | + "ArxivSource", | |
| 10 | + "GitHubSource", | |
| 11 | + "GoogleDriveSource", | |
| 12 | + "GoogleKeepSource", | |
| 13 | + "GWSSource", | |
| 14 | + "HackerNewsSource", | |
| 15 | + "LogseqSource", | |
| 16 | + "M365Source", | |
| 17 | + "MeetRecordingSource", | |
| 18 | + "NotionSource", | |
| 19 | + "ObsidianSource", | |
| 20 | + "OneNoteSource", | |
| 21 | + "PodcastSource", | |
| 22 | + "TeamsRecordingSource", | |
| 23 | + "RedditSource", | |
| 24 | + "RSSSource", | |
| 25 | + "TwitterSource", | |
| 26 | + "WebSource", | |
| 27 | + "YouTubeSource", | |
| 28 | + "ZoomSource", | |
| 29 | +] | |
| 30 | + | |
| 31 | + | |
def __getattr__(name: str):
    """Lazy imports to avoid pulling in optional dependencies at import time."""
    # Map each public source class to the submodule that defines it.
    _lazy_map = {
        "AppleNotesSource": "video_processor.sources.apple_notes_source",
        "ArxivSource": "video_processor.sources.arxiv_source",
        "GitHubSource": "video_processor.sources.github_source",
        "GoogleDriveSource": "video_processor.sources.google_drive",
        "GoogleKeepSource": "video_processor.sources.google_keep_source",
        "GWSSource": "video_processor.sources.gws_source",
        "HackerNewsSource": "video_processor.sources.hackernews_source",
        "LogseqSource": "video_processor.sources.logseq_source",
        "M365Source": "video_processor.sources.m365_source",
        "MeetRecordingSource": "video_processor.sources.meet_recording_source",
        "NotionSource": "video_processor.sources.notion_source",
        "ObsidianSource": "video_processor.sources.obsidian_source",
        "OneNoteSource": "video_processor.sources.onenote_source",
        "PodcastSource": "video_processor.sources.podcast_source",
        "TeamsRecordingSource": "video_processor.sources.teams_recording_source",
        "RedditSource": "video_processor.sources.reddit_source",
        "RSSSource": "video_processor.sources.rss_source",
        "TwitterSource": "video_processor.sources.twitter_source",
        "WebSource": "video_processor.sources.web_source",
        "YouTubeSource": "video_processor.sources.youtube_source",
        "ZoomSource": "video_processor.sources.zoom_source",
    }
    module_path = _lazy_map.get(name)
    if module_path is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    import importlib

    return getattr(importlib.import_module(module_path), name)
| 6 | 63 | |
| 7 | 64 | ADDED video_processor/sources/apple_notes_source.py |
| 8 | 65 | ADDED video_processor/sources/arxiv_source.py |
| 9 | 66 | ADDED video_processor/sources/github_source.py |
| 10 | 67 | ADDED video_processor/sources/google_keep_source.py |
| 11 | 68 | ADDED video_processor/sources/gws_source.py |
| 12 | 69 | ADDED video_processor/sources/hackernews_source.py |
| 13 | 70 | ADDED video_processor/sources/logseq_source.py |
| 14 | 71 | ADDED video_processor/sources/m365_source.py |
| 15 | 72 | ADDED video_processor/sources/meet_recording_source.py |
| 16 | 73 | ADDED video_processor/sources/notion_source.py |
| 17 | 74 | ADDED video_processor/sources/obsidian_source.py |
| 18 | 75 | ADDED video_processor/sources/onenote_source.py |
| 19 | 76 | ADDED video_processor/sources/podcast_source.py |
| 20 | 77 | ADDED video_processor/sources/reddit_source.py |
| 21 | 78 | ADDED video_processor/sources/rss_source.py |
| 22 | 79 | ADDED video_processor/sources/s3_source.py |
| 23 | 80 | ADDED video_processor/sources/teams_recording_source.py |
| 24 | 81 | ADDED video_processor/sources/twitter_source.py |
| 25 | 82 | ADDED video_processor/sources/web_source.py |
| 26 | 83 | ADDED video_processor/sources/youtube_source.py |
| 27 | 84 | ADDED video_processor/sources/zoom_source.py |
| 28 | 85 | ADDED video_processor/utils/callbacks.py |
| --- video_processor/sources/__init__.py | |
| +++ video_processor/sources/__init__.py | |
| @@ -1,5 +1,62 @@ | |
| 1 | """Cloud source integrations for fetching videos from remote storage.""" |
| 2 | |
| 3 | from video_processor.sources.base import BaseSource, SourceFile |
| 4 | |
| 5 | __all__ = ["BaseSource", "SourceFile"] |
| 6 | |
| 7 | ADDED video_processor/sources/apple_notes_source.py
| 8 | ADDED video_processor/sources/arxiv_source.py
| 9 | ADDED video_processor/sources/github_source.py
| 10 | ADDED video_processor/sources/google_keep_source.py
| 11 | ADDED video_processor/sources/gws_source.py
| 12 | ADDED video_processor/sources/hackernews_source.py
| 13 | ADDED video_processor/sources/logseq_source.py
| 14 | ADDED video_processor/sources/m365_source.py
| 15 | ADDED video_processor/sources/meet_recording_source.py
| 16 | ADDED video_processor/sources/notion_source.py
| 17 | ADDED video_processor/sources/obsidian_source.py
| 18 | ADDED video_processor/sources/onenote_source.py
| 19 | ADDED video_processor/sources/podcast_source.py
| 20 | ADDED video_processor/sources/reddit_source.py
| 21 | ADDED video_processor/sources/rss_source.py
| 22 | ADDED video_processor/sources/s3_source.py
| 23 | ADDED video_processor/sources/teams_recording_source.py
| 24 | ADDED video_processor/sources/twitter_source.py
| 25 | ADDED video_processor/sources/web_source.py
| 26 | ADDED video_processor/sources/youtube_source.py
| 27 | ADDED video_processor/sources/zoom_source.py
| 28 | ADDED video_processor/utils/callbacks.py
| --- video_processor/sources/__init__.py | |
| +++ video_processor/sources/__init__.py | |
| @@ -1,5 +1,62 @@ | |
"""Cloud, web, and notes source integrations for fetching content from remote sources."""

from video_processor.sources.base import BaseSource, SourceFile

# Public API of the package.  Only BaseSource/SourceFile are imported
# eagerly; every concrete source named here is resolved lazily through the
# module-level __getattr__ hook so optional dependencies load on demand.
__all__ = [
    "BaseSource",
    "SourceFile",
    "AppleNotesSource",
    "ArxivSource",
    "GitHubSource",
    "GoogleDriveSource",
    "GoogleKeepSource",
    "GWSSource",
    "HackerNewsSource",
    "LogseqSource",
    "M365Source",
    "MeetRecordingSource",
    "NotionSource",
    "ObsidianSource",
    "OneNoteSource",
    "PodcastSource",
    "TeamsRecordingSource",
    "RedditSource",
    "RSSSource",
    "TwitterSource",
    "WebSource",
    "YouTubeSource",
    "ZoomSource",
]
| 30 | |
| 31 | |
def __getattr__(name: str):
    """Lazy imports to avoid pulling in optional dependencies at import time."""
    # Map each public source class to the submodule that defines it.
    _lazy_map = {
        "AppleNotesSource": "video_processor.sources.apple_notes_source",
        "ArxivSource": "video_processor.sources.arxiv_source",
        "GitHubSource": "video_processor.sources.github_source",
        "GoogleDriveSource": "video_processor.sources.google_drive",
        "GoogleKeepSource": "video_processor.sources.google_keep_source",
        "GWSSource": "video_processor.sources.gws_source",
        "HackerNewsSource": "video_processor.sources.hackernews_source",
        "LogseqSource": "video_processor.sources.logseq_source",
        "M365Source": "video_processor.sources.m365_source",
        "MeetRecordingSource": "video_processor.sources.meet_recording_source",
        "NotionSource": "video_processor.sources.notion_source",
        "ObsidianSource": "video_processor.sources.obsidian_source",
        "OneNoteSource": "video_processor.sources.onenote_source",
        "PodcastSource": "video_processor.sources.podcast_source",
        "TeamsRecordingSource": "video_processor.sources.teams_recording_source",
        "RedditSource": "video_processor.sources.reddit_source",
        "RSSSource": "video_processor.sources.rss_source",
        "TwitterSource": "video_processor.sources.twitter_source",
        "WebSource": "video_processor.sources.web_source",
        "YouTubeSource": "video_processor.sources.youtube_source",
        "ZoomSource": "video_processor.sources.zoom_source",
    }
    module_path = _lazy_map.get(name)
    if module_path is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    import importlib

    return getattr(importlib.import_module(module_path), name)
| 63 | |
| 64 | ADDED video_processor/sources/apple_notes_source.py
| 65 | ADDED video_processor/sources/arxiv_source.py
| 66 | ADDED video_processor/sources/github_source.py
| 67 | ADDED video_processor/sources/google_keep_source.py
| 68 | ADDED video_processor/sources/gws_source.py
| 69 | ADDED video_processor/sources/hackernews_source.py
| 70 | ADDED video_processor/sources/logseq_source.py
| 71 | ADDED video_processor/sources/m365_source.py
| 72 | ADDED video_processor/sources/meet_recording_source.py
| 73 | ADDED video_processor/sources/notion_source.py
| 74 | ADDED video_processor/sources/obsidian_source.py
| 75 | ADDED video_processor/sources/onenote_source.py
| 76 | ADDED video_processor/sources/podcast_source.py
| 77 | ADDED video_processor/sources/reddit_source.py
| 78 | ADDED video_processor/sources/rss_source.py
| 79 | ADDED video_processor/sources/s3_source.py
| 80 | ADDED video_processor/sources/teams_recording_source.py
| 81 | ADDED video_processor/sources/twitter_source.py
| 82 | ADDED video_processor/sources/web_source.py
| 83 | ADDED video_processor/sources/youtube_source.py
| 84 | ADDED video_processor/sources/zoom_source.py
| 85 | ADDED video_processor/utils/callbacks.py
| --- a/video_processor/sources/apple_notes_source.py | ||
| +++ b/video_processor/sources/apple_notes_source.py | ||
| @@ -0,0 +1,178 @@ | ||
| 1 | +"""Apple Notes source connector via osascript (macOS only).""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import re | |
| 5 | +import subprocess | |
| 6 | +import sys | |
| 7 | +from pathlib import Path | |
| 8 | +from typing import List, Optional | |
| 9 | + | |
| 10 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 11 | + | |
| 12 | +logger = logging.getLogger(__name__) | |
| 13 | + | |
| 14 | + | |
class AppleNotesSource(BaseSource):
    """
    Fetch notes from Apple Notes using osascript (AppleScript).

    Only works on macOS. Requires the Notes app to be available
    and permission for osascript to access it.
    """

    def __init__(self, folder: Optional[str] = None):
        # Optional folder name; when set, list_videos() only returns notes
        # from that folder.
        self.folder = folder

    def authenticate(self) -> bool:
        """Check that we are running on macOS."""
        if sys.platform != "darwin":
            logger.error("Apple Notes is only available on macOS (current: %s)", sys.platform)
            return False
        return True

    @staticmethod
    def _escape_applescript(value: str) -> str:
        """Escape backslashes and double quotes so *value* can be safely
        embedded inside an AppleScript string literal."""
        return value.replace("\\", "\\\\").replace('"', '\\"')

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List notes from Apple Notes via osascript.

        folder_id/folder_path/patterns are part of the BaseSource interface
        and are currently ignored; filtering uses self.folder instead.
        """
        if not self.authenticate():
            return []

        if self.folder:
            # BUG FIX: escape the folder name so quotes/backslashes in it
            # cannot break (or inject into) the generated AppleScript.
            folder_name = self._escape_applescript(self.folder)
            script = (
                'tell application "Notes"\n'
                " set noteList to {}\n"
                " repeat with f in folders of default account\n"
                f' if name of f is "{folder_name}" then\n'
                " repeat with n in notes of f\n"
                ' set end of noteList to (id of n) & "|||" & (name of n)\n'
                " end repeat\n"
                " end if\n"
                " end repeat\n"
                " return noteList\n"
                "end tell"
            )
        else:
            script = (
                'tell application "Notes"\n'
                " set noteList to {}\n"
                " repeat with n in notes of default account\n"
                ' set end of noteList to (id of n) & "|||" & (name of n)\n'
                " end repeat\n"
                " return noteList\n"
                "end tell"
            )

        try:
            result = subprocess.run(
                ["osascript", "-e", script],
                capture_output=True,
                text=True,
                timeout=30,
            )
        except FileNotFoundError:
            logger.error("osascript not found. Apple Notes requires macOS.")
            return []
        except subprocess.TimeoutExpired:
            logger.error("osascript timed out while listing notes.")
            return []

        if result.returncode != 0:
            logger.error("Failed to list notes: %s", result.stderr.strip())
            return []

        return self._parse_note_list(result.stdout.strip())

    def _parse_note_list(self, output: str) -> List[SourceFile]:
        """Parse osascript output into SourceFile objects.

        Expected format: comma-separated items of 'id|||name' pairs.
        """
        files: List[SourceFile] = []
        if not output:
            return files

        # AppleScript returns a flat comma-separated list
        entries = output.split(", ")
        for entry in entries:
            entry = entry.strip()
            if "|||" not in entry:
                continue
            note_id, _, name = entry.partition("|||")
            note_id = note_id.strip()
            name = name.strip()
            if note_id and name:
                files.append(
                    SourceFile(
                        name=name,
                        id=note_id,
                        mime_type="text/plain",
                    )
                )

        logger.info("Found %d notes", len(files))
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a note's content as plain text.

        Raises:
            RuntimeError: if osascript is missing, times out, or fails.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        # BUG FIX: escape the note id before embedding it in AppleScript.
        note_id = self._escape_applescript(file.id)
        script = (
            'tell application "Notes"\n'
            f' set theNote to note id "{note_id}" of default account\n'
            " return body of theNote\n"
            "end tell"
        )

        try:
            result = subprocess.run(
                ["osascript", "-e", script],
                capture_output=True,
                text=True,
                timeout=30,
            )
        except FileNotFoundError:
            raise RuntimeError("osascript not found. Apple Notes requires macOS.")
        except subprocess.TimeoutExpired:
            raise RuntimeError(f"osascript timed out fetching note {file.id}")

        if result.returncode != 0:
            raise RuntimeError(f"Failed to fetch note {file.id}: {result.stderr.strip()}")

        html_body = result.stdout.strip()
        text = self._html_to_text(html_body)

        # Prepend title
        content = f"# {file.name}\n\n{text}"
        destination.write_text(content, encoding="utf-8")
        logger.info("Saved Apple Note to %s", destination)
        return destination

    @staticmethod
    def _html_to_text(html: str) -> str:
        """Strip HTML tags and return plain text.

        Apple Notes returns note bodies as HTML. This uses regex-based
        stripping similar to web_source._strip_html_tags.
        """
        if not html:
            return ""
        # Replace <br> variants with newlines
        text = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
        # Replace block-level closing tags with newlines
        text = re.sub(r"</(?:p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE)
        # Remove all remaining tags
        text = re.sub(r"<[^>]+>", "", text)
        # Decode common HTML entities.  BUG FIX: "&amp;" must be decoded
        # LAST — decoding it first turned e.g. "&amp;lt;" into "<"
        # (double-unescaping literal text).
        text = text.replace("&lt;", "<")
        text = text.replace("&gt;", ">")
        text = text.replace("&quot;", '"')
        text = text.replace("&#39;", "'")
        text = text.replace("&nbsp;", " ")
        text = text.replace("&amp;", "&")
        # Collapse excessive blank lines
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()
| --- a/video_processor/sources/apple_notes_source.py | |
| +++ b/video_processor/sources/apple_notes_source.py | |
| @@ -0,0 +1,178 @@ | |
| --- a/video_processor/sources/apple_notes_source.py | |
| +++ b/video_processor/sources/apple_notes_source.py | |
| @@ -0,0 +1,178 @@ | |
| 1 | """Apple Notes source connector via osascript (macOS only).""" |
| 2 | |
| 3 | import logging |
| 4 | import re |
| 5 | import subprocess |
| 6 | import sys |
| 7 | from pathlib import Path |
| 8 | from typing import List, Optional |
| 9 | |
| 10 | from video_processor.sources.base import BaseSource, SourceFile |
| 11 | |
| 12 | logger = logging.getLogger(__name__) |
| 13 | |
| 14 | |
class AppleNotesSource(BaseSource):
    """
    Fetch notes from Apple Notes using osascript (AppleScript).

    Only works on macOS. Requires the Notes app to be available
    and permission for osascript to access it.

    Parameters
    ----------
    folder : str, optional
        Name of a Notes folder (in the default account) to restrict
        listing to. ``None`` lists every note in the default account.
    """

    def __init__(self, folder: Optional[str] = None):
        self.folder = folder

    @staticmethod
    def _escape_applescript(value: str) -> str:
        """Escape *value* for embedding in a double-quoted AppleScript
        string literal.

        Backslashes and double quotes would otherwise terminate or
        corrupt the literal — an injection hazard for folder names and
        note ids that contain quotes.
        """
        return value.replace("\\", "\\\\").replace('"', '\\"')

    def authenticate(self) -> bool:
        """Return True when running on macOS; no credentials are needed."""
        if sys.platform != "darwin":
            logger.error("Apple Notes is only available on macOS (current: %s)", sys.platform)
            return False
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List notes from Apple Notes via osascript.

        The ``folder_id``/``folder_path``/``patterns`` arguments are part
        of the BaseSource interface and are ignored here; filtering is
        driven by ``self.folder``.

        Returns an empty list on any failure (non-macOS, osascript
        missing, script error, or timeout).
        """
        if not self.authenticate():
            return []

        if self.folder:
            # Escape the folder name so quotes cannot break the script.
            safe_folder = self._escape_applescript(self.folder)
            script = (
                'tell application "Notes"\n'
                "    set noteList to {}\n"
                "    repeat with f in folders of default account\n"
                f'        if name of f is "{safe_folder}" then\n'
                "            repeat with n in notes of f\n"
                '                set end of noteList to (id of n) & "|||" & (name of n)\n'
                "            end repeat\n"
                "        end if\n"
                "    end repeat\n"
                "    return noteList\n"
                "end tell"
            )
        else:
            script = (
                'tell application "Notes"\n'
                "    set noteList to {}\n"
                "    repeat with n in notes of default account\n"
                '        set end of noteList to (id of n) & "|||" & (name of n)\n'
                "    end repeat\n"
                "    return noteList\n"
                "end tell"
            )

        try:
            result = subprocess.run(
                ["osascript", "-e", script],
                capture_output=True,
                text=True,
                timeout=30,
            )
        except FileNotFoundError:
            logger.error("osascript not found. Apple Notes requires macOS.")
            return []
        except subprocess.TimeoutExpired:
            logger.error("osascript timed out while listing notes.")
            return []

        if result.returncode != 0:
            logger.error("Failed to list notes: %s", result.stderr.strip())
            return []

        return self._parse_note_list(result.stdout.strip())

    def _parse_note_list(self, output: str) -> List[SourceFile]:
        """Parse osascript output into SourceFile objects.

        Expected format: comma-separated items of 'id|||name' pairs.

        NOTE(review): AppleScript renders the list with ", " separators,
        so a note *name* containing ", " is split into fragments and the
        fragments without "|||" are dropped. Note ids never contain
        commas, so ids survive; names can be truncated at a comma.
        """
        files: List[SourceFile] = []
        if not output:
            return files

        # AppleScript returns a flat comma-separated list.
        for entry in output.split(", "):
            entry = entry.strip()
            if "|||" not in entry:
                continue
            note_id, _, name = entry.partition("|||")
            note_id = note_id.strip()
            name = name.strip()
            if note_id and name:
                files.append(
                    SourceFile(
                        name=name,
                        id=note_id,
                        mime_type="text/plain",
                    )
                )

        logger.info("Found %d notes", len(files))
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a note's body as plain text with the title prepended.

        Raises
        ------
        RuntimeError
            When osascript is unavailable, times out, or the note
            cannot be fetched.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        # Escape the id so quotes/backslashes cannot break the script.
        safe_id = self._escape_applescript(file.id)
        script = (
            'tell application "Notes"\n'
            f'    set theNote to note id "{safe_id}" of default account\n'
            "    return body of theNote\n"
            "end tell"
        )

        try:
            result = subprocess.run(
                ["osascript", "-e", script],
                capture_output=True,
                text=True,
                timeout=30,
            )
        except FileNotFoundError:
            raise RuntimeError("osascript not found. Apple Notes requires macOS.")
        except subprocess.TimeoutExpired:
            raise RuntimeError(f"osascript timed out fetching note {file.id}")

        if result.returncode != 0:
            raise RuntimeError(f"Failed to fetch note {file.id}: {result.stderr.strip()}")

        # Notes returns the body as HTML; convert to readable text.
        text = self._html_to_text(result.stdout.strip())

        # Prepend title as a Markdown heading.
        content = f"# {file.name}\n\n{text}"
        destination.write_text(content, encoding="utf-8")
        logger.info("Saved Apple Note to %s", destination)
        return destination

    @staticmethod
    def _html_to_text(html: str) -> str:
        """Strip HTML tags and return plain text.

        Apple Notes returns note bodies as HTML. This uses regex-based
        stripping similar to web_source._strip_html_tags.
        """
        # Local import: the parameter is named ``html``, which would
        # shadow a module-level ``import html``.
        from html import unescape

        if not html:
            return ""
        # Replace <br> variants with newlines.
        text = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
        # Replace block-level closing tags with newlines.
        text = re.sub(r"</(?:p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE)
        # Remove all remaining tags.
        text = re.sub(r"<[^>]+>", "", text)
        # Decode HTML entities in one correct pass (handles named and
        # numeric entities; avoids the double-decode pitfalls of chained
        # ad-hoc replace() calls).
        text = unescape(text)
        # Non-breaking spaces from &nbsp; become plain spaces.
        text = text.replace("\u00a0", " ")
        # Collapse excessive blank lines.
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()
| --- a/video_processor/sources/arxiv_source.py | ||
| +++ b/video_processor/sources/arxiv_source.py | ||
| @@ -0,0 +1,117 @@ | ||
| 1 | +"""arXiv source connector for fetching paper metadata and PDFs.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import re | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import List, Optional | |
| 7 | + | |
| 8 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 9 | + | |
| 10 | +logger = logging.getLogger(__name__) | |
| 11 | + | |
| 12 | +_ARXIV_ID_PATTERN = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?") | |
| 13 | +ARXIV_API = "http://export.arxiv.org/api/query" | |
| 14 | + | |
| 15 | + | |
| 16 | +def _extract_arxiv_id(url_or_id: str) -> str: | |
| 17 | + """Extract arXiv paper ID from a URL or bare ID string.""" | |
| 18 | + match = _ARXIV_ID_PATTERN.search(url_or_id) | |
| 19 | + if not match: | |
| 20 | + raise ValueError(f"Could not extract arXiv ID from: {url_or_id}") | |
| 21 | + return match.group(0) | |
| 22 | + | |
| 23 | + | |
| 24 | +class ArxivSource(BaseSource): | |
| 25 | + """ | |
| 26 | + Fetch arXiv paper metadata and PDF. | |
| 27 | + | |
| 28 | + Uses the arXiv API (Atom feed) for metadata and direct PDF download. | |
| 29 | + Requires: pip install requests | |
| 30 | + """ | |
| 31 | + | |
| 32 | + def __init__(self, url_or_id: str): | |
| 33 | + self.arxiv_id = _extract_arxiv_id(url_or_id) | |
| 34 | + self._metadata: Optional[dict] = None | |
| 35 | + | |
| 36 | + def authenticate(self) -> bool: | |
| 37 | + """No auth needed for arXiv.""" | |
| 38 | + return True | |
| 39 | + | |
| 40 | + def _fetch_metadata(self) -> dict: | |
| 41 | + """Fetch paper metadata from the arXiv API.""" | |
| 42 | + if self._metadata: | |
| 43 | + return self._metadata | |
| 44 | + | |
| 45 | + import xml.etree.ElementTree as ET | |
| 46 | + | |
| 47 | + import requests | |
| 48 | + | |
| 49 | + resp = requests.get(ARXIV_API, params={"id_list": self.arxiv_id}, timeout=15) | |
| 50 | + resp.raise_for_status() | |
| 51 | + | |
| 52 | + ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"} | |
| 53 | + root = ET.fromstring(resp.text) | |
| 54 | + entry = root.find("atom:entry", ns) | |
| 55 | + if entry is None: | |
| 56 | + raise ValueError(f"Paper not found: {self.arxiv_id}") | |
| 57 | + | |
| 58 | + self._metadata = { | |
| 59 | + "title": (entry.findtext("atom:title", namespaces=ns) or "").strip(), | |
| 60 | + "summary": (entry.findtext("atom:summary", namespaces=ns) or "").strip(), | |
| 61 | + "authors": [ | |
| 62 | + a.findtext("atom:name", namespaces=ns) or "" | |
| 63 | + for a in entry.findall("atom:author", ns) | |
| 64 | + ], | |
| 65 | + "published": entry.findtext("atom:published", namespaces=ns) or "", | |
| 66 | + "pdf_url": f"https://arxiv.org/pdf/{self.arxiv_id}.pdf", | |
| 67 | + } | |
| 68 | + return self._metadata | |
| 69 | + | |
| 70 | + def list_videos( | |
| 71 | + self, | |
| 72 | + folder_id: Optional[str] = None, | |
| 73 | + folder_path: Optional[str] = None, | |
| 74 | + patterns: Optional[List[str]] = None, | |
| 75 | + ) -> List[SourceFile]: | |
| 76 | + """Return SourceFiles for the paper metadata and PDF.""" | |
| 77 | + meta = self._fetch_metadata() | |
| 78 | + return [ | |
| 79 | + SourceFile( | |
| 80 | + name=f"{meta['title']} (metadata)", | |
| 81 | + id=f"meta:{self.arxiv_id}", | |
| 82 | + mime_type="text/plain", | |
| 83 | + ), | |
| 84 | + SourceFile( | |
| 85 | + name=f"{meta['title']}.pdf", | |
| 86 | + id=f"pdf:{self.arxiv_id}", | |
| 87 | + mime_type="application/pdf", | |
| 88 | + ), | |
| 89 | + ] | |
| 90 | + | |
| 91 | + def download(self, file: SourceFile, destination: Path) -> Path: | |
| 92 | + """Download paper metadata as text or the PDF file.""" | |
| 93 | + import requests | |
| 94 | + | |
| 95 | + destination = Path(destination) | |
| 96 | + destination.parent.mkdir(parents=True, exist_ok=True) | |
| 97 | + meta = self._fetch_metadata() | |
| 98 | + | |
| 99 | + if file.id.startswith("meta:"): | |
| 100 | + authors = ", ".join(meta["authors"]) | |
| 101 | + text = ( | |
| 102 | + f"# {meta['title']}\n\n" | |
| 103 | + f"Authors: {authors}\n" | |
| 104 | + f"Published: {meta['published']}\n" | |
| 105 | + f"arXiv: {self.arxiv_id}\n\n" | |
| 106 | + f"## Abstract\n\n{meta['summary']}" | |
| 107 | + ) | |
| 108 | + destination.write_text(text, encoding="utf-8") | |
| 109 | + elif file.id.startswith("pdf:"): | |
| 110 | + resp = requests.get(meta["pdf_url"], timeout=60, stream=True) | |
| 111 | + resp.raise_for_status() | |
| 112 | + with open(destination, "wb") as f: | |
| 113 | + for chunk in resp.iter_content(chunk_size=8192): | |
| 114 | + f.write(chunk) | |
| 115 | + | |
| 116 | + logger.info(f"Downloaded arXiv {self.arxiv_id} to {destination}") | |
| 117 | + return destination |
| --- a/video_processor/sources/arxiv_source.py | |
| +++ b/video_processor/sources/arxiv_source.py | |
| @@ -0,0 +1,117 @@ | |
| --- a/video_processor/sources/arxiv_source.py | |
| +++ b/video_processor/sources/arxiv_source.py | |
| @@ -0,0 +1,117 @@ | |
| 1 | """arXiv source connector for fetching paper metadata and PDFs.""" |
| 2 | |
| 3 | import logging |
| 4 | import re |
| 5 | from pathlib import Path |
| 6 | from typing import List, Optional |
| 7 | |
| 8 | from video_processor.sources.base import BaseSource, SourceFile |
| 9 | |
| 10 | logger = logging.getLogger(__name__) |
| 11 | |
# New-style arXiv identifier, e.g. "2301.12345" or "2301.12345v2".
# NOTE(review): old-style ids such as "hep-th/9901001" are not matched.
_ARXIV_ID_PATTERN = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?")
# Atom-feed endpoint of the public arXiv metadata API (no auth required).
ARXIV_API = "http://export.arxiv.org/api/query"
| 14 | |
| 15 | |
| 16 | def _extract_arxiv_id(url_or_id: str) -> str: |
| 17 | """Extract arXiv paper ID from a URL or bare ID string.""" |
| 18 | match = _ARXIV_ID_PATTERN.search(url_or_id) |
| 19 | if not match: |
| 20 | raise ValueError(f"Could not extract arXiv ID from: {url_or_id}") |
| 21 | return match.group(0) |
| 22 | |
| 23 | |
class ArxivSource(BaseSource):
    """
    Fetch arXiv paper metadata and PDF.

    Uses the arXiv API (Atom feed) for metadata and direct PDF download.
    Requires: pip install requests
    """

    def __init__(self, url_or_id: str):
        """``url_or_id`` may be an abs/pdf URL or a bare id like 2301.12345."""
        self.arxiv_id = _extract_arxiv_id(url_or_id)
        # Cached metadata dict; populated lazily by _fetch_metadata().
        self._metadata: Optional[dict] = None

    def authenticate(self) -> bool:
        """No auth needed for arXiv."""
        return True

    def _fetch_metadata(self) -> dict:
        """Fetch (and cache) paper metadata from the arXiv Atom API.

        Raises
        ------
        ValueError
            If the API returns no entry for this id.
        requests.HTTPError
            On a transport-level failure.
        """
        if self._metadata is not None:
            return self._metadata

        import xml.etree.ElementTree as ET

        import requests

        resp = requests.get(ARXIV_API, params={"id_list": self.arxiv_id}, timeout=15)
        resp.raise_for_status()

        ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
        root = ET.fromstring(resp.text)
        entry = root.find("atom:entry", ns)
        if entry is None:
            raise ValueError(f"Paper not found: {self.arxiv_id}")

        self._metadata = {
            "title": (entry.findtext("atom:title", namespaces=ns) or "").strip(),
            "summary": (entry.findtext("atom:summary", namespaces=ns) or "").strip(),
            "authors": [
                a.findtext("atom:name", namespaces=ns) or ""
                for a in entry.findall("atom:author", ns)
            ],
            "published": entry.findtext("atom:published", namespaces=ns) or "",
            "pdf_url": f"https://arxiv.org/pdf/{self.arxiv_id}.pdf",
        }
        return self._metadata

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return two SourceFiles: the paper metadata (text) and its PDF.

        The folder/pattern arguments are part of the BaseSource
        interface and are ignored for arXiv.
        """
        meta = self._fetch_metadata()
        return [
            SourceFile(
                name=f"{meta['title']} (metadata)",
                id=f"meta:{self.arxiv_id}",
                mime_type="text/plain",
            ),
            SourceFile(
                name=f"{meta['title']}.pdf",
                id=f"pdf:{self.arxiv_id}",
                mime_type="application/pdf",
            ),
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download paper metadata as text or the PDF file.

        Raises
        ------
        ValueError
            For a SourceFile id this source did not issue.
        """
        import requests

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        meta = self._fetch_metadata()

        if file.id.startswith("meta:"):
            authors = ", ".join(meta["authors"])
            text = (
                f"# {meta['title']}\n\n"
                f"Authors: {authors}\n"
                f"Published: {meta['published']}\n"
                f"arXiv: {self.arxiv_id}\n\n"
                f"## Abstract\n\n{meta['summary']}"
            )
            destination.write_text(text, encoding="utf-8")
        elif file.id.startswith("pdf:"):
            # Stream to disk so large PDFs are not held in memory; the
            # context manager releases the connection when done.
            with requests.get(meta["pdf_url"], timeout=60, stream=True) as resp:
                resp.raise_for_status()
                with open(destination, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        f.write(chunk)
        else:
            # Previously an unknown id fell through, logged "Downloaded",
            # and returned a path that was never written. Fail loudly.
            raise ValueError(f"Unknown file id for ArxivSource: {file.id}")

        logger.info("Downloaded arXiv %s to %s", self.arxiv_id, destination)
        return destination
| --- a/video_processor/sources/github_source.py | ||
| +++ b/video_processor/sources/github_source.py | ||
| @@ -0,0 +1,156 @@ | ||
| 1 | +"""GitHub source connector for fetching repo content, issues, and PRs.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import os | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import List, Optional | |
| 7 | + | |
| 8 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 9 | + | |
| 10 | +logger = logging.getLogger(__name__) | |
| 11 | + | |
| 12 | +API_BASE = "https://api.github.com" | |
| 13 | + | |
| 14 | + | |
| 15 | +class GitHubSource(BaseSource): | |
| 16 | + """ | |
| 17 | + Fetch GitHub repository README, issues, and pull requests as text documents. | |
| 18 | + | |
| 19 | + Auth: Set GITHUB_TOKEN env var, or use `gh auth token` output. | |
| 20 | + Requires: pip install requests | |
| 21 | + """ | |
| 22 | + | |
| 23 | + def __init__(self, repo: str, include_issues: bool = True, include_prs: bool = True): | |
| 24 | + """ | |
| 25 | + Parameters | |
| 26 | + ---------- | |
| 27 | + repo : str | |
| 28 | + GitHub repo in "owner/repo" format. | |
| 29 | + """ | |
| 30 | + self.repo = repo | |
| 31 | + self.include_issues = include_issues | |
| 32 | + self.include_prs = include_prs | |
| 33 | + self._token: Optional[str] = None | |
| 34 | + | |
| 35 | + def authenticate(self) -> bool: | |
| 36 | + """Authenticate via GITHUB_TOKEN env var or gh CLI.""" | |
| 37 | + self._token = os.environ.get("GITHUB_TOKEN") | |
| 38 | + if not self._token: | |
| 39 | + try: | |
| 40 | + import subprocess | |
| 41 | + | |
| 42 | + result = subprocess.run(["gh", "auth", "token"], capture_output=True, text=True) | |
| 43 | + if result.returncode == 0: | |
| 44 | + self._token = result.stdout.strip() | |
| 45 | + except FileNotFoundError: | |
| 46 | + pass | |
| 47 | + if not self._token: | |
| 48 | + logger.warning( | |
| 49 | + "No GitHub token found. Public repos only. Set GITHUB_TOKEN for private repos." | |
| 50 | + ) | |
| 51 | + return True | |
| 52 | + | |
| 53 | + def _headers(self) -> dict: | |
| 54 | + h = {"Accept": "application/vnd.github.v3+json"} | |
| 55 | + if self._token: | |
| 56 | + h["Authorization"] = f"Bearer {self._token}" | |
| 57 | + return h | |
| 58 | + | |
| 59 | + def list_videos( | |
| 60 | + self, | |
| 61 | + folder_id: Optional[str] = None, | |
| 62 | + folder_path: Optional[str] = None, | |
| 63 | + patterns: Optional[List[str]] = None, | |
| 64 | + ) -> List[SourceFile]: | |
| 65 | + """List available documents (README, issues, PRs) as SourceFiles.""" | |
| 66 | + import requests | |
| 67 | + | |
| 68 | + files = [] | |
| 69 | + # README | |
| 70 | + resp = requests.get( | |
| 71 | + f"{API_BASE}/repos/{self.repo}/readme", headers=self._headers(), timeout=15 | |
| 72 | + ) | |
| 73 | + if resp.ok: | |
| 74 | + files.append(SourceFile(name="README", id="readme", mime_type="text/markdown")) | |
| 75 | + | |
| 76 | + # Issues | |
| 77 | + if self.include_issues: | |
| 78 | + resp = requests.get( | |
| 79 | + f"{API_BASE}/repos/{self.repo}/issues", | |
| 80 | + headers=self._headers(), | |
| 81 | + params={"state": "all", "per_page": 100}, | |
| 82 | + timeout=15, | |
| 83 | + ) | |
| 84 | + if resp.ok: | |
| 85 | + for issue in resp.json(): | |
| 86 | + if "pull_request" not in issue: | |
| 87 | + files.append( | |
| 88 | + SourceFile( | |
| 89 | + name=f"Issue #{issue['number']}: {issue['title']}", | |
| 90 | + id=f"issue:{issue['number']}", | |
| 91 | + mime_type="text/plain", | |
| 92 | + ) | |
| 93 | + ) | |
| 94 | + | |
| 95 | + # PRs | |
| 96 | + if self.include_prs: | |
| 97 | + resp = requests.get( | |
| 98 | + f"{API_BASE}/repos/{self.repo}/pulls", | |
| 99 | + headers=self._headers(), | |
| 100 | + params={"state": "all", "per_page": 100}, | |
| 101 | + timeout=15, | |
| 102 | + ) | |
| 103 | + if resp.ok: | |
| 104 | + for pr in resp.json(): | |
| 105 | + files.append( | |
| 106 | + SourceFile( | |
| 107 | + name=f"PR #{pr['number']}: {pr['title']}", | |
| 108 | + id=f"pr:{pr['number']}", | |
| 109 | + mime_type="text/plain", | |
| 110 | + ) | |
| 111 | + ) | |
| 112 | + | |
| 113 | + return files | |
| 114 | + | |
| 115 | + def download(self, file: SourceFile, destination: Path) -> Path: | |
| 116 | + """Download a single document (README, issue, or PR) as text.""" | |
| 117 | + import requests | |
| 118 | + | |
| 119 | + destination = Path(destination) | |
| 120 | + destination.parent.mkdir(parents=True, exist_ok=True) | |
| 121 | + | |
| 122 | + if file.id == "readme": | |
| 123 | + resp = requests.get( | |
| 124 | + f"{API_BASE}/repos/{self.repo}/readme", | |
| 125 | + headers={**self._headers(), "Accept": "application/vnd.github.v3.raw"}, | |
| 126 | + timeout=15, | |
| 127 | + ) | |
| 128 | + destination.write_text(resp.text, encoding="utf-8") | |
| 129 | + elif file.id.startswith("issue:"): | |
| 130 | + num = file.id.split(":")[1] | |
| 131 | + resp = requests.get( | |
| 132 | + f"{API_BASE}/repos/{self.repo}/issues/{num}", | |
| 133 | + headers=self._headers(), | |
| 134 | + timeout=15, | |
| 135 | + ) | |
| 136 | + data = resp.json() | |
| 137 | + text = f"# {data['title']}\n\n{data.get('body', '') or ''}" | |
| 138 | + # Append comments | |
| 139 | + comments_resp = requests.get(data["comments_url"], headers=self._headers(), timeout=15) | |
| 140 | + if comments_resp.ok: | |
| 141 | + for c in comments_resp.json(): | |
| 142 | + text += f"\n\n---\n**{c['user']['login']}**: {c.get('body', '')}" | |
| 143 | + destination.write_text(text, encoding="utf-8") | |
| 144 | + elif file.id.startswith("pr:"): | |
| 145 | + num = file.id.split(":")[1] | |
| 146 | + resp = requests.get( | |
| 147 | + f"{API_BASE}/repos/{self.repo}/pulls/{num}", | |
| 148 | + headers=self._headers(), | |
| 149 | + timeout=15, | |
| 150 | + ) | |
| 151 | + data = resp.json() | |
| 152 | + text = f"# PR: {data['title']}\n\n{data.get('body', '') or ''}" | |
| 153 | + destination.write_text(text, encoding="utf-8") | |
| 154 | + | |
| 155 | + logger.info(f"Downloaded {file.name} to {destination}") | |
| 156 | + return destination |
| --- a/video_processor/sources/github_source.py | |
| +++ b/video_processor/sources/github_source.py | |
| @@ -0,0 +1,156 @@ | |
| --- a/video_processor/sources/github_source.py | |
| +++ b/video_processor/sources/github_source.py | |
| @@ -0,0 +1,156 @@ | |
| 1 | """GitHub source connector for fetching repo content, issues, and PRs.""" |
| 2 | |
| 3 | import logging |
| 4 | import os |
| 5 | from pathlib import Path |
| 6 | from typing import List, Optional |
| 7 | |
| 8 | from video_processor.sources.base import BaseSource, SourceFile |
| 9 | |
| 10 | logger = logging.getLogger(__name__) |
| 11 | |
# Root URL of the GitHub REST v3 API.
API_BASE = "https://api.github.com"
| 13 | |
| 14 | |
class GitHubSource(BaseSource):
    """
    Fetch GitHub repository README, issues, and pull requests as text documents.

    Auth: Set GITHUB_TOKEN env var, or use `gh auth token` output.
    Requires: pip install requests
    """

    def __init__(self, repo: str, include_issues: bool = True, include_prs: bool = True):
        """
        Parameters
        ----------
        repo : str
            GitHub repo in "owner/repo" format.
        include_issues : bool
            Whether list_videos() includes issues.
        include_prs : bool
            Whether list_videos() includes pull requests.
        """
        self.repo = repo
        self.include_issues = include_issues
        self.include_prs = include_prs
        # Bearer token resolved by authenticate(); None => unauthenticated.
        self._token: Optional[str] = None

    def authenticate(self) -> bool:
        """Resolve a token from GITHUB_TOKEN or the gh CLI.

        Always returns True: without a token the source still works for
        public repositories (subject to lower rate limits).
        """
        self._token = os.environ.get("GITHUB_TOKEN")
        if not self._token:
            try:
                import subprocess

                result = subprocess.run(
                    ["gh", "auth", "token"],
                    capture_output=True,
                    text=True,
                    timeout=10,  # don't hang forever if gh stalls or prompts
                )
                if result.returncode == 0:
                    self._token = result.stdout.strip()
            except (FileNotFoundError, subprocess.TimeoutExpired):
                pass
        if not self._token:
            logger.warning(
                "No GitHub token found. Public repos only. Set GITHUB_TOKEN for private repos."
            )
        return True

    def _headers(self) -> dict:
        """Standard API headers, with Authorization when a token is set."""
        h = {"Accept": "application/vnd.github.v3+json"}
        if self._token:
            h["Authorization"] = f"Bearer {self._token}"
        return h

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List available documents (README, issues, PRs) as SourceFiles.

        Only the first 100 issues and 100 PRs are returned (single API
        page). The folder/pattern arguments are part of the BaseSource
        interface and are ignored here.
        """
        import requests

        files: List[SourceFile] = []

        # README (existence check only; content is fetched in download()).
        resp = requests.get(
            f"{API_BASE}/repos/{self.repo}/readme", headers=self._headers(), timeout=15
        )
        if resp.ok:
            files.append(SourceFile(name="README", id="readme", mime_type="text/markdown"))

        # Issues (the issues endpoint also returns PRs; filter those out).
        if self.include_issues:
            resp = requests.get(
                f"{API_BASE}/repos/{self.repo}/issues",
                headers=self._headers(),
                params={"state": "all", "per_page": 100},
                timeout=15,
            )
            if resp.ok:
                for issue in resp.json():
                    if "pull_request" not in issue:
                        files.append(
                            SourceFile(
                                name=f"Issue #{issue['number']}: {issue['title']}",
                                id=f"issue:{issue['number']}",
                                mime_type="text/plain",
                            )
                        )

        # Pull requests.
        if self.include_prs:
            resp = requests.get(
                f"{API_BASE}/repos/{self.repo}/pulls",
                headers=self._headers(),
                params={"state": "all", "per_page": 100},
                timeout=15,
            )
            if resp.ok:
                for pr in resp.json():
                    files.append(
                        SourceFile(
                            name=f"PR #{pr['number']}: {pr['title']}",
                            id=f"pr:{pr['number']}",
                            mime_type="text/plain",
                        )
                    )

        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a single document (README, issue, or PR) as text.

        Raises
        ------
        requests.HTTPError
            On an API failure (previously an error page was silently
            written as the document content).
        ValueError
            For a SourceFile id this source did not issue.
        """
        import requests

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        if file.id == "readme":
            resp = requests.get(
                f"{API_BASE}/repos/{self.repo}/readme",
                headers={**self._headers(), "Accept": "application/vnd.github.v3.raw"},
                timeout=15,
            )
            resp.raise_for_status()
            destination.write_text(resp.text, encoding="utf-8")
        elif file.id.startswith("issue:"):
            num = file.id.split(":", 1)[1]
            resp = requests.get(
                f"{API_BASE}/repos/{self.repo}/issues/{num}",
                headers=self._headers(),
                timeout=15,
            )
            resp.raise_for_status()
            data = resp.json()
            text = f"# {data['title']}\n\n{data.get('body', '') or ''}"
            # Append comments (best effort: silently skipped on failure).
            comments_resp = requests.get(data["comments_url"], headers=self._headers(), timeout=15)
            if comments_resp.ok:
                for c in comments_resp.json():
                    text += f"\n\n---\n**{c['user']['login']}**: {c.get('body', '')}"
            destination.write_text(text, encoding="utf-8")
        elif file.id.startswith("pr:"):
            num = file.id.split(":", 1)[1]
            resp = requests.get(
                f"{API_BASE}/repos/{self.repo}/pulls/{num}",
                headers=self._headers(),
                timeout=15,
            )
            resp.raise_for_status()
            data = resp.json()
            text = f"# PR: {data['title']}\n\n{data.get('body', '') or ''}"
            destination.write_text(text, encoding="utf-8")
        else:
            # Previously an unknown id fell through, logged "Downloaded",
            # and returned a path that was never written. Fail loudly.
            raise ValueError(f"Unknown file id for GitHubSource: {file.id}")

        logger.info("Downloaded %s to %s", file.name, destination)
        return destination
| --- a/video_processor/sources/google_keep_source.py | ||
| +++ b/video_processor/sources/google_keep_source.py | ||
| @@ -0,0 +1,170 @@ | ||
| 1 | +"""Google Keep source connector using the gws CLI (googleworkspace/cli). | |
| 2 | + | |
| 3 | +Fetches notes from Google Keep via the `gws` CLI tool. | |
| 4 | +Outputs plain text suitable for KG ingestion. | |
| 5 | + | |
| 6 | +Requires: npm install -g @googleworkspace/cli | |
| 7 | +Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless) | |
| 8 | +""" | |
| 9 | + | |
| 10 | +import json | |
| 11 | +import logging | |
| 12 | +import shutil | |
| 13 | +import subprocess | |
| 14 | +from pathlib import Path | |
| 15 | +from typing import Any, Dict, List, Optional | |
| 16 | + | |
| 17 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 18 | + | |
| 19 | +logger = logging.getLogger(__name__) | |
| 20 | + | |
| 21 | + | |
| 22 | +def _run_gws(args: List[str], timeout: int = 30) -> Dict[str, Any]: | |
| 23 | + """Run a gws CLI command and return parsed JSON output.""" | |
| 24 | + cmd = ["gws"] + args | |
| 25 | + proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) | |
| 26 | + if proc.returncode != 0: | |
| 27 | + raise RuntimeError(f"gws {' '.join(args)} failed: {proc.stderr.strip()}") | |
| 28 | + try: | |
| 29 | + return json.loads(proc.stdout) | |
| 30 | + except json.JSONDecodeError: | |
| 31 | + return {"raw": proc.stdout.strip()} | |
| 32 | + | |
| 33 | + | |
| 34 | +def _note_to_text(note: dict) -> str: | |
| 35 | + """Extract text content from a Google Keep note structure. | |
| 36 | + | |
| 37 | + Handles plain text notes and checklists. Checklist items are formatted | |
| 38 | + as ``- [x] item`` (checked) or ``- [ ] item`` (unchecked). | |
| 39 | + """ | |
| 40 | + parts: List[str] = [] | |
| 41 | + | |
| 42 | + title = note.get("title", "").strip() | |
| 43 | + if title: | |
| 44 | + parts.append(title) | |
| 45 | + | |
| 46 | + body = note.get("body", note.get("textContent", "")).strip() | |
| 47 | + if body: | |
| 48 | + parts.append(body) | |
| 49 | + | |
| 50 | + # Checklist items may appear under "list", "listContent", or "checklistItems" | |
| 51 | + list_items = note.get("list", note.get("listContent", note.get("checklistItems", []))) | |
| 52 | + if isinstance(list_items, list): | |
| 53 | + for item in list_items: | |
| 54 | + text = item.get("text", "").strip() | |
| 55 | + if not text: | |
| 56 | + continue | |
| 57 | + checked = item.get("checked", item.get("isChecked", False)) | |
| 58 | + marker = "[x]" if checked else "[ ]" | |
| 59 | + parts.append(f"- {marker} {text}") | |
| 60 | + | |
| 61 | + return "\n\n".join(parts) if parts else "" | |
| 62 | + | |
| 63 | + | |
class GoogleKeepSource(BaseSource):
    """
    Fetch notes from Google Keep via the gws CLI.

    Usage:
        source = GoogleKeepSource()                  # all notes
        source = GoogleKeepSource(label="meetings")  # filter by label
        files = source.list_videos()
        source.download_all(files, Path("./notes"))
    """

    def __init__(self, label: Optional[str] = None):
        # Optional Keep label used to filter list_videos() results.
        self.label = label

    def authenticate(self) -> bool:
        """Check if gws CLI is installed and authenticated."""
        if not shutil.which("gws"):
            logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
            return False
        try:
            _run_gws(["auth", "status"], timeout=10)
            return True
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("gws not authenticated. Run: gws auth login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List notes in Google Keep, returning one SourceFile per note.

        The folder/pattern parameters exist to satisfy the BaseSource
        interface and are ignored here.
        """
        args = ["keep", "notes", "list", "--output", "json"]

        if self.label:
            args.extend(["--label", self.label])

        try:
            result = _run_gws(args, timeout=60)
        except RuntimeError as e:
            logger.error(f"Failed to list Keep notes: {e}")
            return []

        # The CLI may emit a bare JSON array, an object wrapping the list
        # under "notes"/"items", or a single note object.
        notes: List[dict] = []
        if isinstance(result, list):
            notes = result
        elif isinstance(result, dict):
            wrapped = result.get("notes", result.get("items", []))
            # Defensive: normalize a null / non-list payload to [] instead
            # of crashing in the loop below.
            notes = wrapped if isinstance(wrapped, list) else []
            # If we got a single note back (not a list), wrap it
            if not notes and "id" in result and "raw" not in result:
                notes = [result]

        files: List[SourceFile] = []
        for note in notes:
            if not isinstance(note, dict):
                continue  # skip malformed entries rather than crashing
            note_id = note.get("id", note.get("noteId", ""))
            # `or` guards against an explicit JSON null title, which would
            # otherwise raise AttributeError on .strip().
            title = (note.get("title") or "Untitled Note").strip() or "Untitled Note"
            modified = note.get("modifiedTime", note.get("updateTime"))

            # Estimate size from text content
            text = _note_to_text(note)
            size = len(text.encode("utf-8")) if text else None

            files.append(
                SourceFile(
                    name=title,
                    id=str(note_id),
                    size_bytes=size,
                    mime_type="text/plain",
                    modified_at=modified,
                )
            )

        logger.info(f"Found {len(files)} note(s) in Google Keep")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a Keep note's content as a text file.

        Raises
        ------
        RuntimeError
            If the gws CLI call fails (wraps the underlying error).
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = _run_gws(
                [
                    "keep",
                    "notes",
                    "get",
                    "--params",
                    json.dumps({"noteId": file.id}),
                ],
                timeout=30,
            )
        except RuntimeError as e:
            raise RuntimeError(f"Failed to fetch Keep note {file.id}: {e}") from e

        # result may be the note dict directly or wrapped
        note = result if isinstance(result, dict) else {}
        text = _note_to_text(note)

        if not text:
            # Fallback: use raw output if structured extraction yielded nothing
            text = note.get("raw", json.dumps(note, indent=2))

        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved note '{file.name}' to {destination}")
        return destination
| --- a/video_processor/sources/google_keep_source.py | |
| +++ b/video_processor/sources/google_keep_source.py | |
| @@ -0,0 +1,170 @@ | |
| --- a/video_processor/sources/google_keep_source.py | |
| +++ b/video_processor/sources/google_keep_source.py | |
| @@ -0,0 +1,170 @@ | |
| 1 | """Google Keep source connector using the gws CLI (googleworkspace/cli). |
| 2 | |
| 3 | Fetches notes from Google Keep via the `gws` CLI tool. |
| 4 | Outputs plain text suitable for KG ingestion. |
| 5 | |
| 6 | Requires: npm install -g @googleworkspace/cli |
| 7 | Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless) |
| 8 | """ |
| 9 | |
| 10 | import json |
| 11 | import logging |
| 12 | import shutil |
| 13 | import subprocess |
| 14 | from pathlib import Path |
| 15 | from typing import Any, Dict, List, Optional |
| 16 | |
| 17 | from video_processor.sources.base import BaseSource, SourceFile |
| 18 | |
| 19 | logger = logging.getLogger(__name__) |
| 20 | |
| 21 | |
def _run_gws(args: List[str], timeout: int = 30) -> Any:
    """Run a gws CLI command and return its parsed JSON output.

    Parameters
    ----------
    args : List[str]
        Arguments passed to the ``gws`` binary (e.g. ``["keep", "notes", "list"]``).
    timeout : int
        Seconds to wait before ``subprocess.TimeoutExpired`` is raised.

    Returns
    -------
    Any
        The decoded JSON value. The CLI may emit either a JSON object or a
        JSON array, so the return type is deliberately loose — callers
        already branch on ``isinstance(result, list)``. When stdout is not
        valid JSON, the raw text is wrapped as ``{"raw": "..."}``.

    Raises
    ------
    RuntimeError
        If the command exits non-zero; stderr is included in the message.
    subprocess.TimeoutExpired
        If the command does not finish within *timeout* seconds.
    """
    cmd = ["gws"] + args
    # List form (shell=False): args are never interpreted by a shell.
    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    if proc.returncode != 0:
        raise RuntimeError(f"gws {' '.join(args)} failed: {proc.stderr.strip()}")
    try:
        return json.loads(proc.stdout)
    except json.JSONDecodeError:
        return {"raw": proc.stdout.strip()}
| 32 | |
| 33 | |
| 34 | def _note_to_text(note: dict) -> str: |
| 35 | """Extract text content from a Google Keep note structure. |
| 36 | |
| 37 | Handles plain text notes and checklists. Checklist items are formatted |
| 38 | as ``- [x] item`` (checked) or ``- [ ] item`` (unchecked). |
| 39 | """ |
| 40 | parts: List[str] = [] |
| 41 | |
| 42 | title = note.get("title", "").strip() |
| 43 | if title: |
| 44 | parts.append(title) |
| 45 | |
| 46 | body = note.get("body", note.get("textContent", "")).strip() |
| 47 | if body: |
| 48 | parts.append(body) |
| 49 | |
| 50 | # Checklist items may appear under "list", "listContent", or "checklistItems" |
| 51 | list_items = note.get("list", note.get("listContent", note.get("checklistItems", []))) |
| 52 | if isinstance(list_items, list): |
| 53 | for item in list_items: |
| 54 | text = item.get("text", "").strip() |
| 55 | if not text: |
| 56 | continue |
| 57 | checked = item.get("checked", item.get("isChecked", False)) |
| 58 | marker = "[x]" if checked else "[ ]" |
| 59 | parts.append(f"- {marker} {text}") |
| 60 | |
| 61 | return "\n\n".join(parts) if parts else "" |
| 62 | |
| 63 | |
class GoogleKeepSource(BaseSource):
    """
    Fetch notes from Google Keep via the gws CLI.

    Usage:
        source = GoogleKeepSource()                  # all notes
        source = GoogleKeepSource(label="meetings")  # filter by label
        files = source.list_videos()
        source.download_all(files, Path("./notes"))
    """

    def __init__(self, label: Optional[str] = None):
        # Optional Keep label used to filter list_videos() results.
        self.label = label

    def authenticate(self) -> bool:
        """Check if gws CLI is installed and authenticated."""
        if not shutil.which("gws"):
            logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
            return False
        try:
            _run_gws(["auth", "status"], timeout=10)
            return True
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("gws not authenticated. Run: gws auth login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List notes in Google Keep, returning one SourceFile per note.

        The folder/pattern parameters exist to satisfy the BaseSource
        interface and are ignored here.
        """
        args = ["keep", "notes", "list", "--output", "json"]

        if self.label:
            args.extend(["--label", self.label])

        try:
            result = _run_gws(args, timeout=60)
        except RuntimeError as e:
            logger.error(f"Failed to list Keep notes: {e}")
            return []

        # The CLI may emit a bare JSON array, an object wrapping the list
        # under "notes"/"items", or a single note object.
        notes: List[dict] = []
        if isinstance(result, list):
            notes = result
        elif isinstance(result, dict):
            wrapped = result.get("notes", result.get("items", []))
            # Defensive: normalize a null / non-list payload to [] instead
            # of crashing in the loop below.
            notes = wrapped if isinstance(wrapped, list) else []
            # If we got a single note back (not a list), wrap it
            if not notes and "id" in result and "raw" not in result:
                notes = [result]

        files: List[SourceFile] = []
        for note in notes:
            if not isinstance(note, dict):
                continue  # skip malformed entries rather than crashing
            note_id = note.get("id", note.get("noteId", ""))
            # `or` guards against an explicit JSON null title, which would
            # otherwise raise AttributeError on .strip().
            title = (note.get("title") or "Untitled Note").strip() or "Untitled Note"
            modified = note.get("modifiedTime", note.get("updateTime"))

            # Estimate size from text content
            text = _note_to_text(note)
            size = len(text.encode("utf-8")) if text else None

            files.append(
                SourceFile(
                    name=title,
                    id=str(note_id),
                    size_bytes=size,
                    mime_type="text/plain",
                    modified_at=modified,
                )
            )

        logger.info(f"Found {len(files)} note(s) in Google Keep")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a Keep note's content as a text file.

        Raises
        ------
        RuntimeError
            If the gws CLI call fails (wraps the underlying error).
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = _run_gws(
                [
                    "keep",
                    "notes",
                    "get",
                    "--params",
                    json.dumps({"noteId": file.id}),
                ],
                timeout=30,
            )
        except RuntimeError as e:
            raise RuntimeError(f"Failed to fetch Keep note {file.id}: {e}") from e

        # result may be the note dict directly or wrapped
        note = result if isinstance(result, dict) else {}
        text = _note_to_text(note)

        if not text:
            # Fallback: use raw output if structured extraction yielded nothing
            text = note.get("raw", json.dumps(note, indent=2))

        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved note '{file.name}' to {destination}")
        return destination
| --- a/video_processor/sources/gws_source.py | ||
| +++ b/video_processor/sources/gws_source.py | ||
| @@ -0,0 +1,268 @@ | ||
| 1 | +"""Google Workspace source connector using the gws CLI (googleworkspace/cli). | |
| 2 | + | |
| 3 | +Fetches and collates Google Docs, Sheets, Slides, and other Drive files | |
| 4 | +via the `gws` CLI tool. Outputs plain text suitable for KG ingestion. | |
| 5 | + | |
| 6 | +Requires: npm install -g @googleworkspace/cli | |
| 7 | +Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless) | |
| 8 | +""" | |
| 9 | + | |
| 10 | +import json | |
| 11 | +import logging | |
| 12 | +import shutil | |
| 13 | +import subprocess | |
| 14 | +from pathlib import Path | |
| 15 | +from typing import Any, Dict, List, Optional | |
| 16 | + | |
| 17 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 18 | + | |
| 19 | +logger = logging.getLogger(__name__) | |
| 20 | + | |
# Google Workspace MIME types we can extract text from.
# Native Google formats (document/spreadsheet/presentation) are exported via
# the Drive export endpoint; the remaining types are downloaded directly
# (see GWSSource.download).
_DOC_MIMES = {
    "application/vnd.google-apps.document",
    "application/vnd.google-apps.spreadsheet",
    "application/vnd.google-apps.presentation",
    "application/pdf",
    "text/plain",
    "text/markdown",
    "text/html",
}

# Export MIME mappings for native Google formats:
# source MIME -> MIME type requested from the Drive export endpoint.
_EXPORT_MIMES = {
    "application/vnd.google-apps.document": "text/plain",
    "application/vnd.google-apps.spreadsheet": "text/csv",
    "application/vnd.google-apps.presentation": "text/plain",
}
| 38 | + | |
| 39 | + | |
def _run_gws(args: List[str], timeout: int = 30) -> Any:
    """Run a gws CLI command and return its parsed JSON output.

    Parameters
    ----------
    args : List[str]
        Arguments passed to the ``gws`` binary.
    timeout : int
        Seconds to wait before ``subprocess.TimeoutExpired`` is raised.

    Returns
    -------
    Any
        The decoded JSON value. ``json.loads`` may produce an object or an
        array depending on what the CLI emits, so the return type is
        deliberately loose rather than ``Dict[str, Any]``. When stdout is
        not valid JSON, the raw text is wrapped as ``{"raw": "..."}``.

    Raises
    ------
    RuntimeError
        If the command exits non-zero; stderr is included in the message.
    subprocess.TimeoutExpired
        If the command does not finish within *timeout* seconds.
    """
    cmd = ["gws"] + args
    # List form (shell=False): args are never interpreted by a shell.
    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    if proc.returncode != 0:
        raise RuntimeError(f"gws {' '.join(args)} failed: {proc.stderr.strip()}")
    try:
        return json.loads(proc.stdout)
    except json.JSONDecodeError:
        return {"raw": proc.stdout.strip()}
| 50 | + | |
| 51 | + | |
class GWSSource(BaseSource):
    """
    Fetch documents from Google Workspace (Drive, Docs, Sheets, Slides) via gws CLI.

    Usage:
        source = GWSSource(folder_id="1abc...")   # specific Drive folder
        source = GWSSource(query="type:document") # Drive search query
        files = source.list_videos()              # lists docs, not just videos
        source.download_all(files, Path("./docs"))
    """

    def __init__(
        self,
        folder_id: Optional[str] = None,
        query: Optional[str] = None,
        doc_ids: Optional[List[str]] = None,
        mime_filter: Optional[List[str]] = None,
    ):
        # Drive folder to scope listings to (optional).
        self.folder_id = folder_id
        # Extra Drive search clause appended to the listing query (optional).
        self.query = query
        # Explicit file IDs: when set, list_videos() fetches exactly these.
        self.doc_ids = doc_ids or []
        # MIME types to include; defaults to every type we can extract.
        self.mime_filter = set(mime_filter) if mime_filter else _DOC_MIMES

    def authenticate(self) -> bool:
        """Return True when the gws CLI is installed and logged in."""
        if shutil.which("gws") is None:
            logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
            return False
        try:
            _run_gws(["auth", "status"], timeout=10)
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("gws not authenticated. Run: gws auth login")
            return False
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List documents in Drive. Despite the method name, returns docs not just videos."""
        target_folder = folder_id or self.folder_id

        # Explicit doc IDs: fetch metadata for each and return early.
        if self.doc_ids:
            found: List[SourceFile] = []
            for doc_id in self.doc_ids:
                get_params = {"fileId": doc_id, "fields": "id,name,mimeType,size,modifiedTime"}
                try:
                    meta = _run_gws(
                        ["drive", "files", "get", "--params", json.dumps(get_params)]
                    )
                except RuntimeError as e:
                    logger.warning(f"Failed to fetch doc {doc_id}: {e}")
                else:
                    found.append(_result_to_source_file(meta))
            return found

        # Otherwise build a Drive files.list query.
        list_params: Dict[str, Any] = {
            "pageSize": 100,
            "fields": "files(id,name,mimeType,size,modifiedTime)",
        }
        clauses = []
        if target_folder:
            clauses.append(f"'{target_folder}' in parents")
        if self.query:
            clauses.append(self.query)
        # Restrict results to the configured document MIME types.
        mime_clauses = [f"mimeType='{m}'" for m in self.mime_filter]
        if mime_clauses:
            clauses.append(f"({' or '.join(mime_clauses)})")
        if clauses:
            list_params["q"] = " and ".join(clauses)

        try:
            listing = _run_gws(
                ["drive", "files", "list", "--params", json.dumps(list_params)],
                timeout=60,
            )
        except RuntimeError as e:
            logger.error(f"Failed to list Drive files: {e}")
            return []

        docs = [_result_to_source_file(entry) for entry in listing.get("files", [])]
        logger.info(f"Found {len(docs)} document(s) in Google Drive")
        return docs

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download/export a document to a local text file."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        mime = file.mime_type or ""
        # Native Google formats must be exported; anything else downloads as-is.
        if mime in _EXPORT_MIMES:
            content = self._export_doc(file.id, mime)
        else:
            content = self._download_file(file.id)

        destination.write_text(content, encoding="utf-8")
        logger.info(f"Saved {file.name} to {destination}")
        return destination

    def _export_doc(self, file_id: str, source_mime: str) -> str:
        """Export a native Google doc to text via gws."""
        target_mime = _EXPORT_MIMES.get(source_mime, "text/plain")
        export_args = [
            "drive",
            "files",
            "export",
            "--params",
            json.dumps({"fileId": file_id, "mimeType": target_mime}),
        ]
        try:
            payload = _run_gws(export_args, timeout=60)
        except RuntimeError:
            # For Google Docs we can fall back to the Docs API; otherwise re-raise.
            if source_mime != "application/vnd.google-apps.document":
                raise
            return self._get_doc_text(file_id)
        return payload.get("raw", json.dumps(payload, indent=2))

    def _get_doc_text(self, doc_id: str) -> str:
        """Fetch Google Doc content via the Docs API and extract its text runs."""
        doc = _run_gws(
            [
                "docs",
                "documents",
                "get",
                "--params",
                json.dumps({"documentId": doc_id}),
            ],
            timeout=60,
        )

        # Walk the structural response: body -> content -> paragraph -> elements -> textRun.
        runs = [
            pe.get("textRun", {}).get("content", "")
            for element in doc.get("body", {}).get("content", [])
            for pe in element.get("paragraph", {}).get("elements", [])
        ]
        kept = [t for t in runs if t.strip()]
        return "".join(kept) if kept else json.dumps(doc, indent=2)

    def _download_file(self, file_id: str) -> str:
        """Download a non-native file's content (alt=media)."""
        payload = _run_gws(
            [
                "drive",
                "files",
                "get",
                "--params",
                json.dumps({"fileId": file_id, "alt": "media"}),
            ],
            timeout=60,
        )
        return payload.get("raw", json.dumps(payload, indent=2))

    def fetch_all_text(self, folder_id: Optional[str] = None) -> Dict[str, str]:
        """Convenience: list all docs and return {filename: text_content} dict."""
        collected: Dict[str, str] = {}
        for doc in self.list_videos(folder_id=folder_id):
            try:
                if doc.mime_type and doc.mime_type in _EXPORT_MIMES:
                    text = self._export_doc(doc.id, doc.mime_type)
                else:
                    text = self._download_file(doc.id)
            except Exception as e:
                logger.warning(f"Failed to fetch {doc.name}: {e}")
                text = f"[Error: {e}]"
            collected[doc.name] = text
        return collected

    def collate(self, folder_id: Optional[str] = None, separator: str = "\n\n---\n\n") -> str:
        """Fetch all docs and collate into a single text blob for ingestion."""
        docs = self.fetch_all_text(folder_id=folder_id)
        return separator.join(f"# {name}\n\n{content}" for name, content in docs.items())
| 257 | + | |
| 258 | + | |
def _result_to_source_file(item: dict) -> SourceFile:
    """Convert a Drive API file resource dict to a SourceFile."""
    raw_size = item.get("size")
    # Drive reports size as a string; falsy (missing/empty) means unknown.
    size_bytes = int(raw_size) if raw_size else None
    return SourceFile(
        name=item.get("name", "Untitled"),
        id=item.get("id", ""),
        size_bytes=size_bytes,
        mime_type=item.get("mimeType"),
        modified_at=item.get("modifiedTime"),
    )
| --- a/video_processor/sources/gws_source.py | |
| +++ b/video_processor/sources/gws_source.py | |
| @@ -0,0 +1,268 @@ | |
| --- a/video_processor/sources/gws_source.py | |
| +++ b/video_processor/sources/gws_source.py | |
| @@ -0,0 +1,268 @@ | |
| 1 | """Google Workspace source connector using the gws CLI (googleworkspace/cli). |
| 2 | |
| 3 | Fetches and collates Google Docs, Sheets, Slides, and other Drive files |
| 4 | via the `gws` CLI tool. Outputs plain text suitable for KG ingestion. |
| 5 | |
| 6 | Requires: npm install -g @googleworkspace/cli |
| 7 | Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless) |
| 8 | """ |
| 9 | |
| 10 | import json |
| 11 | import logging |
| 12 | import shutil |
| 13 | import subprocess |
| 14 | from pathlib import Path |
| 15 | from typing import Any, Dict, List, Optional |
| 16 | |
| 17 | from video_processor.sources.base import BaseSource, SourceFile |
| 18 | |
| 19 | logger = logging.getLogger(__name__) |
| 20 | |
# Google Workspace MIME types we can extract text from.
# Native Google formats (document/spreadsheet/presentation) are exported via
# the Drive export endpoint; the remaining types are downloaded directly
# (see GWSSource.download).
_DOC_MIMES = {
    "application/vnd.google-apps.document",
    "application/vnd.google-apps.spreadsheet",
    "application/vnd.google-apps.presentation",
    "application/pdf",
    "text/plain",
    "text/markdown",
    "text/html",
}

# Export MIME mappings for native Google formats:
# source MIME -> MIME type requested from the Drive export endpoint.
_EXPORT_MIMES = {
    "application/vnd.google-apps.document": "text/plain",
    "application/vnd.google-apps.spreadsheet": "text/csv",
    "application/vnd.google-apps.presentation": "text/plain",
}
| 38 | |
| 39 | |
def _run_gws(args: List[str], timeout: int = 30) -> Any:
    """Run a gws CLI command and return its parsed JSON output.

    Parameters
    ----------
    args : List[str]
        Arguments passed to the ``gws`` binary.
    timeout : int
        Seconds to wait before ``subprocess.TimeoutExpired`` is raised.

    Returns
    -------
    Any
        The decoded JSON value. ``json.loads`` may produce an object or an
        array depending on what the CLI emits, so the return type is
        deliberately loose rather than ``Dict[str, Any]``. When stdout is
        not valid JSON, the raw text is wrapped as ``{"raw": "..."}``.

    Raises
    ------
    RuntimeError
        If the command exits non-zero; stderr is included in the message.
    subprocess.TimeoutExpired
        If the command does not finish within *timeout* seconds.
    """
    cmd = ["gws"] + args
    # List form (shell=False): args are never interpreted by a shell.
    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    if proc.returncode != 0:
        raise RuntimeError(f"gws {' '.join(args)} failed: {proc.stderr.strip()}")
    try:
        return json.loads(proc.stdout)
    except json.JSONDecodeError:
        return {"raw": proc.stdout.strip()}
| 50 | |
| 51 | |
class GWSSource(BaseSource):
    """
    Fetch documents from Google Workspace (Drive, Docs, Sheets, Slides) via gws CLI.

    Usage:
        source = GWSSource(folder_id="1abc...")   # specific Drive folder
        source = GWSSource(query="type:document") # Drive search query
        files = source.list_videos()              # lists docs, not just videos
        source.download_all(files, Path("./docs"))
    """

    def __init__(
        self,
        folder_id: Optional[str] = None,
        query: Optional[str] = None,
        doc_ids: Optional[List[str]] = None,
        mime_filter: Optional[List[str]] = None,
    ):
        # Drive folder to scope listings to (optional).
        self.folder_id = folder_id
        # Extra Drive search clause appended to the listing query (optional).
        self.query = query
        # Explicit file IDs: when set, list_videos() fetches exactly these.
        self.doc_ids = doc_ids or []
        # MIME types to include; defaults to every type we can extract.
        self.mime_filter = set(mime_filter) if mime_filter else _DOC_MIMES

    def authenticate(self) -> bool:
        """Return True when the gws CLI is installed and logged in."""
        if shutil.which("gws") is None:
            logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
            return False
        try:
            _run_gws(["auth", "status"], timeout=10)
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("gws not authenticated. Run: gws auth login")
            return False
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List documents in Drive. Despite the method name, returns docs not just videos."""
        target_folder = folder_id or self.folder_id

        # Explicit doc IDs: fetch metadata for each and return early.
        if self.doc_ids:
            found: List[SourceFile] = []
            for doc_id in self.doc_ids:
                get_params = {"fileId": doc_id, "fields": "id,name,mimeType,size,modifiedTime"}
                try:
                    meta = _run_gws(
                        ["drive", "files", "get", "--params", json.dumps(get_params)]
                    )
                except RuntimeError as e:
                    logger.warning(f"Failed to fetch doc {doc_id}: {e}")
                else:
                    found.append(_result_to_source_file(meta))
            return found

        # Otherwise build a Drive files.list query.
        list_params: Dict[str, Any] = {
            "pageSize": 100,
            "fields": "files(id,name,mimeType,size,modifiedTime)",
        }
        clauses = []
        if target_folder:
            clauses.append(f"'{target_folder}' in parents")
        if self.query:
            clauses.append(self.query)
        # Restrict results to the configured document MIME types.
        mime_clauses = [f"mimeType='{m}'" for m in self.mime_filter]
        if mime_clauses:
            clauses.append(f"({' or '.join(mime_clauses)})")
        if clauses:
            list_params["q"] = " and ".join(clauses)

        try:
            listing = _run_gws(
                ["drive", "files", "list", "--params", json.dumps(list_params)],
                timeout=60,
            )
        except RuntimeError as e:
            logger.error(f"Failed to list Drive files: {e}")
            return []

        docs = [_result_to_source_file(entry) for entry in listing.get("files", [])]
        logger.info(f"Found {len(docs)} document(s) in Google Drive")
        return docs

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download/export a document to a local text file."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        mime = file.mime_type or ""
        # Native Google formats must be exported; anything else downloads as-is.
        if mime in _EXPORT_MIMES:
            content = self._export_doc(file.id, mime)
        else:
            content = self._download_file(file.id)

        destination.write_text(content, encoding="utf-8")
        logger.info(f"Saved {file.name} to {destination}")
        return destination

    def _export_doc(self, file_id: str, source_mime: str) -> str:
        """Export a native Google doc to text via gws."""
        target_mime = _EXPORT_MIMES.get(source_mime, "text/plain")
        export_args = [
            "drive",
            "files",
            "export",
            "--params",
            json.dumps({"fileId": file_id, "mimeType": target_mime}),
        ]
        try:
            payload = _run_gws(export_args, timeout=60)
        except RuntimeError:
            # For Google Docs we can fall back to the Docs API; otherwise re-raise.
            if source_mime != "application/vnd.google-apps.document":
                raise
            return self._get_doc_text(file_id)
        return payload.get("raw", json.dumps(payload, indent=2))

    def _get_doc_text(self, doc_id: str) -> str:
        """Fetch Google Doc content via the Docs API and extract its text runs."""
        doc = _run_gws(
            [
                "docs",
                "documents",
                "get",
                "--params",
                json.dumps({"documentId": doc_id}),
            ],
            timeout=60,
        )

        # Walk the structural response: body -> content -> paragraph -> elements -> textRun.
        runs = [
            pe.get("textRun", {}).get("content", "")
            for element in doc.get("body", {}).get("content", [])
            for pe in element.get("paragraph", {}).get("elements", [])
        ]
        kept = [t for t in runs if t.strip()]
        return "".join(kept) if kept else json.dumps(doc, indent=2)

    def _download_file(self, file_id: str) -> str:
        """Download a non-native file's content (alt=media)."""
        payload = _run_gws(
            [
                "drive",
                "files",
                "get",
                "--params",
                json.dumps({"fileId": file_id, "alt": "media"}),
            ],
            timeout=60,
        )
        return payload.get("raw", json.dumps(payload, indent=2))

    def fetch_all_text(self, folder_id: Optional[str] = None) -> Dict[str, str]:
        """Convenience: list all docs and return {filename: text_content} dict."""
        collected: Dict[str, str] = {}
        for doc in self.list_videos(folder_id=folder_id):
            try:
                if doc.mime_type and doc.mime_type in _EXPORT_MIMES:
                    text = self._export_doc(doc.id, doc.mime_type)
                else:
                    text = self._download_file(doc.id)
            except Exception as e:
                logger.warning(f"Failed to fetch {doc.name}: {e}")
                text = f"[Error: {e}]"
            collected[doc.name] = text
        return collected

    def collate(self, folder_id: Optional[str] = None, separator: str = "\n\n---\n\n") -> str:
        """Fetch all docs and collate into a single text blob for ingestion."""
        docs = self.fetch_all_text(folder_id=folder_id)
        return separator.join(f"# {name}\n\n{content}" for name, content in docs.items())
| 257 | |
| 258 | |
def _result_to_source_file(item: dict) -> SourceFile:
    """Convert a Drive API file resource dict to a SourceFile."""
    raw_size = item.get("size")
    # Drive reports size as a string; falsy (missing/empty) means unknown.
    size_bytes = int(raw_size) if raw_size else None
    return SourceFile(
        name=item.get("name", "Untitled"),
        id=item.get("id", ""),
        size_bytes=size_bytes,
        mime_type=item.get("mimeType"),
        modified_at=item.get("modifiedTime"),
    )
| --- a/video_processor/sources/hackernews_source.py | ||
| +++ b/video_processor/sources/hackernews_source.py | ||
| @@ -0,0 +1,112 @@ | ||
| 1 | +"""Hacker News source connector using the official Firebase API.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +from pathlib import Path | |
| 5 | +from typing import List, Optional | |
| 6 | + | |
| 7 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 8 | + | |
| 9 | +logger = logging.getLogger(__name__) | |
| 10 | + | |
| 11 | +HN_API = "https://hacker-news.firebaseio.com/v0" | |
| 12 | + | |
| 13 | + | |
| 14 | +class HackerNewsSource(BaseSource): | |
| 15 | + """ | |
| 16 | + Fetch Hacker News stories and comments via the public API. | |
| 17 | + | |
| 18 | + API docs: https://github.com/HackerNews/API | |
| 19 | + Requires: pip install requests | |
| 20 | + """ | |
| 21 | + | |
| 22 | + def __init__(self, item_id: int, max_comments: int = 200): | |
| 23 | + """ | |
| 24 | + Parameters | |
| 25 | + ---------- | |
| 26 | + item_id : int | |
| 27 | + HN story/item ID (e.g., 12345678). | |
| 28 | + max_comments : int | |
| 29 | + Maximum number of comments to fetch (default 200). | |
| 30 | + """ | |
| 31 | + self.item_id = item_id | |
| 32 | + self.max_comments = max_comments | |
| 33 | + | |
| 34 | + def authenticate(self) -> bool: | |
| 35 | + """No auth needed for the HN API.""" | |
| 36 | + return True | |
| 37 | + | |
| 38 | + def list_videos( | |
| 39 | + self, | |
| 40 | + folder_id: Optional[str] = None, | |
| 41 | + folder_path: Optional[str] = None, | |
| 42 | + patterns: Optional[List[str]] = None, | |
| 43 | + ) -> List[SourceFile]: | |
| 44 | + """Return a single SourceFile for the HN story.""" | |
| 45 | + return [ | |
| 46 | + SourceFile( | |
| 47 | + name=f"hn_{self.item_id}", | |
| 48 | + id=str(self.item_id), | |
| 49 | + mime_type="text/plain", | |
| 50 | + ) | |
| 51 | + ] | |
| 52 | + | |
| 53 | + def download(self, file: SourceFile, destination: Path) -> Path: | |
| 54 | + """Download the story and comments as plain text.""" | |
| 55 | + destination = Path(destination) | |
| 56 | + destination.parent.mkdir(parents=True, exist_ok=True) | |
| 57 | + text = self.fetch_text() | |
| 58 | + destination.write_text(text, encoding="utf-8") | |
| 59 | + logger.info(f"Saved HN story {self.item_id} to {destination}") | |
| 60 | + return destination | |
| 61 | + | |
| 62 | + def _get_item(self, item_id: int) -> dict: | |
| 63 | + import requests | |
| 64 | + | |
| 65 | + resp = requests.get(f"{HN_API}/item/{item_id}.json", timeout=10) | |
| 66 | + resp.raise_for_status() | |
| 67 | + return resp.json() or {} | |
| 68 | + | |
| 69 | + def fetch_text(self) -> str: | |
| 70 | + """Fetch story and comments as structured text.""" | |
| 71 | + story = self._get_item(self.item_id) | |
| 72 | + lines = [] | |
| 73 | + lines.append(f"# {story.get('title', 'Untitled')}") | |
| 74 | + lines.append(f"by {story.get('by', 'unknown')} | {story.get('score', 0)} points") | |
| 75 | + if story.get("url"): | |
| 76 | + lines.append(f"URL: {story['url']}") | |
| 77 | + if story.get("text"): | |
| 78 | + lines.append(f"\n{story['text']}") | |
| 79 | + lines.append("") | |
| 80 | + | |
| 81 | + # Fetch comments | |
| 82 | + kid_ids = story.get("kids", []) | |
| 83 | + if kid_ids: | |
| 84 | + lines.append("## Comments\n") | |
| 85 | + count = [0] | |
| 86 | + self._fetch_comments(kid_ids, lines, depth=0, count=count) | |
| 87 | + | |
| 88 | + return "\n".join(lines) | |
| 89 | + | |
| 90 | + def _fetch_comments(self, kid_ids: list, lines: list, depth: int, count: list) -> None: | |
| 91 | + """Recursively fetch and format comments.""" | |
| 92 | + indent = " " * depth | |
| 93 | + for kid_id in kid_ids: | |
| 94 | + if count[0] >= self.max_comments: | |
| 95 | + return | |
| 96 | + try: | |
| 97 | + item = self._get_item(kid_id) | |
| 98 | + except Exception: | |
| 99 | + continue | |
| 100 | + | |
| 101 | + if item.get("deleted") or item.get("dead"): | |
| 102 | + continue | |
| 103 | + | |
| 104 | + count[0] += 1 | |
| 105 | + author = item.get("by", "[deleted]") | |
| 106 | + text = item.get("text", "") | |
| 107 | + lines.append(f"{indent}**{author}**:") | |
| 108 | + lines.append(f"{indent}{text}") | |
| 109 | + lines.append("") | |
| 110 | + | |
| 111 | + if item.get("kids"): | |
| 112 | + self._fetch_comments(item["kids"], lines, depth + 1, count) |
| --- a/video_processor/sources/hackernews_source.py | |
| +++ b/video_processor/sources/hackernews_source.py | |
| @@ -0,0 +1,112 @@ | |
| --- a/video_processor/sources/hackernews_source.py | |
| +++ b/video_processor/sources/hackernews_source.py | |
| @@ -0,0 +1,112 @@ | |
| 1 | """Hacker News source connector using the official Firebase API.""" |
| 2 | |
| 3 | import logging |
| 4 | from pathlib import Path |
| 5 | from typing import List, Optional |
| 6 | |
| 7 | from video_processor.sources.base import BaseSource, SourceFile |
| 8 | |
| 9 | logger = logging.getLogger(__name__) |
| 10 | |
| 11 | HN_API = "https://hacker-news.firebaseio.com/v0" |
| 12 | |
| 13 | |
class HackerNewsSource(BaseSource):
    """
    Fetch Hacker News stories and comments via the public API.

    API docs: https://github.com/HackerNews/API
    Requires: pip install requests
    """

    def __init__(self, item_id: int, max_comments: int = 200):
        """
        Parameters
        ----------
        item_id : int
            HN story/item ID (e.g., 12345678).
        max_comments : int
            Maximum number of comments to fetch (default 200).
        """
        self.item_id = item_id
        self.max_comments = max_comments

    def authenticate(self) -> bool:
        """No auth needed for the HN API."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile for the HN story."""
        entry = SourceFile(
            name=f"hn_{self.item_id}",
            id=str(self.item_id),
            mime_type="text/plain",
        )
        return [entry]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download the story and comments as plain text."""
        content = self.fetch_text()
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(content, encoding="utf-8")
        logger.info(f"Saved HN story {self.item_id} to {destination}")
        return destination

    def _get_item(self, item_id: int) -> dict:
        """Fetch one HN item as a dict; empty dict if the API returns null."""
        # Imported lazily so the connector can be constructed without requests.
        import requests

        response = requests.get(f"{HN_API}/item/{item_id}.json", timeout=10)
        response.raise_for_status()
        return response.json() or {}

    def fetch_text(self) -> str:
        """Fetch story and comments as structured text."""
        story = self._get_item(self.item_id)
        out: List[str] = [
            f"# {story.get('title', 'Untitled')}",
            f"by {story.get('by', 'unknown')} | {story.get('score', 0)} points",
        ]
        if story.get("url"):
            out.append(f"URL: {story['url']}")
        if story.get("text"):
            out.append(f"\n{story['text']}")
        out.append("")

        # Fetch comments
        top_level = story.get("kids", [])
        if top_level:
            out.append("## Comments\n")
            # Single-element list acts as a shared mutable counter so the
            # max_comments cap applies across the whole recursion.
            self._fetch_comments(top_level, out, depth=0, count=[0])

        return "\n".join(out)

    def _fetch_comments(self, kid_ids: list, lines: list, depth: int, count: list) -> None:
        """Recursively fetch and format comments."""
        indent = " " * depth
        for kid_id in kid_ids:
            if count[0] >= self.max_comments:
                return
            try:
                comment = self._get_item(kid_id)
            except Exception:
                # Skip comments that fail to fetch; best-effort collection.
                continue
            if comment.get("deleted") or comment.get("dead"):
                continue

            count[0] += 1
            who = comment.get("by", "[deleted]")
            body = comment.get("text", "")
            lines.append(f"{indent}**{who}**:")
            lines.append(f"{indent}{body}")
            lines.append("")

            children = comment.get("kids")
            if children:
                self._fetch_comments(children, lines, depth + 1, count)
| --- a/video_processor/sources/logseq_source.py | ||
| +++ b/video_processor/sources/logseq_source.py | ||
| @@ -0,0 +1,200 @@ | ||
| 1 | +"""Logseq graph source connector for ingesting markdown pages and journals.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import re | |
| 5 | +import shutil | |
| 6 | +from datetime import datetime, timezone | |
| 7 | +from pathlib import Path | |
| 8 | +from typing import List, Optional, Tuple | |
| 9 | + | |
| 10 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 11 | + | |
| 12 | +logger = logging.getLogger(__name__) | |
| 13 | + | |
| 14 | + | |
| 15 | +def parse_page(path: Path) -> dict: | |
| 16 | + """Parse a Logseq markdown page and extract structured content. | |
| 17 | + | |
| 18 | + Returns a dict with: | |
| 19 | + - properties: dict of page-level properties (key:: value lines at top) | |
| 20 | + - links: list of linked page names from [[wiki-links]] | |
| 21 | + - tags: list of tags from #tag and #[[tag]] occurrences | |
| 22 | + - block_refs: list of block reference IDs from ((block-id)) | |
| 23 | + - body: full text content | |
| 24 | + """ | |
| 25 | + text = path.read_text(encoding="utf-8") | |
| 26 | + lines = text.split("\n") | |
| 27 | + | |
| 28 | + # Extract page properties (key:: value lines at the top of the file) | |
| 29 | + properties: dict = {} | |
| 30 | + body_start = 0 | |
| 31 | + for i, line in enumerate(lines): | |
| 32 | + prop_match = re.match(r"^([A-Za-z][A-Za-z0-9_-]*)::\ ?(.*)", line) | |
| 33 | + if prop_match: | |
| 34 | + key = prop_match.group(1) | |
| 35 | + value = prop_match.group(2).strip() | |
| 36 | + properties[key] = value | |
| 37 | + body_start = i + 1 | |
| 38 | + else: | |
| 39 | + break | |
| 40 | + | |
| 41 | + body = "\n".join(lines[body_start:]) | |
| 42 | + | |
| 43 | + # Extract wiki-links: [[page]] | |
| 44 | + link_pattern = re.compile(r"\[\[([^\]]+)\]\]") | |
| 45 | + links = link_pattern.findall(body) | |
| 46 | + # Also pick up links from properties | |
| 47 | + for value in properties.values(): | |
| 48 | + links.extend(link_pattern.findall(str(value))) | |
| 49 | + | |
| 50 | + # Extract tags: #tag and #[[tag]] | |
| 51 | + # First get #[[multi word tag]] style | |
| 52 | + bracket_tag_pattern = re.compile(r"#\[\[([^\]]+)\]\]") | |
| 53 | + tags = bracket_tag_pattern.findall(text) | |
| 54 | + # Then get simple #tag style (exclude matches already captured as #[[...]]) | |
| 55 | + # Remove bracket tags first to avoid double-matching | |
| 56 | + text_without_bracket_tags = bracket_tag_pattern.sub("", text) | |
| 57 | + simple_tag_pattern = re.compile(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)") | |
| 58 | + tags.extend(simple_tag_pattern.findall(text_without_bracket_tags)) | |
| 59 | + | |
| 60 | + # Extract block references: ((block-id)) | |
| 61 | + block_ref_pattern = re.compile(r"\(\(([a-f0-9-]+)\)\)") | |
| 62 | + block_refs = block_ref_pattern.findall(text) | |
| 63 | + | |
| 64 | + return { | |
| 65 | + "properties": properties, | |
| 66 | + "links": links, | |
| 67 | + "tags": tags, | |
| 68 | + "block_refs": block_refs, | |
| 69 | + "body": body, | |
| 70 | + } | |
| 71 | + | |
| 72 | + | |
| 73 | +def ingest_graph(graph_path: Path) -> dict: | |
| 74 | + """Ingest an entire Logseq graph and return structured data. | |
| 75 | + | |
| 76 | + Returns a dict with: | |
| 77 | + - notes: list of dicts with name, tags, frontmatter (properties), text | |
| 78 | + - links: list of (source, target) tuples from wiki-links | |
| 79 | + """ | |
| 80 | + graph_path = Path(graph_path) | |
| 81 | + notes: List[dict] = [] | |
| 82 | + links: List[Tuple[str, str]] = [] | |
| 83 | + | |
| 84 | + md_files: List[Path] = [] | |
| 85 | + pages_dir = graph_path / "pages" | |
| 86 | + journals_dir = graph_path / "journals" | |
| 87 | + | |
| 88 | + if pages_dir.is_dir(): | |
| 89 | + md_files.extend(sorted(pages_dir.rglob("*.md"))) | |
| 90 | + if journals_dir.is_dir(): | |
| 91 | + md_files.extend(sorted(journals_dir.rglob("*.md"))) | |
| 92 | + | |
| 93 | + logger.info("Found %d markdown files in graph %s", len(md_files), graph_path) | |
| 94 | + | |
| 95 | + for md_file in md_files: | |
| 96 | + page_name = md_file.stem | |
| 97 | + try: | |
| 98 | + parsed = parse_page(md_file) | |
| 99 | + except Exception: | |
| 100 | + logger.warning("Failed to parse page %s", md_file) | |
| 101 | + continue | |
| 102 | + | |
| 103 | + notes.append( | |
| 104 | + { | |
| 105 | + "name": page_name, | |
| 106 | + "tags": parsed["tags"], | |
| 107 | + "frontmatter": parsed["properties"], | |
| 108 | + "text": parsed["body"], | |
| 109 | + } | |
| 110 | + ) | |
| 111 | + | |
| 112 | + for linked_page in parsed["links"]: | |
| 113 | + links.append((page_name, linked_page)) | |
| 114 | + | |
| 115 | + logger.info( | |
| 116 | + "Ingested %d notes with %d links from graph %s", | |
| 117 | + len(notes), | |
| 118 | + len(links), | |
| 119 | + graph_path, | |
| 120 | + ) | |
| 121 | + return {"notes": notes, "links": links} | |
| 122 | + | |
| 123 | + | |
| 124 | +class LogseqSource(BaseSource): | |
| 125 | + """Source connector for Logseq graphs.""" | |
| 126 | + | |
| 127 | + def __init__(self, graph_path: str) -> None: | |
| 128 | + self.graph_path = Path(graph_path) | |
| 129 | + | |
| 130 | + def authenticate(self) -> bool: | |
| 131 | + """Check that the graph path exists and has pages/ or journals/ dirs.""" | |
| 132 | + if not self.graph_path.is_dir(): | |
| 133 | + logger.error("Graph path does not exist: %s", self.graph_path) | |
| 134 | + return False | |
| 135 | + has_pages = (self.graph_path / "pages").is_dir() | |
| 136 | + has_journals = (self.graph_path / "journals").is_dir() | |
| 137 | + if not has_pages and not has_journals: | |
| 138 | + logger.error( | |
| 139 | + "No pages/ or journals/ directory found in graph: %s", | |
| 140 | + self.graph_path, | |
| 141 | + ) | |
| 142 | + return False | |
| 143 | + logger.info( | |
| 144 | + "Logseq graph authenticated: %s (pages=%s, journals=%s)", | |
| 145 | + self.graph_path, | |
| 146 | + has_pages, | |
| 147 | + has_journals, | |
| 148 | + ) | |
| 149 | + return True | |
| 150 | + | |
| 151 | + def list_videos( | |
| 152 | + self, | |
| 153 | + folder_id: Optional[str] = None, | |
| 154 | + folder_path: Optional[str] = None, | |
| 155 | + patterns: Optional[List[str]] = None, | |
| 156 | + ) -> List[SourceFile]: | |
| 157 | + """List .md files in pages/ and journals/ as SourceFile objects.""" | |
| 158 | + md_files: List[Path] = [] | |
| 159 | + | |
| 160 | + pages_dir = self.graph_path / "pages" | |
| 161 | + journals_dir = self.graph_path / "journals" | |
| 162 | + | |
| 163 | + if folder_path: | |
| 164 | + search_root = self.graph_path / folder_path | |
| 165 | + if search_root.is_dir(): | |
| 166 | + md_files.extend(sorted(search_root.rglob("*.md"))) | |
| 167 | + else: | |
| 168 | + if pages_dir.is_dir(): | |
| 169 | + md_files.extend(sorted(pages_dir.rglob("*.md"))) | |
| 170 | + if journals_dir.is_dir(): | |
| 171 | + md_files.extend(sorted(journals_dir.rglob("*.md"))) | |
| 172 | + | |
| 173 | + results: List[SourceFile] = [] | |
| 174 | + for md_file in md_files: | |
| 175 | + relative = md_file.relative_to(self.graph_path) | |
| 176 | + stat = md_file.stat() | |
| 177 | + modified_dt = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc) | |
| 178 | + | |
| 179 | + results.append( | |
| 180 | + SourceFile( | |
| 181 | + name=md_file.name, | |
| 182 | + id=str(relative), | |
| 183 | + size_bytes=stat.st_size, | |
| 184 | + mime_type="text/markdown", | |
| 185 | + modified_at=modified_dt.isoformat(), | |
| 186 | + path=str(relative), | |
| 187 | + ) | |
| 188 | + ) | |
| 189 | + | |
| 190 | + logger.info("Listed %d files from graph %s", len(results), self.graph_path) | |
| 191 | + return results | |
| 192 | + | |
| 193 | + def download(self, file: SourceFile, destination: Path) -> Path: | |
| 194 | + """Copy a graph file to the destination path.""" | |
| 195 | + source = self.graph_path / file.id | |
| 196 | + destination = Path(destination) | |
| 197 | + destination.parent.mkdir(parents=True, exist_ok=True) | |
| 198 | + shutil.copy2(source, destination) | |
| 199 | + logger.info("Copied %s -> %s", source, destination) | |
| 200 | + return destination |
| --- a/video_processor/sources/logseq_source.py | |
| +++ b/video_processor/sources/logseq_source.py | |
| @@ -0,0 +1,200 @@ | |
| --- a/video_processor/sources/logseq_source.py | |
| +++ b/video_processor/sources/logseq_source.py | |
| @@ -0,0 +1,200 @@ | |
| 1 | """Logseq graph source connector for ingesting markdown pages and journals.""" |
| 2 | |
| 3 | import logging |
| 4 | import re |
| 5 | import shutil |
| 6 | from datetime import datetime, timezone |
| 7 | from pathlib import Path |
| 8 | from typing import List, Optional, Tuple |
| 9 | |
| 10 | from video_processor.sources.base import BaseSource, SourceFile |
| 11 | |
| 12 | logger = logging.getLogger(__name__) |
| 13 | |
| 14 | |
def parse_page(path: Path) -> dict:
    """Parse a single Logseq markdown page into structured pieces.

    Returns a dict with:
    - properties: dict of page-level properties (key:: value lines at top)
    - links: list of linked page names from [[wiki-links]]
    - tags: list of tags from #tag and #[[tag]] occurrences
    - block_refs: list of block reference IDs from ((block-id))
    - body: full text content after the property header
    """
    raw = path.read_text(encoding="utf-8")
    raw_lines = raw.split("\n")

    # Consume the leading run of "key:: value" property lines; the first
    # non-matching line starts the body.
    properties: dict = {}
    idx = 0
    while idx < len(raw_lines):
        match = re.match(r"^([A-Za-z][A-Za-z0-9_-]*)::\ ?(.*)", raw_lines[idx])
        if not match:
            break
        properties[match.group(1)] = match.group(2).strip()
        idx += 1

    body = "\n".join(raw_lines[idx:])

    # [[wiki-links]] in the body plus any embedded in property values.
    wiki_re = re.compile(r"\[\[([^\]]+)\]\]")
    links = wiki_re.findall(body)
    for prop_value in properties.values():
        links.extend(wiki_re.findall(str(prop_value)))

    # Tags: collect #[[multi word]] first, then plain #tag on text with the
    # bracketed form stripped so nothing is counted twice.
    bracketed_re = re.compile(r"#\[\[([^\]]+)\]\]")
    tags = bracketed_re.findall(raw)
    plain_tag_re = re.compile(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)")
    tags.extend(plain_tag_re.findall(bracketed_re.sub("", raw)))

    # ((block-id)) references anywhere in the file.
    block_refs = re.compile(r"\(\(([a-f0-9-]+)\)\)").findall(raw)

    return {
        "properties": properties,
        "links": links,
        "tags": tags,
        "block_refs": block_refs,
        "body": body,
    }
| 71 | |
| 72 | |
def ingest_graph(graph_path: Path) -> dict:
    """Ingest an entire Logseq graph and return structured data.

    Parameters
    ----------
    graph_path : Path
        Root of the Logseq graph (directory containing pages/ and journals/).

    Returns a dict with:
    - notes: list of dicts with name, tags, frontmatter (properties), text
    - links: list of (source, target) tuples from wiki-links
    """
    graph_path = Path(graph_path)
    notes: List[dict] = []
    links: List[Tuple[str, str]] = []

    # pages/ holds named pages, journals/ holds dated entries; both are plain
    # markdown. Sorted for deterministic ordering across runs.
    md_files: List[Path] = []
    for subdir in ("pages", "journals"):
        candidate = graph_path / subdir
        if candidate.is_dir():
            md_files.extend(sorted(candidate.rglob("*.md")))

    logger.info("Found %d markdown files in graph %s", len(md_files), graph_path)

    for md_file in md_files:
        page_name = md_file.stem
        try:
            parsed = parse_page(md_file)
        except Exception as exc:
            # Fix: include the actual error so parse failures are diagnosable
            # (previously the exception detail was silently discarded).
            logger.warning("Failed to parse page %s: %s", md_file, exc)
            continue

        notes.append(
            {
                "name": page_name,
                "tags": parsed["tags"],
                "frontmatter": parsed["properties"],
                "text": parsed["body"],
            }
        )

        # One (source, target) edge per outgoing wiki-link.
        links.extend((page_name, target) for target in parsed["links"])

    logger.info(
        "Ingested %d notes with %d links from graph %s",
        len(notes),
        len(links),
        graph_path,
    )
    return {"notes": notes, "links": links}
| 122 | |
| 123 | |
class LogseqSource(BaseSource):
    """Source connector for Logseq graphs."""

    def __init__(self, graph_path: str) -> None:
        # Root of the graph (directory containing pages/ and journals/).
        self.graph_path = Path(graph_path)

    def authenticate(self) -> bool:
        """Check that the graph path exists and has pages/ or journals/ dirs."""
        if not self.graph_path.is_dir():
            logger.error("Graph path does not exist: %s", self.graph_path)
            return False
        has_pages = (self.graph_path / "pages").is_dir()
        has_journals = (self.graph_path / "journals").is_dir()
        if not (has_pages or has_journals):
            logger.error(
                "No pages/ or journals/ directory found in graph: %s",
                self.graph_path,
            )
            return False
        logger.info(
            "Logseq graph authenticated: %s (pages=%s, journals=%s)",
            self.graph_path,
            has_pages,
            has_journals,
        )
        return True

    def _collect_md_files(self, folder_path: Optional[str]) -> List[Path]:
        """Gather candidate .md files, honoring an optional subfolder override."""
        if folder_path:
            root = self.graph_path / folder_path
            return sorted(root.rglob("*.md")) if root.is_dir() else []
        found: List[Path] = []
        for subdir in ("pages", "journals"):
            candidate = self.graph_path / subdir
            if candidate.is_dir():
                found.extend(sorted(candidate.rglob("*.md")))
        return found

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List .md files in pages/ and journals/ as SourceFile objects."""
        results: List[SourceFile] = []
        for md_file in self._collect_md_files(folder_path):
            # IDs and paths are graph-relative so download() can resolve them.
            relative = md_file.relative_to(self.graph_path)
            info = md_file.stat()
            modified = datetime.fromtimestamp(info.st_mtime, tz=timezone.utc)
            results.append(
                SourceFile(
                    name=md_file.name,
                    id=str(relative),
                    size_bytes=info.st_size,
                    mime_type="text/markdown",
                    modified_at=modified.isoformat(),
                    path=str(relative),
                )
            )
        logger.info("Listed %d files from graph %s", len(results), self.graph_path)
        return results

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Copy a graph file to the destination path."""
        source = self.graph_path / file.id
        target = Path(destination)
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source, target)
        logger.info("Copied %s -> %s", source, target)
        return target
| --- a/video_processor/sources/m365_source.py | ||
| +++ b/video_processor/sources/m365_source.py | ||
| @@ -0,0 +1,310 @@ | ||
| 1 | +"""Microsoft 365 source connector using the m365 CLI (cli-microsoft365). | |
| 2 | + | |
| 3 | +Fetches documents from SharePoint and OneDrive via the `m365` CLI tool. | |
| 4 | +Outputs plain text suitable for KG ingestion. | |
| 5 | + | |
| 6 | +Requires: npm install -g @pnp/cli-microsoft365 | |
| 7 | +Auth: m365 login (interactive) | |
| 8 | +Docs: https://pnp.github.io/cli-microsoft365/ | |
| 9 | +""" | |
| 10 | + | |
| 11 | +import json | |
| 12 | +import logging | |
| 13 | +import shutil | |
| 14 | +import subprocess | |
| 15 | +import tempfile | |
| 16 | +from pathlib import Path | |
| 17 | +from typing import Any, Dict, List, Optional | |
| 18 | + | |
| 19 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 20 | + | |
| 21 | +logger = logging.getLogger(__name__) | |
| 22 | + | |
# Filename extensions (not MIME types) of documents we can extract text from.
_DOC_EXTENSIONS = {
    ".docx",
    ".doc",
    ".xlsx",
    ".xls",
    ".pptx",
    ".ppt",
    ".pdf",
    ".txt",
    ".md",
    ".csv",
    ".html",
    ".htm",
}
| 38 | + | |
| 39 | + | |
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Run an m365 CLI command and return parsed JSON output."""
    completed = subprocess.run(
        ["m365", *args, "--output", "json"],
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if completed.returncode != 0:
        raise RuntimeError(f"m365 {' '.join(args)} failed: {completed.stderr.strip()}")
    # Some commands emit plain text even with --output json; fall back to the
    # raw stdout in that case.
    try:
        return json.loads(completed.stdout)
    except json.JSONDecodeError:
        return completed.stdout.strip()
| 50 | + | |
| 51 | + | |
class M365Source(BaseSource):
    """
    Fetch documents from SharePoint Online and OneDrive via the m365 CLI.

    Usage:
        # SharePoint site
        source = M365Source(
            web_url="https://contoso.sharepoint.com/sites/project-x",
            folder_url="/sites/project-x/Shared Documents"
        )

        # OneDrive
        source = M365Source(
            web_url="https://contoso-my.sharepoint.com/personal/user_contoso_com",
            folder_url="/personal/user_contoso_com/Documents"
        )

        files = source.list_videos()
        source.download_all(files, Path("./docs"))
    """

    def __init__(
        self,
        web_url: str,
        folder_url: Optional[str] = None,
        file_ids: Optional[List[str]] = None,
        recursive: bool = False,
    ):
        """
        Parameters
        ----------
        web_url : str
            SharePoint site / OneDrive personal-site URL passed to every
            m365 command.
        folder_url : str, optional
            Server-relative folder URL to list documents from.
        file_ids : list of str, optional
            Specific file IDs to fetch; when set, folder listing is skipped.
        recursive : bool
            If True, folder listing descends into subfolders.
        """
        self.web_url = web_url
        self.folder_url = folder_url
        self.file_ids = file_ids or []
        self.recursive = recursive

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in."""
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
            # m365 status returns connection info when logged in
            if isinstance(result, dict) and result.get("connectedAs"):
                return True
            if isinstance(result, str) and "Logged in" in result:
                return True
            logger.error("m365 not logged in. Run: m365 login")
            return False
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List documents in SharePoint/OneDrive. Returns docs, not just videos.

        If ``file_ids`` was provided at construction, those files are fetched
        by ID and the folder listing is skipped entirely.
        """
        files: List[SourceFile] = []

        # Fetch specific files by ID
        if self.file_ids:
            for fid in self.file_ids:
                try:
                    result = _run_m365(
                        [
                            "spo",
                            "file",
                            "get",
                            "--webUrl",
                            self.web_url,
                            "--id",
                            fid,
                        ]
                    )
                    files.append(_result_to_source_file(result))
                except RuntimeError as e:
                    logger.warning(f"Failed to get file {fid}: {e}")
            return files

        # List files in folder
        folder = folder_path or self.folder_url
        if not folder:
            logger.error("No folder URL specified. Use --folder-url or folder_path parameter.")
            return []

        try:
            # Fix: the listing command is `m365 spo file list`; the "spo"
            # command-group prefix was missing, so every folder listing
            # failed with an unknown-command error.
            args = [
                "spo",
                "file",
                "list",
                "--webUrl",
                self.web_url,
                "--folderUrl",
                folder,
            ]
            if self.recursive:
                args.append("--recursive")

            result = _run_m365(args, timeout=60)
        except RuntimeError as e:
            logger.error(f"Failed to list files: {e}")
            return []

        # Keep only document types we know how to extract text from.
        items = result if isinstance(result, list) else []
        for item in items:
            name = item.get("Name", item.get("name", ""))
            ext = Path(name).suffix.lower()
            if ext in _DOC_EXTENSIONS:
                files.append(_result_to_source_file(item))

        logger.info(f"Found {len(files)} document(s) in {folder}")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a file from SharePoint/OneDrive to ``destination``."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        args = [
            "spo",
            "file",
            "get",
            "--webUrl",
            self.web_url,
            "--asFile",
            "--path",
            str(destination),
        ]

        # Use URL if available in path field, otherwise use ID
        if file.path:
            args.extend(["--url", file.path])
        else:
            args.extend(["--id", file.id])

        _run_m365(args, timeout=120)
        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def download_as_text(self, file: SourceFile) -> str:
        """Download a file and attempt to extract text content.

        Text-based formats are fetched straight through the CLI as a string;
        binary formats are downloaded to a temp file and text-extracted.
        """
        # For text-based formats, get as string directly
        text_exts = {".txt", ".md", ".csv", ".html", ".htm"}
        ext = Path(file.name).suffix.lower()

        if ext in text_exts:
            try:
                args = [
                    "spo",
                    "file",
                    "get",
                    "--webUrl",
                    self.web_url,
                    "--asString",
                ]
                if file.path:
                    args.extend(["--url", file.path])
                else:
                    args.extend(["--id", file.id])

                result = _run_m365(args, timeout=60)
                return result if isinstance(result, str) else json.dumps(result)
            except RuntimeError:
                # Fall through to the download-and-extract path below.
                pass

        # For binary formats, download to temp and extract
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp_path = Path(tmp.name)

        try:
            self.download(file, tmp_path)
            return _extract_text(tmp_path)
        finally:
            # Always remove the temp file, even if extraction fails.
            tmp_path.unlink(missing_ok=True)

    def fetch_all_text(self) -> Dict[str, str]:
        """List all docs and return {filename: text_content} dict.

        Per-file failures are logged and recorded as "[Error: ...]" entries
        rather than aborting the whole batch.
        """
        files = self.list_videos()
        results = {}
        for f in files:
            try:
                results[f.name] = self.download_as_text(f)
            except Exception as e:
                logger.warning(f"Failed to fetch {f.name}: {e}")
                results[f.name] = f"[Error: {e}]"
        return results

    def collate(self, separator: str = "\n\n---\n\n") -> str:
        """Fetch all docs and collate into a single text blob for ingestion."""
        docs = self.fetch_all_text()
        parts = []
        for name, content in docs.items():
            parts.append(f"# {name}\n\n{content}")
        return separator.join(parts)
| 246 | + | |
| 247 | + | |
| 248 | +def _result_to_source_file(item: dict) -> SourceFile: | |
| 249 | + """Convert an m365 file result to SourceFile.""" | |
| 250 | + name = item.get("Name", item.get("name", "Untitled")) | |
| 251 | + file_id = item.get("UniqueId", item.get("uniqueId", item.get("id", ""))) | |
| 252 | + size = item.get("Length", item.get("length", item.get("size"))) | |
| 253 | + path = item.get("ServerRelativeUrl", item.get("serverRelativeUrl")) | |
| 254 | + modified = item.get("TimeLastModified", item.get("lastModifiedDateTime")) | |
| 255 | + | |
| 256 | + return SourceFile( | |
| 257 | + name=name, | |
| 258 | + id=str(file_id), | |
| 259 | + size_bytes=int(size) if size else None, | |
| 260 | + mime_type=None, | |
| 261 | + modified_at=modified, | |
| 262 | + path=path, | |
| 263 | + ) | |
| 264 | + | |
| 265 | + | |
| 266 | +def _extract_text(path: Path) -> str: | |
| 267 | + """Best-effort text extraction from a downloaded file.""" | |
| 268 | + ext = path.suffix.lower() | |
| 269 | + | |
| 270 | + if ext in {".txt", ".md", ".csv"}: | |
| 271 | + return path.read_text(encoding="utf-8", errors="replace") | |
| 272 | + | |
| 273 | + if ext in {".html", ".htm"}: | |
| 274 | + from video_processor.sources.web_source import _strip_html_tags | |
| 275 | + | |
| 276 | + return _strip_html_tags(path.read_text(encoding="utf-8", errors="replace")) | |
| 277 | + | |
| 278 | + if ext == ".pdf": | |
| 279 | + try: | |
| 280 | + import fitz # pymupdf | |
| 281 | + | |
| 282 | + doc = fitz.open(str(path)) | |
| 283 | + return "\n\n".join(page.get_text() for page in doc) | |
| 284 | + except ImportError: | |
| 285 | + return f"[PDF file: {path.name} — install pymupdf to extract text]" | |
| 286 | + | |
| 287 | + if ext in {".docx", ".pptx", ".xlsx"}: | |
| 288 | + # Try python-docx / openpyxl / python-pptx if available | |
| 289 | + try: | |
| 290 | + if ext == ".docx": | |
| 291 | + from docx import Document | |
| 292 | + | |
| 293 | + doc = Document(str(path)) | |
| 294 | + return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip()) | |
| 295 | + elif ext == ".xlsx": | |
| 296 | + import openpyxl | |
| 297 | + | |
| 298 | + wb = openpyxl.load_workbook(str(path), read_only=True) | |
| 299 | + rows = [] | |
| 300 | + for sheet in wb.sheetnames: | |
| 301 | + ws = wb[sheet] | |
| 302 | + for row in ws.iter_rows(values_only=True): | |
| 303 | + cells = [str(c) if c is not None else "" for c in row] | |
| 304 | + if any(cells): | |
| 305 | + rows.append("\t".join(cells)) | |
| 306 | + return "\n".join(rows) | |
| 307 | + except ImportError: | |
| 308 | + return f"[{ext} file: {path.name} — install python-docx/openpyxl to extract text]" | |
| 309 | + | |
| 310 | + return f"[Unsupported format: {path.name}]" |
| --- a/video_processor/sources/m365_source.py | |
| +++ b/video_processor/sources/m365_source.py | |
| @@ -0,0 +1,310 @@ | |
| --- a/video_processor/sources/m365_source.py | |
| +++ b/video_processor/sources/m365_source.py | |
| @@ -0,0 +1,310 @@ | |
| 1 | """Microsoft 365 source connector using the m365 CLI (cli-microsoft365). |
| 2 | |
| 3 | Fetches documents from SharePoint and OneDrive via the `m365` CLI tool. |
| 4 | Outputs plain text suitable for KG ingestion. |
| 5 | |
| 6 | Requires: npm install -g @pnp/cli-microsoft365 |
| 7 | Auth: m365 login (interactive) |
| 8 | Docs: https://pnp.github.io/cli-microsoft365/ |
| 9 | """ |
| 10 | |
| 11 | import json |
| 12 | import logging |
| 13 | import shutil |
| 14 | import subprocess |
| 15 | import tempfile |
| 16 | from pathlib import Path |
| 17 | from typing import Any, Dict, List, Optional |
| 18 | |
| 19 | from video_processor.sources.base import BaseSource, SourceFile |
| 20 | |
| 21 | logger = logging.getLogger(__name__) |
| 22 | |
# File extensions _extract_text knows how to turn into plain text
# (legacy Office formats are listed but fall back to a placeholder).
_DOC_EXTENSIONS = {
    ".doc", ".docx",
    ".xls", ".xlsx",
    ".ppt", ".pptx",
    ".pdf",
    ".txt", ".md", ".csv",
    ".html", ".htm",
}
| 38 | |
| 39 | |
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Execute an m365 CLI command and return its JSON-decoded output.

    Appends ``--output json`` to every invocation. Falls back to the
    stripped raw stdout when the CLI emits something that is not JSON.

    Raises:
        RuntimeError: if the CLI exits with a non-zero status.
    """
    proc = subprocess.run(
        ["m365", *args, "--output", "json"],
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if proc.returncode:
        raise RuntimeError(f"m365 {' '.join(args)} failed: {proc.stderr.strip()}")
    try:
        return json.loads(proc.stdout)
    except json.JSONDecodeError:
        return proc.stdout.strip()
| 50 | |
| 51 | |
class M365Source(BaseSource):
    """
    Fetch documents from SharePoint Online and OneDrive via the m365 CLI.

    Usage:
        # SharePoint site
        source = M365Source(
            web_url="https://contoso.sharepoint.com/sites/project-x",
            folder_url="/sites/project-x/Shared Documents"
        )

        # OneDrive
        source = M365Source(
            web_url="https://contoso-my.sharepoint.com/personal/user_contoso_com",
            folder_url="/personal/user_contoso_com/Documents"
        )

        files = source.list_videos()
        source.download_all(files, Path("./docs"))
    """

    def __init__(
        self,
        web_url: str,
        folder_url: Optional[str] = None,
        file_ids: Optional[List[str]] = None,
        recursive: bool = False,
    ):
        """
        Args:
            web_url: SharePoint site (or OneDrive personal site) URL.
            folder_url: Server-relative folder URL to list documents from.
            file_ids: Explicit file IDs to fetch instead of listing a folder.
            recursive: Whether folder listing descends into subfolders.
        """
        self.web_url = web_url
        self.folder_url = folder_url
        self.file_ids = file_ids or []
        self.recursive = recursive

    def authenticate(self) -> bool:
        """Check if the m365 CLI is installed and logged in.

        Returns True only when the CLI is on PATH and `m365 status`
        reports an active connection.
        """
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
            # m365 status returns connection info (connectedAs) when logged in
            if isinstance(result, dict) and result.get("connectedAs"):
                return True
            if isinstance(result, str) and "Logged in" in result:
                return True
            logger.error("m365 not logged in. Run: m365 login")
            return False
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List documents in SharePoint/OneDrive. Returns docs, not just videos.

        If ``file_ids`` were supplied at construction, fetches exactly those
        files; otherwise lists the folder given by ``folder_path`` (or the
        instance's ``folder_url``), keeping only known document extensions.
        """
        files: List[SourceFile] = []

        # Fetch specific files by ID
        if self.file_ids:
            for fid in self.file_ids:
                try:
                    result = _run_m365(
                        [
                            "spo",
                            "file",
                            "get",
                            "--webUrl",
                            self.web_url,
                            "--id",
                            fid,
                        ]
                    )
                    files.append(_result_to_source_file(result))
                except RuntimeError as e:
                    logger.warning(f"Failed to get file {fid}: {e}")
            return files

        # List files in folder
        folder = folder_path or self.folder_url
        if not folder:
            logger.error("No folder URL specified. Use --folder-url or folder_path parameter.")
            return []

        try:
            # BUGFIX: the listing command is `m365 spo file list`; the "spo"
            # prefix was missing, so folder listings always failed while the
            # by-ID path (spo file get) worked.
            args = [
                "spo",
                "file",
                "list",
                "--webUrl",
                self.web_url,
                "--folderUrl",
                folder,
            ]
            if self.recursive:
                args.append("--recursive")

            result = _run_m365(args, timeout=60)
        except RuntimeError as e:
            logger.error(f"Failed to list files: {e}")
            return []

        items = result if isinstance(result, list) else []
        for item in items:
            name = item.get("Name", item.get("name", ""))
            ext = Path(name).suffix.lower()
            if ext in _DOC_EXTENSIONS:
                files.append(_result_to_source_file(item))

        logger.info(f"Found {len(files)} document(s) in {folder}")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a file from SharePoint/OneDrive to ``destination``.

        Prefers the server-relative URL (``file.path``) when present,
        otherwise addresses the file by its unique ID.

        Raises:
            RuntimeError: if the m365 CLI call fails.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        args = [
            "spo",
            "file",
            "get",
            "--webUrl",
            self.web_url,
            "--asFile",
            "--path",
            str(destination),
        ]

        # Use URL if available in path field, otherwise use ID
        if file.path:
            args.extend(["--url", file.path])
        else:
            args.extend(["--id", file.id])

        _run_m365(args, timeout=120)
        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def download_as_text(self, file: SourceFile) -> str:
        """Download a file and attempt to extract its text content.

        Text-based formats are fetched directly as a string (--asString);
        binary formats are downloaded to a temp file and run through
        _extract_text. The temp file is always cleaned up.
        """
        # For text-based formats, get as string directly
        text_exts = {".txt", ".md", ".csv", ".html", ".htm"}
        ext = Path(file.name).suffix.lower()

        if ext in text_exts:
            try:
                args = [
                    "spo",
                    "file",
                    "get",
                    "--webUrl",
                    self.web_url,
                    "--asString",
                ]
                if file.path:
                    args.extend(["--url", file.path])
                else:
                    args.extend(["--id", file.id])

                result = _run_m365(args, timeout=60)
                return result if isinstance(result, str) else json.dumps(result)
            except RuntimeError:
                # Best-effort: fall through to the binary download path below.
                pass

        # For binary formats, download to temp and extract
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp_path = Path(tmp.name)

        try:
            self.download(file, tmp_path)
            return _extract_text(tmp_path)
        finally:
            tmp_path.unlink(missing_ok=True)

    def fetch_all_text(self) -> Dict[str, str]:
        """List all docs and return a ``{filename: text_content}`` dict.

        Per-file failures are logged and recorded as "[Error: ...]" values
        rather than aborting the whole fetch.
        """
        files = self.list_videos()
        results = {}
        for f in files:
            try:
                results[f.name] = self.download_as_text(f)
            except Exception as e:
                logger.warning(f"Failed to fetch {f.name}: {e}")
                results[f.name] = f"[Error: {e}]"
        return results

    def collate(self, separator: str = "\n\n---\n\n") -> str:
        """Fetch all docs and collate into a single text blob for ingestion.

        Each document is prefixed with a ``# <filename>`` heading.
        """
        docs = self.fetch_all_text()
        parts = []
        for name, content in docs.items():
            parts.append(f"# {name}\n\n{content}")
        return separator.join(parts)
| 246 | |
| 247 | |
def _result_to_source_file(item: dict) -> SourceFile:
    """Map a raw m365 CLI file record onto a SourceFile.

    Accepts both PascalCase keys (SPO REST responses) and camelCase
    keys (Graph-style responses), preferring the PascalCase spelling.
    """

    def first_of(*keys: str, default=None):
        # Return the value of the first key present in `item`.
        for key in keys:
            if key in item:
                return item[key]
        return default

    raw_size = first_of("Length", "length", "size")
    return SourceFile(
        name=first_of("Name", "name", default="Untitled"),
        id=str(first_of("UniqueId", "uniqueId", "id", default="")),
        size_bytes=int(raw_size) if raw_size else None,
        mime_type=None,
        modified_at=first_of("TimeLastModified", "lastModifiedDateTime"),
        path=first_of("ServerRelativeUrl", "serverRelativeUrl"),
    )
| 264 | |
| 265 | |
def _extract_text(path: Path) -> str:
    """Best-effort text extraction from a downloaded file.

    Plain-text formats are read directly; HTML is tag-stripped; PDF and
    Office formats use optional extractors (pymupdf, python-docx,
    openpyxl, python-pptx). When an extractor is not installed, a
    bracketed placeholder is returned instead of raising.
    """
    ext = path.suffix.lower()

    if ext in {".txt", ".md", ".csv"}:
        return path.read_text(encoding="utf-8", errors="replace")

    if ext in {".html", ".htm"}:
        from video_processor.sources.web_source import _strip_html_tags

        return _strip_html_tags(path.read_text(encoding="utf-8", errors="replace"))

    if ext == ".pdf":
        try:
            import fitz  # pymupdf

            doc = fitz.open(str(path))
            return "\n\n".join(page.get_text() for page in doc)
        except ImportError:
            return f"[PDF file: {path.name} — install pymupdf to extract text]"

    if ext in {".docx", ".pptx", ".xlsx"}:
        # Try python-docx / openpyxl / python-pptx if available
        try:
            if ext == ".docx":
                from docx import Document

                doc = Document(str(path))
                return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip())
            elif ext == ".xlsx":
                import openpyxl

                wb = openpyxl.load_workbook(str(path), read_only=True)
                rows = []
                for sheet in wb.sheetnames:
                    ws = wb[sheet]
                    for row in ws.iter_rows(values_only=True):
                        cells = [str(c) if c is not None else "" for c in row]
                        if any(cells):
                            rows.append("\t".join(cells))
                return "\n".join(rows)
            else:
                # BUGFIX: .pptx matched the guard above but had no branch,
                # so it silently fell through to "Unsupported format".
                from pptx import Presentation

                prs = Presentation(str(path))
                chunks = []
                for slide in prs.slides:
                    for shape in slide.shapes:
                        # Only text-bearing shapes expose a `text` attribute.
                        text = getattr(shape, "text", "")
                        if text and text.strip():
                            chunks.append(text)
                return "\n\n".join(chunks)
        except ImportError:
            return (
                f"[{ext} file: {path.name} — install "
                "python-docx/openpyxl/python-pptx to extract text]"
            )

    return f"[Unsupported format: {path.name}]"
| --- a/video_processor/sources/meet_recording_source.py | ||
| +++ b/video_processor/sources/meet_recording_source.py | ||
| @@ -0,0 +1,280 @@ | ||
| 1 | +"""Google Meet recording source using the gws CLI (googleworkspace/cli). | |
| 2 | + | |
| 3 | +Fetches Meet recordings and companion transcripts from Google Drive | |
| 4 | +via the `gws` CLI tool. | |
| 5 | + | |
| 6 | +Requires: npm install -g @googleworkspace/cli | |
| 7 | +Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless) | |
| 8 | +""" | |
| 9 | + | |
| 10 | +import json | |
| 11 | +import logging | |
| 12 | +import re | |
| 13 | +import shutil | |
| 14 | +import subprocess | |
| 15 | +from pathlib import Path | |
| 16 | +from typing import List, Optional | |
| 17 | + | |
| 18 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 19 | +from video_processor.sources.gws_source import _run_gws | |
| 20 | + | |
| 21 | +logger = logging.getLogger(__name__) | |
| 22 | + | |
| 23 | + | |
class MeetRecordingSource(BaseSource):
    """
    Fetch Google Meet recordings and transcripts from Google Drive via the gws CLI.

    Meet stores recordings as MP4 files in Drive (typically in a "Meet Recordings"
    folder) and auto-generated transcripts as Google Docs.

    Usage:
        source = MeetRecordingSource()
        source.authenticate()
        recordings = source.list_videos()
        source.download_all(recordings, Path("./recordings"))

        # Fetch transcript for a specific recording
        transcript = source.fetch_transcript("Meet Recording 2026-03-07")
    """

    def __init__(self, drive_folder_id: Optional[str] = None):
        # Optional Drive folder ID; when set, all searches are restricted
        # to that folder.
        self.drive_folder_id = drive_folder_id

    def authenticate(self) -> bool:
        """Check if the gws CLI is installed and authenticated."""
        if not shutil.which("gws"):
            logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
            return False
        try:
            _run_gws(["auth", "status"], timeout=10)
            return True
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("gws not authenticated. Run: gws auth login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Google Meet recordings in Drive.

        Searches for MP4 files with 'Meet Recording' in the name. If a
        drive_folder_id is set, restricts search to that folder.
        Also discovers companion transcript docs for each recording
        (logged for visibility; transcripts are fetched via fetch_transcript).
        """
        target_folder = folder_id or self.drive_folder_id
        files: List[SourceFile] = []

        # Build the Drive search query for Meet recordings
        q_parts = [
            "mimeType='video/mp4'",
            "name contains 'Meet Recording'",
            "trashed=false",
        ]
        if target_folder:
            q_parts.append(f"'{target_folder}' in parents")

        params = {
            "q": " and ".join(q_parts),
            "fields": "files(id,name,mimeType,size,modifiedTime)",
            "pageSize": 50,
            "orderBy": "modifiedTime desc",
        }

        try:
            result = _run_gws(
                [
                    "drive",
                    "files",
                    "list",
                    "--params",
                    json.dumps(params),
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.error(f"Failed to list Meet recordings: {e}")
            return []

        recordings = result.get("files", [])
        for item in recordings:
            size = item.get("size")
            files.append(
                SourceFile(
                    name=item.get("name", "Meet Recording"),
                    id=item.get("id", ""),
                    size_bytes=int(size) if size else None,
                    mime_type=item.get("mimeType", "video/mp4"),
                    modified_at=item.get("modifiedTime"),
                )
            )

        # Also search for auto-generated transcript docs
        transcript_params = {
            "q": " and ".join(
                [
                    "mimeType='application/vnd.google-apps.document'",
                    "(name contains 'Transcript' or name contains 'Meeting notes')",
                    "trashed=false",
                ]
                + ([f"'{target_folder}' in parents"] if target_folder else [])
            ),
            "fields": "files(id,name,mimeType,modifiedTime)",
            "pageSize": 50,
            "orderBy": "modifiedTime desc",
        }

        try:
            transcript_result = _run_gws(
                [
                    "drive",
                    "files",
                    "list",
                    "--params",
                    json.dumps(transcript_params),
                ],
                timeout=60,
            )
            transcript_files = transcript_result.get("files", [])
            logger.info(
                f"Found {len(recordings)} recording(s) and "
                f"{len(transcript_files)} transcript doc(s) in Drive"
            )
        except RuntimeError as e:
            # Transcript discovery is informational only; don't fail listing.
            logger.debug(f"Transcript search failed: {e}")

        if not files:
            logger.warning("No Google Meet recordings found in Drive")

        logger.info(f"Found {len(files)} Meet recording(s)")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a Meet recording from Drive to ``destination``.

        Raises:
            RuntimeError: if the gws CLI call fails.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        # For video files, download binary content via alt=media
        result = _run_gws(
            [
                "drive",
                "files",
                "get",
                "--params",
                json.dumps({"fileId": file.id, "alt": "media"}),
            ],
            timeout=300,
        )

        # Result may be raw content or a dict wrapper with a "raw" field.
        raw = result.get("raw", "") if isinstance(result, dict) else result
        # BUGFIX: MP4 content is binary — write_text() raises TypeError on
        # bytes and cannot round-trip binary data through UTF-8, so write
        # bytes when bytes are received; keep the text path for str payloads.
        if isinstance(raw, (bytes, bytearray)):
            destination.write_bytes(raw)
        else:
            destination.write_text(str(raw), encoding="utf-8")
        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def fetch_transcript(self, recording_name: str) -> Optional[str]:
        """Fetch the companion transcript for a Meet recording.

        Google Meet creates transcript docs with names that typically match
        the recording date/time. This method searches for the matching
        Google Doc and extracts its text content.

        Returns None when no matching doc is found, the fetch fails, or
        the doc contains no extractable text.
        """
        transcript_id = self._find_matching_transcript(recording_name)
        if not transcript_id:
            logger.info(f"No matching transcript found for: {recording_name}")
            return None

        # Fetch the Google Doc content via the Docs API
        try:
            result = _run_gws(
                [
                    "docs",
                    "documents",
                    "get",
                    "--params",
                    json.dumps({"documentId": transcript_id}),
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.warning(f"Failed to fetch transcript doc {transcript_id}: {e}")
            return None

        # Extract text from the Docs API structural response
        # (body.content[].paragraph.elements[].textRun.content)
        body = result.get("body", {})
        text_parts: list[str] = []
        for element in body.get("content", []):
            paragraph = element.get("paragraph", {})
            for pe in paragraph.get("elements", []):
                text_run = pe.get("textRun", {})
                text = text_run.get("content", "")
                if text.strip():
                    text_parts.append(text)

        if not text_parts:
            logger.warning(f"Transcript doc {transcript_id} had no extractable text")
            return None

        return "".join(text_parts)

    def _find_matching_transcript(self, recording_name: str) -> Optional[str]:
        """Search Drive for a transcript doc that matches a recording name.

        Meet recordings are typically named like:
            "Meet Recording 2026-03-07T14:30:00"
        And transcripts are named like:
            "Meeting Transcript 2026-03-07" or "2026-03-07 - Transcript"

        This extracts the date portion and searches for matching transcript docs.
        Returns the file ID of the most recently modified match, or None.
        """
        # Extract a date string from the recording name (YYYY-MM-DD pattern)
        date_match = re.search(r"\d{4}-\d{2}-\d{2}", recording_name)
        date_str = date_match.group(0) if date_match else recording_name
        # BUGFIX: date_str may be the raw recording name; escape backslashes
        # and single quotes so it cannot break out of the single-quoted Drive
        # query term (Drive query syntax escapes ' as \').
        safe_date = date_str.replace("\\", "\\\\").replace("'", "\\'")

        # Search for transcript docs matching the date
        search_query = " and ".join(
            [
                "mimeType='application/vnd.google-apps.document'",
                f"name contains '{safe_date}'",
                "(name contains 'Transcript' or name contains 'transcript' "
                "or name contains 'Meeting notes')",
                "trashed=false",
            ]
        )
        if self.drive_folder_id:
            search_query += f" and '{self.drive_folder_id}' in parents"

        try:
            result = _run_gws(
                [
                    "drive",
                    "files",
                    "list",
                    "--params",
                    json.dumps(
                        {
                            "q": search_query,
                            "fields": "files(id,name,modifiedTime)",
                            "pageSize": 5,
                            "orderBy": "modifiedTime desc",
                        }
                    ),
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.debug(f"Transcript search failed for '{date_str}': {e}")
            return None

        files = result.get("files", [])
        if not files:
            logger.debug(f"No transcript docs found matching '{date_str}'")
            return None

        # Return the most recently modified match (orderBy=modifiedTime desc)
        best = files[0]
        logger.info(f"Matched transcript: {best.get('name')} for recording: {recording_name}")
        return best.get("id")
| --- a/video_processor/sources/meet_recording_source.py | |
| +++ b/video_processor/sources/meet_recording_source.py | |
| @@ -0,0 +1,280 @@ | |
| --- a/video_processor/sources/meet_recording_source.py | |
| +++ b/video_processor/sources/meet_recording_source.py | |
| @@ -0,0 +1,280 @@ | |
| 1 | """Google Meet recording source using the gws CLI (googleworkspace/cli). |
| 2 | |
| 3 | Fetches Meet recordings and companion transcripts from Google Drive |
| 4 | via the `gws` CLI tool. |
| 5 | |
| 6 | Requires: npm install -g @googleworkspace/cli |
| 7 | Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless) |
| 8 | """ |
| 9 | |
| 10 | import json |
| 11 | import logging |
| 12 | import re |
| 13 | import shutil |
| 14 | import subprocess |
| 15 | from pathlib import Path |
| 16 | from typing import List, Optional |
| 17 | |
| 18 | from video_processor.sources.base import BaseSource, SourceFile |
| 19 | from video_processor.sources.gws_source import _run_gws |
| 20 | |
| 21 | logger = logging.getLogger(__name__) |
| 22 | |
| 23 | |
| 24 | class MeetRecordingSource(BaseSource): |
| 25 | """ |
| 26 | Fetch Google Meet recordings and transcripts from Google Drive via the gws CLI. |
| 27 | |
| 28 | Meet stores recordings as MP4 files in Drive (typically in a "Meet Recordings" |
| 29 | folder) and auto-generated transcripts as Google Docs. |
| 30 | |
| 31 | Usage: |
| 32 | source = MeetRecordingSource() |
| 33 | source.authenticate() |
| 34 | recordings = source.list_videos() |
| 35 | source.download_all(recordings, Path("./recordings")) |
| 36 | |
| 37 | # Fetch transcript for a specific recording |
| 38 | transcript = source.fetch_transcript("Meet Recording 2026-03-07") |
| 39 | """ |
| 40 | |
| 41 | def __init__(self, drive_folder_id: Optional[str] = None): |
| 42 | self.drive_folder_id = drive_folder_id |
| 43 | |
| 44 | def authenticate(self) -> bool: |
| 45 | """Check if gws CLI is installed and authenticated.""" |
| 46 | if not shutil.which("gws"): |
| 47 | logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli") |
| 48 | return False |
| 49 | try: |
| 50 | _run_gws(["auth", "status"], timeout=10) |
| 51 | return True |
| 52 | except (RuntimeError, subprocess.TimeoutExpired): |
| 53 | logger.error("gws not authenticated. Run: gws auth login") |
| 54 | return False |
| 55 | |
| 56 | def list_videos( |
| 57 | self, |
| 58 | folder_id: Optional[str] = None, |
| 59 | folder_path: Optional[str] = None, |
| 60 | patterns: Optional[List[str]] = None, |
| 61 | ) -> List[SourceFile]: |
| 62 | """List Google Meet recordings in Drive. |
| 63 | |
| 64 | Searches for MP4 files with 'Meet Recording' in the name. If a |
| 65 | drive_folder_id is set, restricts search to that folder. |
| 66 | Also discovers companion transcript docs for each recording. |
| 67 | """ |
| 68 | target_folder = folder_id or self.drive_folder_id |
| 69 | files: List[SourceFile] = [] |
| 70 | |
| 71 | # Build the Drive search query for Meet recordings |
| 72 | q_parts = [ |
| 73 | "mimeType='video/mp4'", |
| 74 | "name contains 'Meet Recording'", |
| 75 | "trashed=false", |
| 76 | ] |
| 77 | if target_folder: |
| 78 | q_parts.append(f"'{target_folder}' in parents") |
| 79 | |
| 80 | params = { |
| 81 | "q": " and ".join(q_parts), |
| 82 | "fields": "files(id,name,mimeType,size,modifiedTime)", |
| 83 | "pageSize": 50, |
| 84 | "orderBy": "modifiedTime desc", |
| 85 | } |
| 86 | |
| 87 | try: |
| 88 | result = _run_gws( |
| 89 | [ |
| 90 | "drive", |
| 91 | "files", |
| 92 | "list", |
| 93 | "--params", |
| 94 | json.dumps(params), |
| 95 | ], |
| 96 | timeout=60, |
| 97 | ) |
| 98 | except RuntimeError as e: |
| 99 | logger.error(f"Failed to list Meet recordings: {e}") |
| 100 | return [] |
| 101 | |
| 102 | recordings = result.get("files", []) |
| 103 | for item in recordings: |
| 104 | size = item.get("size") |
| 105 | files.append( |
| 106 | SourceFile( |
| 107 | name=item.get("name", "Meet Recording"), |
| 108 | id=item.get("id", ""), |
| 109 | size_bytes=int(size) if size else None, |
| 110 | mime_type=item.get("mimeType", "video/mp4"), |
| 111 | modified_at=item.get("modifiedTime"), |
| 112 | ) |
| 113 | ) |
| 114 | |
| 115 | # Also search for auto-generated transcript docs |
| 116 | transcript_params = { |
| 117 | "q": " and ".join( |
| 118 | [ |
| 119 | "mimeType='application/vnd.google-apps.document'", |
| 120 | "(name contains 'Transcript' or name contains 'Meeting notes')", |
| 121 | "trashed=false", |
| 122 | ] |
| 123 | + ([f"'{target_folder}' in parents"] if target_folder else []) |
| 124 | ), |
| 125 | "fields": "files(id,name,mimeType,modifiedTime)", |
| 126 | "pageSize": 50, |
| 127 | "orderBy": "modifiedTime desc", |
| 128 | } |
| 129 | |
| 130 | try: |
| 131 | transcript_result = _run_gws( |
| 132 | [ |
| 133 | "drive", |
| 134 | "files", |
| 135 | "list", |
| 136 | "--params", |
| 137 | json.dumps(transcript_params), |
| 138 | ], |
| 139 | timeout=60, |
| 140 | ) |
| 141 | transcript_files = transcript_result.get("files", []) |
| 142 | logger.info( |
| 143 | f"Found {len(recordings)} recording(s) and " |
| 144 | f"{len(transcript_files)} transcript doc(s) in Drive" |
| 145 | ) |
| 146 | except RuntimeError as e: |
| 147 | logger.debug(f"Transcript search failed: {e}") |
| 148 | |
| 149 | if not files: |
| 150 | logger.warning("No Google Meet recordings found in Drive") |
| 151 | |
| 152 | logger.info(f"Found {len(files)} Meet recording(s)") |
| 153 | return files |
| 154 | |
| 155 | def download(self, file: SourceFile, destination: Path) -> Path: |
| 156 | """Download a Meet recording from Drive.""" |
| 157 | destination = Path(destination) |
| 158 | destination.parent.mkdir(parents=True, exist_ok=True) |
| 159 | |
| 160 | # For video files, download binary content via alt=media |
| 161 | result = _run_gws( |
| 162 | [ |
| 163 | "drive", |
| 164 | "files", |
| 165 | "get", |
| 166 | "--params", |
| 167 | json.dumps({"fileId": file.id, "alt": "media"}), |
| 168 | ], |
| 169 | timeout=300, |
| 170 | ) |
| 171 | |
| 172 | # Write the content — result may be raw binary or a dict wrapper |
| 173 | raw = result.get("raw", "") if isinstance(result, dict) else str(result) |
| 174 | destination.write_text(raw, encoding="utf-8") |
| 175 | logger.info(f"Downloaded {file.name} to {destination}") |
| 176 | return destination |
| 177 | |
| 178 | def fetch_transcript(self, recording_name: str) -> Optional[str]: |
| 179 | """Fetch the companion transcript for a Meet recording. |
| 180 | |
| 181 | Google Meet creates transcript docs with names that typically match |
| 182 | the recording date/time. This method searches for the matching |
| 183 | Google Doc and extracts its text content. |
| 184 | """ |
| 185 | transcript_id = self._find_matching_transcript(recording_name) |
| 186 | if not transcript_id: |
| 187 | logger.info(f"No matching transcript found for: {recording_name}") |
| 188 | return None |
| 189 | |
| 190 | # Fetch the Google Doc content via the Docs API |
| 191 | try: |
| 192 | result = _run_gws( |
| 193 | [ |
| 194 | "docs", |
| 195 | "documents", |
| 196 | "get", |
| 197 | "--params", |
| 198 | json.dumps({"documentId": transcript_id}), |
| 199 | ], |
| 200 | timeout=60, |
| 201 | ) |
| 202 | except RuntimeError as e: |
| 203 | logger.warning(f"Failed to fetch transcript doc {transcript_id}: {e}") |
| 204 | return None |
| 205 | |
| 206 | # Extract text from the Docs API structural response |
| 207 | body = result.get("body", {}) |
| 208 | text_parts: list[str] = [] |
| 209 | for element in body.get("content", []): |
| 210 | paragraph = element.get("paragraph", {}) |
| 211 | for pe in paragraph.get("elements", []): |
| 212 | text_run = pe.get("textRun", {}) |
| 213 | text = text_run.get("content", "") |
| 214 | if text.strip(): |
| 215 | text_parts.append(text) |
| 216 | |
| 217 | if not text_parts: |
| 218 | logger.warning(f"Transcript doc {transcript_id} had no extractable text") |
| 219 | return None |
| 220 | |
| 221 | return "".join(text_parts) |
| 222 | |
| 223 | def _find_matching_transcript(self, recording_name: str) -> Optional[str]: |
| 224 | """Search Drive for a transcript doc that matches a recording name. |
| 225 | |
| 226 | Meet recordings are typically named like: |
| 227 | "Meet Recording 2026-03-07T14:30:00" |
| 228 | And transcripts are named like: |
| 229 | "Meeting Transcript 2026-03-07" or "2026-03-07 - Transcript" |
| 230 | |
| 231 | This extracts the date portion and searches for matching transcript docs. |
| 232 | """ |
| 233 | # Extract a date string from the recording name (YYYY-MM-DD pattern) |
| 234 | date_match = re.search(r"\d{4}-\d{2}-\d{2}", recording_name) |
| 235 | date_str = date_match.group(0) if date_match else recording_name |
| 236 | |
| 237 | # Search for transcript docs matching the date |
| 238 | search_query = " and ".join( |
| 239 | [ |
| 240 | "mimeType='application/vnd.google-apps.document'", |
| 241 | f"name contains '{date_str}'", |
| 242 | "(name contains 'Transcript' or name contains 'transcript' " |
| 243 | "or name contains 'Meeting notes')", |
| 244 | "trashed=false", |
| 245 | ] |
| 246 | ) |
| 247 | if self.drive_folder_id: |
| 248 | search_query += f" and '{self.drive_folder_id}' in parents" |
| 249 | |
| 250 | try: |
| 251 | result = _run_gws( |
| 252 | [ |
| 253 | "drive", |
| 254 | "files", |
| 255 | "list", |
| 256 | "--params", |
| 257 | json.dumps( |
| 258 | { |
| 259 | "q": search_query, |
| 260 | "fields": "files(id,name,modifiedTime)", |
| 261 | "pageSize": 5, |
| 262 | "orderBy": "modifiedTime desc", |
| 263 | } |
| 264 | ), |
| 265 | ], |
| 266 | timeout=60, |
| 267 | ) |
| 268 | except RuntimeError as e: |
| 269 | logger.debug(f"Transcript search failed for '{date_str}': {e}") |
| 270 | return None |
| 271 | |
| 272 | files = result.get("files", []) |
| 273 | if not files: |
| 274 | logger.debug(f"No transcript docs found matching '{date_str}'") |
| 275 | return None |
| 276 | |
| 277 | # Return the most recently modified match |
| 278 | best = files[0] |
| 279 | logger.info(f"Matched transcript: {best.get('name')} for recording: {recording_name}") |
| 280 | return best.get("id") |
| --- a/video_processor/sources/notion_source.py | ||
| +++ b/video_processor/sources/notion_source.py | ||
| @@ -0,0 +1,380 @@ | ||
| 1 | +"""Notion API source connector for fetching pages and databases.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import os | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import Dict, List, Optional | |
| 7 | + | |
| 8 | +import requests | |
| 9 | + | |
| 10 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 11 | + | |
| 12 | +logger = logging.getLogger(__name__) | |
| 13 | + | |
| 14 | +NOTION_VERSION = "2022-06-28" | |
| 15 | +NOTION_BASE_URL = "https://api.notion.com/v1" | |
| 16 | + | |
| 17 | + | |
| 18 | +class NotionSource(BaseSource): | |
| 19 | + """ | |
| 20 | + Fetch pages and databases from Notion via the public API. | |
| 21 | + | |
| 22 | + Requires a Notion integration token (internal integration). | |
| 23 | + Set NOTION_API_KEY env var or pass token directly. | |
| 24 | + | |
| 25 | + Requires: pip install requests | |
| 26 | + """ | |
| 27 | + | |
| 28 | + def __init__( | |
| 29 | + self, | |
| 30 | + token: Optional[str] = None, | |
| 31 | + database_id: Optional[str] = None, | |
| 32 | + page_ids: Optional[List[str]] = None, | |
| 33 | + ): | |
| 34 | + self.token = token or os.environ.get("NOTION_API_KEY", "") | |
| 35 | + self.database_id = database_id | |
| 36 | + self.page_ids = page_ids or [] | |
| 37 | + | |
| 38 | + def _headers(self) -> Dict[str, str]: | |
| 39 | + return { | |
| 40 | + "Authorization": f"Bearer {self.token}", | |
| 41 | + "Notion-Version": NOTION_VERSION, | |
| 42 | + "Content-Type": "application/json", | |
| 43 | + } | |
| 44 | + | |
| 45 | + def authenticate(self) -> bool: | |
| 46 | + """Check token is set and make a test call to the Notion API.""" | |
| 47 | + if not self.token: | |
| 48 | + logger.error("Notion token not set. Provide token or set NOTION_API_KEY.") | |
| 49 | + return False | |
| 50 | + try: | |
| 51 | + resp = requests.get( | |
| 52 | + f"{NOTION_BASE_URL}/users/me", | |
| 53 | + headers=self._headers(), | |
| 54 | + timeout=15, | |
| 55 | + ) | |
| 56 | + resp.raise_for_status() | |
| 57 | + user = resp.json() | |
| 58 | + logger.info("Authenticated with Notion as %s", user.get("name", "unknown")) | |
| 59 | + return True | |
| 60 | + except requests.RequestException as exc: | |
| 61 | + logger.error("Notion authentication failed: %s", exc) | |
| 62 | + return False | |
| 63 | + | |
| 64 | + def list_videos( | |
| 65 | + self, | |
| 66 | + folder_id: Optional[str] = None, | |
| 67 | + folder_path: Optional[str] = None, | |
| 68 | + patterns: Optional[List[str]] = None, | |
| 69 | + ) -> List[SourceFile]: | |
| 70 | + """List Notion pages as SourceFiles. | |
| 71 | + | |
| 72 | + If database_id is set, query the database for pages. | |
| 73 | + If page_ids is set, fetch each page individually. | |
| 74 | + """ | |
| 75 | + files: List[SourceFile] = [] | |
| 76 | + | |
| 77 | + if self.database_id: | |
| 78 | + files.extend(self._list_from_database(self.database_id)) | |
| 79 | + | |
| 80 | + if self.page_ids: | |
| 81 | + files.extend(self._list_from_pages(self.page_ids)) | |
| 82 | + | |
| 83 | + if not files: | |
| 84 | + logger.warning("No pages found. Set database_id or page_ids.") | |
| 85 | + | |
| 86 | + return files | |
| 87 | + | |
| 88 | + def _list_from_database(self, database_id: str) -> List[SourceFile]: | |
| 89 | + """Query a Notion database and return SourceFiles for each row.""" | |
| 90 | + files: List[SourceFile] = [] | |
| 91 | + has_more = True | |
| 92 | + start_cursor: Optional[str] = None | |
| 93 | + | |
| 94 | + while has_more: | |
| 95 | + body: Dict = {} | |
| 96 | + if start_cursor: | |
| 97 | + body["start_cursor"] = start_cursor | |
| 98 | + | |
| 99 | + resp = requests.post( | |
| 100 | + f"{NOTION_BASE_URL}/databases/{database_id}/query", | |
| 101 | + headers=self._headers(), | |
| 102 | + json=body, | |
| 103 | + timeout=30, | |
| 104 | + ) | |
| 105 | + resp.raise_for_status() | |
| 106 | + data = resp.json() | |
| 107 | + | |
| 108 | + for page in data.get("results", []): | |
| 109 | + title = _extract_page_title(page) | |
| 110 | + files.append( | |
| 111 | + SourceFile( | |
| 112 | + name=title, | |
| 113 | + id=page["id"], | |
| 114 | + mime_type="text/markdown", | |
| 115 | + modified_at=page.get("last_edited_time"), | |
| 116 | + ) | |
| 117 | + ) | |
| 118 | + | |
| 119 | + has_more = data.get("has_more", False) | |
| 120 | + start_cursor = data.get("next_cursor") | |
| 121 | + | |
| 122 | + return files | |
| 123 | + | |
| 124 | + def _list_from_pages(self, page_ids: List[str]) -> List[SourceFile]: | |
| 125 | + """Fetch individual pages by ID and return SourceFiles.""" | |
| 126 | + files: List[SourceFile] = [] | |
| 127 | + for page_id in page_ids: | |
| 128 | + try: | |
| 129 | + resp = requests.get( | |
| 130 | + f"{NOTION_BASE_URL}/pages/{page_id}", | |
| 131 | + headers=self._headers(), | |
| 132 | + timeout=15, | |
| 133 | + ) | |
| 134 | + resp.raise_for_status() | |
| 135 | + page = resp.json() | |
| 136 | + title = _extract_page_title(page) | |
| 137 | + files.append( | |
| 138 | + SourceFile( | |
| 139 | + name=title, | |
| 140 | + id=page["id"], | |
| 141 | + mime_type="text/markdown", | |
| 142 | + modified_at=page.get("last_edited_time"), | |
| 143 | + ) | |
| 144 | + ) | |
| 145 | + except requests.RequestException as exc: | |
| 146 | + logger.error("Failed to fetch page %s: %s", page_id, exc) | |
| 147 | + return files | |
| 148 | + | |
| 149 | + def download(self, file: SourceFile, destination: Path) -> Path: | |
| 150 | + """Download page blocks as markdown text and save to destination.""" | |
| 151 | + destination = Path(destination) | |
| 152 | + destination.parent.mkdir(parents=True, exist_ok=True) | |
| 153 | + | |
| 154 | + blocks = self._fetch_all_blocks(file.id) | |
| 155 | + text = self._blocks_to_text(blocks) | |
| 156 | + | |
| 157 | + # Prepend title | |
| 158 | + content = f"# {file.name}\n\n{text}" | |
| 159 | + destination.write_text(content, encoding="utf-8") | |
| 160 | + logger.info("Saved Notion page to %s", destination) | |
| 161 | + return destination | |
| 162 | + | |
| 163 | + def _fetch_all_blocks(self, page_id: str) -> list: | |
| 164 | + """Fetch all child blocks for a page, handling pagination.""" | |
| 165 | + blocks: list = [] | |
| 166 | + has_more = True | |
| 167 | + start_cursor: Optional[str] = None | |
| 168 | + | |
| 169 | + while has_more: | |
| 170 | + url = f"{NOTION_BASE_URL}/blocks/{page_id}/children?page_size=100" | |
| 171 | + if start_cursor: | |
| 172 | + url += f"&start_cursor={start_cursor}" | |
| 173 | + | |
| 174 | + resp = requests.get(url, headers=self._headers(), timeout=30) | |
| 175 | + resp.raise_for_status() | |
| 176 | + data = resp.json() | |
| 177 | + | |
| 178 | + blocks.extend(data.get("results", [])) | |
| 179 | + has_more = data.get("has_more", False) | |
| 180 | + start_cursor = data.get("next_cursor") | |
| 181 | + | |
| 182 | + return blocks | |
| 183 | + | |
| 184 | + def _blocks_to_text(self, blocks: list) -> str: | |
| 185 | + """Convert Notion block objects to markdown text.""" | |
| 186 | + lines: List[str] = [] | |
| 187 | + numbered_index = 0 | |
| 188 | + | |
| 189 | + for block in blocks: | |
| 190 | + block_type = block.get("type", "") | |
| 191 | + block_data = block.get(block_type, {}) | |
| 192 | + | |
| 193 | + if block_type == "paragraph": | |
| 194 | + text = _rich_text_to_str(block_data.get("rich_text", [])) | |
| 195 | + lines.append(text) | |
| 196 | + numbered_index = 0 | |
| 197 | + | |
| 198 | + elif block_type == "heading_1": | |
| 199 | + text = _rich_text_to_str(block_data.get("rich_text", [])) | |
| 200 | + lines.append(f"# {text}") | |
| 201 | + numbered_index = 0 | |
| 202 | + | |
| 203 | + elif block_type == "heading_2": | |
| 204 | + text = _rich_text_to_str(block_data.get("rich_text", [])) | |
| 205 | + lines.append(f"## {text}") | |
| 206 | + numbered_index = 0 | |
| 207 | + | |
| 208 | + elif block_type == "heading_3": | |
| 209 | + text = _rich_text_to_str(block_data.get("rich_text", [])) | |
| 210 | + lines.append(f"### {text}") | |
| 211 | + numbered_index = 0 | |
| 212 | + | |
| 213 | + elif block_type == "bulleted_list_item": | |
| 214 | + text = _rich_text_to_str(block_data.get("rich_text", [])) | |
| 215 | + lines.append(f"- {text}") | |
| 216 | + numbered_index = 0 | |
| 217 | + | |
| 218 | + elif block_type == "numbered_list_item": | |
| 219 | + numbered_index += 1 | |
| 220 | + text = _rich_text_to_str(block_data.get("rich_text", [])) | |
| 221 | + lines.append(f"{numbered_index}. {text}") | |
| 222 | + | |
| 223 | + elif block_type == "to_do": | |
| 224 | + text = _rich_text_to_str(block_data.get("rich_text", [])) | |
| 225 | + checked = block_data.get("checked", False) | |
| 226 | + marker = "[x]" if checked else "[ ]" | |
| 227 | + lines.append(f"- {marker} {text}") | |
| 228 | + numbered_index = 0 | |
| 229 | + | |
| 230 | + elif block_type == "code": | |
| 231 | + text = _rich_text_to_str(block_data.get("rich_text", [])) | |
| 232 | + language = block_data.get("language", "") | |
| 233 | + lines.append(f"```{language}") | |
| 234 | + lines.append(text) | |
| 235 | + lines.append("```") | |
| 236 | + numbered_index = 0 | |
| 237 | + | |
| 238 | + elif block_type == "quote": | |
| 239 | + text = _rich_text_to_str(block_data.get("rich_text", [])) | |
| 240 | + lines.append(f"> {text}") | |
| 241 | + numbered_index = 0 | |
| 242 | + | |
| 243 | + elif block_type == "callout": | |
| 244 | + text = _rich_text_to_str(block_data.get("rich_text", [])) | |
| 245 | + icon = block_data.get("icon", {}) | |
| 246 | + emoji = icon.get("emoji", "") if icon else "" | |
| 247 | + prefix = f"{emoji} " if emoji else "" | |
| 248 | + lines.append(f"> {prefix}{text}") | |
| 249 | + numbered_index = 0 | |
| 250 | + | |
| 251 | + elif block_type == "toggle": | |
| 252 | + text = _rich_text_to_str(block_data.get("rich_text", [])) | |
| 253 | + lines.append(f"<details><summary>{text}</summary></details>") | |
| 254 | + numbered_index = 0 | |
| 255 | + | |
| 256 | + elif block_type == "divider": | |
| 257 | + lines.append("---") | |
| 258 | + numbered_index = 0 | |
| 259 | + | |
| 260 | + else: | |
| 261 | + # Unsupported block type — try to extract any rich_text | |
| 262 | + text = _rich_text_to_str(block_data.get("rich_text", [])) | |
| 263 | + if text: | |
| 264 | + lines.append(text) | |
| 265 | + numbered_index = 0 | |
| 266 | + | |
| 267 | + return "\n\n".join(lines) | |
| 268 | + | |
| 269 | + def fetch_database_as_table(self, database_id: str) -> str: | |
| 270 | + """Fetch a Notion database and return its rows as CSV-like text. | |
| 271 | + | |
| 272 | + Each row is a page in the database. Columns are derived from | |
| 273 | + the database properties. | |
| 274 | + """ | |
| 275 | + # First, get database schema for column order | |
| 276 | + resp = requests.get( | |
| 277 | + f"{NOTION_BASE_URL}/databases/{database_id}", | |
| 278 | + headers=self._headers(), | |
| 279 | + timeout=15, | |
| 280 | + ) | |
| 281 | + resp.raise_for_status() | |
| 282 | + db_meta = resp.json() | |
| 283 | + properties = db_meta.get("properties", {}) | |
| 284 | + columns = sorted(properties.keys()) | |
| 285 | + | |
| 286 | + # Query all rows | |
| 287 | + rows: List[Dict] = [] | |
| 288 | + has_more = True | |
| 289 | + start_cursor: Optional[str] = None | |
| 290 | + | |
| 291 | + while has_more: | |
| 292 | + body: Dict = {} | |
| 293 | + if start_cursor: | |
| 294 | + body["start_cursor"] = start_cursor | |
| 295 | + | |
| 296 | + resp = requests.post( | |
| 297 | + f"{NOTION_BASE_URL}/databases/{database_id}/query", | |
| 298 | + headers=self._headers(), | |
| 299 | + json=body, | |
| 300 | + timeout=30, | |
| 301 | + ) | |
| 302 | + resp.raise_for_status() | |
| 303 | + data = resp.json() | |
| 304 | + rows.extend(data.get("results", [])) | |
| 305 | + has_more = data.get("has_more", False) | |
| 306 | + start_cursor = data.get("next_cursor") | |
| 307 | + | |
| 308 | + # Build CSV-like output | |
| 309 | + lines: List[str] = [] | |
| 310 | + lines.append(",".join(columns)) | |
| 311 | + | |
| 312 | + for row in rows: | |
| 313 | + row_props = row.get("properties", {}) | |
| 314 | + values: List[str] = [] | |
| 315 | + for col in columns: | |
| 316 | + prop = row_props.get(col, {}) | |
| 317 | + values.append(_extract_property_value(prop)) | |
| 318 | + lines.append(",".join(values)) | |
| 319 | + | |
| 320 | + return "\n".join(lines) | |
| 321 | + | |
| 322 | + | |
| 323 | +def _rich_text_to_str(rich_text: list) -> str: | |
| 324 | + """Extract plain text from a Notion rich_text array.""" | |
| 325 | + return "".join(item.get("plain_text", "") for item in rich_text) | |
| 326 | + | |
| 327 | + | |
| 328 | +def _extract_page_title(page: dict) -> str: | |
| 329 | + """Extract the title from a Notion page object.""" | |
| 330 | + properties = page.get("properties", {}) | |
| 331 | + for prop in properties.values(): | |
| 332 | + if prop.get("type") == "title": | |
| 333 | + return _rich_text_to_str(prop.get("title", [])) | |
| 334 | + return "Untitled" | |
| 335 | + | |
| 336 | + | |
| 337 | +def _extract_property_value(prop: dict) -> str: | |
| 338 | + """Extract a display string from a Notion property value.""" | |
| 339 | + prop_type = prop.get("type", "") | |
| 340 | + | |
| 341 | + if prop_type == "title": | |
| 342 | + return _rich_text_to_str(prop.get("title", [])) | |
| 343 | + elif prop_type == "rich_text": | |
| 344 | + return _rich_text_to_str(prop.get("rich_text", [])) | |
| 345 | + elif prop_type == "number": | |
| 346 | + val = prop.get("number") | |
| 347 | + return str(val) if val is not None else "" | |
| 348 | + elif prop_type == "select": | |
| 349 | + sel = prop.get("select") | |
| 350 | + return sel.get("name", "") if sel else "" | |
| 351 | + elif prop_type == "multi_select": | |
| 352 | + return "; ".join(s.get("name", "") for s in prop.get("multi_select", [])) | |
| 353 | + elif prop_type == "date": | |
| 354 | + date = prop.get("date") | |
| 355 | + if date: | |
| 356 | + start = date.get("start", "") | |
| 357 | + end = date.get("end", "") | |
| 358 | + return f"{start} - {end}" if end else start | |
| 359 | + return "" | |
| 360 | + elif prop_type == "checkbox": | |
| 361 | + return str(prop.get("checkbox", False)) | |
| 362 | + elif prop_type == "url": | |
| 363 | + return prop.get("url", "") or "" | |
| 364 | + elif prop_type == "email": | |
| 365 | + return prop.get("email", "") or "" | |
| 366 | + elif prop_type == "phone_number": | |
| 367 | + return prop.get("phone_number", "") or "" | |
| 368 | + elif prop_type == "status": | |
| 369 | + status = prop.get("status") | |
| 370 | + return status.get("name", "") if status else "" | |
| 371 | + elif prop_type == "people": | |
| 372 | + return "; ".join(p.get("name", "") for p in prop.get("people", [])) | |
| 373 | + elif prop_type == "relation": | |
| 374 | + return "; ".join(r.get("id", "") for r in prop.get("relation", [])) | |
| 375 | + elif prop_type == "formula": | |
| 376 | + formula = prop.get("formula", {}) | |
| 377 | + f_type = formula.get("type", "") | |
| 378 | + return str(formula.get(f_type, "")) | |
| 379 | + else: | |
| 380 | + return "" |
| --- a/video_processor/sources/notion_source.py | |
| +++ b/video_processor/sources/notion_source.py | |
| @@ -0,0 +1,380 @@ | |
| --- a/video_processor/sources/notion_source.py | |
| +++ b/video_processor/sources/notion_source.py | |
| @@ -0,0 +1,380 @@ | |
| 1 | """Notion API source connector for fetching pages and databases.""" |
| 2 | |
| 3 | import logging |
| 4 | import os |
| 5 | from pathlib import Path |
| 6 | from typing import Dict, List, Optional |
| 7 | |
| 8 | import requests |
| 9 | |
| 10 | from video_processor.sources.base import BaseSource, SourceFile |
| 11 | |
| 12 | logger = logging.getLogger(__name__) |
| 13 | |
| 14 | NOTION_VERSION = "2022-06-28" |
| 15 | NOTION_BASE_URL = "https://api.notion.com/v1" |
| 16 | |
| 17 | |
class NotionSource(BaseSource):
    """
    Fetch pages and databases from Notion via the public API.

    Requires a Notion integration token (internal integration).
    Set NOTION_API_KEY env var or pass token directly.

    Requires: pip install requests
    """

    def __init__(
        self,
        token: Optional[str] = None,
        database_id: Optional[str] = None,
        page_ids: Optional[List[str]] = None,
    ):
        # Fall back to the environment so callers don't have to thread
        # the token through configuration explicitly.
        self.token = token or os.environ.get("NOTION_API_KEY", "")
        self.database_id = database_id
        self.page_ids = page_ids or []

    def _headers(self) -> Dict[str, str]:
        """Headers required on every Notion API request."""
        return {
            "Authorization": f"Bearer {self.token}",
            "Notion-Version": NOTION_VERSION,
            "Content-Type": "application/json",
        }

    def authenticate(self) -> bool:
        """Check token is set and make a test call to the Notion API."""
        if not self.token:
            logger.error("Notion token not set. Provide token or set NOTION_API_KEY.")
            return False
        try:
            resp = requests.get(
                f"{NOTION_BASE_URL}/users/me",
                headers=self._headers(),
                timeout=15,
            )
            resp.raise_for_status()
            user = resp.json()
            logger.info("Authenticated with Notion as %s", user.get("name", "unknown"))
            return True
        except requests.RequestException as exc:
            logger.error("Notion authentication failed: %s", exc)
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Notion pages as SourceFiles.

        If database_id is set, query the database for pages.
        If page_ids is set, fetch each page individually.
        (folder_id/folder_path/patterns are accepted for interface
        compatibility with other sources; Notion does not use them.)
        """
        files: List[SourceFile] = []

        if self.database_id:
            files.extend(self._list_from_database(self.database_id))

        if self.page_ids:
            files.extend(self._list_from_pages(self.page_ids))

        if not files:
            logger.warning("No pages found. Set database_id or page_ids.")

        return files

    def _query_database_pages(self, database_id: str) -> List[Dict]:
        """Return all pages (rows) of a database, following pagination.

        Shared by _list_from_database and fetch_database_as_table so the
        cursor-following logic lives in exactly one place.
        """
        pages: List[Dict] = []
        has_more = True
        start_cursor: Optional[str] = None

        while has_more:
            body: Dict = {}
            if start_cursor:
                body["start_cursor"] = start_cursor

            resp = requests.post(
                f"{NOTION_BASE_URL}/databases/{database_id}/query",
                headers=self._headers(),
                json=body,
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()

            pages.extend(data.get("results", []))
            has_more = data.get("has_more", False)
            start_cursor = data.get("next_cursor")

        return pages

    def _list_from_database(self, database_id: str) -> List[SourceFile]:
        """Query a Notion database and return SourceFiles for each row."""
        return [
            SourceFile(
                name=_extract_page_title(page),
                id=page["id"],
                mime_type="text/markdown",
                modified_at=page.get("last_edited_time"),
            )
            for page in self._query_database_pages(database_id)
        ]

    def _list_from_pages(self, page_ids: List[str]) -> List[SourceFile]:
        """Fetch individual pages by ID and return SourceFiles.

        Pages that fail to fetch are logged and skipped rather than
        aborting the whole listing.
        """
        files: List[SourceFile] = []
        for page_id in page_ids:
            try:
                resp = requests.get(
                    f"{NOTION_BASE_URL}/pages/{page_id}",
                    headers=self._headers(),
                    timeout=15,
                )
                resp.raise_for_status()
                page = resp.json()
                files.append(
                    SourceFile(
                        name=_extract_page_title(page),
                        id=page["id"],
                        mime_type="text/markdown",
                        modified_at=page.get("last_edited_time"),
                    )
                )
            except requests.RequestException as exc:
                logger.error("Failed to fetch page %s: %s", page_id, exc)
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download page blocks as markdown text and save to destination."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        blocks = self._fetch_all_blocks(file.id)
        text = self._blocks_to_text(blocks)

        # Prepend title
        content = f"# {file.name}\n\n{text}"
        destination.write_text(content, encoding="utf-8")
        logger.info("Saved Notion page to %s", destination)
        return destination

    def _fetch_all_blocks(self, page_id: str) -> list:
        """Fetch all child blocks for a page, handling pagination."""
        blocks: list = []
        has_more = True
        start_cursor: Optional[str] = None

        while has_more:
            # Pass the cursor via `params` so requests URL-encodes it;
            # concatenating it into the URL by hand breaks on cursors
            # containing reserved characters.
            params: Dict = {"page_size": 100}
            if start_cursor:
                params["start_cursor"] = start_cursor

            resp = requests.get(
                f"{NOTION_BASE_URL}/blocks/{page_id}/children",
                headers=self._headers(),
                params=params,
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()

            blocks.extend(data.get("results", []))
            has_more = data.get("has_more", False)
            start_cursor = data.get("next_cursor")

        return blocks

    def _blocks_to_text(self, blocks: list) -> str:
        """Convert Notion block objects to markdown text.

        numbered_index tracks consecutive numbered_list_item blocks so
        ordered lists restart at 1 after any other block type.
        """
        lines: List[str] = []
        numbered_index = 0

        for block in blocks:
            block_type = block.get("type", "")
            block_data = block.get(block_type, {})

            if block_type == "paragraph":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(text)
                numbered_index = 0

            elif block_type == "heading_1":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"# {text}")
                numbered_index = 0

            elif block_type == "heading_2":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"## {text}")
                numbered_index = 0

            elif block_type == "heading_3":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"### {text}")
                numbered_index = 0

            elif block_type == "bulleted_list_item":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"- {text}")
                numbered_index = 0

            elif block_type == "numbered_list_item":
                numbered_index += 1
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"{numbered_index}. {text}")

            elif block_type == "to_do":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                checked = block_data.get("checked", False)
                marker = "[x]" if checked else "[ ]"
                lines.append(f"- {marker} {text}")
                numbered_index = 0

            elif block_type == "code":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                language = block_data.get("language", "")
                lines.append(f"```{language}")
                lines.append(text)
                lines.append("```")
                numbered_index = 0

            elif block_type == "quote":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"> {text}")
                numbered_index = 0

            elif block_type == "callout":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                icon = block_data.get("icon", {})
                emoji = icon.get("emoji", "") if icon else ""
                prefix = f"{emoji} " if emoji else ""
                lines.append(f"> {prefix}{text}")
                numbered_index = 0

            elif block_type == "toggle":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"<details><summary>{text}</summary></details>")
                numbered_index = 0

            elif block_type == "divider":
                lines.append("---")
                numbered_index = 0

            else:
                # Unsupported block type — try to extract any rich_text
                text = _rich_text_to_str(block_data.get("rich_text", []))
                if text:
                    lines.append(text)
                numbered_index = 0

        return "\n\n".join(lines)

    def fetch_database_as_table(self, database_id: str) -> str:
        """Fetch a Notion database and return its rows as CSV text.

        Each row is a page in the database. Columns are derived from
        the database properties (sorted alphabetically). Values are
        quoted per RFC 4180 when they contain commas, quotes, or
        newlines, so the output is parseable CSV.
        """
        import csv
        import io

        # First, get database schema for column order
        resp = requests.get(
            f"{NOTION_BASE_URL}/databases/{database_id}",
            headers=self._headers(),
            timeout=15,
        )
        resp.raise_for_status()
        db_meta = resp.json()
        columns = sorted(db_meta.get("properties", {}).keys())

        # Query all rows via the shared pagination helper
        rows = self._query_database_pages(database_id)

        # Build CSV output with proper quoting; the original naive
        # ",".join produced a broken table for values containing commas.
        buf = io.StringIO()
        writer = csv.writer(buf, lineterminator="\n")
        writer.writerow(columns)
        for row in rows:
            row_props = row.get("properties", {})
            writer.writerow(
                [_extract_property_value(row_props.get(col, {})) for col in columns]
            )
        # Preserve the original contract of no trailing newline.
        return buf.getvalue().rstrip("\n")
| 321 | |
| 322 | |
| 323 | def _rich_text_to_str(rich_text: list) -> str: |
| 324 | """Extract plain text from a Notion rich_text array.""" |
| 325 | return "".join(item.get("plain_text", "") for item in rich_text) |
| 326 | |
| 327 | |
| 328 | def _extract_page_title(page: dict) -> str: |
| 329 | """Extract the title from a Notion page object.""" |
| 330 | properties = page.get("properties", {}) |
| 331 | for prop in properties.values(): |
| 332 | if prop.get("type") == "title": |
| 333 | return _rich_text_to_str(prop.get("title", [])) |
| 334 | return "Untitled" |
| 335 | |
| 336 | |
| 337 | def _extract_property_value(prop: dict) -> str: |
| 338 | """Extract a display string from a Notion property value.""" |
| 339 | prop_type = prop.get("type", "") |
| 340 | |
| 341 | if prop_type == "title": |
| 342 | return _rich_text_to_str(prop.get("title", [])) |
| 343 | elif prop_type == "rich_text": |
| 344 | return _rich_text_to_str(prop.get("rich_text", [])) |
| 345 | elif prop_type == "number": |
| 346 | val = prop.get("number") |
| 347 | return str(val) if val is not None else "" |
| 348 | elif prop_type == "select": |
| 349 | sel = prop.get("select") |
| 350 | return sel.get("name", "") if sel else "" |
| 351 | elif prop_type == "multi_select": |
| 352 | return "; ".join(s.get("name", "") for s in prop.get("multi_select", [])) |
| 353 | elif prop_type == "date": |
| 354 | date = prop.get("date") |
| 355 | if date: |
| 356 | start = date.get("start", "") |
| 357 | end = date.get("end", "") |
| 358 | return f"{start} - {end}" if end else start |
| 359 | return "" |
| 360 | elif prop_type == "checkbox": |
| 361 | return str(prop.get("checkbox", False)) |
| 362 | elif prop_type == "url": |
| 363 | return prop.get("url", "") or "" |
| 364 | elif prop_type == "email": |
| 365 | return prop.get("email", "") or "" |
| 366 | elif prop_type == "phone_number": |
| 367 | return prop.get("phone_number", "") or "" |
| 368 | elif prop_type == "status": |
| 369 | status = prop.get("status") |
| 370 | return status.get("name", "") if status else "" |
| 371 | elif prop_type == "people": |
| 372 | return "; ".join(p.get("name", "") for p in prop.get("people", [])) |
| 373 | elif prop_type == "relation": |
| 374 | return "; ".join(r.get("id", "") for r in prop.get("relation", [])) |
| 375 | elif prop_type == "formula": |
| 376 | formula = prop.get("formula", {}) |
| 377 | f_type = formula.get("type", "") |
| 378 | return str(formula.get(f_type, "")) |
| 379 | else: |
| 380 | return "" |
| --- a/video_processor/sources/obsidian_source.py | ||
| +++ b/video_processor/sources/obsidian_source.py | ||
| @@ -0,0 +1,178 @@ | ||
| 1 | +"""Obsidian vault source connector for ingesting markdown notes.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import re | |
| 5 | +import shutil | |
| 6 | +from datetime import datetime, timezone | |
| 7 | +from pathlib import Path | |
| 8 | +from typing import List, Optional, Tuple | |
| 9 | + | |
| 10 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 11 | + | |
| 12 | +logger = logging.getLogger(__name__) | |
| 13 | + | |
| 14 | + | |
| 15 | +def parse_note(path: Path) -> dict: | |
| 16 | + """Parse an Obsidian markdown note and extract structured content. | |
| 17 | + | |
| 18 | + Returns a dict with: | |
| 19 | + - frontmatter: dict of YAML frontmatter metadata | |
| 20 | + - links: list of linked page names from [[wiki-links]] | |
| 21 | + - tags: list of tags from #tag occurrences | |
| 22 | + - headings: list of dicts with level and text | |
| 23 | + - body: markdown text without frontmatter | |
| 24 | + """ | |
| 25 | + text = path.read_text(encoding="utf-8") | |
| 26 | + | |
| 27 | + # Extract YAML frontmatter (simple key: value parser, stdlib only) | |
| 28 | + frontmatter: dict = {} | |
| 29 | + body = text | |
| 30 | + fm_match = re.match(r"\A---\n(.*?\n)---\n?(.*)", text, re.DOTALL) | |
| 31 | + if fm_match: | |
| 32 | + fm_text = fm_match.group(1) | |
| 33 | + for line in fm_text.strip().splitlines(): | |
| 34 | + kv = re.match(r"^([A-Za-z_][A-Za-z0-9_ -]*):\s*(.*)", line) | |
| 35 | + if kv: | |
| 36 | + key = kv.group(1).strip() | |
| 37 | + value = kv.group(2).strip() | |
| 38 | + # Strip surrounding quotes | |
| 39 | + if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"): | |
| 40 | + value = value[1:-1] | |
| 41 | + # Handle YAML-style lists on a single line [a, b, c] | |
| 42 | + list_match = re.match(r"^\[(.+)\]$", value) | |
| 43 | + if list_match: | |
| 44 | + value = [v.strip().strip("\"'") for v in list_match.group(1).split(",")] | |
| 45 | + frontmatter[key] = value | |
| 46 | + body = fm_match.group(2) | |
| 47 | + | |
| 48 | + # Extract wiki-links: [[page]] and [[page|alias]] | |
| 49 | + link_pattern = re.compile(r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]") | |
| 50 | + links = link_pattern.findall(body) | |
| 51 | + | |
| 52 | + # Extract tags: #tag (but not inside code blocks or frontmatter) | |
| 53 | + # Match #tag but not #[[tag]] (that's Logseq style) and not ## headings | |
| 54 | + tag_pattern = re.compile(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)") | |
| 55 | + tags = tag_pattern.findall(body) | |
| 56 | + | |
| 57 | + # Extract headings hierarchy | |
| 58 | + heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE) | |
| 59 | + headings = [ | |
| 60 | + {"level": len(m.group(1)), "text": m.group(2).strip()} | |
| 61 | + for m in heading_pattern.finditer(body) | |
| 62 | + ] | |
| 63 | + | |
| 64 | + return { | |
| 65 | + "frontmatter": frontmatter, | |
| 66 | + "links": links, | |
| 67 | + "tags": tags, | |
| 68 | + "headings": headings, | |
| 69 | + "body": body, | |
| 70 | + } | |
| 71 | + | |
| 72 | + | |
| 73 | +def ingest_vault(vault_path: Path) -> dict: | |
| 74 | + """Ingest an entire Obsidian vault and return structured data. | |
| 75 | + | |
| 76 | + Returns a dict with: | |
| 77 | + - notes: list of dicts with name, tags, frontmatter, text | |
| 78 | + - links: list of (source, target) tuples from wiki-links | |
| 79 | + """ | |
| 80 | + vault_path = Path(vault_path) | |
| 81 | + notes: List[dict] = [] | |
| 82 | + links: List[Tuple[str, str]] = [] | |
| 83 | + | |
| 84 | + md_files = sorted(vault_path.rglob("*.md")) | |
| 85 | + logger.info("Found %d markdown files in vault %s", len(md_files), vault_path) | |
| 86 | + | |
| 87 | + for md_file in md_files: | |
| 88 | + note_name = md_file.stem | |
| 89 | + try: | |
| 90 | + parsed = parse_note(md_file) | |
| 91 | + except Exception: | |
| 92 | + logger.warning("Failed to parse note %s", md_file) | |
| 93 | + continue | |
| 94 | + | |
| 95 | + notes.append( | |
| 96 | + { | |
| 97 | + "name": note_name, | |
| 98 | + "tags": parsed["tags"], | |
| 99 | + "frontmatter": parsed["frontmatter"], | |
| 100 | + "text": parsed["body"], | |
| 101 | + } | |
| 102 | + ) | |
| 103 | + | |
| 104 | + for linked_page in parsed["links"]: | |
| 105 | + links.append((note_name, linked_page)) | |
| 106 | + | |
| 107 | + logger.info( | |
| 108 | + "Ingested %d notes with %d links from vault %s", | |
| 109 | + len(notes), | |
| 110 | + len(links), | |
| 111 | + vault_path, | |
| 112 | + ) | |
| 113 | + return {"notes": notes, "links": links} | |
| 114 | + | |
| 115 | + | |
| 116 | +class ObsidianSource(BaseSource): | |
| 117 | + """Source connector for Obsidian vaults.""" | |
| 118 | + | |
| 119 | + def __init__(self, vault_path: str) -> None: | |
| 120 | + self.vault_path = Path(vault_path) | |
| 121 | + | |
| 122 | + def authenticate(self) -> bool: | |
| 123 | + """Check that the vault path exists and contains .md files.""" | |
| 124 | + if not self.vault_path.is_dir(): | |
| 125 | + logger.error("Vault path does not exist: %s", self.vault_path) | |
| 126 | + return False | |
| 127 | + md_files = list(self.vault_path.rglob("*.md")) | |
| 128 | + if not md_files: | |
| 129 | + logger.error("No markdown files found in vault: %s", self.vault_path) | |
| 130 | + return False | |
| 131 | + logger.info( | |
| 132 | + "Obsidian vault authenticated: %s (%d .md files)", | |
| 133 | + self.vault_path, | |
| 134 | + len(md_files), | |
| 135 | + ) | |
| 136 | + return True | |
| 137 | + | |
| 138 | + def list_videos( | |
| 139 | + self, | |
| 140 | + folder_id: Optional[str] = None, | |
| 141 | + folder_path: Optional[str] = None, | |
| 142 | + patterns: Optional[List[str]] = None, | |
| 143 | + ) -> List[SourceFile]: | |
| 144 | + """List all .md files in the vault recursively as SourceFile objects.""" | |
| 145 | + search_root = self.vault_path | |
| 146 | + if folder_path: | |
| 147 | + search_root = self.vault_path / folder_path | |
| 148 | + | |
| 149 | + md_files = sorted(search_root.rglob("*.md")) | |
| 150 | + results: List[SourceFile] = [] | |
| 151 | + | |
| 152 | + for md_file in md_files: | |
| 153 | + relative = md_file.relative_to(self.vault_path) | |
| 154 | + stat = md_file.stat() | |
| 155 | + modified_dt = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc) | |
| 156 | + | |
| 157 | + results.append( | |
| 158 | + SourceFile( | |
| 159 | + name=md_file.name, | |
| 160 | + id=str(relative), | |
| 161 | + size_bytes=stat.st_size, | |
| 162 | + mime_type="text/markdown", | |
| 163 | + modified_at=modified_dt.isoformat(), | |
| 164 | + path=str(relative), | |
| 165 | + ) | |
| 166 | + ) | |
| 167 | + | |
| 168 | + logger.info("Listed %d files from vault %s", len(results), self.vault_path) | |
| 169 | + return results | |
| 170 | + | |
| 171 | + def download(self, file: SourceFile, destination: Path) -> Path: | |
| 172 | + """Copy a vault file to the destination path.""" | |
| 173 | + source = self.vault_path / file.id | |
| 174 | + destination = Path(destination) | |
| 175 | + destination.parent.mkdir(parents=True, exist_ok=True) | |
| 176 | + shutil.copy2(source, destination) | |
| 177 | + logger.info("Copied %s -> %s", source, destination) | |
| 178 | + return destination |
| --- a/video_processor/sources/obsidian_source.py | |
| +++ b/video_processor/sources/obsidian_source.py | |
| @@ -0,0 +1,178 @@ | |
| --- a/video_processor/sources/obsidian_source.py | |
| +++ b/video_processor/sources/obsidian_source.py | |
| @@ -0,0 +1,178 @@ | |
| 1 | """Obsidian vault source connector for ingesting markdown notes.""" |
| 2 | |
| 3 | import logging |
| 4 | import re |
| 5 | import shutil |
| 6 | from datetime import datetime, timezone |
| 7 | from pathlib import Path |
| 8 | from typing import List, Optional, Tuple |
| 9 | |
| 10 | from video_processor.sources.base import BaseSource, SourceFile |
| 11 | |
| 12 | logger = logging.getLogger(__name__) |
| 13 | |
| 14 | |
def parse_note(path: Path) -> dict:
    """Parse an Obsidian markdown note and extract structured content.

    Parameters:
        path: Path to a ``.md`` file; read as UTF-8.

    Returns a dict with:
        - frontmatter: dict of YAML frontmatter metadata (simple
          ``key: value`` pairs and single-line ``[a, b]`` lists only)
        - links: list of linked page names from [[wiki-links]]
        - tags: list of tags from #tag occurrences (duplicates preserved)
        - headings: list of dicts with level and text
        - body: markdown text without frontmatter
    """
    text = path.read_text(encoding="utf-8")

    # Extract YAML frontmatter (simple key: value parser, stdlib only).
    # ROBUSTNESS FIX: accept CRLF line endings as well as LF, so notes
    # saved on Windows still have their frontmatter recognised.
    frontmatter: dict = {}
    body = text
    fm_match = re.match(r"\A---\r?\n(.*?\r?\n)---\r?\n?(.*)", text, re.DOTALL)
    if fm_match:
        fm_text = fm_match.group(1)
        for line in fm_text.strip().splitlines():
            kv = re.match(r"^([A-Za-z_][A-Za-z0-9_ -]*):\s*(.*)", line)
            if kv:
                key = kv.group(1).strip()
                value = kv.group(2).strip()
                # Strip surrounding quotes
                if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                    value = value[1:-1]
                # Handle YAML-style lists on a single line [a, b, c]
                list_match = re.match(r"^\[(.+)\]$", value)
                if list_match:
                    value = [v.strip().strip("\"'") for v in list_match.group(1).split(",")]
                frontmatter[key] = value
        body = fm_match.group(2)

    # Extract wiki-links: [[page]] and [[page|alias]] (alias is discarded)
    link_pattern = re.compile(r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]")
    links = link_pattern.findall(body)

    # Extract tags: #tag occurrences anywhere in the body.
    # NOTE(review): this does NOT exclude fenced code blocks, so #words in
    # code samples are counted too. Headings ("# Title") are not matched
    # because a tag character must follow the hash immediately.
    tag_pattern = re.compile(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)")
    tags = tag_pattern.findall(body)

    # Extract headings hierarchy (levels 1-6)
    heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
    headings = [
        {"level": len(m.group(1)), "text": m.group(2).strip()}
        for m in heading_pattern.finditer(body)
    ]

    return {
        "frontmatter": frontmatter,
        "links": links,
        "tags": tags,
        "headings": headings,
        "body": body,
    }
| 71 | |
| 72 | |
def ingest_vault(vault_path: Path) -> dict:
    """Ingest an entire Obsidian vault and return structured data.

    Parameters:
        vault_path: Root directory of the vault; scanned recursively for
            ``*.md`` files (sorted for deterministic ordering).

    Returns a dict with:
        - notes: list of dicts with name, tags, frontmatter, text
        - links: list of (source, target) tuples from wiki-links

    Ingestion is best-effort: notes that fail to parse are skipped with a
    warning and do not abort the run.
    """
    vault_path = Path(vault_path)
    notes: List[dict] = []
    links: List[Tuple[str, str]] = []

    md_files = sorted(vault_path.rglob("*.md"))
    logger.info("Found %d markdown files in vault %s", len(md_files), vault_path)

    for md_file in md_files:
        note_name = md_file.stem
        try:
            parsed = parse_note(md_file)
        except Exception:
            # FIX: include the traceback so parse failures are diagnosable
            # (previously the exception detail was silently dropped).
            logger.warning("Failed to parse note %s", md_file, exc_info=True)
            continue

        notes.append(
            {
                "name": note_name,
                "tags": parsed["tags"],
                "frontmatter": parsed["frontmatter"],
                "text": parsed["body"],
            }
        )

        for linked_page in parsed["links"]:
            links.append((note_name, linked_page))

    logger.info(
        "Ingested %d notes with %d links from vault %s",
        len(notes),
        len(links),
        vault_path,
    )
    return {"notes": notes, "links": links}
| 114 | |
| 115 | |
class ObsidianSource(BaseSource):
    """Source connector that exposes an Obsidian vault as a file source."""

    def __init__(self, vault_path: str) -> None:
        self.vault_path = Path(vault_path)

    def authenticate(self) -> bool:
        """Check that the vault path exists and contains .md files."""
        if not self.vault_path.is_dir():
            logger.error("Vault path does not exist: %s", self.vault_path)
            return False
        note_count = sum(1 for _ in self.vault_path.rglob("*.md"))
        if note_count == 0:
            logger.error("No markdown files found in vault: %s", self.vault_path)
            return False
        logger.info(
            "Obsidian vault authenticated: %s (%d .md files)",
            self.vault_path,
            note_count,
        )
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List all .md files in the vault recursively as SourceFile objects."""
        # An optional folder_path narrows the scan to a vault subfolder.
        root = self.vault_path / folder_path if folder_path else self.vault_path

        listing: List[SourceFile] = []
        for note_path in sorted(root.rglob("*.md")):
            rel = note_path.relative_to(self.vault_path)
            info = note_path.stat()
            mtime = datetime.fromtimestamp(info.st_mtime, tz=timezone.utc)
            listing.append(
                SourceFile(
                    name=note_path.name,
                    id=str(rel),
                    size_bytes=info.st_size,
                    mime_type="text/markdown",
                    modified_at=mtime.isoformat(),
                    path=str(rel),
                )
            )

        logger.info("Listed %d files from vault %s", len(listing), self.vault_path)
        return listing

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Copy a vault file to the destination path."""
        origin = self.vault_path / file.id
        target = Path(destination)
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(origin, target)
        logger.info("Copied %s -> %s", origin, target)
        return target
| --- a/video_processor/sources/onenote_source.py | ||
| +++ b/video_processor/sources/onenote_source.py | ||
| @@ -0,0 +1,222 @@ | ||
| 1 | +"""Microsoft OneNote source connector using the m365 CLI (cli-microsoft365). | |
| 2 | + | |
| 3 | +Fetches pages from OneNote notebooks via the `m365` CLI tool. | |
| 4 | +Outputs plain text suitable for KG ingestion. | |
| 5 | + | |
| 6 | +Requires: npm install -g @pnp/cli-microsoft365 | |
| 7 | +Auth: m365 login (interactive) | |
| 8 | +Docs: https://pnp.github.io/cli-microsoft365/ | |
| 9 | +""" | |
| 10 | + | |
| 11 | +import json | |
| 12 | +import logging | |
| 13 | +import re | |
| 14 | +import shutil | |
| 15 | +import subprocess | |
| 16 | +from pathlib import Path | |
| 17 | +from typing import Any, List, Optional | |
| 18 | + | |
| 19 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 20 | + | |
| 21 | +logger = logging.getLogger(__name__) | |
| 22 | + | |
| 23 | + | |
| 24 | +def _run_m365(args: List[str], timeout: int = 30) -> Any: | |
| 25 | + """Run an m365 CLI command and return parsed JSON output.""" | |
| 26 | + cmd = ["m365"] + args + ["--output", "json"] | |
| 27 | + proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) | |
| 28 | + if proc.returncode != 0: | |
| 29 | + raise RuntimeError(f"m365 {' '.join(args)} failed: {proc.stderr.strip()}") | |
| 30 | + try: | |
| 31 | + return json.loads(proc.stdout) | |
| 32 | + except json.JSONDecodeError: | |
| 33 | + return proc.stdout.strip() | |
| 34 | + | |
| 35 | + | |
| 36 | +def _html_to_text(html: str) -> str: | |
| 37 | + """Strip HTML tags and decode entities to produce plain text. | |
| 38 | + | |
| 39 | + Uses only stdlib ``re`` — no external dependencies. | |
| 40 | + """ | |
| 41 | + # Remove script/style blocks entirely | |
| 42 | + text = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE) | |
| 43 | + # Replace <br>, <p>, <div>, <li>, <tr> with newlines for readability | |
| 44 | + text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE) | |
| 45 | + text = re.sub(r"</(p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE) | |
| 46 | + # Strip remaining tags | |
| 47 | + text = re.sub(r"<[^>]+>", "", text) | |
| 48 | + # Decode common HTML entities | |
| 49 | + entity_map = { | |
| 50 | + "&": "&", | |
| 51 | + "<": "<", | |
| 52 | + ">": ">", | |
| 53 | + """: '"', | |
| 54 | + "'": "'", | |
| 55 | + "'": "'", | |
| 56 | + " ": " ", | |
| 57 | + } | |
| 58 | + for entity, char in entity_map.items(): | |
| 59 | + text = text.replace(entity, char) | |
| 60 | + # Decode numeric entities ({ and ) | |
| 61 | + text = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), text) | |
| 62 | + text = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), text) | |
| 63 | + # Collapse excessive blank lines | |
| 64 | + text = re.sub(r"\n{3,}", "\n\n", text) | |
| 65 | + return text.strip() | |
| 66 | + | |
| 67 | + | |
| 68 | +class OneNoteSource(BaseSource): | |
| 69 | + """ | |
| 70 | + Fetch pages from OneNote notebooks via the m365 CLI. | |
| 71 | + | |
| 72 | + Usage: | |
| 73 | + source = OneNoteSource() # all notebooks | |
| 74 | + source = OneNoteSource(notebook_name="Work Notes") # specific notebook | |
| 75 | + source = OneNoteSource(notebook_name="Work", section_name="Meetings") | |
| 76 | + files = source.list_videos() | |
| 77 | + source.download_all(files, Path("./notes")) | |
| 78 | + """ | |
| 79 | + | |
| 80 | + def __init__( | |
| 81 | + self, | |
| 82 | + notebook_name: Optional[str] = None, | |
| 83 | + section_name: Optional[str] = None, | |
| 84 | + ): | |
| 85 | + self.notebook_name = notebook_name | |
| 86 | + self.section_name = section_name | |
| 87 | + | |
| 88 | + def authenticate(self) -> bool: | |
| 89 | + """Check if m365 CLI is installed and logged in.""" | |
| 90 | + if not shutil.which("m365"): | |
| 91 | + logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365") | |
| 92 | + return False | |
| 93 | + try: | |
| 94 | + result = _run_m365(["status"], timeout=10) | |
| 95 | + if isinstance(result, dict) and result.get("connectedAs"): | |
| 96 | + return True | |
| 97 | + if isinstance(result, str) and "Logged in" in result: | |
| 98 | + return True | |
| 99 | + logger.error("m365 not logged in. Run: m365 login") | |
| 100 | + return False | |
| 101 | + except (RuntimeError, subprocess.TimeoutExpired): | |
| 102 | + logger.error("m365 not logged in. Run: m365 login") | |
| 103 | + return False | |
| 104 | + | |
| 105 | + def list_videos( | |
| 106 | + self, | |
| 107 | + folder_id: Optional[str] = None, | |
| 108 | + folder_path: Optional[str] = None, | |
| 109 | + patterns: Optional[List[str]] = None, | |
| 110 | + ) -> List[SourceFile]: | |
| 111 | + """List OneNote pages across notebooks/sections. Returns SourceFile per page.""" | |
| 112 | + files: List[SourceFile] = [] | |
| 113 | + | |
| 114 | + # Step 1: List notebooks | |
| 115 | + try: | |
| 116 | + notebooks = _run_m365(["onenote", "notebook", "list"], timeout=60) | |
| 117 | + except RuntimeError as e: | |
| 118 | + logger.error(f"Failed to list OneNote notebooks: {e}") | |
| 119 | + return [] | |
| 120 | + | |
| 121 | + if not isinstance(notebooks, list): | |
| 122 | + notebooks = [] | |
| 123 | + | |
| 124 | + # Filter notebooks by name if specified | |
| 125 | + if self.notebook_name: | |
| 126 | + notebooks = [ | |
| 127 | + nb | |
| 128 | + for nb in notebooks | |
| 129 | + if self.notebook_name.lower() in nb.get("displayName", "").lower() | |
| 130 | + ] | |
| 131 | + | |
| 132 | + for notebook in notebooks: | |
| 133 | + notebook_id = notebook.get("id", "") | |
| 134 | + notebook_name = notebook.get("displayName", "Untitled Notebook") | |
| 135 | + | |
| 136 | + # Step 2: List sections in this notebook | |
| 137 | + try: | |
| 138 | + sections = _run_m365( | |
| 139 | + ["onenote", "section", "list", "--notebookId", notebook_id], | |
| 140 | + timeout=60, | |
| 141 | + ) | |
| 142 | + except RuntimeError as e: | |
| 143 | + logger.warning(f"Failed to list sections for notebook '{notebook_name}': {e}") | |
| 144 | + continue | |
| 145 | + | |
| 146 | + if not isinstance(sections, list): | |
| 147 | + sections = [] | |
| 148 | + | |
| 149 | + # Filter sections by name if specified | |
| 150 | + if self.section_name: | |
| 151 | + sections = [ | |
| 152 | + s | |
| 153 | + for s in sections | |
| 154 | + if self.section_name.lower() in s.get("displayName", "").lower() | |
| 155 | + ] | |
| 156 | + | |
| 157 | + for section in sections: | |
| 158 | + section_id = section.get("id", "") | |
| 159 | + section_name = section.get("displayName", "Untitled Section") | |
| 160 | + | |
| 161 | + # Step 3: List pages in this section | |
| 162 | + try: | |
| 163 | + pages = _run_m365( | |
| 164 | + ["onenote", "page", "list", "--sectionId", section_id], | |
| 165 | + timeout=60, | |
| 166 | + ) | |
| 167 | + except RuntimeError as e: | |
| 168 | + logger.warning(f"Failed to list pages in section '{section_name}': {e}") | |
| 169 | + continue | |
| 170 | + | |
| 171 | + if not isinstance(pages, list): | |
| 172 | + pages = [] | |
| 173 | + | |
| 174 | + for page in pages: | |
| 175 | + page_id = page.get("id", "") | |
| 176 | + title = page.get("title", "Untitled Page").strip() or "Untitled Page" | |
| 177 | + modified = page.get("lastModifiedDateTime") | |
| 178 | + # Build a path for organizational context | |
| 179 | + page_path = f"{notebook_name}/{section_name}/{title}" | |
| 180 | + | |
| 181 | + files.append( | |
| 182 | + SourceFile( | |
| 183 | + name=title, | |
| 184 | + id=str(page_id), | |
| 185 | + size_bytes=None, | |
| 186 | + mime_type="text/html", | |
| 187 | + modified_at=modified, | |
| 188 | + path=page_path, | |
| 189 | + ) | |
| 190 | + ) | |
| 191 | + | |
| 192 | + logger.info(f"Found {len(files)} page(s) in OneNote") | |
| 193 | + return files | |
| 194 | + | |
| 195 | + def download(self, file: SourceFile, destination: Path) -> Path: | |
| 196 | + """Download a OneNote page's content as a text file.""" | |
| 197 | + destination = Path(destination) | |
| 198 | + destination.parent.mkdir(parents=True, exist_ok=True) | |
| 199 | + | |
| 200 | + try: | |
| 201 | + result = _run_m365( | |
| 202 | + ["onenote", "page", "get", "--id", file.id], | |
| 203 | + timeout=60, | |
| 204 | + ) | |
| 205 | + except RuntimeError as e: | |
| 206 | + raise RuntimeError(f"Failed to fetch OneNote page {file.id}: {e}") from e | |
| 207 | + | |
| 208 | + # Extract HTML content from the result | |
| 209 | + if isinstance(result, dict): | |
| 210 | + html = result.get("content", result.get("body", {}).get("content", "")) | |
| 211 | + if not html: | |
| 212 | + # Fallback: serialize the whole response | |
| 213 | + html = json.dumps(result, indent=2) | |
| 214 | + elif isinstance(result, str): | |
| 215 | + html = result | |
| 216 | + else: | |
| 217 | + html = str(result) | |
| 218 | + | |
| 219 | + text = _html_to_text(html) | |
| 220 | + destination.write_text(text, encoding="utf-8") | |
| 221 | + logger.info(f"Saved page '{file.name}' to {destination}") | |
| 222 | + return destination |
| --- a/video_processor/sources/onenote_source.py | |
| +++ b/video_processor/sources/onenote_source.py | |
| @@ -0,0 +1,222 @@ | |
| --- a/video_processor/sources/onenote_source.py | |
| +++ b/video_processor/sources/onenote_source.py | |
| @@ -0,0 +1,222 @@ | |
| 1 | """Microsoft OneNote source connector using the m365 CLI (cli-microsoft365). |
| 2 | |
| 3 | Fetches pages from OneNote notebooks via the `m365` CLI tool. |
| 4 | Outputs plain text suitable for KG ingestion. |
| 5 | |
| 6 | Requires: npm install -g @pnp/cli-microsoft365 |
| 7 | Auth: m365 login (interactive) |
| 8 | Docs: https://pnp.github.io/cli-microsoft365/ |
| 9 | """ |
| 10 | |
| 11 | import json |
| 12 | import logging |
| 13 | import re |
| 14 | import shutil |
| 15 | import subprocess |
| 16 | from pathlib import Path |
| 17 | from typing import Any, List, Optional |
| 18 | |
| 19 | from video_processor.sources.base import BaseSource, SourceFile |
| 20 | |
| 21 | logger = logging.getLogger(__name__) |
| 22 | |
| 23 | |
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Run an m365 CLI command and return parsed JSON output.

    Falls back to the raw (stripped) stdout when it is not valid JSON.
    Raises RuntimeError on a non-zero exit status.
    """
    completed = subprocess.run(
        ["m365", *args, "--output", "json"],
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if completed.returncode != 0:
        raise RuntimeError(f"m365 {' '.join(args)} failed: {completed.stderr.strip()}")
    try:
        return json.loads(completed.stdout)
    except json.JSONDecodeError:
        return completed.stdout.strip()
| 34 | |
| 35 | |
| 36 | def _html_to_text(html: str) -> str: |
| 37 | """Strip HTML tags and decode entities to produce plain text. |
| 38 | |
| 39 | Uses only stdlib ``re`` — no external dependencies. |
| 40 | """ |
| 41 | # Remove script/style blocks entirely |
| 42 | text = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE) |
| 43 | # Replace <br>, <p>, <div>, <li>, <tr> with newlines for readability |
| 44 | text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE) |
| 45 | text = re.sub(r"</(p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE) |
| 46 | # Strip remaining tags |
| 47 | text = re.sub(r"<[^>]+>", "", text) |
| 48 | # Decode common HTML entities |
| 49 | entity_map = { |
| 50 | "&": "&", |
| 51 | "<": "<", |
| 52 | ">": ">", |
| 53 | """: '"', |
| 54 | "'": "'", |
| 55 | "'": "'", |
| 56 | " ": " ", |
| 57 | } |
| 58 | for entity, char in entity_map.items(): |
| 59 | text = text.replace(entity, char) |
| 60 | # Decode numeric entities ({ and ) |
| 61 | text = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), text) |
| 62 | text = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), text) |
| 63 | # Collapse excessive blank lines |
| 64 | text = re.sub(r"\n{3,}", "\n\n", text) |
| 65 | return text.strip() |
| 66 | |
| 67 | |
class OneNoteSource(BaseSource):
    """
    Fetch pages from OneNote notebooks via the m365 CLI.

    Usage:
        source = OneNoteSource()                             # all notebooks
        source = OneNoteSource(notebook_name="Work Notes")   # specific notebook
        source = OneNoteSource(notebook_name="Work", section_name="Meetings")
        files = source.list_videos()
        source.download_all(files, Path("./notes"))
    """

    def __init__(
        self,
        notebook_name: Optional[str] = None,
        section_name: Optional[str] = None,
    ):
        # Both filters are case-insensitive substring matches (see list_videos).
        self.notebook_name = notebook_name
        self.section_name = section_name

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in."""
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
            # Depending on CLI version, `m365 status` reports JSON or text.
            if isinstance(result, dict) and result.get("connectedAs"):
                return True
            if isinstance(result, str) and "Logged in" in result:
                return True
            logger.error("m365 not logged in. Run: m365 login")
            return False
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List OneNote pages across notebooks/sections. Returns SourceFile per page.

        folder_id / folder_path / patterns are part of the BaseSource
        interface and are not used by this connector. Listing failures for
        individual notebooks/sections are logged and skipped (best-effort).
        """
        files: List[SourceFile] = []

        # Step 1: List notebooks
        try:
            notebooks = _run_m365(["onenote", "notebook", "list"], timeout=60)
        except RuntimeError as e:
            # CONSISTENCY FIX: lazy %-style logging args throughout, matching
            # the other source connectors (previously eager f-strings).
            logger.error("Failed to list OneNote notebooks: %s", e)
            return []

        if not isinstance(notebooks, list):
            notebooks = []

        # Filter notebooks by name if specified
        if self.notebook_name:
            notebooks = [
                nb
                for nb in notebooks
                if self.notebook_name.lower() in nb.get("displayName", "").lower()
            ]

        for notebook in notebooks:
            notebook_id = notebook.get("id", "")
            notebook_name = notebook.get("displayName", "Untitled Notebook")

            # Step 2: List sections in this notebook
            try:
                sections = _run_m365(
                    ["onenote", "section", "list", "--notebookId", notebook_id],
                    timeout=60,
                )
            except RuntimeError as e:
                logger.warning("Failed to list sections for notebook '%s': %s", notebook_name, e)
                continue

            if not isinstance(sections, list):
                sections = []

            # Filter sections by name if specified
            if self.section_name:
                sections = [
                    s
                    for s in sections
                    if self.section_name.lower() in s.get("displayName", "").lower()
                ]

            for section in sections:
                section_id = section.get("id", "")
                section_name = section.get("displayName", "Untitled Section")

                # Step 3: List pages in this section
                try:
                    pages = _run_m365(
                        ["onenote", "page", "list", "--sectionId", section_id],
                        timeout=60,
                    )
                except RuntimeError as e:
                    logger.warning("Failed to list pages in section '%s': %s", section_name, e)
                    continue

                if not isinstance(pages, list):
                    pages = []

                for page in pages:
                    page_id = page.get("id", "")
                    title = page.get("title", "Untitled Page").strip() or "Untitled Page"
                    modified = page.get("lastModifiedDateTime")
                    # Build a path for organizational context
                    page_path = f"{notebook_name}/{section_name}/{title}"

                    files.append(
                        SourceFile(
                            name=title,
                            id=str(page_id),
                            size_bytes=None,  # Graph does not report a page size
                            mime_type="text/html",
                            modified_at=modified,
                            path=page_path,
                        )
                    )

        logger.info("Found %d page(s) in OneNote", len(files))
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a OneNote page's content as a plain-text file.

        Raises RuntimeError when the page cannot be fetched.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = _run_m365(
                ["onenote", "page", "get", "--id", file.id],
                timeout=60,
            )
        except RuntimeError as e:
            raise RuntimeError(f"Failed to fetch OneNote page {file.id}: {e}") from e

        # Extract HTML content from the result (shape varies by CLI version)
        if isinstance(result, dict):
            html = result.get("content", result.get("body", {}).get("content", ""))
            if not html:
                # Fallback: serialize the whole response
                html = json.dumps(result, indent=2)
        elif isinstance(result, str):
            html = result
        else:
            html = str(result)

        text = _html_to_text(html)
        destination.write_text(text, encoding="utf-8")
        logger.info("Saved page '%s' to %s", file.name, destination)
        return destination
| --- a/video_processor/sources/podcast_source.py | ||
| +++ b/video_processor/sources/podcast_source.py | ||
| @@ -0,0 +1,119 @@ | ||
| 1 | +"""Podcast feed source connector -- extends RSS for audio enclosures.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +from pathlib import Path | |
| 5 | +from typing import List, Optional | |
| 6 | + | |
| 7 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 8 | + | |
| 9 | +logger = logging.getLogger(__name__) | |
| 10 | + | |
| 11 | + | |
class PodcastSource(BaseSource):
    """
    Parse podcast RSS feeds and download audio episodes for pipeline processing.

    Extends the RSS pattern to extract <enclosure> audio URLs.
    Requires: pip install requests
    Optional: pip install feedparser
    """

    def __init__(self, feed_url: str, max_episodes: int = 10):
        """
        Parameters
        ----------
        feed_url : str
            URL of the podcast RSS feed.
        max_episodes : int
            Maximum number of episodes to parse from the feed.
        """
        self.feed_url = feed_url
        self.max_episodes = max_episodes
        # Cached parse results; populated lazily by _parse_feed().
        self._episodes: List[dict] = []

    def authenticate(self) -> bool:
        """No auth needed for public podcast feeds."""
        return True

    def _parse_feed(self) -> None:
        """Fetch and parse the podcast feed for audio enclosures (cached)."""
        if self._episodes:
            return

        import requests

        resp = requests.get(self.feed_url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Only the import lives in the try block: previously the whole parse
        # loop was wrapped, so an ImportError raised by a transitive import
        # during parsing would fall through to the XML fallback and append
        # duplicate episodes on top of a partial feedparser result.
        try:
            import feedparser
        except ImportError:
            logger.debug("feedparser not available, using xml.etree fallback")
            self._parse_xml(resp.text)
            return

        feed = feedparser.parse(resp.text)
        for entry in feed.entries[: self.max_episodes]:
            audio_url = None
            # Prefer an explicitly audio-typed link.
            for link in entry.get("links", []):
                if link.get("type", "").startswith("audio/"):
                    audio_url = link.get("href")
                    break
            # Fall back to the first enclosure, if any.
            if not audio_url and entry.get("enclosures"):
                audio_url = entry["enclosures"][0].get("href")
            if audio_url:
                self._episodes.append(
                    {
                        "title": entry.get("title", "Untitled"),
                        "url": audio_url,
                        "published": entry.get("published", ""),
                        "duration": entry.get("itunes_duration", ""),
                    }
                )

    def _parse_xml(self, text: str) -> None:
        """Fallback parser for podcast XML using stdlib."""
        import xml.etree.ElementTree as ET

        root = ET.fromstring(text)
        items = root.findall(".//item")
        for item in items[: self.max_episodes]:
            enclosure = item.find("enclosure")
            if enclosure is None:
                continue
            audio_url = enclosure.get("url", "")
            if not audio_url:
                continue
            title = item.findtext("title") or "Untitled"
            pub = item.findtext("pubDate") or ""
            self._episodes.append(
                {"title": title, "url": audio_url, "published": pub, "duration": ""}
            )

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List podcast episodes as SourceFiles (the audio URL doubles as id)."""
        self._parse_feed()
        return [
            SourceFile(
                name=ep["title"],
                id=ep["url"],
                mime_type="audio/mpeg",
                modified_at=ep["published"],
            )
            for ep in self._episodes
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download the podcast audio file to *destination* and return the path."""
        import requests

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        resp = requests.get(
            file.id, timeout=60, stream=True, headers={"User-Agent": "PlanOpticon/0.3"}
        )
        resp.raise_for_status()

        with open(destination, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                # iter_content may yield empty keep-alive chunks; skip them.
                if chunk:
                    f.write(chunk)

        logger.info(f"Downloaded podcast episode to {destination}")
        return destination
| --- a/video_processor/sources/podcast_source.py | |
| +++ b/video_processor/sources/podcast_source.py | |
| @@ -0,0 +1,119 @@ | |
| --- a/video_processor/sources/podcast_source.py | |
| +++ b/video_processor/sources/podcast_source.py | |
| @@ -0,0 +1,119 @@ | |
| 1 | """Podcast feed source connector -- extends RSS for audio enclosures.""" |
| 2 | |
| 3 | import logging |
| 4 | from pathlib import Path |
| 5 | from typing import List, Optional |
| 6 | |
| 7 | from video_processor.sources.base import BaseSource, SourceFile |
| 8 | |
| 9 | logger = logging.getLogger(__name__) |
| 10 | |
| 11 | |
class PodcastSource(BaseSource):
    """
    Parse podcast RSS feeds and download audio episodes for pipeline processing.

    Extends the RSS pattern to extract <enclosure> audio URLs.
    Requires: pip install requests
    Optional: pip install feedparser
    """

    def __init__(self, feed_url: str, max_episodes: int = 10):
        """
        Parameters
        ----------
        feed_url : str
            URL of the podcast RSS feed.
        max_episodes : int
            Maximum number of episodes to parse from the feed.
        """
        self.feed_url = feed_url
        self.max_episodes = max_episodes
        # Cached parse results; populated lazily by _parse_feed().
        self._episodes: List[dict] = []

    def authenticate(self) -> bool:
        """No auth needed for public podcast feeds."""
        return True

    def _parse_feed(self) -> None:
        """Fetch and parse the podcast feed for audio enclosures (cached)."""
        if self._episodes:
            return

        import requests

        resp = requests.get(self.feed_url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Only the import lives in the try block: previously the whole parse
        # loop was wrapped, so an ImportError raised by a transitive import
        # during parsing would fall through to the XML fallback and append
        # duplicate episodes on top of a partial feedparser result.
        try:
            import feedparser
        except ImportError:
            logger.debug("feedparser not available, using xml.etree fallback")
            self._parse_xml(resp.text)
            return

        feed = feedparser.parse(resp.text)
        for entry in feed.entries[: self.max_episodes]:
            audio_url = None
            # Prefer an explicitly audio-typed link.
            for link in entry.get("links", []):
                if link.get("type", "").startswith("audio/"):
                    audio_url = link.get("href")
                    break
            # Fall back to the first enclosure, if any.
            if not audio_url and entry.get("enclosures"):
                audio_url = entry["enclosures"][0].get("href")
            if audio_url:
                self._episodes.append(
                    {
                        "title": entry.get("title", "Untitled"),
                        "url": audio_url,
                        "published": entry.get("published", ""),
                        "duration": entry.get("itunes_duration", ""),
                    }
                )

    def _parse_xml(self, text: str) -> None:
        """Fallback parser for podcast XML using stdlib."""
        import xml.etree.ElementTree as ET

        root = ET.fromstring(text)
        items = root.findall(".//item")
        for item in items[: self.max_episodes]:
            enclosure = item.find("enclosure")
            if enclosure is None:
                continue
            audio_url = enclosure.get("url", "")
            if not audio_url:
                continue
            title = item.findtext("title") or "Untitled"
            pub = item.findtext("pubDate") or ""
            self._episodes.append(
                {"title": title, "url": audio_url, "published": pub, "duration": ""}
            )

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List podcast episodes as SourceFiles (the audio URL doubles as id)."""
        self._parse_feed()
        return [
            SourceFile(
                name=ep["title"],
                id=ep["url"],
                mime_type="audio/mpeg",
                modified_at=ep["published"],
            )
            for ep in self._episodes
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download the podcast audio file to *destination* and return the path."""
        import requests

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        resp = requests.get(
            file.id, timeout=60, stream=True, headers={"User-Agent": "PlanOpticon/0.3"}
        )
        resp.raise_for_status()

        with open(destination, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                # iter_content may yield empty keep-alive chunks; skip them.
                if chunk:
                    f.write(chunk)

        logger.info(f"Downloaded podcast episode to {destination}")
        return destination
| --- a/video_processor/sources/reddit_source.py | ||
| +++ b/video_processor/sources/reddit_source.py | ||
| @@ -0,0 +1,103 @@ | ||
| 1 | +"""Reddit source connector using the public JSON API.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +from pathlib import Path | |
| 5 | +from typing import List, Optional | |
| 6 | + | |
| 7 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 8 | + | |
| 9 | +logger = logging.getLogger(__name__) | |
| 10 | + | |
| 11 | + | |
class RedditSource(BaseSource):
    """
    Fetch Reddit posts and comments via the public JSON API.

    No auth required for public posts. Append .json to any Reddit URL.
    Requires: pip install requests
    """

    def __init__(self, url: str):
        """
        Parameters
        ----------
        url : str
            Reddit post or subreddit URL.
        """
        self.url = url.rstrip("/")

    def authenticate(self) -> bool:
        """No auth needed for public Reddit content."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile for the Reddit post."""
        return [
            SourceFile(
                name=self.url.split("/")[-1] or "reddit_post",
                id=self.url,
                mime_type="text/plain",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download post and comments as plain text."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        text = self.fetch_text()
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved Reddit content to {destination}")
        return destination

    def fetch_text(self) -> str:
        """Fetch the Reddit post and comments as structured text.

        Returns a Markdown-ish document: post title/author/score, selftext,
        then a recursively indented comment tree.
        """
        import requests

        json_url = self.url.rstrip("/") + ".json"
        resp = requests.get(
            json_url,
            timeout=15,
            headers={"User-Agent": "PlanOpticon/0.3 (source connector)"},
        )
        resp.raise_for_status()
        data = resp.json()

        lines = []
        # A post URL returns a 2-element list [post listing, comment listing];
        # a subreddit URL returns a dict. Previously the comments check ran
        # outside this isinstance guard, so `len(data) > 1` was true for a
        # dict's key count and `data[1]` raised KeyError. Keep all list
        # indexing inside this branch.
        if isinstance(data, list) and len(data) > 0:
            children = data[0].get("data", {}).get("children", [])
            if children:  # guard against an empty listing (deleted posts)
                post = children[0]["data"]
                lines.append(f"# {post.get('title', 'Untitled')}")
                lines.append(f"by u/{post.get('author', '[deleted]')} | {post.get('score', 0)} points")
                lines.append("")
                if post.get("selftext"):
                    lines.append(post["selftext"])
                    lines.append("")

            # Comments in second listing
            if len(data) > 1:
                lines.append("## Comments\n")
                self._extract_comments(data[1]["data"]["children"], lines, depth=0)

        return "\n".join(lines)

    def _extract_comments(self, children: list, lines: list, depth: int) -> None:
        """Recursively extract comment text, indenting one level per reply depth."""
        indent = " " * depth
        for child in children:
            # Only kind "t1" entries are comments; skip "more" stubs etc.
            if child.get("kind") != "t1":
                continue
            c = child["data"]
            author = c.get("author", "[deleted]")
            body = c.get("body", "")
            lines.append(f"{indent}**{author}** ({c.get('score', 0)} pts):")
            lines.append(f"{indent}{body}")
            lines.append("")
            # Recurse into replies; Reddit sends "" (a str) when there are none,
            # so only descend when replies is an actual dict listing.
            replies = c.get("replies")
            if isinstance(replies, dict):
                self._extract_comments(replies["data"]["children"], lines, depth + 1)
| --- a/video_processor/sources/reddit_source.py | |
| +++ b/video_processor/sources/reddit_source.py | |
| @@ -0,0 +1,103 @@ | |
| --- a/video_processor/sources/reddit_source.py | |
| +++ b/video_processor/sources/reddit_source.py | |
| @@ -0,0 +1,103 @@ | |
| 1 | """Reddit source connector using the public JSON API.""" |
| 2 | |
| 3 | import logging |
| 4 | from pathlib import Path |
| 5 | from typing import List, Optional |
| 6 | |
| 7 | from video_processor.sources.base import BaseSource, SourceFile |
| 8 | |
| 9 | logger = logging.getLogger(__name__) |
| 10 | |
| 11 | |
class RedditSource(BaseSource):
    """
    Fetch Reddit posts and comments via the public JSON API.

    No auth required for public posts. Append .json to any Reddit URL.
    Requires: pip install requests
    """

    def __init__(self, url: str):
        """
        Parameters
        ----------
        url : str
            Reddit post or subreddit URL.
        """
        self.url = url.rstrip("/")

    def authenticate(self) -> bool:
        """No auth needed for public Reddit content."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile for the Reddit post."""
        return [
            SourceFile(
                name=self.url.split("/")[-1] or "reddit_post",
                id=self.url,
                mime_type="text/plain",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download post and comments as plain text."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        text = self.fetch_text()
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved Reddit content to {destination}")
        return destination

    def fetch_text(self) -> str:
        """Fetch the Reddit post and comments as structured text.

        Returns a Markdown-ish document: post title/author/score, selftext,
        then a recursively indented comment tree.
        """
        import requests

        json_url = self.url.rstrip("/") + ".json"
        resp = requests.get(
            json_url,
            timeout=15,
            headers={"User-Agent": "PlanOpticon/0.3 (source connector)"},
        )
        resp.raise_for_status()
        data = resp.json()

        lines = []
        # A post URL returns a 2-element list [post listing, comment listing];
        # a subreddit URL returns a dict. Previously the comments check ran
        # outside this isinstance guard, so `len(data) > 1` was true for a
        # dict's key count and `data[1]` raised KeyError. Keep all list
        # indexing inside this branch.
        if isinstance(data, list) and len(data) > 0:
            children = data[0].get("data", {}).get("children", [])
            if children:  # guard against an empty listing (deleted posts)
                post = children[0]["data"]
                lines.append(f"# {post.get('title', 'Untitled')}")
                lines.append(f"by u/{post.get('author', '[deleted]')} | {post.get('score', 0)} points")
                lines.append("")
                if post.get("selftext"):
                    lines.append(post["selftext"])
                    lines.append("")

            # Comments in second listing
            if len(data) > 1:
                lines.append("## Comments\n")
                self._extract_comments(data[1]["data"]["children"], lines, depth=0)

        return "\n".join(lines)

    def _extract_comments(self, children: list, lines: list, depth: int) -> None:
        """Recursively extract comment text, indenting one level per reply depth."""
        indent = " " * depth
        for child in children:
            # Only kind "t1" entries are comments; skip "more" stubs etc.
            if child.get("kind") != "t1":
                continue
            c = child["data"]
            author = c.get("author", "[deleted]")
            body = c.get("body", "")
            lines.append(f"{indent}**{author}** ({c.get('score', 0)} pts):")
            lines.append(f"{indent}{body}")
            lines.append("")
            # Recurse into replies; Reddit sends "" (a str) when there are none,
            # so only descend when replies is an actual dict listing.
            replies = c.get("replies")
            if isinstance(replies, dict):
                self._extract_comments(replies["data"]["children"], lines, depth + 1)
| --- a/video_processor/sources/rss_source.py | ||
| +++ b/video_processor/sources/rss_source.py | ||
| @@ -0,0 +1,114 @@ | ||
| 1 | +"""RSS/Atom feed source connector.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +from pathlib import Path | |
| 5 | +from typing import List, Optional | |
| 6 | + | |
| 7 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 8 | + | |
| 9 | +logger = logging.getLogger(__name__) | |
| 10 | + | |
| 11 | + | |
class RSSSource(BaseSource):
    """
    Parse RSS/Atom feeds and extract entries as text documents.

    Optional: pip install feedparser (falls back to xml.etree.ElementTree)
    Requires: pip install requests
    """

    def __init__(self, url: str, max_entries: int = 50):
        """
        Parameters
        ----------
        url : str
            Feed URL (RSS 2.0 or Atom).
        max_entries : int
            Maximum number of entries to parse.
        """
        self.url = url
        self.max_entries = max_entries
        # Cached parse results; populated lazily by _parse_feed().
        self._entries: List[dict] = []

    def authenticate(self) -> bool:
        """No auth needed for public feeds."""
        return True

    def _parse_feed(self) -> None:
        """Fetch and parse the feed (cached after the first call)."""
        if self._entries:
            return

        import requests

        resp = requests.get(self.url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Only the import lives in the try block: previously the whole parse
        # loop was wrapped, so an ImportError raised by a transitive import
        # during parsing would fall through to the XML fallback and append
        # duplicate entries on top of a partial feedparser result.
        try:
            import feedparser
        except ImportError:
            logger.debug("feedparser not available, using xml.etree fallback")
            self._parse_xml(resp.text)
            return

        feed = feedparser.parse(resp.text)
        for entry in feed.entries[: self.max_entries]:
            self._entries.append(
                {
                    "title": entry.get("title", "Untitled"),
                    "link": entry.get("link", ""),
                    "summary": entry.get("summary", ""),
                    "published": entry.get("published", ""),
                    "id": entry.get("id", entry.get("link", "")),
                }
            )

    def _parse_xml(self, text: str) -> None:
        """Fallback parser using stdlib xml.etree (RSS 2.0 items or Atom entries)."""
        import xml.etree.ElementTree as ET

        root = ET.fromstring(text)
        # Handle RSS 2.0 first; fall back to Atom's namespaced <entry>.
        ns = {"atom": "http://www.w3.org/2005/Atom"}
        items = root.findall(".//item") or root.findall(".//atom:entry", ns)
        for item in items[: self.max_entries]:
            title = (
                item.findtext("title") or item.findtext("atom:title", namespaces=ns) or "Untitled"
            )
            link = item.findtext("link") or ""
            if not link:
                # Atom stores the link in an href attribute, not element text.
                link_el = item.find("atom:link", ns)
                link = link_el.get("href", "") if link_el is not None else ""
            desc = (
                item.findtext("description") or item.findtext("atom:summary", namespaces=ns) or ""
            )
            pub = item.findtext("pubDate") or item.findtext("atom:published", namespaces=ns) or ""
            self._entries.append(
                {"title": title, "link": link, "summary": desc, "published": pub, "id": link}
            )

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List feed entries as SourceFiles (the entry id/link doubles as id)."""
        self._parse_feed()
        return [
            SourceFile(
                name=e["title"], id=e["id"], mime_type="text/plain", modified_at=e["published"]
            )
            for e in self._entries
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Write an entry's content as a text file.

        Raises
        ------
        ValueError
            If no parsed entry matches ``file.id``.
        """
        self._parse_feed()
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        entry = next((e for e in self._entries if e["id"] == file.id), None)
        if not entry:
            raise ValueError(f"Entry not found: {file.id}")

        text = (
            f"# {entry['title']}\n\n"
            f"Published: {entry['published']}\n"
            f"Link: {entry['link']}\n\n"
            f"{entry['summary']}"
        )
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved RSS entry to {destination}")
        return destination
| --- a/video_processor/sources/rss_source.py | |
| +++ b/video_processor/sources/rss_source.py | |
| @@ -0,0 +1,114 @@ | |
| --- a/video_processor/sources/rss_source.py | |
| +++ b/video_processor/sources/rss_source.py | |
| @@ -0,0 +1,114 @@ | |
| 1 | """RSS/Atom feed source connector.""" |
| 2 | |
| 3 | import logging |
| 4 | from pathlib import Path |
| 5 | from typing import List, Optional |
| 6 | |
| 7 | from video_processor.sources.base import BaseSource, SourceFile |
| 8 | |
| 9 | logger = logging.getLogger(__name__) |
| 10 | |
| 11 | |
class RSSSource(BaseSource):
    """
    Parse RSS/Atom feeds and extract entries as text documents.

    Optional: pip install feedparser (falls back to xml.etree.ElementTree)
    Requires: pip install requests
    """

    def __init__(self, url: str, max_entries: int = 50):
        """
        Parameters
        ----------
        url : str
            Feed URL (RSS 2.0 or Atom).
        max_entries : int
            Maximum number of entries to parse.
        """
        self.url = url
        self.max_entries = max_entries
        # Cached parse results; populated lazily by _parse_feed().
        self._entries: List[dict] = []

    def authenticate(self) -> bool:
        """No auth needed for public feeds."""
        return True

    def _parse_feed(self) -> None:
        """Fetch and parse the feed (cached after the first call)."""
        if self._entries:
            return

        import requests

        resp = requests.get(self.url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Only the import lives in the try block: previously the whole parse
        # loop was wrapped, so an ImportError raised by a transitive import
        # during parsing would fall through to the XML fallback and append
        # duplicate entries on top of a partial feedparser result.
        try:
            import feedparser
        except ImportError:
            logger.debug("feedparser not available, using xml.etree fallback")
            self._parse_xml(resp.text)
            return

        feed = feedparser.parse(resp.text)
        for entry in feed.entries[: self.max_entries]:
            self._entries.append(
                {
                    "title": entry.get("title", "Untitled"),
                    "link": entry.get("link", ""),
                    "summary": entry.get("summary", ""),
                    "published": entry.get("published", ""),
                    "id": entry.get("id", entry.get("link", "")),
                }
            )

    def _parse_xml(self, text: str) -> None:
        """Fallback parser using stdlib xml.etree (RSS 2.0 items or Atom entries)."""
        import xml.etree.ElementTree as ET

        root = ET.fromstring(text)
        # Handle RSS 2.0 first; fall back to Atom's namespaced <entry>.
        ns = {"atom": "http://www.w3.org/2005/Atom"}
        items = root.findall(".//item") or root.findall(".//atom:entry", ns)
        for item in items[: self.max_entries]:
            title = (
                item.findtext("title") or item.findtext("atom:title", namespaces=ns) or "Untitled"
            )
            link = item.findtext("link") or ""
            if not link:
                # Atom stores the link in an href attribute, not element text.
                link_el = item.find("atom:link", ns)
                link = link_el.get("href", "") if link_el is not None else ""
            desc = (
                item.findtext("description") or item.findtext("atom:summary", namespaces=ns) or ""
            )
            pub = item.findtext("pubDate") or item.findtext("atom:published", namespaces=ns) or ""
            self._entries.append(
                {"title": title, "link": link, "summary": desc, "published": pub, "id": link}
            )

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List feed entries as SourceFiles (the entry id/link doubles as id)."""
        self._parse_feed()
        return [
            SourceFile(
                name=e["title"], id=e["id"], mime_type="text/plain", modified_at=e["published"]
            )
            for e in self._entries
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Write an entry's content as a text file.

        Raises
        ------
        ValueError
            If no parsed entry matches ``file.id``.
        """
        self._parse_feed()
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        entry = next((e for e in self._entries if e["id"] == file.id), None)
        if not entry:
            raise ValueError(f"Entry not found: {file.id}")

        text = (
            f"# {entry['title']}\n\n"
            f"Published: {entry['published']}\n"
            f"Link: {entry['link']}\n\n"
            f"{entry['summary']}"
        )
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved RSS entry to {destination}")
        return destination
| --- a/video_processor/sources/s3_source.py | ||
| +++ b/video_processor/sources/s3_source.py | ||
| @@ -0,0 +1,76 @@ | ||
| 1 | +"""AWS S3 source connector for fetching videos from S3 buckets.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +from pathlib import Path | |
| 5 | +from typing import List, Optional | |
| 6 | + | |
| 7 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 8 | + | |
| 9 | +logger = logging.getLogger(__name__) | |
| 10 | + | |
# File extensions treated as video content when listing bucket objects.
_VIDEO_EXTENSIONS = {".mp4", ".mov", ".avi", ".mkv", ".webm", ".m4v", ".flv", ".wmv"}


class S3Source(BaseSource):
    """Fetches videos from an S3 bucket. Requires boto3 (optional dependency)."""

    def __init__(self, bucket: str, prefix: str = "", region: Optional[str] = None):
        """
        Parameters
        ----------
        bucket : str
            Name of the S3 bucket.
        prefix : str
            Key prefix to restrict listings to (acts like a folder path).
        region : Optional[str]
            AWS region; boto3's default resolution is used when omitted.
        """
        self.bucket = bucket
        self.prefix = prefix
        self.region = region
        # boto3 S3 client; set by authenticate().
        self._client = None

    def authenticate(self) -> bool:
        """Check for AWS credentials by initializing an S3 client.

        Also performs a head_bucket call so bad credentials or a missing
        bucket fail here rather than later during listing.
        """
        try:
            import boto3
        except ImportError:
            logger.error("boto3 is not installed. Install with: pip install boto3")
            return False
        try:
            kwargs = {}
            if self.region:
                kwargs["region_name"] = self.region
            self._client = boto3.client("s3", **kwargs)
            self._client.head_bucket(Bucket=self.bucket)
            return True
        except Exception as e:
            logger.error(f"S3 authentication failed: {e}")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List video files in the bucket under the configured prefix.

        folder_path, when given, overrides the instance prefix. Raises
        RuntimeError if authenticate() has not succeeded.
        """
        if not self._client:
            raise RuntimeError("Not authenticated. Call authenticate() first.")
        prefix = folder_path or self.prefix
        # Paginate so buckets with >1000 objects are fully listed.
        paginator = self._client.get_paginator("list_objects_v2")
        files: List[SourceFile] = []
        for page in paginator.paginate(Bucket=self.bucket, Prefix=prefix):
            for obj in page.get("Contents", []):
                key = obj["Key"]
                suffix = Path(key).suffix.lower()
                if suffix in _VIDEO_EXTENSIONS:
                    files.append(
                        SourceFile(
                            name=Path(key).name,
                            id=key,
                            size_bytes=obj.get("Size"),
                            modified_at=str(obj.get("LastModified", "")),
                            path=key,
                        )
                    )
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a single file from S3 to a local path.

        Raises RuntimeError if authenticate() has not succeeded.
        """
        if not self._client:
            raise RuntimeError("Not authenticated. Call authenticate() first.")
        # Coerce like the other source connectors so str destinations work
        # (previously a str here crashed on .parent).
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        self._client.download_file(self.bucket, file.id, str(destination))
        logger.info(f"Downloaded s3://{self.bucket}/{file.id} -> {destination}")
        return destination
| --- a/video_processor/sources/s3_source.py | |
| +++ b/video_processor/sources/s3_source.py | |
| @@ -0,0 +1,76 @@ | |
| --- a/video_processor/sources/s3_source.py | |
| +++ b/video_processor/sources/s3_source.py | |
| @@ -0,0 +1,76 @@ | |
| 1 | """AWS S3 source connector for fetching videos from S3 buckets.""" |
| 2 | |
| 3 | import logging |
| 4 | from pathlib import Path |
| 5 | from typing import List, Optional |
| 6 | |
| 7 | from video_processor.sources.base import BaseSource, SourceFile |
| 8 | |
| 9 | logger = logging.getLogger(__name__) |
| 10 | |
# File extensions treated as video content when listing bucket objects.
_VIDEO_EXTENSIONS = {".mp4", ".mov", ".avi", ".mkv", ".webm", ".m4v", ".flv", ".wmv"}


class S3Source(BaseSource):
    """Fetches videos from an S3 bucket. Requires boto3 (optional dependency)."""

    def __init__(self, bucket: str, prefix: str = "", region: Optional[str] = None):
        """
        Parameters
        ----------
        bucket : str
            Name of the S3 bucket.
        prefix : str
            Key prefix to restrict listings to (acts like a folder path).
        region : Optional[str]
            AWS region; boto3's default resolution is used when omitted.
        """
        self.bucket = bucket
        self.prefix = prefix
        self.region = region
        # boto3 S3 client; set by authenticate().
        self._client = None

    def authenticate(self) -> bool:
        """Check for AWS credentials by initializing an S3 client.

        Also performs a head_bucket call so bad credentials or a missing
        bucket fail here rather than later during listing.
        """
        try:
            import boto3
        except ImportError:
            logger.error("boto3 is not installed. Install with: pip install boto3")
            return False
        try:
            kwargs = {}
            if self.region:
                kwargs["region_name"] = self.region
            self._client = boto3.client("s3", **kwargs)
            self._client.head_bucket(Bucket=self.bucket)
            return True
        except Exception as e:
            logger.error(f"S3 authentication failed: {e}")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List video files in the bucket under the configured prefix.

        folder_path, when given, overrides the instance prefix. Raises
        RuntimeError if authenticate() has not succeeded.
        """
        if not self._client:
            raise RuntimeError("Not authenticated. Call authenticate() first.")
        prefix = folder_path or self.prefix
        # Paginate so buckets with >1000 objects are fully listed.
        paginator = self._client.get_paginator("list_objects_v2")
        files: List[SourceFile] = []
        for page in paginator.paginate(Bucket=self.bucket, Prefix=prefix):
            for obj in page.get("Contents", []):
                key = obj["Key"]
                suffix = Path(key).suffix.lower()
                if suffix in _VIDEO_EXTENSIONS:
                    files.append(
                        SourceFile(
                            name=Path(key).name,
                            id=key,
                            size_bytes=obj.get("Size"),
                            modified_at=str(obj.get("LastModified", "")),
                            path=key,
                        )
                    )
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a single file from S3 to a local path.

        Raises RuntimeError if authenticate() has not succeeded.
        """
        if not self._client:
            raise RuntimeError("Not authenticated. Call authenticate() first.")
        # Coerce like the other source connectors so str destinations work
        # (previously a str here crashed on .parent).
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        self._client.download_file(self.bucket, file.id, str(destination))
        logger.info(f"Downloaded s3://{self.bucket}/{file.id} -> {destination}")
        return destination
| --- a/video_processor/sources/teams_recording_source.py | ||
| +++ b/video_processor/sources/teams_recording_source.py | ||
| @@ -0,0 +1,375 @@ | ||
| 1 | +"""Microsoft Teams meeting recording source using the m365 CLI. | |
| 2 | + | |
| 3 | +Fetches Teams meeting recordings and transcripts via the Microsoft Graph API | |
| 4 | +through the `m365` CLI tool. | |
| 5 | + | |
| 6 | +Requires: npm install -g @pnp/cli-microsoft365 | |
| 7 | +Auth: m365 login (interactive) | |
| 8 | +Docs: https://pnp.github.io/cli-microsoft365/ | |
| 9 | +""" | |
| 10 | + | |
| 11 | +import logging | |
| 12 | +import re | |
| 13 | +import shutil | |
| 14 | +import subprocess | |
| 15 | +from pathlib import Path | |
| 16 | +from typing import List, Optional | |
| 17 | + | |
| 18 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 19 | +from video_processor.sources.m365_source import _run_m365 | |
| 20 | + | |
| 21 | +logger = logging.getLogger(__name__) | |
| 22 | + | |
| 23 | + | |
| 24 | +def _vtt_to_text(vtt: str) -> str: | |
| 25 | + """Strip VTT timing metadata and return plain text. | |
| 26 | + | |
| 27 | + Removes WEBVTT headers, timestamps (00:00:00.000 --> 00:00:05.000), | |
| 28 | + cue identifiers, and deduplicates consecutive identical lines. | |
| 29 | + """ | |
| 30 | + lines = vtt.splitlines() | |
| 31 | + text_lines: list[str] = [] | |
| 32 | + prev_line = "" | |
| 33 | + | |
| 34 | + for line in lines: | |
| 35 | + stripped = line.strip() | |
| 36 | + # Skip WEBVTT header and NOTE blocks | |
| 37 | + if stripped.startswith("WEBVTT") or stripped.startswith("NOTE"): | |
| 38 | + continue | |
| 39 | + # Skip timestamp lines (e.g. 00:00:01.000 --> 00:00:05.000) | |
| 40 | + if re.match(r"\d{2}:\d{2}[:\.][\d.]+ --> \d{2}:\d{2}[:\.][\d.]+", stripped): | |
| 41 | + continue | |
| 42 | + # Skip numeric cue identifiers | |
| 43 | + if re.match(r"^\d+$", stripped): | |
| 44 | + continue | |
| 45 | + # Skip blank lines | |
| 46 | + if not stripped: | |
| 47 | + continue | |
| 48 | + # Strip inline VTT tags like <v Speaker> | |
| 49 | + cleaned = re.sub(r"<[^>]+>", "", stripped).strip() | |
| 50 | + if cleaned and cleaned != prev_line: | |
| 51 | + text_lines.append(cleaned) | |
| 52 | + prev_line = cleaned | |
| 53 | + | |
| 54 | + return "\n".join(text_lines) | |
| 55 | + | |
| 56 | + | |
class TeamsRecordingSource(BaseSource):
    """
    Fetch Teams meeting recordings and transcripts via the m365 CLI / Graph API.

    Usage:
        source = TeamsRecordingSource(user_id="me")
        source.authenticate()
        recordings = source.list_videos()
        source.download_all(recordings, Path("./recordings"))

        # Fetch transcript for a specific meeting
        transcript = source.fetch_transcript(meeting_id)
    """

    def __init__(self, user_id: str = "me"):
        # Graph principal path segment ("me" or e.g. "users/<id>") that is
        # interpolated directly into every request URL below.
        self.user_id = user_id

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in.

        Returns:
            True when the ``m365`` executable exists and ``m365 status``
            reports an active session; False otherwise (reason is logged).
        """
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
            # Output shape varies by CLI version/output mode: either a JSON
            # object carrying `connectedAs`, or plain text — accept both.
            if isinstance(result, dict) and result.get("connectedAs"):
                return True
            if isinstance(result, str) and "Logged in" in result:
                return True
            logger.error("m365 not logged in. Run: m365 login")
            return False
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Teams meeting recordings available for the user.

        The folder/pattern arguments belong to the BaseSource interface and
        have no Teams equivalent; they are accepted and ignored.

        Tries multiple approaches, returning at the first one that yields files:
        1. Graph API onlineMeetings endpoint
        2. m365 teams meeting list command
        3. Fallback: search meeting-chat messages for recording links
        """
        files: List[SourceFile] = []

        # Approach 1: Graph API — list online meetings
        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    f"https://graph.microsoft.com/v1.0/{self.user_id}/onlineMeetings",
                    "--method",
                    "get",
                ],
                timeout=60,
            )
            for meeting in self._extract_meetings_list(result):
                files.extend(self._get_meeting_recordings(meeting))

            if files:
                logger.info(f"Found {len(files)} recording(s) via Graph API onlineMeetings")
                return files
        except RuntimeError as e:
            logger.debug(f"onlineMeetings endpoint failed: {e}")

        # Approach 2: m365 teams meeting list
        try:
            result = _run_m365(["teams", "meeting", "list"], timeout=60)
            meetings = result if isinstance(result, list) else []
            for meeting in meetings:
                files.extend(self._get_meeting_recordings(meeting))

            if files:
                logger.info(f"Found {len(files)} recording(s) via m365 teams meeting list")
                return files
        except RuntimeError as e:
            logger.debug(f"teams meeting list failed: {e}")

        # Approach 3: Fallback — search chat messages for recording links
        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    (
                        f"https://graph.microsoft.com/v1.0/{self.user_id}/chats"
                        "?$expand=messages($top=50)"
                        "&$filter=chatType eq 'meeting'"
                        "&$top=25"
                    ),
                    "--method",
                    "get",
                ],
                timeout=60,
            )
            for chat in self._extract_value_list(result):
                for msg in chat.get("messages", []):
                    body = msg.get("body", {}).get("content", "")
                    # Heuristic match: message bodies that mention a recording
                    # or embed an .mp4 link.
                    if "recording" in body.lower() or ".mp4" in body.lower():
                        topic = chat.get("topic", "Meeting Recording")
                        chat_id = chat.get("id", "")
                        msg_id = msg.get("id", "")
                        files.append(
                            SourceFile(
                                name=f"{topic}.mp4",
                                id=f"{chat_id}:{msg_id}",
                                mime_type="video/mp4",
                                modified_at=msg.get("createdDateTime"),
                            )
                        )
            if files:
                logger.info(f"Found {len(files)} recording link(s) in meeting chats")
        except RuntimeError as e:
            logger.debug(f"Chat message fallback failed: {e}")

        if not files:
            logger.warning("No Teams meeting recordings found")

        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a recording via its Graph API download URL.

        When ``file.path`` carries a direct download URL it is used as-is;
        otherwise the content URL is reconstructed from the
        ``meeting_id:recording_id`` composite stored in ``file.id``.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        download_url = file.path
        if download_url:
            # Use the direct download URL from contentUrl / @microsoft.graph.downloadUrl
            _run_m365(
                [
                    "request",
                    "--url",
                    download_url,
                    "--method",
                    "get",
                    "--filePath",
                    str(destination),
                ],
                timeout=300,
            )
        else:
            # Try to get download URL from the recording ID
            meeting_id, _, recording_id = file.id.partition(":")
            if recording_id:
                url = (
                    f"https://graph.microsoft.com/v1.0/{self.user_id}"
                    f"/onlineMeetings/{meeting_id}"
                    f"/recordings/{recording_id}/content"
                )
            else:
                url = (
                    f"https://graph.microsoft.com/v1.0/{self.user_id}"
                    f"/onlineMeetings/{meeting_id}/recordings"
                )
                # No recording id available: fetch the recording list first
                # and resolve the first entry's content URL. (Scoped to this
                # branch — with a known recording id the URL is already final.)
                result = _run_m365(
                    ["request", "--url", url, "--method", "get"],
                    timeout=60,
                )
                recordings = self._extract_value_list(result)
                if recordings:
                    url = (
                        f"https://graph.microsoft.com/v1.0/{self.user_id}"
                        f"/onlineMeetings/{meeting_id}"
                        f"/recordings/{recordings[0].get('id', '')}/content"
                    )

            _run_m365(
                [
                    "request",
                    "--url",
                    url,
                    "--method",
                    "get",
                    "--filePath",
                    str(destination),
                ],
                timeout=300,
            )

        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def fetch_transcript(self, meeting_id: str) -> Optional[str]:
        """Fetch the transcript for a Teams meeting.

        Queries the Graph API transcripts endpoint, downloads the first
        available transcript as VTT, and converts it to plain text.

        Returns:
            Plain-text transcript, or None when no transcript is available
            or any step fails (failures are logged, not raised).
        """
        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    (
                        f"https://graph.microsoft.com/v1.0/{self.user_id}"
                        f"/onlineMeetings/{meeting_id}/transcripts"
                    ),
                    "--method",
                    "get",
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.warning(f"Failed to list transcripts for meeting {meeting_id}: {e}")
            return None

        transcripts = self._extract_value_list(result)
        if not transcripts:
            logger.info(f"No transcripts found for meeting {meeting_id}")
            return None

        # Download the first available transcript
        transcript_id = transcripts[0].get("id", "")
        try:
            content_result = _run_m365(
                [
                    "request",
                    "--url",
                    (
                        f"https://graph.microsoft.com/v1.0/{self.user_id}"
                        f"/onlineMeetings/{meeting_id}"
                        f"/transcripts/{transcript_id}/content"
                    ),
                    "--method",
                    "get",
                    "--accept",
                    "text/vtt",
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.warning(f"Failed to download transcript {transcript_id}: {e}")
            return None

        # content_result may be raw VTT text or a dict with a "raw" key.
        if isinstance(content_result, dict):
            raw = content_result.get("raw", "")
        else:
            raw = str(content_result)

        if not raw:
            logger.warning(f"Empty transcript content for meeting {meeting_id}")
            return None

        return _vtt_to_text(raw)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _extract_meetings_list(self, result) -> list:
        """Extract a meetings list from a Graph API response.

        Identical unwrapping rules to _extract_value_list; kept as a
        separate name for call-site readability, delegating instead of
        duplicating the logic.
        """
        return self._extract_value_list(result)

    def _extract_value_list(self, result) -> list:
        """Unwrap a Graph API response into a plain list.

        Accepts either a bare JSON array or an OData envelope
        (``{"value": [...]}``); any other shape yields an empty list.
        """
        if isinstance(result, list):
            return result
        if isinstance(result, dict):
            return result.get("value", [])
        return []

    def _get_meeting_recordings(self, meeting: dict) -> List[SourceFile]:
        """Fetch recordings for a single meeting and return SourceFile entries."""
        meeting_id = meeting.get("id", "")
        # Use `or` fallbacks so keys present with explicit null values (common
        # in Graph payloads) don't leak None into the file name / timestamp.
        subject = meeting.get("subject") or meeting.get("topic") or "Teams Meeting"
        start_time = meeting.get("startDateTime") or meeting.get("createdDateTime")

        if not meeting_id:
            return []

        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    (
                        f"https://graph.microsoft.com/v1.0/{self.user_id}"
                        f"/onlineMeetings/{meeting_id}/recordings"
                    ),
                    "--method",
                    "get",
                ],
                timeout=60,
            )
        except RuntimeError:
            # Recordings are optional; a failed lookup simply yields none.
            return []

        files: List[SourceFile] = []
        for rec in self._extract_value_list(result):
            rec_id = rec.get("id", "")
            # NOTE(review): "content.downloadUrl" is read as a *flat* dict key;
            # if the API nests it ({"content": {"downloadUrl": ...}}) this
            # always falls through to contentUrl — confirm the payload shape.
            download_url = rec.get("content.downloadUrl", rec.get("contentUrl"))
            files.append(
                SourceFile(
                    name=f"{subject}.mp4",
                    id=f"{meeting_id}:{rec_id}",
                    mime_type="video/mp4",
                    modified_at=start_time,
                    path=download_url,
                )
            )

        return files
| --- a/video_processor/sources/teams_recording_source.py | |
| +++ b/video_processor/sources/teams_recording_source.py | |
| @@ -0,0 +1,375 @@ | |
| --- a/video_processor/sources/teams_recording_source.py | |
| +++ b/video_processor/sources/teams_recording_source.py | |
| @@ -0,0 +1,375 @@ | |
| 1 | """Microsoft Teams meeting recording source using the m365 CLI. |
| 2 | |
| 3 | Fetches Teams meeting recordings and transcripts via the Microsoft Graph API |
| 4 | through the `m365` CLI tool. |
| 5 | |
| 6 | Requires: npm install -g @pnp/cli-microsoft365 |
| 7 | Auth: m365 login (interactive) |
| 8 | Docs: https://pnp.github.io/cli-microsoft365/ |
| 9 | """ |
| 10 | |
| 11 | import logging |
| 12 | import re |
| 13 | import shutil |
| 14 | import subprocess |
| 15 | from pathlib import Path |
| 16 | from typing import List, Optional |
| 17 | |
| 18 | from video_processor.sources.base import BaseSource, SourceFile |
| 19 | from video_processor.sources.m365_source import _run_m365 |
| 20 | |
| 21 | logger = logging.getLogger(__name__) |
| 22 | |
| 23 | |
| 24 | def _vtt_to_text(vtt: str) -> str: |
| 25 | """Strip VTT timing metadata and return plain text. |
| 26 | |
| 27 | Removes WEBVTT headers, timestamps (00:00:00.000 --> 00:00:05.000), |
| 28 | cue identifiers, and deduplicates consecutive identical lines. |
| 29 | """ |
| 30 | lines = vtt.splitlines() |
| 31 | text_lines: list[str] = [] |
| 32 | prev_line = "" |
| 33 | |
| 34 | for line in lines: |
| 35 | stripped = line.strip() |
| 36 | # Skip WEBVTT header and NOTE blocks |
| 37 | if stripped.startswith("WEBVTT") or stripped.startswith("NOTE"): |
| 38 | continue |
| 39 | # Skip timestamp lines (e.g. 00:00:01.000 --> 00:00:05.000) |
| 40 | if re.match(r"\d{2}:\d{2}[:\.][\d.]+ --> \d{2}:\d{2}[:\.][\d.]+", stripped): |
| 41 | continue |
| 42 | # Skip numeric cue identifiers |
| 43 | if re.match(r"^\d+$", stripped): |
| 44 | continue |
| 45 | # Skip blank lines |
| 46 | if not stripped: |
| 47 | continue |
| 48 | # Strip inline VTT tags like <v Speaker> |
| 49 | cleaned = re.sub(r"<[^>]+>", "", stripped).strip() |
| 50 | if cleaned and cleaned != prev_line: |
| 51 | text_lines.append(cleaned) |
| 52 | prev_line = cleaned |
| 53 | |
| 54 | return "\n".join(text_lines) |
| 55 | |
| 56 | |
class TeamsRecordingSource(BaseSource):
    """
    Fetch Teams meeting recordings and transcripts via the m365 CLI / Graph API.

    Usage:
        source = TeamsRecordingSource(user_id="me")
        source.authenticate()
        recordings = source.list_videos()
        source.download_all(recordings, Path("./recordings"))

        # Fetch transcript for a specific meeting
        transcript = source.fetch_transcript(meeting_id)
    """

    def __init__(self, user_id: str = "me"):
        # Graph principal path segment ("me" or e.g. "users/<id>") that is
        # interpolated directly into every request URL below.
        self.user_id = user_id

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in.

        Returns:
            True when the ``m365`` executable exists and ``m365 status``
            reports an active session; False otherwise (reason is logged).
        """
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
            # Output shape varies by CLI version/output mode: either a JSON
            # object carrying `connectedAs`, or plain text — accept both.
            if isinstance(result, dict) and result.get("connectedAs"):
                return True
            if isinstance(result, str) and "Logged in" in result:
                return True
            logger.error("m365 not logged in. Run: m365 login")
            return False
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Teams meeting recordings available for the user.

        The folder/pattern arguments belong to the BaseSource interface and
        have no Teams equivalent; they are accepted and ignored.

        Tries multiple approaches, returning at the first one that yields files:
        1. Graph API onlineMeetings endpoint
        2. m365 teams meeting list command
        3. Fallback: search meeting-chat messages for recording links
        """
        files: List[SourceFile] = []

        # Approach 1: Graph API — list online meetings
        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    f"https://graph.microsoft.com/v1.0/{self.user_id}/onlineMeetings",
                    "--method",
                    "get",
                ],
                timeout=60,
            )
            for meeting in self._extract_meetings_list(result):
                files.extend(self._get_meeting_recordings(meeting))

            if files:
                logger.info(f"Found {len(files)} recording(s) via Graph API onlineMeetings")
                return files
        except RuntimeError as e:
            logger.debug(f"onlineMeetings endpoint failed: {e}")

        # Approach 2: m365 teams meeting list
        try:
            result = _run_m365(["teams", "meeting", "list"], timeout=60)
            meetings = result if isinstance(result, list) else []
            for meeting in meetings:
                files.extend(self._get_meeting_recordings(meeting))

            if files:
                logger.info(f"Found {len(files)} recording(s) via m365 teams meeting list")
                return files
        except RuntimeError as e:
            logger.debug(f"teams meeting list failed: {e}")

        # Approach 3: Fallback — search chat messages for recording links
        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    (
                        f"https://graph.microsoft.com/v1.0/{self.user_id}/chats"
                        "?$expand=messages($top=50)"
                        "&$filter=chatType eq 'meeting'"
                        "&$top=25"
                    ),
                    "--method",
                    "get",
                ],
                timeout=60,
            )
            for chat in self._extract_value_list(result):
                for msg in chat.get("messages", []):
                    body = msg.get("body", {}).get("content", "")
                    # Heuristic match: message bodies that mention a recording
                    # or embed an .mp4 link.
                    if "recording" in body.lower() or ".mp4" in body.lower():
                        topic = chat.get("topic", "Meeting Recording")
                        chat_id = chat.get("id", "")
                        msg_id = msg.get("id", "")
                        files.append(
                            SourceFile(
                                name=f"{topic}.mp4",
                                id=f"{chat_id}:{msg_id}",
                                mime_type="video/mp4",
                                modified_at=msg.get("createdDateTime"),
                            )
                        )
            if files:
                logger.info(f"Found {len(files)} recording link(s) in meeting chats")
        except RuntimeError as e:
            logger.debug(f"Chat message fallback failed: {e}")

        if not files:
            logger.warning("No Teams meeting recordings found")

        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a recording via its Graph API download URL.

        When ``file.path`` carries a direct download URL it is used as-is;
        otherwise the content URL is reconstructed from the
        ``meeting_id:recording_id`` composite stored in ``file.id``.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        download_url = file.path
        if download_url:
            # Use the direct download URL from contentUrl / @microsoft.graph.downloadUrl
            _run_m365(
                [
                    "request",
                    "--url",
                    download_url,
                    "--method",
                    "get",
                    "--filePath",
                    str(destination),
                ],
                timeout=300,
            )
        else:
            # Try to get download URL from the recording ID
            meeting_id, _, recording_id = file.id.partition(":")
            if recording_id:
                url = (
                    f"https://graph.microsoft.com/v1.0/{self.user_id}"
                    f"/onlineMeetings/{meeting_id}"
                    f"/recordings/{recording_id}/content"
                )
            else:
                url = (
                    f"https://graph.microsoft.com/v1.0/{self.user_id}"
                    f"/onlineMeetings/{meeting_id}/recordings"
                )
                # No recording id available: fetch the recording list first
                # and resolve the first entry's content URL. (Scoped to this
                # branch — with a known recording id the URL is already final.)
                result = _run_m365(
                    ["request", "--url", url, "--method", "get"],
                    timeout=60,
                )
                recordings = self._extract_value_list(result)
                if recordings:
                    url = (
                        f"https://graph.microsoft.com/v1.0/{self.user_id}"
                        f"/onlineMeetings/{meeting_id}"
                        f"/recordings/{recordings[0].get('id', '')}/content"
                    )

            _run_m365(
                [
                    "request",
                    "--url",
                    url,
                    "--method",
                    "get",
                    "--filePath",
                    str(destination),
                ],
                timeout=300,
            )

        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def fetch_transcript(self, meeting_id: str) -> Optional[str]:
        """Fetch the transcript for a Teams meeting.

        Queries the Graph API transcripts endpoint, downloads the first
        available transcript as VTT, and converts it to plain text.

        Returns:
            Plain-text transcript, or None when no transcript is available
            or any step fails (failures are logged, not raised).
        """
        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    (
                        f"https://graph.microsoft.com/v1.0/{self.user_id}"
                        f"/onlineMeetings/{meeting_id}/transcripts"
                    ),
                    "--method",
                    "get",
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.warning(f"Failed to list transcripts for meeting {meeting_id}: {e}")
            return None

        transcripts = self._extract_value_list(result)
        if not transcripts:
            logger.info(f"No transcripts found for meeting {meeting_id}")
            return None

        # Download the first available transcript
        transcript_id = transcripts[0].get("id", "")
        try:
            content_result = _run_m365(
                [
                    "request",
                    "--url",
                    (
                        f"https://graph.microsoft.com/v1.0/{self.user_id}"
                        f"/onlineMeetings/{meeting_id}"
                        f"/transcripts/{transcript_id}/content"
                    ),
                    "--method",
                    "get",
                    "--accept",
                    "text/vtt",
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.warning(f"Failed to download transcript {transcript_id}: {e}")
            return None

        # content_result may be raw VTT text or a dict with a "raw" key.
        if isinstance(content_result, dict):
            raw = content_result.get("raw", "")
        else:
            raw = str(content_result)

        if not raw:
            logger.warning(f"Empty transcript content for meeting {meeting_id}")
            return None

        return _vtt_to_text(raw)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _extract_meetings_list(self, result) -> list:
        """Extract a meetings list from a Graph API response.

        Identical unwrapping rules to _extract_value_list; kept as a
        separate name for call-site readability, delegating instead of
        duplicating the logic.
        """
        return self._extract_value_list(result)

    def _extract_value_list(self, result) -> list:
        """Unwrap a Graph API response into a plain list.

        Accepts either a bare JSON array or an OData envelope
        (``{"value": [...]}``); any other shape yields an empty list.
        """
        if isinstance(result, list):
            return result
        if isinstance(result, dict):
            return result.get("value", [])
        return []

    def _get_meeting_recordings(self, meeting: dict) -> List[SourceFile]:
        """Fetch recordings for a single meeting and return SourceFile entries."""
        meeting_id = meeting.get("id", "")
        # Use `or` fallbacks so keys present with explicit null values (common
        # in Graph payloads) don't leak None into the file name / timestamp.
        subject = meeting.get("subject") or meeting.get("topic") or "Teams Meeting"
        start_time = meeting.get("startDateTime") or meeting.get("createdDateTime")

        if not meeting_id:
            return []

        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    (
                        f"https://graph.microsoft.com/v1.0/{self.user_id}"
                        f"/onlineMeetings/{meeting_id}/recordings"
                    ),
                    "--method",
                    "get",
                ],
                timeout=60,
            )
        except RuntimeError:
            # Recordings are optional; a failed lookup simply yields none.
            return []

        files: List[SourceFile] = []
        for rec in self._extract_value_list(result):
            rec_id = rec.get("id", "")
            # NOTE(review): "content.downloadUrl" is read as a *flat* dict key;
            # if the API nests it ({"content": {"downloadUrl": ...}}) this
            # always falls through to contentUrl — confirm the payload shape.
            download_url = rec.get("content.downloadUrl", rec.get("contentUrl"))
            files.append(
                SourceFile(
                    name=f"{subject}.mp4",
                    id=f"{meeting_id}:{rec_id}",
                    mime_type="video/mp4",
                    modified_at=start_time,
                    path=download_url,
                )
            )

        return files
| --- a/video_processor/sources/twitter_source.py | ||
| +++ b/video_processor/sources/twitter_source.py | ||
| @@ -0,0 +1,129 @@ | ||
| 1 | +"""Twitter/X source connector -- stub requiring auth or gallery-dl.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +from pathlib import Path | |
| 5 | +from typing import List, Optional | |
| 6 | + | |
| 7 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 8 | + | |
| 9 | +logger = logging.getLogger(__name__) | |
| 10 | + | |
| 11 | + | |
class TwitterSource(BaseSource):
    """
    Fetch Twitter/X posts and threads.

    Twitter API v2 requires authentication. This connector attempts to use
    gallery-dl as a fallback for public tweets.

    Auth options:
        - Set TWITTER_BEARER_TOKEN env var for API v2 access
        - Install gallery-dl for scraping public tweets: pip install gallery-dl
    """

    def __init__(self, url: str):
        # Full tweet/thread URL, e.g. https://x.com/user/status/123456.
        self.url = url
        # Populated by authenticate() from the environment.
        self._bearer_token: Optional[str] = None

    def authenticate(self) -> bool:
        """Check for a Twitter API token or a usable gallery-dl install.

        Returns:
            True when a bearer token is set or gallery-dl is available
            (executable on PATH, or the importable package whose install
            normally provides the console script); False otherwise.
        """
        import os
        import shutil

        self._bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
        if self._bearer_token:
            return True

        # gallery-dl is invoked as a CLI binary (see _fetch_via_gallery_dl),
        # so prefer checking for the executable; fall back to the package.
        if shutil.which("gallery-dl"):
            logger.info("Using gallery-dl for Twitter content extraction")
            return True
        try:
            import gallery_dl  # noqa: F401

            logger.info("Using gallery-dl for Twitter content extraction")
            return True
        except ImportError:
            pass

        logger.error(
            "Twitter source requires either:\n"
            " 1. TWITTER_BEARER_TOKEN env var (Twitter API v2)\n"
            " 2. gallery-dl installed: pip install gallery-dl\n"
            "Twitter API access: https://developer.twitter.com/en/portal/dashboard"
        )
        return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the tweet/thread URL.

        The folder/pattern arguments are part of the BaseSource interface
        and do not apply to a single-URL source; they are ignored.
        """
        return [
            SourceFile(
                name=self.url.split("/")[-1] or "tweet",
                id=self.url,
                mime_type="text/plain",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Fetch the tweet text and write it to *destination* as UTF-8."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        text = self.fetch_text()
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved Twitter content to {destination}")
        return destination

    def fetch_text(self) -> str:
        """Extract tweet text via the API when a token is set, else gallery-dl.

        Raises:
            RuntimeError: When no extraction method is available or the
                gallery-dl invocation fails.
        """
        if self._bearer_token:
            return self._fetch_via_api()

        try:
            return self._fetch_via_gallery_dl()
        except (ImportError, FileNotFoundError) as e:
            # FileNotFoundError: the gallery-dl executable is missing —
            # subprocess.run raises it, never ImportError, so catching only
            # ImportError (as before) left the real failure mode unhandled.
            raise RuntimeError(
                "No Twitter extraction method available. See authenticate() for setup."
            ) from e

    def _fetch_via_api(self) -> str:
        """Fetch a single tweet via Twitter API v2 using the bearer token.

        Raises:
            ValueError: If the URL does not contain a /status/<id> segment.
            requests.HTTPError: On non-2xx API responses.
        """
        import re

        import requests

        match = re.search(r"/status/(\d+)", self.url)
        if not match:
            raise ValueError(f"Could not extract tweet ID from: {self.url}")

        tweet_id = match.group(1)
        resp = requests.get(
            f"https://api.twitter.com/2/tweets/{tweet_id}",
            headers={"Authorization": f"Bearer {self._bearer_token}"},
            params={"tweet.fields": "author_id,created_at,text"},
            timeout=15,
        )
        resp.raise_for_status()
        data = resp.json().get("data", {})
        return f"{data.get('text', '')}\n\nCreated: {data.get('created_at', 'unknown')}"

    def _fetch_via_gallery_dl(self) -> str:
        """Use the gallery-dl CLI to extract tweet metadata as JSON.

        Raises:
            FileNotFoundError: If the gallery-dl executable is not on PATH.
            RuntimeError: If gallery-dl exits with a non-zero status.
        """
        import json
        import subprocess

        result = subprocess.run(
            ["gallery-dl", "--dump-json", self.url],
            capture_output=True,
            text=True,
            timeout=30,
        )
        if result.returncode != 0:
            raise RuntimeError(f"gallery-dl failed: {result.stderr}")

        items = json.loads(result.stdout)
        texts = []
        for item in items if isinstance(items, list) else [items]:
            if isinstance(item, dict):
                # Prefer explicit text fields; fall back to a repr of the item.
                texts.append(item.get("content", item.get("text", str(item))))
        return "\n\n".join(texts) if texts else "No text content extracted."
| --- a/video_processor/sources/twitter_source.py | |
| +++ b/video_processor/sources/twitter_source.py | |
| @@ -0,0 +1,129 @@ | |
| --- a/video_processor/sources/twitter_source.py | |
| +++ b/video_processor/sources/twitter_source.py | |
| @@ -0,0 +1,129 @@ | |
| 1 | """Twitter/X source connector -- stub requiring auth or gallery-dl.""" |
| 2 | |
| 3 | import logging |
| 4 | from pathlib import Path |
| 5 | from typing import List, Optional |
| 6 | |
| 7 | from video_processor.sources.base import BaseSource, SourceFile |
| 8 | |
| 9 | logger = logging.getLogger(__name__) |
| 10 | |
| 11 | |
class TwitterSource(BaseSource):
    """
    Fetch Twitter/X posts and threads.

    Twitter API v2 requires authentication. This connector attempts to use
    gallery-dl as a fallback for public tweets.

    Auth options:
    - Set TWITTER_BEARER_TOKEN env var for API v2 access
    - Install gallery-dl for scraping public tweets: pip install gallery-dl
    """

    def __init__(self, url: str):
        # URL of the tweet/thread to fetch.
        self.url = url
        # Populated by authenticate() from TWITTER_BEARER_TOKEN, if set.
        self._bearer_token: Optional[str] = None

    def authenticate(self) -> bool:
        """Check for Twitter API token or gallery-dl availability."""
        import os

        self._bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
        if self._bearer_token:
            return True

        # Check for gallery-dl fallback
        try:
            import gallery_dl  # noqa: F401

            logger.info("Using gallery-dl for Twitter content extraction")
            return True
        except ImportError:
            pass

        logger.error(
            "Twitter source requires either:\n"
            " 1. TWITTER_BEARER_TOKEN env var (Twitter API v2)\n"
            " 2. gallery-dl installed: pip install gallery-dl\n"
            "Twitter API access: https://developer.twitter.com/en/portal/dashboard"
        )
        return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile for the tweet/thread.

        The folder/pattern parameters exist only to satisfy the BaseSource
        interface; they are ignored for single-tweet sources.
        """
        return [
            SourceFile(
                name=self.url.split("/")[-1] or "tweet",
                id=self.url,
                mime_type="text/plain",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download tweet content as UTF-8 text to *destination*."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        text = self.fetch_text()
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved Twitter content to {destination}")
        return destination

    def fetch_text(self) -> str:
        """Extract tweet text via API or gallery-dl.

        Raises
        ------
        RuntimeError
            If neither extraction backend is available.
        """
        if self._bearer_token:
            return self._fetch_via_api()

        try:
            return self._fetch_via_gallery_dl()
        # Bug fix: _fetch_via_gallery_dl shells out to the gallery-dl binary,
        # so a missing install raises FileNotFoundError (from subprocess.run),
        # not ImportError; catch both so the setup hint is actually shown.
        except (ImportError, FileNotFoundError) as exc:
            raise RuntimeError(
                "No Twitter extraction method available. See authenticate() for setup."
            ) from exc

    def _fetch_via_api(self) -> str:
        """Fetch a single tweet via the Twitter API v2 lookup endpoint."""
        import re

        import requests

        match = re.search(r"/status/(\d+)", self.url)
        if not match:
            raise ValueError(f"Could not extract tweet ID from: {self.url}")

        tweet_id = match.group(1)
        resp = requests.get(
            f"https://api.twitter.com/2/tweets/{tweet_id}",
            headers={"Authorization": f"Bearer {self._bearer_token}"},
            params={"tweet.fields": "author_id,created_at,text"},
            timeout=15,
        )
        resp.raise_for_status()
        data = resp.json().get("data", {})
        return f"{data.get('text', '')}\n\nCreated: {data.get('created_at', 'unknown')}"

    def _fetch_via_gallery_dl(self) -> str:
        """Use the gallery-dl CLI to extract tweet metadata as text."""
        import json
        import subprocess

        result = subprocess.run(
            ["gallery-dl", "--dump-json", self.url],
            capture_output=True,
            text=True,
            timeout=30,
        )
        if result.returncode != 0:
            raise RuntimeError(f"gallery-dl failed: {result.stderr}")

        items = json.loads(result.stdout)
        texts = []
        # gallery-dl may emit a single object or a list of entries.
        for item in items if isinstance(items, list) else [items]:
            if isinstance(item, dict):
                texts.append(item.get("content", item.get("text", str(item))))
        return "\n\n".join(texts) if texts else "No text content extracted."
| --- a/video_processor/sources/web_source.py | ||
| +++ b/video_processor/sources/web_source.py | ||
| @@ -0,0 +1,90 @@ | ||
| 1 | +"""Web page source connector for fetching and extracting text from URLs.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import re | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import List, Optional | |
| 7 | + | |
| 8 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 9 | + | |
| 10 | +logger = logging.getLogger(__name__) | |
| 11 | + | |
| 12 | + | |
| 13 | +def _strip_html_tags(html: str) -> str: | |
| 14 | + """Minimal HTML tag stripper using stdlib only.""" | |
| 15 | + text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE) | |
| 16 | + text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE) | |
| 17 | + text = re.sub(r"<(nav|footer|header)[^>]*>.*?</\1>", "", text, flags=re.DOTALL | re.IGNORECASE) | |
| 18 | + text = re.sub(r"<[^>]+>", " ", text) | |
| 19 | + text = re.sub(r"\s+", " ", text).strip() | |
| 20 | + return text | |
| 21 | + | |
| 22 | + | |
class WebSource(BaseSource):
    """
    Fetch web pages and extract main text content.

    Uses requests + BeautifulSoup (optional) for content extraction.
    Falls back to regex-based tag stripping if bs4 is unavailable.

    Requires: pip install requests (included in most environments)
    Optional: pip install beautifulsoup4 lxml
    """

    def __init__(self, url: str):
        # Page URL to fetch.
        self.url = url
        # Extracted text, cached after the first fetch_text() call.
        self._content: Optional[str] = None

    def authenticate(self) -> bool:
        """No auth needed for public web pages."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the web page.

        The folder/pattern parameters exist only to satisfy the BaseSource
        interface and are ignored here.
        """
        tail = self.url.split("/")[-1]
        entry = SourceFile(
            name=tail if tail else "page",
            id=self.url,
            mime_type="text/html",
        )
        return [entry]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Fetch the page, extract its text, and write it to *destination*."""
        dest = Path(destination)
        dest.parent.mkdir(parents=True, exist_ok=True)
        dest.write_text(self.fetch_text(), encoding="utf-8")
        logger.info(f"Saved web content to {dest}")
        return dest

    def fetch_text(self) -> str:
        """Fetch the URL and extract main text content (cached)."""
        if self._content is not None:
            return self._content

        import requests

        resp = requests.get(self.url, timeout=30, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        try:
            from bs4 import BeautifulSoup
        except ImportError:
            # No bs4 installed: fall back to the stdlib regex stripper.
            logger.debug("beautifulsoup4 not available, using regex fallback")
            self._content = _strip_html_tags(resp.text)
            return self._content

        soup = BeautifulSoup(resp.text, "html.parser")
        # Drop elements that never contain article text.
        for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
            element.decompose()
        # Prefer the semantic content containers when the page provides them.
        container = soup.find("article") or soup.find("main") or soup.find("body")
        self._content = (
            container.get_text(separator="\n", strip=True) if container else soup.get_text()
        )
        return self._content
| --- a/video_processor/sources/web_source.py | |
| +++ b/video_processor/sources/web_source.py | |
| @@ -0,0 +1,90 @@ | |
| --- a/video_processor/sources/web_source.py | |
| +++ b/video_processor/sources/web_source.py | |
| @@ -0,0 +1,90 @@ | |
| 1 | """Web page source connector for fetching and extracting text from URLs.""" |
| 2 | |
| 3 | import logging |
| 4 | import re |
| 5 | from pathlib import Path |
| 6 | from typing import List, Optional |
| 7 | |
| 8 | from video_processor.sources.base import BaseSource, SourceFile |
| 9 | |
| 10 | logger = logging.getLogger(__name__) |
| 11 | |
| 12 | |
| 13 | def _strip_html_tags(html: str) -> str: |
| 14 | """Minimal HTML tag stripper using stdlib only.""" |
| 15 | text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE) |
| 16 | text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE) |
| 17 | text = re.sub(r"<(nav|footer|header)[^>]*>.*?</\1>", "", text, flags=re.DOTALL | re.IGNORECASE) |
| 18 | text = re.sub(r"<[^>]+>", " ", text) |
| 19 | text = re.sub(r"\s+", " ", text).strip() |
| 20 | return text |
| 21 | |
| 22 | |
class WebSource(BaseSource):
    """
    Fetch web pages and extract main text content.

    Uses requests + BeautifulSoup (optional) for content extraction.
    Falls back to regex-based tag stripping if bs4 is unavailable.

    Requires: pip install requests (included in most environments)
    Optional: pip install beautifulsoup4 lxml
    """

    def __init__(self, url: str):
        # Page URL to fetch.
        self.url = url
        # Extracted text, cached after the first fetch_text() call.
        self._content: Optional[str] = None

    def authenticate(self) -> bool:
        """No auth needed for public web pages."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the web page.

        The folder/pattern parameters exist only to satisfy the BaseSource
        interface and are ignored here.
        """
        tail = self.url.split("/")[-1]
        entry = SourceFile(
            name=tail if tail else "page",
            id=self.url,
            mime_type="text/html",
        )
        return [entry]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Fetch the page, extract its text, and write it to *destination*."""
        dest = Path(destination)
        dest.parent.mkdir(parents=True, exist_ok=True)
        dest.write_text(self.fetch_text(), encoding="utf-8")
        logger.info(f"Saved web content to {dest}")
        return dest

    def fetch_text(self) -> str:
        """Fetch the URL and extract main text content (cached)."""
        if self._content is not None:
            return self._content

        import requests

        resp = requests.get(self.url, timeout=30, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        try:
            from bs4 import BeautifulSoup
        except ImportError:
            # No bs4 installed: fall back to the stdlib regex stripper.
            logger.debug("beautifulsoup4 not available, using regex fallback")
            self._content = _strip_html_tags(resp.text)
            return self._content

        soup = BeautifulSoup(resp.text, "html.parser")
        # Drop elements that never contain article text.
        for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
            element.decompose()
        # Prefer the semantic content containers when the page provides them.
        container = soup.find("article") or soup.find("main") or soup.find("body")
        self._content = (
            container.get_text(separator="\n", strip=True) if container else soup.get_text()
        )
        return self._content
| --- a/video_processor/sources/youtube_source.py | ||
| +++ b/video_processor/sources/youtube_source.py | ||
| @@ -0,0 +1,118 @@ | ||
| 1 | +"""YouTube source connector using yt-dlp for video/audio download and caption extraction.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import re | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import List, Optional | |
| 7 | + | |
| 8 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 9 | + | |
| 10 | +logger = logging.getLogger(__name__) | |
| 11 | + | |
| 12 | +_YT_URL_PATTERN = re.compile( | |
| 13 | + r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/shorts/)([\w-]{11})" | |
| 14 | +) | |
| 15 | + | |
| 16 | + | |
| 17 | +def _extract_video_id(url: str) -> str: | |
| 18 | + """Extract the 11-character video ID from a YouTube URL.""" | |
| 19 | + match = _YT_URL_PATTERN.search(url) | |
| 20 | + if not match: | |
| 21 | + raise ValueError(f"Could not extract YouTube video ID from: {url}") | |
| 22 | + return match.group(1) | |
| 23 | + | |
| 24 | + | |
class YouTubeSource(BaseSource):
    """
    Download YouTube videos/audio and extract captions via yt-dlp.

    Requires: pip install yt-dlp
    """

    def __init__(self, url: str, audio_only: bool = False):
        # Original video URL; the ID is validated eagerly so bad URLs fail fast.
        self.url = url
        self.video_id = _extract_video_id(url)
        # When True, download() fetches best audio and converts to mp3.
        self.audio_only = audio_only

    def authenticate(self) -> bool:
        """No auth needed for public videos. Returns True if yt-dlp is available."""
        try:
            import yt_dlp  # noqa: F401

            return True
        except ImportError:
            logger.error("yt-dlp not installed. Run: pip install yt-dlp")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the YouTube video.

        The folder/pattern parameters exist only to satisfy the BaseSource
        interface; they are ignored for single-video sources.
        """
        import yt_dlp

        # Metadata-only probe; nothing is downloaded here.
        with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
            info = ydl.extract_info(self.url, download=False)

        return [
            SourceFile(
                name=info.get("title", self.video_id),
                id=self.video_id,
                size_bytes=info.get("filesize"),  # may be None when unknown
                mime_type="audio/webm" if self.audio_only else "video/mp4",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download the video or audio to destination path.

        NOTE(review): with audio_only=True, the FFmpeg post-processor rewrites
        the output extension to .mp3, so the final file may not sit exactly at
        *destination* when its suffix differs — confirm against callers.
        """
        import yt_dlp

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        opts = {
            "outtmpl": str(destination),
            "quiet": True,
        }
        if self.audio_only:
            opts["format"] = "bestaudio/best"
            opts["postprocessors"] = [{"key": "FFmpegExtractAudio", "preferredcodec": "mp3"}]
        else:
            opts["format"] = "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"

        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([self.url])

        logger.info(f"Downloaded YouTube video {self.video_id} to {destination}")
        return destination

    def fetch_captions(self, lang: str = "en") -> Optional[str]:
        """Extract auto-generated or manual captions as plain text.

        Returns the raw subtitle document for *lang*, or None when no
        captions exist in that language.
        """
        import yt_dlp

        opts = {
            "quiet": True,
            "writeautomaticsub": True,
            "writesubtitles": True,
            "subtitleslangs": [lang],
            "skip_download": True,
        }
        with yt_dlp.YoutubeDL(opts) as ydl:
            info = ydl.extract_info(self.url, download=False)

        # Manual subtitles take precedence over auto-generated captions.
        subs = info.get("subtitles", {}).get(lang) or info.get("automatic_captions", {}).get(lang)
        if not subs:
            logger.warning(f"No captions found for language '{lang}'")
            return None

        # Prefer vtt/srv format for text extraction
        for fmt in subs:
            if fmt.get("ext") in ("vtt", "srv3", "json3"):
                import requests

                resp = requests.get(fmt["url"], timeout=30)
                # Bug fix: without this, an HTTP error page would be returned
                # as if it were caption text.
                resp.raise_for_status()
                return resp.text

        return None
| --- a/video_processor/sources/youtube_source.py | |
| +++ b/video_processor/sources/youtube_source.py | |
| @@ -0,0 +1,118 @@ | |
| --- a/video_processor/sources/youtube_source.py | |
| +++ b/video_processor/sources/youtube_source.py | |
| @@ -0,0 +1,118 @@ | |
| 1 | """YouTube source connector using yt-dlp for video/audio download and caption extraction.""" |
| 2 | |
| 3 | import logging |
| 4 | import re |
| 5 | from pathlib import Path |
| 6 | from typing import List, Optional |
| 7 | |
| 8 | from video_processor.sources.base import BaseSource, SourceFile |
| 9 | |
| 10 | logger = logging.getLogger(__name__) |
| 11 | |
| 12 | _YT_URL_PATTERN = re.compile( |
| 13 | r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/shorts/)([\w-]{11})" |
| 14 | ) |
| 15 | |
| 16 | |
| 17 | def _extract_video_id(url: str) -> str: |
| 18 | """Extract the 11-character video ID from a YouTube URL.""" |
| 19 | match = _YT_URL_PATTERN.search(url) |
| 20 | if not match: |
| 21 | raise ValueError(f"Could not extract YouTube video ID from: {url}") |
| 22 | return match.group(1) |
| 23 | |
| 24 | |
class YouTubeSource(BaseSource):
    """
    Download YouTube videos/audio and extract captions via yt-dlp.

    Requires: pip install yt-dlp
    """

    def __init__(self, url: str, audio_only: bool = False):
        # Original video URL; the ID is validated eagerly so bad URLs fail fast.
        self.url = url
        self.video_id = _extract_video_id(url)
        # When True, download() fetches best audio and converts to mp3.
        self.audio_only = audio_only

    def authenticate(self) -> bool:
        """No auth needed for public videos. Returns True if yt-dlp is available."""
        try:
            import yt_dlp  # noqa: F401

            return True
        except ImportError:
            logger.error("yt-dlp not installed. Run: pip install yt-dlp")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the YouTube video.

        The folder/pattern parameters exist only to satisfy the BaseSource
        interface; they are ignored for single-video sources.
        """
        import yt_dlp

        # Metadata-only probe; nothing is downloaded here.
        with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
            info = ydl.extract_info(self.url, download=False)

        return [
            SourceFile(
                name=info.get("title", self.video_id),
                id=self.video_id,
                size_bytes=info.get("filesize"),  # may be None when unknown
                mime_type="audio/webm" if self.audio_only else "video/mp4",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download the video or audio to destination path.

        NOTE(review): with audio_only=True, the FFmpeg post-processor rewrites
        the output extension to .mp3, so the final file may not sit exactly at
        *destination* when its suffix differs — confirm against callers.
        """
        import yt_dlp

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        opts = {
            "outtmpl": str(destination),
            "quiet": True,
        }
        if self.audio_only:
            opts["format"] = "bestaudio/best"
            opts["postprocessors"] = [{"key": "FFmpegExtractAudio", "preferredcodec": "mp3"}]
        else:
            opts["format"] = "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"

        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([self.url])

        logger.info(f"Downloaded YouTube video {self.video_id} to {destination}")
        return destination

    def fetch_captions(self, lang: str = "en") -> Optional[str]:
        """Extract auto-generated or manual captions as plain text.

        Returns the raw subtitle document for *lang*, or None when no
        captions exist in that language.
        """
        import yt_dlp

        opts = {
            "quiet": True,
            "writeautomaticsub": True,
            "writesubtitles": True,
            "subtitleslangs": [lang],
            "skip_download": True,
        }
        with yt_dlp.YoutubeDL(opts) as ydl:
            info = ydl.extract_info(self.url, download=False)

        # Manual subtitles take precedence over auto-generated captions.
        subs = info.get("subtitles", {}).get(lang) or info.get("automatic_captions", {}).get(lang)
        if not subs:
            logger.warning(f"No captions found for language '{lang}'")
            return None

        # Prefer vtt/srv format for text extraction
        for fmt in subs:
            if fmt.get("ext") in ("vtt", "srv3", "json3"):
                import requests

                resp = requests.get(fmt["url"], timeout=30)
                # Bug fix: without this, an HTTP error page would be returned
                # as if it were caption text.
                resp.raise_for_status()
                return resp.text

        return None
| --- a/video_processor/sources/zoom_source.py | ||
| +++ b/video_processor/sources/zoom_source.py | ||
| @@ -0,0 +1,399 @@ | ||
| 1 | +"""Zoom cloud recordings source integration with OAuth support.""" | |
| 2 | + | |
| 3 | +import base64 | |
| 4 | +import hashlib | |
| 5 | +import json | |
| 6 | +import logging | |
| 7 | +import os | |
| 8 | +import secrets | |
| 9 | +import time | |
| 10 | +import webbrowser | |
| 11 | +from pathlib import Path | |
| 12 | +from typing import Dict, List, Optional | |
| 13 | + | |
| 14 | +import requests | |
| 15 | + | |
| 16 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 17 | + | |
| 18 | +logger = logging.getLogger(__name__) | |
| 19 | + | |
# Default on-disk location for the cached Zoom OAuth token.
_TOKEN_PATH = Path.home() / ".planopticon" / "zoom_token.json"
# Zoom REST API and OAuth endpoint roots.
_BASE_URL = "https://api.zoom.us/v2"
_OAUTH_BASE = "https://zoom.us/oauth"

# Map Zoom file_type values to MIME types
_MIME_TYPES = {
    "MP4": "video/mp4",
    "M4A": "audio/mp4",
    "CHAT": "text/plain",
    "TRANSCRIPT": "text/vtt",
    "CSV": "text/csv",
    "TIMELINE": "application/json",
}
| 33 | + | |
| 34 | + | |
| 35 | +class ZoomSource(BaseSource): | |
| 36 | + """ | |
| 37 | + Zoom cloud recordings source with OAuth2 support. | |
| 38 | + | |
| 39 | + Auth methods (tried in order): | |
| 40 | + 1. Saved token: Load from token_path, refresh if expired | |
| 41 | + 2. Server-to-Server OAuth: Uses account_id with client credentials | |
| 42 | + 3. OAuth2 Authorization Code with PKCE: Interactive browser flow | |
| 43 | + """ | |
| 44 | + | |
| 45 | + def __init__( | |
| 46 | + self, | |
| 47 | + client_id: Optional[str] = None, | |
| 48 | + client_secret: Optional[str] = None, | |
| 49 | + account_id: Optional[str] = None, | |
| 50 | + token_path: Optional[Path] = None, | |
| 51 | + ): | |
| 52 | + """ | |
| 53 | + Initialize Zoom source. | |
| 54 | + | |
| 55 | + Parameters | |
| 56 | + ---------- | |
| 57 | + client_id : str, optional | |
| 58 | + Zoom OAuth app client ID. Falls back to ZOOM_CLIENT_ID env var. | |
| 59 | + client_secret : str, optional | |
| 60 | + Zoom OAuth app client secret. Falls back to ZOOM_CLIENT_SECRET env var. | |
| 61 | + account_id : str, optional | |
| 62 | + Zoom account ID for Server-to-Server OAuth. Falls back to ZOOM_ACCOUNT_ID env var. | |
| 63 | + token_path : Path, optional | |
| 64 | + Where to store/load OAuth tokens. | |
| 65 | + """ | |
| 66 | + self.client_id = client_id or os.environ.get("ZOOM_CLIENT_ID") | |
| 67 | + self.client_secret = client_secret or os.environ.get("ZOOM_CLIENT_SECRET") | |
| 68 | + self.account_id = account_id or os.environ.get("ZOOM_ACCOUNT_ID") | |
| 69 | + self.token_path = token_path or _TOKEN_PATH | |
| 70 | + self._access_token: Optional[str] = None | |
| 71 | + self._token_data: Optional[Dict] = None | |
| 72 | + | |
| 73 | + def authenticate(self) -> bool: | |
| 74 | + """Authenticate with Zoom API.""" | |
| 75 | + # Try 1: Load saved token | |
| 76 | + if self.token_path.exists(): | |
| 77 | + if self._auth_saved_token(): | |
| 78 | + return True | |
| 79 | + | |
| 80 | + # Try 2: Server-to-Server OAuth (if account_id is set) | |
| 81 | + if self.account_id: | |
| 82 | + return self._auth_server_to_server() | |
| 83 | + | |
| 84 | + # Try 3: OAuth2 Authorization Code flow with PKCE | |
| 85 | + return self._auth_oauth_pkce() | |
| 86 | + | |
| 87 | + def _auth_saved_token(self) -> bool: | |
| 88 | + """Authenticate using a saved OAuth token, refreshing if expired.""" | |
| 89 | + try: | |
| 90 | + data = json.loads(self.token_path.read_text()) | |
| 91 | + expires_at = data.get("expires_at", 0) | |
| 92 | + | |
| 93 | + if time.time() < expires_at: | |
| 94 | + # Token still valid | |
| 95 | + self._access_token = data["access_token"] | |
| 96 | + self._token_data = data | |
| 97 | + logger.info("Authenticated with Zoom via saved token") | |
| 98 | + return True | |
| 99 | + | |
| 100 | + # Token expired, try to refresh | |
| 101 | + if data.get("refresh_token"): | |
| 102 | + return self._refresh_token() | |
| 103 | + | |
| 104 | + # Server-to-Server tokens don't have refresh tokens; | |
| 105 | + # fall through to re-authenticate | |
| 106 | + return False | |
| 107 | + except Exception: | |
| 108 | + return False | |
| 109 | + | |
| 110 | + def _auth_server_to_server(self) -> bool: | |
| 111 | + """Authenticate using Server-to-Server OAuth (account credentials).""" | |
| 112 | + if not self.client_id or not self.client_secret: | |
| 113 | + logger.error( | |
| 114 | + "Zoom client_id and client_secret required for Server-to-Server OAuth. " | |
| 115 | + "Set ZOOM_CLIENT_ID and ZOOM_CLIENT_SECRET env vars." | |
| 116 | + ) | |
| 117 | + return False | |
| 118 | + | |
| 119 | + try: | |
| 120 | + resp = requests.post( | |
| 121 | + f"{_OAUTH_BASE}/token", | |
| 122 | + params={ | |
| 123 | + "grant_type": "account_credentials", | |
| 124 | + "account_id": self.account_id, | |
| 125 | + }, | |
| 126 | + auth=(self.client_id, self.client_secret), | |
| 127 | + timeout=30, | |
| 128 | + ) | |
| 129 | + resp.raise_for_status() | |
| 130 | + token_data = resp.json() | |
| 131 | + | |
| 132 | + self._access_token = token_data["access_token"] | |
| 133 | + self._token_data = { | |
| 134 | + "access_token": token_data["access_token"], | |
| 135 | + "expires_at": time.time() + token_data.get("expires_in", 3600) - 60, | |
| 136 | + "token_type": token_data.get("token_type", "bearer"), | |
| 137 | + } | |
| 138 | + | |
| 139 | + self._save_token(self._token_data) | |
| 140 | + logger.info("Authenticated with Zoom via Server-to-Server OAuth") | |
| 141 | + return True | |
| 142 | + except Exception as e: | |
| 143 | + logger.error(f"Zoom Server-to-Server OAuth failed: {e}") | |
| 144 | + return False | |
| 145 | + | |
    def _auth_oauth_pkce(self) -> bool:
        """Run OAuth2 Authorization Code flow with PKCE.

        Opens the Zoom authorization page in a browser (best effort), prompts
        on stdin for the authorization code, exchanges it for an
        access/refresh token pair, and persists the result via _save_token().

        Returns
        -------
        bool
            True on success, False on any failure (logged).
        """
        if not self.client_id:
            logger.error("Zoom client_id required for OAuth. Set ZOOM_CLIENT_ID env var.")
            return False

        try:
            # Generate PKCE code verifier and S256 challenge (RFC 7636);
            # the base64url padding is stripped per the spec.
            code_verifier = secrets.token_urlsafe(64)
            code_challenge = (
                base64.urlsafe_b64encode(hashlib.sha256(code_verifier.encode("ascii")).digest())
                .rstrip(b"=")
                .decode("ascii")
            )

            # NOTE(review): query values are interpolated unencoded. That is
            # safe for the base64url challenge, but a client_id containing
            # reserved URL characters would need urllib.parse.urlencode --
            # confirm against real Zoom client IDs.
            authorize_url = (
                f"{_OAUTH_BASE}/authorize"
                f"?response_type=code"
                f"&client_id={self.client_id}"
                f"&redirect_uri=urn:ietf:wg:oauth:2.0:oob"
                f"&code_challenge={code_challenge}"
                f"&code_challenge_method=S256"
            )

            print(f"\nOpen this URL to authorize PlanOpticon:\n{authorize_url}\n")

            # Best effort: headless environments fall back to the printed URL.
            try:
                webbrowser.open(authorize_url)
            except Exception:
                pass

            auth_code = input("Enter the authorization code: ").strip()

            # Exchange authorization code for tokens
            payload = {
                "grant_type": "authorization_code",
                "code": auth_code,
                "redirect_uri": "urn:ietf:wg:oauth:2.0:oob",
                "code_verifier": code_verifier,
            }

            resp = requests.post(
                f"{_OAUTH_BASE}/token",
                data=payload,
                auth=(self.client_id, self.client_secret or ""),
                timeout=30,
            )
            resp.raise_for_status()
            token_data = resp.json()

            self._access_token = token_data["access_token"]
            # Client credentials are persisted alongside the token so
            # _refresh_token() can run without reconfiguring the source.
            # NOTE(review): this writes client_secret to disk in plain text;
            # confirm that is acceptable for the deployment's threat model.
            self._token_data = {
                "access_token": token_data["access_token"],
                "refresh_token": token_data.get("refresh_token"),
                # Expire 60s early so a token is never used at its boundary.
                "expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
                "token_type": token_data.get("token_type", "bearer"),
                "client_id": self.client_id,
                "client_secret": self.client_secret or "",
            }

            self._save_token(self._token_data)
            logger.info("Authenticated with Zoom via OAuth PKCE")
            return True
        except Exception as e:
            logger.error(f"Zoom OAuth PKCE failed: {e}")
            return False
| 212 | + | |
| 213 | + def _refresh_token(self) -> bool: | |
| 214 | + """Refresh an expired OAuth token.""" | |
| 215 | + try: | |
| 216 | + data = json.loads(self.token_path.read_text()) | |
| 217 | + refresh_token = data.get("refresh_token") | |
| 218 | + client_id = data.get("client_id") or self.client_id | |
| 219 | + client_secret = data.get("client_secret") or self.client_secret | |
| 220 | + | |
| 221 | + if not refresh_token or not client_id: | |
| 222 | + return False | |
| 223 | + | |
| 224 | + resp = requests.post( | |
| 225 | + f"{_OAUTH_BASE}/token", | |
| 226 | + data={ | |
| 227 | + "grant_type": "refresh_token", | |
| 228 | + "refresh_token": refresh_token, | |
| 229 | + }, | |
| 230 | + auth=(client_id, client_secret or ""), | |
| 231 | + timeout=30, | |
| 232 | + ) | |
| 233 | + resp.raise_for_status() | |
| 234 | + token_data = resp.json() | |
| 235 | + | |
| 236 | + self._access_token = token_data["access_token"] | |
| 237 | + self._token_data = { | |
| 238 | + "access_token": token_data["access_token"], | |
| 239 | + "refresh_token": token_data.get("refresh_token", refresh_token), | |
| 240 | + "expires_at": time.time() + token_data.get("expires_in", 3600) - 60, | |
| 241 | + "token_type": token_data.get("token_type", "bearer"), | |
| 242 | + "client_id": client_id, | |
| 243 | + "client_secret": client_secret or "", | |
| 244 | + } | |
| 245 | + | |
| 246 | + self._save_token(self._token_data) | |
| 247 | + logger.info("Refreshed Zoom OAuth token") | |
| 248 | + return True | |
| 249 | + except Exception as e: | |
| 250 | + logger.error(f"Zoom token refresh failed: {e}") | |
| 251 | + return False | |
| 252 | + | |
| 253 | + def _save_token(self, data: Dict) -> None: | |
| 254 | + """Save token data to disk.""" | |
| 255 | + self.token_path.parent.mkdir(parents=True, exist_ok=True) | |
| 256 | + self.token_path.write_text(json.dumps(data)) | |
| 257 | + logger.info(f"OAuth token saved to {self.token_path}") | |
| 258 | + | |
| 259 | + def _api_get(self, endpoint: str, params: Optional[Dict] = None) -> requests.Response: | |
| 260 | + """Make an authenticated GET request to the Zoom API.""" | |
| 261 | + if not self._access_token: | |
| 262 | + raise RuntimeError("Not authenticated. Call authenticate() first.") | |
| 263 | + | |
| 264 | + url = f"{_BASE_URL}/{endpoint.lstrip('/')}" | |
| 265 | + resp = requests.get( | |
| 266 | + url, | |
| 267 | + headers={"Authorization": f"Bearer {self._access_token}"}, | |
| 268 | + params=params, | |
| 269 | + timeout=30, | |
| 270 | + ) | |
| 271 | + resp.raise_for_status() | |
| 272 | + return resp | |
| 273 | + | |
| 274 | + def list_videos( | |
| 275 | + self, | |
| 276 | + folder_id: Optional[str] = None, | |
| 277 | + folder_path: Optional[str] = None, | |
| 278 | + patterns: Optional[List[str]] = None, | |
| 279 | + ) -> List[SourceFile]: | |
| 280 | + """List video files from Zoom cloud recordings.""" | |
| 281 | + if not self._access_token: | |
| 282 | + raise RuntimeError("Not authenticated. Call authenticate() first.") | |
| 283 | + | |
| 284 | + files: List[SourceFile] = [] | |
| 285 | + next_page_token = "" | |
| 286 | + | |
| 287 | + while True: | |
| 288 | + params: Dict = {} | |
| 289 | + if next_page_token: | |
| 290 | + params["next_page_token"] = next_page_token | |
| 291 | + | |
| 292 | + resp = self._api_get("users/me/recordings", params=params) | |
| 293 | + data = resp.json() | |
| 294 | + | |
| 295 | + for meeting in data.get("meetings", []): | |
| 296 | + meeting_id = str(meeting.get("id", "")) | |
| 297 | + topic = meeting.get("topic", "Untitled Meeting") | |
| 298 | + start_time = meeting.get("start_time") | |
| 299 | + | |
| 300 | + for rec_file in meeting.get("recording_files", []): | |
| 301 | + file_type = rec_file.get("file_type", "") | |
| 302 | + mime_type = _MIME_TYPES.get(file_type) | |
| 303 | + | |
| 304 | + # Build a descriptive name | |
| 305 | + file_ext = rec_file.get("file_extension", file_type).lower() | |
| 306 | + file_name = f"{topic}.{file_ext}" | |
| 307 | + | |
| 308 | + if patterns: | |
| 309 | + if not any(file_name.endswith(p.replace("*", "")) for p in patterns): | |
| 310 | + continue | |
| 311 | + | |
| 312 | + files.append( | |
| 313 | + SourceFile( | |
| 314 | + name=file_name, | |
| 315 | + id=meeting_id, | |
| 316 | + size_bytes=rec_file.get("file_size"), | |
| 317 | + mime_type=mime_type, | |
| 318 | + modified_at=start_time, | |
| 319 | + path=rec_file.get("download_url"), | |
| 320 | + ) | |
| 321 | + ) | |
| 322 | + | |
| 323 | + next_page_token = data.get("next_page_token", "") | |
| 324 | + if not next_page_token: | |
| 325 | + break | |
| 326 | + | |
| 327 | + logger.info(f"Found {len(files)} recordings in Zoom") | |
| 328 | + return files | |
| 329 | + | |
| 330 | + def download(self, file: SourceFile, destination: Path) -> Path: | |
| 331 | + """Download a recording file from Zoom.""" | |
| 332 | + if not self._access_token: | |
| 333 | + raise RuntimeError("Not authenticated. Call authenticate() first.") | |
| 334 | + | |
| 335 | + destination = Path(destination) | |
| 336 | + destination.parent.mkdir(parents=True, exist_ok=True) | |
| 337 | + | |
| 338 | + download_url = file.path | |
| 339 | + if not download_url: | |
| 340 | + raise ValueError(f"No download URL for file: {file.name}") | |
| 341 | + | |
| 342 | + resp = requests.get( | |
| 343 | + download_url, | |
| 344 | + headers={"Authorization": f"Bearer {self._access_token}"}, | |
| 345 | + stream=True, | |
| 346 | + timeout=60, | |
| 347 | + ) | |
| 348 | + resp.raise_for_status() | |
| 349 | + | |
| 350 | + with open(destination, "wb") as f: | |
| 351 | + for chunk in resp.iter_content(chunk_size=8192): | |
| 352 | + f.write(chunk) | |
| 353 | + | |
| 354 | + logger.info(f"Downloaded {file.name} to {destination}") | |
| 355 | + return destination | |
| 356 | + | |
| 357 | + def fetch_transcript(self, meeting_id: str) -> Optional[str]: | |
| 358 | + """ | |
| 359 | + Fetch the transcript (VTT) for a Zoom meeting recording. | |
| 360 | + | |
| 361 | + Looks for transcript files in the recording's file list and downloads | |
| 362 | + the content as text. | |
| 363 | + | |
| 364 | + Parameters | |
| 365 | + ---------- | |
| 366 | + meeting_id : str | |
| 367 | + The Zoom meeting ID. | |
| 368 | + | |
| 369 | + Returns | |
| 370 | + ------- | |
| 371 | + str or None | |
| 372 | + Transcript text if available, None otherwise. | |
| 373 | + """ | |
| 374 | + if not self._access_token: | |
| 375 | + raise RuntimeError("Not authenticated. Call authenticate() first.") | |
| 376 | + | |
| 377 | + try: | |
| 378 | + resp = self._api_get(f"meetings/{meeting_id}/recordings") | |
| 379 | + data = resp.json() | |
| 380 | + | |
| 381 | + for rec_file in data.get("recording_files", []): | |
| 382 | + file_type = rec_file.get("file_type", "") | |
| 383 | + if file_type == "TRANSCRIPT": | |
| 384 | + download_url = rec_file.get("download_url") | |
| 385 | + if download_url: | |
| 386 | + dl_resp = requests.get( | |
| 387 | + download_url, | |
| 388 | + headers={"Authorization": f"Bearer {self._access_token}"}, | |
| 389 | + timeout=30, | |
| 390 | + ) | |
| 391 | + dl_resp.raise_for_status() | |
| 392 | + logger.info(f"Fetched transcript for meeting {meeting_id}") | |
| 393 | + return dl_resp.text | |
| 394 | + | |
| 395 | + logger.info(f"No transcript found for meeting {meeting_id}") | |
| 396 | + return None | |
| 397 | + except Exception as e: | |
| 398 | + logger.error(f"Failed to fetch transcript for meeting {meeting_id}: {e}") | |
| 399 | + return None |
| --- a/video_processor/sources/zoom_source.py | |
| +++ b/video_processor/sources/zoom_source.py | |
| @@ -0,0 +1,399 @@ | |
| --- a/video_processor/sources/zoom_source.py | |
| +++ b/video_processor/sources/zoom_source.py | |
| @@ -0,0 +1,399 @@ | |
| 1 | """Zoom cloud recordings source integration with OAuth support.""" |
| 2 | |
| 3 | import base64 |
| 4 | import hashlib |
| 5 | import json |
| 6 | import logging |
| 7 | import os |
| 8 | import secrets |
| 9 | import time |
| 10 | import webbrowser |
| 11 | from pathlib import Path |
| 12 | from typing import Dict, List, Optional |
| 13 | |
| 14 | import requests |
| 15 | |
| 16 | from video_processor.sources.base import BaseSource, SourceFile |
| 17 | |
| 18 | logger = logging.getLogger(__name__) |
| 19 | |
# Default on-disk location for the cached OAuth token (shared across runs).
_TOKEN_PATH = Path.home() / ".planopticon" / "zoom_token.json"
# Zoom REST API root and OAuth endpoint base.
_BASE_URL = "https://api.zoom.us/v2"
_OAUTH_BASE = "https://zoom.us/oauth"

# Map Zoom file_type values to MIME types
_MIME_TYPES = {
    "MP4": "video/mp4",
    "M4A": "audio/mp4",
    "CHAT": "text/plain",
    "TRANSCRIPT": "text/vtt",
    "CSV": "text/csv",
    "TIMELINE": "application/json",
}
| 33 | |
| 34 | |
class ZoomSource(BaseSource):
    """
    Zoom cloud recordings source with OAuth2 support.

    Auth methods (tried in order):
    1. Saved token: Load from token_path, refresh if expired
    2. Server-to-Server OAuth: Uses account_id with client credentials
    3. OAuth2 Authorization Code with PKCE: Interactive browser flow
    """

    def __init__(
        self,
        client_id: Optional[str] = None,
        client_secret: Optional[str] = None,
        account_id: Optional[str] = None,
        token_path: Optional[Path] = None,
    ):
        """
        Initialize Zoom source.

        Parameters
        ----------
        client_id : str, optional
            Zoom OAuth app client ID. Falls back to ZOOM_CLIENT_ID env var.
        client_secret : str, optional
            Zoom OAuth app client secret. Falls back to ZOOM_CLIENT_SECRET env var.
        account_id : str, optional
            Zoom account ID for Server-to-Server OAuth. Falls back to ZOOM_ACCOUNT_ID env var.
        token_path : Path, optional
            Where to store/load OAuth tokens.
        """
        self.client_id = client_id or os.environ.get("ZOOM_CLIENT_ID")
        self.client_secret = client_secret or os.environ.get("ZOOM_CLIENT_SECRET")
        self.account_id = account_id or os.environ.get("ZOOM_ACCOUNT_ID")
        self.token_path = token_path or _TOKEN_PATH
        self._access_token: Optional[str] = None
        self._token_data: Optional[Dict] = None

    def authenticate(self) -> bool:
        """Authenticate with Zoom, trying saved token, then S2S OAuth, then PKCE."""
        # 1. Reuse (or refresh) a previously saved token.
        if self.token_path.exists() and self._auth_saved_token():
            return True

        # 2. Non-interactive Server-to-Server OAuth when an account is configured.
        if self.account_id:
            return self._auth_server_to_server()

        # 3. Interactive OAuth2 Authorization Code flow with PKCE.
        return self._auth_oauth_pkce()

    def _auth_saved_token(self) -> bool:
        """Authenticate from the saved token file, refreshing if expired."""
        try:
            data = json.loads(self.token_path.read_text())

            if time.time() < data.get("expires_at", 0):
                # Token still valid.
                self._access_token = data["access_token"]
                self._token_data = data
                logger.info("Authenticated with Zoom via saved token")
                return True

            # Expired: refresh when possible. Server-to-Server tokens carry no
            # refresh token, so authenticate() falls through and re-auths.
            if data.get("refresh_token"):
                return self._refresh_token()
            return False
        except Exception:
            # Corrupt/unreadable token file: fall back to a fresh auth flow.
            return False

    def _auth_server_to_server(self) -> bool:
        """Authenticate using Server-to-Server OAuth (account credentials)."""
        if not self.client_id or not self.client_secret:
            logger.error(
                "Zoom client_id and client_secret required for Server-to-Server OAuth. "
                "Set ZOOM_CLIENT_ID and ZOOM_CLIENT_SECRET env vars."
            )
            return False

        try:
            resp = requests.post(
                f"{_OAUTH_BASE}/token",
                params={
                    "grant_type": "account_credentials",
                    "account_id": self.account_id,
                },
                auth=(self.client_id, self.client_secret),
                timeout=30,
            )
            resp.raise_for_status()
            token_data = resp.json()

            self._access_token = token_data["access_token"]
            # Expire 60s early so we never use a token at the edge of expiry.
            self._token_data = {
                "access_token": token_data["access_token"],
                "expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
                "token_type": token_data.get("token_type", "bearer"),
            }

            self._save_token(self._token_data)
            logger.info("Authenticated with Zoom via Server-to-Server OAuth")
            return True
        except Exception as e:
            logger.error(f"Zoom Server-to-Server OAuth failed: {e}")
            return False

    def _auth_oauth_pkce(self) -> bool:
        """Run the interactive OAuth2 Authorization Code flow with PKCE."""
        if not self.client_id:
            logger.error("Zoom client_id required for OAuth. Set ZOOM_CLIENT_ID env var.")
            return False

        try:
            # PKCE: the challenge is the base64url-encoded SHA-256 of the verifier.
            code_verifier = secrets.token_urlsafe(64)
            code_challenge = (
                base64.urlsafe_b64encode(hashlib.sha256(code_verifier.encode("ascii")).digest())
                .rstrip(b"=")
                .decode("ascii")
            )

            authorize_url = (
                f"{_OAUTH_BASE}/authorize"
                f"?response_type=code"
                f"&client_id={self.client_id}"
                f"&redirect_uri=urn:ietf:wg:oauth:2.0:oob"
                f"&code_challenge={code_challenge}"
                f"&code_challenge_method=S256"
            )

            print(f"\nOpen this URL to authorize PlanOpticon:\n{authorize_url}\n")

            try:
                webbrowser.open(authorize_url)
            except Exception:
                pass  # headless environment: the user can open the printed URL

            auth_code = input("Enter the authorization code: ").strip()

            # Exchange authorization code for tokens
            payload = {
                "grant_type": "authorization_code",
                "code": auth_code,
                "redirect_uri": "urn:ietf:wg:oauth:2.0:oob",
                "code_verifier": code_verifier,
            }

            resp = requests.post(
                f"{_OAUTH_BASE}/token",
                data=payload,
                auth=(self.client_id, self.client_secret or ""),
                timeout=30,
            )
            resp.raise_for_status()
            token_data = resp.json()

            self._access_token = token_data["access_token"]
            # Client credentials are persisted so _refresh_token() can run
            # later without re-prompting; _save_token() restricts permissions.
            self._token_data = {
                "access_token": token_data["access_token"],
                "refresh_token": token_data.get("refresh_token"),
                "expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
                "token_type": token_data.get("token_type", "bearer"),
                "client_id": self.client_id,
                "client_secret": self.client_secret or "",
            }

            self._save_token(self._token_data)
            logger.info("Authenticated with Zoom via OAuth PKCE")
            return True
        except Exception as e:
            logger.error(f"Zoom OAuth PKCE failed: {e}")
            return False

    def _refresh_token(self) -> bool:
        """Refresh an expired OAuth token using the saved refresh token."""
        try:
            data = json.loads(self.token_path.read_text())
            refresh_token = data.get("refresh_token")
            # Prefer the credentials the token was issued under.
            client_id = data.get("client_id") or self.client_id
            client_secret = data.get("client_secret") or self.client_secret

            if not refresh_token or not client_id:
                return False

            resp = requests.post(
                f"{_OAUTH_BASE}/token",
                data={
                    "grant_type": "refresh_token",
                    "refresh_token": refresh_token,
                },
                auth=(client_id, client_secret or ""),
                timeout=30,
            )
            resp.raise_for_status()
            token_data = resp.json()

            self._access_token = token_data["access_token"]
            # Zoom may rotate the refresh token; keep the old one when it doesn't.
            self._token_data = {
                "access_token": token_data["access_token"],
                "refresh_token": token_data.get("refresh_token", refresh_token),
                "expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
                "token_type": token_data.get("token_type", "bearer"),
                "client_id": client_id,
                "client_secret": client_secret or "",
            }

            self._save_token(self._token_data)
            logger.info("Refreshed Zoom OAuth token")
            return True
        except Exception as e:
            logger.error(f"Zoom token refresh failed: {e}")
            return False

    def _save_token(self, data: Dict) -> None:
        """Persist token data as JSON, restricted to owner read/write.

        The file can contain a refresh token and client secret, so permissions
        are tightened to 0o600 where the platform allows.
        """
        self.token_path.parent.mkdir(parents=True, exist_ok=True)
        self.token_path.write_text(json.dumps(data))
        try:
            # Best effort: some filesystems don't honor POSIX permissions.
            os.chmod(self.token_path, 0o600)
        except OSError:
            pass
        logger.info(f"OAuth token saved to {self.token_path}")

    def _api_get(self, endpoint: str, params: Optional[Dict] = None) -> requests.Response:
        """Issue an authenticated GET against the Zoom v2 REST API.

        Raises
        ------
        RuntimeError
            If called before a successful authenticate().
        requests.HTTPError
            If Zoom responds with a non-2xx status.
        """
        if not self._access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        url = f"{_BASE_URL}/{endpoint.lstrip('/')}"
        resp = requests.get(
            url,
            headers={"Authorization": f"Bearer {self._access_token}"},
            params=params,
            timeout=30,
        )
        resp.raise_for_status()
        return resp

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List recording files from the user's Zoom cloud recordings.

        ``folder_id``/``folder_path`` exist for interface parity and are
        ignored (Zoom has no folder hierarchy). ``patterns`` are glob-style
        filename patterns (e.g. "*.mp4").
        """
        from fnmatch import fnmatchcase

        if not self._access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        files: List[SourceFile] = []
        next_page_token = ""

        while True:
            params: Dict = {}
            if next_page_token:
                params["next_page_token"] = next_page_token

            resp = self._api_get("users/me/recordings", params=params)
            data = resp.json()

            for meeting in data.get("meetings", []):
                meeting_id = str(meeting.get("id", ""))
                topic = meeting.get("topic", "Untitled Meeting")
                start_time = meeting.get("start_time")

                for rec_file in meeting.get("recording_files", []):
                    file_type = rec_file.get("file_type", "")
                    mime_type = _MIME_TYPES.get(file_type)

                    # Build a descriptive name from the meeting topic.
                    file_ext = rec_file.get("file_extension", file_type).lower()
                    file_name = f"{topic}.{file_ext}"

                    # Proper glob matching instead of the old endswith()
                    # approximation, which broke on trailing-wildcard patterns.
                    if patterns and not any(fnmatchcase(file_name, p) for p in patterns):
                        continue

                    files.append(
                        SourceFile(
                            name=file_name,
                            # NOTE(review): id is the *meeting* id, shared by
                            # all files of one meeting — confirm callers don't
                            # assume uniqueness.
                            id=meeting_id,
                            size_bytes=rec_file.get("file_size"),
                            mime_type=mime_type,
                            modified_at=start_time,
                            path=rec_file.get("download_url"),
                        )
                    )

            next_page_token = data.get("next_page_token", "")
            if not next_page_token:
                break

        logger.info(f"Found {len(files)} recordings in Zoom")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a recording file from Zoom to ``destination``.

        Raises RuntimeError when unauthenticated and ValueError when the
        SourceFile carries no download URL.
        """
        if not self._access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        download_url = file.path
        if not download_url:
            raise ValueError(f"No download URL for file: {file.name}")

        # Context-manage the streamed response so the connection is always
        # released (previously it leaked on mid-stream errors).
        with requests.get(
            download_url,
            headers={"Authorization": f"Bearer {self._access_token}"},
            stream=True,
            timeout=60,
        ) as resp:
            resp.raise_for_status()
            with open(destination, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)

        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def fetch_transcript(self, meeting_id: str) -> Optional[str]:
        """
        Fetch the transcript (VTT) for a Zoom meeting recording.

        Looks for transcript files in the recording's file list and downloads
        the content as text.

        Parameters
        ----------
        meeting_id : str
            The Zoom meeting ID.

        Returns
        -------
        str or None
            Transcript text if available, None otherwise.
        """
        if not self._access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        try:
            resp = self._api_get(f"meetings/{meeting_id}/recordings")
            data = resp.json()

            for rec_file in data.get("recording_files", []):
                file_type = rec_file.get("file_type", "")
                if file_type == "TRANSCRIPT":
                    download_url = rec_file.get("download_url")
                    if download_url:
                        dl_resp = requests.get(
                            download_url,
                            headers={"Authorization": f"Bearer {self._access_token}"},
                            timeout=30,
                        )
                        dl_resp.raise_for_status()
                        logger.info(f"Fetched transcript for meeting {meeting_id}")
                        return dl_resp.text

            logger.info(f"No transcript found for meeting {meeting_id}")
            return None
        except Exception as e:
            logger.error(f"Failed to fetch transcript for meeting {meeting_id}: {e}")
            return None
| --- a/video_processor/utils/callbacks.py | ||
| +++ b/video_processor/utils/callbacks.py | ||
| @@ -0,0 +1,57 @@ | ||
| 1 | +"""Callback implementations for pipeline progress reporting.""" | |
| 2 | + | |
| 3 | +import json | |
| 4 | +import logging | |
| 5 | +from typing import Optional | |
| 6 | + | |
| 7 | +logger = logging.getLogger(__name__) | |
| 8 | + | |
| 9 | + | |
class WebhookCallback:
    """Posts pipeline progress as JSON to a webhook URL.

    Delivery is best-effort: network failures are logged at WARNING level
    and never interrupt the pipeline.
    """

    def __init__(self, url: str, timeout: float = 10.0, headers: Optional[dict] = None):
        """
        Parameters
        ----------
        url : str
            Webhook endpoint that accepts POSTed JSON.
        timeout : float
            Per-request timeout in seconds.
        headers : dict, optional
            Request headers; defaults to a JSON content type.
        """
        self.url = url
        self.timeout = timeout
        self.headers = headers or {"Content-Type": "application/json"}

    def _post(self, payload: dict) -> None:
        """POST JSON payload to the webhook URL. Failures are logged, not raised."""
        try:
            import urllib.request

            data = json.dumps(payload).encode("utf-8")
            req = urllib.request.Request(self.url, data=data, headers=self.headers, method="POST")
            # Close the response explicitly: the bare urlopen() call leaked
            # the HTTP connection.
            with urllib.request.urlopen(req, timeout=self.timeout):
                pass
        except Exception as e:
            logger.warning(f"Webhook POST failed: {e}")

    def on_step_start(self, step: str, index: int, total: int) -> None:
        """Report that pipeline step *step* (index of total) is starting."""
        self._post(
            {
                "event": "step_start",
                "step": step,
                "index": index,
                "total": total,
            }
        )

    def on_step_complete(self, step: str, index: int, total: int) -> None:
        """Report that pipeline step *step* (index of total) finished."""
        self._post(
            {
                "event": "step_complete",
                "step": step,
                "index": index,
                "total": total,
            }
        )

    def on_progress(self, step: str, percent: float, message: str = "") -> None:
        """Report fractional progress within *step*, with an optional message."""
        self._post(
            {
                "event": "progress",
                "step": step,
                "percent": percent,
                "message": message,
            }
        )
| --- a/video_processor/utils/callbacks.py | |
| +++ b/video_processor/utils/callbacks.py | |
| @@ -0,0 +1,57 @@ | |
| --- a/video_processor/utils/callbacks.py | |
| +++ b/video_processor/utils/callbacks.py | |
| @@ -0,0 +1,57 @@ | |
| 1 | """Callback implementations for pipeline progress reporting.""" |
| 2 | |
| 3 | import json |
| 4 | import logging |
| 5 | from typing import Optional |
| 6 | |
| 7 | logger = logging.getLogger(__name__) |
| 8 | |
| 9 | |
class WebhookCallback:
    """Posts pipeline progress as JSON to a webhook URL.

    Delivery is best-effort: network failures are logged at WARNING level
    and never interrupt the pipeline.
    """

    def __init__(self, url: str, timeout: float = 10.0, headers: Optional[dict] = None):
        """
        Parameters
        ----------
        url : str
            Webhook endpoint that accepts POSTed JSON.
        timeout : float
            Per-request timeout in seconds.
        headers : dict, optional
            Request headers; defaults to a JSON content type.
        """
        self.url = url
        self.timeout = timeout
        self.headers = headers or {"Content-Type": "application/json"}

    def _post(self, payload: dict) -> None:
        """POST JSON payload to the webhook URL. Failures are logged, not raised."""
        try:
            import urllib.request

            data = json.dumps(payload).encode("utf-8")
            req = urllib.request.Request(self.url, data=data, headers=self.headers, method="POST")
            # Close the response explicitly: the bare urlopen() call leaked
            # the HTTP connection.
            with urllib.request.urlopen(req, timeout=self.timeout):
                pass
        except Exception as e:
            logger.warning(f"Webhook POST failed: {e}")

    def on_step_start(self, step: str, index: int, total: int) -> None:
        """Report that pipeline step *step* (index of total) is starting."""
        self._post(
            {
                "event": "step_start",
                "step": step,
                "index": index,
                "total": total,
            }
        )

    def on_step_complete(self, step: str, index: int, total: int) -> None:
        """Report that pipeline step *step* (index of total) finished."""
        self._post(
            {
                "event": "step_complete",
                "step": step,
                "index": index,
                "total": total,
            }
        )

    def on_progress(self, step: str, percent: float, message: str = "") -> None:
        """Report fractional progress within *step*, with an optional message."""
        self._post(
            {
                "event": "progress",
                "step": step,
                "percent": percent,
                "message": message,
            }
        )
| --- video_processor/utils/visualization.py | ||
| +++ video_processor/utils/visualization.py | ||
| @@ -0,0 +1,200 @@ | ||
| 1 | +"""Graph visualization and analysis utilities using NetworkX.""" | |
| 2 | + | |
| 3 | +from typing import Dict, List, Optional | |
| 4 | + | |
| 5 | +try: | |
| 6 | + import networkx as nx | |
| 7 | +except ImportError: | |
| 8 | + nx = None | |
| 9 | + | |
| 10 | + | |
def _require_nx():
    """Raise ImportError with install guidance when networkx is unavailable."""
    if nx is not None:
        return
    raise ImportError(
        "networkx is required for graph visualization. Install it with: pip install networkx"
    )
| 16 | + | |
| 17 | + | |
def graph_to_networkx(kg_data: dict) -> "nx.DiGraph":
    """Convert knowledge graph dict (from to_dict()) to NetworkX directed graph.

    Nodes get attributes: type, descriptions, source, occurrences
    Edges get attributes: type, content_source, timestamp
    """
    _require_nx()
    graph = nx.DiGraph()

    for entry in kg_data.get("nodes", []):
        label = entry.get("name", entry.get("id", ""))
        if not label:
            continue  # unnamed nodes cannot be addressed; drop them
        graph.add_node(
            label,
            type=entry.get("type", "concept"),
            descriptions=entry.get("descriptions", []),
            source=entry.get("source"),
            occurrences=entry.get("occurrences", []),
        )

    for edge in kg_data.get("relationships", []):
        endpoints = (edge.get("source", ""), edge.get("target", ""))
        if not all(endpoints):
            continue  # skip dangling relationships
        graph.add_edge(
            *endpoints,
            type=edge.get("type", "related_to"),
            content_source=edge.get("content_source"),
            timestamp=edge.get("timestamp"),
        )

    return graph
| 53 | + | |
| 54 | + | |
def compute_graph_stats(G: "nx.DiGraph") -> dict:
    """Return graph statistics.

    Keys: node_count, edge_count, density, connected_components,
    type_breakdown, top_entities (by degree, top 10).
    """
    # Components are only well-defined on the undirected view.
    components = nx.number_connected_components(G.to_undirected()) if len(G) > 0 else 0

    type_breakdown: Dict[str, int] = {}
    for _, attrs in G.nodes(data=True):
        kind = attrs.get("type", "concept")
        type_breakdown[kind] = type_breakdown.get(kind, 0) + 1

    by_degree = sorted(G.degree(), key=lambda pair: pair[1], reverse=True)

    return {
        "node_count": G.number_of_nodes(),
        "edge_count": G.number_of_edges(),
        "density": nx.density(G),
        "connected_components": components,
        "type_breakdown": type_breakdown,
        "top_entities": [{"name": n, "degree": d} for n, d in by_degree[:10]],
    }
| 80 | + | |
| 81 | + | |
def filter_graph(
    G: "nx.DiGraph",
    entity_types: Optional[List[str]] = None,
    min_degree: Optional[int] = None,
) -> "nx.DiGraph":
    """Return subgraph filtered by entity type list and/or minimum degree."""
    keep = set(G.nodes())

    if entity_types is not None:
        wanted = set(entity_types)
        keep = {node for node in keep if G.nodes[node].get("type", "concept") in wanted}

    if min_degree is not None:
        keep = {node for node in keep if G.degree(node) >= min_degree}

    # copy() detaches the result from the original graph's data.
    return G.subgraph(keep).copy()
| 98 | + | |
| 99 | + | |
| 100 | +def _sanitize_id(name: str) -> str: | |
| 101 | + """Create a Mermaid-safe identifier from a node name.""" | |
| 102 | + return "".join(c if c.isalnum() or c == "_" else "_" for c in name) | |
| 103 | + | |
| 104 | + | |
def generate_mermaid(G: "nx.DiGraph", max_nodes: int = 30, layout: str = "LR") -> str:
    """Generate Mermaid diagram source from a NetworkX graph.

    Parameters
    ----------
    G : nx.DiGraph
        Graph whose nodes carry a "type" attribute used for styling.
    max_nodes : int
        Maximum number of nodes to render, chosen by descending degree.
    layout : str
        Mermaid graph direction (e.g. "LR", "TD").

    Returns
    -------
    str
        Mermaid "graph" source text.
    """
    degree_sorted = sorted(G.degree(), key=lambda x: x[1], reverse=True)
    # Keep an ordered list (not only a set): iterating a set made node-line
    # order vary between runs, producing nondeterministic diagrams.
    selected = [name for name, _ in degree_sorted[:max_nodes]]
    top_nodes = set(selected)

    lines = [f"graph {layout}"]

    for name in selected:
        data = G.nodes[name]
        ntype = data.get("type", "concept")
        safe_id = _sanitize_id(name)
        safe_name = name.replace('"', "'")  # double quotes break Mermaid labels
        lines.append(f'    {safe_id}["{safe_name}"]:::{ntype}')

    # De-duplicate parallel edges with the same relationship type.
    added = set()
    for src, tgt, data in G.edges(data=True):
        if src in top_nodes and tgt in top_nodes:
            rtype = data.get("type", "related_to")
            key = (src, tgt, rtype)
            if key not in added:
                lines.append(f'    {_sanitize_id(src)} -- "{rtype}" --> {_sanitize_id(tgt)}')
                added.add(key)

    # One classDef per known entity type; unknown types render unstyled.
    lines.append("    classDef person fill:#f9d5e5,stroke:#333,stroke-width:1px")
    lines.append("    classDef concept fill:#eeeeee,stroke:#333,stroke-width:1px")
    lines.append("    classDef technology fill:#d5e5f9,stroke:#333,stroke-width:1px")
    lines.append("    classDef organization fill:#f9f5d5,stroke:#333,stroke-width:1px")
    lines.append("    classDef diagram fill:#d5f9e5,stroke:#333,stroke-width:1px")
    lines.append("    classDef time fill:#e5d5f9,stroke:#333,stroke-width:1px")

    return "\n".join(lines)
| 139 | + | |
| 140 | + | |
| 141 | +def graph_to_d3_json(G: "nx.DiGraph") -> dict: | |
| 142 | + """Export to D3-compatible format. | |
| 143 | + | |
| 144 | + Returns {"nodes": [{"id": ..., "group": ...}], "links": [...]}. | |
| 145 | + """ | |
| 146 | + nodes = [] | |
| 147 | + for name, data in G.nodes(data=True): | |
| 148 | + nodes.append( | |
| 149 | + { | |
| 150 | + "id": name, | |
| 151 | + "group": data.get("type", "concept"), | |
| 152 | + "descriptions": data.get("descriptions", []), | |
| 153 | + } | |
| 154 | + ) | |
| 155 | + | |
| 156 | + links = [] | |
| 157 | + for src, tgt, data in G.edges(data=True): | |
| 158 | + links.append( | |
| 159 | + { | |
| 160 | + "source": src, | |
| 161 | + "target": tgt, | |
| 162 | + "type": data.get("type", "related_to"), | |
| 163 | + } | |
| 164 | + ) | |
| 165 | + | |
| 166 | + return {"nodes": nodes, "links": links} | |
| 167 | + | |
| 168 | + | |
| 169 | +def graph_to_dot(G: "nx.DiGraph") -> str: | |
| 170 | + """Export to Graphviz DOT format.""" | |
| 171 | + lines = ["digraph KnowledgeGraph {"] | |
| 172 | + lines.append(" rankdir=LR;") | |
| 173 | + lines.append(' node [shape=box, style="rounded,filled", fontname="Helvetica"];') | |
| 174 | + lines.append("") | |
| 175 | + | |
| 176 | + type_colors = { | |
| 177 | + "person": "#f9d5e5", | |
| 178 | + "concept": "#eeeeee", | |
| 179 | + "technology": "#d5e5f9", | |
| 180 | + "organization": "#f9f5d5", | |
| 181 | + "diagram": "#d5f9e5", | |
| 182 | + "time": "#e5d5f9", | |
| 183 | + } | |
| 184 | + | |
| 185 | + for name, data in G.nodes(data=True): | |
| 186 | + ntype = data.get("type", "concept") | |
| 187 | + color = type_colors.get(ntype, "#eeeeee") | |
| 188 | + escaped = name.replace('"', '\\"') | |
| 189 | + lines.append(f' "{escaped}" [fillcolor="{color}", label="{escaped}"];') | |
| 190 | + | |
| 191 | + lines.append("") | |
| 192 | + for src, tgt, data in G.edges(data=True): | |
| 193 | + rtype = data.get("type", "related_to") | |
| 194 | + escaped_src = src.replace('"', '\\"') | |
| 195 | + escaped_tgt = tgt.replace('"', '\\"') | |
| 196 | + escaped_type = rtype.replace('"', '\\"') | |
| 197 | + lines.append(f' "{escaped_src}" -> "{escaped_tgt}" [label="{escaped_type}"];') | |
| 198 | + | |
| 199 | + lines.append("}") | |
| 200 | + return "\n".join(lines) | |
| 0 | 201 |
| --- video_processor/utils/visualization.py | |
| +++ video_processor/utils/visualization.py | |
| @@ -0,0 +1,200 @@ | |
| 0 |
| --- video_processor/utils/visualization.py | |
| +++ video_processor/utils/visualization.py | |
| @@ -0,0 +1,200 @@ | |
| 1 | """Graph visualization and analysis utilities using NetworkX.""" |
| 2 | |
| 3 | from typing import Dict, List, Optional |
| 4 | |
| 5 | try: |
| 6 | import networkx as nx |
| 7 | except ImportError: |
| 8 | nx = None |
| 9 | |
| 10 | |
def _require_nx():
    """Raise a helpful ImportError if networkx is not installed."""
    if nx is not None:
        return
    raise ImportError(
        "networkx is required for graph visualization. Install it with: pip install networkx"
    )
| 16 | |
| 17 | |
def graph_to_networkx(kg_data: dict) -> "nx.DiGraph":
    """Build a NetworkX directed graph from a knowledge-graph dict (to_dict() output).

    Node attributes: type, descriptions, source, occurrences.
    Edge attributes: type, content_source, timestamp.
    Entries without a usable name (nodes) or with a missing endpoint
    (relationships) are silently skipped.
    """
    _require_nx()
    graph = nx.DiGraph()

    for entry in kg_data.get("nodes", []):
        # Prefer "name"; fall back to "id" for older payload shapes.
        label = entry.get("name", entry.get("id", ""))
        if not label:
            continue
        graph.add_node(
            label,
            type=entry.get("type", "concept"),
            descriptions=entry.get("descriptions", []),
            source=entry.get("source"),
            occurrences=entry.get("occurrences", []),
        )

    for edge in kg_data.get("relationships", []):
        src_name = edge.get("source", "")
        tgt_name = edge.get("target", "")
        if not (src_name and tgt_name):
            continue
        graph.add_edge(
            src_name,
            tgt_name,
            type=edge.get("type", "related_to"),
            content_source=edge.get("content_source"),
            timestamp=edge.get("timestamp"),
        )

    return graph
| 53 | |
| 54 | |
def compute_graph_stats(G: "nx.DiGraph") -> dict:
    """Return graph statistics.

    Keys: node_count, edge_count, density, connected_components,
    type_breakdown, top_entities (by degree, top 10).
    """
    # Components are counted on the undirected view; zero for an empty graph.
    component_count = nx.number_connected_components(G.to_undirected()) if len(G) > 0 else 0

    breakdown: Dict[str, int] = {}
    for _, attrs in G.nodes(data=True):
        kind = attrs.get("type", "concept")
        breakdown[kind] = breakdown.get(kind, 0) + 1

    by_degree = sorted(G.degree(), key=lambda pair: pair[1], reverse=True)

    return {
        "node_count": G.number_of_nodes(),
        "edge_count": G.number_of_edges(),
        "density": nx.density(G),
        "connected_components": component_count,
        "type_breakdown": breakdown,
        "top_entities": [{"name": label, "degree": deg} for label, deg in by_degree[:10]],
    }
| 80 | |
| 81 | |
def filter_graph(
    G: "nx.DiGraph",
    entity_types: Optional[List[str]] = None,
    min_degree: Optional[int] = None,
) -> "nx.DiGraph":
    """Return subgraph filtered by entity type list and/or minimum degree.

    NOTE: degrees are evaluated on the original graph, not the type-filtered
    one, so a node kept by the type filter may still pass min_degree via
    neighbors that were filtered out.
    """
    keep = set(G.nodes())

    if entity_types is not None:
        wanted = set(entity_types)
        keep = {node for node in keep if G.nodes[node].get("type", "concept") in wanted}

    if min_degree is not None:
        keep = {node for node in keep if G.degree(node) >= min_degree}

    # copy() detaches the result from the original graph's node/edge views.
    return G.subgraph(keep).copy()
| 98 | |
| 99 | |
| 100 | def _sanitize_id(name: str) -> str: |
| 101 | """Create a Mermaid-safe identifier from a node name.""" |
| 102 | return "".join(c if c.isalnum() or c == "_" else "_" for c in name) |
| 103 | |
| 104 | |
def generate_mermaid(G: "nx.DiGraph", max_nodes: int = 30, layout: str = "LR") -> str:
    """Generate Mermaid diagram text from a NetworkX graph.

    Keeps the top ``max_nodes`` nodes by degree and emits one node line per
    kept node plus one edge line per unique (source, target, type) triple
    between kept nodes. ``layout`` is the Mermaid direction (LR, TD, ...).

    Fix: nodes were previously iterated from a set, so the emitted line order
    varied across interpreter runs (string hash randomization). Iterating the
    degree-sorted list makes the output deterministic; diagram semantics are
    unchanged.
    """
    degree_sorted = sorted(G.degree(), key=lambda x: x[1], reverse=True)
    # Ordered list for deterministic output; set for O(1) membership tests.
    top_order = [name for name, _ in degree_sorted[:max_nodes]]
    top_nodes = set(top_order)

    lines = [f"graph {layout}"]

    for name in top_order:
        data = G.nodes[name]
        # :::<type> attaches the classDef declared below; an unknown type
        # simply has no matching classDef and renders with default styling.
        ntype = data.get("type", "concept")
        safe_id = _sanitize_id(name)
        safe_name = name.replace('"', "'")
        lines.append(f' {safe_id}["{safe_name}"]:::{ntype}')

    added = set()
    for src, tgt, data in G.edges(data=True):
        if src in top_nodes and tgt in top_nodes:
            # NOTE(review): rtype is embedded unescaped in the edge label;
            # a relationship type containing a double quote would break the
            # diagram — confirm upstream types are plain identifiers.
            rtype = data.get("type", "related_to")
            key = (src, tgt, rtype)
            if key not in added:
                lines.append(f' {_sanitize_id(src)} -- "{rtype}" --> {_sanitize_id(tgt)}')
                added.add(key)

    lines.append(" classDef person fill:#f9d5e5,stroke:#333,stroke-width:1px")
    lines.append(" classDef concept fill:#eeeeee,stroke:#333,stroke-width:1px")
    lines.append(" classDef technology fill:#d5e5f9,stroke:#333,stroke-width:1px")
    lines.append(" classDef organization fill:#f9f5d5,stroke:#333,stroke-width:1px")
    lines.append(" classDef diagram fill:#d5f9e5,stroke:#333,stroke-width:1px")
    lines.append(" classDef time fill:#e5d5f9,stroke:#333,stroke-width:1px")

    return "\n".join(lines)
| 139 | |
| 140 | |
def graph_to_d3_json(G: "nx.DiGraph") -> dict:
    """Export to D3-compatible format.

    Returns {"nodes": [{"id": ..., "group": ..., "descriptions": [...]}],
    "links": [{"source": ..., "target": ..., "type": ...}]}.
    """
    node_records = [
        {
            "id": name,
            "group": attrs.get("type", "concept"),
            "descriptions": attrs.get("descriptions", []),
        }
        for name, attrs in G.nodes(data=True)
    ]

    link_records = [
        {
            "source": src,
            "target": tgt,
            "type": attrs.get("type", "related_to"),
        }
        for src, tgt, attrs in G.edges(data=True)
    ]

    return {"nodes": node_records, "links": link_records}
| 167 | |
| 168 | |
def graph_to_dot(G: "nx.DiGraph") -> str:
    """Export to Graphviz DOT format.

    Nodes are filled boxes colored by entity type; edges are labeled with the
    relationship type. Unknown types fall back to the "concept" color.
    """
    palette = {
        "person": "#f9d5e5",
        "concept": "#eeeeee",
        "technology": "#d5e5f9",
        "organization": "#f9f5d5",
        "diagram": "#d5f9e5",
        "time": "#e5d5f9",
    }

    def esc(text: str) -> str:
        # DOT double-quoted strings require embedded quotes to be backslash-escaped.
        return text.replace('"', '\\"')

    out = [
        "digraph KnowledgeGraph {",
        " rankdir=LR;",
        ' node [shape=box, style="rounded,filled", fontname="Helvetica"];',
        "",
    ]

    for name, attrs in G.nodes(data=True):
        color = palette.get(attrs.get("type", "concept"), "#eeeeee")
        out.append(f' "{esc(name)}" [fillcolor="{color}", label="{esc(name)}"];')

    out.append("")
    for src, tgt, attrs in G.edges(data=True):
        label = esc(attrs.get("type", "related_to"))
        out.append(f' "{esc(src)}" -> "{esc(tgt)}" [label="{label}"];')

    out.append("}")
    return "\n".join(out)
| 201 |