PlanOpticon
feat: add notes connectors (in/out), wiki generator, and meeting sources Input connectors: - ObsidianSource: vault ingester with frontmatter, wiki-links, tags parsing - LogseqSource: graph ingester with page properties, block refs - NotionSource: API connector for pages and databases - AppleNotesSource: macOS osascript connector - GoogleKeepSource: via gws CLI with checklist support - OneNoteSource: via m365 CLI with notebook/section/page hierarchy Output skills: - WikiGeneratorSkill: generates GitHub wiki from KG (Home, Sidebar, entity pages) - NotesExportSkill: exports KG as Obsidian vault or Notion-compatible markdown CLI commands: - planopticon wiki {generate,push} 52 new tests (728 total)
Commit
855b7fbcf90ec4416cfdeeb72445dd82e03b6c6af67baf07a5d658ae5fd2658d
Parent
7d809ed2e1f41ac…
13 files changed
+222
+482
+2
+420
+315
+61
+16
-4
+178
+170
+200
+380
+178
+222
~
tests/test_agent_skills.py
~
tests/test_sources.py
~
video_processor/agent/skills/__init__.py
+
video_processor/agent/skills/notes_export.py
+
video_processor/agent/skills/wiki_generator.py
~
video_processor/cli/commands.py
~
video_processor/sources/__init__.py
+
video_processor/sources/apple_notes_source.py
+
video_processor/sources/google_keep_source.py
+
video_processor/sources/logseq_source.py
+
video_processor/sources/notion_source.py
+
video_processor/sources/obsidian_source.py
+
video_processor/sources/onenote_source.py
+222
| --- tests/test_agent_skills.py | ||
| +++ tests/test_agent_skills.py | ||
| @@ -401,5 +401,227 @@ | ||
| 401 | 401 | |
| 402 | 402 | skill = ProjectPlanSkill() |
| 403 | 403 | ctx = _make_context() |
| 404 | 404 | ctx.provider_manager = None |
| 405 | 405 | assert skill.can_execute(ctx) is False |
| 406 | + | |
| 407 | + | |
| 408 | +# --------------------------------------------------------------------------- | |
| 409 | +# WikiGeneratorSkill | |
| 410 | +# --------------------------------------------------------------------------- | |
| 411 | + | |
| 412 | + | |
| 413 | +class TestWikiGeneratorSkill: | |
| 414 | + def _sample_kg_data(self): | |
| 415 | + return { | |
| 416 | + "nodes": [ | |
| 417 | + { | |
| 418 | + "name": "Python", | |
| 419 | + "type": "technology", | |
| 420 | + "descriptions": ["A programming language"], | |
| 421 | + }, | |
| 422 | + { | |
| 423 | + "name": "Alice", | |
| 424 | + "type": "person", | |
| 425 | + "descriptions": ["Lead developer"], | |
| 426 | + }, | |
| 427 | + { | |
| 428 | + "name": "FastAPI", | |
| 429 | + "type": "technology", | |
| 430 | + "descriptions": ["Web framework"], | |
| 431 | + }, | |
| 432 | + ], | |
| 433 | + "relationships": [ | |
| 434 | + {"source": "Alice", "target": "Python", "type": "uses"}, | |
| 435 | + {"source": "FastAPI", "target": "Python", "type": "built_with"}, | |
| 436 | + ], | |
| 437 | + } | |
| 438 | + | |
| 439 | + def test_generate_wiki(self): | |
| 440 | + from video_processor.agent.skills.wiki_generator import generate_wiki | |
| 441 | + | |
| 442 | + pages = generate_wiki(self._sample_kg_data(), title="Test Wiki") | |
| 443 | + | |
| 444 | + assert "Home" in pages | |
| 445 | + assert "_Sidebar" in pages | |
| 446 | + assert "Test Wiki" in pages["Home"] | |
| 447 | + assert "3" in pages["Home"] # 3 entities | |
| 448 | + assert "2" in pages["Home"] # 2 relationships | |
| 449 | + | |
| 450 | + # Entity pages should exist | |
| 451 | + assert "Python" in pages | |
| 452 | + assert "Alice" in pages | |
| 453 | + assert "FastAPI" in pages | |
| 454 | + | |
| 455 | + # Type index pages should exist | |
| 456 | + assert "Technology" in pages | |
| 457 | + assert "Person" in pages | |
| 458 | + | |
| 459 | + # Alice's page should reference Python | |
| 460 | + assert "Python" in pages["Alice"] | |
| 461 | + assert "uses" in pages["Alice"] | |
| 462 | + | |
| 463 | + def test_generate_wiki_with_artifacts(self): | |
| 464 | + from video_processor.agent.skills.wiki_generator import generate_wiki | |
| 465 | + | |
| 466 | + art = Artifact( | |
| 467 | + name="Project Plan", | |
| 468 | + content="# Plan\n\nDo the thing.", | |
| 469 | + artifact_type="project_plan", | |
| 470 | + format="markdown", | |
| 471 | + ) | |
| 472 | + pages = generate_wiki(self._sample_kg_data(), artifacts=[art]) | |
| 473 | + | |
| 474 | + assert "Project-Plan" in pages | |
| 475 | + assert "Do the thing." in pages["Project-Plan"] | |
| 476 | + assert "Planning Artifacts" in pages["Home"] | |
| 477 | + | |
| 478 | + def test_write_wiki(self, tmp_path): | |
| 479 | + from video_processor.agent.skills.wiki_generator import write_wiki | |
| 480 | + | |
| 481 | + pages = { | |
| 482 | + "Home": "# Home\n\nWelcome.", | |
| 483 | + "Page-One": "# Page One\n\nContent.", | |
| 484 | + } | |
| 485 | + paths = write_wiki(pages, tmp_path / "wiki") | |
| 486 | + | |
| 487 | + assert len(paths) == 2 | |
| 488 | + assert (tmp_path / "wiki" / "Home.md").exists() | |
| 489 | + assert (tmp_path / "wiki" / "Page-One.md").exists() | |
| 490 | + assert "Welcome." in (tmp_path / "wiki" / "Home.md").read_text() | |
| 491 | + | |
| 492 | + def test_sanitize_filename(self): | |
| 493 | + from video_processor.agent.skills.wiki_generator import _sanitize_filename | |
| 494 | + | |
| 495 | + assert _sanitize_filename("Hello World") == "Hello-World" | |
| 496 | + assert _sanitize_filename("path/to\\file") == "path-to-file" | |
| 497 | + assert _sanitize_filename("version.2") == "version-2" | |
| 498 | + | |
| 499 | + def test_wiki_link(self): | |
| 500 | + from video_processor.agent.skills.wiki_generator import _wiki_link | |
| 501 | + | |
| 502 | + result = _wiki_link("My Page") | |
| 503 | + assert result == "[My Page](My-Page)" | |
| 504 | + | |
| 505 | + result = _wiki_link("Simple") | |
| 506 | + assert result == "[Simple](Simple)" | |
| 507 | + | |
| 508 | + | |
| 509 | +# --------------------------------------------------------------------------- | |
| 510 | +# NotesExportSkill | |
| 511 | +# --------------------------------------------------------------------------- | |
| 512 | + | |
| 513 | + | |
| 514 | +class TestNotesExportSkill: | |
| 515 | + def _sample_kg_data(self): | |
| 516 | + return { | |
| 517 | + "nodes": [ | |
| 518 | + { | |
| 519 | + "name": "Python", | |
| 520 | + "type": "technology", | |
| 521 | + "descriptions": ["A programming language"], | |
| 522 | + }, | |
| 523 | + { | |
| 524 | + "name": "Alice", | |
| 525 | + "type": "person", | |
| 526 | + "descriptions": ["Lead developer"], | |
| 527 | + }, | |
| 528 | + ], | |
| 529 | + "relationships": [ | |
| 530 | + {"source": "Alice", "target": "Python", "type": "uses"}, | |
| 531 | + ], | |
| 532 | + } | |
| 533 | + | |
| 534 | + def test_export_to_obsidian(self, tmp_path): | |
| 535 | + from video_processor.agent.skills.notes_export import export_to_obsidian | |
| 536 | + | |
| 537 | + output_dir = tmp_path / "obsidian_vault" | |
| 538 | + export_to_obsidian(self._sample_kg_data(), output_dir) | |
| 539 | + | |
| 540 | + assert output_dir.is_dir() | |
| 541 | + | |
| 542 | + # Check entity files exist | |
| 543 | + python_file = output_dir / "Python.md" | |
| 544 | + alice_file = output_dir / "Alice.md" | |
| 545 | + assert python_file.exists() | |
| 546 | + assert alice_file.exists() | |
| 547 | + | |
| 548 | + # Check frontmatter in entity file | |
| 549 | + python_content = python_file.read_text() | |
| 550 | + assert "---" in python_content | |
| 551 | + assert "type: technology" in python_content | |
| 552 | + assert "# Python" in python_content | |
| 553 | + | |
| 554 | + # Check wiki-links in Alice file | |
| 555 | + alice_content = alice_file.read_text() | |
| 556 | + assert "[[Python]]" in alice_content | |
| 557 | + assert "uses" in alice_content | |
| 558 | + | |
| 559 | + # Check index file | |
| 560 | + index_file = output_dir / "_Index.md" | |
| 561 | + assert index_file.exists() | |
| 562 | + index_content = index_file.read_text() | |
| 563 | + assert "[[Python]]" in index_content | |
| 564 | + assert "[[Alice]]" in index_content | |
| 565 | + | |
| 566 | + def test_export_to_obsidian_with_artifacts(self, tmp_path): | |
| 567 | + from video_processor.agent.skills.notes_export import export_to_obsidian | |
| 568 | + | |
| 569 | + art = Artifact( | |
| 570 | + name="Test Plan", | |
| 571 | + content="# Plan\n\nSteps here.", | |
| 572 | + artifact_type="project_plan", | |
| 573 | + format="markdown", | |
| 574 | + ) | |
| 575 | + output_dir = tmp_path / "obsidian_arts" | |
| 576 | + export_to_obsidian(self._sample_kg_data(), output_dir, artifacts=[art]) | |
| 577 | + | |
| 578 | + art_file = output_dir / "Test Plan.md" | |
| 579 | + assert art_file.exists() | |
| 580 | + art_content = art_file.read_text() | |
| 581 | + assert "artifact" in art_content | |
| 582 | + assert "Steps here." in art_content | |
| 583 | + | |
| 584 | + def test_export_to_notion_md(self, tmp_path): | |
| 585 | + from video_processor.agent.skills.notes_export import export_to_notion_md | |
| 586 | + | |
| 587 | + output_dir = tmp_path / "notion_export" | |
| 588 | + export_to_notion_md(self._sample_kg_data(), output_dir) | |
| 589 | + | |
| 590 | + assert output_dir.is_dir() | |
| 591 | + | |
| 592 | + # Check CSV database file | |
| 593 | + csv_file = output_dir / "entities_database.csv" | |
| 594 | + assert csv_file.exists() | |
| 595 | + csv_content = csv_file.read_text() | |
| 596 | + assert "Name" in csv_content | |
| 597 | + assert "Type" in csv_content | |
| 598 | + assert "Python" in csv_content | |
| 599 | + assert "Alice" in csv_content | |
| 600 | + | |
| 601 | + # Check entity markdown files | |
| 602 | + python_file = output_dir / "Python.md" | |
| 603 | + assert python_file.exists() | |
| 604 | + python_content = python_file.read_text() | |
| 605 | + assert "# Python" in python_content | |
| 606 | + assert "technology" in python_content | |
| 607 | + | |
| 608 | + # Check overview file | |
| 609 | + overview_file = output_dir / "Overview.md" | |
| 610 | + assert overview_file.exists() | |
| 611 | + | |
| 612 | + def test_export_to_notion_md_with_artifacts(self, tmp_path): | |
| 613 | + from video_processor.agent.skills.notes_export import export_to_notion_md | |
| 614 | + | |
| 615 | + art = Artifact( | |
| 616 | + name="Roadmap", | |
| 617 | + content="# Roadmap\n\nQ1 goals.", | |
| 618 | + artifact_type="roadmap", | |
| 619 | + format="markdown", | |
| 620 | + ) | |
| 621 | + output_dir = tmp_path / "notion_arts" | |
| 622 | + export_to_notion_md(self._sample_kg_data(), output_dir, artifacts=[art]) | |
| 623 | + | |
| 624 | + art_file = output_dir / "Roadmap.md" | |
| 625 | + assert art_file.exists() | |
| 626 | + art_content = art_file.read_text() | |
| 627 | + assert "Q1 goals." in art_content | |
| 406 | 628 |
| --- tests/test_agent_skills.py | |
| +++ tests/test_agent_skills.py | |
| @@ -401,5 +401,227 @@ | |
| 401 | |
| 402 | skill = ProjectPlanSkill() |
| 403 | ctx = _make_context() |
| 404 | ctx.provider_manager = None |
| 405 | assert skill.can_execute(ctx) is False |
| 406 |
| --- tests/test_agent_skills.py | |
| +++ tests/test_agent_skills.py | |
| @@ -401,5 +401,227 @@ | |
| 401 | |
| 402 | skill = ProjectPlanSkill() |
| 403 | ctx = _make_context() |
| 404 | ctx.provider_manager = None |
| 405 | assert skill.can_execute(ctx) is False |
| 406 | |
| 407 | |
| 408 | # --------------------------------------------------------------------------- |
| 409 | # WikiGeneratorSkill |
| 410 | # --------------------------------------------------------------------------- |
| 411 | |
| 412 | |
| 413 | class TestWikiGeneratorSkill: |
| 414 | def _sample_kg_data(self): |
| 415 | return { |
| 416 | "nodes": [ |
| 417 | { |
| 418 | "name": "Python", |
| 419 | "type": "technology", |
| 420 | "descriptions": ["A programming language"], |
| 421 | }, |
| 422 | { |
| 423 | "name": "Alice", |
| 424 | "type": "person", |
| 425 | "descriptions": ["Lead developer"], |
| 426 | }, |
| 427 | { |
| 428 | "name": "FastAPI", |
| 429 | "type": "technology", |
| 430 | "descriptions": ["Web framework"], |
| 431 | }, |
| 432 | ], |
| 433 | "relationships": [ |
| 434 | {"source": "Alice", "target": "Python", "type": "uses"}, |
| 435 | {"source": "FastAPI", "target": "Python", "type": "built_with"}, |
| 436 | ], |
| 437 | } |
| 438 | |
| 439 | def test_generate_wiki(self): |
| 440 | from video_processor.agent.skills.wiki_generator import generate_wiki |
| 441 | |
| 442 | pages = generate_wiki(self._sample_kg_data(), title="Test Wiki") |
| 443 | |
| 444 | assert "Home" in pages |
| 445 | assert "_Sidebar" in pages |
| 446 | assert "Test Wiki" in pages["Home"] |
| 447 | assert "3" in pages["Home"] # 3 entities |
| 448 | assert "2" in pages["Home"] # 2 relationships |
| 449 | |
| 450 | # Entity pages should exist |
| 451 | assert "Python" in pages |
| 452 | assert "Alice" in pages |
| 453 | assert "FastAPI" in pages |
| 454 | |
| 455 | # Type index pages should exist |
| 456 | assert "Technology" in pages |
| 457 | assert "Person" in pages |
| 458 | |
| 459 | # Alice's page should reference Python |
| 460 | assert "Python" in pages["Alice"] |
| 461 | assert "uses" in pages["Alice"] |
| 462 | |
| 463 | def test_generate_wiki_with_artifacts(self): |
| 464 | from video_processor.agent.skills.wiki_generator import generate_wiki |
| 465 | |
| 466 | art = Artifact( |
| 467 | name="Project Plan", |
| 468 | content="# Plan\n\nDo the thing.", |
| 469 | artifact_type="project_plan", |
| 470 | format="markdown", |
| 471 | ) |
| 472 | pages = generate_wiki(self._sample_kg_data(), artifacts=[art]) |
| 473 | |
| 474 | assert "Project-Plan" in pages |
| 475 | assert "Do the thing." in pages["Project-Plan"] |
| 476 | assert "Planning Artifacts" in pages["Home"] |
| 477 | |
| 478 | def test_write_wiki(self, tmp_path): |
| 479 | from video_processor.agent.skills.wiki_generator import write_wiki |
| 480 | |
| 481 | pages = { |
| 482 | "Home": "# Home\n\nWelcome.", |
| 483 | "Page-One": "# Page One\n\nContent.", |
| 484 | } |
| 485 | paths = write_wiki(pages, tmp_path / "wiki") |
| 486 | |
| 487 | assert len(paths) == 2 |
| 488 | assert (tmp_path / "wiki" / "Home.md").exists() |
| 489 | assert (tmp_path / "wiki" / "Page-One.md").exists() |
| 490 | assert "Welcome." in (tmp_path / "wiki" / "Home.md").read_text() |
| 491 | |
| 492 | def test_sanitize_filename(self): |
| 493 | from video_processor.agent.skills.wiki_generator import _sanitize_filename |
| 494 | |
| 495 | assert _sanitize_filename("Hello World") == "Hello-World" |
| 496 | assert _sanitize_filename("path/to\\file") == "path-to-file" |
| 497 | assert _sanitize_filename("version.2") == "version-2" |
| 498 | |
| 499 | def test_wiki_link(self): |
| 500 | from video_processor.agent.skills.wiki_generator import _wiki_link |
| 501 | |
| 502 | result = _wiki_link("My Page") |
| 503 | assert result == "[My Page](My-Page)" |
| 504 | |
| 505 | result = _wiki_link("Simple") |
| 506 | assert result == "[Simple](Simple)" |
| 507 | |
| 508 | |
| 509 | # --------------------------------------------------------------------------- |
| 510 | # NotesExportSkill |
| 511 | # --------------------------------------------------------------------------- |
| 512 | |
| 513 | |
| 514 | class TestNotesExportSkill: |
| 515 | def _sample_kg_data(self): |
| 516 | return { |
| 517 | "nodes": [ |
| 518 | { |
| 519 | "name": "Python", |
| 520 | "type": "technology", |
| 521 | "descriptions": ["A programming language"], |
| 522 | }, |
| 523 | { |
| 524 | "name": "Alice", |
| 525 | "type": "person", |
| 526 | "descriptions": ["Lead developer"], |
| 527 | }, |
| 528 | ], |
| 529 | "relationships": [ |
| 530 | {"source": "Alice", "target": "Python", "type": "uses"}, |
| 531 | ], |
| 532 | } |
| 533 | |
| 534 | def test_export_to_obsidian(self, tmp_path): |
| 535 | from video_processor.agent.skills.notes_export import export_to_obsidian |
| 536 | |
| 537 | output_dir = tmp_path / "obsidian_vault" |
| 538 | export_to_obsidian(self._sample_kg_data(), output_dir) |
| 539 | |
| 540 | assert output_dir.is_dir() |
| 541 | |
| 542 | # Check entity files exist |
| 543 | python_file = output_dir / "Python.md" |
| 544 | alice_file = output_dir / "Alice.md" |
| 545 | assert python_file.exists() |
| 546 | assert alice_file.exists() |
| 547 | |
| 548 | # Check frontmatter in entity file |
| 549 | python_content = python_file.read_text() |
| 550 | assert "---" in python_content |
| 551 | assert "type: technology" in python_content |
| 552 | assert "# Python" in python_content |
| 553 | |
| 554 | # Check wiki-links in Alice file |
| 555 | alice_content = alice_file.read_text() |
| 556 | assert "[[Python]]" in alice_content |
| 557 | assert "uses" in alice_content |
| 558 | |
| 559 | # Check index file |
| 560 | index_file = output_dir / "_Index.md" |
| 561 | assert index_file.exists() |
| 562 | index_content = index_file.read_text() |
| 563 | assert "[[Python]]" in index_content |
| 564 | assert "[[Alice]]" in index_content |
| 565 | |
| 566 | def test_export_to_obsidian_with_artifacts(self, tmp_path): |
| 567 | from video_processor.agent.skills.notes_export import export_to_obsidian |
| 568 | |
| 569 | art = Artifact( |
| 570 | name="Test Plan", |
| 571 | content="# Plan\n\nSteps here.", |
| 572 | artifact_type="project_plan", |
| 573 | format="markdown", |
| 574 | ) |
| 575 | output_dir = tmp_path / "obsidian_arts" |
| 576 | export_to_obsidian(self._sample_kg_data(), output_dir, artifacts=[art]) |
| 577 | |
| 578 | art_file = output_dir / "Test Plan.md" |
| 579 | assert art_file.exists() |
| 580 | art_content = art_file.read_text() |
| 581 | assert "artifact" in art_content |
| 582 | assert "Steps here." in art_content |
| 583 | |
| 584 | def test_export_to_notion_md(self, tmp_path): |
| 585 | from video_processor.agent.skills.notes_export import export_to_notion_md |
| 586 | |
| 587 | output_dir = tmp_path / "notion_export" |
| 588 | export_to_notion_md(self._sample_kg_data(), output_dir) |
| 589 | |
| 590 | assert output_dir.is_dir() |
| 591 | |
| 592 | # Check CSV database file |
| 593 | csv_file = output_dir / "entities_database.csv" |
| 594 | assert csv_file.exists() |
| 595 | csv_content = csv_file.read_text() |
| 596 | assert "Name" in csv_content |
| 597 | assert "Type" in csv_content |
| 598 | assert "Python" in csv_content |
| 599 | assert "Alice" in csv_content |
| 600 | |
| 601 | # Check entity markdown files |
| 602 | python_file = output_dir / "Python.md" |
| 603 | assert python_file.exists() |
| 604 | python_content = python_file.read_text() |
| 605 | assert "# Python" in python_content |
| 606 | assert "technology" in python_content |
| 607 | |
| 608 | # Check overview file |
| 609 | overview_file = output_dir / "Overview.md" |
| 610 | assert overview_file.exists() |
| 611 | |
| 612 | def test_export_to_notion_md_with_artifacts(self, tmp_path): |
| 613 | from video_processor.agent.skills.notes_export import export_to_notion_md |
| 614 | |
| 615 | art = Artifact( |
| 616 | name="Roadmap", |
| 617 | content="# Roadmap\n\nQ1 goals.", |
| 618 | artifact_type="roadmap", |
| 619 | format="markdown", |
| 620 | ) |
| 621 | output_dir = tmp_path / "notion_arts" |
| 622 | export_to_notion_md(self._sample_kg_data(), output_dir, artifacts=[art]) |
| 623 | |
| 624 | art_file = output_dir / "Roadmap.md" |
| 625 | assert art_file.exists() |
| 626 | art_content = art_file.read_text() |
| 627 | assert "Q1 goals." in art_content |
| 628 |
+482
| --- tests/test_sources.py | ||
| +++ tests/test_sources.py | ||
| @@ -858,5 +858,487 @@ | ||
| 858 | 858 | from video_processor.sources.m365_source import M365Source |
| 859 | 859 | |
| 860 | 860 | src = M365Source(web_url="https://contoso.sharepoint.com") |
| 861 | 861 | files = src.list_videos() |
| 862 | 862 | assert files == [] |
| 863 | + | |
| 864 | + | |
| 865 | +# --------------------------------------------------------------------------- | |
| 866 | +# ObsidianSource | |
| 867 | +# --------------------------------------------------------------------------- | |
| 868 | + | |
| 869 | + | |
| 870 | +class TestObsidianSource: | |
| 871 | + def test_import(self): | |
| 872 | + from video_processor.sources.obsidian_source import ObsidianSource | |
| 873 | + | |
| 874 | + assert ObsidianSource is not None | |
| 875 | + | |
| 876 | + def test_constructor(self, tmp_path): | |
| 877 | + from video_processor.sources.obsidian_source import ObsidianSource | |
| 878 | + | |
| 879 | + src = ObsidianSource(vault_path=str(tmp_path)) | |
| 880 | + assert src.vault_path == tmp_path | |
| 881 | + | |
| 882 | + def test_authenticate_with_vault(self, tmp_path): | |
| 883 | + from video_processor.sources.obsidian_source import ObsidianSource | |
| 884 | + | |
| 885 | + (tmp_path / "note.md").write_text("# Hello") | |
| 886 | + src = ObsidianSource(vault_path=str(tmp_path)) | |
| 887 | + assert src.authenticate() is True | |
| 888 | + | |
| 889 | + def test_authenticate_empty_dir(self, tmp_path): | |
| 890 | + from video_processor.sources.obsidian_source import ObsidianSource | |
| 891 | + | |
| 892 | + src = ObsidianSource(vault_path=str(tmp_path)) | |
| 893 | + assert src.authenticate() is False | |
| 894 | + | |
| 895 | + def test_authenticate_nonexistent(self, tmp_path): | |
| 896 | + from video_processor.sources.obsidian_source import ObsidianSource | |
| 897 | + | |
| 898 | + src = ObsidianSource(vault_path=str(tmp_path / "nonexistent")) | |
| 899 | + assert src.authenticate() is False | |
| 900 | + | |
| 901 | + def test_parse_note(self, tmp_path): | |
| 902 | + from video_processor.sources.obsidian_source import parse_note | |
| 903 | + | |
| 904 | + note_content = ( | |
| 905 | + "---\n" | |
| 906 | + "title: Test Note\n" | |
| 907 | + "tags: [python, testing]\n" | |
| 908 | + "---\n" | |
| 909 | + "# Heading One\n\n" | |
| 910 | + "Some text with a [[Wiki Link]] and [[Another Page|alias]].\n\n" | |
| 911 | + "Also has #tag1 and #tag2 inline tags.\n\n" | |
| 912 | + "## Sub Heading\n\n" | |
| 913 | + "More content here.\n" | |
| 914 | + ) | |
| 915 | + note_file = tmp_path / "test_note.md" | |
| 916 | + note_file.write_text(note_content) | |
| 917 | + | |
| 918 | + result = parse_note(note_file) | |
| 919 | + | |
| 920 | + assert result["frontmatter"]["title"] == "Test Note" | |
| 921 | + assert isinstance(result["frontmatter"]["tags"], list) | |
| 922 | + assert "python" in result["frontmatter"]["tags"] | |
| 923 | + assert "Wiki Link" in result["links"] | |
| 924 | + assert "Another Page" in result["links"] | |
| 925 | + assert "tag1" in result["tags"] | |
| 926 | + assert "tag2" in result["tags"] | |
| 927 | + assert len(result["headings"]) == 2 | |
| 928 | + assert result["headings"][0]["level"] == 1 | |
| 929 | + assert result["headings"][0]["text"] == "Heading One" | |
| 930 | + assert "Some text" in result["body"] | |
| 931 | + | |
| 932 | + def test_ingest_vault(self, tmp_path): | |
| 933 | + from video_processor.sources.obsidian_source import ingest_vault | |
| 934 | + | |
| 935 | + (tmp_path / "note_a.md").write_text("# A\n\nLinks to [[B]].\n") | |
| 936 | + (tmp_path / "note_b.md").write_text("# B\n\nLinks to [[A]] and [[C]].\n") | |
| 937 | + | |
| 938 | + result = ingest_vault(tmp_path) | |
| 939 | + | |
| 940 | + assert len(result["notes"]) == 2 | |
| 941 | + names = [n["name"] for n in result["notes"]] | |
| 942 | + assert "note_a" in names | |
| 943 | + assert "note_b" in names | |
| 944 | + # note_a links to B, note_b links to A and C => 3 links | |
| 945 | + assert len(result["links"]) == 3 | |
| 946 | + | |
| 947 | + def test_list_videos(self, tmp_path): | |
| 948 | + from video_processor.sources.obsidian_source import ObsidianSource | |
| 949 | + | |
| 950 | + (tmp_path / "note1.md").write_text("# Note 1") | |
| 951 | + sub = tmp_path / "subdir" | |
| 952 | + sub.mkdir() | |
| 953 | + (sub / "note2.md").write_text("# Note 2") | |
| 954 | + | |
| 955 | + src = ObsidianSource(vault_path=str(tmp_path)) | |
| 956 | + files = src.list_videos() | |
| 957 | + assert len(files) == 2 | |
| 958 | + assert all(f.mime_type == "text/markdown" for f in files) | |
| 959 | + | |
| 960 | + | |
| 961 | +# --------------------------------------------------------------------------- | |
| 962 | +# LogseqSource | |
| 963 | +# --------------------------------------------------------------------------- | |
| 964 | + | |
| 965 | + | |
| 966 | +class TestLogseqSource: | |
| 967 | + def test_import(self): | |
| 968 | + from video_processor.sources.logseq_source import LogseqSource | |
| 969 | + | |
| 970 | + assert LogseqSource is not None | |
| 971 | + | |
| 972 | + def test_constructor(self, tmp_path): | |
| 973 | + from video_processor.sources.logseq_source import LogseqSource | |
| 974 | + | |
| 975 | + src = LogseqSource(graph_path=str(tmp_path)) | |
| 976 | + assert src.graph_path == tmp_path | |
| 977 | + | |
| 978 | + def test_authenticate_with_pages(self, tmp_path): | |
| 979 | + from video_processor.sources.logseq_source import LogseqSource | |
| 980 | + | |
| 981 | + (tmp_path / "pages").mkdir() | |
| 982 | + src = LogseqSource(graph_path=str(tmp_path)) | |
| 983 | + assert src.authenticate() is True | |
| 984 | + | |
| 985 | + def test_authenticate_no_pages_or_journals(self, tmp_path): | |
| 986 | + from video_processor.sources.logseq_source import LogseqSource | |
| 987 | + | |
| 988 | + src = LogseqSource(graph_path=str(tmp_path)) | |
| 989 | + assert src.authenticate() is False | |
| 990 | + | |
| 991 | + def test_authenticate_nonexistent(self, tmp_path): | |
| 992 | + from video_processor.sources.logseq_source import LogseqSource | |
| 993 | + | |
| 994 | + src = LogseqSource(graph_path=str(tmp_path / "nonexistent")) | |
| 995 | + assert src.authenticate() is False | |
| 996 | + | |
| 997 | + def test_parse_page(self, tmp_path): | |
| 998 | + from video_processor.sources.logseq_source import parse_page | |
| 999 | + | |
| 1000 | + page_content = ( | |
| 1001 | + "title:: My Page\n" | |
| 1002 | + "tags:: #project #important\n" | |
| 1003 | + "- Some block content\n" | |
| 1004 | + " - Nested with [[Another Page]] link\n" | |
| 1005 | + " - And a #todo tag\n" | |
| 1006 | + " - Block ref ((abc12345-6789-0abc-def0-123456789abc))\n" | |
| 1007 | + ) | |
| 1008 | + page_file = tmp_path / "my_page.md" | |
| 1009 | + page_file.write_text(page_content) | |
| 1010 | + | |
| 1011 | + result = parse_page(page_file) | |
| 1012 | + | |
| 1013 | + assert result["properties"]["title"] == "My Page" | |
| 1014 | + assert "Another Page" in result["links"] | |
| 1015 | + assert "todo" in result["tags"] | |
| 1016 | + assert "abc12345-6789-0abc-def0-123456789abc" in result["block_refs"] | |
| 1017 | + assert "Some block content" in result["body"] | |
| 1018 | + | |
| 1019 | + def test_ingest_graph(self, tmp_path): | |
| 1020 | + from video_processor.sources.logseq_source import ingest_graph | |
| 1021 | + | |
| 1022 | + pages_dir = tmp_path / "pages" | |
| 1023 | + pages_dir.mkdir() | |
| 1024 | + (pages_dir / "page_a.md").write_text("- Content linking [[Page B]]\n") | |
| 1025 | + (pages_dir / "page_b.md").write_text("- Content linking [[Page A]]\n") | |
| 1026 | + | |
| 1027 | + journals_dir = tmp_path / "journals" | |
| 1028 | + journals_dir.mkdir() | |
| 1029 | + (journals_dir / "2026_03_07.md").write_text("- Journal entry\n") | |
| 1030 | + | |
| 1031 | + result = ingest_graph(tmp_path) | |
| 1032 | + | |
| 1033 | + assert len(result["notes"]) == 3 | |
| 1034 | + assert len(result["links"]) == 2 | |
| 1035 | + | |
| 1036 | + def test_list_videos(self, tmp_path): | |
| 1037 | + from video_processor.sources.logseq_source import LogseqSource | |
| 1038 | + | |
| 1039 | + pages_dir = tmp_path / "pages" | |
| 1040 | + pages_dir.mkdir() | |
| 1041 | + (pages_dir / "page1.md").write_text("- content") | |
| 1042 | + | |
| 1043 | + src = LogseqSource(graph_path=str(tmp_path)) | |
| 1044 | + files = src.list_videos() | |
| 1045 | + assert len(files) == 1 | |
| 1046 | + assert files[0].mime_type == "text/markdown" | |
| 1047 | + | |
| 1048 | + | |
| 1049 | +# --------------------------------------------------------------------------- | |
| 1050 | +# NotionSource | |
| 1051 | +# --------------------------------------------------------------------------- | |
| 1052 | + | |
| 1053 | + | |
| 1054 | +class TestNotionSource: | |
| 1055 | + def test_import(self): | |
| 1056 | + from video_processor.sources.notion_source import NotionSource | |
| 1057 | + | |
| 1058 | + assert NotionSource is not None | |
| 1059 | + | |
| 1060 | + def test_constructor(self): | |
| 1061 | + from video_processor.sources.notion_source import NotionSource | |
| 1062 | + | |
| 1063 | + src = NotionSource(token="ntn_test123", database_id="db-1") | |
| 1064 | + assert src.token == "ntn_test123" | |
| 1065 | + assert src.database_id == "db-1" | |
| 1066 | + assert src.page_ids == [] | |
| 1067 | + | |
| 1068 | + @patch.dict(os.environ, {}, clear=True) | |
| 1069 | + def test_authenticate_no_token(self): | |
| 1070 | + from video_processor.sources.notion_source import NotionSource | |
| 1071 | + | |
| 1072 | + src = NotionSource(token="") | |
| 1073 | + assert src.authenticate() is False | |
| 1074 | + | |
| 1075 | + @patch("requests.get") | |
| 1076 | + def test_authenticate_with_mock(self, mock_get): | |
| 1077 | + from video_processor.sources.notion_source import NotionSource | |
| 1078 | + | |
| 1079 | + mock_resp = MagicMock() | |
| 1080 | + mock_resp.raise_for_status = MagicMock() | |
| 1081 | + mock_resp.json.return_value = {"name": "Test Bot"} | |
| 1082 | + mock_get.return_value = mock_resp | |
| 1083 | + | |
| 1084 | + src = NotionSource(token="ntn_test123") | |
| 1085 | + assert src.authenticate() is True | |
| 1086 | + | |
| 1087 | + @patch("requests.post") | |
| 1088 | + def test_list_videos_database(self, mock_post): | |
| 1089 | + from video_processor.sources.notion_source import NotionSource | |
| 1090 | + | |
| 1091 | + mock_resp = MagicMock() | |
| 1092 | + mock_resp.raise_for_status = MagicMock() | |
| 1093 | + mock_resp.json.return_value = { | |
| 1094 | + "results": [ | |
| 1095 | + { | |
| 1096 | + "id": "page-1", | |
| 1097 | + "last_edited_time": "2026-03-01T00:00:00Z", | |
| 1098 | + "properties": { | |
| 1099 | + "Name": { | |
| 1100 | + "type": "title", | |
| 1101 | + "title": [{"plain_text": "Meeting Notes"}], | |
| 1102 | + } | |
| 1103 | + }, | |
| 1104 | + }, | |
| 1105 | + ], | |
| 1106 | + "has_more": False, | |
| 1107 | + } | |
| 1108 | + mock_post.return_value = mock_resp | |
| 1109 | + | |
| 1110 | + src = NotionSource(token="ntn_test", database_id="db-1") | |
| 1111 | + files = src.list_videos() | |
| 1112 | + assert len(files) == 1 | |
| 1113 | + assert files[0].name == "Meeting Notes" | |
| 1114 | + assert files[0].id == "page-1" | |
| 1115 | + | |
| 1116 | + def test_blocks_to_text(self): | |
| 1117 | + from video_processor.sources.notion_source import NotionSource | |
| 1118 | + | |
| 1119 | + src = NotionSource(token="test") | |
| 1120 | + blocks = [ | |
| 1121 | + { | |
| 1122 | + "type": "heading_1", | |
| 1123 | + "heading_1": { | |
| 1124 | + "rich_text": [{"plain_text": "Title"}], | |
| 1125 | + }, | |
| 1126 | + }, | |
| 1127 | + { | |
| 1128 | + "type": "paragraph", | |
| 1129 | + "paragraph": { | |
| 1130 | + "rich_text": [{"plain_text": "Some paragraph text."}], | |
| 1131 | + }, | |
| 1132 | + }, | |
| 1133 | + { | |
| 1134 | + "type": "bulleted_list_item", | |
| 1135 | + "bulleted_list_item": { | |
| 1136 | + "rich_text": [{"plain_text": "A bullet point"}], | |
| 1137 | + }, | |
| 1138 | + }, | |
| 1139 | + { | |
| 1140 | + "type": "divider", | |
| 1141 | + "divider": {}, | |
| 1142 | + }, | |
| 1143 | + ] | |
| 1144 | + result = src._blocks_to_text(blocks) | |
| 1145 | + assert "# Title" in result | |
| 1146 | + assert "Some paragraph text." in result | |
| 1147 | + assert "- A bullet point" in result | |
| 1148 | + assert "---" in result | |
| 1149 | + | |
| 1150 | + | |
| 1151 | +# --------------------------------------------------------------------------- | |
| 1152 | +# AppleNotesSource | |
| 1153 | +# --------------------------------------------------------------------------- | |
| 1154 | + | |
| 1155 | + | |
| 1156 | +class TestAppleNotesSource: | |
| 1157 | + def test_import(self): | |
| 1158 | + from video_processor.sources.apple_notes_source import AppleNotesSource | |
| 1159 | + | |
| 1160 | + assert AppleNotesSource is not None | |
| 1161 | + | |
| 1162 | + def test_constructor(self): | |
| 1163 | + from video_processor.sources.apple_notes_source import AppleNotesSource | |
| 1164 | + | |
| 1165 | + src = AppleNotesSource(folder="Work") | |
| 1166 | + assert src.folder == "Work" | |
| 1167 | + | |
| 1168 | + def test_constructor_default(self): | |
| 1169 | + from video_processor.sources.apple_notes_source import AppleNotesSource | |
| 1170 | + | |
| 1171 | + src = AppleNotesSource() | |
| 1172 | + assert src.folder is None | |
| 1173 | + | |
| 1174 | + def test_authenticate_platform(self): | |
| 1175 | + import sys | |
| 1176 | + | |
| 1177 | + from video_processor.sources.apple_notes_source import AppleNotesSource | |
| 1178 | + | |
| 1179 | + src = AppleNotesSource() | |
| 1180 | + result = src.authenticate() | |
| 1181 | + if sys.platform == "darwin": | |
| 1182 | + assert result is True | |
| 1183 | + else: | |
| 1184 | + assert result is False | |
| 1185 | + | |
| 1186 | + def test_html_to_text(self): | |
| 1187 | + from video_processor.sources.apple_notes_source import AppleNotesSource | |
| 1188 | + | |
| 1189 | + html = ( | |
| 1190 | + "<div>Hello <b>World</b></div>" | |
| 1191 | + "<p>Paragraph one.</p>" | |
| 1192 | + "<p>Paragraph two with &amp; entity.</p>" | |
| 1193 | + "<br/>" | |
| 1194 | + "<ul><li>Item 1</li><li>Item 2</li></ul>" | |
| 1195 | + ) | |
| 1196 | + result = AppleNotesSource._html_to_text(html) | |
| 1197 | + assert "Hello World" in result | |
| 1198 | + assert "Paragraph one." in result | |
| 1199 | + assert "Paragraph two with & entity." in result | |
| 1200 | + assert "Item 1" in result | |
| 1201 | + | |
| 1202 | + def test_html_to_text_empty(self): | |
| 1203 | + from video_processor.sources.apple_notes_source import AppleNotesSource | |
| 1204 | + | |
| 1205 | + assert AppleNotesSource._html_to_text("") == "" | |
| 1206 | + | |
| 1207 | + def test_html_to_text_entities(self): | |
| 1208 | + from video_processor.sources.apple_notes_source import AppleNotesSource | |
| 1209 | + | |
| 1210 | + html = "&lt;code&gt; &quot;test&quot; &#39;single&#39;&nbsp;space" | |
| 1211 | + result = AppleNotesSource._html_to_text(html) | |
| 1212 | + assert "<code>" in result | |
| 1213 | + assert '"test"' in result | |
| 1214 | + assert "'single'" in result | |
| 1215 | + | |
| 1216 | + | |
| 1217 | +# --------------------------------------------------------------------------- | |
| 1218 | +# GoogleKeepSource | |
| 1219 | +# --------------------------------------------------------------------------- | |
| 1220 | + | |
| 1221 | + | |
| 1222 | +class TestGoogleKeepSource: | |
| 1223 | + def test_import(self): | |
| 1224 | + from video_processor.sources.google_keep_source import GoogleKeepSource | |
| 1225 | + | |
| 1226 | + assert GoogleKeepSource is not None | |
| 1227 | + | |
| 1228 | + def test_constructor(self): | |
| 1229 | + from video_processor.sources.google_keep_source import GoogleKeepSource | |
| 1230 | + | |
| 1231 | + src = GoogleKeepSource(label="meetings") | |
| 1232 | + assert src.label == "meetings" | |
| 1233 | + | |
| 1234 | + def test_constructor_default(self): | |
| 1235 | + from video_processor.sources.google_keep_source import GoogleKeepSource | |
| 1236 | + | |
| 1237 | + src = GoogleKeepSource() | |
| 1238 | + assert src.label is None | |
| 1239 | + | |
| 1240 | + @patch("shutil.which", return_value=None) | |
| 1241 | + def test_authenticate_no_gws(self, _mock_which): | |
| 1242 | + from video_processor.sources.google_keep_source import GoogleKeepSource | |
| 1243 | + | |
| 1244 | + src = GoogleKeepSource() | |
| 1245 | + assert src.authenticate() is False | |
| 1246 | + | |
| 1247 | + def test_note_to_text(self): | |
| 1248 | + from video_processor.sources.google_keep_source import _note_to_text | |
| 1249 | + | |
| 1250 | + note = { | |
| 1251 | + "title": "Shopping List", | |
| 1252 | + "body": "Remember to buy groceries", | |
| 1253 | + "listContent": [ | |
| 1254 | + {"text": "Milk", "checked": True}, | |
| 1255 | + {"text": "Bread", "checked": False}, | |
| 1256 | + {"text": "", "checked": False}, | |
| 1257 | + ], | |
| 1258 | + } | |
| 1259 | + result = _note_to_text(note) | |
| 1260 | + assert "Shopping List" in result | |
| 1261 | + assert "Remember to buy groceries" in result | |
| 1262 | + assert "- [x] Milk" in result | |
| 1263 | + assert "- [ ] Bread" in result | |
| 1264 | + | |
| 1265 | + def test_note_to_text_empty(self): | |
| 1266 | + from video_processor.sources.google_keep_source import _note_to_text | |
| 1267 | + | |
| 1268 | + assert _note_to_text({}) == "" | |
| 1269 | + | |
| 1270 | + def test_note_to_text_text_content(self): | |
| 1271 | + from video_processor.sources.google_keep_source import _note_to_text | |
| 1272 | + | |
| 1273 | + note = {"title": "Simple", "textContent": "Just a plain note"} | |
| 1274 | + result = _note_to_text(note) | |
| 1275 | + assert "Simple" in result | |
| 1276 | + assert "Just a plain note" in result | |
| 1277 | + | |
| 1278 | + | |
| 1279 | +# --------------------------------------------------------------------------- | |
| 1280 | +# OneNoteSource | |
| 1281 | +# --------------------------------------------------------------------------- | |
| 1282 | + | |
| 1283 | + | |
| 1284 | +class TestOneNoteSource: | |
| 1285 | + def test_import(self): | |
| 1286 | + from video_processor.sources.onenote_source import OneNoteSource | |
| 1287 | + | |
| 1288 | + assert OneNoteSource is not None | |
| 1289 | + | |
| 1290 | + def test_constructor(self): | |
| 1291 | + from video_processor.sources.onenote_source import OneNoteSource | |
| 1292 | + | |
| 1293 | + src = OneNoteSource(notebook_name="Work Notes", section_name="Meetings") | |
| 1294 | + assert src.notebook_name == "Work Notes" | |
| 1295 | + assert src.section_name == "Meetings" | |
| 1296 | + | |
| 1297 | + def test_constructor_default(self): | |
| 1298 | + from video_processor.sources.onenote_source import OneNoteSource | |
| 1299 | + | |
| 1300 | + src = OneNoteSource() | |
| 1301 | + assert src.notebook_name is None | |
| 1302 | + assert src.section_name is None | |
| 1303 | + | |
| 1304 | + @patch("shutil.which", return_value=None) | |
| 1305 | + def test_authenticate_no_m365(self, _mock_which): | |
| 1306 | + from video_processor.sources.onenote_source import OneNoteSource | |
| 1307 | + | |
| 1308 | + src = OneNoteSource() | |
| 1309 | + assert src.authenticate() is False | |
| 1310 | + | |
| 1311 | + def test_html_to_text(self): | |
| 1312 | + from video_processor.sources.onenote_source import _html_to_text | |
| 1313 | + | |
| 1314 | + html = ( | |
| 1315 | + "<html><body>" | |
| 1316 | + "<h1>Meeting Notes</h1>" | |
| 1317 | + "<p>Discussed the &amp; project.</p>" | |
| 1318 | + "<script>var x = 1;</script>" | |
| 1319 | + "<style>.foo { color: red; }</style>" | |
| 1320 | + "<ul><li>Action item 1</li><li>Action item 2</li></ul>" | |
| 1321 | + "<p>Entity &#65; and &#x41; decoded.</p>" | |
| 1322 | + "</body></html>" | |
| 1323 | + ) | |
| 1324 | + result = _html_to_text(html) | |
| 1325 | + assert "Meeting Notes" in result | |
| 1326 | + assert "Discussed the & project." in result | |
| 1327 | + assert "var x" not in result | |
| 1328 | + assert ".foo" not in result | |
| 1329 | + assert "Action item 1" in result | |
| 1330 | + assert "Entity A and A decoded." in result | |
| 1331 | + | |
| 1332 | + def test_html_to_text_empty(self): | |
| 1333 | + from video_processor.sources.onenote_source import _html_to_text | |
| 1334 | + | |
| 1335 | + assert _html_to_text("") == "" | |
| 1336 | + | |
| 1337 | + def test_html_to_text_entities(self): | |
| 1338 | + from video_processor.sources.onenote_source import _html_to_text | |
| 1339 | + | |
| 1340 | + html = "&lt;tag&gt; &quot;quoted&quot; &#39;apos&#39;&nbsp;space" | |
| 1341 | + result = _html_to_text(html) | |
| 1342 | + assert "<tag>" in result | |
| 1343 | + assert '"quoted"' in result | |
| 1344 | + assert "'apos'" in result | |
| 863 | 1345 |
| --- tests/test_sources.py | |
| +++ tests/test_sources.py | |
| @@ -858,5 +858,487 @@ | |
| 858 | from video_processor.sources.m365_source import M365Source |
| 859 | |
| 860 | src = M365Source(web_url="https://contoso.sharepoint.com") |
| 861 | files = src.list_videos() |
| 862 | assert files == [] |
| 863 |
| --- tests/test_sources.py | |
| +++ tests/test_sources.py | |
| @@ -858,5 +858,487 @@ | |
| 858 | from video_processor.sources.m365_source import M365Source |
| 859 | |
| 860 | src = M365Source(web_url="https://contoso.sharepoint.com") |
| 861 | files = src.list_videos() |
| 862 | assert files == [] |
| 863 | |
| 864 | |
| 865 | # --------------------------------------------------------------------------- |
| 866 | # ObsidianSource |
| 867 | # --------------------------------------------------------------------------- |
| 868 | |
| 869 | |
| 870 | class TestObsidianSource: |
| 871 | def test_import(self): |
| 872 | from video_processor.sources.obsidian_source import ObsidianSource |
| 873 | |
| 874 | assert ObsidianSource is not None |
| 875 | |
| 876 | def test_constructor(self, tmp_path): |
| 877 | from video_processor.sources.obsidian_source import ObsidianSource |
| 878 | |
| 879 | src = ObsidianSource(vault_path=str(tmp_path)) |
| 880 | assert src.vault_path == tmp_path |
| 881 | |
| 882 | def test_authenticate_with_vault(self, tmp_path): |
| 883 | from video_processor.sources.obsidian_source import ObsidianSource |
| 884 | |
| 885 | (tmp_path / "note.md").write_text("# Hello") |
| 886 | src = ObsidianSource(vault_path=str(tmp_path)) |
| 887 | assert src.authenticate() is True |
| 888 | |
| 889 | def test_authenticate_empty_dir(self, tmp_path): |
| 890 | from video_processor.sources.obsidian_source import ObsidianSource |
| 891 | |
| 892 | src = ObsidianSource(vault_path=str(tmp_path)) |
| 893 | assert src.authenticate() is False |
| 894 | |
| 895 | def test_authenticate_nonexistent(self, tmp_path): |
| 896 | from video_processor.sources.obsidian_source import ObsidianSource |
| 897 | |
| 898 | src = ObsidianSource(vault_path=str(tmp_path / "nonexistent")) |
| 899 | assert src.authenticate() is False |
| 900 | |
| 901 | def test_parse_note(self, tmp_path): |
| 902 | from video_processor.sources.obsidian_source import parse_note |
| 903 | |
| 904 | note_content = ( |
| 905 | "---\n" |
| 906 | "title: Test Note\n" |
| 907 | "tags: [python, testing]\n" |
| 908 | "---\n" |
| 909 | "# Heading One\n\n" |
| 910 | "Some text with a [[Wiki Link]] and [[Another Page|alias]].\n\n" |
| 911 | "Also has #tag1 and #tag2 inline tags.\n\n" |
| 912 | "## Sub Heading\n\n" |
| 913 | "More content here.\n" |
| 914 | ) |
| 915 | note_file = tmp_path / "test_note.md" |
| 916 | note_file.write_text(note_content) |
| 917 | |
| 918 | result = parse_note(note_file) |
| 919 | |
| 920 | assert result["frontmatter"]["title"] == "Test Note" |
| 921 | assert isinstance(result["frontmatter"]["tags"], list) |
| 922 | assert "python" in result["frontmatter"]["tags"] |
| 923 | assert "Wiki Link" in result["links"] |
| 924 | assert "Another Page" in result["links"] |
| 925 | assert "tag1" in result["tags"] |
| 926 | assert "tag2" in result["tags"] |
| 927 | assert len(result["headings"]) == 2 |
| 928 | assert result["headings"][0]["level"] == 1 |
| 929 | assert result["headings"][0]["text"] == "Heading One" |
| 930 | assert "Some text" in result["body"] |
| 931 | |
| 932 | def test_ingest_vault(self, tmp_path): |
| 933 | from video_processor.sources.obsidian_source import ingest_vault |
| 934 | |
| 935 | (tmp_path / "note_a.md").write_text("# A\n\nLinks to [[B]].\n") |
| 936 | (tmp_path / "note_b.md").write_text("# B\n\nLinks to [[A]] and [[C]].\n") |
| 937 | |
| 938 | result = ingest_vault(tmp_path) |
| 939 | |
| 940 | assert len(result["notes"]) == 2 |
| 941 | names = [n["name"] for n in result["notes"]] |
| 942 | assert "note_a" in names |
| 943 | assert "note_b" in names |
| 944 | # note_a links to B, note_b links to A and C => 3 links |
| 945 | assert len(result["links"]) == 3 |
| 946 | |
| 947 | def test_list_videos(self, tmp_path): |
| 948 | from video_processor.sources.obsidian_source import ObsidianSource |
| 949 | |
| 950 | (tmp_path / "note1.md").write_text("# Note 1") |
| 951 | sub = tmp_path / "subdir" |
| 952 | sub.mkdir() |
| 953 | (sub / "note2.md").write_text("# Note 2") |
| 954 | |
| 955 | src = ObsidianSource(vault_path=str(tmp_path)) |
| 956 | files = src.list_videos() |
| 957 | assert len(files) == 2 |
| 958 | assert all(f.mime_type == "text/markdown" for f in files) |
| 959 | |
| 960 | |
| 961 | # --------------------------------------------------------------------------- |
| 962 | # LogseqSource |
| 963 | # --------------------------------------------------------------------------- |
| 964 | |
| 965 | |
| 966 | class TestLogseqSource: |
| 967 | def test_import(self): |
| 968 | from video_processor.sources.logseq_source import LogseqSource |
| 969 | |
| 970 | assert LogseqSource is not None |
| 971 | |
| 972 | def test_constructor(self, tmp_path): |
| 973 | from video_processor.sources.logseq_source import LogseqSource |
| 974 | |
| 975 | src = LogseqSource(graph_path=str(tmp_path)) |
| 976 | assert src.graph_path == tmp_path |
| 977 | |
| 978 | def test_authenticate_with_pages(self, tmp_path): |
| 979 | from video_processor.sources.logseq_source import LogseqSource |
| 980 | |
| 981 | (tmp_path / "pages").mkdir() |
| 982 | src = LogseqSource(graph_path=str(tmp_path)) |
| 983 | assert src.authenticate() is True |
| 984 | |
| 985 | def test_authenticate_no_pages_or_journals(self, tmp_path): |
| 986 | from video_processor.sources.logseq_source import LogseqSource |
| 987 | |
| 988 | src = LogseqSource(graph_path=str(tmp_path)) |
| 989 | assert src.authenticate() is False |
| 990 | |
| 991 | def test_authenticate_nonexistent(self, tmp_path): |
| 992 | from video_processor.sources.logseq_source import LogseqSource |
| 993 | |
| 994 | src = LogseqSource(graph_path=str(tmp_path / "nonexistent")) |
| 995 | assert src.authenticate() is False |
| 996 | |
| 997 | def test_parse_page(self, tmp_path): |
| 998 | from video_processor.sources.logseq_source import parse_page |
| 999 | |
| 1000 | page_content = ( |
| 1001 | "title:: My Page\n" |
| 1002 | "tags:: #project #important\n" |
| 1003 | "- Some block content\n" |
| 1004 | " - Nested with [[Another Page]] link\n" |
| 1005 | " - And a #todo tag\n" |
| 1006 | " - Block ref ((abc12345-6789-0abc-def0-123456789abc))\n" |
| 1007 | ) |
| 1008 | page_file = tmp_path / "my_page.md" |
| 1009 | page_file.write_text(page_content) |
| 1010 | |
| 1011 | result = parse_page(page_file) |
| 1012 | |
| 1013 | assert result["properties"]["title"] == "My Page" |
| 1014 | assert "Another Page" in result["links"] |
| 1015 | assert "todo" in result["tags"] |
| 1016 | assert "abc12345-6789-0abc-def0-123456789abc" in result["block_refs"] |
| 1017 | assert "Some block content" in result["body"] |
| 1018 | |
| 1019 | def test_ingest_graph(self, tmp_path): |
| 1020 | from video_processor.sources.logseq_source import ingest_graph |
| 1021 | |
| 1022 | pages_dir = tmp_path / "pages" |
| 1023 | pages_dir.mkdir() |
| 1024 | (pages_dir / "page_a.md").write_text("- Content linking [[Page B]]\n") |
| 1025 | (pages_dir / "page_b.md").write_text("- Content linking [[Page A]]\n") |
| 1026 | |
| 1027 | journals_dir = tmp_path / "journals" |
| 1028 | journals_dir.mkdir() |
| 1029 | (journals_dir / "2026_03_07.md").write_text("- Journal entry\n") |
| 1030 | |
| 1031 | result = ingest_graph(tmp_path) |
| 1032 | |
| 1033 | assert len(result["notes"]) == 3 |
| 1034 | assert len(result["links"]) == 2 |
| 1035 | |
| 1036 | def test_list_videos(self, tmp_path): |
| 1037 | from video_processor.sources.logseq_source import LogseqSource |
| 1038 | |
| 1039 | pages_dir = tmp_path / "pages" |
| 1040 | pages_dir.mkdir() |
| 1041 | (pages_dir / "page1.md").write_text("- content") |
| 1042 | |
| 1043 | src = LogseqSource(graph_path=str(tmp_path)) |
| 1044 | files = src.list_videos() |
| 1045 | assert len(files) == 1 |
| 1046 | assert files[0].mime_type == "text/markdown" |
| 1047 | |
| 1048 | |
| 1049 | # --------------------------------------------------------------------------- |
| 1050 | # NotionSource |
| 1051 | # --------------------------------------------------------------------------- |
| 1052 | |
| 1053 | |
| 1054 | class TestNotionSource: |
| 1055 | def test_import(self): |
| 1056 | from video_processor.sources.notion_source import NotionSource |
| 1057 | |
| 1058 | assert NotionSource is not None |
| 1059 | |
| 1060 | def test_constructor(self): |
| 1061 | from video_processor.sources.notion_source import NotionSource |
| 1062 | |
| 1063 | src = NotionSource(token="ntn_test123", database_id="db-1") |
| 1064 | assert src.token == "ntn_test123" |
| 1065 | assert src.database_id == "db-1" |
| 1066 | assert src.page_ids == [] |
| 1067 | |
| 1068 | @patch.dict(os.environ, {}, clear=True) |
| 1069 | def test_authenticate_no_token(self): |
| 1070 | from video_processor.sources.notion_source import NotionSource |
| 1071 | |
| 1072 | src = NotionSource(token="") |
| 1073 | assert src.authenticate() is False |
| 1074 | |
| 1075 | @patch("requests.get") |
| 1076 | def test_authenticate_with_mock(self, mock_get): |
| 1077 | from video_processor.sources.notion_source import NotionSource |
| 1078 | |
| 1079 | mock_resp = MagicMock() |
| 1080 | mock_resp.raise_for_status = MagicMock() |
| 1081 | mock_resp.json.return_value = {"name": "Test Bot"} |
| 1082 | mock_get.return_value = mock_resp |
| 1083 | |
| 1084 | src = NotionSource(token="ntn_test123") |
| 1085 | assert src.authenticate() is True |
| 1086 | |
| 1087 | @patch("requests.post") |
| 1088 | def test_list_videos_database(self, mock_post): |
| 1089 | from video_processor.sources.notion_source import NotionSource |
| 1090 | |
| 1091 | mock_resp = MagicMock() |
| 1092 | mock_resp.raise_for_status = MagicMock() |
| 1093 | mock_resp.json.return_value = { |
| 1094 | "results": [ |
| 1095 | { |
| 1096 | "id": "page-1", |
| 1097 | "last_edited_time": "2026-03-01T00:00:00Z", |
| 1098 | "properties": { |
| 1099 | "Name": { |
| 1100 | "type": "title", |
| 1101 | "title": [{"plain_text": "Meeting Notes"}], |
| 1102 | } |
| 1103 | }, |
| 1104 | }, |
| 1105 | ], |
| 1106 | "has_more": False, |
| 1107 | } |
| 1108 | mock_post.return_value = mock_resp |
| 1109 | |
| 1110 | src = NotionSource(token="ntn_test", database_id="db-1") |
| 1111 | files = src.list_videos() |
| 1112 | assert len(files) == 1 |
| 1113 | assert files[0].name == "Meeting Notes" |
| 1114 | assert files[0].id == "page-1" |
| 1115 | |
| 1116 | def test_blocks_to_text(self): |
| 1117 | from video_processor.sources.notion_source import NotionSource |
| 1118 | |
| 1119 | src = NotionSource(token="test") |
| 1120 | blocks = [ |
| 1121 | { |
| 1122 | "type": "heading_1", |
| 1123 | "heading_1": { |
| 1124 | "rich_text": [{"plain_text": "Title"}], |
| 1125 | }, |
| 1126 | }, |
| 1127 | { |
| 1128 | "type": "paragraph", |
| 1129 | "paragraph": { |
| 1130 | "rich_text": [{"plain_text": "Some paragraph text."}], |
| 1131 | }, |
| 1132 | }, |
| 1133 | { |
| 1134 | "type": "bulleted_list_item", |
| 1135 | "bulleted_list_item": { |
| 1136 | "rich_text": [{"plain_text": "A bullet point"}], |
| 1137 | }, |
| 1138 | }, |
| 1139 | { |
| 1140 | "type": "divider", |
| 1141 | "divider": {}, |
| 1142 | }, |
| 1143 | ] |
| 1144 | result = src._blocks_to_text(blocks) |
| 1145 | assert "# Title" in result |
| 1146 | assert "Some paragraph text." in result |
| 1147 | assert "- A bullet point" in result |
| 1148 | assert "---" in result |
| 1149 | |
| 1150 | |
| 1151 | # --------------------------------------------------------------------------- |
| 1152 | # AppleNotesSource |
| 1153 | # --------------------------------------------------------------------------- |
| 1154 | |
| 1155 | |
| 1156 | class TestAppleNotesSource: |
| 1157 | def test_import(self): |
| 1158 | from video_processor.sources.apple_notes_source import AppleNotesSource |
| 1159 | |
| 1160 | assert AppleNotesSource is not None |
| 1161 | |
| 1162 | def test_constructor(self): |
| 1163 | from video_processor.sources.apple_notes_source import AppleNotesSource |
| 1164 | |
| 1165 | src = AppleNotesSource(folder="Work") |
| 1166 | assert src.folder == "Work" |
| 1167 | |
| 1168 | def test_constructor_default(self): |
| 1169 | from video_processor.sources.apple_notes_source import AppleNotesSource |
| 1170 | |
| 1171 | src = AppleNotesSource() |
| 1172 | assert src.folder is None |
| 1173 | |
| 1174 | def test_authenticate_platform(self): |
| 1175 | import sys |
| 1176 | |
| 1177 | from video_processor.sources.apple_notes_source import AppleNotesSource |
| 1178 | |
| 1179 | src = AppleNotesSource() |
| 1180 | result = src.authenticate() |
| 1181 | if sys.platform == "darwin": |
| 1182 | assert result is True |
| 1183 | else: |
| 1184 | assert result is False |
| 1185 | |
| 1186 | def test_html_to_text(self): |
| 1187 | from video_processor.sources.apple_notes_source import AppleNotesSource |
| 1188 | |
| 1189 | html = ( |
| 1190 | "<div>Hello <b>World</b></div>" |
| 1191 | "<p>Paragraph one.</p>" |
| 1192 | "<p>Paragraph two with &amp; entity.</p>" |
| 1193 | "<br/>" |
| 1194 | "<ul><li>Item 1</li><li>Item 2</li></ul>" |
| 1195 | ) |
| 1196 | result = AppleNotesSource._html_to_text(html) |
| 1197 | assert "Hello World" in result |
| 1198 | assert "Paragraph one." in result |
| 1199 | assert "Paragraph two with & entity." in result |
| 1200 | assert "Item 1" in result |
| 1201 | |
| 1202 | def test_html_to_text_empty(self): |
| 1203 | from video_processor.sources.apple_notes_source import AppleNotesSource |
| 1204 | |
| 1205 | assert AppleNotesSource._html_to_text("") == "" |
| 1206 | |
| 1207 | def test_html_to_text_entities(self): |
| 1208 | from video_processor.sources.apple_notes_source import AppleNotesSource |
| 1209 | |
| 1210 | html = "&lt;code&gt; &quot;test&quot; &#39;single&#39;&nbsp;space" |
| 1211 | result = AppleNotesSource._html_to_text(html) |
| 1212 | assert "<code>" in result |
| 1213 | assert '"test"' in result |
| 1214 | assert "'single'" in result |
| 1215 | |
| 1216 | |
| 1217 | # --------------------------------------------------------------------------- |
| 1218 | # GoogleKeepSource |
| 1219 | # --------------------------------------------------------------------------- |
| 1220 | |
| 1221 | |
| 1222 | class TestGoogleKeepSource: |
| 1223 | def test_import(self): |
| 1224 | from video_processor.sources.google_keep_source import GoogleKeepSource |
| 1225 | |
| 1226 | assert GoogleKeepSource is not None |
| 1227 | |
| 1228 | def test_constructor(self): |
| 1229 | from video_processor.sources.google_keep_source import GoogleKeepSource |
| 1230 | |
| 1231 | src = GoogleKeepSource(label="meetings") |
| 1232 | assert src.label == "meetings" |
| 1233 | |
| 1234 | def test_constructor_default(self): |
| 1235 | from video_processor.sources.google_keep_source import GoogleKeepSource |
| 1236 | |
| 1237 | src = GoogleKeepSource() |
| 1238 | assert src.label is None |
| 1239 | |
| 1240 | @patch("shutil.which", return_value=None) |
| 1241 | def test_authenticate_no_gws(self, _mock_which): |
| 1242 | from video_processor.sources.google_keep_source import GoogleKeepSource |
| 1243 | |
| 1244 | src = GoogleKeepSource() |
| 1245 | assert src.authenticate() is False |
| 1246 | |
| 1247 | def test_note_to_text(self): |
| 1248 | from video_processor.sources.google_keep_source import _note_to_text |
| 1249 | |
| 1250 | note = { |
| 1251 | "title": "Shopping List", |
| 1252 | "body": "Remember to buy groceries", |
| 1253 | "listContent": [ |
| 1254 | {"text": "Milk", "checked": True}, |
| 1255 | {"text": "Bread", "checked": False}, |
| 1256 | {"text": "", "checked": False}, |
| 1257 | ], |
| 1258 | } |
| 1259 | result = _note_to_text(note) |
| 1260 | assert "Shopping List" in result |
| 1261 | assert "Remember to buy groceries" in result |
| 1262 | assert "- [x] Milk" in result |
| 1263 | assert "- [ ] Bread" in result |
| 1264 | |
| 1265 | def test_note_to_text_empty(self): |
| 1266 | from video_processor.sources.google_keep_source import _note_to_text |
| 1267 | |
| 1268 | assert _note_to_text({}) == "" |
| 1269 | |
| 1270 | def test_note_to_text_text_content(self): |
| 1271 | from video_processor.sources.google_keep_source import _note_to_text |
| 1272 | |
| 1273 | note = {"title": "Simple", "textContent": "Just a plain note"} |
| 1274 | result = _note_to_text(note) |
| 1275 | assert "Simple" in result |
| 1276 | assert "Just a plain note" in result |
| 1277 | |
| 1278 | |
| 1279 | # --------------------------------------------------------------------------- |
| 1280 | # OneNoteSource |
| 1281 | # --------------------------------------------------------------------------- |
| 1282 | |
| 1283 | |
| 1284 | class TestOneNoteSource: |
| 1285 | def test_import(self): |
| 1286 | from video_processor.sources.onenote_source import OneNoteSource |
| 1287 | |
| 1288 | assert OneNoteSource is not None |
| 1289 | |
| 1290 | def test_constructor(self): |
| 1291 | from video_processor.sources.onenote_source import OneNoteSource |
| 1292 | |
| 1293 | src = OneNoteSource(notebook_name="Work Notes", section_name="Meetings") |
| 1294 | assert src.notebook_name == "Work Notes" |
| 1295 | assert src.section_name == "Meetings" |
| 1296 | |
| 1297 | def test_constructor_default(self): |
| 1298 | from video_processor.sources.onenote_source import OneNoteSource |
| 1299 | |
| 1300 | src = OneNoteSource() |
| 1301 | assert src.notebook_name is None |
| 1302 | assert src.section_name is None |
| 1303 | |
| 1304 | @patch("shutil.which", return_value=None) |
| 1305 | def test_authenticate_no_m365(self, _mock_which): |
| 1306 | from video_processor.sources.onenote_source import OneNoteSource |
| 1307 | |
| 1308 | src = OneNoteSource() |
| 1309 | assert src.authenticate() is False |
| 1310 | |
| 1311 | def test_html_to_text(self): |
| 1312 | from video_processor.sources.onenote_source import _html_to_text |
| 1313 | |
| 1314 | html = ( |
| 1315 | "<html><body>" |
| 1316 | "<h1>Meeting Notes</h1>" |
| 1317 | "<p>Discussed the &amp; project.</p>" |
| 1318 | "<script>var x = 1;</script>" |
| 1319 | "<style>.foo { color: red; }</style>" |
| 1320 | "<ul><li>Action item 1</li><li>Action item 2</li></ul>" |
| 1321 | "<p>Entity &#65; and &#x41; decoded.</p>" |
| 1322 | "</body></html>" |
| 1323 | ) |
| 1324 | result = _html_to_text(html) |
| 1325 | assert "Meeting Notes" in result |
| 1326 | assert "Discussed the & project." in result |
| 1327 | assert "var x" not in result |
| 1328 | assert ".foo" not in result |
| 1329 | assert "Action item 1" in result |
| 1330 | assert "Entity A and A decoded." in result |
| 1331 | |
| 1332 | def test_html_to_text_empty(self): |
| 1333 | from video_processor.sources.onenote_source import _html_to_text |
| 1334 | |
| 1335 | assert _html_to_text("") == "" |
| 1336 | |
| 1337 | def test_html_to_text_entities(self): |
| 1338 | from video_processor.sources.onenote_source import _html_to_text |
| 1339 | |
| 1340 | html = "&lt;tag&gt; &quot;quoted&quot; &#39;apos&#39;&nbsp;space" |
| 1341 | result = _html_to_text(html) |
| 1342 | assert "<tag>" in result |
| 1343 | assert '"quoted"' in result |
| 1344 | assert "'apos'" in result |
| 1345 |
| --- video_processor/agent/skills/__init__.py | ||
| +++ video_processor/agent/skills/__init__.py | ||
| @@ -4,15 +4,17 @@ | ||
| 4 | 4 | from video_processor.agent.skills import ( # noqa: F401 |
| 5 | 5 | artifact_export, |
| 6 | 6 | cli_adapter, |
| 7 | 7 | doc_generator, |
| 8 | 8 | github_integration, |
| 9 | + notes_export, | |
| 9 | 10 | prd, |
| 10 | 11 | project_plan, |
| 11 | 12 | requirements_chat, |
| 12 | 13 | roadmap, |
| 13 | 14 | task_breakdown, |
| 15 | + wiki_generator, | |
| 14 | 16 | ) |
| 15 | 17 | from video_processor.agent.skills.base import ( |
| 16 | 18 | AgentContext, |
| 17 | 19 | Artifact, |
| 18 | 20 | Skill, |
| 19 | 21 | |
| 20 | 22 | ADDED video_processor/agent/skills/notes_export.py |
| 21 | 23 | ADDED video_processor/agent/skills/wiki_generator.py |
| --- video_processor/agent/skills/__init__.py | |
| +++ video_processor/agent/skills/__init__.py | |
| @@ -4,15 +4,17 @@ | |
| 4 | from video_processor.agent.skills import ( # noqa: F401 |
| 5 | artifact_export, |
| 6 | cli_adapter, |
| 7 | doc_generator, |
| 8 | github_integration, |
| 9 | prd, |
| 10 | project_plan, |
| 11 | requirements_chat, |
| 12 | roadmap, |
| 13 | task_breakdown, |
| 14 | ) |
| 15 | from video_processor.agent.skills.base import ( |
| 16 | AgentContext, |
| 17 | Artifact, |
| 18 | Skill, |
| 19 | |
| 20 | ADDED video_processor/agent/skills/notes_export.py |
| 21 | ADDED video_processor/agent/skills/wiki_generator.py |
| --- video_processor/agent/skills/__init__.py | |
| +++ video_processor/agent/skills/__init__.py | |
| @@ -4,15 +4,17 @@ | |
| 4 | from video_processor.agent.skills import ( # noqa: F401 |
| 5 | artifact_export, |
| 6 | cli_adapter, |
| 7 | doc_generator, |
| 8 | github_integration, |
| 9 | notes_export, |
| 10 | prd, |
| 11 | project_plan, |
| 12 | requirements_chat, |
| 13 | roadmap, |
| 14 | task_breakdown, |
| 15 | wiki_generator, |
| 16 | ) |
| 17 | from video_processor.agent.skills.base import ( |
| 18 | AgentContext, |
| 19 | Artifact, |
| 20 | Skill, |
| 21 | |
| 22 | ADDED video_processor/agent/skills/notes_export.py |
| 23 | ADDED video_processor/agent/skills/wiki_generator.py |
| --- a/video_processor/agent/skills/notes_export.py | ||
| +++ b/video_processor/agent/skills/notes_export.py | ||
| @@ -0,0 +1,420 @@ | ||
| 1 | +"""Skill: Export knowledge graph as structured notes (Obsidian, Notion).""" | |
| 2 | + | |
| 3 | +import csv | |
| 4 | +import io | |
| 5 | +import logging | |
| 6 | +from datetime import date | |
| 7 | +from pathlib import Path | |
| 8 | +from typing import Dict, List, Optional | |
| 9 | + | |
| 10 | +from video_processor.agent.skills.base import ( | |
| 11 | + AgentContext, | |
| 12 | + Artifact, | |
| 13 | + Skill, | |
| 14 | + register_skill, | |
| 15 | +) | |
| 16 | + | |
| 17 | +logger = logging.getLogger(__name__) | |
| 18 | + | |
| 19 | + | |
| 20 | +def _sanitize_filename(name: str) -> str: | |
| 21 | + """Convert a name to a filesystem-safe filename.""" | |
| 22 | + return ( | |
| 23 | + name.replace("/", "-") | |
| 24 | + .replace("\\", "-") | |
| 25 | + .replace(":", "-") | |
| 26 | + .replace('"', "") | |
| 27 | + .replace("?", "") | |
| 28 | + .replace("*", "") | |
| 29 | + .replace("<", "") | |
| 30 | + .replace(">", "") | |
| 31 | + .replace("|", "") | |
| 32 | + ) | |
| 33 | + | |
| 34 | + | |
| 35 | +def _build_indexes(kg_data: dict): | |
| 36 | + """Build lookup structures from knowledge graph data. | |
| 37 | + | |
| 38 | + Returns (nodes, by_type, node_lookup, outgoing, incoming). | |
| 39 | + """ | |
| 40 | + nodes = kg_data.get("nodes", []) | |
| 41 | + relationships = kg_data.get("relationships", []) | |
| 42 | + | |
| 43 | + by_type: Dict[str, list] = {} | |
| 44 | + node_lookup: Dict[str, dict] = {} | |
| 45 | + for node in nodes: | |
| 46 | + name = node.get("name", node.get("id", "")) | |
| 47 | + ntype = node.get("type", "concept") | |
| 48 | + by_type.setdefault(ntype, []).append(node) | |
| 49 | + node_lookup[name] = node | |
| 50 | + | |
| 51 | + outgoing: Dict[str, list] = {} | |
| 52 | + incoming: Dict[str, list] = {} | |
| 53 | + for rel in relationships: | |
| 54 | + src = rel.get("source", "") | |
| 55 | + tgt = rel.get("target", "") | |
| 56 | + rtype = rel.get("type", "related_to") | |
| 57 | + outgoing.setdefault(src, []).append((tgt, rtype)) | |
| 58 | + incoming.setdefault(tgt, []).append((src, rtype)) | |
| 59 | + | |
| 60 | + return nodes, by_type, node_lookup, outgoing, incoming | |
| 61 | + | |
| 62 | + | |
| 63 | +# --------------------------------------------------------------------------- | |
| 64 | +# Obsidian export | |
| 65 | +# --------------------------------------------------------------------------- | |
| 66 | + | |
| 67 | + | |
| 68 | +def export_to_obsidian( | |
| 69 | + kg_data: dict, | |
| 70 | + output_dir: Path, | |
| 71 | + artifacts: Optional[List[Artifact]] = None, | |
| 72 | +) -> List[Path]: | |
| 73 | + """Export knowledge graph as an Obsidian vault. | |
| 74 | + | |
| 75 | + Creates one ``.md`` file per entity with YAML frontmatter and | |
| 76 | + ``[[wiki-links]]``, an ``_Index.md`` Map of Content, tag pages per | |
| 77 | + entity type, and optional artifact notes. | |
| 78 | + """ | |
| 79 | + output_dir.mkdir(parents=True, exist_ok=True) | |
| 80 | + artifacts = artifacts or [] | |
| 81 | + created: List[Path] = [] | |
| 82 | + today = date.today().isoformat() | |
| 83 | + | |
| 84 | + nodes, by_type, node_lookup, outgoing, incoming = _build_indexes(kg_data) | |
| 85 | + | |
| 86 | + # --- Individual entity notes --- | |
| 87 | + for node in nodes: | |
| 88 | + name = node.get("name", node.get("id", "")) | |
| 89 | + if not name: | |
| 90 | + continue | |
| 91 | + ntype = node.get("type", "concept") | |
| 92 | + descs = node.get("descriptions", []) | |
| 93 | + aliases = node.get("aliases", []) | |
| 94 | + | |
| 95 | + # YAML frontmatter | |
| 96 | + tags_yaml = f" - {ntype}" | |
| 97 | + aliases_yaml = "" | |
| 98 | + if aliases: | |
| 99 | + alias_lines = "\n".join(f" - {a}" for a in aliases) | |
| 100 | + aliases_yaml = f"aliases:\n{alias_lines}\n" | |
| 101 | + | |
| 102 | + frontmatter = f"---\ntype: {ntype}\ntags:\n{tags_yaml}\n{aliases_yaml}date: {today}\n---\n" | |
| 103 | + | |
| 104 | + parts = [frontmatter, f"# {name}", ""] | |
| 105 | + | |
| 106 | + # Descriptions | |
| 107 | + if descs: | |
| 108 | + for d in descs: | |
| 109 | + parts.append(f"{d}") | |
| 110 | + parts.append("") | |
| 111 | + | |
| 112 | + # Outgoing relationships | |
| 113 | + outs = outgoing.get(name, []) | |
| 114 | + if outs: | |
| 115 | + parts.append("## Relationships") | |
| 116 | + parts.append("") | |
| 117 | + for tgt, rtype in outs: | |
| 118 | + parts.append(f"- **{rtype}**: [[{tgt}]]") | |
| 119 | + parts.append("") | |
| 120 | + | |
| 121 | + # Incoming relationships | |
| 122 | + ins = incoming.get(name, []) | |
| 123 | + if ins: | |
| 124 | + parts.append("## Referenced by") | |
| 125 | + parts.append("") | |
| 126 | + for src, rtype in ins: | |
| 127 | + parts.append(f"- **{rtype}** from [[{src}]]") | |
| 128 | + parts.append("") | |
| 129 | + | |
| 130 | + filename = _sanitize_filename(name) + ".md" | |
| 131 | + path = output_dir / filename | |
| 132 | + path.write_text("\n".join(parts), encoding="utf-8") | |
| 133 | + created.append(path) | |
| 134 | + | |
| 135 | + # --- Index note (Map of Content) --- | |
| 136 | + index_parts = [ | |
| 137 | + "---", | |
| 138 | + "type: index", | |
| 139 | + "tags:", | |
| 140 | + " - MOC", | |
| 141 | + f"date: {today}", | |
| 142 | + "---", | |
| 143 | + "", | |
| 144 | + "# Index", | |
| 145 | + "", | |
| 146 | + f"**{len(nodes)}** entities | **{len(kg_data.get('relationships', []))}** relationships", | |
| 147 | + "", | |
| 148 | + ] | |
| 149 | + | |
| 150 | + for etype in sorted(by_type.keys()): | |
| 151 | + elist = by_type[etype] | |
| 152 | + index_parts.append(f"## {etype.title()}") | |
| 153 | + index_parts.append("") | |
| 154 | + for node in sorted(elist, key=lambda n: n.get("name", "")): | |
| 155 | + name = node.get("name", "") | |
| 156 | + index_parts.append(f"- [[{name}]]") | |
| 157 | + index_parts.append("") | |
| 158 | + | |
| 159 | + if artifacts: | |
| 160 | + index_parts.append("## Artifacts") | |
| 161 | + index_parts.append("") | |
| 162 | + for art in artifacts: | |
| 163 | + index_parts.append(f"- [[{art.name}]]") | |
| 164 | + index_parts.append("") | |
| 165 | + | |
| 166 | + index_path = output_dir / "_Index.md" | |
| 167 | + index_path.write_text("\n".join(index_parts), encoding="utf-8") | |
| 168 | + created.append(index_path) | |
| 169 | + | |
| 170 | + # --- Tag pages (one per entity type) --- | |
| 171 | + for etype, elist in sorted(by_type.items()): | |
| 172 | + tag_parts = [ | |
| 173 | + "---", | |
| 174 | + "type: tag", | |
| 175 | + "tags:", | |
| 176 | + f" - {etype}", | |
| 177 | + f"date: {today}", | |
| 178 | + "---", | |
| 179 | + "", | |
| 180 | + f"# {etype.title()}", | |
| 181 | + "", | |
| 182 | + f"All entities of type **{etype}** ({len(elist)}).", | |
| 183 | + "", | |
| 184 | + ] | |
| 185 | + for node in sorted(elist, key=lambda n: n.get("name", "")): | |
| 186 | + name = node.get("name", "") | |
| 187 | + descs = node.get("descriptions", []) | |
| 188 | + summary = descs[0] if descs else "" | |
| 189 | + tag_parts.append(f"- [[{name}]]" + (f" - {summary}" if summary else "")) | |
| 190 | + tag_parts.append("") | |
| 191 | + | |
| 192 | + tag_filename = f"Tag - {etype.title()}.md" | |
| 193 | + tag_path = output_dir / _sanitize_filename(tag_filename) | |
| 194 | + tag_path.write_text("\n".join(tag_parts), encoding="utf-8") | |
| 195 | + created.append(tag_path) | |
| 196 | + | |
| 197 | + # --- Artifact notes --- | |
| 198 | + for art in artifacts: | |
| 199 | + art_parts = [ | |
| 200 | + "---", | |
| 201 | + "type: artifact", | |
| 202 | + f"artifact_type: {art.artifact_type}", | |
| 203 | + "tags:", | |
| 204 | + " - artifact", | |
| 205 | + f" - {art.artifact_type}", | |
| 206 | + f"date: {today}", | |
| 207 | + "---", | |
| 208 | + "", | |
| 209 | + f"# {art.name}", | |
| 210 | + "", | |
| 211 | + art.content, | |
| 212 | + "", | |
| 213 | + ] | |
| 214 | + art_filename = _sanitize_filename(art.name) + ".md" | |
| 215 | + art_path = output_dir / art_filename | |
| 216 | + art_path.write_text("\n".join(art_parts), encoding="utf-8") | |
| 217 | + created.append(art_path) | |
| 218 | + | |
| 219 | + logger.info("Exported %d Obsidian notes to %s", len(created), output_dir) | |
| 220 | + return created | |
| 221 | + | |
| 222 | + | |
| 223 | +# --------------------------------------------------------------------------- | |
| 224 | +# Notion-compatible markdown export | |
| 225 | +# --------------------------------------------------------------------------- | |
| 226 | + | |
| 227 | + | |
| 228 | +def export_to_notion_md( | |
| 229 | + kg_data: dict, | |
| 230 | + output_dir: Path, | |
| 231 | + artifacts: Optional[List[Artifact]] = None, | |
| 232 | +) -> List[Path]: | |
| 233 | + """Export knowledge graph as Notion-compatible markdown. | |
| 234 | + | |
| 235 | + Creates ``.md`` files with Notion-style callout blocks and a | |
| 236 | + database-style CSV for bulk import. | |
| 237 | + """ | |
| 238 | + output_dir.mkdir(parents=True, exist_ok=True) | |
| 239 | + artifacts = artifacts or [] | |
| 240 | + created: List[Path] = [] | |
| 241 | + | |
| 242 | + nodes, by_type, node_lookup, outgoing, incoming = _build_indexes(kg_data) | |
| 243 | + | |
| 244 | + # --- Database CSV --- | |
| 245 | + csv_buffer = io.StringIO() | |
| 246 | + writer = csv.writer(csv_buffer) | |
| 247 | + writer.writerow(["Name", "Type", "Description", "Related To"]) | |
| 248 | + | |
| 249 | + for node in nodes: | |
| 250 | + name = node.get("name", node.get("id", "")) | |
| 251 | + ntype = node.get("type", "concept") | |
| 252 | + descs = node.get("descriptions", []) | |
| 253 | + desc_text = "; ".join(descs[:2]) if descs else "" | |
| 254 | + outs = outgoing.get(name, []) | |
| 255 | + related = ", ".join(tgt for tgt, _ in outs) if outs else "" | |
| 256 | + writer.writerow([name, ntype, desc_text, related]) | |
| 257 | + | |
| 258 | + csv_path = output_dir / "entities_database.csv" | |
| 259 | + csv_path.write_text(csv_buffer.getvalue(), encoding="utf-8") | |
| 260 | + created.append(csv_path) | |
| 261 | + | |
| 262 | + # --- Individual entity pages --- | |
| 263 | + for node in nodes: | |
| 264 | + name = node.get("name", node.get("id", "")) | |
| 265 | + if not name: | |
| 266 | + continue | |
| 267 | + ntype = node.get("type", "concept") | |
| 268 | + descs = node.get("descriptions", []) | |
| 269 | + | |
| 270 | + type_emoji = { | |
| 271 | + "person": "person", | |
| 272 | + "technology": "computer", | |
| 273 | + "organization": "building", | |
| 274 | + "concept": "bulb", | |
| 275 | + "event": "calendar", | |
| 276 | + "location": "round_pushpin", | |
| 277 | + } | |
| 278 | + emoji = type_emoji.get(ntype, "bulb") | |
| 279 | + | |
| 280 | + parts = [ | |
| 281 | + f"# {name}", | |
| 282 | + "", | |
| 283 | + f"> :{emoji}: **Type:** {ntype}", | |
| 284 | + "", | |
| 285 | + ] | |
| 286 | + | |
| 287 | + if descs: | |
| 288 | + parts.append("## Description") | |
| 289 | + parts.append("") | |
| 290 | + for d in descs: | |
| 291 | + parts.append(f"{d}") | |
| 292 | + parts.append("") | |
| 293 | + | |
| 294 | + # Properties callout | |
| 295 | + properties = node.get("properties", {}) | |
| 296 | + if properties: | |
| 297 | + parts.append("> :memo: **Properties**") | |
| 298 | + for k, v in properties.items(): | |
| 299 | + parts.append(f"> - **{k}:** {v}") | |
| 300 | + parts.append("") | |
| 301 | + | |
| 302 | + # Outgoing relationships | |
| 303 | + outs = outgoing.get(name, []) | |
| 304 | + if outs: | |
| 305 | + parts.append("## Relationships") | |
| 306 | + parts.append("") | |
| 307 | + parts.append("| Target | Relationship |") | |
| 308 | + parts.append("|--------|-------------|") | |
| 309 | + for tgt, rtype in outs: | |
| 310 | + parts.append(f"| {tgt} | {rtype} |") | |
| 311 | + parts.append("") | |
| 312 | + | |
| 313 | + # Incoming relationships | |
| 314 | + ins = incoming.get(name, []) | |
| 315 | + if ins: | |
| 316 | + parts.append("## Referenced by") | |
| 317 | + parts.append("") | |
| 318 | + parts.append("| Source | Relationship |") | |
| 319 | + parts.append("|--------|-------------|") | |
| 320 | + for src, rtype in ins: | |
| 321 | + parts.append(f"| {src} | {rtype} |") | |
| 322 | + parts.append("") | |
| 323 | + | |
| 324 | + filename = _sanitize_filename(name) + ".md" | |
| 325 | + path = output_dir / filename | |
| 326 | + path.write_text("\n".join(parts), encoding="utf-8") | |
| 327 | + created.append(path) | |
| 328 | + | |
| 329 | + # --- Overview page --- | |
| 330 | + overview_parts = [ | |
| 331 | + "# Knowledge Graph Overview", | |
| 332 | + "", | |
| 333 | + f"> :bar_chart: **Stats:** {len(nodes)} entities, " | |
| 334 | + f"{len(kg_data.get('relationships', []))} relationships", | |
| 335 | + "", | |
| 336 | + "## Entity Types", | |
| 337 | + "", | |
| 338 | + ] | |
| 339 | + for etype in sorted(by_type.keys()): | |
| 340 | + elist = by_type[etype] | |
| 341 | + overview_parts.append(f"### {etype.title()} ({len(elist)})") | |
| 342 | + overview_parts.append("") | |
| 343 | + for node in sorted(elist, key=lambda n: n.get("name", "")): | |
| 344 | + name = node.get("name", "") | |
| 345 | + overview_parts.append(f"- {name}") | |
| 346 | + overview_parts.append("") | |
| 347 | + | |
| 348 | + if artifacts: | |
| 349 | + overview_parts.append("## Artifacts") | |
| 350 | + overview_parts.append("") | |
| 351 | + for art in artifacts: | |
| 352 | + overview_parts.append(f"- **{art.name}** ({art.artifact_type})") | |
| 353 | + overview_parts.append("") | |
| 354 | + | |
| 355 | + overview_path = output_dir / "Overview.md" | |
| 356 | + overview_path.write_text("\n".join(overview_parts), encoding="utf-8") | |
| 357 | + created.append(overview_path) | |
| 358 | + | |
| 359 | + # --- Artifact pages --- | |
| 360 | + for art in artifacts: | |
| 361 | + art_parts = [ | |
| 362 | + f"# {art.name}", | |
| 363 | + "", | |
| 364 | + f"> :page_facing_up: **Type:** {art.artifact_type} | **Format:** {art.format}", | |
| 365 | + "", | |
| 366 | + art.content, | |
| 367 | + "", | |
| 368 | + ] | |
| 369 | + art_filename = _sanitize_filename(art.name) + ".md" | |
| 370 | + art_path = output_dir / art_filename | |
| 371 | + art_path.write_text("\n".join(art_parts), encoding="utf-8") | |
| 372 | + created.append(art_path) | |
| 373 | + | |
| 374 | + logger.info("Exported %d Notion markdown files to %s", len(created), output_dir) | |
| 375 | + return created | |
| 376 | + | |
| 377 | + | |
| 378 | +# --------------------------------------------------------------------------- | |
| 379 | +# Skill class | |
| 380 | +# --------------------------------------------------------------------------- | |
| 381 | + | |
| 382 | + | |
| 383 | +class NotesExportSkill(Skill): | |
| 384 | + """Export knowledge graph as structured notes (Obsidian, Notion). | |
| 385 | + | |
| 386 | + For GitHub wiki export, see the ``wiki_generator`` skill. | |
| 387 | + """ | |
| 388 | + | |
| 389 | + name = "notes_export" | |
| 390 | + description = "Export knowledge graph as structured notes (Obsidian, Notion)" | |
| 391 | + | |
| 392 | + def execute(self, context: AgentContext, **kwargs) -> Artifact: | |
| 393 | + fmt = kwargs.get("format", "obsidian") | |
| 394 | + output_dir = Path(kwargs.get("output_dir", f"notes_export_{fmt}")) | |
| 395 | + kg_data = context.knowledge_graph.to_dict() | |
| 396 | + artifacts = context.artifacts or [] | |
| 397 | + | |
| 398 | + if fmt == "notion": | |
| 399 | + created = export_to_notion_md(kg_data, output_dir, artifacts=artifacts) | |
| 400 | + else: | |
| 401 | + created = export_to_obsidian(kg_data, output_dir, artifacts=artifacts) | |
| 402 | + | |
| 403 | + file_list = "\n".join(f"- {p.name}" for p in created) | |
| 404 | + summary = f"Exported {len(created)} {fmt} notes to `{output_dir}`:\n\n{file_list}" | |
| 405 | + | |
| 406 | + return Artifact( | |
| 407 | + name=f"Notes Export ({fmt.title()})", | |
| 408 | + content=summary, | |
| 409 | + artifact_type="notes_export", | |
| 410 | + format="markdown", | |
| 411 | + metadata={ | |
| 412 | + "output_dir": str(output_dir), | |
| 413 | + "format": fmt, | |
| 414 | + "file_count": len(created), | |
| 415 | + "files": [str(p) for p in created], | |
| 416 | + }, | |
| 417 | + ) | |
| 418 | + | |
| 419 | + | |
| 420 | +register_skill(NotesExportSkill()) |
| --- a/video_processor/agent/skills/notes_export.py | |
| +++ b/video_processor/agent/skills/notes_export.py | |
| @@ -0,0 +1,420 @@ | |
| --- a/video_processor/agent/skills/notes_export.py | |
| +++ b/video_processor/agent/skills/notes_export.py | |
| @@ -0,0 +1,420 @@ | |
| 1 | """Skill: Export knowledge graph as structured notes (Obsidian, Notion).""" |
| 2 | |
| 3 | import csv |
| 4 | import io |
| 5 | import logging |
| 6 | from datetime import date |
| 7 | from pathlib import Path |
| 8 | from typing import Dict, List, Optional |
| 9 | |
| 10 | from video_processor.agent.skills.base import ( |
| 11 | AgentContext, |
| 12 | Artifact, |
| 13 | Skill, |
| 14 | register_skill, |
| 15 | ) |
| 16 | |
| 17 | logger = logging.getLogger(__name__) |
| 18 | |
| 19 | |
| 20 | def _sanitize_filename(name: str) -> str: |
| 21 | """Convert a name to a filesystem-safe filename.""" |
| 22 | return ( |
| 23 | name.replace("/", "-") |
| 24 | .replace("\\", "-") |
| 25 | .replace(":", "-") |
| 26 | .replace('"', "") |
| 27 | .replace("?", "") |
| 28 | .replace("*", "") |
| 29 | .replace("<", "") |
| 30 | .replace(">", "") |
| 31 | .replace("|", "") |
| 32 | ) |
| 33 | |
| 34 | |
| 35 | def _build_indexes(kg_data: dict): |
| 36 | """Build lookup structures from knowledge graph data. |
| 37 | |
| 38 | Returns (nodes, by_type, node_lookup, outgoing, incoming). |
| 39 | """ |
| 40 | nodes = kg_data.get("nodes", []) |
| 41 | relationships = kg_data.get("relationships", []) |
| 42 | |
| 43 | by_type: Dict[str, list] = {} |
| 44 | node_lookup: Dict[str, dict] = {} |
| 45 | for node in nodes: |
| 46 | name = node.get("name", node.get("id", "")) |
| 47 | ntype = node.get("type", "concept") |
| 48 | by_type.setdefault(ntype, []).append(node) |
| 49 | node_lookup[name] = node |
| 50 | |
| 51 | outgoing: Dict[str, list] = {} |
| 52 | incoming: Dict[str, list] = {} |
| 53 | for rel in relationships: |
| 54 | src = rel.get("source", "") |
| 55 | tgt = rel.get("target", "") |
| 56 | rtype = rel.get("type", "related_to") |
| 57 | outgoing.setdefault(src, []).append((tgt, rtype)) |
| 58 | incoming.setdefault(tgt, []).append((src, rtype)) |
| 59 | |
| 60 | return nodes, by_type, node_lookup, outgoing, incoming |
| 61 | |
| 62 | |
| 63 | # --------------------------------------------------------------------------- |
| 64 | # Obsidian export |
| 65 | # --------------------------------------------------------------------------- |
| 66 | |
| 67 | |
def export_to_obsidian(
    kg_data: dict,
    output_dir: Path,
    artifacts: Optional[List[Artifact]] = None,
) -> List[Path]:
    """Export knowledge graph as an Obsidian vault.

    Creates one ``.md`` file per entity with YAML frontmatter and
    ``[[wiki-links]]``, an ``_Index.md`` Map of Content, tag pages per
    entity type, and optional artifact notes.

    Args:
        kg_data: Knowledge graph dict with ``nodes`` and ``relationships``.
        output_dir: Vault directory; created (with parents) if missing.
        artifacts: Optional planning artifacts written as extra notes.

    Returns:
        Paths of every file written, in creation order.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    artifacts = artifacts or []
    created: List[Path] = []
    today = date.today().isoformat()

    # The name->node lookup from _build_indexes is not needed here.
    nodes, by_type, _node_lookup, outgoing, incoming = _build_indexes(kg_data)

    # --- Individual entity notes ---
    for node in nodes:
        name = node.get("name", node.get("id", ""))
        if not name:
            continue  # unnamed nodes cannot produce a meaningful filename
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        aliases = node.get("aliases", [])

        # YAML frontmatter: the entity type doubles as a tag.
        tags_yaml = f" - {ntype}"
        aliases_yaml = ""
        if aliases:
            alias_lines = "\n".join(f" - {a}" for a in aliases)
            aliases_yaml = f"aliases:\n{alias_lines}\n"

        frontmatter = f"---\ntype: {ntype}\ntags:\n{tags_yaml}\n{aliases_yaml}date: {today}\n---\n"

        parts = [frontmatter, f"# {name}", ""]

        # Descriptions, one paragraph line each (coerced to text).
        if descs:
            for d in descs:
                parts.append(str(d))
            parts.append("")

        # Outgoing relationships become [[wiki-links]] to the target note.
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            for tgt, rtype in outs:
                parts.append(f"- **{rtype}**: [[{tgt}]]")
            parts.append("")

        # Incoming relationships (backlinks).
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced by")
            parts.append("")
            for src, rtype in ins:
                parts.append(f"- **{rtype}** from [[{src}]]")
            parts.append("")

        filename = _sanitize_filename(name) + ".md"
        path = output_dir / filename
        path.write_text("\n".join(parts), encoding="utf-8")
        created.append(path)

    # --- Index note (Map of Content) ---
    index_parts = [
        "---",
        "type: index",
        "tags:",
        " - MOC",
        f"date: {today}",
        "---",
        "",
        "# Index",
        "",
        f"**{len(nodes)}** entities | **{len(kg_data.get('relationships', []))}** relationships",
        "",
    ]

    for etype in sorted(by_type.keys()):
        elist = by_type[etype]
        index_parts.append(f"## {etype.title()}")
        index_parts.append("")
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            index_parts.append(f"- [[{name}]]")
        index_parts.append("")

    if artifacts:
        index_parts.append("## Artifacts")
        index_parts.append("")
        for art in artifacts:
            index_parts.append(f"- [[{art.name}]]")
        index_parts.append("")

    index_path = output_dir / "_Index.md"
    index_path.write_text("\n".join(index_parts), encoding="utf-8")
    created.append(index_path)

    # --- Tag pages (one per entity type) ---
    for etype, elist in sorted(by_type.items()):
        tag_parts = [
            "---",
            "type: tag",
            "tags:",
            f" - {etype}",
            f"date: {today}",
            "---",
            "",
            f"# {etype.title()}",
            "",
            f"All entities of type **{etype}** ({len(elist)}).",
            "",
        ]
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            descs = node.get("descriptions", [])
            summary = descs[0] if descs else ""
            tag_parts.append(f"- [[{name}]]" + (f" - {summary}" if summary else ""))
        tag_parts.append("")

        tag_filename = f"Tag - {etype.title()}.md"
        tag_path = output_dir / _sanitize_filename(tag_filename)
        tag_path.write_text("\n".join(tag_parts), encoding="utf-8")
        created.append(tag_path)

    # --- Artifact notes ---
    for art in artifacts:
        art_parts = [
            "---",
            "type: artifact",
            f"artifact_type: {art.artifact_type}",
            "tags:",
            " - artifact",
            f" - {art.artifact_type}",
            f"date: {today}",
            "---",
            "",
            f"# {art.name}",
            "",
            art.content,
            "",
        ]
        art_filename = _sanitize_filename(art.name) + ".md"
        art_path = output_dir / art_filename
        art_path.write_text("\n".join(art_parts), encoding="utf-8")
        created.append(art_path)

    logger.info("Exported %d Obsidian notes to %s", len(created), output_dir)
    return created
| 221 | |
| 222 | |
| 223 | # --------------------------------------------------------------------------- |
| 224 | # Notion-compatible markdown export |
| 225 | # --------------------------------------------------------------------------- |
| 226 | |
| 227 | |
def export_to_notion_md(
    kg_data: dict,
    output_dir: Path,
    artifacts: Optional[List[Artifact]] = None,
) -> List[Path]:
    """Export knowledge graph as Notion-compatible markdown.

    Creates ``.md`` files with Notion-style callout blocks and a
    database-style CSV for bulk import.

    Args:
        kg_data: Knowledge graph dict with ``nodes`` and ``relationships``.
        output_dir: Target directory; created (with parents) if missing.
        artifacts: Optional planning artifacts written as extra pages.

    Returns:
        Paths of every file written, in creation order.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    artifacts = artifacts or []
    created: List[Path] = []

    # The name->node lookup from _build_indexes is not needed here.
    nodes, by_type, _node_lookup, outgoing, incoming = _build_indexes(kg_data)

    # --- Database CSV ---
    csv_path = output_dir / "entities_database.csv"
    # newline="" stops the text layer from re-translating the csv writer's
    # \r\n row terminators (write_text would produce \r\r\n on Windows).
    with csv_path.open("w", newline="", encoding="utf-8") as fh:
        writer = csv.writer(fh)
        writer.writerow(["Name", "Type", "Description", "Related To"])
        for node in nodes:
            name = node.get("name", node.get("id", ""))
            ntype = node.get("type", "concept")
            descs = node.get("descriptions", [])
            desc_text = "; ".join(descs[:2]) if descs else ""
            outs = outgoing.get(name, [])
            related = ", ".join(tgt for tgt, _ in outs) if outs else ""
            writer.writerow([name, ntype, desc_text, related])
    created.append(csv_path)

    # Emoji shortcode per entity type (loop-invariant; built once).
    type_emoji = {
        "person": "person",
        "technology": "computer",
        "organization": "building",
        "concept": "bulb",
        "event": "calendar",
        "location": "round_pushpin",
    }

    # --- Individual entity pages ---
    for node in nodes:
        name = node.get("name", node.get("id", ""))
        if not name:
            continue  # unnamed nodes cannot produce a meaningful filename
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])

        emoji = type_emoji.get(ntype, "bulb")

        parts = [
            f"# {name}",
            "",
            f"> :{emoji}: **Type:** {ntype}",
            "",
        ]

        if descs:
            parts.append("## Description")
            parts.append("")
            for d in descs:
                parts.append(str(d))
            parts.append("")

        # Properties callout block.
        properties = node.get("properties", {})
        if properties:
            parts.append("> :memo: **Properties**")
            for k, v in properties.items():
                parts.append(f"> - **{k}:** {v}")
            parts.append("")

        # Outgoing relationships as a markdown table.
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            parts.append("| Target | Relationship |")
            parts.append("|--------|-------------|")
            for tgt, rtype in outs:
                parts.append(f"| {tgt} | {rtype} |")
            parts.append("")

        # Incoming relationships (backlinks) as a markdown table.
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced by")
            parts.append("")
            parts.append("| Source | Relationship |")
            parts.append("|--------|-------------|")
            for src, rtype in ins:
                parts.append(f"| {src} | {rtype} |")
            parts.append("")

        filename = _sanitize_filename(name) + ".md"
        path = output_dir / filename
        path.write_text("\n".join(parts), encoding="utf-8")
        created.append(path)

    # --- Overview page ---
    overview_parts = [
        "# Knowledge Graph Overview",
        "",
        f"> :bar_chart: **Stats:** {len(nodes)} entities, "
        f"{len(kg_data.get('relationships', []))} relationships",
        "",
        "## Entity Types",
        "",
    ]
    for etype in sorted(by_type.keys()):
        elist = by_type[etype]
        overview_parts.append(f"### {etype.title()} ({len(elist)})")
        overview_parts.append("")
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            overview_parts.append(f"- {name}")
        overview_parts.append("")

    if artifacts:
        overview_parts.append("## Artifacts")
        overview_parts.append("")
        for art in artifacts:
            overview_parts.append(f"- **{art.name}** ({art.artifact_type})")
        overview_parts.append("")

    overview_path = output_dir / "Overview.md"
    overview_path.write_text("\n".join(overview_parts), encoding="utf-8")
    created.append(overview_path)

    # --- Artifact pages ---
    for art in artifacts:
        art_parts = [
            f"# {art.name}",
            "",
            f"> :page_facing_up: **Type:** {art.artifact_type} | **Format:** {art.format}",
            "",
            art.content,
            "",
        ]
        art_filename = _sanitize_filename(art.name) + ".md"
        art_path = output_dir / art_filename
        art_path.write_text("\n".join(art_parts), encoding="utf-8")
        created.append(art_path)

    logger.info("Exported %d Notion markdown files to %s", len(created), output_dir)
    return created
| 376 | |
| 377 | |
| 378 | # --------------------------------------------------------------------------- |
| 379 | # Skill class |
| 380 | # --------------------------------------------------------------------------- |
| 381 | |
| 382 | |
class NotesExportSkill(Skill):
    """Export knowledge graph as structured notes (Obsidian, Notion).

    For GitHub wiki export, see the ``wiki_generator`` skill.
    """

    name = "notes_export"
    description = "Export knowledge graph as structured notes (Obsidian, Notion)"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Run the requested exporter and summarise the files it wrote."""
        fmt = kwargs.get("format", "obsidian")
        out_dir = Path(kwargs.get("output_dir", f"notes_export_{fmt}"))
        graph = context.knowledge_graph.to_dict()
        arts = context.artifacts or []

        # "notion" selects the Notion exporter; everything else (including
        # the default) goes to the Obsidian vault exporter.
        exporter = export_to_notion_md if fmt == "notion" else export_to_obsidian
        written = exporter(graph, out_dir, artifacts=arts)

        listing = "\n".join(f"- {p.name}" for p in written)
        summary = f"Exported {len(written)} {fmt} notes to `{out_dir}`:\n\n{listing}"

        return Artifact(
            name=f"Notes Export ({fmt.title()})",
            content=summary,
            artifact_type="notes_export",
            format="markdown",
            metadata={
                "output_dir": str(out_dir),
                "format": fmt,
                "file_count": len(written),
                "files": [str(p) for p in written],
            },
        )
| 418 | |
| 419 | |
# Import-time side effect: register the skill so the agent can discover it.
register_skill(NotesExportSkill())
| --- a/video_processor/agent/skills/wiki_generator.py | ||
| +++ b/video_processor/agent/skills/wiki_generator.py | ||
| @@ -0,0 +1,315 @@ | ||
| 1 | +"""Skill: Generate a GitHub wiki from knowledge graph and artifacts.""" | |
| 2 | + | |
| 3 | +import json | |
| 4 | +import logging | |
| 5 | +import subprocess | |
| 6 | +from pathlib import Path | |
| 7 | +from typing import Dict, List, Optional | |
| 8 | + | |
| 9 | +from video_processor.agent.skills.base import ( | |
| 10 | + AgentContext, | |
| 11 | + Artifact, | |
| 12 | + Skill, | |
| 13 | + register_skill, | |
| 14 | +) | |
| 15 | + | |
| 16 | +logger = logging.getLogger(__name__) | |
| 17 | + | |
| 18 | + | |
| 19 | +def _sanitize_filename(name: str) -> str: | |
| 20 | + """Convert entity name to a wiki-safe filename.""" | |
| 21 | + return name.replace("/", "-").replace("\\", "-").replace(" ", "-").replace(".", "-") | |
| 22 | + | |
| 23 | + | |
def _wiki_link(name: str) -> str:
    """Create a GitHub wiki-style markdown link.

    The link text keeps the original name; the target is sanitized.
    """
    return "[{0}]({1})".format(name, _sanitize_filename(name))
| 28 | + | |
| 29 | + | |
def generate_wiki(
    kg_data: dict,
    artifacts: Optional[List[Artifact]] = None,
    title: str = "Knowledge Base",
) -> Dict[str, str]:
    """Generate a dict of {filename: markdown_content} for a GitHub wiki.

    Returns pages for: Home, _Sidebar, entity type indexes, individual
    entity pages, and any planning artifacts.

    Args:
        kg_data: Knowledge-graph dump with "nodes" and "relationships" lists.
        artifacts: Optional planning artifacts rendered as extra pages.
        title: Heading used on the Home page and sidebar.
    """
    pages: Dict[str, str] = {}
    artifacts = artifacts or []

    nodes = kg_data.get("nodes", [])
    relationships = kg_data.get("relationships", [])

    # Group entities by type. (A name->node lookup table was built here
    # previously but never read; removed as dead code.)
    by_type: Dict[str, list] = {}
    for node in nodes:
        by_type.setdefault(node.get("type", "concept"), []).append(node)

    # Build relationship index (outgoing and incoming per entity)
    outgoing: Dict[str, list] = {}
    incoming: Dict[str, list] = {}
    for rel in relationships:
        src = rel.get("source", "")
        tgt = rel.get("target", "")
        rtype = rel.get("type", "related_to")
        outgoing.setdefault(src, []).append((tgt, rtype))
        incoming.setdefault(tgt, []).append((src, rtype))

    # --- Home page ---
    home_parts = [
        f"# {title}",
        "",
        f"**{len(nodes)}** entities | **{len(relationships)}** relationships",
        "",
        "## Entity Types",
        "",
    ]
    for etype, elist in sorted(by_type.items()):
        home_parts.append(f"- {_wiki_link(etype.title())} ({len(elist)})")

    if artifacts:
        home_parts.append("")
        home_parts.append("## Planning Artifacts")
        home_parts.append("")
        for art in artifacts:
            safe = _sanitize_filename(art.name)
            home_parts.append(f"- [{art.name}]({safe})")

    pages["Home"] = "\n".join(home_parts)

    # --- Sidebar ---
    sidebar_parts = [f"**{title}**", "", "**Navigation**", "", "- [Home](Home)", ""]
    sidebar_parts.append("**Entity Types**")
    sidebar_parts.append("")
    for etype in sorted(by_type.keys()):
        sidebar_parts.append(f"- {_wiki_link(etype.title())}")

    if artifacts:
        sidebar_parts.append("")
        sidebar_parts.append("**Artifacts**")
        sidebar_parts.append("")
        for art in artifacts:
            safe = _sanitize_filename(art.name)
            sidebar_parts.append(f"- [{art.name}]({safe})")

    pages["_Sidebar"] = "\n".join(sidebar_parts)

    # --- Type index pages ---
    # NOTE(review): a type page (e.g. "Person") can collide with an entity
    # page of the same sanitized name; the later write wins — confirm this
    # is acceptable.
    for etype, elist in sorted(by_type.items()):
        page_name = _sanitize_filename(etype.title())
        parts = [
            f"# {etype.title()}",
            "",
            f"{len(elist)} entities of type **{etype}**.",
            "",
            "| Entity | Descriptions |",
            "|--------|-------------|",
        ]
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            descs = node.get("descriptions", [])
            # Show at most two descriptions to keep the index table compact.
            desc_text = "; ".join(descs[:2]) if descs else "—"
            parts.append(f"| {_wiki_link(name)} | {desc_text} |")

        pages[page_name] = "\n".join(parts)

    # --- Individual entity pages ---
    for node in nodes:
        name = node.get("name", "")
        if not name:
            continue  # unnamed nodes cannot get a page filename
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        page_name = _sanitize_filename(name)

        parts = [
            f"# {name}",
            "",
            f"**Type:** {ntype}",
            "",
        ]

        if descs:
            parts.append("## Descriptions")
            parts.append("")
            for d in descs:
                parts.append(f"- {d}")
            parts.append("")

        # Outgoing relationships
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            parts.append("| Target | Type |")
            parts.append("|--------|------|")
            for tgt, rtype in outs:
                parts.append(f"| {_wiki_link(tgt)} | {rtype} |")
            parts.append("")

        # Incoming relationships
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced By")
            parts.append("")
            parts.append("| Source | Type |")
            parts.append("|--------|------|")
            for src, rtype in ins:
                parts.append(f"| {_wiki_link(src)} | {rtype} |")
            parts.append("")

        # Occurrences / sources
        occs = node.get("occurrences", [])
        if occs:
            parts.append("## Sources")
            parts.append("")
            for occ in occs:
                src = occ.get("source", "unknown")
                ts = occ.get("timestamp", "")
                text = occ.get("text", "")
                line = f"- **{src}**"
                if ts:
                    line += f" @ {ts}"
                if text:
                    line += f": _{text}_"
                parts.append(line)
            parts.append("")

        pages[page_name] = "\n".join(parts)

    # --- Artifact pages ---
    for art in artifacts:
        page_name = _sanitize_filename(art.name)
        if art.format == "json":
            # Pretty-print valid JSON in a fenced block; fall back to raw text.
            try:
                data = json.loads(art.content)
                content = f"```json\n{json.dumps(data, indent=2)}\n```"
            except json.JSONDecodeError:
                content = art.content
        else:
            content = art.content

        pages[page_name] = f"# {art.name}\n\n{content}"

    return pages
| 202 | + | |
| 203 | + | |
def write_wiki(pages: Dict[str, str], output_dir: Path) -> List[Path]:
    """Materialize wiki *pages* as ``<name>.md`` files inside *output_dir*."""
    output_dir.mkdir(parents=True, exist_ok=True)

    def _emit(page_name: str, body: str) -> Path:
        # One file per page; content written verbatim as UTF-8.
        target = output_dir / f"{page_name}.md"
        target.write_text(body, encoding="utf-8")
        return target

    return [_emit(page_name, body) for page_name, body in pages.items()]
| 213 | + | |
| 214 | + | |
def push_wiki(wiki_dir: Path, repo: str, message: str = "Update wiki") -> bool:
    """Push wiki pages to a GitHub wiki repo.

    Clones the wiki repo, copies pages, commits and pushes.
    The repo should be in 'owner/repo' format.

    Args:
        wiki_dir: Directory holding the generated ``*.md`` pages.
        repo: Target repository in ``owner/repo`` form.
        message: Commit message for the wiki update.

    Returns:
        True on success (including "nothing to commit"); False when the
        final push fails.
    """
    import shutil  # local import: only this push path needs it

    wiki_url = f"https://github.com/{repo}.wiki.git"

    # Clone existing wiki (or init if empty). Remove any stale clone first;
    # shutil.rmtree is portable, unlike shelling out to `rm -rf`.
    clone_dir = wiki_dir / ".wiki_clone"
    if clone_dir.exists():
        shutil.rmtree(clone_dir)

    result = subprocess.run(
        ["git", "clone", wiki_url, str(clone_dir)],
        capture_output=True,
        text=True,
    )

    if result.returncode != 0:
        # Wiki might not exist yet — init a new repo
        clone_dir.mkdir(parents=True, exist_ok=True)
        subprocess.run(["git", "init"], cwd=clone_dir, capture_output=True)
        subprocess.run(
            ["git", "remote", "add", "origin", wiki_url],
            cwd=clone_dir,
            capture_output=True,
        )

    # Copy top-level wiki pages into the clone. glob("*.md") is non-recursive,
    # so files inside .wiki_clone itself are never picked up.
    for md_file in wiki_dir.glob("*.md"):
        dest = clone_dir / md_file.name
        dest.write_text(md_file.read_text(encoding="utf-8"), encoding="utf-8")

    # Commit and push
    subprocess.run(["git", "add", "-A"], cwd=clone_dir, capture_output=True)
    commit_result = subprocess.run(
        ["git", "commit", "-m", message],
        cwd=clone_dir,
        capture_output=True,
        text=True,
    )
    if commit_result.returncode != 0:
        # Usually "nothing to commit"; surface stderr so other failures
        # (e.g. missing git identity) remain diagnosable.
        logger.info("No wiki changes to commit (%s)", commit_result.stderr.strip())
        return True

    push_result = subprocess.run(
        ["git", "push", "origin", "master"],
        cwd=clone_dir,
        capture_output=True,
        text=True,
    )
    if push_result.returncode != 0:
        # GitHub wikis historically default to `master`; fall back to `main`.
        push_result = subprocess.run(
            ["git", "push", "origin", "main"],
            cwd=clone_dir,
            capture_output=True,
            text=True,
        )

    if push_result.returncode == 0:
        logger.info("Wiki pushed to %s", wiki_url)
        return True
    logger.error("Wiki push failed: %s", push_result.stderr)
    return False
| 283 | + | |
| 284 | + | |
class WikiGeneratorSkill(Skill):
    """Skill wrapper around :func:`generate_wiki`."""

    name = "wiki_generator"
    description = "Generate a GitHub wiki from knowledge graph and artifacts"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        # Render wiki pages from the context's knowledge graph. The pages
        # travel in the artifact metadata; callers persist them to disk
        # via write_wiki().
        pages = generate_wiki(
            context.knowledge_graph.to_dict(),
            artifacts=context.artifacts,
            title=kwargs.get("title", "Knowledge Base"),
        )

        summary_lines = [f"Generated {len(pages)} wiki pages:", ""]
        summary_lines.extend(f"- {page}.md" for page in sorted(pages))

        return Artifact(
            name="Wiki",
            content="\n".join(summary_lines),
            artifact_type="wiki",
            format="markdown",
            metadata={"pages": pages},
        )


register_skill(WikiGeneratorSkill())
| --- a/video_processor/agent/skills/wiki_generator.py | |
| +++ b/video_processor/agent/skills/wiki_generator.py | |
| @@ -0,0 +1,315 @@ | |
| --- a/video_processor/agent/skills/wiki_generator.py | |
| +++ b/video_processor/agent/skills/wiki_generator.py | |
| @@ -0,0 +1,315 @@ | |
| 1 | """Skill: Generate a GitHub wiki from knowledge graph and artifacts.""" |
| 2 | |
| 3 | import json |
| 4 | import logging |
| 5 | import subprocess |
| 6 | from pathlib import Path |
| 7 | from typing import Dict, List, Optional |
| 8 | |
| 9 | from video_processor.agent.skills.base import ( |
| 10 | AgentContext, |
| 11 | Artifact, |
| 12 | Skill, |
| 13 | register_skill, |
| 14 | ) |
| 15 | |
| 16 | logger = logging.getLogger(__name__) |
| 17 | |
| 18 | |
| 19 | def _sanitize_filename(name: str) -> str: |
| 20 | """Convert entity name to a wiki-safe filename.""" |
| 21 | return name.replace("/", "-").replace("\\", "-").replace(" ", "-").replace(".", "-") |
| 22 | |
| 23 | |
def _wiki_link(name: str) -> str:
    """Render *name* as a Markdown link targeting its sanitized wiki page."""
    return "[{}]({})".format(name, _sanitize_filename(name))
| 28 | |
| 29 | |
def generate_wiki(
    kg_data: dict,
    artifacts: Optional[List[Artifact]] = None,
    title: str = "Knowledge Base",
) -> Dict[str, str]:
    """Generate a dict of {filename: markdown_content} for a GitHub wiki.

    Returns pages for: Home, _Sidebar, entity type indexes, individual
    entity pages, and any planning artifacts.

    Args:
        kg_data: Knowledge-graph dump with "nodes" and "relationships" lists.
        artifacts: Optional planning artifacts rendered as extra pages.
        title: Heading used on the Home page and sidebar.
    """
    pages: Dict[str, str] = {}
    artifacts = artifacts or []

    nodes = kg_data.get("nodes", [])
    relationships = kg_data.get("relationships", [])

    # Group entities by type. (A name->node lookup table was built here
    # previously but never read; removed as dead code.)
    by_type: Dict[str, list] = {}
    for node in nodes:
        by_type.setdefault(node.get("type", "concept"), []).append(node)

    # Build relationship index (outgoing and incoming per entity)
    outgoing: Dict[str, list] = {}
    incoming: Dict[str, list] = {}
    for rel in relationships:
        src = rel.get("source", "")
        tgt = rel.get("target", "")
        rtype = rel.get("type", "related_to")
        outgoing.setdefault(src, []).append((tgt, rtype))
        incoming.setdefault(tgt, []).append((src, rtype))

    # --- Home page ---
    home_parts = [
        f"# {title}",
        "",
        f"**{len(nodes)}** entities | **{len(relationships)}** relationships",
        "",
        "## Entity Types",
        "",
    ]
    for etype, elist in sorted(by_type.items()):
        home_parts.append(f"- {_wiki_link(etype.title())} ({len(elist)})")

    if artifacts:
        home_parts.append("")
        home_parts.append("## Planning Artifacts")
        home_parts.append("")
        for art in artifacts:
            safe = _sanitize_filename(art.name)
            home_parts.append(f"- [{art.name}]({safe})")

    pages["Home"] = "\n".join(home_parts)

    # --- Sidebar ---
    sidebar_parts = [f"**{title}**", "", "**Navigation**", "", "- [Home](Home)", ""]
    sidebar_parts.append("**Entity Types**")
    sidebar_parts.append("")
    for etype in sorted(by_type.keys()):
        sidebar_parts.append(f"- {_wiki_link(etype.title())}")

    if artifacts:
        sidebar_parts.append("")
        sidebar_parts.append("**Artifacts**")
        sidebar_parts.append("")
        for art in artifacts:
            safe = _sanitize_filename(art.name)
            sidebar_parts.append(f"- [{art.name}]({safe})")

    pages["_Sidebar"] = "\n".join(sidebar_parts)

    # --- Type index pages ---
    # NOTE(review): a type page (e.g. "Person") can collide with an entity
    # page of the same sanitized name; the later write wins — confirm this
    # is acceptable.
    for etype, elist in sorted(by_type.items()):
        page_name = _sanitize_filename(etype.title())
        parts = [
            f"# {etype.title()}",
            "",
            f"{len(elist)} entities of type **{etype}**.",
            "",
            "| Entity | Descriptions |",
            "|--------|-------------|",
        ]
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            descs = node.get("descriptions", [])
            # Show at most two descriptions to keep the index table compact.
            desc_text = "; ".join(descs[:2]) if descs else "—"
            parts.append(f"| {_wiki_link(name)} | {desc_text} |")

        pages[page_name] = "\n".join(parts)

    # --- Individual entity pages ---
    for node in nodes:
        name = node.get("name", "")
        if not name:
            continue  # unnamed nodes cannot get a page filename
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        page_name = _sanitize_filename(name)

        parts = [
            f"# {name}",
            "",
            f"**Type:** {ntype}",
            "",
        ]

        if descs:
            parts.append("## Descriptions")
            parts.append("")
            for d in descs:
                parts.append(f"- {d}")
            parts.append("")

        # Outgoing relationships
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            parts.append("| Target | Type |")
            parts.append("|--------|------|")
            for tgt, rtype in outs:
                parts.append(f"| {_wiki_link(tgt)} | {rtype} |")
            parts.append("")

        # Incoming relationships
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced By")
            parts.append("")
            parts.append("| Source | Type |")
            parts.append("|--------|------|")
            for src, rtype in ins:
                parts.append(f"| {_wiki_link(src)} | {rtype} |")
            parts.append("")

        # Occurrences / sources
        occs = node.get("occurrences", [])
        if occs:
            parts.append("## Sources")
            parts.append("")
            for occ in occs:
                src = occ.get("source", "unknown")
                ts = occ.get("timestamp", "")
                text = occ.get("text", "")
                line = f"- **{src}**"
                if ts:
                    line += f" @ {ts}"
                if text:
                    line += f": _{text}_"
                parts.append(line)
            parts.append("")

        pages[page_name] = "\n".join(parts)

    # --- Artifact pages ---
    for art in artifacts:
        page_name = _sanitize_filename(art.name)
        if art.format == "json":
            # Pretty-print valid JSON in a fenced block; fall back to raw text.
            try:
                data = json.loads(art.content)
                content = f"```json\n{json.dumps(data, indent=2)}\n```"
            except json.JSONDecodeError:
                content = art.content
        else:
            content = art.content

        pages[page_name] = f"# {art.name}\n\n{content}"

    return pages
| 202 | |
| 203 | |
def write_wiki(pages: Dict[str, str], output_dir: Path) -> List[Path]:
    """Materialize wiki *pages* as ``<name>.md`` files inside *output_dir*."""
    output_dir.mkdir(parents=True, exist_ok=True)

    def _emit(page_name: str, body: str) -> Path:
        # One file per page; content written verbatim as UTF-8.
        target = output_dir / f"{page_name}.md"
        target.write_text(body, encoding="utf-8")
        return target

    return [_emit(page_name, body) for page_name, body in pages.items()]
| 213 | |
| 214 | |
def push_wiki(wiki_dir: Path, repo: str, message: str = "Update wiki") -> bool:
    """Push wiki pages to a GitHub wiki repo.

    Clones the wiki repo, copies pages, commits and pushes.
    The repo should be in 'owner/repo' format.

    Args:
        wiki_dir: Directory holding the generated ``*.md`` pages.
        repo: Target repository in ``owner/repo`` form.
        message: Commit message for the wiki update.

    Returns:
        True on success (including "nothing to commit"); False when the
        final push fails.
    """
    import shutil  # local import: only this push path needs it

    wiki_url = f"https://github.com/{repo}.wiki.git"

    # Clone existing wiki (or init if empty). Remove any stale clone first;
    # shutil.rmtree is portable, unlike shelling out to `rm -rf`.
    clone_dir = wiki_dir / ".wiki_clone"
    if clone_dir.exists():
        shutil.rmtree(clone_dir)

    result = subprocess.run(
        ["git", "clone", wiki_url, str(clone_dir)],
        capture_output=True,
        text=True,
    )

    if result.returncode != 0:
        # Wiki might not exist yet — init a new repo
        clone_dir.mkdir(parents=True, exist_ok=True)
        subprocess.run(["git", "init"], cwd=clone_dir, capture_output=True)
        subprocess.run(
            ["git", "remote", "add", "origin", wiki_url],
            cwd=clone_dir,
            capture_output=True,
        )

    # Copy top-level wiki pages into the clone. glob("*.md") is non-recursive,
    # so files inside .wiki_clone itself are never picked up.
    for md_file in wiki_dir.glob("*.md"):
        dest = clone_dir / md_file.name
        dest.write_text(md_file.read_text(encoding="utf-8"), encoding="utf-8")

    # Commit and push
    subprocess.run(["git", "add", "-A"], cwd=clone_dir, capture_output=True)
    commit_result = subprocess.run(
        ["git", "commit", "-m", message],
        cwd=clone_dir,
        capture_output=True,
        text=True,
    )
    if commit_result.returncode != 0:
        # Usually "nothing to commit"; surface stderr so other failures
        # (e.g. missing git identity) remain diagnosable.
        logger.info("No wiki changes to commit (%s)", commit_result.stderr.strip())
        return True

    push_result = subprocess.run(
        ["git", "push", "origin", "master"],
        cwd=clone_dir,
        capture_output=True,
        text=True,
    )
    if push_result.returncode != 0:
        # GitHub wikis historically default to `master`; fall back to `main`.
        push_result = subprocess.run(
            ["git", "push", "origin", "main"],
            cwd=clone_dir,
            capture_output=True,
            text=True,
        )

    if push_result.returncode == 0:
        logger.info("Wiki pushed to %s", wiki_url)
        return True
    logger.error("Wiki push failed: %s", push_result.stderr)
    return False
| 283 | |
| 284 | |
class WikiGeneratorSkill(Skill):
    """Skill wrapper around :func:`generate_wiki`."""

    name = "wiki_generator"
    description = "Generate a GitHub wiki from knowledge graph and artifacts"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        # Render wiki pages from the context's knowledge graph. The pages
        # travel in the artifact metadata; callers persist them to disk
        # via write_wiki().
        pages = generate_wiki(
            context.knowledge_graph.to_dict(),
            artifacts=context.artifacts,
            title=kwargs.get("title", "Knowledge Base"),
        )

        summary_lines = [f"Generated {len(pages)} wiki pages:", ""]
        summary_lines.extend(f"- {page}.md" for page in sorted(pages))

        return Artifact(
            name="Wiki",
            content="\n".join(summary_lines),
            artifact_type="wiki",
            format="markdown",
            metadata={"pages": pages},
        )


register_skill(WikiGeneratorSkill())
| --- video_processor/cli/commands.py | ||
| +++ video_processor/cli/commands.py | ||
| @@ -1412,10 +1412,71 @@ | ||
| 1412 | 1412 | click.echo(f" Chunks: {total_chunks}") |
| 1413 | 1413 | click.echo(f" Entities: {entity_count}") |
| 1414 | 1414 | click.echo(f" Relationships: {rel_count}") |
| 1415 | 1415 | click.echo(f" Knowledge graph: {kg_path}") |
| 1416 | 1416 | |
| 1417 | + | |
@cli.group()
def wiki():
    """Generate and push GitHub wikis from knowledge graphs."""
    # Container group only; behavior lives in the `generate`/`push` subcommands.
    pass
| 1422 | + | |
| 1423 | + | |
@wiki.command("generate")
@click.argument("db_path", type=click.Path(exists=True))
@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory for wiki")
@click.option("--title", type=str, default="Knowledge Base", help="Wiki title")
def wiki_generate(db_path, output, title):
    """Generate a GitHub wiki from a knowledge graph.

    Examples:

    planopticon wiki generate knowledge_graph.db -o ./wiki

    planopticon wiki generate results/kg.db --title "Project Wiki"
    """
    # Imported lazily so bare `planopticon --help` does not pay for the
    # knowledge-graph / skills dependency chain.
    from video_processor.agent.skills.wiki_generator import generate_wiki, write_wiki
    from video_processor.integrators.knowledge_graph import KnowledgeGraph

    db_path = Path(db_path)
    # Default output location: ./wiki under the current working directory.
    out_dir = Path(output) if output else Path.cwd() / "wiki"

    kg = KnowledgeGraph(db_path=db_path)
    kg_data = kg.to_dict()
    pages = generate_wiki(kg_data, title=title)
    written = write_wiki(pages, out_dir)

    click.echo(f"Generated {len(written)} wiki pages in {out_dir}")
    for p in sorted(written):
        click.echo(f"  {p.name}")
| 1451 | + | |
| 1452 | + | |
@wiki.command("push")
@click.argument("wiki_dir", type=click.Path(exists=True))
@click.argument("repo", type=str)
@click.option("--message", "-m", type=str, default="Update wiki", help="Commit message")
def wiki_push(wiki_dir, repo, message):
    """Push generated wiki pages to a GitHub wiki repo.

    REPO should be in 'owner/repo' format.

    Examples:

    planopticon wiki push ./wiki ConflictHQ/PlanOpticon

    planopticon wiki push ./wiki owner/repo -m "Add entity pages"
    """
    # Lazy import keeps CLI startup light.
    from video_processor.agent.skills.wiki_generator import push_wiki

    wiki_dir = Path(wiki_dir)
    success = push_wiki(wiki_dir, repo, message=message)
    if success:
        click.echo(f"Wiki pushed to https://github.com/{repo}/wiki")
    else:
        # Non-zero exit so scripts/CI can detect the failure.
        click.echo("Wiki push failed. Check auth and repo permissions.", err=True)
        sys.exit(1)
| 1477 | + | |
| 1417 | 1478 | |
@cli.group()
def kg():
    """Knowledge graph utilities: convert, sync, and inspect."""
    # Container group only; subcommands are registered elsewhere in this file.
    pass
| 1422 | 1483 |
| --- video_processor/cli/commands.py | |
| +++ video_processor/cli/commands.py | |
| @@ -1412,10 +1412,71 @@ | |
| 1412 | click.echo(f" Chunks: {total_chunks}") |
| 1413 | click.echo(f" Entities: {entity_count}") |
| 1414 | click.echo(f" Relationships: {rel_count}") |
| 1415 | click.echo(f" Knowledge graph: {kg_path}") |
| 1416 | |
| 1417 | |
@cli.group()
def kg():
    """Knowledge graph utilities: convert, sync, and inspect."""
    # Container group only; subcommands are registered elsewhere in this file.
    pass
| 1422 |
| --- video_processor/cli/commands.py | |
| +++ video_processor/cli/commands.py | |
| @@ -1412,10 +1412,71 @@ | |
| 1412 | click.echo(f" Chunks: {total_chunks}") |
| 1413 | click.echo(f" Entities: {entity_count}") |
| 1414 | click.echo(f" Relationships: {rel_count}") |
| 1415 | click.echo(f" Knowledge graph: {kg_path}") |
| 1416 | |
| 1417 | |
@cli.group()
def wiki():
    """Generate and push GitHub wikis from knowledge graphs."""
    # Container group only; behavior lives in the `generate`/`push` subcommands.
    pass
| 1422 | |
| 1423 | |
@wiki.command("generate")
@click.argument("db_path", type=click.Path(exists=True))
@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory for wiki")
@click.option("--title", type=str, default="Knowledge Base", help="Wiki title")
def wiki_generate(db_path, output, title):
    """Generate a GitHub wiki from a knowledge graph.

    Examples:

    planopticon wiki generate knowledge_graph.db -o ./wiki

    planopticon wiki generate results/kg.db --title "Project Wiki"
    """
    # Imported lazily so bare `planopticon --help` does not pay for the
    # knowledge-graph / skills dependency chain.
    from video_processor.agent.skills.wiki_generator import generate_wiki, write_wiki
    from video_processor.integrators.knowledge_graph import KnowledgeGraph

    db_path = Path(db_path)
    # Default output location: ./wiki under the current working directory.
    out_dir = Path(output) if output else Path.cwd() / "wiki"

    kg = KnowledgeGraph(db_path=db_path)
    kg_data = kg.to_dict()
    pages = generate_wiki(kg_data, title=title)
    written = write_wiki(pages, out_dir)

    click.echo(f"Generated {len(written)} wiki pages in {out_dir}")
    for p in sorted(written):
        click.echo(f"  {p.name}")
| 1451 | |
| 1452 | |
@wiki.command("push")
@click.argument("wiki_dir", type=click.Path(exists=True))
@click.argument("repo", type=str)
@click.option("--message", "-m", type=str, default="Update wiki", help="Commit message")
def wiki_push(wiki_dir, repo, message):
    """Push generated wiki pages to a GitHub wiki repo.

    REPO should be in 'owner/repo' format.

    Examples:

    planopticon wiki push ./wiki ConflictHQ/PlanOpticon

    planopticon wiki push ./wiki owner/repo -m "Add entity pages"
    """
    # Lazy import keeps CLI startup light.
    from video_processor.agent.skills.wiki_generator import push_wiki

    wiki_dir = Path(wiki_dir)
    success = push_wiki(wiki_dir, repo, message=message)
    if success:
        click.echo(f"Wiki pushed to https://github.com/{repo}/wiki")
    else:
        # Non-zero exit so scripts/CI can detect the failure.
        click.echo("Wiki push failed. Check auth and repo permissions.", err=True)
        sys.exit(1)
| 1477 | |
| 1478 | |
@cli.group()
def kg():
    """Knowledge graph utilities: convert, sync, and inspect."""
    # Container group only; subcommands are registered elsewhere in this file.
    pass
| 1483 |
+16
-4
| --- video_processor/sources/__init__.py | ||
| +++ video_processor/sources/__init__.py | ||
| @@ -1,36 +1,48 @@ | ||
| 1 | -"""Cloud and web source integrations for fetching content from remote sources.""" | |
| 1 | +"""Cloud, web, and notes source integrations for fetching content from remote sources.""" | |
| 2 | 2 | |
| 3 | 3 | from video_processor.sources.base import BaseSource, SourceFile |
| 4 | 4 | |
| 5 | 5 | __all__ = [ |
| 6 | 6 | "BaseSource", |
| 7 | 7 | "SourceFile", |
| 8 | + "AppleNotesSource", | |
| 8 | 9 | "ArxivSource", |
| 9 | 10 | "GitHubSource", |
| 10 | 11 | "GoogleDriveSource", |
| 12 | + "GoogleKeepSource", | |
| 13 | + "GWSSource", | |
| 11 | 14 | "HackerNewsSource", |
| 15 | + "LogseqSource", | |
| 16 | + "M365Source", | |
| 17 | + "NotionSource", | |
| 18 | + "ObsidianSource", | |
| 19 | + "OneNoteSource", | |
| 12 | 20 | "PodcastSource", |
| 13 | 21 | "RedditSource", |
| 14 | 22 | "RSSSource", |
| 15 | 23 | "TwitterSource", |
| 16 | - "GWSSource", | |
| 17 | - "M365Source", | |
| 18 | 24 | "WebSource", |
| 19 | 25 | "YouTubeSource", |
| 20 | 26 | ] |
| 21 | 27 | |
| 22 | 28 | |
| 23 | 29 | def __getattr__(name: str): |
| 24 | 30 | """Lazy imports to avoid pulling in optional dependencies at import time.""" |
| 25 | 31 | _lazy_map = { |
| 32 | + "AppleNotesSource": "video_processor.sources.apple_notes_source", | |
| 26 | 33 | "ArxivSource": "video_processor.sources.arxiv_source", |
| 27 | 34 | "GitHubSource": "video_processor.sources.github_source", |
| 28 | 35 | "GoogleDriveSource": "video_processor.sources.google_drive", |
| 36 | + "GoogleKeepSource": "video_processor.sources.google_keep_source", | |
| 29 | 37 | "GWSSource": "video_processor.sources.gws_source", |
| 30 | - "M365Source": "video_processor.sources.m365_source", | |
| 31 | 38 | "HackerNewsSource": "video_processor.sources.hackernews_source", |
| 39 | + "LogseqSource": "video_processor.sources.logseq_source", | |
| 40 | + "M365Source": "video_processor.sources.m365_source", | |
| 41 | + "NotionSource": "video_processor.sources.notion_source", | |
| 42 | + "ObsidianSource": "video_processor.sources.obsidian_source", | |
| 43 | + "OneNoteSource": "video_processor.sources.onenote_source", | |
| 32 | 44 | "PodcastSource": "video_processor.sources.podcast_source", |
| 33 | 45 | "RedditSource": "video_processor.sources.reddit_source", |
| 34 | 46 | "RSSSource": "video_processor.sources.rss_source", |
| 35 | 47 | "TwitterSource": "video_processor.sources.twitter_source", |
| 36 | 48 | "WebSource": "video_processor.sources.web_source", |
| 37 | 49 | |
| 38 | 50 | ADDED video_processor/sources/apple_notes_source.py |
| 39 | 51 | ADDED video_processor/sources/google_keep_source.py |
| 40 | 52 | ADDED video_processor/sources/logseq_source.py |
| 41 | 53 | ADDED video_processor/sources/notion_source.py |
| 42 | 54 | ADDED video_processor/sources/obsidian_source.py |
| 43 | 55 | ADDED video_processor/sources/onenote_source.py |
| --- video_processor/sources/__init__.py | |
| +++ video_processor/sources/__init__.py | |
| @@ -1,36 +1,48 @@ | |
| 1 | """Cloud and web source integrations for fetching content from remote sources.""" |
| 2 | |
| 3 | from video_processor.sources.base import BaseSource, SourceFile |
| 4 | |
| 5 | __all__ = [ |
| 6 | "BaseSource", |
| 7 | "SourceFile", |
| 8 | "ArxivSource", |
| 9 | "GitHubSource", |
| 10 | "GoogleDriveSource", |
| 11 | "HackerNewsSource", |
| 12 | "PodcastSource", |
| 13 | "RedditSource", |
| 14 | "RSSSource", |
| 15 | "TwitterSource", |
| 16 | "GWSSource", |
| 17 | "M365Source", |
| 18 | "WebSource", |
| 19 | "YouTubeSource", |
| 20 | ] |
| 21 | |
| 22 | |
| 23 | def __getattr__(name: str): |
| 24 | """Lazy imports to avoid pulling in optional dependencies at import time.""" |
| 25 | _lazy_map = { |
| 26 | "ArxivSource": "video_processor.sources.arxiv_source", |
| 27 | "GitHubSource": "video_processor.sources.github_source", |
| 28 | "GoogleDriveSource": "video_processor.sources.google_drive", |
| 29 | "GWSSource": "video_processor.sources.gws_source", |
| 30 | "M365Source": "video_processor.sources.m365_source", |
| 31 | "HackerNewsSource": "video_processor.sources.hackernews_source", |
| 32 | "PodcastSource": "video_processor.sources.podcast_source", |
| 33 | "RedditSource": "video_processor.sources.reddit_source", |
| 34 | "RSSSource": "video_processor.sources.rss_source", |
| 35 | "TwitterSource": "video_processor.sources.twitter_source", |
| 36 | "WebSource": "video_processor.sources.web_source", |
| 37 | |
| 38 | DDED video_processor/sources/apple_notes_source.py |
| 39 | DDED video_processor/sources/google_keep_source.py |
| 40 | DDED video_processor/sources/logseq_source.py |
| 41 | DDED video_processor/sources/notion_source.py |
| 42 | DDED video_processor/sources/obsidian_source.py |
| 43 | DDED video_processor/sources/onenote_source.py |
| --- video_processor/sources/__init__.py | |
| +++ video_processor/sources/__init__.py | |
| @@ -1,36 +1,48 @@ | |
"""Cloud, web, and notes source integrations for fetching content from remote sources."""

from video_processor.sources.base import BaseSource, SourceFile

# Public API. Base classes first, then concrete sources alphabetically.
# Only BaseSource/SourceFile are imported eagerly; the remaining names are
# resolved on demand by the module-level __getattr__ below (PEP 562), so
# optional dependencies are not pulled in at import time.
__all__ = [
    "BaseSource",
    "SourceFile",
    "AppleNotesSource",
    "ArxivSource",
    "GitHubSource",
    "GoogleDriveSource",
    "GoogleKeepSource",
    "GWSSource",
    "HackerNewsSource",
    "LogseqSource",
    "M365Source",
    "NotionSource",
    "ObsidianSource",
    "OneNoteSource",
    "PodcastSource",
    "RedditSource",
    "RSSSource",
    "TwitterSource",
    "WebSource",
    "YouTubeSource",
]
| 27 | |
| 28 | |
| 29 | def __getattr__(name: str): |
| 30 | """Lazy imports to avoid pulling in optional dependencies at import time.""" |
| 31 | _lazy_map = { |
| 32 | "AppleNotesSource": "video_processor.sources.apple_notes_source", |
| 33 | "ArxivSource": "video_processor.sources.arxiv_source", |
| 34 | "GitHubSource": "video_processor.sources.github_source", |
| 35 | "GoogleDriveSource": "video_processor.sources.google_drive", |
| 36 | "GoogleKeepSource": "video_processor.sources.google_keep_source", |
| 37 | "GWSSource": "video_processor.sources.gws_source", |
| 38 | "HackerNewsSource": "video_processor.sources.hackernews_source", |
| 39 | "LogseqSource": "video_processor.sources.logseq_source", |
| 40 | "M365Source": "video_processor.sources.m365_source", |
| 41 | "NotionSource": "video_processor.sources.notion_source", |
| 42 | "ObsidianSource": "video_processor.sources.obsidian_source", |
| 43 | "OneNoteSource": "video_processor.sources.onenote_source", |
| 44 | "PodcastSource": "video_processor.sources.podcast_source", |
| 45 | "RedditSource": "video_processor.sources.reddit_source", |
| 46 | "RSSSource": "video_processor.sources.rss_source", |
| 47 | "TwitterSource": "video_processor.sources.twitter_source", |
| 48 | "WebSource": "video_processor.sources.web_source", |
| 49 | |
| 50 | ADDED video_processor/sources/apple_notes_source.py
| 51 | ADDED video_processor/sources/google_keep_source.py
| 52 | ADDED video_processor/sources/logseq_source.py
| 53 | ADDED video_processor/sources/notion_source.py
| 54 | ADDED video_processor/sources/obsidian_source.py
| 55 | ADDED video_processor/sources/onenote_source.py
| --- a/video_processor/sources/apple_notes_source.py | ||
| +++ b/video_processor/sources/apple_notes_source.py | ||
| @@ -0,0 +1,178 @@ | ||
| 1 | +"""Apple Notes source connector via osascript (macOS only).""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import re | |
| 5 | +import subprocess | |
| 6 | +import sys | |
| 7 | +from pathlib import Path | |
| 8 | +from typing import List, Optional | |
| 9 | + | |
| 10 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 11 | + | |
| 12 | +logger = logging.getLogger(__name__) | |
| 13 | + | |
| 14 | + | |
class AppleNotesSource(BaseSource):
    """
    Fetch notes from Apple Notes using osascript (AppleScript).

    Only works on macOS. Requires the Notes app to be available
    and permission for osascript to access it.
    """

    def __init__(self, folder: Optional[str] = None):
        # When set, only notes inside this named folder are listed.
        self.folder = folder

    @staticmethod
    def _escape_applescript(value: str) -> str:
        """Escape a value for safe embedding in an AppleScript string literal.

        Backslashes and double quotes would otherwise terminate the literal
        and break (or inject into) the generated script.
        """
        return value.replace("\\", "\\\\").replace('"', '\\"')

    def authenticate(self) -> bool:
        """Check that we are running on macOS."""
        if sys.platform != "darwin":
            logger.error("Apple Notes is only available on macOS (current: %s)", sys.platform)
            return False
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List notes from Apple Notes via osascript.

        The folder_id / folder_path / patterns parameters are accepted for
        interface compatibility and are not used by this connector.
        """
        if not self.authenticate():
            return []

        if self.folder:
            # Escape the folder name before splicing it into the script so a
            # quote in the name cannot break the AppleScript source.
            safe_folder = self._escape_applescript(self.folder)
            script = (
                'tell application "Notes"\n'
                "    set noteList to {}\n"
                "    repeat with f in folders of default account\n"
                f'        if name of f is "{safe_folder}" then\n'
                "            repeat with n in notes of f\n"
                '                set end of noteList to (id of n) & "|||" & (name of n)\n'
                "            end repeat\n"
                "        end if\n"
                "    end repeat\n"
                "    return noteList\n"
                "end tell"
            )
        else:
            script = (
                'tell application "Notes"\n'
                "    set noteList to {}\n"
                "    repeat with n in notes of default account\n"
                '        set end of noteList to (id of n) & "|||" & (name of n)\n'
                "    end repeat\n"
                "    return noteList\n"
                "end tell"
            )

        try:
            result = subprocess.run(
                ["osascript", "-e", script],
                capture_output=True,
                text=True,
                timeout=30,
            )
        except FileNotFoundError:
            logger.error("osascript not found. Apple Notes requires macOS.")
            return []
        except subprocess.TimeoutExpired:
            logger.error("osascript timed out while listing notes.")
            return []

        if result.returncode != 0:
            logger.error("Failed to list notes: %s", result.stderr.strip())
            return []

        return self._parse_note_list(result.stdout.strip())

    def _parse_note_list(self, output: str) -> List[SourceFile]:
        """Parse osascript output into SourceFile objects.

        Expected format: comma-separated items of 'id|||name' pairs.
        """
        files: List[SourceFile] = []
        if not output:
            return files

        # AppleScript returns a flat comma-separated list
        entries = output.split(", ")
        for entry in entries:
            entry = entry.strip()
            if "|||" not in entry:
                continue
            note_id, _, name = entry.partition("|||")
            note_id = note_id.strip()
            name = name.strip()
            if note_id and name:
                files.append(
                    SourceFile(
                        name=name,
                        id=note_id,
                        mime_type="text/plain",
                    )
                )

        logger.info("Found %d notes", len(files))
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a note's content as plain text.

        Raises RuntimeError when osascript is missing, times out, or the
        note cannot be fetched.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        # Escape the id so a quote in it cannot break the script.
        safe_id = self._escape_applescript(file.id)
        script = (
            'tell application "Notes"\n'
            f'    set theNote to note id "{safe_id}" of default account\n'
            "    return body of theNote\n"
            "end tell"
        )

        try:
            result = subprocess.run(
                ["osascript", "-e", script],
                capture_output=True,
                text=True,
                timeout=30,
            )
        except FileNotFoundError:
            raise RuntimeError("osascript not found. Apple Notes requires macOS.")
        except subprocess.TimeoutExpired:
            raise RuntimeError(f"osascript timed out fetching note {file.id}")

        if result.returncode != 0:
            raise RuntimeError(f"Failed to fetch note {file.id}: {result.stderr.strip()}")

        html_body = result.stdout.strip()
        text = self._html_to_text(html_body)

        # Prepend title
        content = f"# {file.name}\n\n{text}"
        destination.write_text(content, encoding="utf-8")
        logger.info("Saved Apple Note to %s", destination)
        return destination

    @staticmethod
    def _html_to_text(html: str) -> str:
        """Strip HTML tags and return plain text.

        Apple Notes returns note bodies as HTML. This uses regex-based
        stripping similar to web_source._strip_html_tags.
        """
        if not html:
            return ""
        # Replace <br> variants with newlines
        text = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
        # Replace block-level closing tags with newlines
        text = re.sub(r"</(?:p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE)
        # Remove all remaining tags
        text = re.sub(r"<[^>]+>", "", text)
        # Decode common HTML entities. "&amp;" must be decoded LAST: decoding
        # it first would double-decode e.g. "&amp;lt;" into "<".
        text = text.replace("&lt;", "<")
        text = text.replace("&gt;", ">")
        text = text.replace("&quot;", '"')
        text = text.replace("&#39;", "'")
        text = text.replace("&nbsp;", " ")
        text = text.replace("&amp;", "&")
        # Collapse excessive blank lines
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()
| --- a/video_processor/sources/apple_notes_source.py | |
| +++ b/video_processor/sources/apple_notes_source.py | |
| @@ -0,0 +1,178 @@ | |
| --- a/video_processor/sources/apple_notes_source.py | |
| +++ b/video_processor/sources/apple_notes_source.py | |
| @@ -0,0 +1,178 @@ | |
| 1 | """Apple Notes source connector via osascript (macOS only).""" |
| 2 | |
| 3 | import logging |
| 4 | import re |
| 5 | import subprocess |
| 6 | import sys |
| 7 | from pathlib import Path |
| 8 | from typing import List, Optional |
| 9 | |
| 10 | from video_processor.sources.base import BaseSource, SourceFile |
| 11 | |
| 12 | logger = logging.getLogger(__name__) |
| 13 | |
| 14 | |
class AppleNotesSource(BaseSource):
    """
    Fetch notes from Apple Notes using osascript (AppleScript).

    Only works on macOS. Requires the Notes app to be available
    and permission for osascript to access it.
    """

    def __init__(self, folder: Optional[str] = None):
        # When set, only notes inside this named folder are listed.
        self.folder = folder

    @staticmethod
    def _escape_applescript(value: str) -> str:
        """Escape a value for safe embedding in an AppleScript string literal.

        Backslashes and double quotes would otherwise terminate the literal
        and break (or inject into) the generated script.
        """
        return value.replace("\\", "\\\\").replace('"', '\\"')

    def authenticate(self) -> bool:
        """Check that we are running on macOS."""
        if sys.platform != "darwin":
            logger.error("Apple Notes is only available on macOS (current: %s)", sys.platform)
            return False
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List notes from Apple Notes via osascript.

        The folder_id / folder_path / patterns parameters are accepted for
        interface compatibility and are not used by this connector.
        """
        if not self.authenticate():
            return []

        if self.folder:
            # Escape the folder name before splicing it into the script so a
            # quote in the name cannot break the AppleScript source.
            safe_folder = self._escape_applescript(self.folder)
            script = (
                'tell application "Notes"\n'
                "    set noteList to {}\n"
                "    repeat with f in folders of default account\n"
                f'        if name of f is "{safe_folder}" then\n'
                "            repeat with n in notes of f\n"
                '                set end of noteList to (id of n) & "|||" & (name of n)\n'
                "            end repeat\n"
                "        end if\n"
                "    end repeat\n"
                "    return noteList\n"
                "end tell"
            )
        else:
            script = (
                'tell application "Notes"\n'
                "    set noteList to {}\n"
                "    repeat with n in notes of default account\n"
                '        set end of noteList to (id of n) & "|||" & (name of n)\n'
                "    end repeat\n"
                "    return noteList\n"
                "end tell"
            )

        try:
            result = subprocess.run(
                ["osascript", "-e", script],
                capture_output=True,
                text=True,
                timeout=30,
            )
        except FileNotFoundError:
            logger.error("osascript not found. Apple Notes requires macOS.")
            return []
        except subprocess.TimeoutExpired:
            logger.error("osascript timed out while listing notes.")
            return []

        if result.returncode != 0:
            logger.error("Failed to list notes: %s", result.stderr.strip())
            return []

        return self._parse_note_list(result.stdout.strip())

    def _parse_note_list(self, output: str) -> List[SourceFile]:
        """Parse osascript output into SourceFile objects.

        Expected format: comma-separated items of 'id|||name' pairs.
        """
        files: List[SourceFile] = []
        if not output:
            return files

        # AppleScript returns a flat comma-separated list
        entries = output.split(", ")
        for entry in entries:
            entry = entry.strip()
            if "|||" not in entry:
                continue
            note_id, _, name = entry.partition("|||")
            note_id = note_id.strip()
            name = name.strip()
            if note_id and name:
                files.append(
                    SourceFile(
                        name=name,
                        id=note_id,
                        mime_type="text/plain",
                    )
                )

        logger.info("Found %d notes", len(files))
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a note's content as plain text.

        Raises RuntimeError when osascript is missing, times out, or the
        note cannot be fetched.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        # Escape the id so a quote in it cannot break the script.
        safe_id = self._escape_applescript(file.id)
        script = (
            'tell application "Notes"\n'
            f'    set theNote to note id "{safe_id}" of default account\n'
            "    return body of theNote\n"
            "end tell"
        )

        try:
            result = subprocess.run(
                ["osascript", "-e", script],
                capture_output=True,
                text=True,
                timeout=30,
            )
        except FileNotFoundError:
            raise RuntimeError("osascript not found. Apple Notes requires macOS.")
        except subprocess.TimeoutExpired:
            raise RuntimeError(f"osascript timed out fetching note {file.id}")

        if result.returncode != 0:
            raise RuntimeError(f"Failed to fetch note {file.id}: {result.stderr.strip()}")

        html_body = result.stdout.strip()
        text = self._html_to_text(html_body)

        # Prepend title
        content = f"# {file.name}\n\n{text}"
        destination.write_text(content, encoding="utf-8")
        logger.info("Saved Apple Note to %s", destination)
        return destination

    @staticmethod
    def _html_to_text(html: str) -> str:
        """Strip HTML tags and return plain text.

        Apple Notes returns note bodies as HTML. This uses regex-based
        stripping similar to web_source._strip_html_tags.
        """
        if not html:
            return ""
        # Replace <br> variants with newlines
        text = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
        # Replace block-level closing tags with newlines
        text = re.sub(r"</(?:p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE)
        # Remove all remaining tags
        text = re.sub(r"<[^>]+>", "", text)
        # Decode common HTML entities. "&amp;" must be decoded LAST: decoding
        # it first would double-decode e.g. "&amp;lt;" into "<".
        text = text.replace("&lt;", "<")
        text = text.replace("&gt;", ">")
        text = text.replace("&quot;", '"')
        text = text.replace("&#39;", "'")
        text = text.replace("&nbsp;", " ")
        text = text.replace("&amp;", "&")
        # Collapse excessive blank lines
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()
| --- a/video_processor/sources/google_keep_source.py | ||
| +++ b/video_processor/sources/google_keep_source.py | ||
| @@ -0,0 +1,170 @@ | ||
| 1 | +"""Google Keep source connector using the gws CLI (googleworkspace/cli). | |
| 2 | + | |
| 3 | +Fetches notes from Google Keep via the `gws` CLI tool. | |
| 4 | +Outputs plain text suitable for KG ingestion. | |
| 5 | + | |
| 6 | +Requires: npm install -g @googleworkspace/cli | |
| 7 | +Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless) | |
| 8 | +""" | |
| 9 | + | |
| 10 | +import json | |
| 11 | +import logging | |
| 12 | +import shutil | |
| 13 | +import subprocess | |
| 14 | +from pathlib import Path | |
| 15 | +from typing import Any, Dict, List, Optional | |
| 16 | + | |
| 17 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 18 | + | |
| 19 | +logger = logging.getLogger(__name__) | |
| 20 | + | |
| 21 | + | |
def _run_gws(args: List[str], timeout: int = 30) -> Dict[str, Any]:
    """Invoke the gws CLI with *args* and return its parsed JSON output.

    A non-zero exit status raises RuntimeError carrying the CLI's stderr.
    Output that is not valid JSON is wrapped as ``{"raw": <stripped stdout>}``.
    """
    completed = subprocess.run(
        ["gws", *args],
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if completed.returncode != 0:
        raise RuntimeError(f"gws {' '.join(args)} failed: {completed.stderr.strip()}")
    try:
        return json.loads(completed.stdout)
    except json.JSONDecodeError:
        return {"raw": completed.stdout.strip()}
| 32 | + | |
| 33 | + | |
| 34 | +def _note_to_text(note: dict) -> str: | |
| 35 | + """Extract text content from a Google Keep note structure. | |
| 36 | + | |
| 37 | + Handles plain text notes and checklists. Checklist items are formatted | |
| 38 | + as ``- [x] item`` (checked) or ``- [ ] item`` (unchecked). | |
| 39 | + """ | |
| 40 | + parts: List[str] = [] | |
| 41 | + | |
| 42 | + title = note.get("title", "").strip() | |
| 43 | + if title: | |
| 44 | + parts.append(title) | |
| 45 | + | |
| 46 | + body = note.get("body", note.get("textContent", "")).strip() | |
| 47 | + if body: | |
| 48 | + parts.append(body) | |
| 49 | + | |
| 50 | + # Checklist items may appear under "list", "listContent", or "checklistItems" | |
| 51 | + list_items = note.get("list", note.get("listContent", note.get("checklistItems", []))) | |
| 52 | + if isinstance(list_items, list): | |
| 53 | + for item in list_items: | |
| 54 | + text = item.get("text", "").strip() | |
| 55 | + if not text: | |
| 56 | + continue | |
| 57 | + checked = item.get("checked", item.get("isChecked", False)) | |
| 58 | + marker = "[x]" if checked else "[ ]" | |
| 59 | + parts.append(f"- {marker} {text}") | |
| 60 | + | |
| 61 | + return "\n\n".join(parts) if parts else "" | |
| 62 | + | |
| 63 | + | |
class GoogleKeepSource(BaseSource):
    """
    Fetch notes from Google Keep via the gws CLI.

    Usage:
        source = GoogleKeepSource()  # all notes
        source = GoogleKeepSource(label="meetings")  # filter by label
        files = source.list_videos()
        source.download_all(files, Path("./notes"))
    """

    def __init__(self, label: Optional[str] = None):
        # When set, only notes carrying this Keep label are listed.
        self.label = label

    def authenticate(self) -> bool:
        """Check if gws CLI is installed and authenticated."""
        if not shutil.which("gws"):
            logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
            return False
        try:
            _run_gws(["auth", "status"], timeout=10)
            return True
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("gws not authenticated. Run: gws auth login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List notes in Google Keep. Returns one SourceFile per note.

        The folder_id / folder_path / patterns parameters are accepted for
        interface compatibility and are ignored here.
        """
        args = ["keep", "notes", "list", "--output", "json"]

        if self.label:
            args.extend(["--label", self.label])

        try:
            result = _run_gws(args, timeout=60)
        except RuntimeError as e:
            logger.error("Failed to list Keep notes: %s", e)
            return []

        # Result may be a list directly or nested under a key
        notes: List[dict] = []
        if isinstance(result, list):
            notes = result
        elif isinstance(result, dict):
            notes = result.get("notes", result.get("items", []))
            # If we got a single note back (not a list), wrap it
            if not notes and "id" in result and "raw" not in result:
                notes = [result]

        files: List[SourceFile] = []
        for note in notes:
            if not isinstance(note, dict):
                # Defensive: skip malformed entries from the CLI output.
                continue
            # Either key may be missing or explicitly null.
            note_id = note.get("id") or note.get("noteId") or ""
            # Title may be null, not just absent — guard before strip().
            title = (note.get("title") or "Untitled Note").strip() or "Untitled Note"
            modified = note.get("modifiedTime", note.get("updateTime"))

            # Estimate size from text content
            text = _note_to_text(note)
            size = len(text.encode("utf-8")) if text else None

            files.append(
                SourceFile(
                    name=title,
                    id=str(note_id),
                    size_bytes=size,
                    mime_type="text/plain",
                    modified_at=modified,
                )
            )

        logger.info("Found %d note(s) in Google Keep", len(files))
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a Keep note's content as a text file.

        Raises RuntimeError when the note cannot be fetched.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = _run_gws(
                [
                    "keep",
                    "notes",
                    "get",
                    "--params",
                    json.dumps({"noteId": file.id}),
                ],
                timeout=30,
            )
        except RuntimeError as e:
            raise RuntimeError(f"Failed to fetch Keep note {file.id}: {e}") from e

        # result may be the note dict directly or wrapped
        note = result if isinstance(result, dict) else {}
        text = _note_to_text(note)

        if not text:
            # Fallback: use raw output if structured extraction yielded nothing
            text = note.get("raw", json.dumps(note, indent=2))

        destination.write_text(text, encoding="utf-8")
        logger.info("Saved note '%s' to %s", file.name, destination)
        return destination
| --- a/video_processor/sources/google_keep_source.py | |
| +++ b/video_processor/sources/google_keep_source.py | |
| @@ -0,0 +1,170 @@ | |
| --- a/video_processor/sources/google_keep_source.py | |
| +++ b/video_processor/sources/google_keep_source.py | |
| @@ -0,0 +1,170 @@ | |
| 1 | """Google Keep source connector using the gws CLI (googleworkspace/cli). |
| 2 | |
| 3 | Fetches notes from Google Keep via the `gws` CLI tool. |
| 4 | Outputs plain text suitable for KG ingestion. |
| 5 | |
| 6 | Requires: npm install -g @googleworkspace/cli |
| 7 | Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless) |
| 8 | """ |
| 9 | |
| 10 | import json |
| 11 | import logging |
| 12 | import shutil |
| 13 | import subprocess |
| 14 | from pathlib import Path |
| 15 | from typing import Any, Dict, List, Optional |
| 16 | |
| 17 | from video_processor.sources.base import BaseSource, SourceFile |
| 18 | |
| 19 | logger = logging.getLogger(__name__) |
| 20 | |
| 21 | |
def _run_gws(args: List[str], timeout: int = 30) -> Dict[str, Any]:
    """Invoke the gws CLI with *args* and return its parsed JSON output.

    A non-zero exit status raises RuntimeError carrying the CLI's stderr.
    Output that is not valid JSON is wrapped as ``{"raw": <stripped stdout>}``.
    """
    completed = subprocess.run(
        ["gws", *args],
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if completed.returncode != 0:
        raise RuntimeError(f"gws {' '.join(args)} failed: {completed.stderr.strip()}")
    try:
        return json.loads(completed.stdout)
    except json.JSONDecodeError:
        return {"raw": completed.stdout.strip()}
| 32 | |
| 33 | |
| 34 | def _note_to_text(note: dict) -> str: |
| 35 | """Extract text content from a Google Keep note structure. |
| 36 | |
| 37 | Handles plain text notes and checklists. Checklist items are formatted |
| 38 | as ``- [x] item`` (checked) or ``- [ ] item`` (unchecked). |
| 39 | """ |
| 40 | parts: List[str] = [] |
| 41 | |
| 42 | title = note.get("title", "").strip() |
| 43 | if title: |
| 44 | parts.append(title) |
| 45 | |
| 46 | body = note.get("body", note.get("textContent", "")).strip() |
| 47 | if body: |
| 48 | parts.append(body) |
| 49 | |
| 50 | # Checklist items may appear under "list", "listContent", or "checklistItems" |
| 51 | list_items = note.get("list", note.get("listContent", note.get("checklistItems", []))) |
| 52 | if isinstance(list_items, list): |
| 53 | for item in list_items: |
| 54 | text = item.get("text", "").strip() |
| 55 | if not text: |
| 56 | continue |
| 57 | checked = item.get("checked", item.get("isChecked", False)) |
| 58 | marker = "[x]" if checked else "[ ]" |
| 59 | parts.append(f"- {marker} {text}") |
| 60 | |
| 61 | return "\n\n".join(parts) if parts else "" |
| 62 | |
| 63 | |
class GoogleKeepSource(BaseSource):
    """
    Fetch notes from Google Keep via the gws CLI.

    Usage:
        source = GoogleKeepSource()  # all notes
        source = GoogleKeepSource(label="meetings")  # filter by label
        files = source.list_videos()
        source.download_all(files, Path("./notes"))
    """

    def __init__(self, label: Optional[str] = None):
        # When set, only notes carrying this Keep label are listed.
        self.label = label

    def authenticate(self) -> bool:
        """Check if gws CLI is installed and authenticated."""
        if not shutil.which("gws"):
            logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
            return False
        try:
            _run_gws(["auth", "status"], timeout=10)
            return True
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("gws not authenticated. Run: gws auth login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List notes in Google Keep. Returns one SourceFile per note.

        The folder_id / folder_path / patterns parameters are accepted for
        interface compatibility and are ignored here.
        """
        args = ["keep", "notes", "list", "--output", "json"]

        if self.label:
            args.extend(["--label", self.label])

        try:
            result = _run_gws(args, timeout=60)
        except RuntimeError as e:
            logger.error("Failed to list Keep notes: %s", e)
            return []

        # Result may be a list directly or nested under a key
        notes: List[dict] = []
        if isinstance(result, list):
            notes = result
        elif isinstance(result, dict):
            notes = result.get("notes", result.get("items", []))
            # If we got a single note back (not a list), wrap it
            if not notes and "id" in result and "raw" not in result:
                notes = [result]

        files: List[SourceFile] = []
        for note in notes:
            if not isinstance(note, dict):
                # Defensive: skip malformed entries from the CLI output.
                continue
            # Either key may be missing or explicitly null.
            note_id = note.get("id") or note.get("noteId") or ""
            # Title may be null, not just absent — guard before strip().
            title = (note.get("title") or "Untitled Note").strip() or "Untitled Note"
            modified = note.get("modifiedTime", note.get("updateTime"))

            # Estimate size from text content
            text = _note_to_text(note)
            size = len(text.encode("utf-8")) if text else None

            files.append(
                SourceFile(
                    name=title,
                    id=str(note_id),
                    size_bytes=size,
                    mime_type="text/plain",
                    modified_at=modified,
                )
            )

        logger.info("Found %d note(s) in Google Keep", len(files))
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a Keep note's content as a text file.

        Raises RuntimeError when the note cannot be fetched.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = _run_gws(
                [
                    "keep",
                    "notes",
                    "get",
                    "--params",
                    json.dumps({"noteId": file.id}),
                ],
                timeout=30,
            )
        except RuntimeError as e:
            raise RuntimeError(f"Failed to fetch Keep note {file.id}: {e}") from e

        # result may be the note dict directly or wrapped
        note = result if isinstance(result, dict) else {}
        text = _note_to_text(note)

        if not text:
            # Fallback: use raw output if structured extraction yielded nothing
            text = note.get("raw", json.dumps(note, indent=2))

        destination.write_text(text, encoding="utf-8")
        logger.info("Saved note '%s' to %s", file.name, destination)
        return destination
| --- a/video_processor/sources/logseq_source.py | ||
| +++ b/video_processor/sources/logseq_source.py | ||
| @@ -0,0 +1,200 @@ | ||
| 1 | +"""Logseq graph source connector for ingesting markdown pages and journals.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import re | |
| 5 | +import shutil | |
| 6 | +from datetime import datetime, timezone | |
| 7 | +from pathlib import Path | |
| 8 | +from typing import List, Optional, Tuple | |
| 9 | + | |
| 10 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 11 | + | |
| 12 | +logger = logging.getLogger(__name__) | |
| 13 | + | |
| 14 | + | |
def parse_page(path: Path) -> dict:
    """Parse a Logseq markdown page and extract structured content.

    Returns a dict with:
    - properties: dict of page-level properties (key:: value lines at top)
    - links: list of linked page names from [[wiki-links]]
    - tags: list of tags from #tag and #[[tag]] occurrences
    - block_refs: list of block reference IDs from ((block-id))
    - body: full text content
    """
    raw = path.read_text(encoding="utf-8")
    all_lines = raw.split("\n")

    # Consume the leading run of ``key:: value`` property lines; the body
    # starts at the first line that is not a property.
    properties: dict = {}
    consumed = 0
    for line in all_lines:
        match = re.match(r"^([A-Za-z][A-Za-z0-9_-]*)::\ ?(.*)", line)
        if not match:
            break
        properties[match.group(1)] = match.group(2).strip()
        consumed += 1

    body = "\n".join(all_lines[consumed:])

    # Wiki-links ([[page]]) come from the body and from property values.
    wiki_link = re.compile(r"\[\[([^\]]+)\]\]")
    links = wiki_link.findall(body)
    for prop_value in properties.values():
        links += wiki_link.findall(str(prop_value))

    # Tags: collect #[[multi word]] first, then simple #tag on text with the
    # bracketed form stripped so nothing is matched twice.
    bracket_tag = re.compile(r"#\[\[([^\]]+)\]\]")
    tags = bracket_tag.findall(raw)
    tags += re.findall(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)", bracket_tag.sub("", raw))

    # Block references look like ((lowercase-hex-uuid)).
    block_refs = re.findall(r"\(\(([a-f0-9-]+)\)\)", raw)

    return {
        "properties": properties,
        "links": links,
        "tags": tags,
        "block_refs": block_refs,
        "body": body,
    }
| 71 | + | |
| 72 | + | |
def ingest_graph(graph_path: Path) -> dict:
    """Ingest an entire Logseq graph and return structured data.

    Returns a dict with:
    - notes: list of dicts with name, tags, frontmatter (properties), text
    - links: list of (source, target) tuples from wiki-links
    """
    graph_path = Path(graph_path)

    # Collect markdown files: pages/ first, then journals/, each sorted.
    md_files: List[Path] = []
    for subdir_name in ("pages", "journals"):
        subdir = graph_path / subdir_name
        if subdir.is_dir():
            md_files.extend(sorted(subdir.rglob("*.md")))

    logger.info("Found %d markdown files in graph %s", len(md_files), graph_path)

    notes: List[dict] = []
    links: List[Tuple[str, str]] = []

    for md_file in md_files:
        # Logseq names pages after the file stem.
        page_name = md_file.stem
        try:
            parsed = parse_page(md_file)
        except Exception:
            logger.warning("Failed to parse page %s", md_file)
            continue

        notes.append(
            {
                "name": page_name,
                "tags": parsed["tags"],
                "frontmatter": parsed["properties"],
                "text": parsed["body"],
            }
        )
        links.extend((page_name, target) for target in parsed["links"])

    logger.info(
        "Ingested %d notes with %d links from graph %s",
        len(notes),
        len(links),
        graph_path,
    )
    return {"notes": notes, "links": links}
| 122 | + | |
| 123 | + | |
class LogseqSource(BaseSource):
    """Source connector for Logseq graphs.

    Exposes a local Logseq graph directory (containing ``pages/`` and/or
    ``journals/`` sub-directories) through the BaseSource interface:
    markdown files are listed as SourceFile entries whose ``id`` and
    ``path`` are relative to the graph root, and download() copies the
    file out of the graph.
    """

    def __init__(self, graph_path: str) -> None:
        # Graph root; every SourceFile id/path produced below is relative
        # to this directory, and download() re-joins ids against it.
        self.graph_path = Path(graph_path)

    def authenticate(self) -> bool:
        """Check that the graph path exists and has pages/ or journals/ dirs.

        Returns True only when graph_path is a directory containing at
        least one of the two standard Logseq sub-directories.
        """
        if not self.graph_path.is_dir():
            logger.error("Graph path does not exist: %s", self.graph_path)
            return False
        has_pages = (self.graph_path / "pages").is_dir()
        has_journals = (self.graph_path / "journals").is_dir()
        # Either directory alone is enough to treat this as a valid graph.
        if not has_pages and not has_journals:
            logger.error(
                "No pages/ or journals/ directory found in graph: %s",
                self.graph_path,
            )
            return False
        logger.info(
            "Logseq graph authenticated: %s (pages=%s, journals=%s)",
            self.graph_path,
            has_pages,
            has_journals,
        )
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List .md files in pages/ and journals/ as SourceFile objects.

        When folder_path is given, only that sub-directory (relative to
        the graph root) is searched recursively; otherwise pages/ and
        journals/ are both scanned.

        NOTE(review): folder_id and patterns are accepted for interface
        parity with other sources but are not applied here — confirm
        whether pattern filtering should be implemented.
        """
        md_files: List[Path] = []

        pages_dir = self.graph_path / "pages"
        journals_dir = self.graph_path / "journals"

        if folder_path:
            search_root = self.graph_path / folder_path
            if search_root.is_dir():
                md_files.extend(sorted(search_root.rglob("*.md")))
        else:
            if pages_dir.is_dir():
                md_files.extend(sorted(pages_dir.rglob("*.md")))
            if journals_dir.is_dir():
                md_files.extend(sorted(journals_dir.rglob("*.md")))

        results: List[SourceFile] = []
        for md_file in md_files:
            # ids/paths are stored relative so download() can re-join them
            # with the graph root.
            relative = md_file.relative_to(self.graph_path)
            stat = md_file.stat()
            # mtimes are epoch seconds; expose them as UTC ISO-8601 strings.
            modified_dt = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc)

            results.append(
                SourceFile(
                    name=md_file.name,
                    id=str(relative),
                    size_bytes=stat.st_size,
                    mime_type="text/markdown",
                    modified_at=modified_dt.isoformat(),
                    path=str(relative),
                )
            )

        logger.info("Listed %d files from graph %s", len(results), self.graph_path)
        return results

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Copy a graph file to the destination path.

        The SourceFile id is interpreted as a path relative to the graph
        root; parent directories of destination are created as needed.
        """
        source = self.graph_path / file.id
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        # copy2 preserves file metadata (e.g. mtime) along with content.
        shutil.copy2(source, destination)
        logger.info("Copied %s -> %s", source, destination)
        return destination
| --- a/video_processor/sources/logseq_source.py | |
| +++ b/video_processor/sources/logseq_source.py | |
| @@ -0,0 +1,200 @@ | |
| --- a/video_processor/sources/logseq_source.py | |
| +++ b/video_processor/sources/logseq_source.py | |
| @@ -0,0 +1,200 @@ | |
| 1 | """Logseq graph source connector for ingesting markdown pages and journals.""" |
| 2 | |
| 3 | import logging |
| 4 | import re |
| 5 | import shutil |
| 6 | from datetime import datetime, timezone |
| 7 | from pathlib import Path |
| 8 | from typing import List, Optional, Tuple |
| 9 | |
| 10 | from video_processor.sources.base import BaseSource, SourceFile |
| 11 | |
| 12 | logger = logging.getLogger(__name__) |
| 13 | |
| 14 | |
def parse_page(path: Path) -> dict:
    """Parse a Logseq markdown page and extract structured content.

    Returns a dict with:
    - properties: dict of page-level properties (key:: value lines at top)
    - links: list of linked page names from [[wiki-links]]
    - tags: list of tags from #tag and #[[tag]] occurrences
    - block_refs: list of block reference IDs from ((block-id))
    - body: full text content
    """
    text = path.read_text(encoding="utf-8")
    lines = text.split("\n")

    # Page properties are the leading run of "key:: value" lines.
    prop_re = re.compile(r"^([A-Za-z][A-Za-z0-9_-]*)::\ ?(.*)")
    properties: dict = {}
    body_start = 0
    while body_start < len(lines):
        match = prop_re.match(lines[body_start])
        if match is None:
            break
        properties[match.group(1)] = match.group(2).strip()
        body_start += 1

    body = "\n".join(lines[body_start:])

    # Wiki-links come from the body plus any property values.
    wiki_link_re = re.compile(r"\[\[([^\]]+)\]\]")
    links = wiki_link_re.findall(body)
    for prop_value in properties.values():
        links += wiki_link_re.findall(str(prop_value))

    # Tags: collect #[[multi word tag]] first, strip those spans from the
    # text, then collect plain #tag so nothing is matched twice.
    bracketed_tag_re = re.compile(r"#\[\[([^\]]+)\]\]")
    tags = bracketed_tag_re.findall(text)
    remainder = bracketed_tag_re.sub("", text)
    tags += re.findall(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)", remainder)

    # Block references look like ((uuid-ish-id)).
    block_refs = re.findall(r"\(\(([a-f0-9-]+)\)\)", text)

    return {
        "properties": properties,
        "links": links,
        "tags": tags,
        "block_refs": block_refs,
        "body": body,
    }
| 71 | |
| 72 | |
def ingest_graph(graph_path: Path) -> dict:
    """Ingest an entire Logseq graph and return structured data.

    Walks the standard ``pages/`` and ``journals/`` sub-directories,
    parses every markdown file with parse_page(), and aggregates the
    results. Pages that fail to parse are skipped (logged, not raised).

    Args:
        graph_path: Root directory of the Logseq graph.

    Returns a dict with:
    - notes: list of dicts with name, tags, frontmatter (properties), text
    - links: list of (source, target) tuples from wiki-links
    """
    graph_path = Path(graph_path)
    notes: List[dict] = []
    links: List[Tuple[str, str]] = []

    md_files: List[Path] = []
    pages_dir = graph_path / "pages"
    journals_dir = graph_path / "journals"

    if pages_dir.is_dir():
        md_files.extend(sorted(pages_dir.rglob("*.md")))
    if journals_dir.is_dir():
        md_files.extend(sorted(journals_dir.rglob("*.md")))

    logger.info("Found %d markdown files in graph %s", len(md_files), graph_path)

    for md_file in md_files:
        # Logseq names pages after the file stem.
        page_name = md_file.stem
        try:
            parsed = parse_page(md_file)
        except Exception:
            # Keep the traceback in the log so failures are diagnosable
            # (previously the exception detail was silently discarded).
            logger.warning("Failed to parse page %s", md_file, exc_info=True)
            continue

        notes.append(
            {
                "name": page_name,
                "tags": parsed["tags"],
                "frontmatter": parsed["properties"],
                "text": parsed["body"],
            }
        )

        for linked_page in parsed["links"]:
            links.append((page_name, linked_page))

    logger.info(
        "Ingested %d notes with %d links from graph %s",
        len(notes),
        len(links),
        graph_path,
    )
    return {"notes": notes, "links": links}
| 122 | |
| 123 | |
class LogseqSource(BaseSource):
    """Source connector for Logseq graphs.

    Exposes a local Logseq graph directory (containing ``pages/`` and/or
    ``journals/`` sub-directories) through the BaseSource interface.
    SourceFile ids/paths are relative to the graph root.
    """

    def __init__(self, graph_path: str) -> None:
        # Graph root; download() re-joins relative SourceFile ids onto it.
        self.graph_path = Path(graph_path)

    def authenticate(self) -> bool:
        """Check that the graph path exists and has pages/ or journals/ dirs.

        Returns True only when graph_path is a directory containing at
        least one of the two standard Logseq sub-directories.
        """
        if not self.graph_path.is_dir():
            logger.error("Graph path does not exist: %s", self.graph_path)
            return False
        has_pages = (self.graph_path / "pages").is_dir()
        has_journals = (self.graph_path / "journals").is_dir()
        # Either directory alone is enough to treat this as a valid graph.
        if not has_pages and not has_journals:
            logger.error(
                "No pages/ or journals/ directory found in graph: %s",
                self.graph_path,
            )
            return False
        logger.info(
            "Logseq graph authenticated: %s (pages=%s, journals=%s)",
            self.graph_path,
            has_pages,
            has_journals,
        )
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List .md files in pages/ and journals/ as SourceFile objects.

        Args:
            folder_id: Unused for local graphs (kept for interface parity).
            folder_path: Optional sub-directory, relative to the graph
                root, to search instead of pages/ and journals/.
            patterns: Optional glob patterns (e.g. ["2024*.md"]); when
                given, only files matching at least one pattern are
                returned. Previously this parameter was silently ignored.
        """
        md_files: List[Path] = []

        pages_dir = self.graph_path / "pages"
        journals_dir = self.graph_path / "journals"

        if folder_path:
            search_root = self.graph_path / folder_path
            if search_root.is_dir():
                md_files.extend(sorted(search_root.rglob("*.md")))
        else:
            if pages_dir.is_dir():
                md_files.extend(sorted(pages_dir.rglob("*.md")))
            if journals_dir.is_dir():
                md_files.extend(sorted(journals_dir.rglob("*.md")))

        if patterns:
            # Apply glob filtering (Path.match matches from the right, so
            # bare patterns like "*.md" work against any depth).
            md_files = [
                f for f in md_files if any(f.match(p) for p in patterns)
            ]

        results: List[SourceFile] = []
        for md_file in md_files:
            # ids/paths are stored relative so download() can re-join them
            # with the graph root.
            relative = md_file.relative_to(self.graph_path)
            stat = md_file.stat()
            # mtimes are epoch seconds; expose them as UTC ISO-8601 strings.
            modified_dt = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc)

            results.append(
                SourceFile(
                    name=md_file.name,
                    id=str(relative),
                    size_bytes=stat.st_size,
                    mime_type="text/markdown",
                    modified_at=modified_dt.isoformat(),
                    path=str(relative),
                )
            )

        logger.info("Listed %d files from graph %s", len(results), self.graph_path)
        return results

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Copy a graph file to the destination path.

        The SourceFile id is interpreted as a path relative to the graph
        root; parent directories of destination are created as needed.
        """
        source = self.graph_path / file.id
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        # copy2 preserves file metadata (e.g. mtime) along with content.
        shutil.copy2(source, destination)
        logger.info("Copied %s -> %s", source, destination)
        return destination
| --- a/video_processor/sources/notion_source.py | ||
| +++ b/video_processor/sources/notion_source.py | ||
| @@ -0,0 +1,380 @@ | ||
| 1 | +"""Notion API source connector for fetching pages and databases.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import os | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import Dict, List, Optional | |
| 7 | + | |
| 8 | +import requests | |
| 9 | + | |
| 10 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 11 | + | |
| 12 | +logger = logging.getLogger(__name__) | |
| 13 | + | |
| 14 | +NOTION_VERSION = "2022-06-28" | |
| 15 | +NOTION_BASE_URL = "https://api.notion.com/v1" | |
| 16 | + | |
| 17 | + | |
class NotionSource(BaseSource):
    """
    Fetch pages and databases from Notion via the public API.

    Requires a Notion integration token (internal integration).
    Set NOTION_API_KEY env var or pass token directly.

    Requires: pip install requests
    """

    def __init__(
        self,
        token: Optional[str] = None,
        database_id: Optional[str] = None,
        page_ids: Optional[List[str]] = None,
    ):
        # Explicit token argument wins; otherwise fall back to the env var.
        self.token = token or os.environ.get("NOTION_API_KEY", "")
        self.database_id = database_id
        self.page_ids = page_ids or []

    def _headers(self) -> Dict[str, str]:
        """Headers required on every Notion API request."""
        return {
            "Authorization": f"Bearer {self.token}",
            "Notion-Version": NOTION_VERSION,
            "Content-Type": "application/json",
        }

    def authenticate(self) -> bool:
        """Check token is set and make a test call to the Notion API.

        Returns:
            True when /users/me succeeds with the configured token.
        """
        if not self.token:
            logger.error("Notion token not set. Provide token or set NOTION_API_KEY.")
            return False
        try:
            resp = requests.get(
                f"{NOTION_BASE_URL}/users/me",
                headers=self._headers(),
                timeout=15,
            )
            resp.raise_for_status()
            user = resp.json()
            logger.info("Authenticated with Notion as %s", user.get("name", "unknown"))
            return True
        except requests.RequestException as exc:
            logger.error("Notion authentication failed: %s", exc)
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Notion pages as SourceFiles.

        If database_id is set, query the database for pages.
        If page_ids is set, fetch each page individually.
        (folder_id/folder_path/patterns have no meaning for Notion and
        are accepted only for interface parity.)
        """
        files: List[SourceFile] = []

        if self.database_id:
            files.extend(self._list_from_database(self.database_id))

        if self.page_ids:
            files.extend(self._list_from_pages(self.page_ids))

        if not files:
            logger.warning("No pages found. Set database_id or page_ids.")

        return files

    def _list_from_database(self, database_id: str) -> List[SourceFile]:
        """Query a Notion database and return SourceFiles for each row."""
        files: List[SourceFile] = []
        has_more = True
        start_cursor: Optional[str] = None

        # The query endpoint is paginated; follow next_cursor until done.
        while has_more:
            body: Dict = {}
            if start_cursor:
                body["start_cursor"] = start_cursor

            resp = requests.post(
                f"{NOTION_BASE_URL}/databases/{database_id}/query",
                headers=self._headers(),
                json=body,
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()

            for page in data.get("results", []):
                title = _extract_page_title(page)
                files.append(
                    SourceFile(
                        name=title,
                        id=page["id"],
                        mime_type="text/markdown",
                        modified_at=page.get("last_edited_time"),
                    )
                )

            has_more = data.get("has_more", False)
            start_cursor = data.get("next_cursor")

        return files

    def _list_from_pages(self, page_ids: List[str]) -> List[SourceFile]:
        """Fetch individual pages by ID and return SourceFiles.

        Pages that fail to fetch are logged and skipped, so one bad id
        does not abort the whole listing.
        """
        files: List[SourceFile] = []
        for page_id in page_ids:
            try:
                resp = requests.get(
                    f"{NOTION_BASE_URL}/pages/{page_id}",
                    headers=self._headers(),
                    timeout=15,
                )
                resp.raise_for_status()
                page = resp.json()
                title = _extract_page_title(page)
                files.append(
                    SourceFile(
                        name=title,
                        id=page["id"],
                        mime_type="text/markdown",
                        modified_at=page.get("last_edited_time"),
                    )
                )
            except requests.RequestException as exc:
                logger.error("Failed to fetch page %s: %s", page_id, exc)
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download page blocks as markdown text and save to destination."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        blocks = self._fetch_all_blocks(file.id)
        text = self._blocks_to_text(blocks)

        # Prepend title
        content = f"# {file.name}\n\n{text}"
        destination.write_text(content, encoding="utf-8")
        logger.info("Saved Notion page to %s", destination)
        return destination

    def _fetch_all_blocks(self, page_id: str) -> list:
        """Fetch all child blocks for a page, handling pagination."""
        blocks: list = []
        has_more = True
        start_cursor: Optional[str] = None

        while has_more:
            url = f"{NOTION_BASE_URL}/blocks/{page_id}/children?page_size=100"
            if start_cursor:
                # NOTE(review): cursor is interpolated raw into the query
                # string — assumes Notion cursors are URL-safe; confirm.
                url += f"&start_cursor={start_cursor}"

            resp = requests.get(url, headers=self._headers(), timeout=30)
            resp.raise_for_status()
            data = resp.json()

            blocks.extend(data.get("results", []))
            has_more = data.get("has_more", False)
            start_cursor = data.get("next_cursor")

        return blocks

    def _blocks_to_text(self, blocks: list) -> str:
        """Convert Notion block objects to markdown text.

        numbered_index tracks consecutive numbered_list_item blocks so
        ordered lists restart at 1 after any other block type.
        """
        lines: List[str] = []
        numbered_index = 0

        for block in blocks:
            block_type = block.get("type", "")
            block_data = block.get(block_type, {})

            if block_type == "paragraph":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(text)
                numbered_index = 0

            elif block_type == "heading_1":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"# {text}")
                numbered_index = 0

            elif block_type == "heading_2":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"## {text}")
                numbered_index = 0

            elif block_type == "heading_3":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"### {text}")
                numbered_index = 0

            elif block_type == "bulleted_list_item":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"- {text}")
                numbered_index = 0

            elif block_type == "numbered_list_item":
                numbered_index += 1
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"{numbered_index}. {text}")

            elif block_type == "to_do":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                checked = block_data.get("checked", False)
                marker = "[x]" if checked else "[ ]"
                lines.append(f"- {marker} {text}")
                numbered_index = 0

            elif block_type == "code":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                language = block_data.get("language", "")
                lines.append(f"```{language}")
                lines.append(text)
                lines.append("```")
                numbered_index = 0

            elif block_type == "quote":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"> {text}")
                numbered_index = 0

            elif block_type == "callout":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                icon = block_data.get("icon", {})
                emoji = icon.get("emoji", "") if icon else ""
                prefix = f"{emoji} " if emoji else ""
                lines.append(f"> {prefix}{text}")
                numbered_index = 0

            elif block_type == "toggle":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"<details><summary>{text}</summary></details>")
                numbered_index = 0

            elif block_type == "divider":
                lines.append("---")
                numbered_index = 0

            else:
                # Unsupported block type — try to extract any rich_text
                text = _rich_text_to_str(block_data.get("rich_text", []))
                if text:
                    lines.append(text)
                numbered_index = 0

        return "\n\n".join(lines)

    def fetch_database_as_table(self, database_id: str) -> str:
        """Fetch a Notion database and return its rows as CSV text.

        Each row is a page in the database. Columns are derived from
        the database properties (sorted by name for a stable order).
        Values are quoted per RFC 4180, so property values containing
        commas, quotes, or newlines no longer corrupt the table (a bare
        ",".join previously did).
        """
        import csv
        import io

        # First, get database schema for column order
        resp = requests.get(
            f"{NOTION_BASE_URL}/databases/{database_id}",
            headers=self._headers(),
            timeout=15,
        )
        resp.raise_for_status()
        db_meta = resp.json()
        properties = db_meta.get("properties", {})
        columns = sorted(properties.keys())

        # Query all rows (the endpoint is paginated)
        rows: List[Dict] = []
        has_more = True
        start_cursor: Optional[str] = None

        while has_more:
            body: Dict = {}
            if start_cursor:
                body["start_cursor"] = start_cursor

            resp = requests.post(
                f"{NOTION_BASE_URL}/databases/{database_id}/query",
                headers=self._headers(),
                json=body,
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()
            rows.extend(data.get("results", []))
            has_more = data.get("has_more", False)
            start_cursor = data.get("next_cursor")

        # Build CSV output with proper quoting.
        buf = io.StringIO()
        writer = csv.writer(buf, lineterminator="\n")
        writer.writerow(columns)

        for row in rows:
            row_props = row.get("properties", {})
            writer.writerow(
                [_extract_property_value(row_props.get(col, {})) for col in columns]
            )

        # Match the previous contract of no trailing newline.
        return buf.getvalue().rstrip("\n")
| 321 | + | |
| 322 | + | |
| 323 | +def _rich_text_to_str(rich_text: list) -> str: | |
| 324 | + """Extract plain text from a Notion rich_text array.""" | |
| 325 | + return "".join(item.get("plain_text", "") for item in rich_text) | |
| 326 | + | |
| 327 | + | |
| 328 | +def _extract_page_title(page: dict) -> str: | |
| 329 | + """Extract the title from a Notion page object.""" | |
| 330 | + properties = page.get("properties", {}) | |
| 331 | + for prop in properties.values(): | |
| 332 | + if prop.get("type") == "title": | |
| 333 | + return _rich_text_to_str(prop.get("title", [])) | |
| 334 | + return "Untitled" | |
| 335 | + | |
| 336 | + | |
| 337 | +def _extract_property_value(prop: dict) -> str: | |
| 338 | + """Extract a display string from a Notion property value.""" | |
| 339 | + prop_type = prop.get("type", "") | |
| 340 | + | |
| 341 | + if prop_type == "title": | |
| 342 | + return _rich_text_to_str(prop.get("title", [])) | |
| 343 | + elif prop_type == "rich_text": | |
| 344 | + return _rich_text_to_str(prop.get("rich_text", [])) | |
| 345 | + elif prop_type == "number": | |
| 346 | + val = prop.get("number") | |
| 347 | + return str(val) if val is not None else "" | |
| 348 | + elif prop_type == "select": | |
| 349 | + sel = prop.get("select") | |
| 350 | + return sel.get("name", "") if sel else "" | |
| 351 | + elif prop_type == "multi_select": | |
| 352 | + return "; ".join(s.get("name", "") for s in prop.get("multi_select", [])) | |
| 353 | + elif prop_type == "date": | |
| 354 | + date = prop.get("date") | |
| 355 | + if date: | |
| 356 | + start = date.get("start", "") | |
| 357 | + end = date.get("end", "") | |
| 358 | + return f"{start} - {end}" if end else start | |
| 359 | + return "" | |
| 360 | + elif prop_type == "checkbox": | |
| 361 | + return str(prop.get("checkbox", False)) | |
| 362 | + elif prop_type == "url": | |
| 363 | + return prop.get("url", "") or "" | |
| 364 | + elif prop_type == "email": | |
| 365 | + return prop.get("email", "") or "" | |
| 366 | + elif prop_type == "phone_number": | |
| 367 | + return prop.get("phone_number", "") or "" | |
| 368 | + elif prop_type == "status": | |
| 369 | + status = prop.get("status") | |
| 370 | + return status.get("name", "") if status else "" | |
| 371 | + elif prop_type == "people": | |
| 372 | + return "; ".join(p.get("name", "") for p in prop.get("people", [])) | |
| 373 | + elif prop_type == "relation": | |
| 374 | + return "; ".join(r.get("id", "") for r in prop.get("relation", [])) | |
| 375 | + elif prop_type == "formula": | |
| 376 | + formula = prop.get("formula", {}) | |
| 377 | + f_type = formula.get("type", "") | |
| 378 | + return str(formula.get(f_type, "")) | |
| 379 | + else: | |
| 380 | + return "" |
| --- a/video_processor/sources/notion_source.py | |
| +++ b/video_processor/sources/notion_source.py | |
| @@ -0,0 +1,380 @@ | |
| --- a/video_processor/sources/notion_source.py | |
| +++ b/video_processor/sources/notion_source.py | |
| @@ -0,0 +1,380 @@ | |
| 1 | """Notion API source connector for fetching pages and databases.""" |
| 2 | |
| 3 | import logging |
| 4 | import os |
| 5 | from pathlib import Path |
| 6 | from typing import Dict, List, Optional |
| 7 | |
| 8 | import requests |
| 9 | |
| 10 | from video_processor.sources.base import BaseSource, SourceFile |
| 11 | |
| 12 | logger = logging.getLogger(__name__) |
| 13 | |
| 14 | NOTION_VERSION = "2022-06-28" |
| 15 | NOTION_BASE_URL = "https://api.notion.com/v1" |
| 16 | |
| 17 | |
| 18 | class NotionSource(BaseSource): |
| 19 | """ |
| 20 | Fetch pages and databases from Notion via the public API. |
| 21 | |
| 22 | Requires a Notion integration token (internal integration). |
| 23 | Set NOTION_API_KEY env var or pass token directly. |
| 24 | |
| 25 | Requires: pip install requests |
| 26 | """ |
| 27 | |
| 28 | def __init__( |
| 29 | self, |
| 30 | token: Optional[str] = None, |
| 31 | database_id: Optional[str] = None, |
| 32 | page_ids: Optional[List[str]] = None, |
| 33 | ): |
| 34 | self.token = token or os.environ.get("NOTION_API_KEY", "") |
| 35 | self.database_id = database_id |
| 36 | self.page_ids = page_ids or [] |
| 37 | |
| 38 | def _headers(self) -> Dict[str, str]: |
| 39 | return { |
| 40 | "Authorization": f"Bearer {self.token}", |
| 41 | "Notion-Version": NOTION_VERSION, |
| 42 | "Content-Type": "application/json", |
| 43 | } |
| 44 | |
| 45 | def authenticate(self) -> bool: |
| 46 | """Check token is set and make a test call to the Notion API.""" |
| 47 | if not self.token: |
| 48 | logger.error("Notion token not set. Provide token or set NOTION_API_KEY.") |
| 49 | return False |
| 50 | try: |
| 51 | resp = requests.get( |
| 52 | f"{NOTION_BASE_URL}/users/me", |
| 53 | headers=self._headers(), |
| 54 | timeout=15, |
| 55 | ) |
| 56 | resp.raise_for_status() |
| 57 | user = resp.json() |
| 58 | logger.info("Authenticated with Notion as %s", user.get("name", "unknown")) |
| 59 | return True |
| 60 | except requests.RequestException as exc: |
| 61 | logger.error("Notion authentication failed: %s", exc) |
| 62 | return False |
| 63 | |
| 64 | def list_videos( |
| 65 | self, |
| 66 | folder_id: Optional[str] = None, |
| 67 | folder_path: Optional[str] = None, |
| 68 | patterns: Optional[List[str]] = None, |
| 69 | ) -> List[SourceFile]: |
| 70 | """List Notion pages as SourceFiles. |
| 71 | |
| 72 | If database_id is set, query the database for pages. |
| 73 | If page_ids is set, fetch each page individually. |
| 74 | """ |
| 75 | files: List[SourceFile] = [] |
| 76 | |
| 77 | if self.database_id: |
| 78 | files.extend(self._list_from_database(self.database_id)) |
| 79 | |
| 80 | if self.page_ids: |
| 81 | files.extend(self._list_from_pages(self.page_ids)) |
| 82 | |
| 83 | if not files: |
| 84 | logger.warning("No pages found. Set database_id or page_ids.") |
| 85 | |
| 86 | return files |
| 87 | |
| 88 | def _list_from_database(self, database_id: str) -> List[SourceFile]: |
| 89 | """Query a Notion database and return SourceFiles for each row.""" |
| 90 | files: List[SourceFile] = [] |
| 91 | has_more = True |
| 92 | start_cursor: Optional[str] = None |
| 93 | |
| 94 | while has_more: |
| 95 | body: Dict = {} |
| 96 | if start_cursor: |
| 97 | body["start_cursor"] = start_cursor |
| 98 | |
| 99 | resp = requests.post( |
| 100 | f"{NOTION_BASE_URL}/databases/{database_id}/query", |
| 101 | headers=self._headers(), |
| 102 | json=body, |
| 103 | timeout=30, |
| 104 | ) |
| 105 | resp.raise_for_status() |
| 106 | data = resp.json() |
| 107 | |
| 108 | for page in data.get("results", []): |
| 109 | title = _extract_page_title(page) |
| 110 | files.append( |
| 111 | SourceFile( |
| 112 | name=title, |
| 113 | id=page["id"], |
| 114 | mime_type="text/markdown", |
| 115 | modified_at=page.get("last_edited_time"), |
| 116 | ) |
| 117 | ) |
| 118 | |
| 119 | has_more = data.get("has_more", False) |
| 120 | start_cursor = data.get("next_cursor") |
| 121 | |
| 122 | return files |
| 123 | |
| 124 | def _list_from_pages(self, page_ids: List[str]) -> List[SourceFile]: |
| 125 | """Fetch individual pages by ID and return SourceFiles.""" |
| 126 | files: List[SourceFile] = [] |
| 127 | for page_id in page_ids: |
| 128 | try: |
| 129 | resp = requests.get( |
| 130 | f"{NOTION_BASE_URL}/pages/{page_id}", |
| 131 | headers=self._headers(), |
| 132 | timeout=15, |
| 133 | ) |
| 134 | resp.raise_for_status() |
| 135 | page = resp.json() |
| 136 | title = _extract_page_title(page) |
| 137 | files.append( |
| 138 | SourceFile( |
| 139 | name=title, |
| 140 | id=page["id"], |
| 141 | mime_type="text/markdown", |
| 142 | modified_at=page.get("last_edited_time"), |
| 143 | ) |
| 144 | ) |
| 145 | except requests.RequestException as exc: |
| 146 | logger.error("Failed to fetch page %s: %s", page_id, exc) |
| 147 | return files |
| 148 | |
| 149 | def download(self, file: SourceFile, destination: Path) -> Path: |
| 150 | """Download page blocks as markdown text and save to destination.""" |
| 151 | destination = Path(destination) |
| 152 | destination.parent.mkdir(parents=True, exist_ok=True) |
| 153 | |
| 154 | blocks = self._fetch_all_blocks(file.id) |
| 155 | text = self._blocks_to_text(blocks) |
| 156 | |
| 157 | # Prepend title |
| 158 | content = f"# {file.name}\n\n{text}" |
| 159 | destination.write_text(content, encoding="utf-8") |
| 160 | logger.info("Saved Notion page to %s", destination) |
| 161 | return destination |
| 162 | |
| 163 | def _fetch_all_blocks(self, page_id: str) -> list: |
| 164 | """Fetch all child blocks for a page, handling pagination.""" |
| 165 | blocks: list = [] |
| 166 | has_more = True |
| 167 | start_cursor: Optional[str] = None |
| 168 | |
| 169 | while has_more: |
| 170 | url = f"{NOTION_BASE_URL}/blocks/{page_id}/children?page_size=100" |
| 171 | if start_cursor: |
| 172 | url += f"&start_cursor={start_cursor}" |
| 173 | |
| 174 | resp = requests.get(url, headers=self._headers(), timeout=30) |
| 175 | resp.raise_for_status() |
| 176 | data = resp.json() |
| 177 | |
| 178 | blocks.extend(data.get("results", [])) |
| 179 | has_more = data.get("has_more", False) |
| 180 | start_cursor = data.get("next_cursor") |
| 181 | |
| 182 | return blocks |
| 183 | |
| 184 | def _blocks_to_text(self, blocks: list) -> str: |
| 185 | """Convert Notion block objects to markdown text.""" |
| 186 | lines: List[str] = [] |
| 187 | numbered_index = 0 |
| 188 | |
| 189 | for block in blocks: |
| 190 | block_type = block.get("type", "") |
| 191 | block_data = block.get(block_type, {}) |
| 192 | |
| 193 | if block_type == "paragraph": |
| 194 | text = _rich_text_to_str(block_data.get("rich_text", [])) |
| 195 | lines.append(text) |
| 196 | numbered_index = 0 |
| 197 | |
| 198 | elif block_type == "heading_1": |
| 199 | text = _rich_text_to_str(block_data.get("rich_text", [])) |
| 200 | lines.append(f"# {text}") |
| 201 | numbered_index = 0 |
| 202 | |
| 203 | elif block_type == "heading_2": |
| 204 | text = _rich_text_to_str(block_data.get("rich_text", [])) |
| 205 | lines.append(f"## {text}") |
| 206 | numbered_index = 0 |
| 207 | |
| 208 | elif block_type == "heading_3": |
| 209 | text = _rich_text_to_str(block_data.get("rich_text", [])) |
| 210 | lines.append(f"### {text}") |
| 211 | numbered_index = 0 |
| 212 | |
| 213 | elif block_type == "bulleted_list_item": |
| 214 | text = _rich_text_to_str(block_data.get("rich_text", [])) |
| 215 | lines.append(f"- {text}") |
| 216 | numbered_index = 0 |
| 217 | |
| 218 | elif block_type == "numbered_list_item": |
| 219 | numbered_index += 1 |
| 220 | text = _rich_text_to_str(block_data.get("rich_text", [])) |
| 221 | lines.append(f"{numbered_index}. {text}") |
| 222 | |
| 223 | elif block_type == "to_do": |
| 224 | text = _rich_text_to_str(block_data.get("rich_text", [])) |
| 225 | checked = block_data.get("checked", False) |
| 226 | marker = "[x]" if checked else "[ ]" |
| 227 | lines.append(f"- {marker} {text}") |
| 228 | numbered_index = 0 |
| 229 | |
| 230 | elif block_type == "code": |
| 231 | text = _rich_text_to_str(block_data.get("rich_text", [])) |
| 232 | language = block_data.get("language", "") |
| 233 | lines.append(f"```{language}") |
| 234 | lines.append(text) |
| 235 | lines.append("```") |
| 236 | numbered_index = 0 |
| 237 | |
| 238 | elif block_type == "quote": |
| 239 | text = _rich_text_to_str(block_data.get("rich_text", [])) |
| 240 | lines.append(f"> {text}") |
| 241 | numbered_index = 0 |
| 242 | |
| 243 | elif block_type == "callout": |
| 244 | text = _rich_text_to_str(block_data.get("rich_text", [])) |
| 245 | icon = block_data.get("icon", {}) |
| 246 | emoji = icon.get("emoji", "") if icon else "" |
| 247 | prefix = f"{emoji} " if emoji else "" |
| 248 | lines.append(f"> {prefix}{text}") |
| 249 | numbered_index = 0 |
| 250 | |
| 251 | elif block_type == "toggle": |
| 252 | text = _rich_text_to_str(block_data.get("rich_text", [])) |
| 253 | lines.append(f"<details><summary>{text}</summary></details>") |
| 254 | numbered_index = 0 |
| 255 | |
| 256 | elif block_type == "divider": |
| 257 | lines.append("---") |
| 258 | numbered_index = 0 |
| 259 | |
| 260 | else: |
| 261 | # Unsupported block type — try to extract any rich_text |
| 262 | text = _rich_text_to_str(block_data.get("rich_text", [])) |
| 263 | if text: |
| 264 | lines.append(text) |
| 265 | numbered_index = 0 |
| 266 | |
| 267 | return "\n\n".join(lines) |
| 268 | |
| 269 | def fetch_database_as_table(self, database_id: str) -> str: |
| 270 | """Fetch a Notion database and return its rows as CSV-like text. |
| 271 | |
| 272 | Each row is a page in the database. Columns are derived from |
| 273 | the database properties. |
| 274 | """ |
| 275 | # First, get database schema for column order |
| 276 | resp = requests.get( |
| 277 | f"{NOTION_BASE_URL}/databases/{database_id}", |
| 278 | headers=self._headers(), |
| 279 | timeout=15, |
| 280 | ) |
| 281 | resp.raise_for_status() |
| 282 | db_meta = resp.json() |
| 283 | properties = db_meta.get("properties", {}) |
| 284 | columns = sorted(properties.keys()) |
| 285 | |
| 286 | # Query all rows |
| 287 | rows: List[Dict] = [] |
| 288 | has_more = True |
| 289 | start_cursor: Optional[str] = None |
| 290 | |
| 291 | while has_more: |
| 292 | body: Dict = {} |
| 293 | if start_cursor: |
| 294 | body["start_cursor"] = start_cursor |
| 295 | |
| 296 | resp = requests.post( |
| 297 | f"{NOTION_BASE_URL}/databases/{database_id}/query", |
| 298 | headers=self._headers(), |
| 299 | json=body, |
| 300 | timeout=30, |
| 301 | ) |
| 302 | resp.raise_for_status() |
| 303 | data = resp.json() |
| 304 | rows.extend(data.get("results", [])) |
| 305 | has_more = data.get("has_more", False) |
| 306 | start_cursor = data.get("next_cursor") |
| 307 | |
| 308 | # Build CSV-like output |
| 309 | lines: List[str] = [] |
| 310 | lines.append(",".join(columns)) |
| 311 | |
| 312 | for row in rows: |
| 313 | row_props = row.get("properties", {}) |
| 314 | values: List[str] = [] |
| 315 | for col in columns: |
| 316 | prop = row_props.get(col, {}) |
| 317 | values.append(_extract_property_value(prop)) |
| 318 | lines.append(",".join(values)) |
| 319 | |
| 320 | return "\n".join(lines) |
| 321 | |
| 322 | |
| 323 | def _rich_text_to_str(rich_text: list) -> str: |
| 324 | """Extract plain text from a Notion rich_text array.""" |
| 325 | return "".join(item.get("plain_text", "") for item in rich_text) |
| 326 | |
| 327 | |
def _extract_page_title(page: dict) -> str:
    """Return the text of the page's title property, or "Untitled" when the
    page has no title-typed property."""
    for prop in page.get("properties", {}).values():
        if prop.get("type") == "title":
            return _rich_text_to_str(prop.get("title", []))
    return "Untitled"
| 335 | |
| 336 | |
def _extract_property_value(prop: dict) -> str:
    """Render a Notion property value object as a display string.

    Multi-valued properties (multi_select, people, relation) are joined with
    "; ". Unknown property types render as the empty string.
    """
    kind = prop.get("type", "")

    if kind == "title":
        return _rich_text_to_str(prop.get("title", []))
    if kind == "rich_text":
        return _rich_text_to_str(prop.get("rich_text", []))
    if kind == "number":
        number = prop.get("number")
        return "" if number is None else str(number)
    if kind in ("select", "status"):
        chosen = prop.get(kind)
        return chosen.get("name", "") if chosen else ""
    if kind == "multi_select":
        return "; ".join(item.get("name", "") for item in prop.get("multi_select", []))
    if kind == "date":
        date = prop.get("date")
        if not date:
            return ""
        start = date.get("start", "")
        end = date.get("end", "")
        return f"{start} - {end}" if end else start
    if kind == "checkbox":
        return str(prop.get("checkbox", False))
    if kind in ("url", "email", "phone_number"):
        return prop.get(kind, "") or ""
    if kind == "people":
        return "; ".join(person.get("name", "") for person in prop.get("people", []))
    if kind == "relation":
        return "; ".join(rel.get("id", "") for rel in prop.get("relation", []))
    if kind == "formula":
        formula = prop.get("formula", {})
        return str(formula.get(formula.get("type", ""), ""))
    return ""
| --- a/video_processor/sources/obsidian_source.py | ||
| +++ b/video_processor/sources/obsidian_source.py | ||
| @@ -0,0 +1,178 @@ | ||
| 1 | +"""Obsidian vault source connector for ingesting markdown notes.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import re | |
| 5 | +import shutil | |
| 6 | +from datetime import datetime, timezone | |
| 7 | +from pathlib import Path | |
| 8 | +from typing import List, Optional, Tuple | |
| 9 | + | |
| 10 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 11 | + | |
| 12 | +logger = logging.getLogger(__name__) | |
| 13 | + | |
| 14 | + | |
def parse_note(path: Path) -> dict:
    """Parse an Obsidian markdown note into structured pieces.

    Returns a dict with:
    - frontmatter: dict from the YAML frontmatter block (naive ``key: value``
      parsing; single-line ``[a, b]`` lists supported)
    - links: targets of ``[[wiki-links]]`` (aliases stripped)
    - tags: ``#tag`` occurrences found in the body
    - headings: dicts with ``level`` and ``text`` per markdown heading
    - body: the markdown text with the frontmatter removed
    """
    raw = path.read_text(encoding="utf-8")

    frontmatter: dict = {}
    body = raw
    fm = re.match(r"\A---\n(.*?\n)---\n?(.*)", raw, re.DOTALL)
    if fm:
        body = fm.group(2)
        for fm_line in fm.group(1).strip().splitlines():
            pair = re.match(r"^([A-Za-z_][A-Za-z0-9_ -]*):\s*(.*)", fm_line)
            if not pair:
                continue
            key = pair.group(1).strip()
            value = pair.group(2).strip()
            # Drop matching surrounding quotes.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            # Single-line YAML list: [a, b, c]
            as_list = re.match(r"^\[(.+)\]$", value)
            if as_list:
                value = [part.strip().strip("\"'") for part in as_list.group(1).split(",")]
            frontmatter[key] = value

    # [[page]] and [[page|alias]] both resolve to "page".
    links = re.findall(r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]", body)

    # #tag occurrences; Logseq-style #[[tag]] and '#' preceded by a word
    # character do not match.
    tags = re.findall(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)", body)

    headings = [
        {"level": len(m.group(1)), "text": m.group(2).strip()}
        for m in re.finditer(r"^(#{1,6})\s+(.+)$", body, re.MULTILINE)
    ]

    return {
        "frontmatter": frontmatter,
        "links": links,
        "tags": tags,
        "headings": headings,
        "body": body,
    }
| 71 | + | |
| 72 | + | |
def ingest_vault(vault_path: Path) -> dict:
    """Ingest every markdown note in an Obsidian vault.

    Returns a dict with:
    - notes: one dict per parsed note (name, tags, frontmatter, text)
    - links: (source_note, target_page) tuples from wiki-links

    Notes that fail to parse are logged and skipped.
    """
    root = Path(vault_path)
    notes: List[dict] = []
    links: List[Tuple[str, str]] = []

    markdown_files = sorted(root.rglob("*.md"))
    logger.info("Found %d markdown files in vault %s", len(markdown_files), root)

    for note_path in markdown_files:
        name = note_path.stem
        try:
            parsed = parse_note(note_path)
        except Exception:
            logger.warning("Failed to parse note %s", note_path)
            continue

        notes.append(
            {
                "name": name,
                "tags": parsed["tags"],
                "frontmatter": parsed["frontmatter"],
                "text": parsed["body"],
            }
        )
        links.extend((name, target) for target in parsed["links"])

    logger.info(
        "Ingested %d notes with %d links from vault %s",
        len(notes),
        len(links),
        root,
    )
    return {"notes": notes, "links": links}
| 114 | + | |
| 115 | + | |
class ObsidianSource(BaseSource):
    """Source connector that reads markdown notes from an Obsidian vault."""

    def __init__(self, vault_path: str) -> None:
        # Root directory of the vault; file ids/paths are relative to it.
        self.vault_path = Path(vault_path)

    def authenticate(self) -> bool:
        """Check that the vault path exists and contains .md files."""
        if not self.vault_path.is_dir():
            logger.error("Vault path does not exist: %s", self.vault_path)
            return False
        markdown_paths = list(self.vault_path.rglob("*.md"))
        if not markdown_paths:
            logger.error("No markdown files found in vault: %s", self.vault_path)
            return False
        logger.info(
            "Obsidian vault authenticated: %s (%d .md files)",
            self.vault_path,
            len(markdown_paths),
        )
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List every .md file under the vault (or under folder_path) recursively."""
        root = self.vault_path / folder_path if folder_path else self.vault_path

        listed: List[SourceFile] = []
        for md_path in sorted(root.rglob("*.md")):
            rel = str(md_path.relative_to(self.vault_path))
            info = md_path.stat()
            mtime = datetime.fromtimestamp(info.st_mtime, tz=timezone.utc)
            listed.append(
                SourceFile(
                    name=md_path.name,
                    id=rel,
                    size_bytes=info.st_size,
                    mime_type="text/markdown",
                    modified_at=mtime.isoformat(),
                    path=rel,
                )
            )

        logger.info("Listed %d files from vault %s", len(listed), self.vault_path)
        return listed

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Copy the vault file identified by ``file.id`` to ``destination``."""
        src = self.vault_path / file.id
        target = Path(destination)
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src, target)
        logger.info("Copied %s -> %s", src, target)
        return target
| --- a/video_processor/sources/obsidian_source.py | |
| +++ b/video_processor/sources/obsidian_source.py | |
| @@ -0,0 +1,178 @@ | |
| --- a/video_processor/sources/obsidian_source.py | |
| +++ b/video_processor/sources/obsidian_source.py | |
| @@ -0,0 +1,178 @@ | |
| 1 | """Obsidian vault source connector for ingesting markdown notes.""" |
| 2 | |
| 3 | import logging |
| 4 | import re |
| 5 | import shutil |
| 6 | from datetime import datetime, timezone |
| 7 | from pathlib import Path |
| 8 | from typing import List, Optional, Tuple |
| 9 | |
| 10 | from video_processor.sources.base import BaseSource, SourceFile |
| 11 | |
| 12 | logger = logging.getLogger(__name__) |
| 13 | |
| 14 | |
def parse_note(path: Path) -> dict:
    """Parse an Obsidian markdown note into structured pieces.

    Returns a dict with:
    - frontmatter: dict from the YAML frontmatter block (naive ``key: value``
      parsing; single-line ``[a, b]`` lists supported)
    - links: targets of ``[[wiki-links]]`` (aliases stripped)
    - tags: ``#tag`` occurrences found in the body
    - headings: dicts with ``level`` and ``text`` per markdown heading
    - body: the markdown text with the frontmatter removed
    """
    raw = path.read_text(encoding="utf-8")

    frontmatter: dict = {}
    body = raw
    fm = re.match(r"\A---\n(.*?\n)---\n?(.*)", raw, re.DOTALL)
    if fm:
        body = fm.group(2)
        for fm_line in fm.group(1).strip().splitlines():
            pair = re.match(r"^([A-Za-z_][A-Za-z0-9_ -]*):\s*(.*)", fm_line)
            if not pair:
                continue
            key = pair.group(1).strip()
            value = pair.group(2).strip()
            # Drop matching surrounding quotes.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            # Single-line YAML list: [a, b, c]
            as_list = re.match(r"^\[(.+)\]$", value)
            if as_list:
                value = [part.strip().strip("\"'") for part in as_list.group(1).split(",")]
            frontmatter[key] = value

    # [[page]] and [[page|alias]] both resolve to "page".
    links = re.findall(r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]", body)

    # #tag occurrences; Logseq-style #[[tag]] and '#' preceded by a word
    # character do not match.
    tags = re.findall(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)", body)

    headings = [
        {"level": len(m.group(1)), "text": m.group(2).strip()}
        for m in re.finditer(r"^(#{1,6})\s+(.+)$", body, re.MULTILINE)
    ]

    return {
        "frontmatter": frontmatter,
        "links": links,
        "tags": tags,
        "headings": headings,
        "body": body,
    }
| 71 | |
| 72 | |
def ingest_vault(vault_path: Path) -> dict:
    """Ingest every markdown note in an Obsidian vault.

    Returns a dict with:
    - notes: one dict per parsed note (name, tags, frontmatter, text)
    - links: (source_note, target_page) tuples from wiki-links

    Notes that fail to parse are logged and skipped.
    """
    root = Path(vault_path)
    notes: List[dict] = []
    links: List[Tuple[str, str]] = []

    markdown_files = sorted(root.rglob("*.md"))
    logger.info("Found %d markdown files in vault %s", len(markdown_files), root)

    for note_path in markdown_files:
        name = note_path.stem
        try:
            parsed = parse_note(note_path)
        except Exception:
            logger.warning("Failed to parse note %s", note_path)
            continue

        notes.append(
            {
                "name": name,
                "tags": parsed["tags"],
                "frontmatter": parsed["frontmatter"],
                "text": parsed["body"],
            }
        )
        links.extend((name, target) for target in parsed["links"])

    logger.info(
        "Ingested %d notes with %d links from vault %s",
        len(notes),
        len(links),
        root,
    )
    return {"notes": notes, "links": links}
| 114 | |
| 115 | |
class ObsidianSource(BaseSource):
    """Source connector that reads markdown notes from an Obsidian vault."""

    def __init__(self, vault_path: str) -> None:
        # Root directory of the vault; file ids/paths are relative to it.
        self.vault_path = Path(vault_path)

    def authenticate(self) -> bool:
        """Check that the vault path exists and contains .md files."""
        if not self.vault_path.is_dir():
            logger.error("Vault path does not exist: %s", self.vault_path)
            return False
        markdown_paths = list(self.vault_path.rglob("*.md"))
        if not markdown_paths:
            logger.error("No markdown files found in vault: %s", self.vault_path)
            return False
        logger.info(
            "Obsidian vault authenticated: %s (%d .md files)",
            self.vault_path,
            len(markdown_paths),
        )
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List every .md file under the vault (or under folder_path) recursively."""
        root = self.vault_path / folder_path if folder_path else self.vault_path

        listed: List[SourceFile] = []
        for md_path in sorted(root.rglob("*.md")):
            rel = str(md_path.relative_to(self.vault_path))
            info = md_path.stat()
            mtime = datetime.fromtimestamp(info.st_mtime, tz=timezone.utc)
            listed.append(
                SourceFile(
                    name=md_path.name,
                    id=rel,
                    size_bytes=info.st_size,
                    mime_type="text/markdown",
                    modified_at=mtime.isoformat(),
                    path=rel,
                )
            )

        logger.info("Listed %d files from vault %s", len(listed), self.vault_path)
        return listed

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Copy the vault file identified by ``file.id`` to ``destination``."""
        src = self.vault_path / file.id
        target = Path(destination)
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src, target)
        logger.info("Copied %s -> %s", src, target)
        return target
| --- a/video_processor/sources/onenote_source.py | ||
| +++ b/video_processor/sources/onenote_source.py | ||
| @@ -0,0 +1,222 @@ | ||
| 1 | +"""Microsoft OneNote source connector using the m365 CLI (cli-microsoft365). | |
| 2 | + | |
| 3 | +Fetches pages from OneNote notebooks via the `m365` CLI tool. | |
| 4 | +Outputs plain text suitable for KG ingestion. | |
| 5 | + | |
| 6 | +Requires: npm install -g @pnp/cli-microsoft365 | |
| 7 | +Auth: m365 login (interactive) | |
| 8 | +Docs: https://pnp.github.io/cli-microsoft365/ | |
| 9 | +""" | |
| 10 | + | |
| 11 | +import json | |
| 12 | +import logging | |
| 13 | +import re | |
| 14 | +import shutil | |
| 15 | +import subprocess | |
| 16 | +from pathlib import Path | |
| 17 | +from typing import Any, List, Optional | |
| 18 | + | |
| 19 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 20 | + | |
| 21 | +logger = logging.getLogger(__name__) | |
| 22 | + | |
| 23 | + | |
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Run an m365 CLI command and return its JSON-decoded output.

    Falls back to the stripped raw stdout when the output is not valid JSON.
    Raises RuntimeError when the CLI exits non-zero.
    """
    completed = subprocess.run(
        ["m365", *args, "--output", "json"],
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if completed.returncode != 0:
        raise RuntimeError(f"m365 {' '.join(args)} failed: {completed.stderr.strip()}")
    try:
        return json.loads(completed.stdout)
    except json.JSONDecodeError:
        return completed.stdout.strip()
| 34 | + | |
| 35 | + | |
| 36 | +def _html_to_text(html: str) -> str: | |
| 37 | + """Strip HTML tags and decode entities to produce plain text. | |
| 38 | + | |
| 39 | + Uses only stdlib ``re`` — no external dependencies. | |
| 40 | + """ | |
| 41 | + # Remove script/style blocks entirely | |
| 42 | + text = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE) | |
| 43 | + # Replace <br>, <p>, <div>, <li>, <tr> with newlines for readability | |
| 44 | + text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE) | |
| 45 | + text = re.sub(r"</(p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE) | |
| 46 | + # Strip remaining tags | |
| 47 | + text = re.sub(r"<[^>]+>", "", text) | |
| 48 | + # Decode common HTML entities | |
| 49 | + entity_map = { | |
| 50 | + "&": "&", | |
| 51 | + "<": "<", | |
| 52 | + ">": ">", | |
| 53 | + """: '"', | |
| 54 | + "'": "'", | |
| 55 | + "'": "'", | |
| 56 | + " ": " ", | |
| 57 | + } | |
| 58 | + for entity, char in entity_map.items(): | |
| 59 | + text = text.replace(entity, char) | |
| 60 | + # Decode numeric entities ({ and ) | |
| 61 | + text = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), text) | |
| 62 | + text = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), text) | |
| 63 | + # Collapse excessive blank lines | |
| 64 | + text = re.sub(r"\n{3,}", "\n\n", text) | |
| 65 | + return text.strip() | |
| 66 | + | |
| 67 | + | |
| 68 | +class OneNoteSource(BaseSource): | |
| 69 | + """ | |
| 70 | + Fetch pages from OneNote notebooks via the m365 CLI. | |
| 71 | + | |
| 72 | + Usage: | |
| 73 | + source = OneNoteSource() # all notebooks | |
| 74 | + source = OneNoteSource(notebook_name="Work Notes") # specific notebook | |
| 75 | + source = OneNoteSource(notebook_name="Work", section_name="Meetings") | |
| 76 | + files = source.list_videos() | |
| 77 | + source.download_all(files, Path("./notes")) | |
| 78 | + """ | |
| 79 | + | |
| 80 | + def __init__( | |
| 81 | + self, | |
| 82 | + notebook_name: Optional[str] = None, | |
| 83 | + section_name: Optional[str] = None, | |
| 84 | + ): | |
| 85 | + self.notebook_name = notebook_name | |
| 86 | + self.section_name = section_name | |
| 87 | + | |
| 88 | + def authenticate(self) -> bool: | |
| 89 | + """Check if m365 CLI is installed and logged in.""" | |
| 90 | + if not shutil.which("m365"): | |
| 91 | + logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365") | |
| 92 | + return False | |
| 93 | + try: | |
| 94 | + result = _run_m365(["status"], timeout=10) | |
| 95 | + if isinstance(result, dict) and result.get("connectedAs"): | |
| 96 | + return True | |
| 97 | + if isinstance(result, str) and "Logged in" in result: | |
| 98 | + return True | |
| 99 | + logger.error("m365 not logged in. Run: m365 login") | |
| 100 | + return False | |
| 101 | + except (RuntimeError, subprocess.TimeoutExpired): | |
| 102 | + logger.error("m365 not logged in. Run: m365 login") | |
| 103 | + return False | |
| 104 | + | |
| 105 | + def list_videos( | |
| 106 | + self, | |
| 107 | + folder_id: Optional[str] = None, | |
| 108 | + folder_path: Optional[str] = None, | |
| 109 | + patterns: Optional[List[str]] = None, | |
| 110 | + ) -> List[SourceFile]: | |
| 111 | + """List OneNote pages across notebooks/sections. Returns SourceFile per page.""" | |
| 112 | + files: List[SourceFile] = [] | |
| 113 | + | |
| 114 | + # Step 1: List notebooks | |
| 115 | + try: | |
| 116 | + notebooks = _run_m365(["onenote", "notebook", "list"], timeout=60) | |
| 117 | + except RuntimeError as e: | |
| 118 | + logger.error(f"Failed to list OneNote notebooks: {e}") | |
| 119 | + return [] | |
| 120 | + | |
| 121 | + if not isinstance(notebooks, list): | |
| 122 | + notebooks = [] | |
| 123 | + | |
| 124 | + # Filter notebooks by name if specified | |
| 125 | + if self.notebook_name: | |
| 126 | + notebooks = [ | |
| 127 | + nb | |
| 128 | + for nb in notebooks | |
| 129 | + if self.notebook_name.lower() in nb.get("displayName", "").lower() | |
| 130 | + ] | |
| 131 | + | |
| 132 | + for notebook in notebooks: | |
| 133 | + notebook_id = notebook.get("id", "") | |
| 134 | + notebook_name = notebook.get("displayName", "Untitled Notebook") | |
| 135 | + | |
| 136 | + # Step 2: List sections in this notebook | |
| 137 | + try: | |
| 138 | + sections = _run_m365( | |
| 139 | + ["onenote", "section", "list", "--notebookId", notebook_id], | |
| 140 | + timeout=60, | |
| 141 | + ) | |
| 142 | + except RuntimeError as e: | |
| 143 | + logger.warning(f"Failed to list sections for notebook '{notebook_name}': {e}") | |
| 144 | + continue | |
| 145 | + | |
| 146 | + if not isinstance(sections, list): | |
| 147 | + sections = [] | |
| 148 | + | |
| 149 | + # Filter sections by name if specified | |
| 150 | + if self.section_name: | |
| 151 | + sections = [ | |
| 152 | + s | |
| 153 | + for s in sections | |
| 154 | + if self.section_name.lower() in s.get("displayName", "").lower() | |
| 155 | + ] | |
| 156 | + | |
| 157 | + for section in sections: | |
| 158 | + section_id = section.get("id", "") | |
| 159 | + section_name = section.get("displayName", "Untitled Section") | |
| 160 | + | |
| 161 | + # Step 3: List pages in this section | |
| 162 | + try: | |
| 163 | + pages = _run_m365( | |
| 164 | + ["onenote", "page", "list", "--sectionId", section_id], | |
| 165 | + timeout=60, | |
| 166 | + ) | |
| 167 | + except RuntimeError as e: | |
| 168 | + logger.warning(f"Failed to list pages in section '{section_name}': {e}") | |
| 169 | + continue | |
| 170 | + | |
| 171 | + if not isinstance(pages, list): | |
| 172 | + pages = [] | |
| 173 | + | |
| 174 | + for page in pages: | |
| 175 | + page_id = page.get("id", "") | |
| 176 | + title = page.get("title", "Untitled Page").strip() or "Untitled Page" | |
| 177 | + modified = page.get("lastModifiedDateTime") | |
| 178 | + # Build a path for organizational context | |
| 179 | + page_path = f"{notebook_name}/{section_name}/{title}" | |
| 180 | + | |
| 181 | + files.append( | |
| 182 | + SourceFile( | |
| 183 | + name=title, | |
| 184 | + id=str(page_id), | |
| 185 | + size_bytes=None, | |
| 186 | + mime_type="text/html", | |
| 187 | + modified_at=modified, | |
| 188 | + path=page_path, | |
| 189 | + ) | |
| 190 | + ) | |
| 191 | + | |
| 192 | + logger.info(f"Found {len(files)} page(s) in OneNote") | |
| 193 | + return files | |
| 194 | + | |
    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a OneNote page's content as a text file.

        Fetches the page HTML via the m365 CLI, converts it to plain text,
        and writes it to *destination* (parent directories are created).

        Args:
            file: The SourceFile describing the page; ``file.id`` is the
                OneNote page id passed to ``m365 onenote page get``.
            destination: Target path for the plain-text output.

        Returns:
            The destination path the text was written to.

        Raises:
            RuntimeError: If the m365 CLI call fails.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = _run_m365(
                ["onenote", "page", "get", "--id", file.id],
                timeout=60,
            )
        except RuntimeError as e:
            # Re-raise with the page id for easier debugging upstream.
            raise RuntimeError(f"Failed to fetch OneNote page {file.id}: {e}") from e

        # Extract HTML content from the result.  The CLI may return either a
        # top-level "content" field or a Graph-style body.content nesting —
        # NOTE(review): both shapes are handled defensively; confirm against
        # the installed cli-microsoft365 version.
        if isinstance(result, dict):
            html = result.get("content", result.get("body", {}).get("content", ""))
            if not html:
                # Fallback: serialize the whole response so the user still
                # gets *something* inspectable rather than an empty file.
                html = json.dumps(result, indent=2)
        elif isinstance(result, str):
            # _run_m365 returns raw stdout when it is not valid JSON.
            html = result
        else:
            html = str(result)

        text = _html_to_text(html)
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved page '{file.name}' to {destination}")
        return destination
| --- a/video_processor/sources/onenote_source.py | |
| +++ b/video_processor/sources/onenote_source.py | |
| @@ -0,0 +1,222 @@ | |
| --- a/video_processor/sources/onenote_source.py | |
| +++ b/video_processor/sources/onenote_source.py | |
| @@ -0,0 +1,222 @@ | |
| 1 | """Microsoft OneNote source connector using the m365 CLI (cli-microsoft365). |
| 2 | |
| 3 | Fetches pages from OneNote notebooks via the `m365` CLI tool. |
| 4 | Outputs plain text suitable for KG ingestion. |
| 5 | |
| 6 | Requires: npm install -g @pnp/cli-microsoft365 |
| 7 | Auth: m365 login (interactive) |
| 8 | Docs: https://pnp.github.io/cli-microsoft365/ |
| 9 | """ |
| 10 | |
| 11 | import json |
| 12 | import logging |
| 13 | import re |
| 14 | import shutil |
| 15 | import subprocess |
| 16 | from pathlib import Path |
| 17 | from typing import Any, List, Optional |
| 18 | |
| 19 | from video_processor.sources.base import BaseSource, SourceFile |
| 20 | |
| 21 | logger = logging.getLogger(__name__) |
| 22 | |
| 23 | |
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Run an m365 CLI command and return its parsed JSON output.

    Args:
        args: CLI arguments following the ``m365`` executable name.
        timeout: Seconds to wait for the subprocess before giving up.

    Returns:
        Parsed JSON (dict/list/scalar) when stdout is valid JSON,
        otherwise the raw stripped stdout string.

    Raises:
        RuntimeError: If the command exits non-zero, times out, or the
            ``m365`` executable is missing.  All failure modes are
            normalized to RuntimeError so callers can handle them
            uniformly (previously TimeoutExpired/FileNotFoundError
            escaped past callers that only catch RuntimeError).
    """
    # Force JSON output so results are machine-parseable.
    cmd = ["m365"] + args + ["--output", "json"]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired as e:
        raise RuntimeError(f"m365 {' '.join(args)} timed out after {timeout}s") from e
    except FileNotFoundError as e:
        raise RuntimeError(
            "m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365"
        ) from e
    if proc.returncode != 0:
        raise RuntimeError(f"m365 {' '.join(args)} failed: {proc.stderr.strip()}")
    try:
        return json.loads(proc.stdout)
    except json.JSONDecodeError:
        # Some commands emit plain text even with --output json.
        return proc.stdout.strip()
| 34 | |
| 35 | |
| 36 | def _html_to_text(html: str) -> str: |
| 37 | """Strip HTML tags and decode entities to produce plain text. |
| 38 | |
| 39 | Uses only stdlib ``re`` — no external dependencies. |
| 40 | """ |
| 41 | # Remove script/style blocks entirely |
| 42 | text = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE) |
| 43 | # Replace <br>, <p>, <div>, <li>, <tr> with newlines for readability |
| 44 | text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE) |
| 45 | text = re.sub(r"</(p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE) |
| 46 | # Strip remaining tags |
| 47 | text = re.sub(r"<[^>]+>", "", text) |
| 48 | # Decode common HTML entities |
| 49 | entity_map = { |
| 50 | "&": "&", |
| 51 | "<": "<", |
| 52 | ">": ">", |
| 53 | """: '"', |
| 54 | "'": "'", |
| 55 | "'": "'", |
| 56 | " ": " ", |
| 57 | } |
| 58 | for entity, char in entity_map.items(): |
| 59 | text = text.replace(entity, char) |
| 60 | # Decode numeric entities ({ and ) |
| 61 | text = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), text) |
| 62 | text = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), text) |
| 63 | # Collapse excessive blank lines |
| 64 | text = re.sub(r"\n{3,}", "\n\n", text) |
| 65 | return text.strip() |
| 66 | |
| 67 | |
class OneNoteSource(BaseSource):
    """
    Fetch pages from OneNote notebooks via the m365 CLI.

    Walks the notebook -> section -> page hierarchy exposed by the
    ``m365 onenote`` commands and presents each page as a SourceFile
    whose content can be downloaded as plain text.

    Usage:
        source = OneNoteSource()                             # all notebooks
        source = OneNoteSource(notebook_name="Work Notes")   # specific notebook
        source = OneNoteSource(notebook_name="Work", section_name="Meetings")
        files = source.list_videos()
        source.download_all(files, Path("./notes"))
    """

    def __init__(
        self,
        notebook_name: Optional[str] = None,
        section_name: Optional[str] = None,
    ):
        """
        Args:
            notebook_name: Case-insensitive substring filter on notebook
                display names; None means all notebooks.
            section_name: Case-insensitive substring filter on section
                display names; None means all sections.
        """
        self.notebook_name = notebook_name
        self.section_name = section_name

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in.

        Returns:
            True when the CLI is present and reports an active login,
            False otherwise (the reason is logged, never raised).
        """
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
            # JSON output carries "connectedAs"; some versions emit a
            # plain-text status string instead.
            if isinstance(result, dict) and result.get("connectedAs"):
                return True
            if isinstance(result, str) and "Logged in" in result:
                return True
            logger.error("m365 not logged in. Run: m365 login")
            return False
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List OneNote pages across notebooks/sections.

        Performs a three-level walk (notebooks -> sections -> pages),
        applying the optional name filters from ``__init__`` at each of
        the first two levels.  Failures inside one notebook/section are
        logged and skipped so a single bad container cannot abort the
        whole listing.

        Args:
            folder_id: Unused; present for BaseSource interface parity.
            folder_path: Unused; present for BaseSource interface parity.
            patterns: Unused; present for BaseSource interface parity.

        Returns:
            One SourceFile per page; ``path`` encodes
            "notebook/section/title" for organizational context.
        """
        files: List[SourceFile] = []

        # Step 1: List notebooks.  Catch TimeoutExpired too — a hung CLI
        # call previously escaped the RuntimeError-only handlers and
        # crashed the whole listing (authenticate already caught both).
        try:
            notebooks = _run_m365(["onenote", "notebook", "list"], timeout=60)
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            logger.error(f"Failed to list OneNote notebooks: {e}")
            return []

        if not isinstance(notebooks, list):
            notebooks = []

        # Filter notebooks by name if specified
        if self.notebook_name:
            notebooks = [
                nb
                for nb in notebooks
                if self.notebook_name.lower() in nb.get("displayName", "").lower()
            ]

        for notebook in notebooks:
            notebook_id = notebook.get("id", "")
            notebook_name = notebook.get("displayName", "Untitled Notebook")

            # Step 2: List sections in this notebook
            try:
                sections = _run_m365(
                    ["onenote", "section", "list", "--notebookId", notebook_id],
                    timeout=60,
                )
            except (RuntimeError, subprocess.TimeoutExpired) as e:
                logger.warning(f"Failed to list sections for notebook '{notebook_name}': {e}")
                continue

            if not isinstance(sections, list):
                sections = []

            # Filter sections by name if specified
            if self.section_name:
                sections = [
                    s
                    for s in sections
                    if self.section_name.lower() in s.get("displayName", "").lower()
                ]

            for section in sections:
                section_id = section.get("id", "")
                section_name = section.get("displayName", "Untitled Section")

                # Step 3: List pages in this section
                try:
                    pages = _run_m365(
                        ["onenote", "page", "list", "--sectionId", section_id],
                        timeout=60,
                    )
                except (RuntimeError, subprocess.TimeoutExpired) as e:
                    logger.warning(f"Failed to list pages in section '{section_name}': {e}")
                    continue

                if not isinstance(pages, list):
                    pages = []

                for page in pages:
                    page_id = page.get("id", "")
                    # `or` guards an explicit null title from the API, which
                    # would otherwise crash on .strip().
                    title = (page.get("title") or "Untitled Page").strip() or "Untitled Page"
                    modified = page.get("lastModifiedDateTime")
                    # Build a path for organizational context
                    page_path = f"{notebook_name}/{section_name}/{title}"

                    files.append(
                        SourceFile(
                            name=title,
                            id=str(page_id),
                            size_bytes=None,
                            mime_type="text/html",
                            modified_at=modified,
                            path=page_path,
                        )
                    )

        logger.info(f"Found {len(files)} page(s) in OneNote")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a OneNote page's content as a text file.

        Args:
            file: SourceFile whose ``id`` is the OneNote page id.
            destination: Target path for the plain-text output; parent
                directories are created as needed.

        Returns:
            The destination path the text was written to.

        Raises:
            RuntimeError: If the m365 CLI call fails or times out.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = _run_m365(
                ["onenote", "page", "get", "--id", file.id],
                timeout=60,
            )
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            raise RuntimeError(f"Failed to fetch OneNote page {file.id}: {e}") from e

        # Extract HTML content from the result.  The CLI may return either a
        # top-level "content" field or a Graph-style body.content nesting;
        # `or {}` guards an explicit null "body" value.
        if isinstance(result, dict):
            html = result.get("content", (result.get("body") or {}).get("content", ""))
            if not html:
                # Fallback: serialize the whole response so the user still
                # gets something inspectable rather than an empty file.
                html = json.dumps(result, indent=2)
        elif isinstance(result, str):
            # _run_m365 returns raw stdout when it is not valid JSON.
            html = result
        else:
            html = str(result)

        text = _html_to_text(html)
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved page '{file.name}' to {destination}")
        return destination