PlanOpticon

feat: add notes connectors (in/out), wiki generator, and meeting sources Input connectors: - ObsidianSource: vault ingester with frontmatter, wiki-links, tags parsing - LogseqSource: graph ingester with page properties, block refs - NotionSource: API connector for pages and databases - AppleNotesSource: macOS osascript connector - GoogleKeepSource: via gws CLI with checklist support - OneNoteSource: via m365 CLI with notebook/section/page hierarchy Output skills: - WikiGeneratorSkill: generates GitHub wiki from KG (Home, Sidebar, entity pages) - NotesExportSkill: exports KG as Obsidian vault or Notion-compatible markdown CLI commands: - planopticon wiki {generate,push} 52 new tests (728 total)

lmata 2026-03-07 23:12 trunk
Commit 855b7fbcf90ec4416cfdeeb72445dd82e03b6c6af67baf07a5d658ae5fd2658d
--- tests/test_agent_skills.py
+++ tests/test_agent_skills.py
@@ -401,5 +401,227 @@
401401
402402
skill = ProjectPlanSkill()
403403
ctx = _make_context()
404404
ctx.provider_manager = None
405405
assert skill.can_execute(ctx) is False
406
+
407
+
408
+# ---------------------------------------------------------------------------
409
+# WikiGeneratorSkill
410
+# ---------------------------------------------------------------------------
411
+
412
+
413
+class TestWikiGeneratorSkill:
414
+ def _sample_kg_data(self):
415
+ return {
416
+ "nodes": [
417
+ {
418
+ "name": "Python",
419
+ "type": "technology",
420
+ "descriptions": ["A programming language"],
421
+ },
422
+ {
423
+ "name": "Alice",
424
+ "type": "person",
425
+ "descriptions": ["Lead developer"],
426
+ },
427
+ {
428
+ "name": "FastAPI",
429
+ "type": "technology",
430
+ "descriptions": ["Web framework"],
431
+ },
432
+ ],
433
+ "relationships": [
434
+ {"source": "Alice", "target": "Python", "type": "uses"},
435
+ {"source": "FastAPI", "target": "Python", "type": "built_with"},
436
+ ],
437
+ }
438
+
439
+ def test_generate_wiki(self):
440
+ from video_processor.agent.skills.wiki_generator import generate_wiki
441
+
442
+ pages = generate_wiki(self._sample_kg_data(), title="Test Wiki")
443
+
444
+ assert "Home" in pages
445
+ assert "_Sidebar" in pages
446
+ assert "Test Wiki" in pages["Home"]
447
+ assert "3" in pages["Home"] # 3 entities
448
+ assert "2" in pages["Home"] # 2 relationships
449
+
450
+ # Entity pages should exist
451
+ assert "Python" in pages
452
+ assert "Alice" in pages
453
+ assert "FastAPI" in pages
454
+
455
+ # Type index pages should exist
456
+ assert "Technology" in pages
457
+ assert "Person" in pages
458
+
459
+ # Alice's page should reference Python
460
+ assert "Python" in pages["Alice"]
461
+ assert "uses" in pages["Alice"]
462
+
463
+ def test_generate_wiki_with_artifacts(self):
464
+ from video_processor.agent.skills.wiki_generator import generate_wiki
465
+
466
+ art = Artifact(
467
+ name="Project Plan",
468
+ content="# Plan\n\nDo the thing.",
469
+ artifact_type="project_plan",
470
+ format="markdown",
471
+ )
472
+ pages = generate_wiki(self._sample_kg_data(), artifacts=[art])
473
+
474
+ assert "Project-Plan" in pages
475
+ assert "Do the thing." in pages["Project-Plan"]
476
+ assert "Planning Artifacts" in pages["Home"]
477
+
478
+ def test_write_wiki(self, tmp_path):
479
+ from video_processor.agent.skills.wiki_generator import write_wiki
480
+
481
+ pages = {
482
+ "Home": "# Home\n\nWelcome.",
483
+ "Page-One": "# Page One\n\nContent.",
484
+ }
485
+ paths = write_wiki(pages, tmp_path / "wiki")
486
+
487
+ assert len(paths) == 2
488
+ assert (tmp_path / "wiki" / "Home.md").exists()
489
+ assert (tmp_path / "wiki" / "Page-One.md").exists()
490
+ assert "Welcome." in (tmp_path / "wiki" / "Home.md").read_text()
491
+
492
+ def test_sanitize_filename(self):
493
+ from video_processor.agent.skills.wiki_generator import _sanitize_filename
494
+
495
+ assert _sanitize_filename("Hello World") == "Hello-World"
496
+ assert _sanitize_filename("path/to\\file") == "path-to-file"
497
+ assert _sanitize_filename("version.2") == "version-2"
498
+
499
+ def test_wiki_link(self):
500
+ from video_processor.agent.skills.wiki_generator import _wiki_link
501
+
502
+ result = _wiki_link("My Page")
503
+ assert result == "[My Page](My-Page)"
504
+
505
+ result = _wiki_link("Simple")
506
+ assert result == "[Simple](Simple)"
507
+
508
+
509
+# ---------------------------------------------------------------------------
510
+# NotesExportSkill
511
+# ---------------------------------------------------------------------------
512
+
513
+
514
+class TestNotesExportSkill:
515
+ def _sample_kg_data(self):
516
+ return {
517
+ "nodes": [
518
+ {
519
+ "name": "Python",
520
+ "type": "technology",
521
+ "descriptions": ["A programming language"],
522
+ },
523
+ {
524
+ "name": "Alice",
525
+ "type": "person",
526
+ "descriptions": ["Lead developer"],
527
+ },
528
+ ],
529
+ "relationships": [
530
+ {"source": "Alice", "target": "Python", "type": "uses"},
531
+ ],
532
+ }
533
+
534
+ def test_export_to_obsidian(self, tmp_path):
535
+ from video_processor.agent.skills.notes_export import export_to_obsidian
536
+
537
+ output_dir = tmp_path / "obsidian_vault"
538
+ export_to_obsidian(self._sample_kg_data(), output_dir)
539
+
540
+ assert output_dir.is_dir()
541
+
542
+ # Check entity files exist
543
+ python_file = output_dir / "Python.md"
544
+ alice_file = output_dir / "Alice.md"
545
+ assert python_file.exists()
546
+ assert alice_file.exists()
547
+
548
+ # Check frontmatter in entity file
549
+ python_content = python_file.read_text()
550
+ assert "---" in python_content
551
+ assert "type: technology" in python_content
552
+ assert "# Python" in python_content
553
+
554
+ # Check wiki-links in Alice file
555
+ alice_content = alice_file.read_text()
556
+ assert "[[Python]]" in alice_content
557
+ assert "uses" in alice_content
558
+
559
+ # Check index file
560
+ index_file = output_dir / "_Index.md"
561
+ assert index_file.exists()
562
+ index_content = index_file.read_text()
563
+ assert "[[Python]]" in index_content
564
+ assert "[[Alice]]" in index_content
565
+
566
+ def test_export_to_obsidian_with_artifacts(self, tmp_path):
567
+ from video_processor.agent.skills.notes_export import export_to_obsidian
568
+
569
+ art = Artifact(
570
+ name="Test Plan",
571
+ content="# Plan\n\nSteps here.",
572
+ artifact_type="project_plan",
573
+ format="markdown",
574
+ )
575
+ output_dir = tmp_path / "obsidian_arts"
576
+ export_to_obsidian(self._sample_kg_data(), output_dir, artifacts=[art])
577
+
578
+ art_file = output_dir / "Test Plan.md"
579
+ assert art_file.exists()
580
+ art_content = art_file.read_text()
581
+ assert "artifact" in art_content
582
+ assert "Steps here." in art_content
583
+
584
+ def test_export_to_notion_md(self, tmp_path):
585
+ from video_processor.agent.skills.notes_export import export_to_notion_md
586
+
587
+ output_dir = tmp_path / "notion_export"
588
+ export_to_notion_md(self._sample_kg_data(), output_dir)
589
+
590
+ assert output_dir.is_dir()
591
+
592
+ # Check CSV database file
593
+ csv_file = output_dir / "entities_database.csv"
594
+ assert csv_file.exists()
595
+ csv_content = csv_file.read_text()
596
+ assert "Name" in csv_content
597
+ assert "Type" in csv_content
598
+ assert "Python" in csv_content
599
+ assert "Alice" in csv_content
600
+
601
+ # Check entity markdown files
602
+ python_file = output_dir / "Python.md"
603
+ assert python_file.exists()
604
+ python_content = python_file.read_text()
605
+ assert "# Python" in python_content
606
+ assert "technology" in python_content
607
+
608
+ # Check overview file
609
+ overview_file = output_dir / "Overview.md"
610
+ assert overview_file.exists()
611
+
612
+ def test_export_to_notion_md_with_artifacts(self, tmp_path):
613
+ from video_processor.agent.skills.notes_export import export_to_notion_md
614
+
615
+ art = Artifact(
616
+ name="Roadmap",
617
+ content="# Roadmap\n\nQ1 goals.",
618
+ artifact_type="roadmap",
619
+ format="markdown",
620
+ )
621
+ output_dir = tmp_path / "notion_arts"
622
+ export_to_notion_md(self._sample_kg_data(), output_dir, artifacts=[art])
623
+
624
+ art_file = output_dir / "Roadmap.md"
625
+ assert art_file.exists()
626
+ art_content = art_file.read_text()
627
+ assert "Q1 goals." in art_content
406628
--- tests/test_agent_skills.py
+++ tests/test_agent_skills.py
@@ -401,5 +401,227 @@
401
402 skill = ProjectPlanSkill()
403 ctx = _make_context()
404 ctx.provider_manager = None
405 assert skill.can_execute(ctx) is False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
--- tests/test_agent_skills.py
+++ tests/test_agent_skills.py
@@ -401,5 +401,227 @@
401
402 skill = ProjectPlanSkill()
403 ctx = _make_context()
404 ctx.provider_manager = None
405 assert skill.can_execute(ctx) is False
406
407
408 # ---------------------------------------------------------------------------
409 # WikiGeneratorSkill
410 # ---------------------------------------------------------------------------
411
412
413 class TestWikiGeneratorSkill:
414 def _sample_kg_data(self):
415 return {
416 "nodes": [
417 {
418 "name": "Python",
419 "type": "technology",
420 "descriptions": ["A programming language"],
421 },
422 {
423 "name": "Alice",
424 "type": "person",
425 "descriptions": ["Lead developer"],
426 },
427 {
428 "name": "FastAPI",
429 "type": "technology",
430 "descriptions": ["Web framework"],
431 },
432 ],
433 "relationships": [
434 {"source": "Alice", "target": "Python", "type": "uses"},
435 {"source": "FastAPI", "target": "Python", "type": "built_with"},
436 ],
437 }
438
439 def test_generate_wiki(self):
440 from video_processor.agent.skills.wiki_generator import generate_wiki
441
442 pages = generate_wiki(self._sample_kg_data(), title="Test Wiki")
443
444 assert "Home" in pages
445 assert "_Sidebar" in pages
446 assert "Test Wiki" in pages["Home"]
447 assert "3" in pages["Home"] # 3 entities
448 assert "2" in pages["Home"] # 2 relationships
449
450 # Entity pages should exist
451 assert "Python" in pages
452 assert "Alice" in pages
453 assert "FastAPI" in pages
454
455 # Type index pages should exist
456 assert "Technology" in pages
457 assert "Person" in pages
458
459 # Alice's page should reference Python
460 assert "Python" in pages["Alice"]
461 assert "uses" in pages["Alice"]
462
463 def test_generate_wiki_with_artifacts(self):
464 from video_processor.agent.skills.wiki_generator import generate_wiki
465
466 art = Artifact(
467 name="Project Plan",
468 content="# Plan\n\nDo the thing.",
469 artifact_type="project_plan",
470 format="markdown",
471 )
472 pages = generate_wiki(self._sample_kg_data(), artifacts=[art])
473
474 assert "Project-Plan" in pages
475 assert "Do the thing." in pages["Project-Plan"]
476 assert "Planning Artifacts" in pages["Home"]
477
478 def test_write_wiki(self, tmp_path):
479 from video_processor.agent.skills.wiki_generator import write_wiki
480
481 pages = {
482 "Home": "# Home\n\nWelcome.",
483 "Page-One": "# Page One\n\nContent.",
484 }
485 paths = write_wiki(pages, tmp_path / "wiki")
486
487 assert len(paths) == 2
488 assert (tmp_path / "wiki" / "Home.md").exists()
489 assert (tmp_path / "wiki" / "Page-One.md").exists()
490 assert "Welcome." in (tmp_path / "wiki" / "Home.md").read_text()
491
492 def test_sanitize_filename(self):
493 from video_processor.agent.skills.wiki_generator import _sanitize_filename
494
495 assert _sanitize_filename("Hello World") == "Hello-World"
496 assert _sanitize_filename("path/to\\file") == "path-to-file"
497 assert _sanitize_filename("version.2") == "version-2"
498
499 def test_wiki_link(self):
500 from video_processor.agent.skills.wiki_generator import _wiki_link
501
502 result = _wiki_link("My Page")
503 assert result == "[My Page](My-Page)"
504
505 result = _wiki_link("Simple")
506 assert result == "[Simple](Simple)"
507
508
509 # ---------------------------------------------------------------------------
510 # NotesExportSkill
511 # ---------------------------------------------------------------------------
512
513
514 class TestNotesExportSkill:
515 def _sample_kg_data(self):
516 return {
517 "nodes": [
518 {
519 "name": "Python",
520 "type": "technology",
521 "descriptions": ["A programming language"],
522 },
523 {
524 "name": "Alice",
525 "type": "person",
526 "descriptions": ["Lead developer"],
527 },
528 ],
529 "relationships": [
530 {"source": "Alice", "target": "Python", "type": "uses"},
531 ],
532 }
533
534 def test_export_to_obsidian(self, tmp_path):
535 from video_processor.agent.skills.notes_export import export_to_obsidian
536
537 output_dir = tmp_path / "obsidian_vault"
538 export_to_obsidian(self._sample_kg_data(), output_dir)
539
540 assert output_dir.is_dir()
541
542 # Check entity files exist
543 python_file = output_dir / "Python.md"
544 alice_file = output_dir / "Alice.md"
545 assert python_file.exists()
546 assert alice_file.exists()
547
548 # Check frontmatter in entity file
549 python_content = python_file.read_text()
550 assert "---" in python_content
551 assert "type: technology" in python_content
552 assert "# Python" in python_content
553
554 # Check wiki-links in Alice file
555 alice_content = alice_file.read_text()
556 assert "[[Python]]" in alice_content
557 assert "uses" in alice_content
558
559 # Check index file
560 index_file = output_dir / "_Index.md"
561 assert index_file.exists()
562 index_content = index_file.read_text()
563 assert "[[Python]]" in index_content
564 assert "[[Alice]]" in index_content
565
566 def test_export_to_obsidian_with_artifacts(self, tmp_path):
567 from video_processor.agent.skills.notes_export import export_to_obsidian
568
569 art = Artifact(
570 name="Test Plan",
571 content="# Plan\n\nSteps here.",
572 artifact_type="project_plan",
573 format="markdown",
574 )
575 output_dir = tmp_path / "obsidian_arts"
576 export_to_obsidian(self._sample_kg_data(), output_dir, artifacts=[art])
577
578 art_file = output_dir / "Test Plan.md"
579 assert art_file.exists()
580 art_content = art_file.read_text()
581 assert "artifact" in art_content
582 assert "Steps here." in art_content
583
584 def test_export_to_notion_md(self, tmp_path):
585 from video_processor.agent.skills.notes_export import export_to_notion_md
586
587 output_dir = tmp_path / "notion_export"
588 export_to_notion_md(self._sample_kg_data(), output_dir)
589
590 assert output_dir.is_dir()
591
592 # Check CSV database file
593 csv_file = output_dir / "entities_database.csv"
594 assert csv_file.exists()
595 csv_content = csv_file.read_text()
596 assert "Name" in csv_content
597 assert "Type" in csv_content
598 assert "Python" in csv_content
599 assert "Alice" in csv_content
600
601 # Check entity markdown files
602 python_file = output_dir / "Python.md"
603 assert python_file.exists()
604 python_content = python_file.read_text()
605 assert "# Python" in python_content
606 assert "technology" in python_content
607
608 # Check overview file
609 overview_file = output_dir / "Overview.md"
610 assert overview_file.exists()
611
612 def test_export_to_notion_md_with_artifacts(self, tmp_path):
613 from video_processor.agent.skills.notes_export import export_to_notion_md
614
615 art = Artifact(
616 name="Roadmap",
617 content="# Roadmap\n\nQ1 goals.",
618 artifact_type="roadmap",
619 format="markdown",
620 )
621 output_dir = tmp_path / "notion_arts"
622 export_to_notion_md(self._sample_kg_data(), output_dir, artifacts=[art])
623
624 art_file = output_dir / "Roadmap.md"
625 assert art_file.exists()
626 art_content = art_file.read_text()
627 assert "Q1 goals." in art_content
628
--- tests/test_sources.py
+++ tests/test_sources.py
@@ -858,5 +858,487 @@
858858
from video_processor.sources.m365_source import M365Source
859859
860860
src = M365Source(web_url="https://contoso.sharepoint.com")
861861
files = src.list_videos()
862862
assert files == []
863
+
864
+
865
+# ---------------------------------------------------------------------------
866
+# ObsidianSource
867
+# ---------------------------------------------------------------------------
868
+
869
+
870
+class TestObsidianSource:
871
+ def test_import(self):
872
+ from video_processor.sources.obsidian_source import ObsidianSource
873
+
874
+ assert ObsidianSource is not None
875
+
876
+ def test_constructor(self, tmp_path):
877
+ from video_processor.sources.obsidian_source import ObsidianSource
878
+
879
+ src = ObsidianSource(vault_path=str(tmp_path))
880
+ assert src.vault_path == tmp_path
881
+
882
+ def test_authenticate_with_vault(self, tmp_path):
883
+ from video_processor.sources.obsidian_source import ObsidianSource
884
+
885
+ (tmp_path / "note.md").write_text("# Hello")
886
+ src = ObsidianSource(vault_path=str(tmp_path))
887
+ assert src.authenticate() is True
888
+
889
+ def test_authenticate_empty_dir(self, tmp_path):
890
+ from video_processor.sources.obsidian_source import ObsidianSource
891
+
892
+ src = ObsidianSource(vault_path=str(tmp_path))
893
+ assert src.authenticate() is False
894
+
895
+ def test_authenticate_nonexistent(self, tmp_path):
896
+ from video_processor.sources.obsidian_source import ObsidianSource
897
+
898
+ src = ObsidianSource(vault_path=str(tmp_path / "nonexistent"))
899
+ assert src.authenticate() is False
900
+
901
+ def test_parse_note(self, tmp_path):
902
+ from video_processor.sources.obsidian_source import parse_note
903
+
904
+ note_content = (
905
+ "---\n"
906
+ "title: Test Note\n"
907
+ "tags: [python, testing]\n"
908
+ "---\n"
909
+ "# Heading One\n\n"
910
+ "Some text with a [[Wiki Link]] and [[Another Page|alias]].\n\n"
911
+ "Also has #tag1 and #tag2 inline tags.\n\n"
912
+ "## Sub Heading\n\n"
913
+ "More content here.\n"
914
+ )
915
+ note_file = tmp_path / "test_note.md"
916
+ note_file.write_text(note_content)
917
+
918
+ result = parse_note(note_file)
919
+
920
+ assert result["frontmatter"]["title"] == "Test Note"
921
+ assert isinstance(result["frontmatter"]["tags"], list)
922
+ assert "python" in result["frontmatter"]["tags"]
923
+ assert "Wiki Link" in result["links"]
924
+ assert "Another Page" in result["links"]
925
+ assert "tag1" in result["tags"]
926
+ assert "tag2" in result["tags"]
927
+ assert len(result["headings"]) == 2
928
+ assert result["headings"][0]["level"] == 1
929
+ assert result["headings"][0]["text"] == "Heading One"
930
+ assert "Some text" in result["body"]
931
+
932
+ def test_ingest_vault(self, tmp_path):
933
+ from video_processor.sources.obsidian_source import ingest_vault
934
+
935
+ (tmp_path / "note_a.md").write_text("# A\n\nLinks to [[B]].\n")
936
+ (tmp_path / "note_b.md").write_text("# B\n\nLinks to [[A]] and [[C]].\n")
937
+
938
+ result = ingest_vault(tmp_path)
939
+
940
+ assert len(result["notes"]) == 2
941
+ names = [n["name"] for n in result["notes"]]
942
+ assert "note_a" in names
943
+ assert "note_b" in names
944
+ # note_a links to B, note_b links to A and C => 3 links
945
+ assert len(result["links"]) == 3
946
+
947
+ def test_list_videos(self, tmp_path):
948
+ from video_processor.sources.obsidian_source import ObsidianSource
949
+
950
+ (tmp_path / "note1.md").write_text("# Note 1")
951
+ sub = tmp_path / "subdir"
952
+ sub.mkdir()
953
+ (sub / "note2.md").write_text("# Note 2")
954
+
955
+ src = ObsidianSource(vault_path=str(tmp_path))
956
+ files = src.list_videos()
957
+ assert len(files) == 2
958
+ assert all(f.mime_type == "text/markdown" for f in files)
959
+
960
+
961
+# ---------------------------------------------------------------------------
962
+# LogseqSource
963
+# ---------------------------------------------------------------------------
964
+
965
+
966
+class TestLogseqSource:
967
+ def test_import(self):
968
+ from video_processor.sources.logseq_source import LogseqSource
969
+
970
+ assert LogseqSource is not None
971
+
972
+ def test_constructor(self, tmp_path):
973
+ from video_processor.sources.logseq_source import LogseqSource
974
+
975
+ src = LogseqSource(graph_path=str(tmp_path))
976
+ assert src.graph_path == tmp_path
977
+
978
+ def test_authenticate_with_pages(self, tmp_path):
979
+ from video_processor.sources.logseq_source import LogseqSource
980
+
981
+ (tmp_path / "pages").mkdir()
982
+ src = LogseqSource(graph_path=str(tmp_path))
983
+ assert src.authenticate() is True
984
+
985
+ def test_authenticate_no_pages_or_journals(self, tmp_path):
986
+ from video_processor.sources.logseq_source import LogseqSource
987
+
988
+ src = LogseqSource(graph_path=str(tmp_path))
989
+ assert src.authenticate() is False
990
+
991
+ def test_authenticate_nonexistent(self, tmp_path):
992
+ from video_processor.sources.logseq_source import LogseqSource
993
+
994
+ src = LogseqSource(graph_path=str(tmp_path / "nonexistent"))
995
+ assert src.authenticate() is False
996
+
997
+ def test_parse_page(self, tmp_path):
998
+ from video_processor.sources.logseq_source import parse_page
999
+
1000
+ page_content = (
1001
+ "title:: My Page\n"
1002
+ "tags:: #project #important\n"
1003
+ "- Some block content\n"
1004
+ " - Nested with [[Another Page]] link\n"
1005
+ " - And a #todo tag\n"
1006
+ " - Block ref ((abc12345-6789-0abc-def0-123456789abc))\n"
1007
+ )
1008
+ page_file = tmp_path / "my_page.md"
1009
+ page_file.write_text(page_content)
1010
+
1011
+ result = parse_page(page_file)
1012
+
1013
+ assert result["properties"]["title"] == "My Page"
1014
+ assert "Another Page" in result["links"]
1015
+ assert "todo" in result["tags"]
1016
+ assert "abc12345-6789-0abc-def0-123456789abc" in result["block_refs"]
1017
+ assert "Some block content" in result["body"]
1018
+
1019
+ def test_ingest_graph(self, tmp_path):
1020
+ from video_processor.sources.logseq_source import ingest_graph
1021
+
1022
+ pages_dir = tmp_path / "pages"
1023
+ pages_dir.mkdir()
1024
+ (pages_dir / "page_a.md").write_text("- Content linking [[Page B]]\n")
1025
+ (pages_dir / "page_b.md").write_text("- Content linking [[Page A]]\n")
1026
+
1027
+ journals_dir = tmp_path / "journals"
1028
+ journals_dir.mkdir()
1029
+ (journals_dir / "2026_03_07.md").write_text("- Journal entry\n")
1030
+
1031
+ result = ingest_graph(tmp_path)
1032
+
1033
+ assert len(result["notes"]) == 3
1034
+ assert len(result["links"]) == 2
1035
+
1036
+ def test_list_videos(self, tmp_path):
1037
+ from video_processor.sources.logseq_source import LogseqSource
1038
+
1039
+ pages_dir = tmp_path / "pages"
1040
+ pages_dir.mkdir()
1041
+ (pages_dir / "page1.md").write_text("- content")
1042
+
1043
+ src = LogseqSource(graph_path=str(tmp_path))
1044
+ files = src.list_videos()
1045
+ assert len(files) == 1
1046
+ assert files[0].mime_type == "text/markdown"
1047
+
1048
+
1049
+# ---------------------------------------------------------------------------
1050
+# NotionSource
1051
+# ---------------------------------------------------------------------------
1052
+
1053
+
1054
+class TestNotionSource:
1055
+ def test_import(self):
1056
+ from video_processor.sources.notion_source import NotionSource
1057
+
1058
+ assert NotionSource is not None
1059
+
1060
+ def test_constructor(self):
1061
+ from video_processor.sources.notion_source import NotionSource
1062
+
1063
+ src = NotionSource(token="ntn_test123", database_id="db-1")
1064
+ assert src.token == "ntn_test123"
1065
+ assert src.database_id == "db-1"
1066
+ assert src.page_ids == []
1067
+
1068
+ @patch.dict(os.environ, {}, clear=True)
1069
+ def test_authenticate_no_token(self):
1070
+ from video_processor.sources.notion_source import NotionSource
1071
+
1072
+ src = NotionSource(token="")
1073
+ assert src.authenticate() is False
1074
+
1075
+ @patch("requests.get")
1076
+ def test_authenticate_with_mock(self, mock_get):
1077
+ from video_processor.sources.notion_source import NotionSource
1078
+
1079
+ mock_resp = MagicMock()
1080
+ mock_resp.raise_for_status = MagicMock()
1081
+ mock_resp.json.return_value = {"name": "Test Bot"}
1082
+ mock_get.return_value = mock_resp
1083
+
1084
+ src = NotionSource(token="ntn_test123")
1085
+ assert src.authenticate() is True
1086
+
1087
+ @patch("requests.post")
1088
+ def test_list_videos_database(self, mock_post):
1089
+ from video_processor.sources.notion_source import NotionSource
1090
+
1091
+ mock_resp = MagicMock()
1092
+ mock_resp.raise_for_status = MagicMock()
1093
+ mock_resp.json.return_value = {
1094
+ "results": [
1095
+ {
1096
+ "id": "page-1",
1097
+ "last_edited_time": "2026-03-01T00:00:00Z",
1098
+ "properties": {
1099
+ "Name": {
1100
+ "type": "title",
1101
+ "title": [{"plain_text": "Meeting Notes"}],
1102
+ }
1103
+ },
1104
+ },
1105
+ ],
1106
+ "has_more": False,
1107
+ }
1108
+ mock_post.return_value = mock_resp
1109
+
1110
+ src = NotionSource(token="ntn_test", database_id="db-1")
1111
+ files = src.list_videos()
1112
+ assert len(files) == 1
1113
+ assert files[0].name == "Meeting Notes"
1114
+ assert files[0].id == "page-1"
1115
+
1116
+ def test_blocks_to_text(self):
1117
+ from video_processor.sources.notion_source import NotionSource
1118
+
1119
+ src = NotionSource(token="test")
1120
+ blocks = [
1121
+ {
1122
+ "type": "heading_1",
1123
+ "heading_1": {
1124
+ "rich_text": [{"plain_text": "Title"}],
1125
+ },
1126
+ },
1127
+ {
1128
+ "type": "paragraph",
1129
+ "paragraph": {
1130
+ "rich_text": [{"plain_text": "Some paragraph text."}],
1131
+ },
1132
+ },
1133
+ {
1134
+ "type": "bulleted_list_item",
1135
+ "bulleted_list_item": {
1136
+ "rich_text": [{"plain_text": "A bullet point"}],
1137
+ },
1138
+ },
1139
+ {
1140
+ "type": "divider",
1141
+ "divider": {},
1142
+ },
1143
+ ]
1144
+ result = src._blocks_to_text(blocks)
1145
+ assert "# Title" in result
1146
+ assert "Some paragraph text." in result
1147
+ assert "- A bullet point" in result
1148
+ assert "---" in result
1149
+
1150
+
1151
+# ---------------------------------------------------------------------------
1152
+# AppleNotesSource
1153
+# ---------------------------------------------------------------------------
1154
+
1155
+
1156
+class TestAppleNotesSource:
1157
+ def test_import(self):
1158
+ from video_processor.sources.apple_notes_source import AppleNotesSource
1159
+
1160
+ assert AppleNotesSource is not None
1161
+
1162
+ def test_constructor(self):
1163
+ from video_processor.sources.apple_notes_source import AppleNotesSource
1164
+
1165
+ src = AppleNotesSource(folder="Work")
1166
+ assert src.folder == "Work"
1167
+
1168
+ def test_constructor_default(self):
1169
+ from video_processor.sources.apple_notes_source import AppleNotesSource
1170
+
1171
+ src = AppleNotesSource()
1172
+ assert src.folder is None
1173
+
1174
+ def test_authenticate_platform(self):
1175
+ import sys
1176
+
1177
+ from video_processor.sources.apple_notes_source import AppleNotesSource
1178
+
1179
+ src = AppleNotesSource()
1180
+ result = src.authenticate()
1181
+ if sys.platform == "darwin":
1182
+ assert result is True
1183
+ else:
1184
+ assert result is False
1185
+
1186
+ def test_html_to_text(self):
1187
+ from video_processor.sources.apple_notes_source import AppleNotesSource
1188
+
1189
+ html = (
1190
+ "<div>Hello <b>World</b></div>"
1191
+ "<p>Paragraph one.</p>"
1192
+ "<p>Paragraph two with &amp; entity.</p>"
1193
+ "<br/>"
1194
+ "<ul><li>Item 1</li><li>Item 2</li></ul>"
1195
+ )
1196
+ result = AppleNotesSource._html_to_text(html)
1197
+ assert "Hello World" in result
1198
+ assert "Paragraph one." in result
1199
+ assert "Paragraph two with & entity." in result
1200
+ assert "Item 1" in result
1201
+
1202
+ def test_html_to_text_empty(self):
1203
+ from video_processor.sources.apple_notes_source import AppleNotesSource
1204
+
1205
+ assert AppleNotesSource._html_to_text("") == ""
1206
+
1207
+ def test_html_to_text_entities(self):
1208
+ from video_processor.sources.apple_notes_source import AppleNotesSource
1209
+
1210
+ html = "&lt;code&gt; &quot;test&quot; &#39;single&#39; &nbsp;space"
1211
+ result = AppleNotesSource._html_to_text(html)
1212
+ assert "<code>" in result
1213
+ assert '"test"' in result
1214
+ assert "'single'" in result
1215
+
1216
+
1217
+# ---------------------------------------------------------------------------
1218
+# GoogleKeepSource
1219
+# ---------------------------------------------------------------------------
1220
+
1221
+
1222
+class TestGoogleKeepSource:
1223
+ def test_import(self):
1224
+ from video_processor.sources.google_keep_source import GoogleKeepSource
1225
+
1226
+ assert GoogleKeepSource is not None
1227
+
1228
+ def test_constructor(self):
1229
+ from video_processor.sources.google_keep_source import GoogleKeepSource
1230
+
1231
+ src = GoogleKeepSource(label="meetings")
1232
+ assert src.label == "meetings"
1233
+
1234
+ def test_constructor_default(self):
1235
+ from video_processor.sources.google_keep_source import GoogleKeepSource
1236
+
1237
+ src = GoogleKeepSource()
1238
+ assert src.label is None
1239
+
1240
+ @patch("shutil.which", return_value=None)
1241
+ def test_authenticate_no_gws(self, _mock_which):
1242
+ from video_processor.sources.google_keep_source import GoogleKeepSource
1243
+
1244
+ src = GoogleKeepSource()
1245
+ assert src.authenticate() is False
1246
+
1247
+ def test_note_to_text(self):
1248
+ from video_processor.sources.google_keep_source import _note_to_text
1249
+
1250
+ note = {
1251
+ "title": "Shopping List",
1252
+ "body": "Remember to buy groceries",
1253
+ "listContent": [
1254
+ {"text": "Milk", "checked": True},
1255
+ {"text": "Bread", "checked": False},
1256
+ {"text": "", "checked": False},
1257
+ ],
1258
+ }
1259
+ result = _note_to_text(note)
1260
+ assert "Shopping List" in result
1261
+ assert "Remember to buy groceries" in result
1262
+ assert "- [x] Milk" in result
1263
+ assert "- [ ] Bread" in result
1264
+
1265
+ def test_note_to_text_empty(self):
1266
+ from video_processor.sources.google_keep_source import _note_to_text
1267
+
1268
+ assert _note_to_text({}) == ""
1269
+
1270
+ def test_note_to_text_text_content(self):
1271
+ from video_processor.sources.google_keep_source import _note_to_text
1272
+
1273
+ note = {"title": "Simple", "textContent": "Just a plain note"}
1274
+ result = _note_to_text(note)
1275
+ assert "Simple" in result
1276
+ assert "Just a plain note" in result
1277
+
1278
+
1279
+# ---------------------------------------------------------------------------
1280
+# OneNoteSource
1281
+# ---------------------------------------------------------------------------
1282
+
1283
+
1284
+class TestOneNoteSource:
1285
+ def test_import(self):
1286
+ from video_processor.sources.onenote_source import OneNoteSource
1287
+
1288
+ assert OneNoteSource is not None
1289
+
1290
+ def test_constructor(self):
1291
+ from video_processor.sources.onenote_source import OneNoteSource
1292
+
1293
+ src = OneNoteSource(notebook_name="Work Notes", section_name="Meetings")
1294
+ assert src.notebook_name == "Work Notes"
1295
+ assert src.section_name == "Meetings"
1296
+
1297
+ def test_constructor_default(self):
1298
+ from video_processor.sources.onenote_source import OneNoteSource
1299
+
1300
+ src = OneNoteSource()
1301
+ assert src.notebook_name is None
1302
+ assert src.section_name is None
1303
+
1304
+ @patch("shutil.which", return_value=None)
1305
+ def test_authenticate_no_m365(self, _mock_which):
1306
+ from video_processor.sources.onenote_source import OneNoteSource
1307
+
1308
+ src = OneNoteSource()
1309
+ assert src.authenticate() is False
1310
+
1311
+ def test_html_to_text(self):
1312
+ from video_processor.sources.onenote_source import _html_to_text
1313
+
1314
+ html = (
1315
+ "<html><body>"
1316
+ "<h1>Meeting Notes</h1>"
1317
+ "<p>Discussed the &amp; project.</p>"
1318
+ "<script>var x = 1;</script>"
1319
+ "<style>.foo { color: red; }</style>"
1320
+ "<ul><li>Action item 1</li><li>Action item 2</li></ul>"
1321
+ "<p>Entity &#x41; and &#65; decoded.</p>"
1322
+ "</body></html>"
1323
+ )
1324
+ result = _html_to_text(html)
1325
+ assert "Meeting Notes" in result
1326
+ assert "Discussed the & project." in result
1327
+ assert "var x" not in result
1328
+ assert ".foo" not in result
1329
+ assert "Action item 1" in result
1330
+ assert "Entity A and A decoded." in result
1331
+
1332
+ def test_html_to_text_empty(self):
1333
+ from video_processor.sources.onenote_source import _html_to_text
1334
+
1335
+ assert _html_to_text("") == ""
1336
+
1337
+ def test_html_to_text_entities(self):
1338
+ from video_processor.sources.onenote_source import _html_to_text
1339
+
1340
+ html = "&lt;tag&gt; &quot;quoted&quot; &apos;apos&apos; &nbsp;space"
1341
+ result = _html_to_text(html)
1342
+ assert "<tag>" in result
1343
+ assert '"quoted"' in result
1344
+ assert "'apos'" in result
8631345
--- tests/test_sources.py
+++ tests/test_sources.py
@@ -858,5 +858,487 @@
858 from video_processor.sources.m365_source import M365Source
859
860 src = M365Source(web_url="https://contoso.sharepoint.com")
861 files = src.list_videos()
862 assert files == []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
863
--- tests/test_sources.py
+++ tests/test_sources.py
@@ -858,5 +858,487 @@
858 from video_processor.sources.m365_source import M365Source
859
860 src = M365Source(web_url="https://contoso.sharepoint.com")
861 files = src.list_videos()
862 assert files == []
863
864
865 # ---------------------------------------------------------------------------
866 # ObsidianSource
867 # ---------------------------------------------------------------------------
868
869
class TestObsidianSource:
    """Exercise the Obsidian vault ingester (filesystem-backed, no mocks)."""

    def test_import(self):
        from video_processor.sources.obsidian_source import ObsidianSource

        assert ObsidianSource is not None

    def test_constructor(self, tmp_path):
        from video_processor.sources.obsidian_source import ObsidianSource

        source = ObsidianSource(vault_path=str(tmp_path))
        assert source.vault_path == tmp_path

    def test_authenticate_with_vault(self, tmp_path):
        from video_processor.sources.obsidian_source import ObsidianSource

        (tmp_path / "note.md").write_text("# Hello")
        assert ObsidianSource(vault_path=str(tmp_path)).authenticate() is True

    def test_authenticate_empty_dir(self, tmp_path):
        from video_processor.sources.obsidian_source import ObsidianSource

        # A directory with no markdown files is not a usable vault.
        assert ObsidianSource(vault_path=str(tmp_path)).authenticate() is False

    def test_authenticate_nonexistent(self, tmp_path):
        from video_processor.sources.obsidian_source import ObsidianSource

        missing = tmp_path / "nonexistent"
        assert ObsidianSource(vault_path=str(missing)).authenticate() is False

    def test_parse_note(self, tmp_path):
        from video_processor.sources.obsidian_source import parse_note

        raw = "\n".join(
            [
                "---",
                "title: Test Note",
                "tags: [python, testing]",
                "---",
                "# Heading One",
                "",
                "Some text with a [[Wiki Link]] and [[Another Page|alias]].",
                "",
                "Also has #tag1 and #tag2 inline tags.",
                "",
                "## Sub Heading",
                "",
                "More content here.",
                "",
            ]
        )
        target = tmp_path / "test_note.md"
        target.write_text(raw)

        parsed = parse_note(target)

        frontmatter = parsed["frontmatter"]
        assert frontmatter["title"] == "Test Note"
        assert isinstance(frontmatter["tags"], list)
        assert "python" in frontmatter["tags"]
        for link in ("Wiki Link", "Another Page"):
            assert link in parsed["links"]
        for tag in ("tag1", "tag2"):
            assert tag in parsed["tags"]
        headings = parsed["headings"]
        assert len(headings) == 2
        assert headings[0]["level"] == 1
        assert headings[0]["text"] == "Heading One"
        assert "Some text" in parsed["body"]

    def test_ingest_vault(self, tmp_path):
        from video_processor.sources.obsidian_source import ingest_vault

        (tmp_path / "note_a.md").write_text("# A\n\nLinks to [[B]].\n")
        (tmp_path / "note_b.md").write_text("# B\n\nLinks to [[A]] and [[C]].\n")

        result = ingest_vault(tmp_path)

        assert len(result["notes"]) == 2
        note_names = {n["name"] for n in result["notes"]}
        assert {"note_a", "note_b"} <= note_names
        # note_a -> B plus note_b -> A, C gives three link records in total.
        assert len(result["links"]) == 3

    def test_list_videos(self, tmp_path):
        from video_processor.sources.obsidian_source import ObsidianSource

        (tmp_path / "note1.md").write_text("# Note 1")
        nested = tmp_path / "subdir"
        nested.mkdir()
        (nested / "note2.md").write_text("# Note 2")

        # Recursive listing: top-level and nested notes both appear.
        files = ObsidianSource(vault_path=str(tmp_path)).list_videos()
        assert len(files) == 2
        assert all(f.mime_type == "text/markdown" for f in files)
959
960
961 # ---------------------------------------------------------------------------
962 # LogseqSource
963 # ---------------------------------------------------------------------------
964
965
class TestLogseqSource:
    """Exercise the Logseq graph ingester (pages/ and journals/ layout)."""

    def test_import(self):
        from video_processor.sources.logseq_source import LogseqSource

        assert LogseqSource is not None

    def test_constructor(self, tmp_path):
        from video_processor.sources.logseq_source import LogseqSource

        source = LogseqSource(graph_path=str(tmp_path))
        assert source.graph_path == tmp_path

    def test_authenticate_with_pages(self, tmp_path):
        from video_processor.sources.logseq_source import LogseqSource

        (tmp_path / "pages").mkdir()
        assert LogseqSource(graph_path=str(tmp_path)).authenticate() is True

    def test_authenticate_no_pages_or_journals(self, tmp_path):
        from video_processor.sources.logseq_source import LogseqSource

        # Neither pages/ nor journals/ present -> not a Logseq graph.
        assert LogseqSource(graph_path=str(tmp_path)).authenticate() is False

    def test_authenticate_nonexistent(self, tmp_path):
        from video_processor.sources.logseq_source import LogseqSource

        missing = tmp_path / "nonexistent"
        assert LogseqSource(graph_path=str(missing)).authenticate() is False

    def test_parse_page(self, tmp_path):
        from video_processor.sources.logseq_source import parse_page

        block_ref = "abc12345-6789-0abc-def0-123456789abc"
        raw = "\n".join(
            [
                "title:: My Page",
                "tags:: #project #important",
                "- Some block content",
                "  - Nested with [[Another Page]] link",
                "  - And a #todo tag",
                f"  - Block ref (({block_ref}))",
                "",
            ]
        )
        page_file = tmp_path / "my_page.md"
        page_file.write_text(raw)

        parsed = parse_page(page_file)

        assert parsed["properties"]["title"] == "My Page"
        assert "Another Page" in parsed["links"]
        assert "todo" in parsed["tags"]
        assert block_ref in parsed["block_refs"]
        assert "Some block content" in parsed["body"]

    def test_ingest_graph(self, tmp_path):
        from video_processor.sources.logseq_source import ingest_graph

        pages = tmp_path / "pages"
        pages.mkdir()
        (pages / "page_a.md").write_text("- Content linking [[Page B]]\n")
        (pages / "page_b.md").write_text("- Content linking [[Page A]]\n")

        journals = tmp_path / "journals"
        journals.mkdir()
        (journals / "2026_03_07.md").write_text("- Journal entry\n")

        result = ingest_graph(tmp_path)

        # Two pages plus one journal entry; one outgoing link per page.
        assert len(result["notes"]) == 3
        assert len(result["links"]) == 2

    def test_list_videos(self, tmp_path):
        from video_processor.sources.logseq_source import LogseqSource

        pages = tmp_path / "pages"
        pages.mkdir()
        (pages / "page1.md").write_text("- content")

        files = LogseqSource(graph_path=str(tmp_path)).list_videos()
        assert len(files) == 1
        assert files[0].mime_type == "text/markdown"
1047
1048
1049 # ---------------------------------------------------------------------------
1050 # NotionSource
1051 # ---------------------------------------------------------------------------
1052
1053
class TestNotionSource:
    """Tests for the Notion API connector; all HTTP calls are mocked."""

    def test_import(self):
        from video_processor.sources.notion_source import NotionSource

        assert NotionSource is not None

    def test_constructor(self):
        from video_processor.sources.notion_source import NotionSource

        src = NotionSource(token="ntn_test123", database_id="db-1")
        assert src.token == "ntn_test123"
        assert src.database_id == "db-1"
        # No explicit page_ids -> defaults to an empty list.
        assert src.page_ids == []

    # Environment is cleared so authenticate() cannot fall back to an
    # ambient token from the test runner's env.
    @patch.dict(os.environ, {}, clear=True)
    def test_authenticate_no_token(self):
        from video_processor.sources.notion_source import NotionSource

        src = NotionSource(token="")
        assert src.authenticate() is False

    @patch("requests.get")
    def test_authenticate_with_mock(self, mock_get):
        from video_processor.sources.notion_source import NotionSource

        # Simulate a successful bot-identity lookup response.
        mock_resp = MagicMock()
        mock_resp.raise_for_status = MagicMock()
        mock_resp.json.return_value = {"name": "Test Bot"}
        mock_get.return_value = mock_resp

        src = NotionSource(token="ntn_test123")
        assert src.authenticate() is True

    @patch("requests.post")
    def test_list_videos_database(self, mock_post):
        from video_processor.sources.notion_source import NotionSource

        # Single database query result; has_more=False stops pagination.
        mock_resp = MagicMock()
        mock_resp.raise_for_status = MagicMock()
        mock_resp.json.return_value = {
            "results": [
                {
                    "id": "page-1",
                    "last_edited_time": "2026-03-01T00:00:00Z",
                    "properties": {
                        "Name": {
                            "type": "title",
                            "title": [{"plain_text": "Meeting Notes"}],
                        }
                    },
                },
            ],
            "has_more": False,
        }
        mock_post.return_value = mock_resp

        src = NotionSource(token="ntn_test", database_id="db-1")
        files = src.list_videos()
        assert len(files) == 1
        # The title property becomes the file name; the page id is preserved.
        assert files[0].name == "Meeting Notes"
        assert files[0].id == "page-1"

    def test_blocks_to_text(self):
        from video_processor.sources.notion_source import NotionSource

        src = NotionSource(token="test")
        blocks = [
            {
                "type": "heading_1",
                "heading_1": {
                    "rich_text": [{"plain_text": "Title"}],
                },
            },
            {
                "type": "paragraph",
                "paragraph": {
                    "rich_text": [{"plain_text": "Some paragraph text."}],
                },
            },
            {
                "type": "bulleted_list_item",
                "bulleted_list_item": {
                    "rich_text": [{"plain_text": "A bullet point"}],
                },
            },
            {
                "type": "divider",
                "divider": {},
            },
        ]
        result = src._blocks_to_text(blocks)
        # Each Notion block type maps to its markdown equivalent.
        assert "# Title" in result
        assert "Some paragraph text." in result
        assert "- A bullet point" in result
        assert "---" in result
1149
1150
1151 # ---------------------------------------------------------------------------
1152 # AppleNotesSource
1153 # ---------------------------------------------------------------------------
1154
1155
class TestAppleNotesSource:
    """Exercise the macOS Apple Notes connector's pure helpers."""

    def test_import(self):
        from video_processor.sources.apple_notes_source import AppleNotesSource

        assert AppleNotesSource is not None

    def test_constructor(self):
        from video_processor.sources.apple_notes_source import AppleNotesSource

        assert AppleNotesSource(folder="Work").folder == "Work"

    def test_constructor_default(self):
        from video_processor.sources.apple_notes_source import AppleNotesSource

        assert AppleNotesSource().folder is None

    def test_authenticate_platform(self):
        import sys

        from video_processor.sources.apple_notes_source import AppleNotesSource

        # osascript only exists on macOS, so auth mirrors the platform check.
        expected = sys.platform == "darwin"
        assert AppleNotesSource().authenticate() is expected

    def test_html_to_text(self):
        from video_processor.sources.apple_notes_source import AppleNotesSource

        html = (
            "<div>Hello <b>World</b></div>"
            "<p>Paragraph one.</p>"
            "<p>Paragraph two with &amp; entity.</p>"
            "<br/>"
            "<ul><li>Item 1</li><li>Item 2</li></ul>"
        )
        text = AppleNotesSource._html_to_text(html)
        for expected in (
            "Hello World",
            "Paragraph one.",
            "Paragraph two with & entity.",
            "Item 1",
        ):
            assert expected in text

    def test_html_to_text_empty(self):
        from video_processor.sources.apple_notes_source import AppleNotesSource

        assert AppleNotesSource._html_to_text("") == ""

    def test_html_to_text_entities(self):
        from video_processor.sources.apple_notes_source import AppleNotesSource

        decoded = AppleNotesSource._html_to_text(
            "&lt;code&gt; &quot;test&quot; &#39;single&#39; &nbsp;space"
        )
        assert "<code>" in decoded
        assert '"test"' in decoded
        assert "'single'" in decoded
1215
1216
1217 # ---------------------------------------------------------------------------
1218 # GoogleKeepSource
1219 # ---------------------------------------------------------------------------
1220
1221
class TestGoogleKeepSource:
    """Exercise the Google Keep connector (gws CLI based)."""

    def test_import(self):
        from video_processor.sources.google_keep_source import GoogleKeepSource

        assert GoogleKeepSource is not None

    def test_constructor(self):
        from video_processor.sources.google_keep_source import GoogleKeepSource

        assert GoogleKeepSource(label="meetings").label == "meetings"

    def test_constructor_default(self):
        from video_processor.sources.google_keep_source import GoogleKeepSource

        assert GoogleKeepSource().label is None

    @patch("shutil.which", return_value=None)
    def test_authenticate_no_gws(self, _mock_which):
        from video_processor.sources.google_keep_source import GoogleKeepSource

        # Without the gws binary on PATH, authentication must fail cleanly.
        assert GoogleKeepSource().authenticate() is False

    def test_note_to_text(self):
        from video_processor.sources.google_keep_source import _note_to_text

        rendered = _note_to_text(
            {
                "title": "Shopping List",
                "body": "Remember to buy groceries",
                "listContent": [
                    {"text": "Milk", "checked": True},
                    {"text": "Bread", "checked": False},
                    {"text": "", "checked": False},
                ],
            }
        )
        assert "Shopping List" in rendered
        assert "Remember to buy groceries" in rendered
        # Checklist items render as markdown task-list checkboxes.
        assert "- [x] Milk" in rendered
        assert "- [ ] Bread" in rendered

    def test_note_to_text_empty(self):
        from video_processor.sources.google_keep_source import _note_to_text

        assert _note_to_text({}) == ""

    def test_note_to_text_text_content(self):
        from video_processor.sources.google_keep_source import _note_to_text

        rendered = _note_to_text({"title": "Simple", "textContent": "Just a plain note"})
        assert "Simple" in rendered
        assert "Just a plain note" in rendered
1277
1278
1279 # ---------------------------------------------------------------------------
1280 # OneNoteSource
1281 # ---------------------------------------------------------------------------
1282
1283
class TestOneNoteSource:
    """Exercise the OneNote connector (m365 CLI based)."""

    def test_import(self):
        from video_processor.sources.onenote_source import OneNoteSource

        assert OneNoteSource is not None

    def test_constructor(self):
        from video_processor.sources.onenote_source import OneNoteSource

        source = OneNoteSource(notebook_name="Work Notes", section_name="Meetings")
        assert source.notebook_name == "Work Notes"
        assert source.section_name == "Meetings"

    def test_constructor_default(self):
        from video_processor.sources.onenote_source import OneNoteSource

        source = OneNoteSource()
        assert source.notebook_name is None
        assert source.section_name is None

    @patch("shutil.which", return_value=None)
    def test_authenticate_no_m365(self, _mock_which):
        from video_processor.sources.onenote_source import OneNoteSource

        # Without the m365 binary on PATH, authentication must fail cleanly.
        assert OneNoteSource().authenticate() is False

    def test_html_to_text(self):
        from video_processor.sources.onenote_source import _html_to_text

        text = _html_to_text(
            "<html><body>"
            "<h1>Meeting Notes</h1>"
            "<p>Discussed the &amp; project.</p>"
            "<script>var x = 1;</script>"
            "<style>.foo { color: red; }</style>"
            "<ul><li>Action item 1</li><li>Action item 2</li></ul>"
            "<p>Entity &#x41; and &#65; decoded.</p>"
            "</body></html>"
        )
        # Visible text (including decoded numeric entities) survives...
        for expected in (
            "Meeting Notes",
            "Discussed the & project.",
            "Action item 1",
            "Entity A and A decoded.",
        ):
            assert expected in text
        # ...while <script> and <style> bodies are stripped entirely.
        assert "var x" not in text
        assert ".foo" not in text

    def test_html_to_text_empty(self):
        from video_processor.sources.onenote_source import _html_to_text

        assert _html_to_text("") == ""

    def test_html_to_text_entities(self):
        from video_processor.sources.onenote_source import _html_to_text

        decoded = _html_to_text("&lt;tag&gt; &quot;quoted&quot; &apos;apos&apos; &nbsp;space")
        assert "<tag>" in decoded
        assert '"quoted"' in decoded
        assert "'apos'" in decoded
1345
--- video_processor/agent/skills/__init__.py
+++ video_processor/agent/skills/__init__.py
@@ -4,15 +4,17 @@
44
from video_processor.agent.skills import ( # noqa: F401
55
artifact_export,
66
cli_adapter,
77
doc_generator,
88
github_integration,
9
+ notes_export,
910
prd,
1011
project_plan,
1112
requirements_chat,
1213
roadmap,
1314
task_breakdown,
15
+ wiki_generator,
1416
)
1517
from video_processor.agent.skills.base import (
1618
AgentContext,
1719
Artifact,
1820
Skill,
1921
2022
ADDED video_processor/agent/skills/notes_export.py
2123
ADDED video_processor/agent/skills/wiki_generator.py
--- video_processor/agent/skills/__init__.py
+++ video_processor/agent/skills/__init__.py
@@ -4,15 +4,17 @@
4 from video_processor.agent.skills import ( # noqa: F401
5 artifact_export,
6 cli_adapter,
7 doc_generator,
8 github_integration,
 
9 prd,
10 project_plan,
11 requirements_chat,
12 roadmap,
13 task_breakdown,
 
14 )
15 from video_processor.agent.skills.base import (
16 AgentContext,
17 Artifact,
18 Skill,
19
20 ADDED video_processor/agent/skills/notes_export.py
21 ADDED video_processor/agent/skills/wiki_generator.py
--- video_processor/agent/skills/__init__.py
+++ video_processor/agent/skills/__init__.py
@@ -4,15 +4,17 @@
4 from video_processor.agent.skills import ( # noqa: F401
5 artifact_export,
6 cli_adapter,
7 doc_generator,
8 github_integration,
9 notes_export,
10 prd,
11 project_plan,
12 requirements_chat,
13 roadmap,
14 task_breakdown,
15 wiki_generator,
16 )
17 from video_processor.agent.skills.base import (
18 AgentContext,
19 Artifact,
20 Skill,
21
22 ADDED video_processor/agent/skills/notes_export.py
23 ADDED video_processor/agent/skills/wiki_generator.py
--- a/video_processor/agent/skills/notes_export.py
+++ b/video_processor/agent/skills/notes_export.py
@@ -0,0 +1,420 @@
1
+"""Skill: Export knowledge graph as structured notes (Obsidian, Notion)."""
2
+
3
+import csv
4
+import io
5
+import logging
6
+from datetime import date
7
+from pathlib import Path
8
+from typing import Dict, List, Optional
9
+
10
+from video_processor.agent.skills.base import (
11
+ AgentContext,
12
+ Artifact,
13
+ Skill,
14
+ register_skill,
15
+)
16
+
17
+logger = logging.getLogger(__name__)
18
+
19
+
20
+def _sanitize_filename(name: str) -> str:
21
+ """Convert a name to a filesystem-safe filename."""
22
+ return (
23
+ name.replace("/", "-")
24
+ .replace("\\", "-")
25
+ .replace(":", "-")
26
+ .replace('"', "")
27
+ .replace("?", "")
28
+ .replace("*", "")
29
+ .replace("<", "")
30
+ .replace(">", "")
31
+ .replace("|", "")
32
+ )
33
+
34
+
35
+def _build_indexes(kg_data: dict):
36
+ """Build lookup structures from knowledge graph data.
37
+
38
+ Returns (nodes, by_type, node_lookup, outgoing, incoming).
39
+ """
40
+ nodes = kg_data.get("nodes", [])
41
+ relationships = kg_data.get("relationships", [])
42
+
43
+ by_type: Dict[str, list] = {}
44
+ node_lookup: Dict[str, dict] = {}
45
+ for node in nodes:
46
+ name = node.get("name", node.get("id", ""))
47
+ ntype = node.get("type", "concept")
48
+ by_type.setdefault(ntype, []).append(node)
49
+ node_lookup[name] = node
50
+
51
+ outgoing: Dict[str, list] = {}
52
+ incoming: Dict[str, list] = {}
53
+ for rel in relationships:
54
+ src = rel.get("source", "")
55
+ tgt = rel.get("target", "")
56
+ rtype = rel.get("type", "related_to")
57
+ outgoing.setdefault(src, []).append((tgt, rtype))
58
+ incoming.setdefault(tgt, []).append((src, rtype))
59
+
60
+ return nodes, by_type, node_lookup, outgoing, incoming
61
+
62
+
63
+# ---------------------------------------------------------------------------
64
+# Obsidian export
65
+# ---------------------------------------------------------------------------
66
+
67
+
68
def export_to_obsidian(
    kg_data: dict,
    output_dir: Path,
    artifacts: Optional[List[Artifact]] = None,
) -> List[Path]:
    """Export knowledge graph as an Obsidian vault.

    Creates one ``.md`` file per entity with YAML frontmatter and
    ``[[wiki-links]]``, an ``_Index.md`` Map of Content, tag pages per
    entity type, and optional artifact notes.

    Args:
        kg_data: ``{"nodes": [...], "relationships": [...]}`` mapping as
            produced by the knowledge graph's ``to_dict``.
        output_dir: Vault directory; created if missing.
        artifacts: Optional artifacts to render as standalone notes.

    Returns:
        Paths of every file written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    artifacts = artifacts or []
    created: List[Path] = []
    today = date.today().isoformat()

    nodes, by_type, node_lookup, outgoing, incoming = _build_indexes(kg_data)

    def _display_name(node: dict) -> str:
        # Consistent "name" -> "id" fallback. The entity-note loop already
        # used this fallback but the index and tag pages looked up only
        # "name", so id-only nodes got a note file yet rendered as empty
        # [[]] links in the listings.
        return node.get("name", node.get("id", ""))

    # --- Individual entity notes ---
    for node in nodes:
        name = _display_name(node)
        if not name:
            continue
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        aliases = node.get("aliases", [])

        # YAML frontmatter (aliases block only when aliases exist)
        tags_yaml = f" - {ntype}"
        aliases_yaml = ""
        if aliases:
            alias_lines = "\n".join(f" - {a}" for a in aliases)
            aliases_yaml = f"aliases:\n{alias_lines}\n"

        frontmatter = f"---\ntype: {ntype}\ntags:\n{tags_yaml}\n{aliases_yaml}date: {today}\n---\n"

        parts = [frontmatter, f"# {name}", ""]

        # Descriptions
        if descs:
            for d in descs:
                parts.append(f"{d}")
            parts.append("")

        # Outgoing relationships
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            for tgt, rtype in outs:
                parts.append(f"- **{rtype}**: [[{tgt}]]")
            parts.append("")

        # Incoming relationships
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced by")
            parts.append("")
            for src, rtype in ins:
                parts.append(f"- **{rtype}** from [[{src}]]")
            parts.append("")

        path = output_dir / (_sanitize_filename(name) + ".md")
        path.write_text("\n".join(parts), encoding="utf-8")
        created.append(path)

    # --- Index note (Map of Content) ---
    index_parts = [
        "---",
        "type: index",
        "tags:",
        " - MOC",
        f"date: {today}",
        "---",
        "",
        "# Index",
        "",
        f"**{len(nodes)}** entities | **{len(kg_data.get('relationships', []))}** relationships",
        "",
    ]

    for etype in sorted(by_type.keys()):
        index_parts.append(f"## {etype.title()}")
        index_parts.append("")
        for node in sorted(by_type[etype], key=_display_name):
            name = _display_name(node)
            if name:  # skip unnamed nodes instead of emitting empty [[]]
                index_parts.append(f"- [[{name}]]")
        index_parts.append("")

    if artifacts:
        index_parts.append("## Artifacts")
        index_parts.append("")
        for art in artifacts:
            index_parts.append(f"- [[{art.name}]]")
        index_parts.append("")

    index_path = output_dir / "_Index.md"
    index_path.write_text("\n".join(index_parts), encoding="utf-8")
    created.append(index_path)

    # --- Tag pages (one per entity type) ---
    for etype, elist in sorted(by_type.items()):
        tag_parts = [
            "---",
            "type: tag",
            "tags:",
            f" - {etype}",
            f"date: {today}",
            "---",
            "",
            f"# {etype.title()}",
            "",
            f"All entities of type **{etype}** ({len(elist)}).",
            "",
        ]
        for node in sorted(elist, key=_display_name):
            name = _display_name(node)
            if not name:
                continue
            descs = node.get("descriptions", [])
            summary = descs[0] if descs else ""
            tag_parts.append(f"- [[{name}]]" + (f" - {summary}" if summary else ""))
        tag_parts.append("")

        tag_path = output_dir / _sanitize_filename(f"Tag - {etype.title()}.md")
        tag_path.write_text("\n".join(tag_parts), encoding="utf-8")
        created.append(tag_path)

    # --- Artifact notes ---
    for art in artifacts:
        art_parts = [
            "---",
            "type: artifact",
            f"artifact_type: {art.artifact_type}",
            "tags:",
            " - artifact",
            f" - {art.artifact_type}",
            f"date: {today}",
            "---",
            "",
            f"# {art.name}",
            "",
            art.content,
            "",
        ]
        art_path = output_dir / (_sanitize_filename(art.name) + ".md")
        art_path.write_text("\n".join(art_parts), encoding="utf-8")
        created.append(art_path)

    logger.info("Exported %d Obsidian notes to %s", len(created), output_dir)
    return created
221
+
222
+
223
+# ---------------------------------------------------------------------------
224
+# Notion-compatible markdown export
225
+# ---------------------------------------------------------------------------
226
+
227
+
228
def export_to_notion_md(
    kg_data: dict,
    output_dir: Path,
    artifacts: Optional[List[Artifact]] = None,
) -> List[Path]:
    """Export knowledge graph as Notion-compatible markdown.

    Creates ``.md`` files with Notion-style callout blocks plus a
    database-style CSV (``entities_database.csv``) for bulk import.

    Args:
        kg_data: ``{"nodes": [...], "relationships": [...]}`` mapping as
            produced by the knowledge graph's ``to_dict``.
        output_dir: Target directory; created if missing.
        artifacts: Optional artifacts to render as standalone pages.

    Returns:
        Paths of every file written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    artifacts = artifacts or []
    created: List[Path] = []

    nodes, by_type, node_lookup, outgoing, incoming = _build_indexes(kg_data)

    def _display_name(node: dict) -> str:
        # Consistent "name" -> "id" fallback; the overview previously used
        # only "name" and listed id-only nodes as blank bullets.
        return node.get("name", node.get("id", ""))

    # Emoji shortcode per entity type — hoisted out of the page loop
    # (it was previously rebuilt on every iteration).
    type_emoji = {
        "person": "person",
        "technology": "computer",
        "organization": "building",
        "concept": "bulb",
        "event": "calendar",
        "location": "round_pushpin",
    }

    # --- Database CSV ---
    csv_buffer = io.StringIO()
    writer = csv.writer(csv_buffer)
    writer.writerow(["Name", "Type", "Description", "Related To"])

    for node in nodes:
        name = _display_name(node)
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        desc_text = "; ".join(descs[:2]) if descs else ""
        outs = outgoing.get(name, [])
        related = ", ".join(tgt for tgt, _ in outs) if outs else ""
        writer.writerow([name, ntype, desc_text, related])

    csv_path = output_dir / "entities_database.csv"
    csv_path.write_text(csv_buffer.getvalue(), encoding="utf-8")
    created.append(csv_path)

    # --- Individual entity pages ---
    for node in nodes:
        name = _display_name(node)
        if not name:
            continue
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        emoji = type_emoji.get(ntype, "bulb")

        parts = [
            f"# {name}",
            "",
            f"> :{emoji}: **Type:** {ntype}",
            "",
        ]

        if descs:
            parts.append("## Description")
            parts.append("")
            for d in descs:
                parts.append(f"{d}")
            parts.append("")

        # Properties callout
        properties = node.get("properties", {})
        if properties:
            parts.append("> :memo: **Properties**")
            for k, v in properties.items():
                parts.append(f"> - **{k}:** {v}")
            parts.append("")

        # Outgoing relationships
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            parts.append("| Target | Relationship |")
            parts.append("|--------|-------------|")
            for tgt, rtype in outs:
                parts.append(f"| {tgt} | {rtype} |")
            parts.append("")

        # Incoming relationships
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced by")
            parts.append("")
            parts.append("| Source | Relationship |")
            parts.append("|--------|-------------|")
            for src, rtype in ins:
                parts.append(f"| {src} | {rtype} |")
            parts.append("")

        path = output_dir / (_sanitize_filename(name) + ".md")
        path.write_text("\n".join(parts), encoding="utf-8")
        created.append(path)

    # --- Overview page ---
    overview_parts = [
        "# Knowledge Graph Overview",
        "",
        f"> :bar_chart: **Stats:** {len(nodes)} entities, "
        f"{len(kg_data.get('relationships', []))} relationships",
        "",
        "## Entity Types",
        "",
    ]
    for etype in sorted(by_type.keys()):
        elist = by_type[etype]
        overview_parts.append(f"### {etype.title()} ({len(elist)})")
        overview_parts.append("")
        for node in sorted(elist, key=_display_name):
            name = _display_name(node)
            if name:
                overview_parts.append(f"- {name}")
        overview_parts.append("")

    if artifacts:
        overview_parts.append("## Artifacts")
        overview_parts.append("")
        for art in artifacts:
            overview_parts.append(f"- **{art.name}** ({art.artifact_type})")
        overview_parts.append("")

    overview_path = output_dir / "Overview.md"
    overview_path.write_text("\n".join(overview_parts), encoding="utf-8")
    created.append(overview_path)

    # --- Artifact pages ---
    for art in artifacts:
        art_parts = [
            f"# {art.name}",
            "",
            f"> :page_facing_up: **Type:** {art.artifact_type} | **Format:** {art.format}",
            "",
            art.content,
            "",
        ]
        art_path = output_dir / (_sanitize_filename(art.name) + ".md")
        art_path.write_text("\n".join(art_parts), encoding="utf-8")
        created.append(art_path)

    logger.info("Exported %d Notion markdown files to %s", len(created), output_dir)
    return created
376
+
377
+
378
+# ---------------------------------------------------------------------------
379
+# Skill class
380
+# ---------------------------------------------------------------------------
381
+
382
+
383
class NotesExportSkill(Skill):
    """Export knowledge graph as structured notes (Obsidian, Notion).

    For GitHub wiki export, see the ``wiki_generator`` skill.
    """

    name = "notes_export"
    description = "Export knowledge graph as structured notes (Obsidian, Notion)"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Run the export and wrap the result summary in an Artifact."""
        fmt = kwargs.get("format", "obsidian")
        output_dir = Path(kwargs.get("output_dir", f"notes_export_{fmt}"))
        kg_data = context.knowledge_graph.to_dict()
        artifacts = context.artifacts or []

        # Notion gets its own renderer; any other value falls back to Obsidian.
        exporter = export_to_notion_md if fmt == "notion" else export_to_obsidian
        created = exporter(kg_data, output_dir, artifacts=artifacts)

        listing = "\n".join(f"- {p.name}" for p in created)
        summary = f"Exported {len(created)} {fmt} notes to `{output_dir}`:\n\n{listing}"

        metadata = {
            "output_dir": str(output_dir),
            "format": fmt,
            "file_count": len(created),
            "files": [str(p) for p in created],
        }
        return Artifact(
            name=f"Notes Export ({fmt.title()})",
            content=summary,
            artifact_type="notes_export",
            format="markdown",
            metadata=metadata,
        )
418
+
419
+
420
+register_skill(NotesExportSkill())
--- a/video_processor/agent/skills/notes_export.py
+++ b/video_processor/agent/skills/notes_export.py
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/agent/skills/notes_export.py
+++ b/video_processor/agent/skills/notes_export.py
@@ -0,0 +1,420 @@
1 """Skill: Export knowledge graph as structured notes (Obsidian, Notion)."""
2
3 import csv
4 import io
5 import logging
6 from datetime import date
7 from pathlib import Path
8 from typing import Dict, List, Optional
9
10 from video_processor.agent.skills.base import (
11 AgentContext,
12 Artifact,
13 Skill,
14 register_skill,
15 )
16
17 logger = logging.getLogger(__name__)
18
19
20 def _sanitize_filename(name: str) -> str:
21 """Convert a name to a filesystem-safe filename."""
22 return (
23 name.replace("/", "-")
24 .replace("\\", "-")
25 .replace(":", "-")
26 .replace('"', "")
27 .replace("?", "")
28 .replace("*", "")
29 .replace("<", "")
30 .replace(">", "")
31 .replace("|", "")
32 )
33
34
35 def _build_indexes(kg_data: dict):
36 """Build lookup structures from knowledge graph data.
37
38 Returns (nodes, by_type, node_lookup, outgoing, incoming).
39 """
40 nodes = kg_data.get("nodes", [])
41 relationships = kg_data.get("relationships", [])
42
43 by_type: Dict[str, list] = {}
44 node_lookup: Dict[str, dict] = {}
45 for node in nodes:
46 name = node.get("name", node.get("id", ""))
47 ntype = node.get("type", "concept")
48 by_type.setdefault(ntype, []).append(node)
49 node_lookup[name] = node
50
51 outgoing: Dict[str, list] = {}
52 incoming: Dict[str, list] = {}
53 for rel in relationships:
54 src = rel.get("source", "")
55 tgt = rel.get("target", "")
56 rtype = rel.get("type", "related_to")
57 outgoing.setdefault(src, []).append((tgt, rtype))
58 incoming.setdefault(tgt, []).append((src, rtype))
59
60 return nodes, by_type, node_lookup, outgoing, incoming
61
62
63 # ---------------------------------------------------------------------------
64 # Obsidian export
65 # ---------------------------------------------------------------------------
66
67
def export_to_obsidian(
    kg_data: dict,
    output_dir: Path,
    artifacts: Optional[List[Artifact]] = None,
) -> List[Path]:
    """Export knowledge graph as an Obsidian vault.

    Creates one ``.md`` file per entity with YAML frontmatter and
    ``[[wiki-links]]``, an ``_Index.md`` Map of Content, tag pages per
    entity type, and optional artifact notes.

    Args:
        kg_data: Knowledge-graph dict with ``nodes`` / ``relationships`` lists.
        output_dir: Vault directory; created if missing.
        artifacts: Optional planning artifacts exported as extra notes.

    Returns:
        Paths of every note written, in creation order.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    artifacts = artifacts or []
    created: List[Path] = []
    today = date.today().isoformat()

    # NOTE(review): node_lookup is unused here — unpacked only for tuple shape.
    nodes, by_type, node_lookup, outgoing, incoming = _build_indexes(kg_data)

    # --- Individual entity notes ---
    for node in nodes:
        name = node.get("name", node.get("id", ""))
        if not name:
            # An unnamed node cannot become a file; skip it.
            continue
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        aliases = node.get("aliases", [])

        # YAML frontmatter (aliases block only emitted when present, so the
        # frontmatter stays minimal for alias-free entities)
        tags_yaml = f"  - {ntype}"
        aliases_yaml = ""
        if aliases:
            alias_lines = "\n".join(f"  - {a}" for a in aliases)
            aliases_yaml = f"aliases:\n{alias_lines}\n"

        frontmatter = f"---\ntype: {ntype}\ntags:\n{tags_yaml}\n{aliases_yaml}date: {today}\n---\n"

        parts = [frontmatter, f"# {name}", ""]

        # Descriptions
        if descs:
            for d in descs:
                parts.append(f"{d}")
            parts.append("")

        # Outgoing relationships become [[wiki-links]] so Obsidian's graph
        # view picks them up
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            for tgt, rtype in outs:
                parts.append(f"- **{rtype}**: [[{tgt}]]")
            parts.append("")

        # Incoming relationships
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced by")
            parts.append("")
            for src, rtype in ins:
                parts.append(f"- **{rtype}** from [[{src}]]")
            parts.append("")

        filename = _sanitize_filename(name) + ".md"
        path = output_dir / filename
        path.write_text("\n".join(parts), encoding="utf-8")
        created.append(path)

    # --- Index note (Map of Content) ---
    index_parts = [
        "---",
        "type: index",
        "tags:",
        "  - MOC",
        f"date: {today}",
        "---",
        "",
        "# Index",
        "",
        f"**{len(nodes)}** entities | **{len(kg_data.get('relationships', []))}** relationships",
        "",
    ]

    for etype in sorted(by_type.keys()):
        elist = by_type[etype]
        index_parts.append(f"## {etype.title()}")
        index_parts.append("")
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            index_parts.append(f"- [[{name}]]")
        index_parts.append("")

    if artifacts:
        index_parts.append("## Artifacts")
        index_parts.append("")
        for art in artifacts:
            index_parts.append(f"- [[{art.name}]]")
        index_parts.append("")

    # Underscore prefix sorts the index to the top of the vault file list.
    index_path = output_dir / "_Index.md"
    index_path.write_text("\n".join(index_parts), encoding="utf-8")
    created.append(index_path)

    # --- Tag pages (one per entity type) ---
    for etype, elist in sorted(by_type.items()):
        tag_parts = [
            "---",
            "type: tag",
            "tags:",
            f"  - {etype}",
            f"date: {today}",
            "---",
            "",
            f"# {etype.title()}",
            "",
            f"All entities of type **{etype}** ({len(elist)}).",
            "",
        ]
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            descs = node.get("descriptions", [])
            # First description doubles as a one-line summary
            summary = descs[0] if descs else ""
            tag_parts.append(f"- [[{name}]]" + (f" - {summary}" if summary else ""))
        tag_parts.append("")

        tag_filename = f"Tag - {etype.title()}.md"
        tag_path = output_dir / _sanitize_filename(tag_filename)
        tag_path.write_text("\n".join(tag_parts), encoding="utf-8")
        created.append(tag_path)

    # --- Artifact notes ---
    for art in artifacts:
        art_parts = [
            "---",
            "type: artifact",
            f"artifact_type: {art.artifact_type}",
            "tags:",
            "  - artifact",
            f"  - {art.artifact_type}",
            f"date: {today}",
            "---",
            "",
            f"# {art.name}",
            "",
            art.content,
            "",
        ]
        art_filename = _sanitize_filename(art.name) + ".md"
        art_path = output_dir / art_filename
        art_path.write_text("\n".join(art_parts), encoding="utf-8")
        created.append(art_path)

    logger.info("Exported %d Obsidian notes to %s", len(created), output_dir)
    return created
221
222
223 # ---------------------------------------------------------------------------
224 # Notion-compatible markdown export
225 # ---------------------------------------------------------------------------
226
227
def export_to_notion_md(
    kg_data: dict,
    output_dir: Path,
    artifacts: Optional[List[Artifact]] = None,
) -> List[Path]:
    """Export knowledge graph as Notion-compatible markdown.

    Creates ``.md`` files with Notion-style callout blocks and a
    database-style CSV (``entities_database.csv``) for bulk import.

    Args:
        kg_data: Knowledge-graph dict with ``nodes`` / ``relationships`` lists.
        output_dir: Destination directory; created if missing.
        artifacts: Optional planning artifacts exported as extra pages.

    Returns:
        Paths of every file written, in creation order.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    artifacts = artifacts or []
    created: List[Path] = []

    # NOTE(review): node_lookup is unused here — unpacked only for tuple shape.
    nodes, by_type, node_lookup, outgoing, incoming = _build_indexes(kg_data)

    # Emoji shortcode per entity type, hoisted out of the per-node loop so
    # the mapping is built once instead of once per entity.
    type_emoji = {
        "person": "person",
        "technology": "computer",
        "organization": "building",
        "concept": "bulb",
        "event": "calendar",
        "location": "round_pushpin",
    }

    # --- Database CSV ---
    csv_buffer = io.StringIO()
    writer = csv.writer(csv_buffer)
    writer.writerow(["Name", "Type", "Description", "Related To"])

    for node in nodes:
        name = node.get("name", node.get("id", ""))
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        # Cap at two descriptions to keep the CSV cell readable
        desc_text = "; ".join(descs[:2]) if descs else ""
        outs = outgoing.get(name, [])
        related = ", ".join(tgt for tgt, _ in outs) if outs else ""
        writer.writerow([name, ntype, desc_text, related])

    csv_path = output_dir / "entities_database.csv"
    csv_path.write_text(csv_buffer.getvalue(), encoding="utf-8")
    created.append(csv_path)

    # --- Individual entity pages ---
    for node in nodes:
        name = node.get("name", node.get("id", ""))
        if not name:
            # An unnamed node cannot become a file; skip it.
            continue
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])

        emoji = type_emoji.get(ntype, "bulb")

        parts = [
            f"# {name}",
            "",
            f"> :{emoji}: **Type:** {ntype}",
            "",
        ]

        if descs:
            parts.append("## Description")
            parts.append("")
            for d in descs:
                parts.append(f"{d}")
            parts.append("")

        # Properties callout
        properties = node.get("properties", {})
        if properties:
            parts.append("> :memo: **Properties**")
            for k, v in properties.items():
                parts.append(f"> - **{k}:** {v}")
            parts.append("")

        # Outgoing relationships
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            parts.append("| Target | Relationship |")
            parts.append("|--------|-------------|")
            for tgt, rtype in outs:
                parts.append(f"| {tgt} | {rtype} |")
            parts.append("")

        # Incoming relationships
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced by")
            parts.append("")
            parts.append("| Source | Relationship |")
            parts.append("|--------|-------------|")
            for src, rtype in ins:
                parts.append(f"| {src} | {rtype} |")
            parts.append("")

        filename = _sanitize_filename(name) + ".md"
        path = output_dir / filename
        path.write_text("\n".join(parts), encoding="utf-8")
        created.append(path)

    # --- Overview page ---
    overview_parts = [
        "# Knowledge Graph Overview",
        "",
        f"> :bar_chart: **Stats:** {len(nodes)} entities, "
        f"{len(kg_data.get('relationships', []))} relationships",
        "",
        "## Entity Types",
        "",
    ]
    for etype in sorted(by_type.keys()):
        elist = by_type[etype]
        overview_parts.append(f"### {etype.title()} ({len(elist)})")
        overview_parts.append("")
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            overview_parts.append(f"- {name}")
        overview_parts.append("")

    if artifacts:
        overview_parts.append("## Artifacts")
        overview_parts.append("")
        for art in artifacts:
            overview_parts.append(f"- **{art.name}** ({art.artifact_type})")
        overview_parts.append("")

    overview_path = output_dir / "Overview.md"
    overview_path.write_text("\n".join(overview_parts), encoding="utf-8")
    created.append(overview_path)

    # --- Artifact pages ---
    for art in artifacts:
        art_parts = [
            f"# {art.name}",
            "",
            f"> :page_facing_up: **Type:** {art.artifact_type} | **Format:** {art.format}",
            "",
            art.content,
            "",
        ]
        art_filename = _sanitize_filename(art.name) + ".md"
        art_path = output_dir / art_filename
        art_path.write_text("\n".join(art_parts), encoding="utf-8")
        created.append(art_path)

    logger.info("Exported %d Notion markdown files to %s", len(created), output_dir)
    return created
376
377
378 # ---------------------------------------------------------------------------
379 # Skill class
380 # ---------------------------------------------------------------------------
381
382
class NotesExportSkill(Skill):
    """Export knowledge graph as structured notes (Obsidian, Notion).

    For GitHub wiki export, see the ``wiki_generator`` skill.
    """

    name = "notes_export"
    description = "Export knowledge graph as structured notes (Obsidian, Notion)"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Run the export and return a summary artifact describing the files."""
        fmt = kwargs.get("format", "obsidian")
        target = Path(kwargs.get("output_dir", f"notes_export_{fmt}"))
        graph = context.knowledge_graph.to_dict()
        extra = context.artifacts or []

        # Select the exporter; anything other than "notion" falls back to
        # the Obsidian vault layout.
        exporter = export_to_notion_md if fmt == "notion" else export_to_obsidian
        written = exporter(graph, target, artifacts=extra)

        listing = "\n".join(f"- {p.name}" for p in written)
        summary = f"Exported {len(written)} {fmt} notes to `{target}`:\n\n{listing}"

        return Artifact(
            name=f"Notes Export ({fmt.title()})",
            content=summary,
            artifact_type="notes_export",
            format="markdown",
            metadata={
                "output_dir": str(target),
                "format": fmt,
                "file_count": len(written),
                "files": [str(p) for p in written],
            },
        )


register_skill(NotesExportSkill())
--- a/video_processor/agent/skills/wiki_generator.py
+++ b/video_processor/agent/skills/wiki_generator.py
@@ -0,0 +1,315 @@
1
+"""Skill: Generate a GitHub wiki from knowledge graph and artifacts."""
2
+
3
+import json
4
+import logging
5
+import subprocess
6
+from pathlib import Path
7
+from typing import Dict, List, Optional
8
+
9
+from video_processor.agent.skills.base import (
10
+ AgentContext,
11
+ Artifact,
12
+ Skill,
13
+ register_skill,
14
+)
15
+
16
+logger = logging.getLogger(__name__)
17
+
18
+
19
+def _sanitize_filename(name: str) -> str:
20
+ """Convert entity name to a wiki-safe filename."""
21
+ return name.replace("/", "-").replace("\\", "-").replace(" ", "-").replace(".", "-")
22
+
23
+
24
+def _wiki_link(name: str) -> str:
25
+ """Create a GitHub wiki-style markdown link."""
26
+ safe = _sanitize_filename(name)
27
+ return f"[{name}]({safe})"
28
+
29
+
30
+def generate_wiki(
31
+ kg_data: dict,
32
+ artifacts: Optional[List[Artifact]] = None,
33
+ title: str = "Knowledge Base",
34
+) -> Dict[str, str]:
35
+ """Generate a dict of {filename: markdown_content} for a GitHub wiki.
36
+
37
+ Returns pages for: Home, _Sidebar, entity type indexes, individual
38
+ entity pages, and any planning artifacts.
39
+ """
40
+ pages: Dict[str, str] = {}
41
+ artifacts = artifacts or []
42
+
43
+ nodes = kg_data.get("nodes", [])
44
+ relationships = kg_data.get("relationships", [])
45
+
46
+ # Group entities by type
47
+ by_type: Dict[str, list] = {}
48
+ node_lookup: Dict[str, dict] = {}
49
+ for node in nodes:
50
+ name = node.get("name", node.get("id", ""))
51
+ ntype = node.get("type", "concept")
52
+ by_type.setdefault(ntype, []).append(node)
53
+ node_lookup[name.lower()] = node
54
+
55
+ # Build relationship index (outgoing and incoming per entity)
56
+ outgoing: Dict[str, list] = {}
57
+ incoming: Dict[str, list] = {}
58
+ for rel in relationships:
59
+ src = rel.get("source", "")
60
+ tgt = rel.get("target", "")
61
+ rtype = rel.get("type", "related_to")
62
+ outgoing.setdefault(src, []).append((tgt, rtype))
63
+ incoming.setdefault(tgt, []).append((src, rtype))
64
+
65
+ # --- Home page ---
66
+ home_parts = [
67
+ f"# {title}",
68
+ "",
69
+ f"**{len(nodes)}** entities | **{len(relationships)}** relationships",
70
+ "",
71
+ "## Entity Types",
72
+ "",
73
+ ]
74
+ for etype, elist in sorted(by_type.items()):
75
+ home_parts.append(f"- {_wiki_link(etype.title())} ({len(elist)})")
76
+
77
+ if artifacts:
78
+ home_parts.append("")
79
+ home_parts.append("## Planning Artifacts")
80
+ home_parts.append("")
81
+ for art in artifacts:
82
+ safe = _sanitize_filename(art.name)
83
+ home_parts.append(f"- [{art.name}]({safe})")
84
+
85
+ pages["Home"] = "\n".join(home_parts)
86
+
87
+ # --- Sidebar ---
88
+ sidebar_parts = [f"**{title}**", "", "**Navigation**", "", "- [Home](Home)", ""]
89
+ sidebar_parts.append("**Entity Types**")
90
+ sidebar_parts.append("")
91
+ for etype in sorted(by_type.keys()):
92
+ sidebar_parts.append(f"- {_wiki_link(etype.title())}")
93
+
94
+ if artifacts:
95
+ sidebar_parts.append("")
96
+ sidebar_parts.append("**Artifacts**")
97
+ sidebar_parts.append("")
98
+ for art in artifacts:
99
+ safe = _sanitize_filename(art.name)
100
+ sidebar_parts.append(f"- [{art.name}]({safe})")
101
+
102
+ pages["_Sidebar"] = "\n".join(sidebar_parts)
103
+
104
+ # --- Type index pages ---
105
+ for etype, elist in sorted(by_type.items()):
106
+ page_name = _sanitize_filename(etype.title())
107
+ parts = [
108
+ f"# {etype.title()}",
109
+ "",
110
+ f"{len(elist)} entities of type **{etype}**.",
111
+ "",
112
+ "| Entity | Descriptions |",
113
+ "|--------|-------------|",
114
+ ]
115
+ for node in sorted(elist, key=lambda n: n.get("name", "")):
116
+ name = node.get("name", "")
117
+ descs = node.get("descriptions", [])
118
+ desc_text = "; ".join(descs[:2]) if descs else "—"
119
+ parts.append(f"| {_wiki_link(name)} | {desc_text} |")
120
+
121
+ pages[page_name] = "\n".join(parts)
122
+
123
+ # --- Individual entity pages ---
124
+ for node in nodes:
125
+ name = node.get("name", "")
126
+ if not name:
127
+ continue
128
+ ntype = node.get("type", "concept")
129
+ descs = node.get("descriptions", [])
130
+ page_name = _sanitize_filename(name)
131
+
132
+ parts = [
133
+ f"# {name}",
134
+ "",
135
+ f"**Type:** {ntype}",
136
+ "",
137
+ ]
138
+
139
+ if descs:
140
+ parts.append("## Descriptions")
141
+ parts.append("")
142
+ for d in descs:
143
+ parts.append(f"- {d}")
144
+ parts.append("")
145
+
146
+ # Outgoing relationships
147
+ outs = outgoing.get(name, [])
148
+ if outs:
149
+ parts.append("## Relationships")
150
+ parts.append("")
151
+ parts.append("| Target | Type |")
152
+ parts.append("|--------|------|")
153
+ for tgt, rtype in outs:
154
+ parts.append(f"| {_wiki_link(tgt)} | {rtype} |")
155
+ parts.append("")
156
+
157
+ # Incoming relationships
158
+ ins = incoming.get(name, [])
159
+ if ins:
160
+ parts.append("## Referenced By")
161
+ parts.append("")
162
+ parts.append("| Source | Type |")
163
+ parts.append("|--------|------|")
164
+ for src, rtype in ins:
165
+ parts.append(f"| {_wiki_link(src)} | {rtype} |")
166
+ parts.append("")
167
+
168
+ # Occurrences / sources
169
+ occs = node.get("occurrences", [])
170
+ if occs:
171
+ parts.append("## Sources")
172
+ parts.append("")
173
+ for occ in occs:
174
+ src = occ.get("source", "unknown")
175
+ ts = occ.get("timestamp", "")
176
+ text = occ.get("text", "")
177
+ line = f"- **{src}**"
178
+ if ts:
179
+ line += f" @ {ts}"
180
+ if text:
181
+ line += f": _{text}_"
182
+ parts.append(line)
183
+ parts.append("")
184
+
185
+ pages[page_name] = "\n".join(parts)
186
+
187
+ # --- Artifact pages ---
188
+ for art in artifacts:
189
+ page_name = _sanitize_filename(art.name)
190
+ if art.format == "json":
191
+ try:
192
+ data = json.loads(art.content)
193
+ content = f"```json\n{json.dumps(data, indent=2)}\n```"
194
+ except json.JSONDecodeError:
195
+ content = art.content
196
+ else:
197
+ content = art.content
198
+
199
+ pages[page_name] = f"# {art.name}\n\n{content}"
200
+
201
+ return pages
202
+
203
+
204
+def write_wiki(pages: Dict[str, str], output_dir: Path) -> List[Path]:
205
+ """Write wiki pages to a directory as .md files."""
206
+ output_dir.mkdir(parents=True, exist_ok=True)
207
+ paths = []
208
+ for name, content in pages.items():
209
+ path = output_dir / f"{name}.md"
210
+ path.write_text(content, encoding="utf-8")
211
+ paths.append(path)
212
+ return paths
213
+
214
+
215
+def push_wiki(wiki_dir: Path, repo: str, message: str = "Update wiki") -> bool:
216
+ """Push wiki pages to a GitHub wiki repo.
217
+
218
+ Clones the wiki repo, copies pages, commits and pushes.
219
+ The repo should be in 'owner/repo' format.
220
+ """
221
+ wiki_url = f"https://github.com/{repo}.wiki.git"
222
+
223
+ # Clone existing wiki (or init if empty)
224
+ clone_dir = wiki_dir / ".wiki_clone"
225
+ if clone_dir.exists():
226
+ subprocess.run(["rm", "-rf", str(clone_dir)], check=True)
227
+
228
+ result = subprocess.run(
229
+ ["git", "clone", wiki_url, str(clone_dir)],
230
+ capture_output=True,
231
+ text=True,
232
+ )
233
+
234
+ if result.returncode != 0:
235
+ # Wiki might not exist yet — init a new repo
236
+ clone_dir.mkdir(parents=True, exist_ok=True)
237
+ subprocess.run(["git", "init"], cwd=clone_dir, capture_output=True)
238
+ subprocess.run(
239
+ ["git", "remote", "add", "origin", wiki_url],
240
+ cwd=clone_dir,
241
+ capture_output=True,
242
+ )
243
+
244
+ # Copy wiki pages into clone
245
+ for md_file in wiki_dir.glob("*.md"):
246
+ if md_file.parent == wiki_dir:
247
+ dest = clone_dir / md_file.name
248
+ dest.write_text(md_file.read_text(encoding="utf-8"), encoding="utf-8")
249
+
250
+ # Commit and push
251
+ subprocess.run(["git", "add", "-A"], cwd=clone_dir, capture_output=True)
252
+ commit_result = subprocess.run(
253
+ ["git", "commit", "-m", message],
254
+ cwd=clone_dir,
255
+ capture_output=True,
256
+ text=True,
257
+ )
258
+ if commit_result.returncode != 0:
259
+ logger.info("No wiki changes to commit")
260
+ return True
261
+
262
+ push_result = subprocess.run(
263
+ ["git", "push", "origin", "master"],
264
+ cwd=clone_dir,
265
+ capture_output=True,
266
+ text=True,
267
+ )
268
+ if push_result.returncode != 0:
269
+ # Try main branch
270
+ push_result = subprocess.run(
271
+ ["git", "push", "origin", "main"],
272
+ cwd=clone_dir,
273
+ capture_output=True,
274
+ text=True,
275
+ )
276
+
277
+ if push_result.returncode == 0:
278
+ logger.info(f"Wiki pushed to {wiki_url}")
279
+ return True
280
+ else:
281
+ logger.error(f"Wiki push failed: {push_result.stderr}")
282
+ return False
283
+
284
+
285
+class WikiGeneratorSkill(Skill):
286
+ name = "wiki_generator"
287
+ description = "Generate a GitHub wiki from knowledge graph and artifacts"
288
+
289
+ def execute(self, context: AgentContext, **kwargs) -> Artifact:
290
+ kg_data = context.knowledge_graph.to_dict()
291
+ pages = generate_wiki(
292
+ kg_data,
293
+ artifacts=context.artifacts,
294
+ title=kwargs.get("title", "Knowledge Base"),
295
+ )
296
+
297
+ # Return a summary artifact; actual pages are written via write_wiki()
298
+ page_list = sorted(pages.keys())
299
+ summary_parts = [
300
+ f"Generated {len(pages)} wiki pages:",
301
+ "",
302
+ ]
303
+ for name in page_list:
304
+ summary_parts.append(f"- {name}.md")
305
+
306
+ return Artifact(
307
+ name="Wiki",
308
+ content="\n".join(summary_parts),
309
+ artifact_type="wiki",
310
+ format="markdown",
311
+ metadata={"pages": pages},
312
+ )
313
+
314
+
315
+register_skill(WikiGeneratorSkill())
--- a/video_processor/agent/skills/wiki_generator.py
+++ b/video_processor/agent/skills/wiki_generator.py
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/agent/skills/wiki_generator.py
+++ b/video_processor/agent/skills/wiki_generator.py
@@ -0,0 +1,315 @@
1 """Skill: Generate a GitHub wiki from knowledge graph and artifacts."""
2
import json
import logging
import shutil
import subprocess
from pathlib import Path
from typing import Dict, List, Optional

from video_processor.agent.skills.base import (
    AgentContext,
    Artifact,
    Skill,
    register_skill,
)
15
16 logger = logging.getLogger(__name__)
17
18
19 def _sanitize_filename(name: str) -> str:
20 """Convert entity name to a wiki-safe filename."""
21 return name.replace("/", "-").replace("\\", "-").replace(" ", "-").replace(".", "-")
22
23
def _wiki_link(name: str) -> str:
    """Render *name* as a markdown link targeting its sanitized wiki page."""
    return "[{}]({})".format(name, _sanitize_filename(name))
28
29
def generate_wiki(
    kg_data: dict,
    artifacts: Optional[List[Artifact]] = None,
    title: str = "Knowledge Base",
) -> Dict[str, str]:
    """Generate a dict of {filename: markdown_content} for a GitHub wiki.

    Returns pages for: Home, _Sidebar, entity type indexes, individual
    entity pages, and any planning artifacts.

    Args:
        kg_data: Knowledge-graph dict with ``nodes`` / ``relationships`` lists.
        artifacts: Optional planning artifacts rendered as extra pages.
        title: Heading used on the Home page and sidebar.

    Returns:
        Mapping of page name (without ``.md`` extension) to markdown body.
    """
    pages: Dict[str, str] = {}
    artifacts = artifacts or []

    nodes = kg_data.get("nodes", [])
    relationships = kg_data.get("relationships", [])

    # Group entities by type
    by_type: Dict[str, list] = {}
    # NOTE(review): node_lookup is populated but never read below.
    node_lookup: Dict[str, dict] = {}
    for node in nodes:
        name = node.get("name", node.get("id", ""))
        ntype = node.get("type", "concept")
        by_type.setdefault(ntype, []).append(node)
        node_lookup[name.lower()] = node

    # Build relationship index (outgoing and incoming per entity)
    outgoing: Dict[str, list] = {}
    incoming: Dict[str, list] = {}
    for rel in relationships:
        src = rel.get("source", "")
        tgt = rel.get("target", "")
        rtype = rel.get("type", "related_to")
        outgoing.setdefault(src, []).append((tgt, rtype))
        incoming.setdefault(tgt, []).append((src, rtype))

    # --- Home page ---
    home_parts = [
        f"# {title}",
        "",
        f"**{len(nodes)}** entities | **{len(relationships)}** relationships",
        "",
        "## Entity Types",
        "",
    ]
    for etype, elist in sorted(by_type.items()):
        home_parts.append(f"- {_wiki_link(etype.title())} ({len(elist)})")

    if artifacts:
        home_parts.append("")
        home_parts.append("## Planning Artifacts")
        home_parts.append("")
        for art in artifacts:
            safe = _sanitize_filename(art.name)
            home_parts.append(f"- [{art.name}]({safe})")

    pages["Home"] = "\n".join(home_parts)

    # --- Sidebar (_Sidebar.md is GitHub's reserved wiki navigation page) ---
    sidebar_parts = [f"**{title}**", "", "**Navigation**", "", "- [Home](Home)", ""]
    sidebar_parts.append("**Entity Types**")
    sidebar_parts.append("")
    for etype in sorted(by_type.keys()):
        sidebar_parts.append(f"- {_wiki_link(etype.title())}")

    if artifacts:
        sidebar_parts.append("")
        sidebar_parts.append("**Artifacts**")
        sidebar_parts.append("")
        for art in artifacts:
            safe = _sanitize_filename(art.name)
            sidebar_parts.append(f"- [{art.name}]({safe})")

    pages["_Sidebar"] = "\n".join(sidebar_parts)

    # --- Type index pages ---
    for etype, elist in sorted(by_type.items()):
        page_name = _sanitize_filename(etype.title())
        parts = [
            f"# {etype.title()}",
            "",
            f"{len(elist)} entities of type **{etype}**.",
            "",
            "| Entity | Descriptions |",
            "|--------|-------------|",
        ]
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            descs = node.get("descriptions", [])
            # At most two descriptions so the table stays compact
            desc_text = "; ".join(descs[:2]) if descs else "—"
            parts.append(f"| {_wiki_link(name)} | {desc_text} |")

        pages[page_name] = "\n".join(parts)

    # --- Individual entity pages ---
    for node in nodes:
        name = node.get("name", "")
        if not name:
            # An unnamed node cannot become a page; skip it.
            continue
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        page_name = _sanitize_filename(name)

        parts = [
            f"# {name}",
            "",
            f"**Type:** {ntype}",
            "",
        ]

        if descs:
            parts.append("## Descriptions")
            parts.append("")
            for d in descs:
                parts.append(f"- {d}")
            parts.append("")

        # Outgoing relationships
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            parts.append("| Target | Type |")
            parts.append("|--------|------|")
            for tgt, rtype in outs:
                parts.append(f"| {_wiki_link(tgt)} | {rtype} |")
            parts.append("")

        # Incoming relationships
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced By")
            parts.append("")
            parts.append("| Source | Type |")
            parts.append("|--------|------|")
            for src, rtype in ins:
                parts.append(f"| {_wiki_link(src)} | {rtype} |")
            parts.append("")

        # Occurrences / sources (where the entity was seen, if recorded)
        occs = node.get("occurrences", [])
        if occs:
            parts.append("## Sources")
            parts.append("")
            for occ in occs:
                src = occ.get("source", "unknown")
                ts = occ.get("timestamp", "")
                text = occ.get("text", "")
                line = f"- **{src}**"
                if ts:
                    line += f" @ {ts}"
                if text:
                    line += f": _{text}_"
                parts.append(line)
            parts.append("")

        pages[page_name] = "\n".join(parts)

    # --- Artifact pages ---
    for art in artifacts:
        page_name = _sanitize_filename(art.name)
        if art.format == "json":
            # Pretty-print valid JSON into a fenced block; fall back to raw
            # content when parsing fails.
            try:
                data = json.loads(art.content)
                content = f"```json\n{json.dumps(data, indent=2)}\n```"
            except json.JSONDecodeError:
                content = art.content
        else:
            content = art.content

        pages[page_name] = f"# {art.name}\n\n{content}"

    return pages
202
203
def write_wiki(pages: Dict[str, str], output_dir: Path) -> List[Path]:
    """Materialise *pages* as ``<name>.md`` files under *output_dir*.

    The directory is created on demand. Returns the written paths in
    page-dict iteration order.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    written: List[Path] = []
    for page_name, markdown in pages.items():
        target = output_dir / (page_name + ".md")
        target.write_text(markdown, encoding="utf-8")
        written.append(target)
    return written
213
214
def push_wiki(wiki_dir: Path, repo: str, message: str = "Update wiki") -> bool:
    """Push wiki pages to a GitHub wiki repo.

    Clones ``https://github.com/<repo>.wiki.git`` into a scratch directory
    inside *wiki_dir*, copies every top-level ``*.md`` page into it,
    commits, and pushes (trying ``master`` then ``main``).

    Args:
        wiki_dir: Directory containing the generated ``.md`` pages.
        repo: Target repository in ``owner/repo`` format.
        message: Commit message.

    Returns:
        True on success (including "nothing to commit"); False when the
        push failed on both branch names.
    """
    wiki_url = f"https://github.com/{repo}.wiki.git"

    # Start from a clean scratch clone every time. shutil.rmtree is
    # portable, unlike shelling out to `rm -rf`.
    clone_dir = wiki_dir / ".wiki_clone"
    if clone_dir.exists():
        shutil.rmtree(clone_dir)

    result = subprocess.run(
        ["git", "clone", wiki_url, str(clone_dir)],
        capture_output=True,
        text=True,
    )

    if result.returncode != 0:
        # Wiki might not exist yet — init a new repo pointing at the URL
        clone_dir.mkdir(parents=True, exist_ok=True)
        subprocess.run(["git", "init"], cwd=clone_dir, capture_output=True)
        subprocess.run(
            ["git", "remote", "add", "origin", wiki_url],
            cwd=clone_dir,
            capture_output=True,
        )

    # Copy top-level wiki pages into the clone (the parent check excludes
    # anything glob might return from elsewhere, e.g. inside the clone).
    for md_file in wiki_dir.glob("*.md"):
        if md_file.parent == wiki_dir:
            dest = clone_dir / md_file.name
            dest.write_text(md_file.read_text(encoding="utf-8"), encoding="utf-8")

    # Commit and push
    subprocess.run(["git", "add", "-A"], cwd=clone_dir, capture_output=True)
    commit_result = subprocess.run(
        ["git", "commit", "-m", message],
        cwd=clone_dir,
        capture_output=True,
        text=True,
    )
    if commit_result.returncode != 0:
        # `git commit` exits non-zero when the tree is unchanged; treat
        # "nothing to commit" as success.
        logger.info("No wiki changes to commit")
        return True

    push_result = subprocess.run(
        ["git", "push", "origin", "master"],
        cwd=clone_dir,
        capture_output=True,
        text=True,
    )
    if push_result.returncode != 0:
        # GitHub wikis may use `main` as the default branch
        push_result = subprocess.run(
            ["git", "push", "origin", "main"],
            cwd=clone_dir,
            capture_output=True,
            text=True,
        )

    if push_result.returncode == 0:
        logger.info("Wiki pushed to %s", wiki_url)
        return True
    logger.error("Wiki push failed: %s", push_result.stderr)
    return False
283
284
class WikiGeneratorSkill(Skill):
    """Produce GitHub wiki pages (in memory) from the agent context."""

    name = "wiki_generator"
    description = "Generate a GitHub wiki from knowledge graph and artifacts"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Generate wiki pages and return a summary artifact listing them."""
        pages = generate_wiki(
            context.knowledge_graph.to_dict(),
            artifacts=context.artifacts,
            title=kwargs.get("title", "Knowledge Base"),
        )

        # Summary only — callers persist the pages via write_wiki().
        lines = [f"Generated {len(pages)} wiki pages:", ""]
        lines.extend(f"- {name}.md" for name in sorted(pages))

        return Artifact(
            name="Wiki",
            content="\n".join(lines),
            artifact_type="wiki",
            format="markdown",
            metadata={"pages": pages},
        )


register_skill(WikiGeneratorSkill())
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -1412,10 +1412,71 @@
14121412
click.echo(f" Chunks: {total_chunks}")
14131413
click.echo(f" Entities: {entity_count}")
14141414
click.echo(f" Relationships: {rel_count}")
14151415
click.echo(f" Knowledge graph: {kg_path}")
14161416
1417
+
1418
+@cli.group()
1419
+def wiki():
1420
+ """Generate and push GitHub wikis from knowledge graphs."""
1421
+ pass
1422
+
1423
+
1424
+@wiki.command("generate")
1425
+@click.argument("db_path", type=click.Path(exists=True))
1426
+@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory for wiki")
1427
+@click.option("--title", type=str, default="Knowledge Base", help="Wiki title")
1428
+def wiki_generate(db_path, output, title):
1429
+ """Generate a GitHub wiki from a knowledge graph.
1430
+
1431
+ Examples:
1432
+
1433
+ planopticon wiki generate knowledge_graph.db -o ./wiki
1434
+
1435
+ planopticon wiki generate results/kg.db --title "Project Wiki"
1436
+ """
1437
+ from video_processor.agent.skills.wiki_generator import generate_wiki, write_wiki
1438
+ from video_processor.integrators.knowledge_graph import KnowledgeGraph
1439
+
1440
+ db_path = Path(db_path)
1441
+ out_dir = Path(output) if output else Path.cwd() / "wiki"
1442
+
1443
+ kg = KnowledgeGraph(db_path=db_path)
1444
+ kg_data = kg.to_dict()
1445
+ pages = generate_wiki(kg_data, title=title)
1446
+ written = write_wiki(pages, out_dir)
1447
+
1448
+ click.echo(f"Generated {len(written)} wiki pages in {out_dir}")
1449
+ for p in sorted(written):
1450
+ click.echo(f" {p.name}")
1451
+
1452
+
1453
+@wiki.command("push")
1454
+@click.argument("wiki_dir", type=click.Path(exists=True))
1455
+@click.argument("repo", type=str)
1456
+@click.option("--message", "-m", type=str, default="Update wiki", help="Commit message")
1457
+def wiki_push(wiki_dir, repo, message):
1458
+ """Push generated wiki pages to a GitHub wiki repo.
1459
+
1460
+ REPO should be in 'owner/repo' format.
1461
+
1462
+ Examples:
1463
+
1464
+ planopticon wiki push ./wiki ConflictHQ/PlanOpticon
1465
+
1466
+ planopticon wiki push ./wiki owner/repo -m "Add entity pages"
1467
+ """
1468
+ from video_processor.agent.skills.wiki_generator import push_wiki
1469
+
1470
+ wiki_dir = Path(wiki_dir)
1471
+ success = push_wiki(wiki_dir, repo, message=message)
1472
+ if success:
1473
+ click.echo(f"Wiki pushed to https://github.com/{repo}/wiki")
1474
+ else:
1475
+ click.echo("Wiki push failed. Check auth and repo permissions.", err=True)
1476
+ sys.exit(1)
1477
+
14171478
14181479
@cli.group()
14191480
def kg():
14201481
"""Knowledge graph utilities: convert, sync, and inspect."""
14211482
pass
14221483
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -1412,10 +1412,71 @@
1412 click.echo(f" Chunks: {total_chunks}")
1413 click.echo(f" Entities: {entity_count}")
1414 click.echo(f" Relationships: {rel_count}")
1415 click.echo(f" Knowledge graph: {kg_path}")
1416
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1417
1418 @cli.group()
1419 def kg():
1420 """Knowledge graph utilities: convert, sync, and inspect."""
1421 pass
1422
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -1412,10 +1412,71 @@
1412 click.echo(f" Chunks: {total_chunks}")
1413 click.echo(f" Entities: {entity_count}")
1414 click.echo(f" Relationships: {rel_count}")
1415 click.echo(f" Knowledge graph: {kg_path}")
1416
1417
@cli.group()
def wiki():
    """Generate and push GitHub wikis from knowledge graphs."""
1422
1423
@wiki.command("generate")
@click.argument("db_path", type=click.Path(exists=True))
@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory for wiki")
@click.option("--title", type=str, default="Knowledge Base", help="Wiki title")
def wiki_generate(db_path, output, title):
    """Generate a GitHub wiki from a knowledge graph.

    Examples:

        planopticon wiki generate knowledge_graph.db -o ./wiki

        planopticon wiki generate results/kg.db --title "Project Wiki"
    """
    from video_processor.agent.skills.wiki_generator import generate_wiki, write_wiki
    from video_processor.integrators.knowledge_graph import KnowledgeGraph

    # Default the output directory to ./wiki under the current working dir.
    out_dir = Path(output) if output else Path.cwd() / "wiki"

    graph = KnowledgeGraph(db_path=Path(db_path))
    pages = generate_wiki(graph.to_dict(), title=title)
    written = write_wiki(pages, out_dir)

    click.echo(f"Generated {len(written)} wiki pages in {out_dir}")
    for page_path in sorted(written):
        click.echo(f"  {page_path.name}")
1451
1452
@wiki.command("push")
@click.argument("wiki_dir", type=click.Path(exists=True))
@click.argument("repo", type=str)
@click.option("--message", "-m", type=str, default="Update wiki", help="Commit message")
def wiki_push(wiki_dir, repo, message):
    """Push generated wiki pages to a GitHub wiki repo.

    REPO should be in 'owner/repo' format.

    Examples:

        planopticon wiki push ./wiki ConflictHQ/PlanOpticon

        planopticon wiki push ./wiki owner/repo -m "Add entity pages"
    """
    from video_processor.agent.skills.wiki_generator import push_wiki

    # push_wiki reports success/failure as a bool; exit non-zero on failure
    # so shell pipelines can detect it.
    if push_wiki(Path(wiki_dir), repo, message=message):
        click.echo(f"Wiki pushed to https://github.com/{repo}/wiki")
        return
    click.echo("Wiki push failed. Check auth and repo permissions.", err=True)
    sys.exit(1)
1477
1478
1479 @cli.group()
1480 def kg():
1481 """Knowledge graph utilities: convert, sync, and inspect."""
1482 pass
1483
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -1,36 +1,48 @@
1
-"""Cloud and web source integrations for fetching content from remote sources."""
1
+"""Cloud, web, and notes source integrations for fetching content from remote sources."""
22
33
from video_processor.sources.base import BaseSource, SourceFile
44
55
__all__ = [
66
"BaseSource",
77
"SourceFile",
8
+ "AppleNotesSource",
89
"ArxivSource",
910
"GitHubSource",
1011
"GoogleDriveSource",
12
+ "GoogleKeepSource",
13
+ "GWSSource",
1114
"HackerNewsSource",
15
+ "LogseqSource",
16
+ "M365Source",
17
+ "NotionSource",
18
+ "ObsidianSource",
19
+ "OneNoteSource",
1220
"PodcastSource",
1321
"RedditSource",
1422
"RSSSource",
1523
"TwitterSource",
16
- "GWSSource",
17
- "M365Source",
1824
"WebSource",
1925
"YouTubeSource",
2026
]
2127
2228
2329
def __getattr__(name: str):
2430
"""Lazy imports to avoid pulling in optional dependencies at import time."""
2531
_lazy_map = {
32
+ "AppleNotesSource": "video_processor.sources.apple_notes_source",
2633
"ArxivSource": "video_processor.sources.arxiv_source",
2734
"GitHubSource": "video_processor.sources.github_source",
2835
"GoogleDriveSource": "video_processor.sources.google_drive",
36
+ "GoogleKeepSource": "video_processor.sources.google_keep_source",
2937
"GWSSource": "video_processor.sources.gws_source",
30
- "M365Source": "video_processor.sources.m365_source",
3138
"HackerNewsSource": "video_processor.sources.hackernews_source",
39
+ "LogseqSource": "video_processor.sources.logseq_source",
40
+ "M365Source": "video_processor.sources.m365_source",
41
+ "NotionSource": "video_processor.sources.notion_source",
42
+ "ObsidianSource": "video_processor.sources.obsidian_source",
43
+ "OneNoteSource": "video_processor.sources.onenote_source",
3244
"PodcastSource": "video_processor.sources.podcast_source",
3345
"RedditSource": "video_processor.sources.reddit_source",
3446
"RSSSource": "video_processor.sources.rss_source",
3547
"TwitterSource": "video_processor.sources.twitter_source",
3648
"WebSource": "video_processor.sources.web_source",
3749
3850
ADDED video_processor/sources/apple_notes_source.py
3951
ADDED video_processor/sources/google_keep_source.py
4052
ADDED video_processor/sources/logseq_source.py
4153
ADDED video_processor/sources/notion_source.py
4254
ADDED video_processor/sources/obsidian_source.py
4355
ADDED video_processor/sources/onenote_source.py
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -1,36 +1,48 @@
1 """Cloud and web source integrations for fetching content from remote sources."""
2
3 from video_processor.sources.base import BaseSource, SourceFile
4
5 __all__ = [
6 "BaseSource",
7 "SourceFile",
 
8 "ArxivSource",
9 "GitHubSource",
10 "GoogleDriveSource",
 
 
11 "HackerNewsSource",
 
 
 
 
 
12 "PodcastSource",
13 "RedditSource",
14 "RSSSource",
15 "TwitterSource",
16 "GWSSource",
17 "M365Source",
18 "WebSource",
19 "YouTubeSource",
20 ]
21
22
23 def __getattr__(name: str):
24 """Lazy imports to avoid pulling in optional dependencies at import time."""
25 _lazy_map = {
 
26 "ArxivSource": "video_processor.sources.arxiv_source",
27 "GitHubSource": "video_processor.sources.github_source",
28 "GoogleDriveSource": "video_processor.sources.google_drive",
 
29 "GWSSource": "video_processor.sources.gws_source",
30 "M365Source": "video_processor.sources.m365_source",
31 "HackerNewsSource": "video_processor.sources.hackernews_source",
 
 
 
 
 
32 "PodcastSource": "video_processor.sources.podcast_source",
33 "RedditSource": "video_processor.sources.reddit_source",
34 "RSSSource": "video_processor.sources.rss_source",
35 "TwitterSource": "video_processor.sources.twitter_source",
36 "WebSource": "video_processor.sources.web_source",
37
38 ADDED video_processor/sources/apple_notes_source.py
39 ADDED video_processor/sources/google_keep_source.py
40 ADDED video_processor/sources/logseq_source.py
41 ADDED video_processor/sources/notion_source.py
42 ADDED video_processor/sources/obsidian_source.py
43 ADDED video_processor/sources/onenote_source.py
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -1,36 +1,48 @@
1 """Cloud, web, and notes source integrations for fetching content from remote sources."""
2
3 from video_processor.sources.base import BaseSource, SourceFile
4
# Public API of the sources package; every name here is resolved lazily via
# the module-level __getattr__ so optional dependencies are not imported
# until the corresponding source class is first accessed.
__all__ = [
    "BaseSource",
    "SourceFile",
    "AppleNotesSource",
    "ArxivSource",
    "GitHubSource",
    "GoogleDriveSource",
    "GoogleKeepSource",
    "GWSSource",
    "HackerNewsSource",
    "LogseqSource",
    "M365Source",
    "NotionSource",
    "ObsidianSource",
    "OneNoteSource",
    "PodcastSource",
    "RedditSource",
    "RSSSource",
    "TwitterSource",
    "WebSource",
    "YouTubeSource",
]
27
28
29 def __getattr__(name: str):
30 """Lazy imports to avoid pulling in optional dependencies at import time."""
31 _lazy_map = {
32 "AppleNotesSource": "video_processor.sources.apple_notes_source",
33 "ArxivSource": "video_processor.sources.arxiv_source",
34 "GitHubSource": "video_processor.sources.github_source",
35 "GoogleDriveSource": "video_processor.sources.google_drive",
36 "GoogleKeepSource": "video_processor.sources.google_keep_source",
37 "GWSSource": "video_processor.sources.gws_source",
 
38 "HackerNewsSource": "video_processor.sources.hackernews_source",
39 "LogseqSource": "video_processor.sources.logseq_source",
40 "M365Source": "video_processor.sources.m365_source",
41 "NotionSource": "video_processor.sources.notion_source",
42 "ObsidianSource": "video_processor.sources.obsidian_source",
43 "OneNoteSource": "video_processor.sources.onenote_source",
44 "PodcastSource": "video_processor.sources.podcast_source",
45 "RedditSource": "video_processor.sources.reddit_source",
46 "RSSSource": "video_processor.sources.rss_source",
47 "TwitterSource": "video_processor.sources.twitter_source",
48 "WebSource": "video_processor.sources.web_source",
49
50 ADDED video_processor/sources/apple_notes_source.py
51 ADDED video_processor/sources/google_keep_source.py
52 ADDED video_processor/sources/logseq_source.py
53 ADDED video_processor/sources/notion_source.py
54 ADDED video_processor/sources/obsidian_source.py
55 ADDED video_processor/sources/onenote_source.py
--- a/video_processor/sources/apple_notes_source.py
+++ b/video_processor/sources/apple_notes_source.py
@@ -0,0 +1,178 @@
1
+"""Apple Notes source connector via osascript (macOS only)."""
2
+
3
+import logging
4
+import re
5
+import subprocess
6
+import sys
7
+from pathlib import Path
8
+from typing import List, Optional
9
+
10
+from video_processor.sources.base import BaseSource, SourceFile
11
+
12
+logger = logging.getLogger(__name__)
13
+
14
+
15
+class AppleNotesSource(BaseSource):
16
+ """
17
+ Fetch notes from Apple Notes using osascript (AppleScript).
18
+
19
+ Only works on macOS. Requires the Notes app to be available
20
+ and permission for osascript to access it.
21
+ """
22
+
23
+ def __init__(self, folder: Optional[str] = None):
24
+ self.folder = folder
25
+
26
+ def authenticate(self) -> bool:
27
+ """Check that we are running on macOS."""
28
+ if sys.platform != "darwin":
29
+ logger.error("Apple Notes is only available on macOS (current: %s)", sys.platform)
30
+ return False
31
+ return True
32
+
33
+ def list_videos(
34
+ self,
35
+ folder_id: Optional[str] = None,
36
+ folder_path: Optional[str] = None,
37
+ patterns: Optional[List[str]] = None,
38
+ ) -> List[SourceFile]:
39
+ """List notes from Apple Notes via osascript."""
40
+ if not self.authenticate():
41
+ return []
42
+
43
+ if self.folder:
44
+ script = (
45
+ 'tell application "Notes"\n'
46
+ " set noteList to {}\n"
47
+ f" repeat with f in folders of default account\n"
48
+ f' if name of f is "{self.folder}" then\n'
49
+ " repeat with n in notes of f\n"
50
+ ' set end of noteList to (id of n) & "|||" & (name of n)\n'
51
+ " end repeat\n"
52
+ " end if\n"
53
+ " end repeat\n"
54
+ " return noteList\n"
55
+ "end tell"
56
+ )
57
+ else:
58
+ script = (
59
+ 'tell application "Notes"\n'
60
+ " set noteList to {}\n"
61
+ " repeat with n in notes of default account\n"
62
+ ' set end of noteList to (id of n) & "|||" & (name of n)\n'
63
+ " end repeat\n"
64
+ " return noteList\n"
65
+ "end tell"
66
+ )
67
+
68
+ try:
69
+ result = subprocess.run(
70
+ ["osascript", "-e", script],
71
+ capture_output=True,
72
+ text=True,
73
+ timeout=30,
74
+ )
75
+ except FileNotFoundError:
76
+ logger.error("osascript not found. Apple Notes requires macOS.")
77
+ return []
78
+ except subprocess.TimeoutExpired:
79
+ logger.error("osascript timed out while listing notes.")
80
+ return []
81
+
82
+ if result.returncode != 0:
83
+ logger.error("Failed to list notes: %s", result.stderr.strip())
84
+ return []
85
+
86
+ return self._parse_note_list(result.stdout.strip())
87
+
88
+ def _parse_note_list(self, output: str) -> List[SourceFile]:
89
+ """Parse osascript output into SourceFile objects.
90
+
91
+ Expected format: comma-separated items of 'id|||name' pairs.
92
+ """
93
+ files: List[SourceFile] = []
94
+ if not output:
95
+ return files
96
+
97
+ # AppleScript returns a flat comma-separated list
98
+ entries = output.split(", ")
99
+ for entry in entries:
100
+ entry = entry.strip()
101
+ if "|||" not in entry:
102
+ continue
103
+ note_id, _, name = entry.partition("|||")
104
+ note_id = note_id.strip()
105
+ name = name.strip()
106
+ if note_id and name:
107
+ files.append(
108
+ SourceFile(
109
+ name=name,
110
+ id=note_id,
111
+ mime_type="text/plain",
112
+ )
113
+ )
114
+
115
+ logger.info("Found %d notes", len(files))
116
+ return files
117
+
118
+ def download(self, file: SourceFile, destination: Path) -> Path:
119
+ """Download a note's content as plain text."""
120
+ destination = Path(destination)
121
+ destination.parent.mkdir(parents=True, exist_ok=True)
122
+
123
+ script = (
124
+ 'tell application "Notes"\n'
125
+ f' set theNote to note id "{file.id}" of default account\n'
126
+ " return body of theNote\n"
127
+ "end tell"
128
+ )
129
+
130
+ try:
131
+ result = subprocess.run(
132
+ ["osascript", "-e", script],
133
+ capture_output=True,
134
+ text=True,
135
+ timeout=30,
136
+ )
137
+ except FileNotFoundError:
138
+ raise RuntimeError("osascript not found. Apple Notes requires macOS.")
139
+ except subprocess.TimeoutExpired:
140
+ raise RuntimeError(f"osascript timed out fetching note {file.id}")
141
+
142
+ if result.returncode != 0:
143
+ raise RuntimeError(f"Failed to fetch note {file.id}: {result.stderr.strip()}")
144
+
145
+ html_body = result.stdout.strip()
146
+ text = self._html_to_text(html_body)
147
+
148
+ # Prepend title
149
+ content = f"# {file.name}\n\n{text}"
150
+ destination.write_text(content, encoding="utf-8")
151
+ logger.info("Saved Apple Note to %s", destination)
152
+ return destination
153
+
154
+ @staticmethod
155
+ def _html_to_text(html: str) -> str:
156
+ """Strip HTML tags and return plain text.
157
+
158
+ Apple Notes returns note bodies as HTML. This uses regex-based
159
+ stripping similar to web_source._strip_html_tags.
160
+ """
161
+ if not html:
162
+ return ""
163
+ # Replace <br> variants with newlines
164
+ text = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
165
+ # Replace block-level closing tags with newlines
166
+ text = re.sub(r"</(?:p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE)
167
+ # Remove all remaining tags
168
+ text = re.sub(r"<[^>]+>", "", text)
169
+ # Decode common HTML entities
170
+ text = text.replace("&amp;", "&")
171
+ text = text.replace("&lt;", "<")
172
+ text = text.replace("&gt;", ">")
173
+ text = text.replace("&quot;", '"')
174
+ text = text.replace("&#39;", "'")
175
+ text = text.replace("&nbsp;", " ")
176
+ # Collapse excessive blank lines
177
+ text = re.sub(r"\n{3,}", "\n\n", text)
178
+ return text.strip()
--- a/video_processor/sources/apple_notes_source.py
+++ b/video_processor/sources/apple_notes_source.py
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/apple_notes_source.py
+++ b/video_processor/sources/apple_notes_source.py
@@ -0,0 +1,178 @@
1 """Apple Notes source connector via osascript (macOS only)."""
2
3 import logging
4 import re
5 import subprocess
6 import sys
7 from pathlib import Path
8 from typing import List, Optional
9
10 from video_processor.sources.base import BaseSource, SourceFile
11
12 logger = logging.getLogger(__name__)
13
14
class AppleNotesSource(BaseSource):
    """
    Fetch notes from Apple Notes using osascript (AppleScript).

    Only works on macOS. Requires the Notes app to be available
    and permission for osascript to access it.
    """

    def __init__(self, folder: Optional[str] = None):
        # When set, only notes inside this folder (matched by name) are listed.
        self.folder = folder

    @staticmethod
    def _escape_applescript(value: str) -> str:
        """Escape backslashes and double quotes for a double-quoted AppleScript literal.

        Prevents folder names or note ids containing quotes from breaking
        (or injecting commands into) the generated script.
        """
        return value.replace("\\", "\\\\").replace('"', '\\"')

    def authenticate(self) -> bool:
        """Check that we are running on macOS (osascript is macOS-only)."""
        if sys.platform != "darwin":
            logger.error("Apple Notes is only available on macOS (current: %s)", sys.platform)
            return False
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List notes from Apple Notes via osascript.

        Returns an empty list on any failure (not macOS, osascript missing,
        timeout, or a non-zero exit). The folder_id/folder_path/patterns
        parameters belong to the BaseSource interface and are unused here.
        """
        if not self.authenticate():
            return []

        if self.folder:
            # Escape the folder name before embedding it in the script.
            folder_name = self._escape_applescript(self.folder)
            script = (
                'tell application "Notes"\n'
                "    set noteList to {}\n"
                "    repeat with f in folders of default account\n"
                f'        if name of f is "{folder_name}" then\n'
                "            repeat with n in notes of f\n"
                '                set end of noteList to (id of n) & "|||" & (name of n)\n'
                "            end repeat\n"
                "        end if\n"
                "    end repeat\n"
                "    return noteList\n"
                "end tell"
            )
        else:
            script = (
                'tell application "Notes"\n'
                "    set noteList to {}\n"
                "    repeat with n in notes of default account\n"
                '        set end of noteList to (id of n) & "|||" & (name of n)\n'
                "    end repeat\n"
                "    return noteList\n"
                "end tell"
            )

        try:
            result = subprocess.run(
                ["osascript", "-e", script],
                capture_output=True,
                text=True,
                timeout=30,
            )
        except FileNotFoundError:
            logger.error("osascript not found. Apple Notes requires macOS.")
            return []
        except subprocess.TimeoutExpired:
            logger.error("osascript timed out while listing notes.")
            return []

        if result.returncode != 0:
            logger.error("Failed to list notes: %s", result.stderr.strip())
            return []

        return self._parse_note_list(result.stdout.strip())

    def _parse_note_list(self, output: str) -> List[SourceFile]:
        """Parse osascript output into SourceFile objects.

        Expected format: comma-separated items of 'id|||name' pairs.

        NOTE(review): AppleScript flattens the returned list with ", "
        separators, so a note *name* that itself contains ", " will be
        truncated at the first separator; note ids (which precede the
        '|||' delimiter) are unaffected.
        """
        files: List[SourceFile] = []
        if not output:
            return files

        # AppleScript returns a flat comma-separated list
        for entry in output.split(", "):
            entry = entry.strip()
            if "|||" not in entry:
                continue
            note_id, _, name = entry.partition("|||")
            note_id = note_id.strip()
            name = name.strip()
            if note_id and name:
                files.append(
                    SourceFile(
                        name=name,
                        id=note_id,
                        mime_type="text/plain",
                    )
                )

        logger.info("Found %d notes", len(files))
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a note's body as plain text, with the title as a heading.

        Raises RuntimeError if osascript is missing, times out, or exits
        non-zero.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        # Escape the id before embedding it in the script.
        note_id = self._escape_applescript(str(file.id))
        script = (
            'tell application "Notes"\n'
            f'    set theNote to note id "{note_id}" of default account\n'
            "    return body of theNote\n"
            "end tell"
        )

        try:
            result = subprocess.run(
                ["osascript", "-e", script],
                capture_output=True,
                text=True,
                timeout=30,
            )
        except FileNotFoundError:
            raise RuntimeError("osascript not found. Apple Notes requires macOS.")
        except subprocess.TimeoutExpired:
            raise RuntimeError(f"osascript timed out fetching note {file.id}")

        if result.returncode != 0:
            raise RuntimeError(f"Failed to fetch note {file.id}: {result.stderr.strip()}")

        html_body = result.stdout.strip()
        text = self._html_to_text(html_body)

        # Prepend title as a Markdown heading
        content = f"# {file.name}\n\n{text}"
        destination.write_text(content, encoding="utf-8")
        logger.info("Saved Apple Note to %s", destination)
        return destination

    @staticmethod
    def _html_to_text(html: str) -> str:
        """Strip HTML tags and return plain text.

        Apple Notes returns note bodies as HTML. This uses regex-based
        stripping similar to web_source._strip_html_tags.
        """
        if not html:
            return ""
        # Replace <br> variants with newlines
        text = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
        # Replace block-level closing tags with newlines
        text = re.sub(r"</(?:p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE)
        # Remove all remaining tags
        text = re.sub(r"<[^>]+>", "", text)
        # Decode common HTML entities. "&amp;" MUST be decoded last,
        # otherwise escaped entities such as "&amp;lt;" would be
        # double-unescaped into "<" instead of the literal "&lt;".
        text = text.replace("&lt;", "<")
        text = text.replace("&gt;", ">")
        text = text.replace("&quot;", '"')
        text = text.replace("&#39;", "'")
        text = text.replace("&nbsp;", " ")
        text = text.replace("&amp;", "&")
        # Collapse excessive blank lines
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()
--- a/video_processor/sources/google_keep_source.py
+++ b/video_processor/sources/google_keep_source.py
@@ -0,0 +1,170 @@
1
+"""Google Keep source connector using the gws CLI (googleworkspace/cli).
2
+
3
+Fetches notes from Google Keep via the `gws` CLI tool.
4
+Outputs plain text suitable for KG ingestion.
5
+
6
+Requires: npm install -g @googleworkspace/cli
7
+Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless)
8
+"""
9
+
10
+import json
11
+import logging
12
+import shutil
13
+import subprocess
14
+from pathlib import Path
15
+from typing import Any, Dict, List, Optional
16
+
17
+from video_processor.sources.base import BaseSource, SourceFile
18
+
19
+logger = logging.getLogger(__name__)
20
+
21
+
22
+def _run_gws(args: List[str], timeout: int = 30) -> Dict[str, Any]:
23
+ """Run a gws CLI command and return parsed JSON output."""
24
+ cmd = ["gws"] + args
25
+ proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
26
+ if proc.returncode != 0:
27
+ raise RuntimeError(f"gws {' '.join(args)} failed: {proc.stderr.strip()}")
28
+ try:
29
+ return json.loads(proc.stdout)
30
+ except json.JSONDecodeError:
31
+ return {"raw": proc.stdout.strip()}
32
+
33
+
34
+def _note_to_text(note: dict) -> str:
35
+ """Extract text content from a Google Keep note structure.
36
+
37
+ Handles plain text notes and checklists. Checklist items are formatted
38
+ as ``- [x] item`` (checked) or ``- [ ] item`` (unchecked).
39
+ """
40
+ parts: List[str] = []
41
+
42
+ title = note.get("title", "").strip()
43
+ if title:
44
+ parts.append(title)
45
+
46
+ body = note.get("body", note.get("textContent", "")).strip()
47
+ if body:
48
+ parts.append(body)
49
+
50
+ # Checklist items may appear under "list", "listContent", or "checklistItems"
51
+ list_items = note.get("list", note.get("listContent", note.get("checklistItems", [])))
52
+ if isinstance(list_items, list):
53
+ for item in list_items:
54
+ text = item.get("text", "").strip()
55
+ if not text:
56
+ continue
57
+ checked = item.get("checked", item.get("isChecked", False))
58
+ marker = "[x]" if checked else "[ ]"
59
+ parts.append(f"- {marker} {text}")
60
+
61
+ return "\n\n".join(parts) if parts else ""
62
+
63
+
64
+class GoogleKeepSource(BaseSource):
65
+ """
66
+ Fetch notes from Google Keep via the gws CLI.
67
+
68
+ Usage:
69
+ source = GoogleKeepSource() # all notes
70
+ source = GoogleKeepSource(label="meetings") # filter by label
71
+ files = source.list_videos()
72
+ source.download_all(files, Path("./notes"))
73
+ """
74
+
75
+ def __init__(self, label: Optional[str] = None):
76
+ self.label = label
77
+
78
+ def authenticate(self) -> bool:
79
+ """Check if gws CLI is installed and authenticated."""
80
+ if not shutil.which("gws"):
81
+ logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
82
+ return False
83
+ try:
84
+ _run_gws(["auth", "status"], timeout=10)
85
+ return True
86
+ except (RuntimeError, subprocess.TimeoutExpired):
87
+ logger.error("gws not authenticated. Run: gws auth login")
88
+ return False
89
+
90
+ def list_videos(
91
+ self,
92
+ folder_id: Optional[str] = None,
93
+ folder_path: Optional[str] = None,
94
+ patterns: Optional[List[str]] = None,
95
+ ) -> List[SourceFile]:
96
+ """List notes in Google Keep. Returns SourceFile per note."""
97
+ args = ["keep", "notes", "list", "--output", "json"]
98
+
99
+ if self.label:
100
+ args.extend(["--label", self.label])
101
+
102
+ try:
103
+ result = _run_gws(args, timeout=60)
104
+ except RuntimeError as e:
105
+ logger.error(f"Failed to list Keep notes: {e}")
106
+ return []
107
+
108
+ # Result may be a list directly or nested under a key
109
+ notes: List[dict] = []
110
+ if isinstance(result, list):
111
+ notes = result
112
+ elif isinstance(result, dict):
113
+ notes = result.get("notes", result.get("items", []))
114
+ # If we got a single note back (not a list), wrap it
115
+ if not notes and "id" in result and "raw" not in result:
116
+ notes = [result]
117
+
118
+ files: List[SourceFile] = []
119
+ for note in notes:
120
+ note_id = note.get("id", note.get("noteId", ""))
121
+ title = note.get("title", "Untitled Note").strip() or "Untitled Note"
122
+ modified = note.get("modifiedTime", note.get("updateTime"))
123
+
124
+ # Estimate size from text content
125
+ text = _note_to_text(note)
126
+ size = len(text.encode("utf-8")) if text else None
127
+
128
+ files.append(
129
+ SourceFile(
130
+ name=title,
131
+ id=str(note_id),
132
+ size_bytes=size,
133
+ mime_type="text/plain",
134
+ modified_at=modified,
135
+ )
136
+ )
137
+
138
+ logger.info(f"Found {len(files)} note(s) in Google Keep")
139
+ return files
140
+
141
+ def download(self, file: SourceFile, destination: Path) -> Path:
142
+ """Download a Keep note's content as a text file."""
143
+ destination = Path(destination)
144
+ destination.parent.mkdir(parents=True, exist_ok=True)
145
+
146
+ try:
147
+ result = _run_gws(
148
+ [
149
+ "keep",
150
+ "notes",
151
+ "get",
152
+ "--params",
153
+ json.dumps({"noteId": file.id}),
154
+ ],
155
+ timeout=30,
156
+ )
157
+ except RuntimeError as e:
158
+ raise RuntimeError(f"Failed to fetch Keep note {file.id}: {e}") from e
159
+
160
+ # result may be the note dict directly or wrapped
161
+ note = result if isinstance(result, dict) else {}
162
+ text = _note_to_text(note)
163
+
164
+ if not text:
165
+ # Fallback: use raw output if structured extraction yielded nothing
166
+ text = note.get("raw", json.dumps(note, indent=2))
167
+
168
+ destination.write_text(text, encoding="utf-8")
169
+ logger.info(f"Saved note '{file.name}' to {destination}")
170
+ return destination
--- a/video_processor/sources/google_keep_source.py
+++ b/video_processor/sources/google_keep_source.py
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/google_keep_source.py
+++ b/video_processor/sources/google_keep_source.py
@@ -0,0 +1,170 @@
1 """Google Keep source connector using the gws CLI (googleworkspace/cli).
2
3 Fetches notes from Google Keep via the `gws` CLI tool.
4 Outputs plain text suitable for KG ingestion.
5
6 Requires: npm install -g @googleworkspace/cli
7 Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless)
8 """
9
10 import json
11 import logging
12 import shutil
13 import subprocess
14 from pathlib import Path
15 from typing import Any, Dict, List, Optional
16
17 from video_processor.sources.base import BaseSource, SourceFile
18
19 logger = logging.getLogger(__name__)
20
21
def _run_gws(args: List[str], timeout: int = 30) -> Dict[str, Any]:
    """Run a gws CLI command and return parsed JSON output.

    Args:
        args: Arguments passed to the ``gws`` executable.
        timeout: Seconds to wait before aborting the subprocess.

    Returns:
        Parsed JSON from stdout, or ``{"raw": <stdout>}`` when stdout is
        not valid JSON (some gws subcommands emit plain text).

    Raises:
        RuntimeError: If the CLI is missing, times out, or exits non-zero.
    """
    cmd = ["gws"] + args
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except FileNotFoundError as exc:
        # Fix: normalize "binary not on PATH" into the documented
        # RuntimeError contract so callers need a single except clause.
        raise RuntimeError("gws CLI not found on PATH") from exc
    except subprocess.TimeoutExpired as exc:
        # Fix: previously TimeoutExpired leaked past callers that only
        # catch RuntimeError (e.g. list_videos).
        raise RuntimeError(f"gws {' '.join(args)} timed out after {timeout}s") from exc
    if proc.returncode != 0:
        raise RuntimeError(f"gws {' '.join(args)} failed: {proc.stderr.strip()}")
    try:
        return json.loads(proc.stdout)
    except json.JSONDecodeError:
        # Preserve non-JSON output for callers instead of failing.
        return {"raw": proc.stdout.strip()}
32
33
34 def _note_to_text(note: dict) -> str:
35 """Extract text content from a Google Keep note structure.
36
37 Handles plain text notes and checklists. Checklist items are formatted
38 as ``- [x] item`` (checked) or ``- [ ] item`` (unchecked).
39 """
40 parts: List[str] = []
41
42 title = note.get("title", "").strip()
43 if title:
44 parts.append(title)
45
46 body = note.get("body", note.get("textContent", "")).strip()
47 if body:
48 parts.append(body)
49
50 # Checklist items may appear under "list", "listContent", or "checklistItems"
51 list_items = note.get("list", note.get("listContent", note.get("checklistItems", [])))
52 if isinstance(list_items, list):
53 for item in list_items:
54 text = item.get("text", "").strip()
55 if not text:
56 continue
57 checked = item.get("checked", item.get("isChecked", False))
58 marker = "[x]" if checked else "[ ]"
59 parts.append(f"- {marker} {text}")
60
61 return "\n\n".join(parts) if parts else ""
62
63
class GoogleKeepSource(BaseSource):
    """
    Fetch notes from Google Keep via the gws CLI.

    Usage:
        source = GoogleKeepSource()                  # all notes
        source = GoogleKeepSource(label="meetings")  # filter by label
        files = source.list_videos()
        source.download_all(files, Path("./notes"))
    """

    # Everything a gws invocation can surface: _run_gws raises RuntimeError
    # for non-zero exits, while subprocess itself may raise TimeoutExpired
    # or FileNotFoundError (CLI missing). Catch all three uniformly.
    _CLI_ERRORS = (RuntimeError, subprocess.TimeoutExpired, FileNotFoundError)

    def __init__(self, label: Optional[str] = None):
        # Optional Keep label used to filter results in list_videos().
        self.label = label

    def authenticate(self) -> bool:
        """Check if gws CLI is installed and authenticated."""
        if not shutil.which("gws"):
            logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
            return False
        try:
            _run_gws(["auth", "status"], timeout=10)
            return True
        except self._CLI_ERRORS:
            logger.error("gws not authenticated. Run: gws auth login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List notes in Google Keep. Returns one SourceFile per note.

        folder_id/folder_path/patterns are unused; they exist for
        BaseSource interface parity.
        """
        args = ["keep", "notes", "list", "--output", "json"]

        if self.label:
            args.extend(["--label", self.label])

        try:
            result = _run_gws(args, timeout=60)
        except self._CLI_ERRORS as e:
            # Fix: previously only RuntimeError was caught, so a CLI
            # timeout or a missing binary crashed instead of returning [].
            logger.error(f"Failed to list Keep notes: {e}")
            return []

        # Result may be a list directly or nested under a key
        notes: List[dict] = []
        if isinstance(result, list):
            notes = result
        elif isinstance(result, dict):
            notes = result.get("notes", result.get("items", []))
            # If we got a single note back (not a list), wrap it
            if not notes and "id" in result and "raw" not in result:
                notes = [result]

        # Fix: guard against a non-list payload under "notes"/"items".
        if not isinstance(notes, list):
            notes = []

        files: List[SourceFile] = []
        for note in notes:
            if not isinstance(note, dict):
                continue  # skip malformed entries
            note_id = note.get("id", note.get("noteId", ""))
            # Fix: `title` may be present but null in the payload; `or`
            # handles both a missing key and a None/empty value without
            # crashing on .strip().
            title = (note.get("title") or "").strip() or "Untitled Note"
            modified = note.get("modifiedTime", note.get("updateTime"))

            # Estimate size from text content
            text = _note_to_text(note)
            size = len(text.encode("utf-8")) if text else None

            files.append(
                SourceFile(
                    name=title,
                    id=str(note_id),
                    size_bytes=size,
                    mime_type="text/plain",
                    modified_at=modified,
                )
            )

        logger.info(f"Found {len(files)} note(s) in Google Keep")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a Keep note's content as a text file.

        Raises:
            RuntimeError: If the note cannot be fetched via the CLI.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = _run_gws(
                [
                    "keep",
                    "notes",
                    "get",
                    "--params",
                    json.dumps({"noteId": file.id}),
                ],
                timeout=30,
            )
        except self._CLI_ERRORS as e:
            # Fix: normalize timeouts / missing CLI into the documented
            # RuntimeError instead of leaking subprocess exceptions.
            raise RuntimeError(f"Failed to fetch Keep note {file.id}: {e}") from e

        # result may be the note dict directly or wrapped
        note = result if isinstance(result, dict) else {}
        text = _note_to_text(note)

        if not text:
            # Fallback: use raw output if structured extraction yielded nothing
            text = note.get("raw", json.dumps(note, indent=2))

        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved note '{file.name}' to {destination}")
        return destination
--- a/video_processor/sources/logseq_source.py
+++ b/video_processor/sources/logseq_source.py
@@ -0,0 +1,200 @@
1
+"""Logseq graph source connector for ingesting markdown pages and journals."""
2
+
3
+import logging
4
+import re
5
+import shutil
6
+from datetime import datetime, timezone
7
+from pathlib import Path
8
+from typing import List, Optional, Tuple
9
+
10
+from video_processor.sources.base import BaseSource, SourceFile
11
+
12
+logger = logging.getLogger(__name__)
13
+
14
+
15
+def parse_page(path: Path) -> dict:
16
+ """Parse a Logseq markdown page and extract structured content.
17
+
18
+ Returns a dict with:
19
+ - properties: dict of page-level properties (key:: value lines at top)
20
+ - links: list of linked page names from [[wiki-links]]
21
+ - tags: list of tags from #tag and #[[tag]] occurrences
22
+ - block_refs: list of block reference IDs from ((block-id))
23
+ - body: full text content
24
+ """
25
+ text = path.read_text(encoding="utf-8")
26
+ lines = text.split("\n")
27
+
28
+ # Extract page properties (key:: value lines at the top of the file)
29
+ properties: dict = {}
30
+ body_start = 0
31
+ for i, line in enumerate(lines):
32
+ prop_match = re.match(r"^([A-Za-z][A-Za-z0-9_-]*)::\ ?(.*)", line)
33
+ if prop_match:
34
+ key = prop_match.group(1)
35
+ value = prop_match.group(2).strip()
36
+ properties[key] = value
37
+ body_start = i + 1
38
+ else:
39
+ break
40
+
41
+ body = "\n".join(lines[body_start:])
42
+
43
+ # Extract wiki-links: [[page]]
44
+ link_pattern = re.compile(r"\[\[([^\]]+)\]\]")
45
+ links = link_pattern.findall(body)
46
+ # Also pick up links from properties
47
+ for value in properties.values():
48
+ links.extend(link_pattern.findall(str(value)))
49
+
50
+ # Extract tags: #tag and #[[tag]]
51
+ # First get #[[multi word tag]] style
52
+ bracket_tag_pattern = re.compile(r"#\[\[([^\]]+)\]\]")
53
+ tags = bracket_tag_pattern.findall(text)
54
+ # Then get simple #tag style (exclude matches already captured as #[[...]])
55
+ # Remove bracket tags first to avoid double-matching
56
+ text_without_bracket_tags = bracket_tag_pattern.sub("", text)
57
+ simple_tag_pattern = re.compile(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)")
58
+ tags.extend(simple_tag_pattern.findall(text_without_bracket_tags))
59
+
60
+ # Extract block references: ((block-id))
61
+ block_ref_pattern = re.compile(r"\(\(([a-f0-9-]+)\)\)")
62
+ block_refs = block_ref_pattern.findall(text)
63
+
64
+ return {
65
+ "properties": properties,
66
+ "links": links,
67
+ "tags": tags,
68
+ "block_refs": block_refs,
69
+ "body": body,
70
+ }
71
+
72
+
73
+def ingest_graph(graph_path: Path) -> dict:
74
+ """Ingest an entire Logseq graph and return structured data.
75
+
76
+ Returns a dict with:
77
+ - notes: list of dicts with name, tags, frontmatter (properties), text
78
+ - links: list of (source, target) tuples from wiki-links
79
+ """
80
+ graph_path = Path(graph_path)
81
+ notes: List[dict] = []
82
+ links: List[Tuple[str, str]] = []
83
+
84
+ md_files: List[Path] = []
85
+ pages_dir = graph_path / "pages"
86
+ journals_dir = graph_path / "journals"
87
+
88
+ if pages_dir.is_dir():
89
+ md_files.extend(sorted(pages_dir.rglob("*.md")))
90
+ if journals_dir.is_dir():
91
+ md_files.extend(sorted(journals_dir.rglob("*.md")))
92
+
93
+ logger.info("Found %d markdown files in graph %s", len(md_files), graph_path)
94
+
95
+ for md_file in md_files:
96
+ page_name = md_file.stem
97
+ try:
98
+ parsed = parse_page(md_file)
99
+ except Exception:
100
+ logger.warning("Failed to parse page %s", md_file)
101
+ continue
102
+
103
+ notes.append(
104
+ {
105
+ "name": page_name,
106
+ "tags": parsed["tags"],
107
+ "frontmatter": parsed["properties"],
108
+ "text": parsed["body"],
109
+ }
110
+ )
111
+
112
+ for linked_page in parsed["links"]:
113
+ links.append((page_name, linked_page))
114
+
115
+ logger.info(
116
+ "Ingested %d notes with %d links from graph %s",
117
+ len(notes),
118
+ len(links),
119
+ graph_path,
120
+ )
121
+ return {"notes": notes, "links": links}
122
+
123
+
124
+class LogseqSource(BaseSource):
125
+ """Source connector for Logseq graphs."""
126
+
127
+ def __init__(self, graph_path: str) -> None:
128
+ self.graph_path = Path(graph_path)
129
+
130
+ def authenticate(self) -> bool:
131
+ """Check that the graph path exists and has pages/ or journals/ dirs."""
132
+ if not self.graph_path.is_dir():
133
+ logger.error("Graph path does not exist: %s", self.graph_path)
134
+ return False
135
+ has_pages = (self.graph_path / "pages").is_dir()
136
+ has_journals = (self.graph_path / "journals").is_dir()
137
+ if not has_pages and not has_journals:
138
+ logger.error(
139
+ "No pages/ or journals/ directory found in graph: %s",
140
+ self.graph_path,
141
+ )
142
+ return False
143
+ logger.info(
144
+ "Logseq graph authenticated: %s (pages=%s, journals=%s)",
145
+ self.graph_path,
146
+ has_pages,
147
+ has_journals,
148
+ )
149
+ return True
150
+
151
+ def list_videos(
152
+ self,
153
+ folder_id: Optional[str] = None,
154
+ folder_path: Optional[str] = None,
155
+ patterns: Optional[List[str]] = None,
156
+ ) -> List[SourceFile]:
157
+ """List .md files in pages/ and journals/ as SourceFile objects."""
158
+ md_files: List[Path] = []
159
+
160
+ pages_dir = self.graph_path / "pages"
161
+ journals_dir = self.graph_path / "journals"
162
+
163
+ if folder_path:
164
+ search_root = self.graph_path / folder_path
165
+ if search_root.is_dir():
166
+ md_files.extend(sorted(search_root.rglob("*.md")))
167
+ else:
168
+ if pages_dir.is_dir():
169
+ md_files.extend(sorted(pages_dir.rglob("*.md")))
170
+ if journals_dir.is_dir():
171
+ md_files.extend(sorted(journals_dir.rglob("*.md")))
172
+
173
+ results: List[SourceFile] = []
174
+ for md_file in md_files:
175
+ relative = md_file.relative_to(self.graph_path)
176
+ stat = md_file.stat()
177
+ modified_dt = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc)
178
+
179
+ results.append(
180
+ SourceFile(
181
+ name=md_file.name,
182
+ id=str(relative),
183
+ size_bytes=stat.st_size,
184
+ mime_type="text/markdown",
185
+ modified_at=modified_dt.isoformat(),
186
+ path=str(relative),
187
+ )
188
+ )
189
+
190
+ logger.info("Listed %d files from graph %s", len(results), self.graph_path)
191
+ return results
192
+
193
+ def download(self, file: SourceFile, destination: Path) -> Path:
194
+ """Copy a graph file to the destination path."""
195
+ source = self.graph_path / file.id
196
+ destination = Path(destination)
197
+ destination.parent.mkdir(parents=True, exist_ok=True)
198
+ shutil.copy2(source, destination)
199
+ logger.info("Copied %s -> %s", source, destination)
200
+ return destination
--- a/video_processor/sources/logseq_source.py
+++ b/video_processor/sources/logseq_source.py
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/logseq_source.py
+++ b/video_processor/sources/logseq_source.py
@@ -0,0 +1,200 @@
1 """Logseq graph source connector for ingesting markdown pages and journals."""
2
3 import logging
4 import re
5 import shutil
6 from datetime import datetime, timezone
7 from pathlib import Path
8 from typing import List, Optional, Tuple
9
10 from video_processor.sources.base import BaseSource, SourceFile
11
12 logger = logging.getLogger(__name__)
13
14
def parse_page(path: Path) -> dict:
    """Parse a Logseq markdown page and extract structured content.

    Returns a dict with:
        - properties: dict of page-level properties (``key:: value`` lines at top)
        - links: list of linked page names from [[wiki-links]]
        - tags: list of tags from #tag and #[[tag]] occurrences
        - block_refs: list of block reference IDs from ((block-id))
        - body: text content after the property header
    """
    raw = path.read_text(encoding="utf-8")
    all_lines = raw.split("\n")

    # Page properties are consecutive "key:: value" lines at the very top;
    # the first non-property line ends the header.
    props: dict = {}
    content_start = 0
    prop_re = re.compile(r"^([A-Za-z][A-Za-z0-9_-]*)::\ ?(.*)")
    for idx, ln in enumerate(all_lines):
        m = prop_re.match(ln)
        if m is None:
            break
        props[m.group(1)] = m.group(2).strip()
        content_start = idx + 1

    content = "\n".join(all_lines[content_start:])

    # Wiki-links: [[page]] in the body, plus any inside property values.
    wiki_re = re.compile(r"\[\[([^\]]+)\]\]")
    page_links = wiki_re.findall(content)
    for val in props.values():
        page_links.extend(wiki_re.findall(str(val)))

    # Tags: #[[multi word]] first, then plain #tag with the bracket form
    # stripped out so nothing is counted twice.
    bracket_re = re.compile(r"#\[\[([^\]]+)\]\]")
    found_tags = bracket_re.findall(raw)
    without_brackets = bracket_re.sub("", raw)
    found_tags.extend(re.findall(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)", without_brackets))

    # Block references: ((block-id))
    refs = re.findall(r"\(\(([a-f0-9-]+)\)\)", raw)

    return {
        "properties": props,
        "links": page_links,
        "tags": found_tags,
        "block_refs": refs,
        "body": content,
    }
71
72
def ingest_graph(graph_path: Path) -> dict:
    """Ingest an entire Logseq graph and return structured data.

    Returns a dict with:
        - notes: list of dicts with name, tags, frontmatter (properties), text
        - links: list of (source, target) tuples from wiki-links
    """
    root = Path(graph_path)
    collected: List[dict] = []
    edges: List[Tuple[str, str]] = []

    # Gather pages first, then journals, each in sorted order.
    candidates: List[Path] = []
    for subdir in ("pages", "journals"):
        folder = root / subdir
        if folder.is_dir():
            candidates.extend(sorted(folder.rglob("*.md")))

    logger.info("Found %d markdown files in graph %s", len(candidates), root)

    for page_file in candidates:
        name = page_file.stem
        try:
            parsed = parse_page(page_file)
        except Exception:
            # Best-effort ingestion: skip unreadable/undecodable pages.
            logger.warning("Failed to parse page %s", page_file)
            continue

        collected.append(
            {
                "name": name,
                "tags": parsed["tags"],
                "frontmatter": parsed["properties"],
                "text": parsed["body"],
            }
        )
        edges.extend((name, target) for target in parsed["links"])

    logger.info(
        "Ingested %d notes with %d links from graph %s",
        len(collected),
        len(edges),
        root,
    )
    return {"notes": collected, "links": edges}
122
123
class LogseqSource(BaseSource):
    """Source connector for Logseq graphs.

    Exposes the graph's markdown pages and journals through the common
    BaseSource interface; each page is treated as a downloadable file.
    """

    def __init__(self, graph_path: str) -> None:
        # Root directory of the Logseq graph (contains pages/, journals/).
        self.graph_path = Path(graph_path)

    def authenticate(self) -> bool:
        """Check that the graph path exists and has pages/ or journals/ dirs."""
        if not self.graph_path.is_dir():
            logger.error("Graph path does not exist: %s", self.graph_path)
            return False
        has_pages = (self.graph_path / "pages").is_dir()
        has_journals = (self.graph_path / "journals").is_dir()
        if not has_pages and not has_journals:
            logger.error(
                "No pages/ or journals/ directory found in graph: %s",
                self.graph_path,
            )
            return False
        logger.info(
            "Logseq graph authenticated: %s (pages=%s, journals=%s)",
            self.graph_path,
            has_pages,
            has_journals,
        )
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List .md files in pages/ and journals/ as SourceFile objects.

        Args:
            folder_id: Unused; present for BaseSource interface parity.
            folder_path: Optional subdirectory (relative to the graph root)
                to search instead of pages/ and journals/.
            patterns: Optional glob patterns (e.g. ``["2024*.md"]``); when
                given, only files matching at least one pattern are kept.

        Returns:
            One SourceFile per markdown file, id/path relative to the root.
        """
        md_files: List[Path] = []

        if folder_path:
            search_root = self.graph_path / folder_path
            if search_root.is_dir():
                md_files.extend(sorted(search_root.rglob("*.md")))
        else:
            for subdir in ("pages", "journals"):
                candidate = self.graph_path / subdir
                if candidate.is_dir():
                    md_files.extend(sorted(candidate.rglob("*.md")))

        results: List[SourceFile] = []
        for md_file in md_files:
            # Fix: honor the `patterns` parameter, which was previously
            # accepted but silently ignored. Uses pathlib's right-anchored
            # glob matching against the file path.
            if patterns and not any(md_file.match(pat) for pat in patterns):
                continue

            relative = md_file.relative_to(self.graph_path)
            stat = md_file.stat()
            # Timezone-aware mtime so downstream comparisons are unambiguous.
            modified_dt = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc)

            results.append(
                SourceFile(
                    name=md_file.name,
                    id=str(relative),
                    size_bytes=stat.st_size,
                    mime_type="text/markdown",
                    modified_at=modified_dt.isoformat(),
                    path=str(relative),
                )
            )

        logger.info("Listed %d files from graph %s", len(results), self.graph_path)
        return results

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Copy a graph file to the destination path and return it."""
        source = self.graph_path / file.id
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        # copy2 preserves metadata (mtime), keeping modified_at meaningful.
        shutil.copy2(source, destination)
        logger.info("Copied %s -> %s", source, destination)
        return destination
--- a/video_processor/sources/notion_source.py
+++ b/video_processor/sources/notion_source.py
@@ -0,0 +1,380 @@
1
+"""Notion API source connector for fetching pages and databases."""
2
+
3
+import logging
4
+import os
5
+from pathlib import Path
6
+from typing import Dict, List, Optional
7
+
8
+import requests
9
+
10
+from video_processor.sources.base import BaseSource, SourceFile
11
+
12
+logger = logging.getLogger(__name__)
13
+
14
+NOTION_VERSION = "2022-06-28"
15
+NOTION_BASE_URL = "https://api.notion.com/v1"
16
+
17
+
18
class NotionSource(BaseSource):
    """
    Fetch pages and databases from Notion via the public API.

    Requires a Notion integration token (internal integration).
    Set NOTION_API_KEY env var or pass token directly.

    Requires: pip install requests
    """

    def __init__(
        self,
        token: Optional[str] = None,
        database_id: Optional[str] = None,
        page_ids: Optional[List[str]] = None,
    ):
        # Token resolution: explicit argument wins, then NOTION_API_KEY.
        self.token = token or os.environ.get("NOTION_API_KEY", "")
        self.database_id = database_id
        self.page_ids = page_ids or []

    def _headers(self) -> Dict[str, str]:
        """Headers required on every Notion API request."""
        return {
            "Authorization": f"Bearer {self.token}",
            "Notion-Version": NOTION_VERSION,
            "Content-Type": "application/json",
        }

    def authenticate(self) -> bool:
        """Check token is set and make a test call to the Notion API.

        Returns:
            True if the token is accepted by the /users/me endpoint.
        """
        if not self.token:
            logger.error("Notion token not set. Provide token or set NOTION_API_KEY.")
            return False
        try:
            resp = requests.get(
                f"{NOTION_BASE_URL}/users/me",
                headers=self._headers(),
                timeout=15,
            )
            resp.raise_for_status()
            user = resp.json()
            logger.info("Authenticated with Notion as %s", user.get("name", "unknown"))
            return True
        except requests.RequestException as exc:
            logger.error("Notion authentication failed: %s", exc)
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Notion pages as SourceFiles.

        If database_id is set, query the database for pages.
        If page_ids is set, fetch each page individually.

        folder_id/folder_path/patterns are unused; they exist for
        BaseSource interface parity.
        """
        files: List[SourceFile] = []

        if self.database_id:
            files.extend(self._list_from_database(self.database_id))

        if self.page_ids:
            files.extend(self._list_from_pages(self.page_ids))

        if not files:
            logger.warning("No pages found. Set database_id or page_ids.")

        return files

    def _list_from_database(self, database_id: str) -> List[SourceFile]:
        """Query a Notion database and return SourceFiles for each row.

        Follows cursor-based pagination until has_more is False.

        Raises:
            requests.HTTPError: On any non-2xx API response.
        """
        files: List[SourceFile] = []
        has_more = True
        start_cursor: Optional[str] = None

        while has_more:
            body: Dict = {}
            if start_cursor:
                body["start_cursor"] = start_cursor

            resp = requests.post(
                f"{NOTION_BASE_URL}/databases/{database_id}/query",
                headers=self._headers(),
                json=body,
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()

            for page in data.get("results", []):
                title = _extract_page_title(page)
                files.append(
                    SourceFile(
                        name=title,
                        id=page["id"],
                        mime_type="text/markdown",
                        modified_at=page.get("last_edited_time"),
                    )
                )

            has_more = data.get("has_more", False)
            start_cursor = data.get("next_cursor")

        return files

    def _list_from_pages(self, page_ids: List[str]) -> List[SourceFile]:
        """Fetch individual pages by ID and return SourceFiles.

        Pages that fail to fetch are logged and skipped (best-effort).
        """
        files: List[SourceFile] = []
        for page_id in page_ids:
            try:
                resp = requests.get(
                    f"{NOTION_BASE_URL}/pages/{page_id}",
                    headers=self._headers(),
                    timeout=15,
                )
                resp.raise_for_status()
                page = resp.json()
                title = _extract_page_title(page)
                files.append(
                    SourceFile(
                        name=title,
                        id=page["id"],
                        mime_type="text/markdown",
                        modified_at=page.get("last_edited_time"),
                    )
                )
            except requests.RequestException as exc:
                logger.error("Failed to fetch page %s: %s", page_id, exc)
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download page blocks as markdown text and save to destination."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        blocks = self._fetch_all_blocks(file.id)
        text = self._blocks_to_text(blocks)

        # Prepend the title as a top-level heading so the file is
        # self-describing when read outside the pipeline.
        content = f"# {file.name}\n\n{text}"
        destination.write_text(content, encoding="utf-8")
        logger.info("Saved Notion page to %s", destination)
        return destination

    def _fetch_all_blocks(self, page_id: str) -> list:
        """Fetch all child blocks for a page, handling pagination.

        NOTE(review): only top-level blocks are fetched; nested children
        (blocks with has_children=True) are not recursed into — confirm
        whether nested content is needed.
        """
        blocks: list = []
        has_more = True
        start_cursor: Optional[str] = None

        while has_more:
            url = f"{NOTION_BASE_URL}/blocks/{page_id}/children?page_size=100"
            if start_cursor:
                url += f"&start_cursor={start_cursor}"

            resp = requests.get(url, headers=self._headers(), timeout=30)
            resp.raise_for_status()
            data = resp.json()

            blocks.extend(data.get("results", []))
            has_more = data.get("has_more", False)
            start_cursor = data.get("next_cursor")

        return blocks

    def _blocks_to_text(self, blocks: list) -> str:
        """Convert Notion block objects to markdown text.

        numbered_index tracks the position within a run of consecutive
        numbered_list_item blocks; any other block type resets it.
        """
        lines: List[str] = []
        numbered_index = 0

        for block in blocks:
            block_type = block.get("type", "")
            block_data = block.get(block_type, {})

            if block_type == "paragraph":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(text)
                numbered_index = 0

            elif block_type == "heading_1":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"# {text}")
                numbered_index = 0

            elif block_type == "heading_2":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"## {text}")
                numbered_index = 0

            elif block_type == "heading_3":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"### {text}")
                numbered_index = 0

            elif block_type == "bulleted_list_item":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"- {text}")
                numbered_index = 0

            elif block_type == "numbered_list_item":
                numbered_index += 1
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"{numbered_index}. {text}")

            elif block_type == "to_do":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                checked = block_data.get("checked", False)
                marker = "[x]" if checked else "[ ]"
                lines.append(f"- {marker} {text}")
                numbered_index = 0

            elif block_type == "code":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                language = block_data.get("language", "")
                lines.append(f"```{language}")
                lines.append(text)
                lines.append("```")
                numbered_index = 0

            elif block_type == "quote":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"> {text}")
                numbered_index = 0

            elif block_type == "callout":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                icon = block_data.get("icon", {})
                emoji = icon.get("emoji", "") if icon else ""
                prefix = f"{emoji} " if emoji else ""
                lines.append(f"> {prefix}{text}")
                numbered_index = 0

            elif block_type == "toggle":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"<details><summary>{text}</summary></details>")
                numbered_index = 0

            elif block_type == "divider":
                lines.append("---")
                numbered_index = 0

            else:
                # Unsupported block type — try to extract any rich_text
                text = _rich_text_to_str(block_data.get("rich_text", []))
                if text:
                    lines.append(text)
                numbered_index = 0

        return "\n\n".join(lines)

    def fetch_database_as_table(self, database_id: str) -> str:
        """Fetch a Notion database and return its rows as CSV text.

        Each row is a page in the database. Columns are derived from the
        database properties (sorted by name).

        Fix: values are now quoted per RFC 4180 via the csv module, so
        embedded commas, quotes, or newlines no longer corrupt the row
        structure (previously rows were built with a naive join).
        """
        import csv
        import io

        # First, get database schema for column order
        resp = requests.get(
            f"{NOTION_BASE_URL}/databases/{database_id}",
            headers=self._headers(),
            timeout=15,
        )
        resp.raise_for_status()
        db_meta = resp.json()
        properties = db_meta.get("properties", {})
        columns = sorted(properties.keys())

        # Query all rows, following pagination.
        rows: List[Dict] = []
        has_more = True
        start_cursor: Optional[str] = None

        while has_more:
            body: Dict = {}
            if start_cursor:
                body["start_cursor"] = start_cursor

            resp = requests.post(
                f"{NOTION_BASE_URL}/databases/{database_id}/query",
                headers=self._headers(),
                json=body,
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()
            rows.extend(data.get("results", []))
            has_more = data.get("has_more", False)
            start_cursor = data.get("next_cursor")

        # Build properly escaped CSV output.
        buf = io.StringIO()
        writer = csv.writer(buf, lineterminator="\n")
        writer.writerow(columns)

        for row in rows:
            row_props = row.get("properties", {})
            writer.writerow(
                [_extract_property_value(row_props.get(col, {})) for col in columns]
            )

        # Strip the trailing newline to match the previous join-based output.
        return buf.getvalue().rstrip("\n")
321
+
322
+
323
+def _rich_text_to_str(rich_text: list) -> str:
324
+ """Extract plain text from a Notion rich_text array."""
325
+ return "".join(item.get("plain_text", "") for item in rich_text)
326
+
327
+
328
+def _extract_page_title(page: dict) -> str:
329
+ """Extract the title from a Notion page object."""
330
+ properties = page.get("properties", {})
331
+ for prop in properties.values():
332
+ if prop.get("type") == "title":
333
+ return _rich_text_to_str(prop.get("title", []))
334
+ return "Untitled"
335
+
336
+
337
+def _extract_property_value(prop: dict) -> str:
338
+ """Extract a display string from a Notion property value."""
339
+ prop_type = prop.get("type", "")
340
+
341
+ if prop_type == "title":
342
+ return _rich_text_to_str(prop.get("title", []))
343
+ elif prop_type == "rich_text":
344
+ return _rich_text_to_str(prop.get("rich_text", []))
345
+ elif prop_type == "number":
346
+ val = prop.get("number")
347
+ return str(val) if val is not None else ""
348
+ elif prop_type == "select":
349
+ sel = prop.get("select")
350
+ return sel.get("name", "") if sel else ""
351
+ elif prop_type == "multi_select":
352
+ return "; ".join(s.get("name", "") for s in prop.get("multi_select", []))
353
+ elif prop_type == "date":
354
+ date = prop.get("date")
355
+ if date:
356
+ start = date.get("start", "")
357
+ end = date.get("end", "")
358
+ return f"{start} - {end}" if end else start
359
+ return ""
360
+ elif prop_type == "checkbox":
361
+ return str(prop.get("checkbox", False))
362
+ elif prop_type == "url":
363
+ return prop.get("url", "") or ""
364
+ elif prop_type == "email":
365
+ return prop.get("email", "") or ""
366
+ elif prop_type == "phone_number":
367
+ return prop.get("phone_number", "") or ""
368
+ elif prop_type == "status":
369
+ status = prop.get("status")
370
+ return status.get("name", "") if status else ""
371
+ elif prop_type == "people":
372
+ return "; ".join(p.get("name", "") for p in prop.get("people", []))
373
+ elif prop_type == "relation":
374
+ return "; ".join(r.get("id", "") for r in prop.get("relation", []))
375
+ elif prop_type == "formula":
376
+ formula = prop.get("formula", {})
377
+ f_type = formula.get("type", "")
378
+ return str(formula.get(f_type, ""))
379
+ else:
380
+ return ""
--- a/video_processor/sources/notion_source.py
+++ b/video_processor/sources/notion_source.py
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/notion_source.py
+++ b/video_processor/sources/notion_source.py
@@ -0,0 +1,380 @@
1 """Notion API source connector for fetching pages and databases."""
2
3 import logging
4 import os
5 from pathlib import Path
6 from typing import Dict, List, Optional
7
8 import requests
9
10 from video_processor.sources.base import BaseSource, SourceFile
11
12 logger = logging.getLogger(__name__)
13
14 NOTION_VERSION = "2022-06-28"
15 NOTION_BASE_URL = "https://api.notion.com/v1"
16
17
class NotionSource(BaseSource):
    """
    Fetch pages and databases from Notion via the public API.

    Requires a Notion integration token (internal integration).
    Set NOTION_API_KEY env var or pass token directly.

    Requires: pip install requests
    """

    def __init__(
        self,
        token: Optional[str] = None,
        database_id: Optional[str] = None,
        page_ids: Optional[List[str]] = None,
    ):
        # Token resolution: explicit argument wins, else NOTION_API_KEY.
        self.token = token or os.environ.get("NOTION_API_KEY", "")
        self.database_id = database_id
        self.page_ids = page_ids or []

    def _headers(self) -> Dict[str, str]:
        """Headers required on every Notion API request."""
        return {
            "Authorization": f"Bearer {self.token}",
            "Notion-Version": NOTION_VERSION,
            "Content-Type": "application/json",
        }

    def authenticate(self) -> bool:
        """Check token is set and make a test call to the Notion API."""
        if not self.token:
            logger.error("Notion token not set. Provide token or set NOTION_API_KEY.")
            return False
        try:
            resp = requests.get(
                f"{NOTION_BASE_URL}/users/me",
                headers=self._headers(),
                timeout=15,
            )
            resp.raise_for_status()
            user = resp.json()
            logger.info("Authenticated with Notion as %s", user.get("name", "unknown"))
            return True
        except requests.RequestException as exc:
            logger.error("Notion authentication failed: %s", exc)
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Notion pages as SourceFiles.

        If database_id is set, query the database for pages.
        If page_ids is set, fetch each page individually.
        folder_id/folder_path/patterns are unused; kept for interface parity.
        """
        files: List[SourceFile] = []

        if self.database_id:
            files.extend(self._list_from_database(self.database_id))

        if self.page_ids:
            files.extend(self._list_from_pages(self.page_ids))

        if not files:
            logger.warning("No pages found. Set database_id or page_ids.")

        return files

    def _query_database_rows(self, database_id: str) -> List[Dict]:
        """Return every row (page object) of a database, following pagination.

        Shared by _list_from_database and fetch_database_as_table, which
        previously duplicated this pagination loop.
        """
        rows: List[Dict] = []
        start_cursor: Optional[str] = None

        while True:
            body: Dict = {}
            if start_cursor:
                body["start_cursor"] = start_cursor

            resp = requests.post(
                f"{NOTION_BASE_URL}/databases/{database_id}/query",
                headers=self._headers(),
                json=body,
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()

            rows.extend(data.get("results", []))
            if not data.get("has_more", False):
                break
            start_cursor = data.get("next_cursor")

        return rows

    def _list_from_database(self, database_id: str) -> List[SourceFile]:
        """Query a Notion database and return SourceFiles for each row."""
        return [
            SourceFile(
                name=_extract_page_title(page),
                id=page["id"],
                mime_type="text/markdown",
                modified_at=page.get("last_edited_time"),
            )
            for page in self._query_database_rows(database_id)
        ]

    def _list_from_pages(self, page_ids: List[str]) -> List[SourceFile]:
        """Fetch individual pages by ID and return SourceFiles.

        Best-effort: a page that fails to fetch is logged and skipped so
        the remaining pages are still returned.
        """
        files: List[SourceFile] = []
        for page_id in page_ids:
            try:
                resp = requests.get(
                    f"{NOTION_BASE_URL}/pages/{page_id}",
                    headers=self._headers(),
                    timeout=15,
                )
                resp.raise_for_status()
                page = resp.json()
                files.append(
                    SourceFile(
                        name=_extract_page_title(page),
                        id=page["id"],
                        mime_type="text/markdown",
                        modified_at=page.get("last_edited_time"),
                    )
                )
            except requests.RequestException as exc:
                logger.error("Failed to fetch page %s: %s", page_id, exc)
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download page blocks as markdown text and save to destination."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        blocks = self._fetch_all_blocks(file.id)
        text = self._blocks_to_text(blocks)

        # Prepend the title as an H1 so the saved file is self-describing.
        destination.write_text(f"# {file.name}\n\n{text}", encoding="utf-8")
        logger.info("Saved Notion page to %s", destination)
        return destination

    def _fetch_all_blocks(self, page_id: str) -> list:
        """Fetch all child blocks for a page, handling pagination."""
        blocks: list = []
        start_cursor: Optional[str] = None

        while True:
            # BUGFIX: pass the cursor via `params` so requests URL-encodes
            # it, instead of interpolating it into the query string by hand.
            params: Dict[str, str] = {"page_size": "100"}
            if start_cursor:
                params["start_cursor"] = start_cursor

            resp = requests.get(
                f"{NOTION_BASE_URL}/blocks/{page_id}/children",
                headers=self._headers(),
                params=params,
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()

            blocks.extend(data.get("results", []))
            if not data.get("has_more", False):
                break
            start_cursor = data.get("next_cursor")

        return blocks

    def _blocks_to_text(self, blocks: list) -> str:
        """Convert Notion block objects to markdown text.

        Consecutive numbered_list_item blocks are numbered 1., 2., ...;
        any other block type resets the counter.
        """
        lines: List[str] = []
        numbered_index = 0
        # Block types that map to a simple markdown line prefix.
        simple_prefixes = {
            "paragraph": "",
            "heading_1": "# ",
            "heading_2": "## ",
            "heading_3": "### ",
            "bulleted_list_item": "- ",
            "quote": "> ",
        }

        for block in blocks:
            block_type = block.get("type", "")
            block_data = block.get(block_type, {})
            text = _rich_text_to_str(block_data.get("rich_text", []))

            if block_type == "numbered_list_item":
                numbered_index += 1
                lines.append(f"{numbered_index}. {text}")
                continue
            # Any non-numbered block interrupts a numbered list.
            numbered_index = 0

            if block_type in simple_prefixes:
                lines.append(f"{simple_prefixes[block_type]}{text}")
            elif block_type == "to_do":
                marker = "[x]" if block_data.get("checked", False) else "[ ]"
                lines.append(f"- {marker} {text}")
            elif block_type == "code":
                language = block_data.get("language", "")
                lines.append(f"```{language}")
                lines.append(text)
                lines.append("```")
            elif block_type == "callout":
                icon = block_data.get("icon", {})
                emoji = icon.get("emoji", "") if icon else ""
                prefix = f"{emoji} " if emoji else ""
                lines.append(f"> {prefix}{text}")
            elif block_type == "toggle":
                lines.append(f"<details><summary>{text}</summary></details>")
            elif block_type == "divider":
                lines.append("---")
            else:
                # Unsupported block type — keep any extractable rich_text.
                if text:
                    lines.append(text)

        return "\n\n".join(lines)

    def fetch_database_as_table(self, database_id: str) -> str:
        """Fetch a Notion database and return its rows as CSV text.

        Each row is a page in the database. Columns are derived from
        the database properties, sorted by name.
        """
        # Local import: only this method needs CSV machinery.
        import csv
        import io

        # First, get database schema for the column set.
        resp = requests.get(
            f"{NOTION_BASE_URL}/databases/{database_id}",
            headers=self._headers(),
            timeout=15,
        )
        resp.raise_for_status()
        columns = sorted(resp.json().get("properties", {}).keys())

        rows = self._query_database_rows(database_id)

        # BUGFIX: use the csv module so values containing commas, quotes or
        # newlines are quoted correctly instead of corrupting the output.
        buffer = io.StringIO()
        writer = csv.writer(buffer, lineterminator="\n")
        writer.writerow(columns)
        for row in rows:
            row_props = row.get("properties", {})
            writer.writerow(
                [_extract_property_value(row_props.get(col, {})) for col in columns]
            )
        return buffer.getvalue().rstrip("\n")
322
323 def _rich_text_to_str(rich_text: list) -> str:
324 """Extract plain text from a Notion rich_text array."""
325 return "".join(item.get("plain_text", "") for item in rich_text)
326
327
328 def _extract_page_title(page: dict) -> str:
329 """Extract the title from a Notion page object."""
330 properties = page.get("properties", {})
331 for prop in properties.values():
332 if prop.get("type") == "title":
333 return _rich_text_to_str(prop.get("title", []))
334 return "Untitled"
335
336
337 def _extract_property_value(prop: dict) -> str:
338 """Extract a display string from a Notion property value."""
339 prop_type = prop.get("type", "")
340
341 if prop_type == "title":
342 return _rich_text_to_str(prop.get("title", []))
343 elif prop_type == "rich_text":
344 return _rich_text_to_str(prop.get("rich_text", []))
345 elif prop_type == "number":
346 val = prop.get("number")
347 return str(val) if val is not None else ""
348 elif prop_type == "select":
349 sel = prop.get("select")
350 return sel.get("name", "") if sel else ""
351 elif prop_type == "multi_select":
352 return "; ".join(s.get("name", "") for s in prop.get("multi_select", []))
353 elif prop_type == "date":
354 date = prop.get("date")
355 if date:
356 start = date.get("start", "")
357 end = date.get("end", "")
358 return f"{start} - {end}" if end else start
359 return ""
360 elif prop_type == "checkbox":
361 return str(prop.get("checkbox", False))
362 elif prop_type == "url":
363 return prop.get("url", "") or ""
364 elif prop_type == "email":
365 return prop.get("email", "") or ""
366 elif prop_type == "phone_number":
367 return prop.get("phone_number", "") or ""
368 elif prop_type == "status":
369 status = prop.get("status")
370 return status.get("name", "") if status else ""
371 elif prop_type == "people":
372 return "; ".join(p.get("name", "") for p in prop.get("people", []))
373 elif prop_type == "relation":
374 return "; ".join(r.get("id", "") for r in prop.get("relation", []))
375 elif prop_type == "formula":
376 formula = prop.get("formula", {})
377 f_type = formula.get("type", "")
378 return str(formula.get(f_type, ""))
379 else:
380 return ""
--- a/video_processor/sources/obsidian_source.py
+++ b/video_processor/sources/obsidian_source.py
@@ -0,0 +1,178 @@
1
+"""Obsidian vault source connector for ingesting markdown notes."""
2
+
3
+import logging
4
+import re
5
+import shutil
6
+from datetime import datetime, timezone
7
+from pathlib import Path
8
+from typing import List, Optional, Tuple
9
+
10
+from video_processor.sources.base import BaseSource, SourceFile
11
+
12
+logger = logging.getLogger(__name__)
13
+
14
+
15
+def parse_note(path: Path) -> dict:
16
+ """Parse an Obsidian markdown note and extract structured content.
17
+
18
+ Returns a dict with:
19
+ - frontmatter: dict of YAML frontmatter metadata
20
+ - links: list of linked page names from [[wiki-links]]
21
+ - tags: list of tags from #tag occurrences
22
+ - headings: list of dicts with level and text
23
+ - body: markdown text without frontmatter
24
+ """
25
+ text = path.read_text(encoding="utf-8")
26
+
27
+ # Extract YAML frontmatter (simple key: value parser, stdlib only)
28
+ frontmatter: dict = {}
29
+ body = text
30
+ fm_match = re.match(r"\A---\n(.*?\n)---\n?(.*)", text, re.DOTALL)
31
+ if fm_match:
32
+ fm_text = fm_match.group(1)
33
+ for line in fm_text.strip().splitlines():
34
+ kv = re.match(r"^([A-Za-z_][A-Za-z0-9_ -]*):\s*(.*)", line)
35
+ if kv:
36
+ key = kv.group(1).strip()
37
+ value = kv.group(2).strip()
38
+ # Strip surrounding quotes
39
+ if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
40
+ value = value[1:-1]
41
+ # Handle YAML-style lists on a single line [a, b, c]
42
+ list_match = re.match(r"^\[(.+)\]$", value)
43
+ if list_match:
44
+ value = [v.strip().strip("\"'") for v in list_match.group(1).split(",")]
45
+ frontmatter[key] = value
46
+ body = fm_match.group(2)
47
+
48
+ # Extract wiki-links: [[page]] and [[page|alias]]
49
+ link_pattern = re.compile(r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]")
50
+ links = link_pattern.findall(body)
51
+
52
+ # Extract tags: #tag (but not inside code blocks or frontmatter)
53
+ # Match #tag but not #[[tag]] (that's Logseq style) and not ## headings
54
+ tag_pattern = re.compile(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)")
55
+ tags = tag_pattern.findall(body)
56
+
57
+ # Extract headings hierarchy
58
+ heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
59
+ headings = [
60
+ {"level": len(m.group(1)), "text": m.group(2).strip()}
61
+ for m in heading_pattern.finditer(body)
62
+ ]
63
+
64
+ return {
65
+ "frontmatter": frontmatter,
66
+ "links": links,
67
+ "tags": tags,
68
+ "headings": headings,
69
+ "body": body,
70
+ }
71
+
72
+
73
+def ingest_vault(vault_path: Path) -> dict:
74
+ """Ingest an entire Obsidian vault and return structured data.
75
+
76
+ Returns a dict with:
77
+ - notes: list of dicts with name, tags, frontmatter, text
78
+ - links: list of (source, target) tuples from wiki-links
79
+ """
80
+ vault_path = Path(vault_path)
81
+ notes: List[dict] = []
82
+ links: List[Tuple[str, str]] = []
83
+
84
+ md_files = sorted(vault_path.rglob("*.md"))
85
+ logger.info("Found %d markdown files in vault %s", len(md_files), vault_path)
86
+
87
+ for md_file in md_files:
88
+ note_name = md_file.stem
89
+ try:
90
+ parsed = parse_note(md_file)
91
+ except Exception:
92
+ logger.warning("Failed to parse note %s", md_file)
93
+ continue
94
+
95
+ notes.append(
96
+ {
97
+ "name": note_name,
98
+ "tags": parsed["tags"],
99
+ "frontmatter": parsed["frontmatter"],
100
+ "text": parsed["body"],
101
+ }
102
+ )
103
+
104
+ for linked_page in parsed["links"]:
105
+ links.append((note_name, linked_page))
106
+
107
+ logger.info(
108
+ "Ingested %d notes with %d links from vault %s",
109
+ len(notes),
110
+ len(links),
111
+ vault_path,
112
+ )
113
+ return {"notes": notes, "links": links}
114
+
115
+
116
+class ObsidianSource(BaseSource):
117
+ """Source connector for Obsidian vaults."""
118
+
119
+ def __init__(self, vault_path: str) -> None:
120
+ self.vault_path = Path(vault_path)
121
+
122
+ def authenticate(self) -> bool:
123
+ """Check that the vault path exists and contains .md files."""
124
+ if not self.vault_path.is_dir():
125
+ logger.error("Vault path does not exist: %s", self.vault_path)
126
+ return False
127
+ md_files = list(self.vault_path.rglob("*.md"))
128
+ if not md_files:
129
+ logger.error("No markdown files found in vault: %s", self.vault_path)
130
+ return False
131
+ logger.info(
132
+ "Obsidian vault authenticated: %s (%d .md files)",
133
+ self.vault_path,
134
+ len(md_files),
135
+ )
136
+ return True
137
+
138
+ def list_videos(
139
+ self,
140
+ folder_id: Optional[str] = None,
141
+ folder_path: Optional[str] = None,
142
+ patterns: Optional[List[str]] = None,
143
+ ) -> List[SourceFile]:
144
+ """List all .md files in the vault recursively as SourceFile objects."""
145
+ search_root = self.vault_path
146
+ if folder_path:
147
+ search_root = self.vault_path / folder_path
148
+
149
+ md_files = sorted(search_root.rglob("*.md"))
150
+ results: List[SourceFile] = []
151
+
152
+ for md_file in md_files:
153
+ relative = md_file.relative_to(self.vault_path)
154
+ stat = md_file.stat()
155
+ modified_dt = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc)
156
+
157
+ results.append(
158
+ SourceFile(
159
+ name=md_file.name,
160
+ id=str(relative),
161
+ size_bytes=stat.st_size,
162
+ mime_type="text/markdown",
163
+ modified_at=modified_dt.isoformat(),
164
+ path=str(relative),
165
+ )
166
+ )
167
+
168
+ logger.info("Listed %d files from vault %s", len(results), self.vault_path)
169
+ return results
170
+
171
+ def download(self, file: SourceFile, destination: Path) -> Path:
172
+ """Copy a vault file to the destination path."""
173
+ source = self.vault_path / file.id
174
+ destination = Path(destination)
175
+ destination.parent.mkdir(parents=True, exist_ok=True)
176
+ shutil.copy2(source, destination)
177
+ logger.info("Copied %s -> %s", source, destination)
178
+ return destination
--- a/video_processor/sources/obsidian_source.py
+++ b/video_processor/sources/obsidian_source.py
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/obsidian_source.py
+++ b/video_processor/sources/obsidian_source.py
@@ -0,0 +1,178 @@
1 """Obsidian vault source connector for ingesting markdown notes."""
2
3 import logging
4 import re
5 import shutil
6 from datetime import datetime, timezone
7 from pathlib import Path
8 from typing import List, Optional, Tuple
9
10 from video_processor.sources.base import BaseSource, SourceFile
11
12 logger = logging.getLogger(__name__)
13
14
def parse_note(path: Path) -> dict:
    """Parse an Obsidian markdown note and extract structured content.

    Returns a dict with:
    - frontmatter: dict of YAML frontmatter metadata
    - links: list of linked page names from [[wiki-links]]
    - tags: list of tags from #tag occurrences
    - headings: list of dicts with level and text
    - body: markdown text without frontmatter
    """
    raw = path.read_text(encoding="utf-8")

    # Minimal YAML frontmatter handling (key: value pairs, stdlib only).
    frontmatter: dict = {}
    body = raw
    fm = re.match(r"\A---\n(.*?\n)---\n?(.*)", raw, re.DOTALL)
    if fm:
        body = fm.group(2)
        for fm_line in fm.group(1).strip().splitlines():
            pair = re.match(r"^([A-Za-z_][A-Za-z0-9_ -]*):\s*(.*)", fm_line)
            if not pair:
                continue
            key = pair.group(1).strip()
            value = pair.group(2).strip()
            # Drop matching surrounding quotes.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            # Single-line YAML lists: [a, b, c] -> ["a", "b", "c"].
            as_list = re.match(r"^\[(.+)\]$", value)
            if as_list:
                value = [item.strip().strip("\"'") for item in as_list.group(1).split(",")]
            frontmatter[key] = value

    # Wiki-links: [[page]] and [[page|alias]] (alias discarded).
    links = re.findall(r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]", body)

    # Tags: #tag occurrences; the lookbehind avoids matching mid-word,
    # and requiring a leading letter skips ## headings.
    tags = re.findall(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)", body)

    # Headings with their nesting level (1-6).
    headings = [
        {"level": len(hashes), "text": title.strip()}
        for hashes, title in re.findall(r"^(#{1,6})\s+(.+)$", body, re.MULTILINE)
    ]

    return {
        "frontmatter": frontmatter,
        "links": links,
        "tags": tags,
        "headings": headings,
        "body": body,
    }
71
72
def ingest_vault(vault_path: Path) -> dict:
    """Ingest an entire Obsidian vault and return structured data.

    Returns a dict with:
    - notes: list of dicts with name, tags, frontmatter, text
    - links: list of (source, target) tuples from wiki-links
    """
    root = Path(vault_path)
    notes: List[dict] = []
    links: List[Tuple[str, str]] = []

    markdown_paths = sorted(root.rglob("*.md"))
    logger.info("Found %d markdown files in vault %s", len(markdown_paths), root)

    for note_path in markdown_paths:
        name = note_path.stem
        # Best-effort: a malformed note is logged and skipped, not fatal.
        try:
            parsed = parse_note(note_path)
        except Exception:
            logger.warning("Failed to parse note %s", note_path)
            continue

        notes.append(
            {
                "name": name,
                "tags": parsed["tags"],
                "frontmatter": parsed["frontmatter"],
                "text": parsed["body"],
            }
        )
        links.extend((name, target) for target in parsed["links"])

    logger.info(
        "Ingested %d notes with %d links from vault %s",
        len(notes),
        len(links),
        root,
    )
    return {"notes": notes, "links": links}
114
115
class ObsidianSource(BaseSource):
    """Source connector for Obsidian vaults.

    Every markdown file in the vault is a source document; file ids are
    vault-relative paths.
    """

    def __init__(self, vault_path: str) -> None:
        self.vault_path = Path(vault_path)

    def authenticate(self) -> bool:
        """Check that the vault path exists and contains .md files."""
        if not self.vault_path.is_dir():
            logger.error("Vault path does not exist: %s", self.vault_path)
            return False
        md_files = list(self.vault_path.rglob("*.md"))
        if not md_files:
            logger.error("No markdown files found in vault: %s", self.vault_path)
            return False
        logger.info(
            "Obsidian vault authenticated: %s (%d .md files)",
            self.vault_path,
            len(md_files),
        )
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List all .md files in the vault recursively as SourceFile objects.

        Args:
            folder_id: Unused; kept for interface compatibility.
            folder_path: Optional subdirectory of the vault to search.
            patterns: Optional fnmatch-style filename patterns. When given,
                only files whose names match at least one pattern are listed.
        """
        # Local import: only needed when a patterns filter is supplied.
        import fnmatch

        search_root = self.vault_path / folder_path if folder_path else self.vault_path

        results: List[SourceFile] = []
        for md_file in sorted(search_root.rglob("*.md")):
            # BUGFIX: `patterns` was accepted but silently ignored; it now
            # filters filenames (None keeps the old list-everything behavior).
            if patterns and not any(
                fnmatch.fnmatch(md_file.name, pat) for pat in patterns
            ):
                continue

            relative = md_file.relative_to(self.vault_path)
            stat = md_file.stat()
            modified_dt = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc)

            results.append(
                SourceFile(
                    name=md_file.name,
                    id=str(relative),
                    size_bytes=stat.st_size,
                    mime_type="text/markdown",
                    modified_at=modified_dt.isoformat(),
                    path=str(relative),
                )
            )

        logger.info("Listed %d files from vault %s", len(results), self.vault_path)
        return results

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Copy a vault file to the destination path."""
        source = self.vault_path / file.id
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source, destination)
        logger.info("Copied %s -> %s", source, destination)
        return destination
--- a/video_processor/sources/onenote_source.py
+++ b/video_processor/sources/onenote_source.py
@@ -0,0 +1,222 @@
1
+"""Microsoft OneNote source connector using the m365 CLI (cli-microsoft365).
2
+
3
+Fetches pages from OneNote notebooks via the `m365` CLI tool.
4
+Outputs plain text suitable for KG ingestion.
5
+
6
+Requires: npm install -g @pnp/cli-microsoft365
7
+Auth: m365 login (interactive)
8
+Docs: https://pnp.github.io/cli-microsoft365/
9
+"""
10
+
11
+import json
12
+import logging
13
+import re
14
+import shutil
15
+import subprocess
16
+from pathlib import Path
17
+from typing import Any, List, Optional
18
+
19
+from video_processor.sources.base import BaseSource, SourceFile
20
+
21
+logger = logging.getLogger(__name__)
22
+
23
+
24
+def _run_m365(args: List[str], timeout: int = 30) -> Any:
25
+ """Run an m365 CLI command and return parsed JSON output."""
26
+ cmd = ["m365"] + args + ["--output", "json"]
27
+ proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
28
+ if proc.returncode != 0:
29
+ raise RuntimeError(f"m365 {' '.join(args)} failed: {proc.stderr.strip()}")
30
+ try:
31
+ return json.loads(proc.stdout)
32
+ except json.JSONDecodeError:
33
+ return proc.stdout.strip()
34
+
35
+
36
+def _html_to_text(html: str) -> str:
37
+ """Strip HTML tags and decode entities to produce plain text.
38
+
39
+ Uses only stdlib ``re`` — no external dependencies.
40
+ """
41
+ # Remove script/style blocks entirely
42
+ text = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
43
+ # Replace <br>, <p>, <div>, <li>, <tr> with newlines for readability
44
+ text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
45
+ text = re.sub(r"</(p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE)
46
+ # Strip remaining tags
47
+ text = re.sub(r"<[^>]+>", "", text)
48
+ # Decode common HTML entities
49
+ entity_map = {
50
+ "&amp;": "&",
51
+ "&lt;": "<",
52
+ "&gt;": ">",
53
+ "&quot;": '"',
54
+ "&#39;": "'",
55
+ "&apos;": "'",
56
+ "&nbsp;": " ",
57
+ }
58
+ for entity, char in entity_map.items():
59
+ text = text.replace(entity, char)
60
+ # Decode numeric entities (&#123; and &#x1a;)
61
+ text = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), text)
62
+ text = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), text)
63
+ # Collapse excessive blank lines
64
+ text = re.sub(r"\n{3,}", "\n\n", text)
65
+ return text.strip()
66
+
67
+
68
class OneNoteSource(BaseSource):
    """
    Fetch pages from OneNote notebooks via the m365 CLI.

    Usage:
        source = OneNoteSource()  # all notebooks
        source = OneNoteSource(notebook_name="Work Notes")  # specific notebook
        source = OneNoteSource(notebook_name="Work", section_name="Meetings")
        files = source.list_videos()
        source.download_all(files, Path("./notes"))
    """

    def __init__(
        self,
        notebook_name: Optional[str] = None,
        section_name: Optional[str] = None,
    ):
        # Both filters are case-insensitive substring matches applied in
        # list_videos(); None means "no filtering".
        self.notebook_name = notebook_name
        self.section_name = section_name

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in."""
        if not shutil.which("m365"):
            logger.error(
                "m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365"
            )
            return False
        try:
            result = _run_m365(["status"], timeout=10)
            # JSON output exposes connectedAs; plain text says "Logged in".
            if isinstance(result, dict) and result.get("connectedAs"):
                return True
            if isinstance(result, str) and "Logged in" in result:
                return True
            logger.error("m365 not logged in. Run: m365 login")
            return False
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List OneNote pages across notebooks/sections. Returns SourceFile per page.

        Walks notebook -> section -> page with three m365 list calls,
        applying the optional notebook_name/section_name filters at each
        level. Listing failures at any level are logged and skipped.
        """
        files: List[SourceFile] = []

        # Step 1: List notebooks
        try:
            notebooks = _run_m365(["onenote", "notebook", "list"], timeout=60)
        except RuntimeError as e:
            # CONSISTENCY: lazy %-style logging args, matching the other
            # source connectors (defers formatting until the record is
            # actually emitted).
            logger.error("Failed to list OneNote notebooks: %s", e)
            return []

        if not isinstance(notebooks, list):
            notebooks = []

        # Filter notebooks by name if specified
        if self.notebook_name:
            needle = self.notebook_name.lower()
            notebooks = [
                nb for nb in notebooks if needle in nb.get("displayName", "").lower()
            ]

        for notebook in notebooks:
            notebook_id = notebook.get("id", "")
            notebook_name = notebook.get("displayName", "Untitled Notebook")

            # Step 2: List sections in this notebook
            try:
                sections = _run_m365(
                    ["onenote", "section", "list", "--notebookId", notebook_id],
                    timeout=60,
                )
            except RuntimeError as e:
                logger.warning(
                    "Failed to list sections for notebook '%s': %s", notebook_name, e
                )
                continue

            if not isinstance(sections, list):
                sections = []

            # Filter sections by name if specified
            if self.section_name:
                needle = self.section_name.lower()
                sections = [
                    s for s in sections if needle in s.get("displayName", "").lower()
                ]

            for section in sections:
                section_id = section.get("id", "")
                section_name = section.get("displayName", "Untitled Section")

                # Step 3: List pages in this section
                try:
                    pages = _run_m365(
                        ["onenote", "page", "list", "--sectionId", section_id],
                        timeout=60,
                    )
                except RuntimeError as e:
                    logger.warning(
                        "Failed to list pages in section '%s': %s", section_name, e
                    )
                    continue

                if not isinstance(pages, list):
                    pages = []

                for page in pages:
                    page_id = page.get("id", "")
                    title = page.get("title", "Untitled Page").strip() or "Untitled Page"
                    modified = page.get("lastModifiedDateTime")
                    # Path gives organizational context: Notebook/Section/Title.
                    page_path = f"{notebook_name}/{section_name}/{title}"

                    files.append(
                        SourceFile(
                            name=title,
                            id=str(page_id),
                            size_bytes=None,
                            mime_type="text/html",
                            modified_at=modified,
                            path=page_path,
                        )
                    )

        logger.info("Found %d page(s) in OneNote", len(files))
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a OneNote page's content as a text file."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = _run_m365(
                ["onenote", "page", "get", "--id", file.id],
                timeout=60,
            )
        except RuntimeError as e:
            raise RuntimeError(f"Failed to fetch OneNote page {file.id}: {e}") from e

        # Extract HTML content: the CLI may return either a dict carrying
        # the HTML (content / body.content) or the HTML string itself.
        if isinstance(result, dict):
            html = result.get("content", result.get("body", {}).get("content", ""))
            if not html:
                # Fallback: serialize the whole response so nothing is lost
                html = json.dumps(result, indent=2)
        elif isinstance(result, str):
            html = result
        else:
            html = str(result)

        text = _html_to_text(html)
        destination.write_text(text, encoding="utf-8")
        logger.info("Saved page '%s' to %s", file.name, destination)
        return destination
--- a/video_processor/sources/onenote_source.py
+++ b/video_processor/sources/onenote_source.py
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/onenote_source.py
+++ b/video_processor/sources/onenote_source.py
@@ -0,0 +1,222 @@
1 """Microsoft OneNote source connector using the m365 CLI (cli-microsoft365).
2
3 Fetches pages from OneNote notebooks via the `m365` CLI tool.
4 Outputs plain text suitable for KG ingestion.
5
6 Requires: npm install -g @pnp/cli-microsoft365
7 Auth: m365 login (interactive)
8 Docs: https://pnp.github.io/cli-microsoft365/
9 """
10
import json
import logging
import re
import shutil
import subprocess
from html import unescape
from pathlib import Path
from typing import Any, List, Optional

from video_processor.sources.base import BaseSource, SourceFile
20
21 logger = logging.getLogger(__name__)
22
23
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Invoke the ``m365`` CLI and return its decoded output.

    Appends ``--output json`` so the CLI emits machine-readable output.
    If stdout is not valid JSON, the raw (stripped) text is returned
    instead.

    Args:
        args: CLI arguments after the ``m365`` executable name.
        timeout: Seconds before ``subprocess.TimeoutExpired`` is raised.

    Raises:
        RuntimeError: if the CLI exits with a non-zero status.
    """
    completed = subprocess.run(
        ["m365", *args, "--output", "json"],
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if completed.returncode:
        joined = " ".join(args)
        raise RuntimeError(f"m365 {joined} failed: {completed.stderr.strip()}")
    try:
        return json.loads(completed.stdout)
    except json.JSONDecodeError:
        return completed.stdout.strip()
34
35
36 def _html_to_text(html: str) -> str:
37 """Strip HTML tags and decode entities to produce plain text.
38
39 Uses only stdlib ``re`` — no external dependencies.
40 """
41 # Remove script/style blocks entirely
42 text = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
43 # Replace <br>, <p>, <div>, <li>, <tr> with newlines for readability
44 text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
45 text = re.sub(r"</(p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE)
46 # Strip remaining tags
47 text = re.sub(r"<[^>]+>", "", text)
48 # Decode common HTML entities
49 entity_map = {
50 "&amp;": "&",
51 "&lt;": "<",
52 "&gt;": ">",
53 "&quot;": '"',
54 "&#39;": "'",
55 "&apos;": "'",
56 "&nbsp;": " ",
57 }
58 for entity, char in entity_map.items():
59 text = text.replace(entity, char)
60 # Decode numeric entities (&#123; and &#x1a;)
61 text = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), text)
62 text = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), text)
63 # Collapse excessive blank lines
64 text = re.sub(r"\n{3,}", "\n\n", text)
65 return text.strip()
66
67
class OneNoteSource(BaseSource):
    """
    Fetch pages from OneNote notebooks via the m365 CLI.

    Usage:
        source = OneNoteSource()  # all notebooks
        source = OneNoteSource(notebook_name="Work Notes")  # specific notebook
        source = OneNoteSource(notebook_name="Work", section_name="Meetings")
        files = source.list_videos()
        source.download_all(files, Path("./notes"))
    """

    def __init__(
        self,
        notebook_name: Optional[str] = None,
        section_name: Optional[str] = None,
    ):
        # Case-insensitive substring filters; None means "no filtering".
        self.notebook_name = notebook_name
        self.section_name = section_name

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in."""
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
            # JSON output exposes "connectedAs"; plain-text output says "Logged in".
            if isinstance(result, dict) and result.get("connectedAs"):
                return True
            if isinstance(result, str) and "Logged in" in result:
                return True
            logger.error("m365 not logged in. Run: m365 login")
            return False
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List OneNote pages across notebooks/sections. Returns SourceFile per page.

        The ``folder_id``/``folder_path``/``patterns`` parameters belong to the
        BaseSource interface and are not used by this connector.
        """
        files: List[SourceFile] = []

        # Step 1: List notebooks
        try:
            notebooks = _run_m365(["onenote", "notebook", "list"], timeout=60)
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            # TimeoutExpired previously escaped here and crashed the caller;
            # treat it the same as any other CLI failure.
            logger.error(f"Failed to list OneNote notebooks: {e}")
            return []

        if not isinstance(notebooks, list):
            notebooks = []

        # Filter notebooks by name if specified. "or ''" guards against a
        # JSON null displayName (key present, value None).
        if self.notebook_name:
            notebooks = [
                nb
                for nb in notebooks
                if self.notebook_name.lower() in (nb.get("displayName") or "").lower()
            ]

        for notebook in notebooks:
            notebook_id = notebook.get("id", "")
            notebook_name = notebook.get("displayName") or "Untitled Notebook"

            # Step 2: List sections in this notebook
            try:
                sections = _run_m365(
                    ["onenote", "section", "list", "--notebookId", notebook_id],
                    timeout=60,
                )
            except (RuntimeError, subprocess.TimeoutExpired) as e:
                logger.warning(f"Failed to list sections for notebook '{notebook_name}': {e}")
                continue

            if not isinstance(sections, list):
                sections = []

            # Filter sections by name if specified
            if self.section_name:
                sections = [
                    s
                    for s in sections
                    if self.section_name.lower() in (s.get("displayName") or "").lower()
                ]

            for section in sections:
                section_id = section.get("id", "")
                section_name = section.get("displayName") or "Untitled Section"

                # Step 3: List pages in this section
                try:
                    pages = _run_m365(
                        ["onenote", "page", "list", "--sectionId", section_id],
                        timeout=60,
                    )
                except (RuntimeError, subprocess.TimeoutExpired) as e:
                    logger.warning(f"Failed to list pages in section '{section_name}': {e}")
                    continue

                if not isinstance(pages, list):
                    pages = []

                for page in pages:
                    page_id = page.get("id", "")
                    # The API may return a null title; .strip() on None would
                    # raise AttributeError, so coalesce before stripping.
                    title = (page.get("title") or "").strip() or "Untitled Page"
                    modified = page.get("lastModifiedDateTime")
                    # Build a path for organizational context
                    page_path = f"{notebook_name}/{section_name}/{title}"

                    files.append(
                        SourceFile(
                            name=title,
                            id=str(page_id),
                            size_bytes=None,
                            mime_type="text/html",
                            modified_at=modified,
                            path=page_path,
                        )
                    )

        logger.info(f"Found {len(files)} page(s) in OneNote")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a OneNote page's content as a text file.

        Args:
            file: SourceFile describing the page (``id`` is the page id).
            destination: Target file path; parent directories are created.

        Returns:
            The destination path the plain-text content was written to.

        Raises:
            RuntimeError: if the page cannot be fetched (including timeout).
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = _run_m365(
                ["onenote", "page", "get", "--id", file.id],
                timeout=60,
            )
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            # Wrap timeouts too, so callers only ever see RuntimeError here.
            raise RuntimeError(f"Failed to fetch OneNote page {file.id}: {e}") from e

        # Extract HTML content from the result
        if isinstance(result, dict):
            html = result.get("content", result.get("body", {}).get("content", ""))
            if not html:
                # Fallback: serialize the whole response
                html = json.dumps(result, indent=2)
        elif isinstance(result, str):
            html = result
        else:
            html = str(result)

        text = _html_to_text(html)
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved page '{file.name}' to {destination}")
        return destination

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button