PlanOpticon

feat: add notes connectors (in/out), wiki generator, and meeting sources Input connectors: - ObsidianSource: vault ingester with frontmatter, wiki-links, tags parsing - LogseqSource: graph ingester with page properties, block refs - NotionSource: API connector for pages and databases - AppleNotesSource: macOS osascript connector - GoogleKeepSource: via gws CLI with checklist support - OneNoteSource: via m365 CLI with notebook/section/page hierarchy Output skills: - WikiGeneratorSkill: generates GitHub wiki from KG (Home, Sidebar, entity pages) - NotesExportSkill: exports KG as Obsidian vault or Notion-compatible markdown CLI commands: - planopticon wiki {generate,push} 52 new tests (728 total)

lmata 2026-03-07 23:12 trunk
Commit 855b7fbcf90ec4416cfdeeb72445dd82e03b6c6af67baf07a5d658ae5fd2658d
--- tests/test_agent_skills.py
+++ tests/test_agent_skills.py
@@ -401,5 +401,227 @@
401401
402402
skill = ProjectPlanSkill()
403403
ctx = _make_context()
404404
ctx.provider_manager = None
405405
assert skill.can_execute(ctx) is False
406
+
407
+
408
+# ---------------------------------------------------------------------------
409
+# WikiGeneratorSkill
410
+# ---------------------------------------------------------------------------
411
+
412
+
413
+class TestWikiGeneratorSkill:
414
+ def _sample_kg_data(self):
415
+ return {
416
+ "nodes": [
417
+ {
418
+ "name": "Python",
419
+ "type": "technology",
420
+ "descriptions": ["A programming language"],
421
+ },
422
+ {
423
+ "name": "Alice",
424
+ "type": "person",
425
+ "descriptions": ["Lead developer"],
426
+ },
427
+ {
428
+ "name": "FastAPI",
429
+ "type": "technology",
430
+ "descriptions": ["Web framework"],
431
+ },
432
+ ],
433
+ "relationships": [
434
+ {"source": "Alice", "target": "Python", "type": "uses"},
435
+ {"source": "FastAPI", "target": "Python", "type": "built_with"},
436
+ ],
437
+ }
438
+
439
+ def test_generate_wiki(self):
440
+ from video_processor.agent.skills.wiki_generator import generate_wiki
441
+
442
+ pages = generate_wiki(self._sample_kg_data(), title="Test Wiki")
443
+
444
+ assert "Home" in pages
445
+ assert "_Sidebar" in pages
446
+ assert "Test Wiki" in pages["Home"]
447
+ assert "3" in pages["Home"] # 3 entities
448
+ assert "2" in pages["Home"] # 2 relationships
449
+
450
+ # Entity pages should exist
451
+ assert "Python" in pages
452
+ assert "Alice" in pages
453
+ assert "FastAPI" in pages
454
+
455
+ # Type index pages should exist
456
+ assert "Technology" in pages
457
+ assert "Person" in pages
458
+
459
+ # Alice's page should reference Python
460
+ assert "Python" in pages["Alice"]
461
+ assert "uses" in pages["Alice"]
462
+
463
+ def test_generate_wiki_with_artifacts(self):
464
+ from video_processor.agent.skills.wiki_generator import generate_wiki
465
+
466
+ art = Artifact(
467
+ name="Project Plan",
468
+ content="# Plan\n\nDo the thing.",
469
+ artifact_type="project_plan",
470
+ format="markdown",
471
+ )
472
+ pages = generate_wiki(self._sample_kg_data(), artifacts=[art])
473
+
474
+ assert "Project-Plan" in pages
475
+ assert "Do the thing." in pages["Project-Plan"]
476
+ assert "Planning Artifacts" in pages["Home"]
477
+
478
+ def test_write_wiki(self, tmp_path):
479
+ from video_processor.agent.skills.wiki_generator import write_wiki
480
+
481
+ pages = {
482
+ "Home": "# Home\n\nWelcome.",
483
+ "Page-One": "# Page One\n\nContent.",
484
+ }
485
+ paths = write_wiki(pages, tmp_path / "wiki")
486
+
487
+ assert len(paths) == 2
488
+ assert (tmp_path / "wiki" / "Home.md").exists()
489
+ assert (tmp_path / "wiki" / "Page-One.md").exists()
490
+ assert "Welcome." in (tmp_path / "wiki" / "Home.md").read_text()
491
+
492
+ def test_sanitize_filename(self):
493
+ from video_processor.agent.skills.wiki_generator import _sanitize_filename
494
+
495
+ assert _sanitize_filename("Hello World") == "Hello-World"
496
+ assert _sanitize_filename("path/to\\file") == "path-to-file"
497
+ assert _sanitize_filename("version.2") == "version-2"
498
+
499
+ def test_wiki_link(self):
500
+ from video_processor.agent.skills.wiki_generator import _wiki_link
501
+
502
+ result = _wiki_link("My Page")
503
+ assert result == "[My Page](My-Page)"
504
+
505
+ result = _wiki_link("Simple")
506
+ assert result == "[Simple](Simple)"
507
+
508
+
509
+# ---------------------------------------------------------------------------
510
+# NotesExportSkill
511
+# ---------------------------------------------------------------------------
512
+
513
+
514
+class TestNotesExportSkill:
515
+ def _sample_kg_data(self):
516
+ return {
517
+ "nodes": [
518
+ {
519
+ "name": "Python",
520
+ "type": "technology",
521
+ "descriptions": ["A programming language"],
522
+ },
523
+ {
524
+ "name": "Alice",
525
+ "type": "person",
526
+ "descriptions": ["Lead developer"],
527
+ },
528
+ ],
529
+ "relationships": [
530
+ {"source": "Alice", "target": "Python", "type": "uses"},
531
+ ],
532
+ }
533
+
534
+ def test_export_to_obsidian(self, tmp_path):
535
+ from video_processor.agent.skills.notes_export import export_to_obsidian
536
+
537
+ output_dir = tmp_path / "obsidian_vault"
538
+ export_to_obsidian(self._sample_kg_data(), output_dir)
539
+
540
+ assert output_dir.is_dir()
541
+
542
+ # Check entity files exist
543
+ python_file = output_dir / "Python.md"
544
+ alice_file = output_dir / "Alice.md"
545
+ assert python_file.exists()
546
+ assert alice_file.exists()
547
+
548
+ # Check frontmatter in entity file
549
+ python_content = python_file.read_text()
550
+ assert "---" in python_content
551
+ assert "type: technology" in python_content
552
+ assert "# Python" in python_content
553
+
554
+ # Check wiki-links in Alice file
555
+ alice_content = alice_file.read_text()
556
+ assert "[[Python]]" in alice_content
557
+ assert "uses" in alice_content
558
+
559
+ # Check index file
560
+ index_file = output_dir / "_Index.md"
561
+ assert index_file.exists()
562
+ index_content = index_file.read_text()
563
+ assert "[[Python]]" in index_content
564
+ assert "[[Alice]]" in index_content
565
+
566
+ def test_export_to_obsidian_with_artifacts(self, tmp_path):
567
+ from video_processor.agent.skills.notes_export import export_to_obsidian
568
+
569
+ art = Artifact(
570
+ name="Test Plan",
571
+ content="# Plan\n\nSteps here.",
572
+ artifact_type="project_plan",
573
+ format="markdown",
574
+ )
575
+ output_dir = tmp_path / "obsidian_arts"
576
+ export_to_obsidian(self._sample_kg_data(), output_dir, artifacts=[art])
577
+
578
+ art_file = output_dir / "Test Plan.md"
579
+ assert art_file.exists()
580
+ art_content = art_file.read_text()
581
+ assert "artifact" in art_content
582
+ assert "Steps here." in art_content
583
+
584
+ def test_export_to_notion_md(self, tmp_path):
585
+ from video_processor.agent.skills.notes_export import export_to_notion_md
586
+
587
+ output_dir = tmp_path / "notion_export"
588
+ export_to_notion_md(self._sample_kg_data(), output_dir)
589
+
590
+ assert output_dir.is_dir()
591
+
592
+ # Check CSV database file
593
+ csv_file = output_dir / "entities_database.csv"
594
+ assert csv_file.exists()
595
+ csv_content = csv_file.read_text()
596
+ assert "Name" in csv_content
597
+ assert "Type" in csv_content
598
+ assert "Python" in csv_content
599
+ assert "Alice" in csv_content
600
+
601
+ # Check entity markdown files
602
+ python_file = output_dir / "Python.md"
603
+ assert python_file.exists()
604
+ python_content = python_file.read_text()
605
+ assert "# Python" in python_content
606
+ assert "technology" in python_content
607
+
608
+ # Check overview file
609
+ overview_file = output_dir / "Overview.md"
610
+ assert overview_file.exists()
611
+
612
+ def test_export_to_notion_md_with_artifacts(self, tmp_path):
613
+ from video_processor.agent.skills.notes_export import export_to_notion_md
614
+
615
+ art = Artifact(
616
+ name="Roadmap",
617
+ content="# Roadmap\n\nQ1 goals.",
618
+ artifact_type="roadmap",
619
+ format="markdown",
620
+ )
621
+ output_dir = tmp_path / "notion_arts"
622
+ export_to_notion_md(self._sample_kg_data(), output_dir, artifacts=[art])
623
+
624
+ art_file = output_dir / "Roadmap.md"
625
+ assert art_file.exists()
626
+ art_content = art_file.read_text()
627
+ assert "Q1 goals." in art_content
406628
--- tests/test_agent_skills.py
+++ tests/test_agent_skills.py
@@ -401,5 +401,227 @@
401
402 skill = ProjectPlanSkill()
403 ctx = _make_context()
404 ctx.provider_manager = None
405 assert skill.can_execute(ctx) is False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
--- tests/test_agent_skills.py
+++ tests/test_agent_skills.py
@@ -401,5 +401,227 @@
401
402 skill = ProjectPlanSkill()
403 ctx = _make_context()
404 ctx.provider_manager = None
405 assert skill.can_execute(ctx) is False
406
407
408 # ---------------------------------------------------------------------------
409 # WikiGeneratorSkill
410 # ---------------------------------------------------------------------------
411
412
413 class TestWikiGeneratorSkill:
414 def _sample_kg_data(self):
415 return {
416 "nodes": [
417 {
418 "name": "Python",
419 "type": "technology",
420 "descriptions": ["A programming language"],
421 },
422 {
423 "name": "Alice",
424 "type": "person",
425 "descriptions": ["Lead developer"],
426 },
427 {
428 "name": "FastAPI",
429 "type": "technology",
430 "descriptions": ["Web framework"],
431 },
432 ],
433 "relationships": [
434 {"source": "Alice", "target": "Python", "type": "uses"},
435 {"source": "FastAPI", "target": "Python", "type": "built_with"},
436 ],
437 }
438
439 def test_generate_wiki(self):
440 from video_processor.agent.skills.wiki_generator import generate_wiki
441
442 pages = generate_wiki(self._sample_kg_data(), title="Test Wiki")
443
444 assert "Home" in pages
445 assert "_Sidebar" in pages
446 assert "Test Wiki" in pages["Home"]
447 assert "3" in pages["Home"] # 3 entities
448 assert "2" in pages["Home"] # 2 relationships
449
450 # Entity pages should exist
451 assert "Python" in pages
452 assert "Alice" in pages
453 assert "FastAPI" in pages
454
455 # Type index pages should exist
456 assert "Technology" in pages
457 assert "Person" in pages
458
459 # Alice's page should reference Python
460 assert "Python" in pages["Alice"]
461 assert "uses" in pages["Alice"]
462
463 def test_generate_wiki_with_artifacts(self):
464 from video_processor.agent.skills.wiki_generator import generate_wiki
465
466 art = Artifact(
467 name="Project Plan",
468 content="# Plan\n\nDo the thing.",
469 artifact_type="project_plan",
470 format="markdown",
471 )
472 pages = generate_wiki(self._sample_kg_data(), artifacts=[art])
473
474 assert "Project-Plan" in pages
475 assert "Do the thing." in pages["Project-Plan"]
476 assert "Planning Artifacts" in pages["Home"]
477
478 def test_write_wiki(self, tmp_path):
479 from video_processor.agent.skills.wiki_generator import write_wiki
480
481 pages = {
482 "Home": "# Home\n\nWelcome.",
483 "Page-One": "# Page One\n\nContent.",
484 }
485 paths = write_wiki(pages, tmp_path / "wiki")
486
487 assert len(paths) == 2
488 assert (tmp_path / "wiki" / "Home.md").exists()
489 assert (tmp_path / "wiki" / "Page-One.md").exists()
490 assert "Welcome." in (tmp_path / "wiki" / "Home.md").read_text()
491
492 def test_sanitize_filename(self):
493 from video_processor.agent.skills.wiki_generator import _sanitize_filename
494
495 assert _sanitize_filename("Hello World") == "Hello-World"
496 assert _sanitize_filename("path/to\\file") == "path-to-file"
497 assert _sanitize_filename("version.2") == "version-2"
498
499 def test_wiki_link(self):
500 from video_processor.agent.skills.wiki_generator import _wiki_link
501
502 result = _wiki_link("My Page")
503 assert result == "[My Page](My-Page)"
504
505 result = _wiki_link("Simple")
506 assert result == "[Simple](Simple)"
507
508
509 # ---------------------------------------------------------------------------
510 # NotesExportSkill
511 # ---------------------------------------------------------------------------
512
513
514 class TestNotesExportSkill:
515 def _sample_kg_data(self):
516 return {
517 "nodes": [
518 {
519 "name": "Python",
520 "type": "technology",
521 "descriptions": ["A programming language"],
522 },
523 {
524 "name": "Alice",
525 "type": "person",
526 "descriptions": ["Lead developer"],
527 },
528 ],
529 "relationships": [
530 {"source": "Alice", "target": "Python", "type": "uses"},
531 ],
532 }
533
534 def test_export_to_obsidian(self, tmp_path):
535 from video_processor.agent.skills.notes_export import export_to_obsidian
536
537 output_dir = tmp_path / "obsidian_vault"
538 export_to_obsidian(self._sample_kg_data(), output_dir)
539
540 assert output_dir.is_dir()
541
542 # Check entity files exist
543 python_file = output_dir / "Python.md"
544 alice_file = output_dir / "Alice.md"
545 assert python_file.exists()
546 assert alice_file.exists()
547
548 # Check frontmatter in entity file
549 python_content = python_file.read_text()
550 assert "---" in python_content
551 assert "type: technology" in python_content
552 assert "# Python" in python_content
553
554 # Check wiki-links in Alice file
555 alice_content = alice_file.read_text()
556 assert "[[Python]]" in alice_content
557 assert "uses" in alice_content
558
559 # Check index file
560 index_file = output_dir / "_Index.md"
561 assert index_file.exists()
562 index_content = index_file.read_text()
563 assert "[[Python]]" in index_content
564 assert "[[Alice]]" in index_content
565
566 def test_export_to_obsidian_with_artifacts(self, tmp_path):
567 from video_processor.agent.skills.notes_export import export_to_obsidian
568
569 art = Artifact(
570 name="Test Plan",
571 content="# Plan\n\nSteps here.",
572 artifact_type="project_plan",
573 format="markdown",
574 )
575 output_dir = tmp_path / "obsidian_arts"
576 export_to_obsidian(self._sample_kg_data(), output_dir, artifacts=[art])
577
578 art_file = output_dir / "Test Plan.md"
579 assert art_file.exists()
580 art_content = art_file.read_text()
581 assert "artifact" in art_content
582 assert "Steps here." in art_content
583
584 def test_export_to_notion_md(self, tmp_path):
585 from video_processor.agent.skills.notes_export import export_to_notion_md
586
587 output_dir = tmp_path / "notion_export"
588 export_to_notion_md(self._sample_kg_data(), output_dir)
589
590 assert output_dir.is_dir()
591
592 # Check CSV database file
593 csv_file = output_dir / "entities_database.csv"
594 assert csv_file.exists()
595 csv_content = csv_file.read_text()
596 assert "Name" in csv_content
597 assert "Type" in csv_content
598 assert "Python" in csv_content
599 assert "Alice" in csv_content
600
601 # Check entity markdown files
602 python_file = output_dir / "Python.md"
603 assert python_file.exists()
604 python_content = python_file.read_text()
605 assert "# Python" in python_content
606 assert "technology" in python_content
607
608 # Check overview file
609 overview_file = output_dir / "Overview.md"
610 assert overview_file.exists()
611
612 def test_export_to_notion_md_with_artifacts(self, tmp_path):
613 from video_processor.agent.skills.notes_export import export_to_notion_md
614
615 art = Artifact(
616 name="Roadmap",
617 content="# Roadmap\n\nQ1 goals.",
618 artifact_type="roadmap",
619 format="markdown",
620 )
621 output_dir = tmp_path / "notion_arts"
622 export_to_notion_md(self._sample_kg_data(), output_dir, artifacts=[art])
623
624 art_file = output_dir / "Roadmap.md"
625 assert art_file.exists()
626 art_content = art_file.read_text()
627 assert "Q1 goals." in art_content
628
--- tests/test_sources.py
+++ tests/test_sources.py
@@ -858,5 +858,487 @@
858858
from video_processor.sources.m365_source import M365Source
859859
860860
src = M365Source(web_url="https://contoso.sharepoint.com")
861861
files = src.list_videos()
862862
assert files == []
863
+
864
+
865
+# ---------------------------------------------------------------------------
866
+# ObsidianSource
867
+# ---------------------------------------------------------------------------
868
+
869
+
870
+class TestObsidianSource:
871
+ def test_import(self):
872
+ from video_processor.sources.obsidian_source import ObsidianSource
873
+
874
+ assert ObsidianSource is not None
875
+
876
+ def test_constructor(self, tmp_path):
877
+ from video_processor.sources.obsidian_source import ObsidianSource
878
+
879
+ src = ObsidianSource(vault_path=str(tmp_path))
880
+ assert src.vault_path == tmp_path
881
+
882
+ def test_authenticate_with_vault(self, tmp_path):
883
+ from video_processor.sources.obsidian_source import ObsidianSource
884
+
885
+ (tmp_path / "note.md").write_text("# Hello")
886
+ src = ObsidianSource(vault_path=str(tmp_path))
887
+ assert src.authenticate() is True
888
+
889
+ def test_authenticate_empty_dir(self, tmp_path):
890
+ from video_processor.sources.obsidian_source import ObsidianSource
891
+
892
+ src = ObsidianSource(vault_path=str(tmp_path))
893
+ assert src.authenticate() is False
894
+
895
+ def test_authenticate_nonexistent(self, tmp_path):
896
+ from video_processor.sources.obsidian_source import ObsidianSource
897
+
898
+ src = ObsidianSource(vault_path=str(tmp_path / "nonexistent"))
899
+ assert src.authenticate() is False
900
+
901
+ def test_parse_note(self, tmp_path):
902
+ from video_processor.sources.obsidian_source import parse_note
903
+
904
+ note_content = (
905
+ "---\n"
906
+ "title: Test Note\n"
907
+ "tags: [python, testing]\n"
908
+ "---\n"
909
+ "# Heading One\n\n"
910
+ "Some text with a [[Wiki Link]] and [[Another Page|alias]].\n\n"
911
+ "Also has #tag1 and #tag2 inline tags.\n\n"
912
+ "## Sub Heading\n\n"
913
+ "More content here.\n"
914
+ )
915
+ note_file = tmp_path / "test_note.md"
916
+ note_file.write_text(note_content)
917
+
918
+ result = parse_note(note_file)
919
+
920
+ assert result["frontmatter"]["title"] == "Test Note"
921
+ assert isinstance(result["frontmatter"]["tags"], list)
922
+ assert "python" in result["frontmatter"]["tags"]
923
+ assert "Wiki Link" in result["links"]
924
+ assert "Another Page" in result["links"]
925
+ assert "tag1" in result["tags"]
926
+ assert "tag2" in result["tags"]
927
+ assert len(result["headings"]) == 2
928
+ assert result["headings"][0]["level"] == 1
929
+ assert result["headings"][0]["text"] == "Heading One"
930
+ assert "Some text" in result["body"]
931
+
932
+ def test_ingest_vault(self, tmp_path):
933
+ from video_processor.sources.obsidian_source import ingest_vault
934
+
935
+ (tmp_path / "note_a.md").write_text("# A\n\nLinks to [[B]].\n")
936
+ (tmp_path / "note_b.md").write_text("# B\n\nLinks to [[A]] and [[C]].\n")
937
+
938
+ result = ingest_vault(tmp_path)
939
+
940
+ assert len(result["notes"]) == 2
941
+ names = [n["name"] for n in result["notes"]]
942
+ assert "note_a" in names
943
+ assert "note_b" in names
944
+ # note_a links to B, note_b links to A and C => 3 links
945
+ assert len(result["links"]) == 3
946
+
947
+ def test_list_videos(self, tmp_path):
948
+ from video_processor.sources.obsidian_source import ObsidianSource
949
+
950
+ (tmp_path / "note1.md").write_text("# Note 1")
951
+ sub = tmp_path / "subdir"
952
+ sub.mkdir()
953
+ (sub / "note2.md").write_text("# Note 2")
954
+
955
+ src = ObsidianSource(vault_path=str(tmp_path))
956
+ files = src.list_videos()
957
+ assert len(files) == 2
958
+ assert all(f.mime_type == "text/markdown" for f in files)
959
+
960
+
961
+# ---------------------------------------------------------------------------
962
+# LogseqSource
963
+# ---------------------------------------------------------------------------
964
+
965
+
966
+class TestLogseqSource:
967
+ def test_import(self):
968
+ from video_processor.sources.logseq_source import LogseqSource
969
+
970
+ assert LogseqSource is not None
971
+
972
+ def test_constructor(self, tmp_path):
973
+ from video_processor.sources.logseq_source import LogseqSource
974
+
975
+ src = LogseqSource(graph_path=str(tmp_path))
976
+ assert src.graph_path == tmp_path
977
+
978
+ def test_authenticate_with_pages(self, tmp_path):
979
+ from video_processor.sources.logseq_source import LogseqSource
980
+
981
+ (tmp_path / "pages").mkdir()
982
+ src = LogseqSource(graph_path=str(tmp_path))
983
+ assert src.authenticate() is True
984
+
985
+ def test_authenticate_no_pages_or_journals(self, tmp_path):
986
+ from video_processor.sources.logseq_source import LogseqSource
987
+
988
+ src = LogseqSource(graph_path=str(tmp_path))
989
+ assert src.authenticate() is False
990
+
991
+ def test_authenticate_nonexistent(self, tmp_path):
992
+ from video_processor.sources.logseq_source import LogseqSource
993
+
994
+ src = LogseqSource(graph_path=str(tmp_path / "nonexistent"))
995
+ assert src.authenticate() is False
996
+
997
+ def test_parse_page(self, tmp_path):
998
+ from video_processor.sources.logseq_source import parse_page
999
+
1000
+ page_content = (
1001
+ "title:: My Page\n"
1002
+ "tags:: #project #important\n"
1003
+ "- Some block content\n"
1004
+ " - Nested with [[Another Page]] link\n"
1005
+ " - And a #todo tag\n"
1006
+ " - Block ref ((abc12345-6789-0abc-def0-123456789abc))\n"
1007
+ )
1008
+ page_file = tmp_path / "my_page.md"
1009
+ page_file.write_text(page_content)
1010
+
1011
+ result = parse_page(page_file)
1012
+
1013
+ assert result["properties"]["title"] == "My Page"
1014
+ assert "Another Page" in result["links"]
1015
+ assert "todo" in result["tags"]
1016
+ assert "abc12345-6789-0abc-def0-123456789abc" in result["block_refs"]
1017
+ assert "Some block content" in result["body"]
1018
+
1019
+ def test_ingest_graph(self, tmp_path):
1020
+ from video_processor.sources.logseq_source import ingest_graph
1021
+
1022
+ pages_dir = tmp_path / "pages"
1023
+ pages_dir.mkdir()
1024
+ (pages_dir / "page_a.md").write_text("- Content linking [[Page B]]\n")
1025
+ (pages_dir / "page_b.md").write_text("- Content linking [[Page A]]\n")
1026
+
1027
+ journals_dir = tmp_path / "journals"
1028
+ journals_dir.mkdir()
1029
+ (journals_dir / "2026_03_07.md").write_text("- Journal entry\n")
1030
+
1031
+ result = ingest_graph(tmp_path)
1032
+
1033
+ assert len(result["notes"]) == 3
1034
+ assert len(result["links"]) == 2
1035
+
1036
+ def test_list_videos(self, tmp_path):
1037
+ from video_processor.sources.logseq_source import LogseqSource
1038
+
1039
+ pages_dir = tmp_path / "pages"
1040
+ pages_dir.mkdir()
1041
+ (pages_dir / "page1.md").write_text("- content")
1042
+
1043
+ src = LogseqSource(graph_path=str(tmp_path))
1044
+ files = src.list_videos()
1045
+ assert len(files) == 1
1046
+ assert files[0].mime_type == "text/markdown"
1047
+
1048
+
1049
+# ---------------------------------------------------------------------------
1050
+# NotionSource
1051
+# ---------------------------------------------------------------------------
1052
+
1053
+
1054
+class TestNotionSource:
1055
+ def test_import(self):
1056
+ from video_processor.sources.notion_source import NotionSource
1057
+
1058
+ assert NotionSource is not None
1059
+
1060
+ def test_constructor(self):
1061
+ from video_processor.sources.notion_source import NotionSource
1062
+
1063
+ src = NotionSource(token="ntn_test123", database_id="db-1")
1064
+ assert src.token == "ntn_test123"
1065
+ assert src.database_id == "db-1"
1066
+ assert src.page_ids == []
1067
+
1068
+ @patch.dict(os.environ, {}, clear=True)
1069
+ def test_authenticate_no_token(self):
1070
+ from video_processor.sources.notion_source import NotionSource
1071
+
1072
+ src = NotionSource(token="")
1073
+ assert src.authenticate() is False
1074
+
1075
+ @patch("requests.get")
1076
+ def test_authenticate_with_mock(self, mock_get):
1077
+ from video_processor.sources.notion_source import NotionSource
1078
+
1079
+ mock_resp = MagicMock()
1080
+ mock_resp.raise_for_status = MagicMock()
1081
+ mock_resp.json.return_value = {"name": "Test Bot"}
1082
+ mock_get.return_value = mock_resp
1083
+
1084
+ src = NotionSource(token="ntn_test123")
1085
+ assert src.authenticate() is True
1086
+
1087
+ @patch("requests.post")
1088
+ def test_list_videos_database(self, mock_post):
1089
+ from video_processor.sources.notion_source import NotionSource
1090
+
1091
+ mock_resp = MagicMock()
1092
+ mock_resp.raise_for_status = MagicMock()
1093
+ mock_resp.json.return_value = {
1094
+ "results": [
1095
+ {
1096
+ "id": "page-1",
1097
+ "last_edited_time": "2026-03-01T00:00:00Z",
1098
+ "properties": {
1099
+ "Name": {
1100
+ "type": "title",
1101
+ "title": [{"plain_text": "Meeting Notes"}],
1102
+ }
1103
+ },
1104
+ },
1105
+ ],
1106
+ "has_more": False,
1107
+ }
1108
+ mock_post.return_value = mock_resp
1109
+
1110
+ src = NotionSource(token="ntn_test", database_id="db-1")
1111
+ files = src.list_videos()
1112
+ assert len(files) == 1
1113
+ assert files[0].name == "Meeting Notes"
1114
+ assert files[0].id == "page-1"
1115
+
1116
+ def test_blocks_to_text(self):
1117
+ from video_processor.sources.notion_source import NotionSource
1118
+
1119
+ src = NotionSource(token="test")
1120
+ blocks = [
1121
+ {
1122
+ "type": "heading_1",
1123
+ "heading_1": {
1124
+ "rich_text": [{"plain_text": "Title"}],
1125
+ },
1126
+ },
1127
+ {
1128
+ "type": "paragraph",
1129
+ "paragraph": {
1130
+ "rich_text": [{"plain_text": "Some paragraph text."}],
1131
+ },
1132
+ },
1133
+ {
1134
+ "type": "bulleted_list_item",
1135
+ "bulleted_list_item": {
1136
+ "rich_text": [{"plain_text": "A bullet point"}],
1137
+ },
1138
+ },
1139
+ {
1140
+ "type": "divider",
1141
+ "divider": {},
1142
+ },
1143
+ ]
1144
+ result = src._blocks_to_text(blocks)
1145
+ assert "# Title" in result
1146
+ assert "Some paragraph text." in result
1147
+ assert "- A bullet point" in result
1148
+ assert "---" in result
1149
+
1150
+
1151
+# ---------------------------------------------------------------------------
1152
+# AppleNotesSource
1153
+# ---------------------------------------------------------------------------
1154
+
1155
+
1156
+class TestAppleNotesSource:
1157
+ def test_import(self):
1158
+ from video_processor.sources.apple_notes_source import AppleNotesSource
1159
+
1160
+ assert AppleNotesSource is not None
1161
+
1162
+ def test_constructor(self):
1163
+ from video_processor.sources.apple_notes_source import AppleNotesSource
1164
+
1165
+ src = AppleNotesSource(folder="Work")
1166
+ assert src.folder == "Work"
1167
+
1168
+ def test_constructor_default(self):
1169
+ from video_processor.sources.apple_notes_source import AppleNotesSource
1170
+
1171
+ src = AppleNotesSource()
1172
+ assert src.folder is None
1173
+
1174
+ def test_authenticate_platform(self):
1175
+ import sys
1176
+
1177
+ from video_processor.sources.apple_notes_source import AppleNotesSource
1178
+
1179
+ src = AppleNotesSource()
1180
+ result = src.authenticate()
1181
+ if sys.platform == "darwin":
1182
+ assert result is True
1183
+ else:
1184
+ assert result is False
1185
+
1186
+ def test_html_to_text(self):
1187
+ from video_processor.sources.apple_notes_source import AppleNotesSource
1188
+
1189
+ html = (
1190
+ "<div>Hello <b>World</b></div>"
1191
+ "<p>Paragraph one.</p>"
1192
+ "<p>Paragraph two with &amp; entity.</p>"
1193
+ "<br/>"
1194
+ "<ul><li>Item 1</li><li>Item 2</li></ul>"
1195
+ )
1196
+ result = AppleNotesSource._html_to_text(html)
1197
+ assert "Hello World" in result
1198
+ assert "Paragraph one." in result
1199
+ assert "Paragraph two with & entity." in result
1200
+ assert "Item 1" in result
1201
+
1202
+ def test_html_to_text_empty(self):
1203
+ from video_processor.sources.apple_notes_source import AppleNotesSource
1204
+
1205
+ assert AppleNotesSource._html_to_text("") == ""
1206
+
1207
+ def test_html_to_text_entities(self):
1208
+ from video_processor.sources.apple_notes_source import AppleNotesSource
1209
+
1210
+ html = "&lt;code&gt; &quot;test&quot; &#39;single&#39; &nbsp;space"
1211
+ result = AppleNotesSource._html_to_text(html)
1212
+ assert "<code>" in result
1213
+ assert '"test"' in result
1214
+ assert "'single'" in result
1215
+
1216
+
1217
+# ---------------------------------------------------------------------------
1218
+# GoogleKeepSource
1219
+# ---------------------------------------------------------------------------
1220
+
1221
+
1222
+class TestGoogleKeepSource:
1223
+ def test_import(self):
1224
+ from video_processor.sources.google_keep_source import GoogleKeepSource
1225
+
1226
+ assert GoogleKeepSource is not None
1227
+
1228
+ def test_constructor(self):
1229
+ from video_processor.sources.google_keep_source import GoogleKeepSource
1230
+
1231
+ src = GoogleKeepSource(label="meetings")
1232
+ assert src.label == "meetings"
1233
+
1234
+ def test_constructor_default(self):
1235
+ from video_processor.sources.google_keep_source import GoogleKeepSource
1236
+
1237
+ src = GoogleKeepSource()
1238
+ assert src.label is None
1239
+
1240
+ @patch("shutil.which", return_value=None)
1241
+ def test_authenticate_no_gws(self, _mock_which):
1242
+ from video_processor.sources.google_keep_source import GoogleKeepSource
1243
+
1244
+ src = GoogleKeepSource()
1245
+ assert src.authenticate() is False
1246
+
1247
+ def test_note_to_text(self):
1248
+ from video_processor.sources.google_keep_source import _note_to_text
1249
+
1250
+ note = {
1251
+ "title": "Shopping List",
1252
+ "body": "Remember to buy groceries",
1253
+ "listContent": [
1254
+ {"text": "Milk", "checked": True},
1255
+ {"text": "Bread", "checked": False},
1256
+ {"text": "", "checked": False},
1257
+ ],
1258
+ }
1259
+ result = _note_to_text(note)
1260
+ assert "Shopping List" in result
1261
+ assert "Remember to buy groceries" in result
1262
+ assert "- [x] Milk" in result
1263
+ assert "- [ ] Bread" in result
1264
+
1265
+ def test_note_to_text_empty(self):
1266
+ from video_processor.sources.google_keep_source import _note_to_text
1267
+
1268
+ assert _note_to_text({}) == ""
1269
+
1270
+ def test_note_to_text_text_content(self):
1271
+ from video_processor.sources.google_keep_source import _note_to_text
1272
+
1273
+ note = {"title": "Simple", "textContent": "Just a plain note"}
1274
+ result = _note_to_text(note)
1275
+ assert "Simple" in result
1276
+ assert "Just a plain note" in result
1277
+
1278
+
1279
+# ---------------------------------------------------------------------------
1280
+# OneNoteSource
1281
+# ---------------------------------------------------------------------------
1282
+
1283
+
1284
+class TestOneNoteSource:
1285
+ def test_import(self):
1286
+ from video_processor.sources.onenote_source import OneNoteSource
1287
+
1288
+ assert OneNoteSource is not None
1289
+
1290
+ def test_constructor(self):
1291
+ from video_processor.sources.onenote_source import OneNoteSource
1292
+
1293
+ src = OneNoteSource(notebook_name="Work Notes", section_name="Meetings")
1294
+ assert src.notebook_name == "Work Notes"
1295
+ assert src.section_name == "Meetings"
1296
+
1297
+ def test_constructor_default(self):
1298
+ from video_processor.sources.onenote_source import OneNoteSource
1299
+
1300
+ src = OneNoteSource()
1301
+ assert src.notebook_name is None
1302
+ assert src.section_name is None
1303
+
1304
+ @patch("shutil.which", return_value=None)
1305
+ def test_authenticate_no_m365(self, _mock_which):
1306
+ from video_processor.sources.onenote_source import OneNoteSource
1307
+
1308
+ src = OneNoteSource()
1309
+ assert src.authenticate() is False
1310
+
1311
+ def test_html_to_text(self):
1312
+ from video_processor.sources.onenote_source import _html_to_text
1313
+
1314
+ html = (
1315
+ "<html><body>"
1316
+ "<h1>Meeting Notes</h1>"
1317
+ "<p>Discussed the &amp; project.</p>"
1318
+ "<script>var x = 1;</script>"
1319
+ "<style>.foo { color: red; }</style>"
1320
+ "<ul><li>Action item 1</li><li>Action item 2</li></ul>"
1321
+ "<p>Entity &#x41; and &#65; decoded.</p>"
1322
+ "</body></html>"
1323
+ )
1324
+ result = _html_to_text(html)
1325
+ assert "Meeting Notes" in result
1326
+ assert "Discussed the & project." in result
1327
+ assert "var x" not in result
1328
+ assert ".foo" not in result
1329
+ assert "Action item 1" in result
1330
+ assert "Entity A and A decoded." in result
1331
+
1332
+ def test_html_to_text_empty(self):
1333
+ from video_processor.sources.onenote_source import _html_to_text
1334
+
1335
+ assert _html_to_text("") == ""
1336
+
1337
+ def test_html_to_text_entities(self):
1338
+ from video_processor.sources.onenote_source import _html_to_text
1339
+
1340
+ html = "&lt;tag&gt; &quot;quoted&quot; &apos;apos&apos; &nbsp;space"
1341
+ result = _html_to_text(html)
1342
+ assert "<tag>" in result
1343
+ assert '"quoted"' in result
1344
+ assert "'apos'" in result
8631345
--- tests/test_sources.py
+++ tests/test_sources.py
@@ -858,5 +858,487 @@
858 from video_processor.sources.m365_source import M365Source
859
860 src = M365Source(web_url="https://contoso.sharepoint.com")
861 files = src.list_videos()
862 assert files == []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
863
--- tests/test_sources.py
+++ tests/test_sources.py
@@ -858,5 +858,487 @@
858 from video_processor.sources.m365_source import M365Source
859
860 src = M365Source(web_url="https://contoso.sharepoint.com")
861 files = src.list_videos()
862 assert files == []
863
864
865 # ---------------------------------------------------------------------------
866 # ObsidianSource
867 # ---------------------------------------------------------------------------
868
869
class TestObsidianSource:
    """Exercise the Obsidian vault ingester (filesystem-backed, no mocks)."""

    def test_import(self):
        from video_processor.sources.obsidian_source import ObsidianSource

        assert ObsidianSource is not None

    def test_constructor(self, tmp_path):
        from video_processor.sources.obsidian_source import ObsidianSource

        source = ObsidianSource(vault_path=str(tmp_path))
        assert source.vault_path == tmp_path

    def test_authenticate_with_vault(self, tmp_path):
        from video_processor.sources.obsidian_source import ObsidianSource

        (tmp_path / "note.md").write_text("# Hello")
        assert ObsidianSource(vault_path=str(tmp_path)).authenticate() is True

    def test_authenticate_empty_dir(self, tmp_path):
        from video_processor.sources.obsidian_source import ObsidianSource

        # A directory with no markdown files is not a usable vault.
        assert ObsidianSource(vault_path=str(tmp_path)).authenticate() is False

    def test_authenticate_nonexistent(self, tmp_path):
        from video_processor.sources.obsidian_source import ObsidianSource

        missing = tmp_path / "nonexistent"
        assert ObsidianSource(vault_path=str(missing)).authenticate() is False

    def test_parse_note(self, tmp_path):
        from video_processor.sources.obsidian_source import parse_note

        raw = "\n".join(
            [
                "---",
                "title: Test Note",
                "tags: [python, testing]",
                "---",
                "# Heading One",
                "",
                "Some text with a [[Wiki Link]] and [[Another Page|alias]].",
                "",
                "Also has #tag1 and #tag2 inline tags.",
                "",
                "## Sub Heading",
                "",
                "More content here.",
                "",
            ]
        )
        target = tmp_path / "test_note.md"
        target.write_text(raw)

        parsed = parse_note(target)

        frontmatter = parsed["frontmatter"]
        assert frontmatter["title"] == "Test Note"
        assert isinstance(frontmatter["tags"], list)
        assert "python" in frontmatter["tags"]
        for link in ("Wiki Link", "Another Page"):
            assert link in parsed["links"]
        for tag in ("tag1", "tag2"):
            assert tag in parsed["tags"]
        headings = parsed["headings"]
        assert len(headings) == 2
        assert headings[0]["level"] == 1
        assert headings[0]["text"] == "Heading One"
        assert "Some text" in parsed["body"]

    def test_ingest_vault(self, tmp_path):
        from video_processor.sources.obsidian_source import ingest_vault

        (tmp_path / "note_a.md").write_text("# A\n\nLinks to [[B]].\n")
        (tmp_path / "note_b.md").write_text("# B\n\nLinks to [[A]] and [[C]].\n")

        result = ingest_vault(tmp_path)

        assert len(result["notes"]) == 2
        note_names = {n["name"] for n in result["notes"]}
        assert {"note_a", "note_b"} <= note_names
        # note_a -> B plus note_b -> A, C gives three link records in total.
        assert len(result["links"]) == 3

    def test_list_videos(self, tmp_path):
        from video_processor.sources.obsidian_source import ObsidianSource

        (tmp_path / "note1.md").write_text("# Note 1")
        nested = tmp_path / "subdir"
        nested.mkdir()
        (nested / "note2.md").write_text("# Note 2")

        # Recursive listing: top-level and nested notes both appear.
        files = ObsidianSource(vault_path=str(tmp_path)).list_videos()
        assert len(files) == 2
        assert all(f.mime_type == "text/markdown" for f in files)
959
960
961 # ---------------------------------------------------------------------------
962 # LogseqSource
963 # ---------------------------------------------------------------------------
964
965
class TestLogseqSource:
    """Exercise the Logseq graph ingester (pages/ and journals/ layout)."""

    def test_import(self):
        from video_processor.sources.logseq_source import LogseqSource

        assert LogseqSource is not None

    def test_constructor(self, tmp_path):
        from video_processor.sources.logseq_source import LogseqSource

        source = LogseqSource(graph_path=str(tmp_path))
        assert source.graph_path == tmp_path

    def test_authenticate_with_pages(self, tmp_path):
        from video_processor.sources.logseq_source import LogseqSource

        (tmp_path / "pages").mkdir()
        assert LogseqSource(graph_path=str(tmp_path)).authenticate() is True

    def test_authenticate_no_pages_or_journals(self, tmp_path):
        from video_processor.sources.logseq_source import LogseqSource

        # Neither pages/ nor journals/ present -> not a Logseq graph.
        assert LogseqSource(graph_path=str(tmp_path)).authenticate() is False

    def test_authenticate_nonexistent(self, tmp_path):
        from video_processor.sources.logseq_source import LogseqSource

        missing = tmp_path / "nonexistent"
        assert LogseqSource(graph_path=str(missing)).authenticate() is False

    def test_parse_page(self, tmp_path):
        from video_processor.sources.logseq_source import parse_page

        block_ref = "abc12345-6789-0abc-def0-123456789abc"
        raw = "\n".join(
            [
                "title:: My Page",
                "tags:: #project #important",
                "- Some block content",
                "  - Nested with [[Another Page]] link",
                "  - And a #todo tag",
                f"  - Block ref (({block_ref}))",
                "",
            ]
        )
        page_file = tmp_path / "my_page.md"
        page_file.write_text(raw)

        parsed = parse_page(page_file)

        assert parsed["properties"]["title"] == "My Page"
        assert "Another Page" in parsed["links"]
        assert "todo" in parsed["tags"]
        assert block_ref in parsed["block_refs"]
        assert "Some block content" in parsed["body"]

    def test_ingest_graph(self, tmp_path):
        from video_processor.sources.logseq_source import ingest_graph

        pages = tmp_path / "pages"
        pages.mkdir()
        (pages / "page_a.md").write_text("- Content linking [[Page B]]\n")
        (pages / "page_b.md").write_text("- Content linking [[Page A]]\n")

        journals = tmp_path / "journals"
        journals.mkdir()
        (journals / "2026_03_07.md").write_text("- Journal entry\n")

        result = ingest_graph(tmp_path)

        # Two pages plus one journal entry; one outgoing link per page.
        assert len(result["notes"]) == 3
        assert len(result["links"]) == 2

    def test_list_videos(self, tmp_path):
        from video_processor.sources.logseq_source import LogseqSource

        pages = tmp_path / "pages"
        pages.mkdir()
        (pages / "page1.md").write_text("- content")

        files = LogseqSource(graph_path=str(tmp_path)).list_videos()
        assert len(files) == 1
        assert files[0].mime_type == "text/markdown"
1047
1048
1049 # ---------------------------------------------------------------------------
1050 # NotionSource
1051 # ---------------------------------------------------------------------------
1052
1053
class TestNotionSource:
    """Tests for the Notion API connector; all HTTP calls are mocked."""

    def test_import(self):
        from video_processor.sources.notion_source import NotionSource

        assert NotionSource is not None

    def test_constructor(self):
        from video_processor.sources.notion_source import NotionSource

        src = NotionSource(token="ntn_test123", database_id="db-1")
        assert src.token == "ntn_test123"
        assert src.database_id == "db-1"
        # No explicit page_ids -> defaults to an empty list.
        assert src.page_ids == []

    # Environment is cleared so authenticate() cannot fall back to an
    # ambient token from the test runner's env.
    @patch.dict(os.environ, {}, clear=True)
    def test_authenticate_no_token(self):
        from video_processor.sources.notion_source import NotionSource

        src = NotionSource(token="")
        assert src.authenticate() is False

    @patch("requests.get")
    def test_authenticate_with_mock(self, mock_get):
        from video_processor.sources.notion_source import NotionSource

        # Simulate a successful bot-identity lookup response.
        mock_resp = MagicMock()
        mock_resp.raise_for_status = MagicMock()
        mock_resp.json.return_value = {"name": "Test Bot"}
        mock_get.return_value = mock_resp

        src = NotionSource(token="ntn_test123")
        assert src.authenticate() is True

    @patch("requests.post")
    def test_list_videos_database(self, mock_post):
        from video_processor.sources.notion_source import NotionSource

        # Single database query result; has_more=False stops pagination.
        mock_resp = MagicMock()
        mock_resp.raise_for_status = MagicMock()
        mock_resp.json.return_value = {
            "results": [
                {
                    "id": "page-1",
                    "last_edited_time": "2026-03-01T00:00:00Z",
                    "properties": {
                        "Name": {
                            "type": "title",
                            "title": [{"plain_text": "Meeting Notes"}],
                        }
                    },
                },
            ],
            "has_more": False,
        }
        mock_post.return_value = mock_resp

        src = NotionSource(token="ntn_test", database_id="db-1")
        files = src.list_videos()
        assert len(files) == 1
        # The title property becomes the file name; the page id is preserved.
        assert files[0].name == "Meeting Notes"
        assert files[0].id == "page-1"

    def test_blocks_to_text(self):
        from video_processor.sources.notion_source import NotionSource

        src = NotionSource(token="test")
        blocks = [
            {
                "type": "heading_1",
                "heading_1": {
                    "rich_text": [{"plain_text": "Title"}],
                },
            },
            {
                "type": "paragraph",
                "paragraph": {
                    "rich_text": [{"plain_text": "Some paragraph text."}],
                },
            },
            {
                "type": "bulleted_list_item",
                "bulleted_list_item": {
                    "rich_text": [{"plain_text": "A bullet point"}],
                },
            },
            {
                "type": "divider",
                "divider": {},
            },
        ]
        result = src._blocks_to_text(blocks)
        # Each Notion block type maps to its markdown equivalent.
        assert "# Title" in result
        assert "Some paragraph text." in result
        assert "- A bullet point" in result
        assert "---" in result
1149
1150
1151 # ---------------------------------------------------------------------------
1152 # AppleNotesSource
1153 # ---------------------------------------------------------------------------
1154
1155
class TestAppleNotesSource:
    """Exercise the macOS Apple Notes connector's pure helpers."""

    def test_import(self):
        from video_processor.sources.apple_notes_source import AppleNotesSource

        assert AppleNotesSource is not None

    def test_constructor(self):
        from video_processor.sources.apple_notes_source import AppleNotesSource

        assert AppleNotesSource(folder="Work").folder == "Work"

    def test_constructor_default(self):
        from video_processor.sources.apple_notes_source import AppleNotesSource

        assert AppleNotesSource().folder is None

    def test_authenticate_platform(self):
        import sys

        from video_processor.sources.apple_notes_source import AppleNotesSource

        # osascript only exists on macOS, so auth mirrors the platform check.
        expected = sys.platform == "darwin"
        assert AppleNotesSource().authenticate() is expected

    def test_html_to_text(self):
        from video_processor.sources.apple_notes_source import AppleNotesSource

        html = (
            "<div>Hello <b>World</b></div>"
            "<p>Paragraph one.</p>"
            "<p>Paragraph two with &amp; entity.</p>"
            "<br/>"
            "<ul><li>Item 1</li><li>Item 2</li></ul>"
        )
        text = AppleNotesSource._html_to_text(html)
        for expected in (
            "Hello World",
            "Paragraph one.",
            "Paragraph two with & entity.",
            "Item 1",
        ):
            assert expected in text

    def test_html_to_text_empty(self):
        from video_processor.sources.apple_notes_source import AppleNotesSource

        assert AppleNotesSource._html_to_text("") == ""

    def test_html_to_text_entities(self):
        from video_processor.sources.apple_notes_source import AppleNotesSource

        decoded = AppleNotesSource._html_to_text(
            "&lt;code&gt; &quot;test&quot; &#39;single&#39; &nbsp;space"
        )
        assert "<code>" in decoded
        assert '"test"' in decoded
        assert "'single'" in decoded
1215
1216
1217 # ---------------------------------------------------------------------------
1218 # GoogleKeepSource
1219 # ---------------------------------------------------------------------------
1220
1221
class TestGoogleKeepSource:
    """Exercise the Google Keep connector (gws CLI based)."""

    def test_import(self):
        from video_processor.sources.google_keep_source import GoogleKeepSource

        assert GoogleKeepSource is not None

    def test_constructor(self):
        from video_processor.sources.google_keep_source import GoogleKeepSource

        assert GoogleKeepSource(label="meetings").label == "meetings"

    def test_constructor_default(self):
        from video_processor.sources.google_keep_source import GoogleKeepSource

        assert GoogleKeepSource().label is None

    @patch("shutil.which", return_value=None)
    def test_authenticate_no_gws(self, _mock_which):
        from video_processor.sources.google_keep_source import GoogleKeepSource

        # Without the gws binary on PATH, authentication must fail cleanly.
        assert GoogleKeepSource().authenticate() is False

    def test_note_to_text(self):
        from video_processor.sources.google_keep_source import _note_to_text

        rendered = _note_to_text(
            {
                "title": "Shopping List",
                "body": "Remember to buy groceries",
                "listContent": [
                    {"text": "Milk", "checked": True},
                    {"text": "Bread", "checked": False},
                    {"text": "", "checked": False},
                ],
            }
        )
        assert "Shopping List" in rendered
        assert "Remember to buy groceries" in rendered
        # Checklist items render as markdown task-list checkboxes.
        assert "- [x] Milk" in rendered
        assert "- [ ] Bread" in rendered

    def test_note_to_text_empty(self):
        from video_processor.sources.google_keep_source import _note_to_text

        assert _note_to_text({}) == ""

    def test_note_to_text_text_content(self):
        from video_processor.sources.google_keep_source import _note_to_text

        rendered = _note_to_text({"title": "Simple", "textContent": "Just a plain note"})
        assert "Simple" in rendered
        assert "Just a plain note" in rendered
1277
1278
1279 # ---------------------------------------------------------------------------
1280 # OneNoteSource
1281 # ---------------------------------------------------------------------------
1282
1283
class TestOneNoteSource:
    """Exercise the OneNote connector (m365 CLI based)."""

    def test_import(self):
        from video_processor.sources.onenote_source import OneNoteSource

        assert OneNoteSource is not None

    def test_constructor(self):
        from video_processor.sources.onenote_source import OneNoteSource

        source = OneNoteSource(notebook_name="Work Notes", section_name="Meetings")
        assert source.notebook_name == "Work Notes"
        assert source.section_name == "Meetings"

    def test_constructor_default(self):
        from video_processor.sources.onenote_source import OneNoteSource

        source = OneNoteSource()
        assert source.notebook_name is None
        assert source.section_name is None

    @patch("shutil.which", return_value=None)
    def test_authenticate_no_m365(self, _mock_which):
        from video_processor.sources.onenote_source import OneNoteSource

        # Without the m365 binary on PATH, authentication must fail cleanly.
        assert OneNoteSource().authenticate() is False

    def test_html_to_text(self):
        from video_processor.sources.onenote_source import _html_to_text

        text = _html_to_text(
            "<html><body>"
            "<h1>Meeting Notes</h1>"
            "<p>Discussed the &amp; project.</p>"
            "<script>var x = 1;</script>"
            "<style>.foo { color: red; }</style>"
            "<ul><li>Action item 1</li><li>Action item 2</li></ul>"
            "<p>Entity &#x41; and &#65; decoded.</p>"
            "</body></html>"
        )
        # Visible text (including decoded numeric entities) survives...
        for expected in (
            "Meeting Notes",
            "Discussed the & project.",
            "Action item 1",
            "Entity A and A decoded.",
        ):
            assert expected in text
        # ...while <script> and <style> bodies are stripped entirely.
        assert "var x" not in text
        assert ".foo" not in text

    def test_html_to_text_empty(self):
        from video_processor.sources.onenote_source import _html_to_text

        assert _html_to_text("") == ""

    def test_html_to_text_entities(self):
        from video_processor.sources.onenote_source import _html_to_text

        decoded = _html_to_text("&lt;tag&gt; &quot;quoted&quot; &apos;apos&apos; &nbsp;space")
        assert "<tag>" in decoded
        assert '"quoted"' in decoded
        assert "'apos'" in decoded
1345
--- video_processor/agent/skills/__init__.py
+++ video_processor/agent/skills/__init__.py
@@ -4,15 +4,17 @@
44
from video_processor.agent.skills import ( # noqa: F401
55
artifact_export,
66
cli_adapter,
77
doc_generator,
88
github_integration,
9
+ notes_export,
910
prd,
1011
project_plan,
1112
requirements_chat,
1213
roadmap,
1314
task_breakdown,
15
+ wiki_generator,
1416
)
1517
from video_processor.agent.skills.base import (
1618
AgentContext,
1719
Artifact,
1820
Skill,
1921
2022
ADDED video_processor/agent/skills/notes_export.py
2123
ADDED video_processor/agent/skills/wiki_generator.py
--- video_processor/agent/skills/__init__.py
+++ video_processor/agent/skills/__init__.py
@@ -4,15 +4,17 @@
4 from video_processor.agent.skills import ( # noqa: F401
5 artifact_export,
6 cli_adapter,
7 doc_generator,
8 github_integration,
 
9 prd,
10 project_plan,
11 requirements_chat,
12 roadmap,
13 task_breakdown,
 
14 )
15 from video_processor.agent.skills.base import (
16 AgentContext,
17 Artifact,
18 Skill,
19
20 ADDED video_processor/agent/skills/notes_export.py
21 ADDED video_processor/agent/skills/wiki_generator.py
--- video_processor/agent/skills/__init__.py
+++ video_processor/agent/skills/__init__.py
@@ -4,15 +4,17 @@
4 from video_processor.agent.skills import ( # noqa: F401
5 artifact_export,
6 cli_adapter,
7 doc_generator,
8 github_integration,
9 notes_export,
10 prd,
11 project_plan,
12 requirements_chat,
13 roadmap,
14 task_breakdown,
15 wiki_generator,
16 )
17 from video_processor.agent.skills.base import (
18 AgentContext,
19 Artifact,
20 Skill,
21
22 ADDED video_processor/agent/skills/notes_export.py
23 ADDED video_processor/agent/skills/wiki_generator.py
--- a/video_processor/agent/skills/notes_export.py
+++ b/video_processor/agent/skills/notes_export.py
@@ -0,0 +1,420 @@
1
+"""Skill: Export knowledge graph as structured notes (Obsidian, Notion)."""
2
+
3
+import csv
4
+import io
5
+import logging
6
+from datetime import date
7
+from pathlib import Path
8
+from typing import Dict, List, Optional
9
+
10
+from video_processor.agent.skills.base import (
11
+ AgentContext,
12
+ Artifact,
13
+ Skill,
14
+ register_skill,
15
+)
16
+
17
+logger = logging.getLogger(__name__)
18
+
19
+
20
+def _sanitize_filename(name: str) -> str:
21
+ """Convert a name to a filesystem-safe filename."""
22
+ return (
23
+ name.replace("/", "-")
24
+ .replace("\\", "-")
25
+ .replace(":", "-")
26
+ .replace('"', "")
27
+ .replace("?", "")
28
+ .replace("*", "")
29
+ .replace("<", "")
30
+ .replace(">", "")
31
+ .replace("|", "")
32
+ )
33
+
34
+
35
+def _build_indexes(kg_data: dict):
36
+ """Build lookup structures from knowledge graph data.
37
+
38
+ Returns (nodes, by_type, node_lookup, outgoing, incoming).
39
+ """
40
+ nodes = kg_data.get("nodes", [])
41
+ relationships = kg_data.get("relationships", [])
42
+
43
+ by_type: Dict[str, list] = {}
44
+ node_lookup: Dict[str, dict] = {}
45
+ for node in nodes:
46
+ name = node.get("name", node.get("id", ""))
47
+ ntype = node.get("type", "concept")
48
+ by_type.setdefault(ntype, []).append(node)
49
+ node_lookup[name] = node
50
+
51
+ outgoing: Dict[str, list] = {}
52
+ incoming: Dict[str, list] = {}
53
+ for rel in relationships:
54
+ src = rel.get("source", "")
55
+ tgt = rel.get("target", "")
56
+ rtype = rel.get("type", "related_to")
57
+ outgoing.setdefault(src, []).append((tgt, rtype))
58
+ incoming.setdefault(tgt, []).append((src, rtype))
59
+
60
+ return nodes, by_type, node_lookup, outgoing, incoming
61
+
62
+
63
+# ---------------------------------------------------------------------------
64
+# Obsidian export
65
+# ---------------------------------------------------------------------------
66
+
67
+
68
def export_to_obsidian(
    kg_data: dict,
    output_dir: Path,
    artifacts: Optional[List[Artifact]] = None,
) -> List[Path]:
    """Export knowledge graph as an Obsidian vault.

    Creates one ``.md`` file per entity with YAML frontmatter and
    ``[[wiki-links]]``, an ``_Index.md`` Map of Content, tag pages per
    entity type, and optional artifact notes.

    Args:
        kg_data: ``{"nodes": [...], "relationships": [...]}`` mapping as
            produced by the knowledge graph's ``to_dict``.
        output_dir: Vault directory; created if missing.
        artifacts: Optional artifacts to render as standalone notes.

    Returns:
        Paths of every file written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    artifacts = artifacts or []
    created: List[Path] = []
    today = date.today().isoformat()

    nodes, by_type, node_lookup, outgoing, incoming = _build_indexes(kg_data)

    def _display_name(node: dict) -> str:
        # Consistent "name" -> "id" fallback. The entity-note loop already
        # used this fallback but the index and tag pages looked up only
        # "name", so id-only nodes got a note file yet rendered as empty
        # [[]] links in the listings.
        return node.get("name", node.get("id", ""))

    # --- Individual entity notes ---
    for node in nodes:
        name = _display_name(node)
        if not name:
            continue
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        aliases = node.get("aliases", [])

        # YAML frontmatter (aliases block only when aliases exist)
        tags_yaml = f" - {ntype}"
        aliases_yaml = ""
        if aliases:
            alias_lines = "\n".join(f" - {a}" for a in aliases)
            aliases_yaml = f"aliases:\n{alias_lines}\n"

        frontmatter = f"---\ntype: {ntype}\ntags:\n{tags_yaml}\n{aliases_yaml}date: {today}\n---\n"

        parts = [frontmatter, f"# {name}", ""]

        # Descriptions
        if descs:
            for d in descs:
                parts.append(f"{d}")
            parts.append("")

        # Outgoing relationships
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            for tgt, rtype in outs:
                parts.append(f"- **{rtype}**: [[{tgt}]]")
            parts.append("")

        # Incoming relationships
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced by")
            parts.append("")
            for src, rtype in ins:
                parts.append(f"- **{rtype}** from [[{src}]]")
            parts.append("")

        path = output_dir / (_sanitize_filename(name) + ".md")
        path.write_text("\n".join(parts), encoding="utf-8")
        created.append(path)

    # --- Index note (Map of Content) ---
    index_parts = [
        "---",
        "type: index",
        "tags:",
        " - MOC",
        f"date: {today}",
        "---",
        "",
        "# Index",
        "",
        f"**{len(nodes)}** entities | **{len(kg_data.get('relationships', []))}** relationships",
        "",
    ]

    for etype in sorted(by_type.keys()):
        index_parts.append(f"## {etype.title()}")
        index_parts.append("")
        for node in sorted(by_type[etype], key=_display_name):
            name = _display_name(node)
            if name:  # skip unnamed nodes instead of emitting empty [[]]
                index_parts.append(f"- [[{name}]]")
        index_parts.append("")

    if artifacts:
        index_parts.append("## Artifacts")
        index_parts.append("")
        for art in artifacts:
            index_parts.append(f"- [[{art.name}]]")
        index_parts.append("")

    index_path = output_dir / "_Index.md"
    index_path.write_text("\n".join(index_parts), encoding="utf-8")
    created.append(index_path)

    # --- Tag pages (one per entity type) ---
    for etype, elist in sorted(by_type.items()):
        tag_parts = [
            "---",
            "type: tag",
            "tags:",
            f" - {etype}",
            f"date: {today}",
            "---",
            "",
            f"# {etype.title()}",
            "",
            f"All entities of type **{etype}** ({len(elist)}).",
            "",
        ]
        for node in sorted(elist, key=_display_name):
            name = _display_name(node)
            if not name:
                continue
            descs = node.get("descriptions", [])
            summary = descs[0] if descs else ""
            tag_parts.append(f"- [[{name}]]" + (f" - {summary}" if summary else ""))
        tag_parts.append("")

        tag_path = output_dir / _sanitize_filename(f"Tag - {etype.title()}.md")
        tag_path.write_text("\n".join(tag_parts), encoding="utf-8")
        created.append(tag_path)

    # --- Artifact notes ---
    for art in artifacts:
        art_parts = [
            "---",
            "type: artifact",
            f"artifact_type: {art.artifact_type}",
            "tags:",
            " - artifact",
            f" - {art.artifact_type}",
            f"date: {today}",
            "---",
            "",
            f"# {art.name}",
            "",
            art.content,
            "",
        ]
        art_path = output_dir / (_sanitize_filename(art.name) + ".md")
        art_path.write_text("\n".join(art_parts), encoding="utf-8")
        created.append(art_path)

    logger.info("Exported %d Obsidian notes to %s", len(created), output_dir)
    return created
221
+
222
+
223
+# ---------------------------------------------------------------------------
224
+# Notion-compatible markdown export
225
+# ---------------------------------------------------------------------------
226
+
227
+
228
def export_to_notion_md(
    kg_data: dict,
    output_dir: Path,
    artifacts: Optional[List[Artifact]] = None,
) -> List[Path]:
    """Export knowledge graph as Notion-compatible markdown.

    Creates ``.md`` files with Notion-style callout blocks plus a
    database-style CSV (``entities_database.csv``) for bulk import.

    Args:
        kg_data: ``{"nodes": [...], "relationships": [...]}`` mapping as
            produced by the knowledge graph's ``to_dict``.
        output_dir: Target directory; created if missing.
        artifacts: Optional artifacts to render as standalone pages.

    Returns:
        Paths of every file written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    artifacts = artifacts or []
    created: List[Path] = []

    nodes, by_type, node_lookup, outgoing, incoming = _build_indexes(kg_data)

    def _display_name(node: dict) -> str:
        # Consistent "name" -> "id" fallback; the overview previously used
        # only "name" and listed id-only nodes as blank bullets.
        return node.get("name", node.get("id", ""))

    # Emoji shortcode per entity type — hoisted out of the page loop
    # (it was previously rebuilt on every iteration).
    type_emoji = {
        "person": "person",
        "technology": "computer",
        "organization": "building",
        "concept": "bulb",
        "event": "calendar",
        "location": "round_pushpin",
    }

    # --- Database CSV ---
    csv_buffer = io.StringIO()
    writer = csv.writer(csv_buffer)
    writer.writerow(["Name", "Type", "Description", "Related To"])

    for node in nodes:
        name = _display_name(node)
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        desc_text = "; ".join(descs[:2]) if descs else ""
        outs = outgoing.get(name, [])
        related = ", ".join(tgt for tgt, _ in outs) if outs else ""
        writer.writerow([name, ntype, desc_text, related])

    csv_path = output_dir / "entities_database.csv"
    csv_path.write_text(csv_buffer.getvalue(), encoding="utf-8")
    created.append(csv_path)

    # --- Individual entity pages ---
    for node in nodes:
        name = _display_name(node)
        if not name:
            continue
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        emoji = type_emoji.get(ntype, "bulb")

        parts = [
            f"# {name}",
            "",
            f"> :{emoji}: **Type:** {ntype}",
            "",
        ]

        if descs:
            parts.append("## Description")
            parts.append("")
            for d in descs:
                parts.append(f"{d}")
            parts.append("")

        # Properties callout
        properties = node.get("properties", {})
        if properties:
            parts.append("> :memo: **Properties**")
            for k, v in properties.items():
                parts.append(f"> - **{k}:** {v}")
            parts.append("")

        # Outgoing relationships
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            parts.append("| Target | Relationship |")
            parts.append("|--------|-------------|")
            for tgt, rtype in outs:
                parts.append(f"| {tgt} | {rtype} |")
            parts.append("")

        # Incoming relationships
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced by")
            parts.append("")
            parts.append("| Source | Relationship |")
            parts.append("|--------|-------------|")
            for src, rtype in ins:
                parts.append(f"| {src} | {rtype} |")
            parts.append("")

        path = output_dir / (_sanitize_filename(name) + ".md")
        path.write_text("\n".join(parts), encoding="utf-8")
        created.append(path)

    # --- Overview page ---
    overview_parts = [
        "# Knowledge Graph Overview",
        "",
        f"> :bar_chart: **Stats:** {len(nodes)} entities, "
        f"{len(kg_data.get('relationships', []))} relationships",
        "",
        "## Entity Types",
        "",
    ]
    for etype in sorted(by_type.keys()):
        elist = by_type[etype]
        overview_parts.append(f"### {etype.title()} ({len(elist)})")
        overview_parts.append("")
        for node in sorted(elist, key=_display_name):
            name = _display_name(node)
            if name:
                overview_parts.append(f"- {name}")
        overview_parts.append("")

    if artifacts:
        overview_parts.append("## Artifacts")
        overview_parts.append("")
        for art in artifacts:
            overview_parts.append(f"- **{art.name}** ({art.artifact_type})")
        overview_parts.append("")

    overview_path = output_dir / "Overview.md"
    overview_path.write_text("\n".join(overview_parts), encoding="utf-8")
    created.append(overview_path)

    # --- Artifact pages ---
    for art in artifacts:
        art_parts = [
            f"# {art.name}",
            "",
            f"> :page_facing_up: **Type:** {art.artifact_type} | **Format:** {art.format}",
            "",
            art.content,
            "",
        ]
        art_path = output_dir / (_sanitize_filename(art.name) + ".md")
        art_path.write_text("\n".join(art_parts), encoding="utf-8")
        created.append(art_path)

    logger.info("Exported %d Notion markdown files to %s", len(created), output_dir)
    return created
376
+
377
+
378
+# ---------------------------------------------------------------------------
379
+# Skill class
380
+# ---------------------------------------------------------------------------
381
+
382
+
383
class NotesExportSkill(Skill):
    """Export knowledge graph as structured notes (Obsidian, Notion).

    For GitHub wiki export, see the ``wiki_generator`` skill.
    """

    name = "notes_export"
    description = "Export knowledge graph as structured notes (Obsidian, Notion)"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Run the export and wrap the result summary in an Artifact."""
        fmt = kwargs.get("format", "obsidian")
        output_dir = Path(kwargs.get("output_dir", f"notes_export_{fmt}"))
        kg_data = context.knowledge_graph.to_dict()
        artifacts = context.artifacts or []

        # Notion gets its own renderer; any other value falls back to Obsidian.
        exporter = export_to_notion_md if fmt == "notion" else export_to_obsidian
        created = exporter(kg_data, output_dir, artifacts=artifacts)

        listing = "\n".join(f"- {p.name}" for p in created)
        summary = f"Exported {len(created)} {fmt} notes to `{output_dir}`:\n\n{listing}"

        metadata = {
            "output_dir": str(output_dir),
            "format": fmt,
            "file_count": len(created),
            "files": [str(p) for p in created],
        }
        return Artifact(
            name=f"Notes Export ({fmt.title()})",
            content=summary,
            artifact_type="notes_export",
            format="markdown",
            metadata=metadata,
        )
418
+
419
+
420
+register_skill(NotesExportSkill())
--- a/video_processor/agent/skills/notes_export.py
+++ b/video_processor/agent/skills/notes_export.py
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/agent/skills/notes_export.py
+++ b/video_processor/agent/skills/notes_export.py
@@ -0,0 +1,420 @@
1 """Skill: Export knowledge graph as structured notes (Obsidian, Notion)."""
2
3 import csv
4 import io
5 import logging
6 from datetime import date
7 from pathlib import Path
8 from typing import Dict, List, Optional
9
10 from video_processor.agent.skills.base import (
11 AgentContext,
12 Artifact,
13 Skill,
14 register_skill,
15 )
16
17 logger = logging.getLogger(__name__)
18
19
20 def _sanitize_filename(name: str) -> str:
21 """Convert a name to a filesystem-safe filename."""
22 return (
23 name.replace("/", "-")
24 .replace("\\", "-")
25 .replace(":", "-")
26 .replace('"', "")
27 .replace("?", "")
28 .replace("*", "")
29 .replace("<", "")
30 .replace(">", "")
31 .replace("|", "")
32 )
33
34
35 def _build_indexes(kg_data: dict):
36 """Build lookup structures from knowledge graph data.
37
38 Returns (nodes, by_type, node_lookup, outgoing, incoming).
39 """
40 nodes = kg_data.get("nodes", [])
41 relationships = kg_data.get("relationships", [])
42
43 by_type: Dict[str, list] = {}
44 node_lookup: Dict[str, dict] = {}
45 for node in nodes:
46 name = node.get("name", node.get("id", ""))
47 ntype = node.get("type", "concept")
48 by_type.setdefault(ntype, []).append(node)
49 node_lookup[name] = node
50
51 outgoing: Dict[str, list] = {}
52 incoming: Dict[str, list] = {}
53 for rel in relationships:
54 src = rel.get("source", "")
55 tgt = rel.get("target", "")
56 rtype = rel.get("type", "related_to")
57 outgoing.setdefault(src, []).append((tgt, rtype))
58 incoming.setdefault(tgt, []).append((src, rtype))
59
60 return nodes, by_type, node_lookup, outgoing, incoming
61
62
63 # ---------------------------------------------------------------------------
64 # Obsidian export
65 # ---------------------------------------------------------------------------
66
67
def export_to_obsidian(
    kg_data: dict,
    output_dir: Path,
    artifacts: Optional[List[Artifact]] = None,
) -> List[Path]:
    """Export knowledge graph as an Obsidian vault.

    Creates one ``.md`` file per entity with YAML frontmatter and
    ``[[wiki-links]]``, an ``_Index.md`` Map of Content, tag pages per
    entity type, and optional artifact notes.

    Args:
        kg_data: Knowledge-graph dict with ``nodes`` / ``relationships`` lists.
        output_dir: Vault directory; created if missing.
        artifacts: Optional planning artifacts exported as extra notes.

    Returns:
        Paths of every note written, in creation order.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    artifacts = artifacts or []
    created: List[Path] = []
    today = date.today().isoformat()

    # NOTE(review): node_lookup is unused here — unpacked only for tuple shape.
    nodes, by_type, node_lookup, outgoing, incoming = _build_indexes(kg_data)

    # --- Individual entity notes ---
    for node in nodes:
        name = node.get("name", node.get("id", ""))
        if not name:
            # An unnamed node cannot become a file; skip it.
            continue
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        aliases = node.get("aliases", [])

        # YAML frontmatter (aliases block only emitted when present, so the
        # frontmatter stays minimal for alias-free entities)
        tags_yaml = f"  - {ntype}"
        aliases_yaml = ""
        if aliases:
            alias_lines = "\n".join(f"  - {a}" for a in aliases)
            aliases_yaml = f"aliases:\n{alias_lines}\n"

        frontmatter = f"---\ntype: {ntype}\ntags:\n{tags_yaml}\n{aliases_yaml}date: {today}\n---\n"

        parts = [frontmatter, f"# {name}", ""]

        # Descriptions
        if descs:
            for d in descs:
                parts.append(f"{d}")
            parts.append("")

        # Outgoing relationships become [[wiki-links]] so Obsidian's graph
        # view picks them up
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            for tgt, rtype in outs:
                parts.append(f"- **{rtype}**: [[{tgt}]]")
            parts.append("")

        # Incoming relationships
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced by")
            parts.append("")
            for src, rtype in ins:
                parts.append(f"- **{rtype}** from [[{src}]]")
            parts.append("")

        filename = _sanitize_filename(name) + ".md"
        path = output_dir / filename
        path.write_text("\n".join(parts), encoding="utf-8")
        created.append(path)

    # --- Index note (Map of Content) ---
    index_parts = [
        "---",
        "type: index",
        "tags:",
        "  - MOC",
        f"date: {today}",
        "---",
        "",
        "# Index",
        "",
        f"**{len(nodes)}** entities | **{len(kg_data.get('relationships', []))}** relationships",
        "",
    ]

    for etype in sorted(by_type.keys()):
        elist = by_type[etype]
        index_parts.append(f"## {etype.title()}")
        index_parts.append("")
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            index_parts.append(f"- [[{name}]]")
        index_parts.append("")

    if artifacts:
        index_parts.append("## Artifacts")
        index_parts.append("")
        for art in artifacts:
            index_parts.append(f"- [[{art.name}]]")
        index_parts.append("")

    # Underscore prefix sorts the index to the top of the vault file list.
    index_path = output_dir / "_Index.md"
    index_path.write_text("\n".join(index_parts), encoding="utf-8")
    created.append(index_path)

    # --- Tag pages (one per entity type) ---
    for etype, elist in sorted(by_type.items()):
        tag_parts = [
            "---",
            "type: tag",
            "tags:",
            f"  - {etype}",
            f"date: {today}",
            "---",
            "",
            f"# {etype.title()}",
            "",
            f"All entities of type **{etype}** ({len(elist)}).",
            "",
        ]
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            descs = node.get("descriptions", [])
            # First description doubles as a one-line summary
            summary = descs[0] if descs else ""
            tag_parts.append(f"- [[{name}]]" + (f" - {summary}" if summary else ""))
        tag_parts.append("")

        tag_filename = f"Tag - {etype.title()}.md"
        tag_path = output_dir / _sanitize_filename(tag_filename)
        tag_path.write_text("\n".join(tag_parts), encoding="utf-8")
        created.append(tag_path)

    # --- Artifact notes ---
    for art in artifacts:
        art_parts = [
            "---",
            "type: artifact",
            f"artifact_type: {art.artifact_type}",
            "tags:",
            "  - artifact",
            f"  - {art.artifact_type}",
            f"date: {today}",
            "---",
            "",
            f"# {art.name}",
            "",
            art.content,
            "",
        ]
        art_filename = _sanitize_filename(art.name) + ".md"
        art_path = output_dir / art_filename
        art_path.write_text("\n".join(art_parts), encoding="utf-8")
        created.append(art_path)

    logger.info("Exported %d Obsidian notes to %s", len(created), output_dir)
    return created
221
222
223 # ---------------------------------------------------------------------------
224 # Notion-compatible markdown export
225 # ---------------------------------------------------------------------------
226
227
def export_to_notion_md(
    kg_data: dict,
    output_dir: Path,
    artifacts: Optional[List[Artifact]] = None,
) -> List[Path]:
    """Export knowledge graph as Notion-compatible markdown.

    Creates ``.md`` files with Notion-style callout blocks and a
    database-style CSV (``entities_database.csv``) for bulk import.

    Args:
        kg_data: Knowledge-graph dict with ``nodes`` / ``relationships`` lists.
        output_dir: Destination directory; created if missing.
        artifacts: Optional planning artifacts exported as extra pages.

    Returns:
        Paths of every file written, in creation order.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    artifacts = artifacts or []
    created: List[Path] = []

    # NOTE(review): node_lookup is unused here — unpacked only for tuple shape.
    nodes, by_type, node_lookup, outgoing, incoming = _build_indexes(kg_data)

    # Emoji shortcode per entity type, hoisted out of the per-node loop so
    # the mapping is built once instead of once per entity.
    type_emoji = {
        "person": "person",
        "technology": "computer",
        "organization": "building",
        "concept": "bulb",
        "event": "calendar",
        "location": "round_pushpin",
    }

    # --- Database CSV ---
    csv_buffer = io.StringIO()
    writer = csv.writer(csv_buffer)
    writer.writerow(["Name", "Type", "Description", "Related To"])

    for node in nodes:
        name = node.get("name", node.get("id", ""))
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        # Cap at two descriptions to keep the CSV cell readable
        desc_text = "; ".join(descs[:2]) if descs else ""
        outs = outgoing.get(name, [])
        related = ", ".join(tgt for tgt, _ in outs) if outs else ""
        writer.writerow([name, ntype, desc_text, related])

    csv_path = output_dir / "entities_database.csv"
    csv_path.write_text(csv_buffer.getvalue(), encoding="utf-8")
    created.append(csv_path)

    # --- Individual entity pages ---
    for node in nodes:
        name = node.get("name", node.get("id", ""))
        if not name:
            # An unnamed node cannot become a file; skip it.
            continue
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])

        emoji = type_emoji.get(ntype, "bulb")

        parts = [
            f"# {name}",
            "",
            f"> :{emoji}: **Type:** {ntype}",
            "",
        ]

        if descs:
            parts.append("## Description")
            parts.append("")
            for d in descs:
                parts.append(f"{d}")
            parts.append("")

        # Properties callout
        properties = node.get("properties", {})
        if properties:
            parts.append("> :memo: **Properties**")
            for k, v in properties.items():
                parts.append(f"> - **{k}:** {v}")
            parts.append("")

        # Outgoing relationships
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            parts.append("| Target | Relationship |")
            parts.append("|--------|-------------|")
            for tgt, rtype in outs:
                parts.append(f"| {tgt} | {rtype} |")
            parts.append("")

        # Incoming relationships
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced by")
            parts.append("")
            parts.append("| Source | Relationship |")
            parts.append("|--------|-------------|")
            for src, rtype in ins:
                parts.append(f"| {src} | {rtype} |")
            parts.append("")

        filename = _sanitize_filename(name) + ".md"
        path = output_dir / filename
        path.write_text("\n".join(parts), encoding="utf-8")
        created.append(path)

    # --- Overview page ---
    overview_parts = [
        "# Knowledge Graph Overview",
        "",
        f"> :bar_chart: **Stats:** {len(nodes)} entities, "
        f"{len(kg_data.get('relationships', []))} relationships",
        "",
        "## Entity Types",
        "",
    ]
    for etype in sorted(by_type.keys()):
        elist = by_type[etype]
        overview_parts.append(f"### {etype.title()} ({len(elist)})")
        overview_parts.append("")
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            overview_parts.append(f"- {name}")
        overview_parts.append("")

    if artifacts:
        overview_parts.append("## Artifacts")
        overview_parts.append("")
        for art in artifacts:
            overview_parts.append(f"- **{art.name}** ({art.artifact_type})")
        overview_parts.append("")

    overview_path = output_dir / "Overview.md"
    overview_path.write_text("\n".join(overview_parts), encoding="utf-8")
    created.append(overview_path)

    # --- Artifact pages ---
    for art in artifacts:
        art_parts = [
            f"# {art.name}",
            "",
            f"> :page_facing_up: **Type:** {art.artifact_type} | **Format:** {art.format}",
            "",
            art.content,
            "",
        ]
        art_filename = _sanitize_filename(art.name) + ".md"
        art_path = output_dir / art_filename
        art_path.write_text("\n".join(art_parts), encoding="utf-8")
        created.append(art_path)

    logger.info("Exported %d Notion markdown files to %s", len(created), output_dir)
    return created
376
377
378 # ---------------------------------------------------------------------------
379 # Skill class
380 # ---------------------------------------------------------------------------
381
382
class NotesExportSkill(Skill):
    """Export knowledge graph as structured notes (Obsidian, Notion).

    For GitHub wiki export, see the ``wiki_generator`` skill.
    """

    name = "notes_export"
    description = "Export knowledge graph as structured notes (Obsidian, Notion)"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Run the export and return a summary artifact describing the files."""
        fmt = kwargs.get("format", "obsidian")
        target = Path(kwargs.get("output_dir", f"notes_export_{fmt}"))
        graph = context.knowledge_graph.to_dict()
        extra = context.artifacts or []

        # Select the exporter; anything other than "notion" falls back to
        # the Obsidian vault layout.
        exporter = export_to_notion_md if fmt == "notion" else export_to_obsidian
        written = exporter(graph, target, artifacts=extra)

        listing = "\n".join(f"- {p.name}" for p in written)
        summary = f"Exported {len(written)} {fmt} notes to `{target}`:\n\n{listing}"

        return Artifact(
            name=f"Notes Export ({fmt.title()})",
            content=summary,
            artifact_type="notes_export",
            format="markdown",
            metadata={
                "output_dir": str(target),
                "format": fmt,
                "file_count": len(written),
                "files": [str(p) for p in written],
            },
        )


register_skill(NotesExportSkill())
--- a/video_processor/agent/skills/wiki_generator.py
+++ b/video_processor/agent/skills/wiki_generator.py
@@ -0,0 +1,315 @@
1
+"""Skill: Generate a GitHub wiki from knowledge graph and artifacts."""
2
+
3
+import json
4
+import logging
5
+import subprocess
6
+from pathlib import Path
7
+from typing import Dict, List, Optional
8
+
9
+from video_processor.agent.skills.base import (
10
+ AgentContext,
11
+ Artifact,
12
+ Skill,
13
+ register_skill,
14
+)
15
+
16
+logger = logging.getLogger(__name__)
17
+
18
+
19
+def _sanitize_filename(name: str) -> str:
20
+ """Convert entity name to a wiki-safe filename."""
21
+ return name.replace("/", "-").replace("\\", "-").replace(" ", "-").replace(".", "-")
22
+
23
+
24
+def _wiki_link(name: str) -> str:
25
+ """Create a GitHub wiki-style markdown link."""
26
+ safe = _sanitize_filename(name)
27
+ return f"[{name}]({safe})"
28
+
29
+
30
+def generate_wiki(
31
+ kg_data: dict,
32
+ artifacts: Optional[List[Artifact]] = None,
33
+ title: str = "Knowledge Base",
34
+) -> Dict[str, str]:
35
+ """Generate a dict of {filename: markdown_content} for a GitHub wiki.
36
+
37
+ Returns pages for: Home, _Sidebar, entity type indexes, individual
38
+ entity pages, and any planning artifacts.
39
+ """
40
+ pages: Dict[str, str] = {}
41
+ artifacts = artifacts or []
42
+
43
+ nodes = kg_data.get("nodes", [])
44
+ relationships = kg_data.get("relationships", [])
45
+
46
+ # Group entities by type
47
+ by_type: Dict[str, list] = {}
48
+ node_lookup: Dict[str, dict] = {}
49
+ for node in nodes:
50
+ name = node.get("name", node.get("id", ""))
51
+ ntype = node.get("type", "concept")
52
+ by_type.setdefault(ntype, []).append(node)
53
+ node_lookup[name.lower()] = node
54
+
55
+ # Build relationship index (outgoing and incoming per entity)
56
+ outgoing: Dict[str, list] = {}
57
+ incoming: Dict[str, list] = {}
58
+ for rel in relationships:
59
+ src = rel.get("source", "")
60
+ tgt = rel.get("target", "")
61
+ rtype = rel.get("type", "related_to")
62
+ outgoing.setdefault(src, []).append((tgt, rtype))
63
+ incoming.setdefault(tgt, []).append((src, rtype))
64
+
65
+ # --- Home page ---
66
+ home_parts = [
67
+ f"# {title}",
68
+ "",
69
+ f"**{len(nodes)}** entities | **{len(relationships)}** relationships",
70
+ "",
71
+ "## Entity Types",
72
+ "",
73
+ ]
74
+ for etype, elist in sorted(by_type.items()):
75
+ home_parts.append(f"- {_wiki_link(etype.title())} ({len(elist)})")
76
+
77
+ if artifacts:
78
+ home_parts.append("")
79
+ home_parts.append("## Planning Artifacts")
80
+ home_parts.append("")
81
+ for art in artifacts:
82
+ safe = _sanitize_filename(art.name)
83
+ home_parts.append(f"- [{art.name}]({safe})")
84
+
85
+ pages["Home"] = "\n".join(home_parts)
86
+
87
+ # --- Sidebar ---
88
+ sidebar_parts = [f"**{title}**", "", "**Navigation**", "", "- [Home](Home)", ""]
89
+ sidebar_parts.append("**Entity Types**")
90
+ sidebar_parts.append("")
91
+ for etype in sorted(by_type.keys()):
92
+ sidebar_parts.append(f"- {_wiki_link(etype.title())}")
93
+
94
+ if artifacts:
95
+ sidebar_parts.append("")
96
+ sidebar_parts.append("**Artifacts**")
97
+ sidebar_parts.append("")
98
+ for art in artifacts:
99
+ safe = _sanitize_filename(art.name)
100
+ sidebar_parts.append(f"- [{art.name}]({safe})")
101
+
102
+ pages["_Sidebar"] = "\n".join(sidebar_parts)
103
+
104
+ # --- Type index pages ---
105
+ for etype, elist in sorted(by_type.items()):
106
+ page_name = _sanitize_filename(etype.title())
107
+ parts = [
108
+ f"# {etype.title()}",
109
+ "",
110
+ f"{len(elist)} entities of type **{etype}**.",
111
+ "",
112
+ "| Entity | Descriptions |",
113
+ "|--------|-------------|",
114
+ ]
115
+ for node in sorted(elist, key=lambda n: n.get("name", "")):
116
+ name = node.get("name", "")
117
+ descs = node.get("descriptions", [])
118
+ desc_text = "; ".join(descs[:2]) if descs else "—"
119
+ parts.append(f"| {_wiki_link(name)} | {desc_text} |")
120
+
121
+ pages[page_name] = "\n".join(parts)
122
+
123
+ # --- Individual entity pages ---
124
+ for node in nodes:
125
+ name = node.get("name", "")
126
+ if not name:
127
+ continue
128
+ ntype = node.get("type", "concept")
129
+ descs = node.get("descriptions", [])
130
+ page_name = _sanitize_filename(name)
131
+
132
+ parts = [
133
+ f"# {name}",
134
+ "",
135
+ f"**Type:** {ntype}",
136
+ "",
137
+ ]
138
+
139
+ if descs:
140
+ parts.append("## Descriptions")
141
+ parts.append("")
142
+ for d in descs:
143
+ parts.append(f"- {d}")
144
+ parts.append("")
145
+
146
+ # Outgoing relationships
147
+ outs = outgoing.get(name, [])
148
+ if outs:
149
+ parts.append("## Relationships")
150
+ parts.append("")
151
+ parts.append("| Target | Type |")
152
+ parts.append("|--------|------|")
153
+ for tgt, rtype in outs:
154
+ parts.append(f"| {_wiki_link(tgt)} | {rtype} |")
155
+ parts.append("")
156
+
157
+ # Incoming relationships
158
+ ins = incoming.get(name, [])
159
+ if ins:
160
+ parts.append("## Referenced By")
161
+ parts.append("")
162
+ parts.append("| Source | Type |")
163
+ parts.append("|--------|------|")
164
+ for src, rtype in ins:
165
+ parts.append(f"| {_wiki_link(src)} | {rtype} |")
166
+ parts.append("")
167
+
168
+ # Occurrences / sources
169
+ occs = node.get("occurrences", [])
170
+ if occs:
171
+ parts.append("## Sources")
172
+ parts.append("")
173
+ for occ in occs:
174
+ src = occ.get("source", "unknown")
175
+ ts = occ.get("timestamp", "")
176
+ text = occ.get("text", "")
177
+ line = f"- **{src}**"
178
+ if ts:
179
+ line += f" @ {ts}"
180
+ if text:
181
+ line += f": _{text}_"
182
+ parts.append(line)
183
+ parts.append("")
184
+
185
+ pages[page_name] = "\n".join(parts)
186
+
187
+ # --- Artifact pages ---
188
+ for art in artifacts:
189
+ page_name = _sanitize_filename(art.name)
190
+ if art.format == "json":
191
+ try:
192
+ data = json.loads(art.content)
193
+ content = f"```json\n{json.dumps(data, indent=2)}\n```"
194
+ except json.JSONDecodeError:
195
+ content = art.content
196
+ else:
197
+ content = art.content
198
+
199
+ pages[page_name] = f"# {art.name}\n\n{content}"
200
+
201
+ return pages
202
+
203
+
204
+def write_wiki(pages: Dict[str, str], output_dir: Path) -> List[Path]:
205
+ """Write wiki pages to a directory as .md files."""
206
+ output_dir.mkdir(parents=True, exist_ok=True)
207
+ paths = []
208
+ for name, content in pages.items():
209
+ path = output_dir / f"{name}.md"
210
+ path.write_text(content, encoding="utf-8")
211
+ paths.append(path)
212
+ return paths
213
+
214
+
215
+def push_wiki(wiki_dir: Path, repo: str, message: str = "Update wiki") -> bool:
216
+ """Push wiki pages to a GitHub wiki repo.
217
+
218
+ Clones the wiki repo, copies pages, commits and pushes.
219
+ The repo should be in 'owner/repo' format.
220
+ """
221
+ wiki_url = f"https://github.com/{repo}.wiki.git"
222
+
223
+ # Clone existing wiki (or init if empty)
224
+ clone_dir = wiki_dir / ".wiki_clone"
225
+ if clone_dir.exists():
226
+ subprocess.run(["rm", "-rf", str(clone_dir)], check=True)
227
+
228
+ result = subprocess.run(
229
+ ["git", "clone", wiki_url, str(clone_dir)],
230
+ capture_output=True,
231
+ text=True,
232
+ )
233
+
234
+ if result.returncode != 0:
235
+ # Wiki might not exist yet — init a new repo
236
+ clone_dir.mkdir(parents=True, exist_ok=True)
237
+ subprocess.run(["git", "init"], cwd=clone_dir, capture_output=True)
238
+ subprocess.run(
239
+ ["git", "remote", "add", "origin", wiki_url],
240
+ cwd=clone_dir,
241
+ capture_output=True,
242
+ )
243
+
244
+ # Copy wiki pages into clone
245
+ for md_file in wiki_dir.glob("*.md"):
246
+ if md_file.parent == wiki_dir:
247
+ dest = clone_dir / md_file.name
248
+ dest.write_text(md_file.read_text(encoding="utf-8"), encoding="utf-8")
249
+
250
+ # Commit and push
251
+ subprocess.run(["git", "add", "-A"], cwd=clone_dir, capture_output=True)
252
+ commit_result = subprocess.run(
253
+ ["git", "commit", "-m", message],
254
+ cwd=clone_dir,
255
+ capture_output=True,
256
+ text=True,
257
+ )
258
+ if commit_result.returncode != 0:
259
+ logger.info("No wiki changes to commit")
260
+ return True
261
+
262
+ push_result = subprocess.run(
263
+ ["git", "push", "origin", "master"],
264
+ cwd=clone_dir,
265
+ capture_output=True,
266
+ text=True,
267
+ )
268
+ if push_result.returncode != 0:
269
+ # Try main branch
270
+ push_result = subprocess.run(
271
+ ["git", "push", "origin", "main"],
272
+ cwd=clone_dir,
273
+ capture_output=True,
274
+ text=True,
275
+ )
276
+
277
+ if push_result.returncode == 0:
278
+ logger.info(f"Wiki pushed to {wiki_url}")
279
+ return True
280
+ else:
281
+ logger.error(f"Wiki push failed: {push_result.stderr}")
282
+ return False
283
+
284
+
285
+class WikiGeneratorSkill(Skill):
286
+ name = "wiki_generator"
287
+ description = "Generate a GitHub wiki from knowledge graph and artifacts"
288
+
289
+ def execute(self, context: AgentContext, **kwargs) -> Artifact:
290
+ kg_data = context.knowledge_graph.to_dict()
291
+ pages = generate_wiki(
292
+ kg_data,
293
+ artifacts=context.artifacts,
294
+ title=kwargs.get("title", "Knowledge Base"),
295
+ )
296
+
297
+ # Return a summary artifact; actual pages are written via write_wiki()
298
+ page_list = sorted(pages.keys())
299
+ summary_parts = [
300
+ f"Generated {len(pages)} wiki pages:",
301
+ "",
302
+ ]
303
+ for name in page_list:
304
+ summary_parts.append(f"- {name}.md")
305
+
306
+ return Artifact(
307
+ name="Wiki",
308
+ content="\n".join(summary_parts),
309
+ artifact_type="wiki",
310
+ format="markdown",
311
+ metadata={"pages": pages},
312
+ )
313
+
314
+
315
+register_skill(WikiGeneratorSkill())
--- a/video_processor/agent/skills/wiki_generator.py
+++ b/video_processor/agent/skills/wiki_generator.py
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/agent/skills/wiki_generator.py
+++ b/video_processor/agent/skills/wiki_generator.py
@@ -0,0 +1,315 @@
1 """Skill: Generate a GitHub wiki from knowledge graph and artifacts."""
2
import json
import logging
import shutil
import subprocess
from pathlib import Path
from typing import Dict, List, Optional

from video_processor.agent.skills.base import (
    AgentContext,
    Artifact,
    Skill,
    register_skill,
)
15
16 logger = logging.getLogger(__name__)
17
18
19 def _sanitize_filename(name: str) -> str:
20 """Convert entity name to a wiki-safe filename."""
21 return name.replace("/", "-").replace("\\", "-").replace(" ", "-").replace(".", "-")
22
23
def _wiki_link(name: str) -> str:
    """Render *name* as a markdown link targeting its sanitized wiki page."""
    return "[{}]({})".format(name, _sanitize_filename(name))
28
29
def generate_wiki(
    kg_data: dict,
    artifacts: Optional[List[Artifact]] = None,
    title: str = "Knowledge Base",
) -> Dict[str, str]:
    """Generate a dict of {filename: markdown_content} for a GitHub wiki.

    Returns pages for: Home, _Sidebar, entity type indexes, individual
    entity pages, and any planning artifacts.

    Args:
        kg_data: Knowledge-graph dict with ``nodes`` / ``relationships`` lists.
        artifacts: Optional planning artifacts rendered as extra pages.
        title: Heading used on the Home page and sidebar.

    Returns:
        Mapping of page name (without ``.md`` extension) to markdown body.
    """
    pages: Dict[str, str] = {}
    artifacts = artifacts or []

    nodes = kg_data.get("nodes", [])
    relationships = kg_data.get("relationships", [])

    # Group entities by type
    by_type: Dict[str, list] = {}
    # NOTE(review): node_lookup is populated but never read below.
    node_lookup: Dict[str, dict] = {}
    for node in nodes:
        name = node.get("name", node.get("id", ""))
        ntype = node.get("type", "concept")
        by_type.setdefault(ntype, []).append(node)
        node_lookup[name.lower()] = node

    # Build relationship index (outgoing and incoming per entity)
    outgoing: Dict[str, list] = {}
    incoming: Dict[str, list] = {}
    for rel in relationships:
        src = rel.get("source", "")
        tgt = rel.get("target", "")
        rtype = rel.get("type", "related_to")
        outgoing.setdefault(src, []).append((tgt, rtype))
        incoming.setdefault(tgt, []).append((src, rtype))

    # --- Home page ---
    home_parts = [
        f"# {title}",
        "",
        f"**{len(nodes)}** entities | **{len(relationships)}** relationships",
        "",
        "## Entity Types",
        "",
    ]
    for etype, elist in sorted(by_type.items()):
        home_parts.append(f"- {_wiki_link(etype.title())} ({len(elist)})")

    if artifacts:
        home_parts.append("")
        home_parts.append("## Planning Artifacts")
        home_parts.append("")
        for art in artifacts:
            safe = _sanitize_filename(art.name)
            home_parts.append(f"- [{art.name}]({safe})")

    pages["Home"] = "\n".join(home_parts)

    # --- Sidebar (_Sidebar.md is GitHub's reserved wiki navigation page) ---
    sidebar_parts = [f"**{title}**", "", "**Navigation**", "", "- [Home](Home)", ""]
    sidebar_parts.append("**Entity Types**")
    sidebar_parts.append("")
    for etype in sorted(by_type.keys()):
        sidebar_parts.append(f"- {_wiki_link(etype.title())}")

    if artifacts:
        sidebar_parts.append("")
        sidebar_parts.append("**Artifacts**")
        sidebar_parts.append("")
        for art in artifacts:
            safe = _sanitize_filename(art.name)
            sidebar_parts.append(f"- [{art.name}]({safe})")

    pages["_Sidebar"] = "\n".join(sidebar_parts)

    # --- Type index pages ---
    for etype, elist in sorted(by_type.items()):
        page_name = _sanitize_filename(etype.title())
        parts = [
            f"# {etype.title()}",
            "",
            f"{len(elist)} entities of type **{etype}**.",
            "",
            "| Entity | Descriptions |",
            "|--------|-------------|",
        ]
        for node in sorted(elist, key=lambda n: n.get("name", "")):
            name = node.get("name", "")
            descs = node.get("descriptions", [])
            # At most two descriptions so the table stays compact
            desc_text = "; ".join(descs[:2]) if descs else "—"
            parts.append(f"| {_wiki_link(name)} | {desc_text} |")

        pages[page_name] = "\n".join(parts)

    # --- Individual entity pages ---
    for node in nodes:
        name = node.get("name", "")
        if not name:
            # An unnamed node cannot become a page; skip it.
            continue
        ntype = node.get("type", "concept")
        descs = node.get("descriptions", [])
        page_name = _sanitize_filename(name)

        parts = [
            f"# {name}",
            "",
            f"**Type:** {ntype}",
            "",
        ]

        if descs:
            parts.append("## Descriptions")
            parts.append("")
            for d in descs:
                parts.append(f"- {d}")
            parts.append("")

        # Outgoing relationships
        outs = outgoing.get(name, [])
        if outs:
            parts.append("## Relationships")
            parts.append("")
            parts.append("| Target | Type |")
            parts.append("|--------|------|")
            for tgt, rtype in outs:
                parts.append(f"| {_wiki_link(tgt)} | {rtype} |")
            parts.append("")

        # Incoming relationships
        ins = incoming.get(name, [])
        if ins:
            parts.append("## Referenced By")
            parts.append("")
            parts.append("| Source | Type |")
            parts.append("|--------|------|")
            for src, rtype in ins:
                parts.append(f"| {_wiki_link(src)} | {rtype} |")
            parts.append("")

        # Occurrences / sources (where the entity was seen, if recorded)
        occs = node.get("occurrences", [])
        if occs:
            parts.append("## Sources")
            parts.append("")
            for occ in occs:
                src = occ.get("source", "unknown")
                ts = occ.get("timestamp", "")
                text = occ.get("text", "")
                line = f"- **{src}**"
                if ts:
                    line += f" @ {ts}"
                if text:
                    line += f": _{text}_"
                parts.append(line)
            parts.append("")

        pages[page_name] = "\n".join(parts)

    # --- Artifact pages ---
    for art in artifacts:
        page_name = _sanitize_filename(art.name)
        if art.format == "json":
            # Pretty-print valid JSON into a fenced block; fall back to raw
            # content when parsing fails.
            try:
                data = json.loads(art.content)
                content = f"```json\n{json.dumps(data, indent=2)}\n```"
            except json.JSONDecodeError:
                content = art.content
        else:
            content = art.content

        pages[page_name] = f"# {art.name}\n\n{content}"

    return pages
202
203
def write_wiki(pages: Dict[str, str], output_dir: Path) -> List[Path]:
    """Materialise *pages* as ``<name>.md`` files under *output_dir*.

    The directory is created on demand. Returns the written paths in
    page-dict iteration order.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    written: List[Path] = []
    for page_name, markdown in pages.items():
        target = output_dir / (page_name + ".md")
        target.write_text(markdown, encoding="utf-8")
        written.append(target)
    return written
213
214
def push_wiki(wiki_dir: Path, repo: str, message: str = "Update wiki") -> bool:
    """Push wiki pages to a GitHub wiki repo.

    Clones ``https://github.com/<repo>.wiki.git`` into a scratch directory
    inside *wiki_dir*, copies every top-level ``*.md`` page into it,
    commits, and pushes (trying ``master`` then ``main``).

    Args:
        wiki_dir: Directory containing the generated ``.md`` pages.
        repo: Target repository in ``owner/repo`` format.
        message: Commit message.

    Returns:
        True on success (including "nothing to commit"); False when the
        push failed on both branch names.
    """
    wiki_url = f"https://github.com/{repo}.wiki.git"

    # Start from a clean scratch clone every time. shutil.rmtree is
    # portable, unlike shelling out to `rm -rf`.
    clone_dir = wiki_dir / ".wiki_clone"
    if clone_dir.exists():
        shutil.rmtree(clone_dir)

    result = subprocess.run(
        ["git", "clone", wiki_url, str(clone_dir)],
        capture_output=True,
        text=True,
    )

    if result.returncode != 0:
        # Wiki might not exist yet — init a new repo pointing at the URL
        clone_dir.mkdir(parents=True, exist_ok=True)
        subprocess.run(["git", "init"], cwd=clone_dir, capture_output=True)
        subprocess.run(
            ["git", "remote", "add", "origin", wiki_url],
            cwd=clone_dir,
            capture_output=True,
        )

    # Copy top-level wiki pages into the clone (the parent check excludes
    # anything glob might return from elsewhere, e.g. inside the clone).
    for md_file in wiki_dir.glob("*.md"):
        if md_file.parent == wiki_dir:
            dest = clone_dir / md_file.name
            dest.write_text(md_file.read_text(encoding="utf-8"), encoding="utf-8")

    # Commit and push
    subprocess.run(["git", "add", "-A"], cwd=clone_dir, capture_output=True)
    commit_result = subprocess.run(
        ["git", "commit", "-m", message],
        cwd=clone_dir,
        capture_output=True,
        text=True,
    )
    if commit_result.returncode != 0:
        # `git commit` exits non-zero when the tree is unchanged; treat
        # "nothing to commit" as success.
        logger.info("No wiki changes to commit")
        return True

    push_result = subprocess.run(
        ["git", "push", "origin", "master"],
        cwd=clone_dir,
        capture_output=True,
        text=True,
    )
    if push_result.returncode != 0:
        # GitHub wikis may use `main` as the default branch
        push_result = subprocess.run(
            ["git", "push", "origin", "main"],
            cwd=clone_dir,
            capture_output=True,
            text=True,
        )

    if push_result.returncode == 0:
        logger.info("Wiki pushed to %s", wiki_url)
        return True
    logger.error("Wiki push failed: %s", push_result.stderr)
    return False
283
284
class WikiGeneratorSkill(Skill):
    """Produce GitHub wiki pages (in memory) from the agent context."""

    name = "wiki_generator"
    description = "Generate a GitHub wiki from knowledge graph and artifacts"

    def execute(self, context: AgentContext, **kwargs) -> Artifact:
        """Generate wiki pages and return a summary artifact listing them."""
        pages = generate_wiki(
            context.knowledge_graph.to_dict(),
            artifacts=context.artifacts,
            title=kwargs.get("title", "Knowledge Base"),
        )

        # Summary only — callers persist the pages via write_wiki().
        lines = [f"Generated {len(pages)} wiki pages:", ""]
        lines.extend(f"- {name}.md" for name in sorted(pages))

        return Artifact(
            name="Wiki",
            content="\n".join(lines),
            artifact_type="wiki",
            format="markdown",
            metadata={"pages": pages},
        )


register_skill(WikiGeneratorSkill())
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -1412,10 +1412,71 @@
14121412
click.echo(f" Chunks: {total_chunks}")
14131413
click.echo(f" Entities: {entity_count}")
14141414
click.echo(f" Relationships: {rel_count}")
14151415
click.echo(f" Knowledge graph: {kg_path}")
14161416
1417
+
1418
+@cli.group()
1419
+def wiki():
1420
+ """Generate and push GitHub wikis from knowledge graphs."""
1421
+ pass
1422
+
1423
+
1424
+@wiki.command("generate")
1425
+@click.argument("db_path", type=click.Path(exists=True))
1426
+@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory for wiki")
1427
+@click.option("--title", type=str, default="Knowledge Base", help="Wiki title")
1428
+def wiki_generate(db_path, output, title):
1429
+ """Generate a GitHub wiki from a knowledge graph.
1430
+
1431
+ Examples:
1432
+
1433
+ planopticon wiki generate knowledge_graph.db -o ./wiki
1434
+
1435
+ planopticon wiki generate results/kg.db --title "Project Wiki"
1436
+ """
1437
+ from video_processor.agent.skills.wiki_generator import generate_wiki, write_wiki
1438
+ from video_processor.integrators.knowledge_graph import KnowledgeGraph
1439
+
1440
+ db_path = Path(db_path)
1441
+ out_dir = Path(output) if output else Path.cwd() / "wiki"
1442
+
1443
+ kg = KnowledgeGraph(db_path=db_path)
1444
+ kg_data = kg.to_dict()
1445
+ pages = generate_wiki(kg_data, title=title)
1446
+ written = write_wiki(pages, out_dir)
1447
+
1448
+ click.echo(f"Generated {len(written)} wiki pages in {out_dir}")
1449
+ for p in sorted(written):
1450
+ click.echo(f" {p.name}")
1451
+
1452
+
1453
+@wiki.command("push")
1454
+@click.argument("wiki_dir", type=click.Path(exists=True))
1455
+@click.argument("repo", type=str)
1456
+@click.option("--message", "-m", type=str, default="Update wiki", help="Commit message")
1457
+def wiki_push(wiki_dir, repo, message):
1458
+ """Push generated wiki pages to a GitHub wiki repo.
1459
+
1460
+ REPO should be in 'owner/repo' format.
1461
+
1462
+ Examples:
1463
+
1464
+ planopticon wiki push ./wiki ConflictHQ/PlanOpticon
1465
+
1466
+ planopticon wiki push ./wiki owner/repo -m "Add entity pages"
1467
+ """
1468
+ from video_processor.agent.skills.wiki_generator import push_wiki
1469
+
1470
+ wiki_dir = Path(wiki_dir)
1471
+ success = push_wiki(wiki_dir, repo, message=message)
1472
+ if success:
1473
+ click.echo(f"Wiki pushed to https://github.com/{repo}/wiki")
1474
+ else:
1475
+ click.echo("Wiki push failed. Check auth and repo permissions.", err=True)
1476
+ sys.exit(1)
1477
+
14171478
14181479
@cli.group()
14191480
def kg():
14201481
"""Knowledge graph utilities: convert, sync, and inspect."""
14211482
pass
14221483
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -1412,10 +1412,71 @@
1412 click.echo(f" Chunks: {total_chunks}")
1413 click.echo(f" Entities: {entity_count}")
1414 click.echo(f" Relationships: {rel_count}")
1415 click.echo(f" Knowledge graph: {kg_path}")
1416
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1417
1418 @cli.group()
1419 def kg():
1420 """Knowledge graph utilities: convert, sync, and inspect."""
1421 pass
1422
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -1412,10 +1412,71 @@
1412 click.echo(f" Chunks: {total_chunks}")
1413 click.echo(f" Entities: {entity_count}")
1414 click.echo(f" Relationships: {rel_count}")
1415 click.echo(f" Knowledge graph: {kg_path}")
1416
1417
@cli.group()
def wiki():
    """Generate and push GitHub wikis from knowledge graphs."""
1422
1423
@wiki.command("generate")
@click.argument("db_path", type=click.Path(exists=True))
@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory for wiki")
@click.option("--title", type=str, default="Knowledge Base", help="Wiki title")
def wiki_generate(db_path, output, title):
    """Generate a GitHub wiki from a knowledge graph.

    Examples:

        planopticon wiki generate knowledge_graph.db -o ./wiki

        planopticon wiki generate results/kg.db --title "Project Wiki"
    """
    from video_processor.agent.skills.wiki_generator import generate_wiki, write_wiki
    from video_processor.integrators.knowledge_graph import KnowledgeGraph

    # Default the output directory to ./wiki under the current working dir.
    out_dir = Path(output) if output else Path.cwd() / "wiki"

    graph = KnowledgeGraph(db_path=Path(db_path))
    pages = generate_wiki(graph.to_dict(), title=title)
    written = write_wiki(pages, out_dir)

    click.echo(f"Generated {len(written)} wiki pages in {out_dir}")
    for page_path in sorted(written):
        click.echo(f"  {page_path.name}")
1451
1452
@wiki.command("push")
@click.argument("wiki_dir", type=click.Path(exists=True))
@click.argument("repo", type=str)
@click.option("--message", "-m", type=str, default="Update wiki", help="Commit message")
def wiki_push(wiki_dir, repo, message):
    """Push generated wiki pages to a GitHub wiki repo.

    REPO should be in 'owner/repo' format.

    Examples:

        planopticon wiki push ./wiki ConflictHQ/PlanOpticon

        planopticon wiki push ./wiki owner/repo -m "Add entity pages"
    """
    from video_processor.agent.skills.wiki_generator import push_wiki

    # push_wiki reports success/failure as a bool; exit non-zero on failure
    # so shell pipelines can detect it.
    if push_wiki(Path(wiki_dir), repo, message=message):
        click.echo(f"Wiki pushed to https://github.com/{repo}/wiki")
        return
    click.echo("Wiki push failed. Check auth and repo permissions.", err=True)
    sys.exit(1)
1477
1478
1479 @cli.group()
1480 def kg():
1481 """Knowledge graph utilities: convert, sync, and inspect."""
1482 pass
1483
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -1,36 +1,48 @@
1
-"""Cloud and web source integrations for fetching content from remote sources."""
1
+"""Cloud, web, and notes source integrations for fetching content from remote sources."""
22
33
from video_processor.sources.base import BaseSource, SourceFile
44
55
__all__ = [
66
"BaseSource",
77
"SourceFile",
8
+ "AppleNotesSource",
89
"ArxivSource",
910
"GitHubSource",
1011
"GoogleDriveSource",
12
+ "GoogleKeepSource",
13
+ "GWSSource",
1114
"HackerNewsSource",
15
+ "LogseqSource",
16
+ "M365Source",
17
+ "NotionSource",
18
+ "ObsidianSource",
19
+ "OneNoteSource",
1220
"PodcastSource",
1321
"RedditSource",
1422
"RSSSource",
1523
"TwitterSource",
16
- "GWSSource",
17
- "M365Source",
1824
"WebSource",
1925
"YouTubeSource",
2026
]
2127
2228
2329
def __getattr__(name: str):
2430
"""Lazy imports to avoid pulling in optional dependencies at import time."""
2531
_lazy_map = {
32
+ "AppleNotesSource": "video_processor.sources.apple_notes_source",
2633
"ArxivSource": "video_processor.sources.arxiv_source",
2734
"GitHubSource": "video_processor.sources.github_source",
2835
"GoogleDriveSource": "video_processor.sources.google_drive",
36
+ "GoogleKeepSource": "video_processor.sources.google_keep_source",
2937
"GWSSource": "video_processor.sources.gws_source",
30
- "M365Source": "video_processor.sources.m365_source",
3138
"HackerNewsSource": "video_processor.sources.hackernews_source",
39
+ "LogseqSource": "video_processor.sources.logseq_source",
40
+ "M365Source": "video_processor.sources.m365_source",
41
+ "NotionSource": "video_processor.sources.notion_source",
42
+ "ObsidianSource": "video_processor.sources.obsidian_source",
43
+ "OneNoteSource": "video_processor.sources.onenote_source",
3244
"PodcastSource": "video_processor.sources.podcast_source",
3345
"RedditSource": "video_processor.sources.reddit_source",
3446
"RSSSource": "video_processor.sources.rss_source",
3547
"TwitterSource": "video_processor.sources.twitter_source",
3648
"WebSource": "video_processor.sources.web_source",
3749
3850
ADDED video_processor/sources/apple_notes_source.py
3951
ADDED video_processor/sources/google_keep_source.py
4052
ADDED video_processor/sources/logseq_source.py
4153
ADDED video_processor/sources/notion_source.py
4254
ADDED video_processor/sources/obsidian_source.py
4355
ADDED video_processor/sources/onenote_source.py
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -1,36 +1,48 @@
1 """Cloud and web source integrations for fetching content from remote sources."""
2
3 from video_processor.sources.base import BaseSource, SourceFile
4
5 __all__ = [
6 "BaseSource",
7 "SourceFile",
 
8 "ArxivSource",
9 "GitHubSource",
10 "GoogleDriveSource",
 
 
11 "HackerNewsSource",
 
 
 
 
 
12 "PodcastSource",
13 "RedditSource",
14 "RSSSource",
15 "TwitterSource",
16 "GWSSource",
17 "M365Source",
18 "WebSource",
19 "YouTubeSource",
20 ]
21
22
23 def __getattr__(name: str):
24 """Lazy imports to avoid pulling in optional dependencies at import time."""
25 _lazy_map = {
 
26 "ArxivSource": "video_processor.sources.arxiv_source",
27 "GitHubSource": "video_processor.sources.github_source",
28 "GoogleDriveSource": "video_processor.sources.google_drive",
 
29 "GWSSource": "video_processor.sources.gws_source",
30 "M365Source": "video_processor.sources.m365_source",
31 "HackerNewsSource": "video_processor.sources.hackernews_source",
 
 
 
 
 
32 "PodcastSource": "video_processor.sources.podcast_source",
33 "RedditSource": "video_processor.sources.reddit_source",
34 "RSSSource": "video_processor.sources.rss_source",
35 "TwitterSource": "video_processor.sources.twitter_source",
36 "WebSource": "video_processor.sources.web_source",
37
38 ADDED video_processor/sources/apple_notes_source.py
39 ADDED video_processor/sources/google_keep_source.py
40 ADDED video_processor/sources/logseq_source.py
41 ADDED video_processor/sources/notion_source.py
42 ADDED video_processor/sources/obsidian_source.py
43 ADDED video_processor/sources/onenote_source.py
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -1,36 +1,48 @@
1 """Cloud, web, and notes source integrations for fetching content from remote sources."""
2
3 from video_processor.sources.base import BaseSource, SourceFile
4
# Public API of the sources package; every name here is resolved lazily via
# the module-level __getattr__ so optional dependencies are not imported
# until the corresponding source class is first accessed.
__all__ = [
    "BaseSource",
    "SourceFile",
    "AppleNotesSource",
    "ArxivSource",
    "GitHubSource",
    "GoogleDriveSource",
    "GoogleKeepSource",
    "GWSSource",
    "HackerNewsSource",
    "LogseqSource",
    "M365Source",
    "NotionSource",
    "ObsidianSource",
    "OneNoteSource",
    "PodcastSource",
    "RedditSource",
    "RSSSource",
    "TwitterSource",
    "WebSource",
    "YouTubeSource",
]
27
28
29 def __getattr__(name: str):
30 """Lazy imports to avoid pulling in optional dependencies at import time."""
31 _lazy_map = {
32 "AppleNotesSource": "video_processor.sources.apple_notes_source",
33 "ArxivSource": "video_processor.sources.arxiv_source",
34 "GitHubSource": "video_processor.sources.github_source",
35 "GoogleDriveSource": "video_processor.sources.google_drive",
36 "GoogleKeepSource": "video_processor.sources.google_keep_source",
37 "GWSSource": "video_processor.sources.gws_source",
 
38 "HackerNewsSource": "video_processor.sources.hackernews_source",
39 "LogseqSource": "video_processor.sources.logseq_source",
40 "M365Source": "video_processor.sources.m365_source",
41 "NotionSource": "video_processor.sources.notion_source",
42 "ObsidianSource": "video_processor.sources.obsidian_source",
43 "OneNoteSource": "video_processor.sources.onenote_source",
44 "PodcastSource": "video_processor.sources.podcast_source",
45 "RedditSource": "video_processor.sources.reddit_source",
46 "RSSSource": "video_processor.sources.rss_source",
47 "TwitterSource": "video_processor.sources.twitter_source",
48 "WebSource": "video_processor.sources.web_source",
49
50 ADDED video_processor/sources/apple_notes_source.py
51 ADDED video_processor/sources/google_keep_source.py
52 ADDED video_processor/sources/logseq_source.py
53 ADDED video_processor/sources/notion_source.py
54 ADDED video_processor/sources/obsidian_source.py
55 ADDED video_processor/sources/onenote_source.py
--- a/video_processor/sources/apple_notes_source.py
+++ b/video_processor/sources/apple_notes_source.py
@@ -0,0 +1,178 @@
1
+"""Apple Notes source connector via osascript (macOS only)."""
2
+
3
+import logging
4
+import re
5
+import subprocess
6
+import sys
7
+from pathlib import Path
8
+from typing import List, Optional
9
+
10
+from video_processor.sources.base import BaseSource, SourceFile
11
+
12
+logger = logging.getLogger(__name__)
13
+
14
+
15
+class AppleNotesSource(BaseSource):
16
+ """
17
+ Fetch notes from Apple Notes using osascript (AppleScript).
18
+
19
+ Only works on macOS. Requires the Notes app to be available
20
+ and permission for osascript to access it.
21
+ """
22
+
23
+ def __init__(self, folder: Optional[str] = None):
24
+ self.folder = folder
25
+
26
+ def authenticate(self) -> bool:
27
+ """Check that we are running on macOS."""
28
+ if sys.platform != "darwin":
29
+ logger.error("Apple Notes is only available on macOS (current: %s)", sys.platform)
30
+ return False
31
+ return True
32
+
33
+ def list_videos(
34
+ self,
35
+ folder_id: Optional[str] = None,
36
+ folder_path: Optional[str] = None,
37
+ patterns: Optional[List[str]] = None,
38
+ ) -> List[SourceFile]:
39
+ """List notes from Apple Notes via osascript."""
40
+ if not self.authenticate():
41
+ return []
42
+
43
+ if self.folder:
44
+ script = (
45
+ 'tell application "Notes"\n'
46
+ " set noteList to {}\n"
47
+ f" repeat with f in folders of default account\n"
48
+ f' if name of f is "{self.folder}" then\n'
49
+ " repeat with n in notes of f\n"
50
+ ' set end of noteList to (id of n) & "|||" & (name of n)\n'
51
+ " end repeat\n"
52
+ " end if\n"
53
+ " end repeat\n"
54
+ " return noteList\n"
55
+ "end tell"
56
+ )
57
+ else:
58
+ script = (
59
+ 'tell application "Notes"\n'
60
+ " set noteList to {}\n"
61
+ " repeat with n in notes of default account\n"
62
+ ' set end of noteList to (id of n) & "|||" & (name of n)\n'
63
+ " end repeat\n"
64
+ " return noteList\n"
65
+ "end tell"
66
+ )
67
+
68
+ try:
69
+ result = subprocess.run(
70
+ ["osascript", "-e", script],
71
+ capture_output=True,
72
+ text=True,
73
+ timeout=30,
74
+ )
75
+ except FileNotFoundError:
76
+ logger.error("osascript not found. Apple Notes requires macOS.")
77
+ return []
78
+ except subprocess.TimeoutExpired:
79
+ logger.error("osascript timed out while listing notes.")
80
+ return []
81
+
82
+ if result.returncode != 0:
83
+ logger.error("Failed to list notes: %s", result.stderr.strip())
84
+ return []
85
+
86
+ return self._parse_note_list(result.stdout.strip())
87
+
88
+ def _parse_note_list(self, output: str) -> List[SourceFile]:
89
+ """Parse osascript output into SourceFile objects.
90
+
91
+ Expected format: comma-separated items of 'id|||name' pairs.
92
+ """
93
+ files: List[SourceFile] = []
94
+ if not output:
95
+ return files
96
+
97
+ # AppleScript returns a flat comma-separated list
98
+ entries = output.split(", ")
99
+ for entry in entries:
100
+ entry = entry.strip()
101
+ if "|||" not in entry:
102
+ continue
103
+ note_id, _, name = entry.partition("|||")
104
+ note_id = note_id.strip()
105
+ name = name.strip()
106
+ if note_id and name:
107
+ files.append(
108
+ SourceFile(
109
+ name=name,
110
+ id=note_id,
111
+ mime_type="text/plain",
112
+ )
113
+ )
114
+
115
+ logger.info("Found %d notes", len(files))
116
+ return files
117
+
118
+ def download(self, file: SourceFile, destination: Path) -> Path:
119
+ """Download a note's content as plain text."""
120
+ destination = Path(destination)
121
+ destination.parent.mkdir(parents=True, exist_ok=True)
122
+
123
+ script = (
124
+ 'tell application "Notes"\n'
125
+ f' set theNote to note id "{file.id}" of default account\n'
126
+ " return body of theNote\n"
127
+ "end tell"
128
+ )
129
+
130
+ try:
131
+ result = subprocess.run(
132
+ ["osascript", "-e", script],
133
+ capture_output=True,
134
+ text=True,
135
+ timeout=30,
136
+ )
137
+ except FileNotFoundError:
138
+ raise RuntimeError("osascript not found. Apple Notes requires macOS.")
139
+ except subprocess.TimeoutExpired:
140
+ raise RuntimeError(f"osascript timed out fetching note {file.id}")
141
+
142
+ if result.returncode != 0:
143
+ raise RuntimeError(f"Failed to fetch note {file.id}: {result.stderr.strip()}")
144
+
145
+ html_body = result.stdout.strip()
146
+ text = self._html_to_text(html_body)
147
+
148
+ # Prepend title
149
+ content = f"# {file.name}\n\n{text}"
150
+ destination.write_text(content, encoding="utf-8")
151
+ logger.info("Saved Apple Note to %s", destination)
152
+ return destination
153
+
154
+ @staticmethod
155
+ def _html_to_text(html: str) -> str:
156
+ """Strip HTML tags and return plain text.
157
+
158
+ Apple Notes returns note bodies as HTML. This uses regex-based
159
+ stripping similar to web_source._strip_html_tags.
160
+ """
161
+ if not html:
162
+ return ""
163
+ # Replace <br> variants with newlines
164
+ text = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
165
+ # Replace block-level closing tags with newlines
166
+ text = re.sub(r"</(?:p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE)
167
+ # Remove all remaining tags
168
+ text = re.sub(r"<[^>]+>", "", text)
169
+ # Decode common HTML entities
170
+ text = text.replace("&amp;", "&")
171
+ text = text.replace("&lt;", "<")
172
+ text = text.replace("&gt;", ">")
173
+ text = text.replace("&quot;", '"')
174
+ text = text.replace("&#39;", "'")
175
+ text = text.replace("&nbsp;", " ")
176
+ # Collapse excessive blank lines
177
+ text = re.sub(r"\n{3,}", "\n\n", text)
178
+ return text.strip()
--- a/video_processor/sources/apple_notes_source.py
+++ b/video_processor/sources/apple_notes_source.py
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/apple_notes_source.py
+++ b/video_processor/sources/apple_notes_source.py
@@ -0,0 +1,178 @@
1 """Apple Notes source connector via osascript (macOS only)."""
2
3 import logging
4 import re
5 import subprocess
6 import sys
7 from pathlib import Path
8 from typing import List, Optional
9
10 from video_processor.sources.base import BaseSource, SourceFile
11
12 logger = logging.getLogger(__name__)
13
14
class AppleNotesSource(BaseSource):
    """
    Fetch notes from Apple Notes using osascript (AppleScript).

    Only works on macOS. Requires the Notes app to be available
    and permission for osascript to access it.
    """

    def __init__(self, folder: Optional[str] = None):
        # When set, only notes inside this folder (matched by name) are listed.
        self.folder = folder

    @staticmethod
    def _escape_applescript(value: str) -> str:
        """Escape backslashes and double quotes for a double-quoted AppleScript literal.

        Prevents folder names or note ids containing quotes from breaking
        (or injecting commands into) the generated script.
        """
        return value.replace("\\", "\\\\").replace('"', '\\"')

    def authenticate(self) -> bool:
        """Check that we are running on macOS (osascript is macOS-only)."""
        if sys.platform != "darwin":
            logger.error("Apple Notes is only available on macOS (current: %s)", sys.platform)
            return False
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List notes from Apple Notes via osascript.

        Returns an empty list on any failure (not macOS, osascript missing,
        timeout, or a non-zero exit). The folder_id/folder_path/patterns
        parameters belong to the BaseSource interface and are unused here.
        """
        if not self.authenticate():
            return []

        if self.folder:
            # Escape the folder name before embedding it in the script.
            folder_name = self._escape_applescript(self.folder)
            script = (
                'tell application "Notes"\n'
                "    set noteList to {}\n"
                "    repeat with f in folders of default account\n"
                f'        if name of f is "{folder_name}" then\n'
                "            repeat with n in notes of f\n"
                '                set end of noteList to (id of n) & "|||" & (name of n)\n'
                "            end repeat\n"
                "        end if\n"
                "    end repeat\n"
                "    return noteList\n"
                "end tell"
            )
        else:
            script = (
                'tell application "Notes"\n'
                "    set noteList to {}\n"
                "    repeat with n in notes of default account\n"
                '        set end of noteList to (id of n) & "|||" & (name of n)\n'
                "    end repeat\n"
                "    return noteList\n"
                "end tell"
            )

        try:
            result = subprocess.run(
                ["osascript", "-e", script],
                capture_output=True,
                text=True,
                timeout=30,
            )
        except FileNotFoundError:
            logger.error("osascript not found. Apple Notes requires macOS.")
            return []
        except subprocess.TimeoutExpired:
            logger.error("osascript timed out while listing notes.")
            return []

        if result.returncode != 0:
            logger.error("Failed to list notes: %s", result.stderr.strip())
            return []

        return self._parse_note_list(result.stdout.strip())

    def _parse_note_list(self, output: str) -> List[SourceFile]:
        """Parse osascript output into SourceFile objects.

        Expected format: comma-separated items of 'id|||name' pairs.

        NOTE(review): AppleScript flattens the returned list with ", "
        separators, so a note *name* that itself contains ", " will be
        truncated at the first separator; note ids (which precede the
        '|||' delimiter) are unaffected.
        """
        files: List[SourceFile] = []
        if not output:
            return files

        # AppleScript returns a flat comma-separated list
        for entry in output.split(", "):
            entry = entry.strip()
            if "|||" not in entry:
                continue
            note_id, _, name = entry.partition("|||")
            note_id = note_id.strip()
            name = name.strip()
            if note_id and name:
                files.append(
                    SourceFile(
                        name=name,
                        id=note_id,
                        mime_type="text/plain",
                    )
                )

        logger.info("Found %d notes", len(files))
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a note's body as plain text, with the title as a heading.

        Raises RuntimeError if osascript is missing, times out, or exits
        non-zero.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        # Escape the id before embedding it in the script.
        note_id = self._escape_applescript(str(file.id))
        script = (
            'tell application "Notes"\n'
            f'    set theNote to note id "{note_id}" of default account\n'
            "    return body of theNote\n"
            "end tell"
        )

        try:
            result = subprocess.run(
                ["osascript", "-e", script],
                capture_output=True,
                text=True,
                timeout=30,
            )
        except FileNotFoundError:
            raise RuntimeError("osascript not found. Apple Notes requires macOS.")
        except subprocess.TimeoutExpired:
            raise RuntimeError(f"osascript timed out fetching note {file.id}")

        if result.returncode != 0:
            raise RuntimeError(f"Failed to fetch note {file.id}: {result.stderr.strip()}")

        html_body = result.stdout.strip()
        text = self._html_to_text(html_body)

        # Prepend title as a Markdown heading
        content = f"# {file.name}\n\n{text}"
        destination.write_text(content, encoding="utf-8")
        logger.info("Saved Apple Note to %s", destination)
        return destination

    @staticmethod
    def _html_to_text(html: str) -> str:
        """Strip HTML tags and return plain text.

        Apple Notes returns note bodies as HTML. This uses regex-based
        stripping similar to web_source._strip_html_tags.
        """
        if not html:
            return ""
        # Replace <br> variants with newlines
        text = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
        # Replace block-level closing tags with newlines
        text = re.sub(r"</(?:p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE)
        # Remove all remaining tags
        text = re.sub(r"<[^>]+>", "", text)
        # Decode common HTML entities. "&amp;" MUST be decoded last,
        # otherwise escaped entities such as "&amp;lt;" would be
        # double-unescaped into "<" instead of the literal "&lt;".
        text = text.replace("&lt;", "<")
        text = text.replace("&gt;", ">")
        text = text.replace("&quot;", '"')
        text = text.replace("&#39;", "'")
        text = text.replace("&nbsp;", " ")
        text = text.replace("&amp;", "&")
        # Collapse excessive blank lines
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()
--- a/video_processor/sources/google_keep_source.py
+++ b/video_processor/sources/google_keep_source.py
@@ -0,0 +1,170 @@
1
+"""Google Keep source connector using the gws CLI (googleworkspace/cli).
2
+
3
+Fetches notes from Google Keep via the `gws` CLI tool.
4
+Outputs plain text suitable for KG ingestion.
5
+
6
+Requires: npm install -g @googleworkspace/cli
7
+Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless)
8
+"""
9
+
10
+import json
11
+import logging
12
+import shutil
13
+import subprocess
14
+from pathlib import Path
15
+from typing import Any, Dict, List, Optional
16
+
17
+from video_processor.sources.base import BaseSource, SourceFile
18
+
19
+logger = logging.getLogger(__name__)
20
+
21
+
22
+def _run_gws(args: List[str], timeout: int = 30) -> Dict[str, Any]:
23
+ """Run a gws CLI command and return parsed JSON output."""
24
+ cmd = ["gws"] + args
25
+ proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
26
+ if proc.returncode != 0:
27
+ raise RuntimeError(f"gws {' '.join(args)} failed: {proc.stderr.strip()}")
28
+ try:
29
+ return json.loads(proc.stdout)
30
+ except json.JSONDecodeError:
31
+ return {"raw": proc.stdout.strip()}
32
+
33
+
34
+def _note_to_text(note: dict) -> str:
35
+ """Extract text content from a Google Keep note structure.
36
+
37
+ Handles plain text notes and checklists. Checklist items are formatted
38
+ as ``- [x] item`` (checked) or ``- [ ] item`` (unchecked).
39
+ """
40
+ parts: List[str] = []
41
+
42
+ title = note.get("title", "").strip()
43
+ if title:
44
+ parts.append(title)
45
+
46
+ body = note.get("body", note.get("textContent", "")).strip()
47
+ if body:
48
+ parts.append(body)
49
+
50
+ # Checklist items may appear under "list", "listContent", or "checklistItems"
51
+ list_items = note.get("list", note.get("listContent", note.get("checklistItems", [])))
52
+ if isinstance(list_items, list):
53
+ for item in list_items:
54
+ text = item.get("text", "").strip()
55
+ if not text:
56
+ continue
57
+ checked = item.get("checked", item.get("isChecked", False))
58
+ marker = "[x]" if checked else "[ ]"
59
+ parts.append(f"- {marker} {text}")
60
+
61
+ return "\n\n".join(parts) if parts else ""
62
+
63
+
64
+class GoogleKeepSource(BaseSource):
65
+ """
66
+ Fetch notes from Google Keep via the gws CLI.
67
+
68
+ Usage:
69
+ source = GoogleKeepSource() # all notes
70
+ source = GoogleKeepSource(label="meetings") # filter by label
71
+ files = source.list_videos()
72
+ source.download_all(files, Path("./notes"))
73
+ """
74
+
75
+ def __init__(self, label: Optional[str] = None):
76
+ self.label = label
77
+
78
+ def authenticate(self) -> bool:
79
+ """Check if gws CLI is installed and authenticated."""
80
+ if not shutil.which("gws"):
81
+ logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
82
+ return False
83
+ try:
84
+ _run_gws(["auth", "status"], timeout=10)
85
+ return True
86
+ except (RuntimeError, subprocess.TimeoutExpired):
87
+ logger.error("gws not authenticated. Run: gws auth login")
88
+ return False
89
+
90
+ def list_videos(
91
+ self,
92
+ folder_id: Optional[str] = None,
93
+ folder_path: Optional[str] = None,
94
+ patterns: Optional[List[str]] = None,
95
+ ) -> List[SourceFile]:
96
+ """List notes in Google Keep. Returns SourceFile per note."""
97
+ args = ["keep", "notes", "list", "--output", "json"]
98
+
99
+ if self.label:
100
+ args.extend(["--label", self.label])
101
+
102
+ try:
103
+ result = _run_gws(args, timeout=60)
104
+ except RuntimeError as e:
105
+ logger.error(f"Failed to list Keep notes: {e}")
106
+ return []
107
+
108
+ # Result may be a list directly or nested under a key
109
+ notes: List[dict] = []
110
+ if isinstance(result, list):
111
+ notes = result
112
+ elif isinstance(result, dict):
113
+ notes = result.get("notes", result.get("items", []))
114
+ # If we got a single note back (not a list), wrap it
115
+ if not notes and "id" in result and "raw" not in result:
116
+ notes = [result]
117
+
118
+ files: List[SourceFile] = []
119
+ for note in notes:
120
+ note_id = note.get("id", note.get("noteId", ""))
121
+ title = note.get("title", "Untitled Note").strip() or "Untitled Note"
122
+ modified = note.get("modifiedTime", note.get("updateTime"))
123
+
124
+ # Estimate size from text content
125
+ text = _note_to_text(note)
126
+ size = len(text.encode("utf-8")) if text else None
127
+
128
+ files.append(
129
+ SourceFile(
130
+ name=title,
131
+ id=str(note_id),
132
+ size_bytes=size,
133
+ mime_type="text/plain",
134
+ modified_at=modified,
135
+ )
136
+ )
137
+
138
+ logger.info(f"Found {len(files)} note(s) in Google Keep")
139
+ return files
140
+
141
+ def download(self, file: SourceFile, destination: Path) -> Path:
142
+ """Download a Keep note's content as a text file."""
143
+ destination = Path(destination)
144
+ destination.parent.mkdir(parents=True, exist_ok=True)
145
+
146
+ try:
147
+ result = _run_gws(
148
+ [
149
+ "keep",
150
+ "notes",
151
+ "get",
152
+ "--params",
153
+ json.dumps({"noteId": file.id}),
154
+ ],
155
+ timeout=30,
156
+ )
157
+ except RuntimeError as e:
158
+ raise RuntimeError(f"Failed to fetch Keep note {file.id}: {e}") from e
159
+
160
+ # result may be the note dict directly or wrapped
161
+ note = result if isinstance(result, dict) else {}
162
+ text = _note_to_text(note)
163
+
164
+ if not text:
165
+ # Fallback: use raw output if structured extraction yielded nothing
166
+ text = note.get("raw", json.dumps(note, indent=2))
167
+
168
+ destination.write_text(text, encoding="utf-8")
169
+ logger.info(f"Saved note '{file.name}' to {destination}")
170
+ return destination
--- a/video_processor/sources/google_keep_source.py
+++ b/video_processor/sources/google_keep_source.py
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/google_keep_source.py
+++ b/video_processor/sources/google_keep_source.py
@@ -0,0 +1,170 @@
1 """Google Keep source connector using the gws CLI (googleworkspace/cli).
2
3 Fetches notes from Google Keep via the `gws` CLI tool.
4 Outputs plain text suitable for KG ingestion.
5
6 Requires: npm install -g @googleworkspace/cli
7 Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless)
8 """
9
10 import json
11 import logging
12 import shutil
13 import subprocess
14 from pathlib import Path
15 from typing import Any, Dict, List, Optional
16
17 from video_processor.sources.base import BaseSource, SourceFile
18
19 logger = logging.getLogger(__name__)
20
21
def _run_gws(args: List[str], timeout: int = 30) -> Dict[str, Any]:
    """Run a gws CLI command and return parsed JSON output.

    Args:
        args: Arguments passed to the ``gws`` executable.
        timeout: Seconds to wait before aborting the subprocess.

    Returns:
        Parsed JSON from stdout, or ``{"raw": <stdout>}`` when stdout is
        not valid JSON (some gws subcommands emit plain text).

    Raises:
        RuntimeError: If the CLI is missing, times out, or exits non-zero.
    """
    cmd = ["gws"] + args
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except FileNotFoundError as exc:
        # Fix: normalize "binary not on PATH" into the documented
        # RuntimeError contract so callers need a single except clause.
        raise RuntimeError("gws CLI not found on PATH") from exc
    except subprocess.TimeoutExpired as exc:
        # Fix: previously TimeoutExpired leaked past callers that only
        # catch RuntimeError (e.g. list_videos).
        raise RuntimeError(f"gws {' '.join(args)} timed out after {timeout}s") from exc
    if proc.returncode != 0:
        raise RuntimeError(f"gws {' '.join(args)} failed: {proc.stderr.strip()}")
    try:
        return json.loads(proc.stdout)
    except json.JSONDecodeError:
        # Preserve non-JSON output for callers instead of failing.
        return {"raw": proc.stdout.strip()}
32
33
34 def _note_to_text(note: dict) -> str:
35 """Extract text content from a Google Keep note structure.
36
37 Handles plain text notes and checklists. Checklist items are formatted
38 as ``- [x] item`` (checked) or ``- [ ] item`` (unchecked).
39 """
40 parts: List[str] = []
41
42 title = note.get("title", "").strip()
43 if title:
44 parts.append(title)
45
46 body = note.get("body", note.get("textContent", "")).strip()
47 if body:
48 parts.append(body)
49
50 # Checklist items may appear under "list", "listContent", or "checklistItems"
51 list_items = note.get("list", note.get("listContent", note.get("checklistItems", [])))
52 if isinstance(list_items, list):
53 for item in list_items:
54 text = item.get("text", "").strip()
55 if not text:
56 continue
57 checked = item.get("checked", item.get("isChecked", False))
58 marker = "[x]" if checked else "[ ]"
59 parts.append(f"- {marker} {text}")
60
61 return "\n\n".join(parts) if parts else ""
62
63
class GoogleKeepSource(BaseSource):
    """
    Fetch notes from Google Keep via the gws CLI.

    Usage:
        source = GoogleKeepSource()                  # all notes
        source = GoogleKeepSource(label="meetings")  # filter by label
        files = source.list_videos()
        source.download_all(files, Path("./notes"))
    """

    # Everything a gws invocation can surface: _run_gws raises RuntimeError
    # for non-zero exits, while subprocess itself may raise TimeoutExpired
    # or FileNotFoundError (CLI missing). Catch all three uniformly.
    _CLI_ERRORS = (RuntimeError, subprocess.TimeoutExpired, FileNotFoundError)

    def __init__(self, label: Optional[str] = None):
        # Optional Keep label used to filter results in list_videos().
        self.label = label

    def authenticate(self) -> bool:
        """Check if gws CLI is installed and authenticated."""
        if not shutil.which("gws"):
            logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
            return False
        try:
            _run_gws(["auth", "status"], timeout=10)
            return True
        except self._CLI_ERRORS:
            logger.error("gws not authenticated. Run: gws auth login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List notes in Google Keep. Returns one SourceFile per note.

        folder_id/folder_path/patterns are unused; they exist for
        BaseSource interface parity.
        """
        args = ["keep", "notes", "list", "--output", "json"]

        if self.label:
            args.extend(["--label", self.label])

        try:
            result = _run_gws(args, timeout=60)
        except self._CLI_ERRORS as e:
            # Fix: previously only RuntimeError was caught, so a CLI
            # timeout or a missing binary crashed instead of returning [].
            logger.error(f"Failed to list Keep notes: {e}")
            return []

        # Result may be a list directly or nested under a key
        notes: List[dict] = []
        if isinstance(result, list):
            notes = result
        elif isinstance(result, dict):
            notes = result.get("notes", result.get("items", []))
            # If we got a single note back (not a list), wrap it
            if not notes and "id" in result and "raw" not in result:
                notes = [result]

        # Fix: guard against a non-list payload under "notes"/"items".
        if not isinstance(notes, list):
            notes = []

        files: List[SourceFile] = []
        for note in notes:
            if not isinstance(note, dict):
                continue  # skip malformed entries
            note_id = note.get("id", note.get("noteId", ""))
            # Fix: `title` may be present but null in the payload; `or`
            # handles both a missing key and a None/empty value without
            # crashing on .strip().
            title = (note.get("title") or "").strip() or "Untitled Note"
            modified = note.get("modifiedTime", note.get("updateTime"))

            # Estimate size from text content
            text = _note_to_text(note)
            size = len(text.encode("utf-8")) if text else None

            files.append(
                SourceFile(
                    name=title,
                    id=str(note_id),
                    size_bytes=size,
                    mime_type="text/plain",
                    modified_at=modified,
                )
            )

        logger.info(f"Found {len(files)} note(s) in Google Keep")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a Keep note's content as a text file.

        Raises:
            RuntimeError: If the note cannot be fetched via the CLI.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = _run_gws(
                [
                    "keep",
                    "notes",
                    "get",
                    "--params",
                    json.dumps({"noteId": file.id}),
                ],
                timeout=30,
            )
        except self._CLI_ERRORS as e:
            # Fix: normalize timeouts / missing CLI into the documented
            # RuntimeError instead of leaking subprocess exceptions.
            raise RuntimeError(f"Failed to fetch Keep note {file.id}: {e}") from e

        # result may be the note dict directly or wrapped
        note = result if isinstance(result, dict) else {}
        text = _note_to_text(note)

        if not text:
            # Fallback: use raw output if structured extraction yielded nothing
            text = note.get("raw", json.dumps(note, indent=2))

        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved note '{file.name}' to {destination}")
        return destination
--- a/video_processor/sources/logseq_source.py
+++ b/video_processor/sources/logseq_source.py
@@ -0,0 +1,200 @@
1
+"""Logseq graph source connector for ingesting markdown pages and journals."""
2
+
3
+import logging
4
+import re
5
+import shutil
6
+from datetime import datetime, timezone
7
+from pathlib import Path
8
+from typing import List, Optional, Tuple
9
+
10
+from video_processor.sources.base import BaseSource, SourceFile
11
+
12
+logger = logging.getLogger(__name__)
13
+
14
+
15
+def parse_page(path: Path) -> dict:
16
+ """Parse a Logseq markdown page and extract structured content.
17
+
18
+ Returns a dict with:
19
+ - properties: dict of page-level properties (key:: value lines at top)
20
+ - links: list of linked page names from [[wiki-links]]
21
+ - tags: list of tags from #tag and #[[tag]] occurrences
22
+ - block_refs: list of block reference IDs from ((block-id))
23
+ - body: full text content
24
+ """
25
+ text = path.read_text(encoding="utf-8")
26
+ lines = text.split("\n")
27
+
28
+ # Extract page properties (key:: value lines at the top of the file)
29
+ properties: dict = {}
30
+ body_start = 0
31
+ for i, line in enumerate(lines):
32
+ prop_match = re.match(r"^([A-Za-z][A-Za-z0-9_-]*)::\ ?(.*)", line)
33
+ if prop_match:
34
+ key = prop_match.group(1)
35
+ value = prop_match.group(2).strip()
36
+ properties[key] = value
37
+ body_start = i + 1
38
+ else:
39
+ break
40
+
41
+ body = "\n".join(lines[body_start:])
42
+
43
+ # Extract wiki-links: [[page]]
44
+ link_pattern = re.compile(r"\[\[([^\]]+)\]\]")
45
+ links = link_pattern.findall(body)
46
+ # Also pick up links from properties
47
+ for value in properties.values():
48
+ links.extend(link_pattern.findall(str(value)))
49
+
50
+ # Extract tags: #tag and #[[tag]]
51
+ # First get #[[multi word tag]] style
52
+ bracket_tag_pattern = re.compile(r"#\[\[([^\]]+)\]\]")
53
+ tags = bracket_tag_pattern.findall(text)
54
+ # Then get simple #tag style (exclude matches already captured as #[[...]])
55
+ # Remove bracket tags first to avoid double-matching
56
+ text_without_bracket_tags = bracket_tag_pattern.sub("", text)
57
+ simple_tag_pattern = re.compile(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)")
58
+ tags.extend(simple_tag_pattern.findall(text_without_bracket_tags))
59
+
60
+ # Extract block references: ((block-id))
61
+ block_ref_pattern = re.compile(r"\(\(([a-f0-9-]+)\)\)")
62
+ block_refs = block_ref_pattern.findall(text)
63
+
64
+ return {
65
+ "properties": properties,
66
+ "links": links,
67
+ "tags": tags,
68
+ "block_refs": block_refs,
69
+ "body": body,
70
+ }
71
+
72
+
73
+def ingest_graph(graph_path: Path) -> dict:
74
+ """Ingest an entire Logseq graph and return structured data.
75
+
76
+ Returns a dict with:
77
+ - notes: list of dicts with name, tags, frontmatter (properties), text
78
+ - links: list of (source, target) tuples from wiki-links
79
+ """
80
+ graph_path = Path(graph_path)
81
+ notes: List[dict] = []
82
+ links: List[Tuple[str, str]] = []
83
+
84
+ md_files: List[Path] = []
85
+ pages_dir = graph_path / "pages"
86
+ journals_dir = graph_path / "journals"
87
+
88
+ if pages_dir.is_dir():
89
+ md_files.extend(sorted(pages_dir.rglob("*.md")))
90
+ if journals_dir.is_dir():
91
+ md_files.extend(sorted(journals_dir.rglob("*.md")))
92
+
93
+ logger.info("Found %d markdown files in graph %s", len(md_files), graph_path)
94
+
95
+ for md_file in md_files:
96
+ page_name = md_file.stem
97
+ try:
98
+ parsed = parse_page(md_file)
99
+ except Exception:
100
+ logger.warning("Failed to parse page %s", md_file)
101
+ continue
102
+
103
+ notes.append(
104
+ {
105
+ "name": page_name,
106
+ "tags": parsed["tags"],
107
+ "frontmatter": parsed["properties"],
108
+ "text": parsed["body"],
109
+ }
110
+ )
111
+
112
+ for linked_page in parsed["links"]:
113
+ links.append((page_name, linked_page))
114
+
115
+ logger.info(
116
+ "Ingested %d notes with %d links from graph %s",
117
+ len(notes),
118
+ len(links),
119
+ graph_path,
120
+ )
121
+ return {"notes": notes, "links": links}
122
+
123
+
124
+class LogseqSource(BaseSource):
125
+ """Source connector for Logseq graphs."""
126
+
127
+ def __init__(self, graph_path: str) -> None:
128
+ self.graph_path = Path(graph_path)
129
+
130
+ def authenticate(self) -> bool:
131
+ """Check that the graph path exists and has pages/ or journals/ dirs."""
132
+ if not self.graph_path.is_dir():
133
+ logger.error("Graph path does not exist: %s", self.graph_path)
134
+ return False
135
+ has_pages = (self.graph_path / "pages").is_dir()
136
+ has_journals = (self.graph_path / "journals").is_dir()
137
+ if not has_pages and not has_journals:
138
+ logger.error(
139
+ "No pages/ or journals/ directory found in graph: %s",
140
+ self.graph_path,
141
+ )
142
+ return False
143
+ logger.info(
144
+ "Logseq graph authenticated: %s (pages=%s, journals=%s)",
145
+ self.graph_path,
146
+ has_pages,
147
+ has_journals,
148
+ )
149
+ return True
150
+
151
+ def list_videos(
152
+ self,
153
+ folder_id: Optional[str] = None,
154
+ folder_path: Optional[str] = None,
155
+ patterns: Optional[List[str]] = None,
156
+ ) -> List[SourceFile]:
157
+ """List .md files in pages/ and journals/ as SourceFile objects."""
158
+ md_files: List[Path] = []
159
+
160
+ pages_dir = self.graph_path / "pages"
161
+ journals_dir = self.graph_path / "journals"
162
+
163
+ if folder_path:
164
+ search_root = self.graph_path / folder_path
165
+ if search_root.is_dir():
166
+ md_files.extend(sorted(search_root.rglob("*.md")))
167
+ else:
168
+ if pages_dir.is_dir():
169
+ md_files.extend(sorted(pages_dir.rglob("*.md")))
170
+ if journals_dir.is_dir():
171
+ md_files.extend(sorted(journals_dir.rglob("*.md")))
172
+
173
+ results: List[SourceFile] = []
174
+ for md_file in md_files:
175
+ relative = md_file.relative_to(self.graph_path)
176
+ stat = md_file.stat()
177
+ modified_dt = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc)
178
+
179
+ results.append(
180
+ SourceFile(
181
+ name=md_file.name,
182
+ id=str(relative),
183
+ size_bytes=stat.st_size,
184
+ mime_type="text/markdown",
185
+ modified_at=modified_dt.isoformat(),
186
+ path=str(relative),
187
+ )
188
+ )
189
+
190
+ logger.info("Listed %d files from graph %s", len(results), self.graph_path)
191
+ return results
192
+
193
+ def download(self, file: SourceFile, destination: Path) -> Path:
194
+ """Copy a graph file to the destination path."""
195
+ source = self.graph_path / file.id
196
+ destination = Path(destination)
197
+ destination.parent.mkdir(parents=True, exist_ok=True)
198
+ shutil.copy2(source, destination)
199
+ logger.info("Copied %s -> %s", source, destination)
200
+ return destination
--- a/video_processor/sources/logseq_source.py
+++ b/video_processor/sources/logseq_source.py
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/logseq_source.py
+++ b/video_processor/sources/logseq_source.py
@@ -0,0 +1,200 @@
1 """Logseq graph source connector for ingesting markdown pages and journals."""
2
3 import logging
4 import re
5 import shutil
6 from datetime import datetime, timezone
7 from pathlib import Path
8 from typing import List, Optional, Tuple
9
10 from video_processor.sources.base import BaseSource, SourceFile
11
12 logger = logging.getLogger(__name__)
13
14
def parse_page(path: Path) -> dict:
    """Parse a Logseq markdown page and extract structured content.

    Returns a dict with:
        - properties: dict of page-level properties (``key:: value`` lines at top)
        - links: list of linked page names from [[wiki-links]]
        - tags: list of tags from #tag and #[[tag]] occurrences
        - block_refs: list of block reference IDs from ((block-id))
        - body: text content after the property header
    """
    raw = path.read_text(encoding="utf-8")
    all_lines = raw.split("\n")

    # Page properties are consecutive "key:: value" lines at the very top;
    # the first non-property line ends the header.
    props: dict = {}
    content_start = 0
    prop_re = re.compile(r"^([A-Za-z][A-Za-z0-9_-]*)::\ ?(.*)")
    for idx, ln in enumerate(all_lines):
        m = prop_re.match(ln)
        if m is None:
            break
        props[m.group(1)] = m.group(2).strip()
        content_start = idx + 1

    content = "\n".join(all_lines[content_start:])

    # Wiki-links: [[page]] in the body, plus any inside property values.
    wiki_re = re.compile(r"\[\[([^\]]+)\]\]")
    page_links = wiki_re.findall(content)
    for val in props.values():
        page_links.extend(wiki_re.findall(str(val)))

    # Tags: #[[multi word]] first, then plain #tag with the bracket form
    # stripped out so nothing is counted twice.
    bracket_re = re.compile(r"#\[\[([^\]]+)\]\]")
    found_tags = bracket_re.findall(raw)
    without_brackets = bracket_re.sub("", raw)
    found_tags.extend(re.findall(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)", without_brackets))

    # Block references: ((block-id))
    refs = re.findall(r"\(\(([a-f0-9-]+)\)\)", raw)

    return {
        "properties": props,
        "links": page_links,
        "tags": found_tags,
        "block_refs": refs,
        "body": content,
    }
71
72
def ingest_graph(graph_path: Path) -> dict:
    """Ingest an entire Logseq graph and return structured data.

    Returns a dict with:
        - notes: list of dicts with name, tags, frontmatter (properties), text
        - links: list of (source, target) tuples from wiki-links
    """
    root = Path(graph_path)
    collected: List[dict] = []
    edges: List[Tuple[str, str]] = []

    # Gather pages first, then journals, each in sorted order.
    candidates: List[Path] = []
    for subdir in ("pages", "journals"):
        folder = root / subdir
        if folder.is_dir():
            candidates.extend(sorted(folder.rglob("*.md")))

    logger.info("Found %d markdown files in graph %s", len(candidates), root)

    for page_file in candidates:
        name = page_file.stem
        try:
            parsed = parse_page(page_file)
        except Exception:
            # Best-effort ingestion: skip unreadable/undecodable pages.
            logger.warning("Failed to parse page %s", page_file)
            continue

        collected.append(
            {
                "name": name,
                "tags": parsed["tags"],
                "frontmatter": parsed["properties"],
                "text": parsed["body"],
            }
        )
        edges.extend((name, target) for target in parsed["links"])

    logger.info(
        "Ingested %d notes with %d links from graph %s",
        len(collected),
        len(edges),
        root,
    )
    return {"notes": collected, "links": edges}
122
123
class LogseqSource(BaseSource):
    """Source connector for Logseq graphs.

    Exposes the graph's markdown pages and journals through the common
    BaseSource interface; each page is treated as a downloadable file.
    """

    def __init__(self, graph_path: str) -> None:
        # Root directory of the Logseq graph (contains pages/, journals/).
        self.graph_path = Path(graph_path)

    def authenticate(self) -> bool:
        """Check that the graph path exists and has pages/ or journals/ dirs."""
        if not self.graph_path.is_dir():
            logger.error("Graph path does not exist: %s", self.graph_path)
            return False
        has_pages = (self.graph_path / "pages").is_dir()
        has_journals = (self.graph_path / "journals").is_dir()
        if not has_pages and not has_journals:
            logger.error(
                "No pages/ or journals/ directory found in graph: %s",
                self.graph_path,
            )
            return False
        logger.info(
            "Logseq graph authenticated: %s (pages=%s, journals=%s)",
            self.graph_path,
            has_pages,
            has_journals,
        )
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List .md files in pages/ and journals/ as SourceFile objects.

        Args:
            folder_id: Unused; present for BaseSource interface parity.
            folder_path: Optional subdirectory (relative to the graph root)
                to search instead of pages/ and journals/.
            patterns: Optional glob patterns (e.g. ``["2024*.md"]``); when
                given, only files matching at least one pattern are kept.

        Returns:
            One SourceFile per markdown file, id/path relative to the root.
        """
        md_files: List[Path] = []

        if folder_path:
            search_root = self.graph_path / folder_path
            if search_root.is_dir():
                md_files.extend(sorted(search_root.rglob("*.md")))
        else:
            for subdir in ("pages", "journals"):
                candidate = self.graph_path / subdir
                if candidate.is_dir():
                    md_files.extend(sorted(candidate.rglob("*.md")))

        results: List[SourceFile] = []
        for md_file in md_files:
            # Fix: honor the `patterns` parameter, which was previously
            # accepted but silently ignored. Uses pathlib's right-anchored
            # glob matching against the file path.
            if patterns and not any(md_file.match(pat) for pat in patterns):
                continue

            relative = md_file.relative_to(self.graph_path)
            stat = md_file.stat()
            # Timezone-aware mtime so downstream comparisons are unambiguous.
            modified_dt = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc)

            results.append(
                SourceFile(
                    name=md_file.name,
                    id=str(relative),
                    size_bytes=stat.st_size,
                    mime_type="text/markdown",
                    modified_at=modified_dt.isoformat(),
                    path=str(relative),
                )
            )

        logger.info("Listed %d files from graph %s", len(results), self.graph_path)
        return results

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Copy a graph file to the destination path and return it."""
        source = self.graph_path / file.id
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        # copy2 preserves metadata (mtime), keeping modified_at meaningful.
        shutil.copy2(source, destination)
        logger.info("Copied %s -> %s", source, destination)
        return destination
--- a/video_processor/sources/notion_source.py
+++ b/video_processor/sources/notion_source.py
@@ -0,0 +1,380 @@
1
+"""Notion API source connector for fetching pages and databases."""
2
+
3
+import logging
4
+import os
5
+from pathlib import Path
6
+from typing import Dict, List, Optional
7
+
8
+import requests
9
+
10
+from video_processor.sources.base import BaseSource, SourceFile
11
+
12
+logger = logging.getLogger(__name__)
13
+
14
+NOTION_VERSION = "2022-06-28"
15
+NOTION_BASE_URL = "https://api.notion.com/v1"
16
+
17
+
18
class NotionSource(BaseSource):
    """
    Fetch pages and databases from Notion via the public API.

    Requires a Notion integration token (internal integration).
    Set NOTION_API_KEY env var or pass token directly.

    Requires: pip install requests
    """

    def __init__(
        self,
        token: Optional[str] = None,
        database_id: Optional[str] = None,
        page_ids: Optional[List[str]] = None,
    ):
        # Token resolution: explicit argument wins, then NOTION_API_KEY.
        self.token = token or os.environ.get("NOTION_API_KEY", "")
        self.database_id = database_id
        self.page_ids = page_ids or []

    def _headers(self) -> Dict[str, str]:
        """Headers required on every Notion API request."""
        return {
            "Authorization": f"Bearer {self.token}",
            "Notion-Version": NOTION_VERSION,
            "Content-Type": "application/json",
        }

    def authenticate(self) -> bool:
        """Check token is set and make a test call to the Notion API.

        Returns:
            True if the token is accepted by the /users/me endpoint.
        """
        if not self.token:
            logger.error("Notion token not set. Provide token or set NOTION_API_KEY.")
            return False
        try:
            resp = requests.get(
                f"{NOTION_BASE_URL}/users/me",
                headers=self._headers(),
                timeout=15,
            )
            resp.raise_for_status()
            user = resp.json()
            logger.info("Authenticated with Notion as %s", user.get("name", "unknown"))
            return True
        except requests.RequestException as exc:
            logger.error("Notion authentication failed: %s", exc)
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Notion pages as SourceFiles.

        If database_id is set, query the database for pages.
        If page_ids is set, fetch each page individually.

        folder_id/folder_path/patterns are unused; they exist for
        BaseSource interface parity.
        """
        files: List[SourceFile] = []

        if self.database_id:
            files.extend(self._list_from_database(self.database_id))

        if self.page_ids:
            files.extend(self._list_from_pages(self.page_ids))

        if not files:
            logger.warning("No pages found. Set database_id or page_ids.")

        return files

    def _list_from_database(self, database_id: str) -> List[SourceFile]:
        """Query a Notion database and return SourceFiles for each row.

        Follows cursor-based pagination until has_more is False.

        Raises:
            requests.HTTPError: On any non-2xx API response.
        """
        files: List[SourceFile] = []
        has_more = True
        start_cursor: Optional[str] = None

        while has_more:
            body: Dict = {}
            if start_cursor:
                body["start_cursor"] = start_cursor

            resp = requests.post(
                f"{NOTION_BASE_URL}/databases/{database_id}/query",
                headers=self._headers(),
                json=body,
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()

            for page in data.get("results", []):
                title = _extract_page_title(page)
                files.append(
                    SourceFile(
                        name=title,
                        id=page["id"],
                        mime_type="text/markdown",
                        modified_at=page.get("last_edited_time"),
                    )
                )

            has_more = data.get("has_more", False)
            start_cursor = data.get("next_cursor")

        return files

    def _list_from_pages(self, page_ids: List[str]) -> List[SourceFile]:
        """Fetch individual pages by ID and return SourceFiles.

        Pages that fail to fetch are logged and skipped (best-effort).
        """
        files: List[SourceFile] = []
        for page_id in page_ids:
            try:
                resp = requests.get(
                    f"{NOTION_BASE_URL}/pages/{page_id}",
                    headers=self._headers(),
                    timeout=15,
                )
                resp.raise_for_status()
                page = resp.json()
                title = _extract_page_title(page)
                files.append(
                    SourceFile(
                        name=title,
                        id=page["id"],
                        mime_type="text/markdown",
                        modified_at=page.get("last_edited_time"),
                    )
                )
            except requests.RequestException as exc:
                logger.error("Failed to fetch page %s: %s", page_id, exc)
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download page blocks as markdown text and save to destination."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        blocks = self._fetch_all_blocks(file.id)
        text = self._blocks_to_text(blocks)

        # Prepend the title as a top-level heading so the file is
        # self-describing when read outside the pipeline.
        content = f"# {file.name}\n\n{text}"
        destination.write_text(content, encoding="utf-8")
        logger.info("Saved Notion page to %s", destination)
        return destination

    def _fetch_all_blocks(self, page_id: str) -> list:
        """Fetch all child blocks for a page, handling pagination.

        NOTE(review): only top-level blocks are fetched; nested children
        (blocks with has_children=True) are not recursed into — confirm
        whether nested content is needed.
        """
        blocks: list = []
        has_more = True
        start_cursor: Optional[str] = None

        while has_more:
            url = f"{NOTION_BASE_URL}/blocks/{page_id}/children?page_size=100"
            if start_cursor:
                url += f"&start_cursor={start_cursor}"

            resp = requests.get(url, headers=self._headers(), timeout=30)
            resp.raise_for_status()
            data = resp.json()

            blocks.extend(data.get("results", []))
            has_more = data.get("has_more", False)
            start_cursor = data.get("next_cursor")

        return blocks

    def _blocks_to_text(self, blocks: list) -> str:
        """Convert Notion block objects to markdown text.

        numbered_index tracks the position within a run of consecutive
        numbered_list_item blocks; any other block type resets it.
        """
        lines: List[str] = []
        numbered_index = 0

        for block in blocks:
            block_type = block.get("type", "")
            block_data = block.get(block_type, {})

            if block_type == "paragraph":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(text)
                numbered_index = 0

            elif block_type == "heading_1":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"# {text}")
                numbered_index = 0

            elif block_type == "heading_2":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"## {text}")
                numbered_index = 0

            elif block_type == "heading_3":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"### {text}")
                numbered_index = 0

            elif block_type == "bulleted_list_item":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"- {text}")
                numbered_index = 0

            elif block_type == "numbered_list_item":
                numbered_index += 1
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"{numbered_index}. {text}")

            elif block_type == "to_do":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                checked = block_data.get("checked", False)
                marker = "[x]" if checked else "[ ]"
                lines.append(f"- {marker} {text}")
                numbered_index = 0

            elif block_type == "code":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                language = block_data.get("language", "")
                lines.append(f"```{language}")
                lines.append(text)
                lines.append("```")
                numbered_index = 0

            elif block_type == "quote":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"> {text}")
                numbered_index = 0

            elif block_type == "callout":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                icon = block_data.get("icon", {})
                emoji = icon.get("emoji", "") if icon else ""
                prefix = f"{emoji} " if emoji else ""
                lines.append(f"> {prefix}{text}")
                numbered_index = 0

            elif block_type == "toggle":
                text = _rich_text_to_str(block_data.get("rich_text", []))
                lines.append(f"<details><summary>{text}</summary></details>")
                numbered_index = 0

            elif block_type == "divider":
                lines.append("---")
                numbered_index = 0

            else:
                # Unsupported block type — try to extract any rich_text
                text = _rich_text_to_str(block_data.get("rich_text", []))
                if text:
                    lines.append(text)
                numbered_index = 0

        return "\n\n".join(lines)

    def fetch_database_as_table(self, database_id: str) -> str:
        """Fetch a Notion database and return its rows as CSV text.

        Each row is a page in the database. Columns are derived from the
        database properties (sorted by name).

        Fix: values are now quoted per RFC 4180 via the csv module, so
        embedded commas, quotes, or newlines no longer corrupt the row
        structure (previously rows were built with a naive join).
        """
        import csv
        import io

        # First, get database schema for column order
        resp = requests.get(
            f"{NOTION_BASE_URL}/databases/{database_id}",
            headers=self._headers(),
            timeout=15,
        )
        resp.raise_for_status()
        db_meta = resp.json()
        properties = db_meta.get("properties", {})
        columns = sorted(properties.keys())

        # Query all rows, following pagination.
        rows: List[Dict] = []
        has_more = True
        start_cursor: Optional[str] = None

        while has_more:
            body: Dict = {}
            if start_cursor:
                body["start_cursor"] = start_cursor

            resp = requests.post(
                f"{NOTION_BASE_URL}/databases/{database_id}/query",
                headers=self._headers(),
                json=body,
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()
            rows.extend(data.get("results", []))
            has_more = data.get("has_more", False)
            start_cursor = data.get("next_cursor")

        # Build properly escaped CSV output.
        buf = io.StringIO()
        writer = csv.writer(buf, lineterminator="\n")
        writer.writerow(columns)

        for row in rows:
            row_props = row.get("properties", {})
            writer.writerow(
                [_extract_property_value(row_props.get(col, {})) for col in columns]
            )

        # Strip the trailing newline to match the previous join-based output.
        return buf.getvalue().rstrip("\n")
321
+
322
+
323
+def _rich_text_to_str(rich_text: list) -> str:
324
+ """Extract plain text from a Notion rich_text array."""
325
+ return "".join(item.get("plain_text", "") for item in rich_text)
326
+
327
+
328
+def _extract_page_title(page: dict) -> str:
329
+ """Extract the title from a Notion page object."""
330
+ properties = page.get("properties", {})
331
+ for prop in properties.values():
332
+ if prop.get("type") == "title":
333
+ return _rich_text_to_str(prop.get("title", []))
334
+ return "Untitled"
335
+
336
+
337
+def _extract_property_value(prop: dict) -> str:
338
+ """Extract a display string from a Notion property value."""
339
+ prop_type = prop.get("type", "")
340
+
341
+ if prop_type == "title":
342
+ return _rich_text_to_str(prop.get("title", []))
343
+ elif prop_type == "rich_text":
344
+ return _rich_text_to_str(prop.get("rich_text", []))
345
+ elif prop_type == "number":
346
+ val = prop.get("number")
347
+ return str(val) if val is not None else ""
348
+ elif prop_type == "select":
349
+ sel = prop.get("select")
350
+ return sel.get("name", "") if sel else ""
351
+ elif prop_type == "multi_select":
352
+ return "; ".join(s.get("name", "") for s in prop.get("multi_select", []))
353
+ elif prop_type == "date":
354
+ date = prop.get("date")
355
+ if date:
356
+ start = date.get("start", "")
357
+ end = date.get("end", "")
358
+ return f"{start} - {end}" if end else start
359
+ return ""
360
+ elif prop_type == "checkbox":
361
+ return str(prop.get("checkbox", False))
362
+ elif prop_type == "url":
363
+ return prop.get("url", "") or ""
364
+ elif prop_type == "email":
365
+ return prop.get("email", "") or ""
366
+ elif prop_type == "phone_number":
367
+ return prop.get("phone_number", "") or ""
368
+ elif prop_type == "status":
369
+ status = prop.get("status")
370
+ return status.get("name", "") if status else ""
371
+ elif prop_type == "people":
372
+ return "; ".join(p.get("name", "") for p in prop.get("people", []))
373
+ elif prop_type == "relation":
374
+ return "; ".join(r.get("id", "") for r in prop.get("relation", []))
375
+ elif prop_type == "formula":
376
+ formula = prop.get("formula", {})
377
+ f_type = formula.get("type", "")
378
+ return str(formula.get(f_type, ""))
379
+ else:
380
+ return ""
--- a/video_processor/sources/notion_source.py
+++ b/video_processor/sources/notion_source.py
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/notion_source.py
+++ b/video_processor/sources/notion_source.py
@@ -0,0 +1,380 @@
1 """Notion API source connector for fetching pages and databases."""
2
3 import logging
4 import os
5 from pathlib import Path
6 from typing import Dict, List, Optional
7
8 import requests
9
10 from video_processor.sources.base import BaseSource, SourceFile
11
12 logger = logging.getLogger(__name__)
13
14 NOTION_VERSION = "2022-06-28"
15 NOTION_BASE_URL = "https://api.notion.com/v1"
16
17
class NotionSource(BaseSource):
    """
    Fetch pages and databases from Notion via the public API.

    Requires a Notion integration token (internal integration).
    Set NOTION_API_KEY env var or pass token directly.

    Requires: pip install requests
    """

    def __init__(
        self,
        token: Optional[str] = None,
        database_id: Optional[str] = None,
        page_ids: Optional[List[str]] = None,
    ):
        # Token resolution: explicit argument wins, else NOTION_API_KEY.
        self.token = token or os.environ.get("NOTION_API_KEY", "")
        self.database_id = database_id
        self.page_ids = page_ids or []

    def _headers(self) -> Dict[str, str]:
        """Headers required on every Notion API request."""
        return {
            "Authorization": f"Bearer {self.token}",
            "Notion-Version": NOTION_VERSION,
            "Content-Type": "application/json",
        }

    def authenticate(self) -> bool:
        """Check token is set and make a test call to the Notion API."""
        if not self.token:
            logger.error("Notion token not set. Provide token or set NOTION_API_KEY.")
            return False
        try:
            resp = requests.get(
                f"{NOTION_BASE_URL}/users/me",
                headers=self._headers(),
                timeout=15,
            )
            resp.raise_for_status()
            user = resp.json()
            logger.info("Authenticated with Notion as %s", user.get("name", "unknown"))
            return True
        except requests.RequestException as exc:
            logger.error("Notion authentication failed: %s", exc)
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Notion pages as SourceFiles.

        If database_id is set, query the database for pages.
        If page_ids is set, fetch each page individually.
        folder_id/folder_path/patterns are unused; kept for interface parity.
        """
        files: List[SourceFile] = []

        if self.database_id:
            files.extend(self._list_from_database(self.database_id))

        if self.page_ids:
            files.extend(self._list_from_pages(self.page_ids))

        if not files:
            logger.warning("No pages found. Set database_id or page_ids.")

        return files

    def _query_database_rows(self, database_id: str) -> List[Dict]:
        """Return every row (page object) of a database, following pagination.

        Shared by _list_from_database and fetch_database_as_table, which
        previously duplicated this pagination loop.
        """
        rows: List[Dict] = []
        start_cursor: Optional[str] = None

        while True:
            body: Dict = {}
            if start_cursor:
                body["start_cursor"] = start_cursor

            resp = requests.post(
                f"{NOTION_BASE_URL}/databases/{database_id}/query",
                headers=self._headers(),
                json=body,
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()

            rows.extend(data.get("results", []))
            if not data.get("has_more", False):
                break
            start_cursor = data.get("next_cursor")

        return rows

    def _list_from_database(self, database_id: str) -> List[SourceFile]:
        """Query a Notion database and return SourceFiles for each row."""
        return [
            SourceFile(
                name=_extract_page_title(page),
                id=page["id"],
                mime_type="text/markdown",
                modified_at=page.get("last_edited_time"),
            )
            for page in self._query_database_rows(database_id)
        ]

    def _list_from_pages(self, page_ids: List[str]) -> List[SourceFile]:
        """Fetch individual pages by ID and return SourceFiles.

        Best-effort: a page that fails to fetch is logged and skipped so
        the remaining pages are still returned.
        """
        files: List[SourceFile] = []
        for page_id in page_ids:
            try:
                resp = requests.get(
                    f"{NOTION_BASE_URL}/pages/{page_id}",
                    headers=self._headers(),
                    timeout=15,
                )
                resp.raise_for_status()
                page = resp.json()
                files.append(
                    SourceFile(
                        name=_extract_page_title(page),
                        id=page["id"],
                        mime_type="text/markdown",
                        modified_at=page.get("last_edited_time"),
                    )
                )
            except requests.RequestException as exc:
                logger.error("Failed to fetch page %s: %s", page_id, exc)
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download page blocks as markdown text and save to destination."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        blocks = self._fetch_all_blocks(file.id)
        text = self._blocks_to_text(blocks)

        # Prepend the title as an H1 so the saved file is self-describing.
        destination.write_text(f"# {file.name}\n\n{text}", encoding="utf-8")
        logger.info("Saved Notion page to %s", destination)
        return destination

    def _fetch_all_blocks(self, page_id: str) -> list:
        """Fetch all child blocks for a page, handling pagination."""
        blocks: list = []
        start_cursor: Optional[str] = None

        while True:
            # BUGFIX: pass the cursor via `params` so requests URL-encodes
            # it, instead of interpolating it into the query string by hand.
            params: Dict[str, str] = {"page_size": "100"}
            if start_cursor:
                params["start_cursor"] = start_cursor

            resp = requests.get(
                f"{NOTION_BASE_URL}/blocks/{page_id}/children",
                headers=self._headers(),
                params=params,
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()

            blocks.extend(data.get("results", []))
            if not data.get("has_more", False):
                break
            start_cursor = data.get("next_cursor")

        return blocks

    def _blocks_to_text(self, blocks: list) -> str:
        """Convert Notion block objects to markdown text.

        Consecutive numbered_list_item blocks are numbered 1., 2., ...;
        any other block type resets the counter.
        """
        lines: List[str] = []
        numbered_index = 0
        # Block types that map to a simple markdown line prefix.
        simple_prefixes = {
            "paragraph": "",
            "heading_1": "# ",
            "heading_2": "## ",
            "heading_3": "### ",
            "bulleted_list_item": "- ",
            "quote": "> ",
        }

        for block in blocks:
            block_type = block.get("type", "")
            block_data = block.get(block_type, {})
            text = _rich_text_to_str(block_data.get("rich_text", []))

            if block_type == "numbered_list_item":
                numbered_index += 1
                lines.append(f"{numbered_index}. {text}")
                continue
            # Any non-numbered block interrupts a numbered list.
            numbered_index = 0

            if block_type in simple_prefixes:
                lines.append(f"{simple_prefixes[block_type]}{text}")
            elif block_type == "to_do":
                marker = "[x]" if block_data.get("checked", False) else "[ ]"
                lines.append(f"- {marker} {text}")
            elif block_type == "code":
                language = block_data.get("language", "")
                lines.append(f"```{language}")
                lines.append(text)
                lines.append("```")
            elif block_type == "callout":
                icon = block_data.get("icon", {})
                emoji = icon.get("emoji", "") if icon else ""
                prefix = f"{emoji} " if emoji else ""
                lines.append(f"> {prefix}{text}")
            elif block_type == "toggle":
                lines.append(f"<details><summary>{text}</summary></details>")
            elif block_type == "divider":
                lines.append("---")
            else:
                # Unsupported block type — keep any extractable rich_text.
                if text:
                    lines.append(text)

        return "\n\n".join(lines)

    def fetch_database_as_table(self, database_id: str) -> str:
        """Fetch a Notion database and return its rows as CSV text.

        Each row is a page in the database. Columns are derived from
        the database properties, sorted by name.
        """
        # Local import: only this method needs CSV machinery.
        import csv
        import io

        # First, get database schema for the column set.
        resp = requests.get(
            f"{NOTION_BASE_URL}/databases/{database_id}",
            headers=self._headers(),
            timeout=15,
        )
        resp.raise_for_status()
        columns = sorted(resp.json().get("properties", {}).keys())

        rows = self._query_database_rows(database_id)

        # BUGFIX: use the csv module so values containing commas, quotes or
        # newlines are quoted correctly instead of corrupting the output.
        buffer = io.StringIO()
        writer = csv.writer(buffer, lineterminator="\n")
        writer.writerow(columns)
        for row in rows:
            row_props = row.get("properties", {})
            writer.writerow(
                [_extract_property_value(row_props.get(col, {})) for col in columns]
            )
        return buffer.getvalue().rstrip("\n")
322
323 def _rich_text_to_str(rich_text: list) -> str:
324 """Extract plain text from a Notion rich_text array."""
325 return "".join(item.get("plain_text", "") for item in rich_text)
326
327
328 def _extract_page_title(page: dict) -> str:
329 """Extract the title from a Notion page object."""
330 properties = page.get("properties", {})
331 for prop in properties.values():
332 if prop.get("type") == "title":
333 return _rich_text_to_str(prop.get("title", []))
334 return "Untitled"
335
336
337 def _extract_property_value(prop: dict) -> str:
338 """Extract a display string from a Notion property value."""
339 prop_type = prop.get("type", "")
340
341 if prop_type == "title":
342 return _rich_text_to_str(prop.get("title", []))
343 elif prop_type == "rich_text":
344 return _rich_text_to_str(prop.get("rich_text", []))
345 elif prop_type == "number":
346 val = prop.get("number")
347 return str(val) if val is not None else ""
348 elif prop_type == "select":
349 sel = prop.get("select")
350 return sel.get("name", "") if sel else ""
351 elif prop_type == "multi_select":
352 return "; ".join(s.get("name", "") for s in prop.get("multi_select", []))
353 elif prop_type == "date":
354 date = prop.get("date")
355 if date:
356 start = date.get("start", "")
357 end = date.get("end", "")
358 return f"{start} - {end}" if end else start
359 return ""
360 elif prop_type == "checkbox":
361 return str(prop.get("checkbox", False))
362 elif prop_type == "url":
363 return prop.get("url", "") or ""
364 elif prop_type == "email":
365 return prop.get("email", "") or ""
366 elif prop_type == "phone_number":
367 return prop.get("phone_number", "") or ""
368 elif prop_type == "status":
369 status = prop.get("status")
370 return status.get("name", "") if status else ""
371 elif prop_type == "people":
372 return "; ".join(p.get("name", "") for p in prop.get("people", []))
373 elif prop_type == "relation":
374 return "; ".join(r.get("id", "") for r in prop.get("relation", []))
375 elif prop_type == "formula":
376 formula = prop.get("formula", {})
377 f_type = formula.get("type", "")
378 return str(formula.get(f_type, ""))
379 else:
380 return ""
--- a/video_processor/sources/obsidian_source.py
+++ b/video_processor/sources/obsidian_source.py
@@ -0,0 +1,178 @@
1
+"""Obsidian vault source connector for ingesting markdown notes."""
2
+
3
+import logging
4
+import re
5
+import shutil
6
+from datetime import datetime, timezone
7
+from pathlib import Path
8
+from typing import List, Optional, Tuple
9
+
10
+from video_processor.sources.base import BaseSource, SourceFile
11
+
12
+logger = logging.getLogger(__name__)
13
+
14
+
15
+def parse_note(path: Path) -> dict:
16
+ """Parse an Obsidian markdown note and extract structured content.
17
+
18
+ Returns a dict with:
19
+ - frontmatter: dict of YAML frontmatter metadata
20
+ - links: list of linked page names from [[wiki-links]]
21
+ - tags: list of tags from #tag occurrences
22
+ - headings: list of dicts with level and text
23
+ - body: markdown text without frontmatter
24
+ """
25
+ text = path.read_text(encoding="utf-8")
26
+
27
+ # Extract YAML frontmatter (simple key: value parser, stdlib only)
28
+ frontmatter: dict = {}
29
+ body = text
30
+ fm_match = re.match(r"\A---\n(.*?\n)---\n?(.*)", text, re.DOTALL)
31
+ if fm_match:
32
+ fm_text = fm_match.group(1)
33
+ for line in fm_text.strip().splitlines():
34
+ kv = re.match(r"^([A-Za-z_][A-Za-z0-9_ -]*):\s*(.*)", line)
35
+ if kv:
36
+ key = kv.group(1).strip()
37
+ value = kv.group(2).strip()
38
+ # Strip surrounding quotes
39
+ if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
40
+ value = value[1:-1]
41
+ # Handle YAML-style lists on a single line [a, b, c]
42
+ list_match = re.match(r"^\[(.+)\]$", value)
43
+ if list_match:
44
+ value = [v.strip().strip("\"'") for v in list_match.group(1).split(",")]
45
+ frontmatter[key] = value
46
+ body = fm_match.group(2)
47
+
48
+ # Extract wiki-links: [[page]] and [[page|alias]]
49
+ link_pattern = re.compile(r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]")
50
+ links = link_pattern.findall(body)
51
+
52
+ # Extract tags: #tag (but not inside code blocks or frontmatter)
53
+ # Match #tag but not #[[tag]] (that's Logseq style) and not ## headings
54
+ tag_pattern = re.compile(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)")
55
+ tags = tag_pattern.findall(body)
56
+
57
+ # Extract headings hierarchy
58
+ heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
59
+ headings = [
60
+ {"level": len(m.group(1)), "text": m.group(2).strip()}
61
+ for m in heading_pattern.finditer(body)
62
+ ]
63
+
64
+ return {
65
+ "frontmatter": frontmatter,
66
+ "links": links,
67
+ "tags": tags,
68
+ "headings": headings,
69
+ "body": body,
70
+ }
71
+
72
+
73
+def ingest_vault(vault_path: Path) -> dict:
74
+ """Ingest an entire Obsidian vault and return structured data.
75
+
76
+ Returns a dict with:
77
+ - notes: list of dicts with name, tags, frontmatter, text
78
+ - links: list of (source, target) tuples from wiki-links
79
+ """
80
+ vault_path = Path(vault_path)
81
+ notes: List[dict] = []
82
+ links: List[Tuple[str, str]] = []
83
+
84
+ md_files = sorted(vault_path.rglob("*.md"))
85
+ logger.info("Found %d markdown files in vault %s", len(md_files), vault_path)
86
+
87
+ for md_file in md_files:
88
+ note_name = md_file.stem
89
+ try:
90
+ parsed = parse_note(md_file)
91
+ except Exception:
92
+ logger.warning("Failed to parse note %s", md_file)
93
+ continue
94
+
95
+ notes.append(
96
+ {
97
+ "name": note_name,
98
+ "tags": parsed["tags"],
99
+ "frontmatter": parsed["frontmatter"],
100
+ "text": parsed["body"],
101
+ }
102
+ )
103
+
104
+ for linked_page in parsed["links"]:
105
+ links.append((note_name, linked_page))
106
+
107
+ logger.info(
108
+ "Ingested %d notes with %d links from vault %s",
109
+ len(notes),
110
+ len(links),
111
+ vault_path,
112
+ )
113
+ return {"notes": notes, "links": links}
114
+
115
+
116
+class ObsidianSource(BaseSource):
117
+ """Source connector for Obsidian vaults."""
118
+
119
+ def __init__(self, vault_path: str) -> None:
120
+ self.vault_path = Path(vault_path)
121
+
122
+ def authenticate(self) -> bool:
123
+ """Check that the vault path exists and contains .md files."""
124
+ if not self.vault_path.is_dir():
125
+ logger.error("Vault path does not exist: %s", self.vault_path)
126
+ return False
127
+ md_files = list(self.vault_path.rglob("*.md"))
128
+ if not md_files:
129
+ logger.error("No markdown files found in vault: %s", self.vault_path)
130
+ return False
131
+ logger.info(
132
+ "Obsidian vault authenticated: %s (%d .md files)",
133
+ self.vault_path,
134
+ len(md_files),
135
+ )
136
+ return True
137
+
138
+ def list_videos(
139
+ self,
140
+ folder_id: Optional[str] = None,
141
+ folder_path: Optional[str] = None,
142
+ patterns: Optional[List[str]] = None,
143
+ ) -> List[SourceFile]:
144
+ """List all .md files in the vault recursively as SourceFile objects."""
145
+ search_root = self.vault_path
146
+ if folder_path:
147
+ search_root = self.vault_path / folder_path
148
+
149
+ md_files = sorted(search_root.rglob("*.md"))
150
+ results: List[SourceFile] = []
151
+
152
+ for md_file in md_files:
153
+ relative = md_file.relative_to(self.vault_path)
154
+ stat = md_file.stat()
155
+ modified_dt = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc)
156
+
157
+ results.append(
158
+ SourceFile(
159
+ name=md_file.name,
160
+ id=str(relative),
161
+ size_bytes=stat.st_size,
162
+ mime_type="text/markdown",
163
+ modified_at=modified_dt.isoformat(),
164
+ path=str(relative),
165
+ )
166
+ )
167
+
168
+ logger.info("Listed %d files from vault %s", len(results), self.vault_path)
169
+ return results
170
+
171
+ def download(self, file: SourceFile, destination: Path) -> Path:
172
+ """Copy a vault file to the destination path."""
173
+ source = self.vault_path / file.id
174
+ destination = Path(destination)
175
+ destination.parent.mkdir(parents=True, exist_ok=True)
176
+ shutil.copy2(source, destination)
177
+ logger.info("Copied %s -> %s", source, destination)
178
+ return destination
--- a/video_processor/sources/obsidian_source.py
+++ b/video_processor/sources/obsidian_source.py
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/obsidian_source.py
+++ b/video_processor/sources/obsidian_source.py
@@ -0,0 +1,178 @@
1 """Obsidian vault source connector for ingesting markdown notes."""
2
3 import logging
4 import re
5 import shutil
6 from datetime import datetime, timezone
7 from pathlib import Path
8 from typing import List, Optional, Tuple
9
10 from video_processor.sources.base import BaseSource, SourceFile
11
12 logger = logging.getLogger(__name__)
13
14
def parse_note(path: Path) -> dict:
    """Parse an Obsidian markdown note and extract structured content.

    Returns a dict with:
    - frontmatter: dict of YAML frontmatter metadata
    - links: list of linked page names from [[wiki-links]]
    - tags: list of tags from #tag occurrences
    - headings: list of dicts with level and text
    - body: markdown text without frontmatter
    """
    raw = path.read_text(encoding="utf-8")

    # Minimal YAML frontmatter handling (key: value pairs, stdlib only).
    frontmatter: dict = {}
    body = raw
    fm = re.match(r"\A---\n(.*?\n)---\n?(.*)", raw, re.DOTALL)
    if fm:
        body = fm.group(2)
        for fm_line in fm.group(1).strip().splitlines():
            pair = re.match(r"^([A-Za-z_][A-Za-z0-9_ -]*):\s*(.*)", fm_line)
            if not pair:
                continue
            key = pair.group(1).strip()
            value = pair.group(2).strip()
            # Drop matching surrounding quotes.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            # Single-line YAML lists: [a, b, c] -> ["a", "b", "c"].
            as_list = re.match(r"^\[(.+)\]$", value)
            if as_list:
                value = [item.strip().strip("\"'") for item in as_list.group(1).split(",")]
            frontmatter[key] = value

    # Wiki-links: [[page]] and [[page|alias]] (alias discarded).
    links = re.findall(r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]", body)

    # Tags: #tag occurrences; the lookbehind avoids matching mid-word,
    # and requiring a leading letter skips ## headings.
    tags = re.findall(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)", body)

    # Headings with their nesting level (1-6).
    headings = [
        {"level": len(hashes), "text": title.strip()}
        for hashes, title in re.findall(r"^(#{1,6})\s+(.+)$", body, re.MULTILINE)
    ]

    return {
        "frontmatter": frontmatter,
        "links": links,
        "tags": tags,
        "headings": headings,
        "body": body,
    }
71
72
def ingest_vault(vault_path: Path) -> dict:
    """Ingest an entire Obsidian vault and return structured data.

    Returns a dict with:
    - notes: list of dicts with name, tags, frontmatter, text
    - links: list of (source, target) tuples from wiki-links
    """
    root = Path(vault_path)
    notes: List[dict] = []
    links: List[Tuple[str, str]] = []

    markdown_paths = sorted(root.rglob("*.md"))
    logger.info("Found %d markdown files in vault %s", len(markdown_paths), root)

    for note_path in markdown_paths:
        name = note_path.stem
        # Best-effort: a malformed note is logged and skipped, not fatal.
        try:
            parsed = parse_note(note_path)
        except Exception:
            logger.warning("Failed to parse note %s", note_path)
            continue

        notes.append(
            {
                "name": name,
                "tags": parsed["tags"],
                "frontmatter": parsed["frontmatter"],
                "text": parsed["body"],
            }
        )
        links.extend((name, target) for target in parsed["links"])

    logger.info(
        "Ingested %d notes with %d links from vault %s",
        len(notes),
        len(links),
        root,
    )
    return {"notes": notes, "links": links}
114
115
class ObsidianSource(BaseSource):
    """Source connector for Obsidian vaults.

    Every markdown file in the vault is a source document; file ids are
    vault-relative paths.
    """

    def __init__(self, vault_path: str) -> None:
        self.vault_path = Path(vault_path)

    def authenticate(self) -> bool:
        """Check that the vault path exists and contains .md files."""
        if not self.vault_path.is_dir():
            logger.error("Vault path does not exist: %s", self.vault_path)
            return False
        md_files = list(self.vault_path.rglob("*.md"))
        if not md_files:
            logger.error("No markdown files found in vault: %s", self.vault_path)
            return False
        logger.info(
            "Obsidian vault authenticated: %s (%d .md files)",
            self.vault_path,
            len(md_files),
        )
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List all .md files in the vault recursively as SourceFile objects.

        Args:
            folder_id: Unused; kept for interface compatibility.
            folder_path: Optional subdirectory of the vault to search.
            patterns: Optional fnmatch-style filename patterns. When given,
                only files whose names match at least one pattern are listed.
        """
        # Local import: only needed when a patterns filter is supplied.
        import fnmatch

        search_root = self.vault_path / folder_path if folder_path else self.vault_path

        results: List[SourceFile] = []
        for md_file in sorted(search_root.rglob("*.md")):
            # BUGFIX: `patterns` was accepted but silently ignored; it now
            # filters filenames (None keeps the old list-everything behavior).
            if patterns and not any(
                fnmatch.fnmatch(md_file.name, pat) for pat in patterns
            ):
                continue

            relative = md_file.relative_to(self.vault_path)
            stat = md_file.stat()
            modified_dt = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc)

            results.append(
                SourceFile(
                    name=md_file.name,
                    id=str(relative),
                    size_bytes=stat.st_size,
                    mime_type="text/markdown",
                    modified_at=modified_dt.isoformat(),
                    path=str(relative),
                )
            )

        logger.info("Listed %d files from vault %s", len(results), self.vault_path)
        return results

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Copy a vault file to the destination path."""
        source = self.vault_path / file.id
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source, destination)
        logger.info("Copied %s -> %s", source, destination)
        return destination
--- a/video_processor/sources/onenote_source.py
+++ b/video_processor/sources/onenote_source.py
@@ -0,0 +1,222 @@
1
+"""Microsoft OneNote source connector using the m365 CLI (cli-microsoft365).
2
+
3
+Fetches pages from OneNote notebooks via the `m365` CLI tool.
4
+Outputs plain text suitable for KG ingestion.
5
+
6
+Requires: npm install -g @pnp/cli-microsoft365
7
+Auth: m365 login (interactive)
8
+Docs: https://pnp.github.io/cli-microsoft365/
9
+"""
10
+
11
+import json
12
+import logging
13
+import re
14
+import shutil
15
+import subprocess
16
+from pathlib import Path
17
+from typing import Any, List, Optional
18
+
19
+from video_processor.sources.base import BaseSource, SourceFile
20
+
21
+logger = logging.getLogger(__name__)
22
+
23
+
24
+def _run_m365(args: List[str], timeout: int = 30) -> Any:
25
+ """Run an m365 CLI command and return parsed JSON output."""
26
+ cmd = ["m365"] + args + ["--output", "json"]
27
+ proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
28
+ if proc.returncode != 0:
29
+ raise RuntimeError(f"m365 {' '.join(args)} failed: {proc.stderr.strip()}")
30
+ try:
31
+ return json.loads(proc.stdout)
32
+ except json.JSONDecodeError:
33
+ return proc.stdout.strip()
34
+
35
+
36
+def _html_to_text(html: str) -> str:
37
+ """Strip HTML tags and decode entities to produce plain text.
38
+
39
+ Uses only stdlib ``re`` — no external dependencies.
40
+ """
41
+ # Remove script/style blocks entirely
42
+ text = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
43
+ # Replace <br>, <p>, <div>, <li>, <tr> with newlines for readability
44
+ text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
45
+ text = re.sub(r"</(p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE)
46
+ # Strip remaining tags
47
+ text = re.sub(r"<[^>]+>", "", text)
48
+ # Decode common HTML entities
49
+ entity_map = {
50
+ "&amp;": "&",
51
+ "&lt;": "<",
52
+ "&gt;": ">",
53
+ "&quot;": '"',
54
+ "&#39;": "'",
55
+ "&apos;": "'",
56
+ "&nbsp;": " ",
57
+ }
58
+ for entity, char in entity_map.items():
59
+ text = text.replace(entity, char)
60
+ # Decode numeric entities (&#123; and &#x1a;)
61
+ text = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), text)
62
+ text = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), text)
63
+ # Collapse excessive blank lines
64
+ text = re.sub(r"\n{3,}", "\n\n", text)
65
+ return text.strip()
66
+
67
+
68
class OneNoteSource(BaseSource):
    """
    Fetch pages from OneNote notebooks via the m365 CLI.

    Usage:
        source = OneNoteSource()  # all notebooks
        source = OneNoteSource(notebook_name="Work Notes")  # specific notebook
        source = OneNoteSource(notebook_name="Work", section_name="Meetings")
        files = source.list_videos()
        source.download_all(files, Path("./notes"))
    """

    def __init__(
        self,
        notebook_name: Optional[str] = None,
        section_name: Optional[str] = None,
    ):
        # Both filters are case-insensitive substring matches applied in
        # list_videos(); None means "no filtering".
        self.notebook_name = notebook_name
        self.section_name = section_name

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in."""
        if not shutil.which("m365"):
            logger.error(
                "m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365"
            )
            return False
        try:
            result = _run_m365(["status"], timeout=10)
            # JSON output exposes connectedAs; plain text says "Logged in".
            if isinstance(result, dict) and result.get("connectedAs"):
                return True
            if isinstance(result, str) and "Logged in" in result:
                return True
            logger.error("m365 not logged in. Run: m365 login")
            return False
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List OneNote pages across notebooks/sections. Returns SourceFile per page.

        Walks notebook -> section -> page with three m365 list calls,
        applying the optional notebook_name/section_name filters at each
        level. Listing failures at any level are logged and skipped.
        """
        files: List[SourceFile] = []

        # Step 1: List notebooks
        try:
            notebooks = _run_m365(["onenote", "notebook", "list"], timeout=60)
        except RuntimeError as e:
            # CONSISTENCY: lazy %-style logging args, matching the other
            # source connectors (defers formatting until the record is
            # actually emitted).
            logger.error("Failed to list OneNote notebooks: %s", e)
            return []

        if not isinstance(notebooks, list):
            notebooks = []

        # Filter notebooks by name if specified
        if self.notebook_name:
            needle = self.notebook_name.lower()
            notebooks = [
                nb for nb in notebooks if needle in nb.get("displayName", "").lower()
            ]

        for notebook in notebooks:
            notebook_id = notebook.get("id", "")
            notebook_name = notebook.get("displayName", "Untitled Notebook")

            # Step 2: List sections in this notebook
            try:
                sections = _run_m365(
                    ["onenote", "section", "list", "--notebookId", notebook_id],
                    timeout=60,
                )
            except RuntimeError as e:
                logger.warning(
                    "Failed to list sections for notebook '%s': %s", notebook_name, e
                )
                continue

            if not isinstance(sections, list):
                sections = []

            # Filter sections by name if specified
            if self.section_name:
                needle = self.section_name.lower()
                sections = [
                    s for s in sections if needle in s.get("displayName", "").lower()
                ]

            for section in sections:
                section_id = section.get("id", "")
                section_name = section.get("displayName", "Untitled Section")

                # Step 3: List pages in this section
                try:
                    pages = _run_m365(
                        ["onenote", "page", "list", "--sectionId", section_id],
                        timeout=60,
                    )
                except RuntimeError as e:
                    logger.warning(
                        "Failed to list pages in section '%s': %s", section_name, e
                    )
                    continue

                if not isinstance(pages, list):
                    pages = []

                for page in pages:
                    page_id = page.get("id", "")
                    title = page.get("title", "Untitled Page").strip() or "Untitled Page"
                    modified = page.get("lastModifiedDateTime")
                    # Path gives organizational context: Notebook/Section/Title.
                    page_path = f"{notebook_name}/{section_name}/{title}"

                    files.append(
                        SourceFile(
                            name=title,
                            id=str(page_id),
                            size_bytes=None,
                            mime_type="text/html",
                            modified_at=modified,
                            path=page_path,
                        )
                    )

        logger.info("Found %d page(s) in OneNote", len(files))
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a OneNote page's content as a text file."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = _run_m365(
                ["onenote", "page", "get", "--id", file.id],
                timeout=60,
            )
        except RuntimeError as e:
            raise RuntimeError(f"Failed to fetch OneNote page {file.id}: {e}") from e

        # Extract HTML content: the CLI may return either a dict carrying
        # the HTML (content / body.content) or the HTML string itself.
        if isinstance(result, dict):
            html = result.get("content", result.get("body", {}).get("content", ""))
            if not html:
                # Fallback: serialize the whole response so nothing is lost
                html = json.dumps(result, indent=2)
        elif isinstance(result, str):
            html = result
        else:
            html = str(result)

        text = _html_to_text(html)
        destination.write_text(text, encoding="utf-8")
        logger.info("Saved page '%s' to %s", file.name, destination)
        return destination
--- a/video_processor/sources/onenote_source.py
+++ b/video_processor/sources/onenote_source.py
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/onenote_source.py
+++ b/video_processor/sources/onenote_source.py
@@ -0,0 +1,222 @@
1 """Microsoft OneNote source connector using the m365 CLI (cli-microsoft365).
2
3 Fetches pages from OneNote notebooks via the `m365` CLI tool.
4 Outputs plain text suitable for KG ingestion.
5
6 Requires: npm install -g @pnp/cli-microsoft365
7 Auth: m365 login (interactive)
8 Docs: https://pnp.github.io/cli-microsoft365/
9 """
10
import json
import logging
import re
import shutil
import subprocess
from html import unescape
from pathlib import Path
from typing import Any, List, Optional

from video_processor.sources.base import BaseSource, SourceFile
20
21 logger = logging.getLogger(__name__)
22
23
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Invoke the ``m365`` CLI and return its decoded output.

    Appends ``--output json`` so the CLI emits machine-readable output.
    If stdout is not valid JSON, the raw (stripped) text is returned
    instead.

    Args:
        args: CLI arguments after the ``m365`` executable name.
        timeout: Seconds before ``subprocess.TimeoutExpired`` is raised.

    Raises:
        RuntimeError: if the CLI exits with a non-zero status.
    """
    completed = subprocess.run(
        ["m365", *args, "--output", "json"],
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if completed.returncode:
        joined = " ".join(args)
        raise RuntimeError(f"m365 {joined} failed: {completed.stderr.strip()}")
    try:
        return json.loads(completed.stdout)
    except json.JSONDecodeError:
        return completed.stdout.strip()
34
35
36 def _html_to_text(html: str) -> str:
37 """Strip HTML tags and decode entities to produce plain text.
38
39 Uses only stdlib ``re`` — no external dependencies.
40 """
41 # Remove script/style blocks entirely
42 text = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
43 # Replace <br>, <p>, <div>, <li>, <tr> with newlines for readability
44 text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
45 text = re.sub(r"</(p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE)
46 # Strip remaining tags
47 text = re.sub(r"<[^>]+>", "", text)
48 # Decode common HTML entities
49 entity_map = {
50 "&amp;": "&",
51 "&lt;": "<",
52 "&gt;": ">",
53 "&quot;": '"',
54 "&#39;": "'",
55 "&apos;": "'",
56 "&nbsp;": " ",
57 }
58 for entity, char in entity_map.items():
59 text = text.replace(entity, char)
60 # Decode numeric entities (&#123; and &#x1a;)
61 text = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), text)
62 text = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), text)
63 # Collapse excessive blank lines
64 text = re.sub(r"\n{3,}", "\n\n", text)
65 return text.strip()
66
67
class OneNoteSource(BaseSource):
    """
    Fetch pages from OneNote notebooks via the m365 CLI.

    Usage:
        source = OneNoteSource()  # all notebooks
        source = OneNoteSource(notebook_name="Work Notes")  # specific notebook
        source = OneNoteSource(notebook_name="Work", section_name="Meetings")
        files = source.list_videos()
        source.download_all(files, Path("./notes"))
    """

    def __init__(
        self,
        notebook_name: Optional[str] = None,
        section_name: Optional[str] = None,
    ):
        # Case-insensitive substring filters; None means "no filtering".
        self.notebook_name = notebook_name
        self.section_name = section_name

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in."""
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
            # JSON output exposes "connectedAs"; plain-text output says "Logged in".
            if isinstance(result, dict) and result.get("connectedAs"):
                return True
            if isinstance(result, str) and "Logged in" in result:
                return True
            logger.error("m365 not logged in. Run: m365 login")
            return False
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List OneNote pages across notebooks/sections. Returns SourceFile per page.

        The ``folder_id``/``folder_path``/``patterns`` parameters belong to the
        BaseSource interface and are not used by this connector.
        """
        files: List[SourceFile] = []

        # Step 1: List notebooks
        try:
            notebooks = _run_m365(["onenote", "notebook", "list"], timeout=60)
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            # TimeoutExpired previously escaped here and crashed the caller;
            # treat it the same as any other CLI failure.
            logger.error(f"Failed to list OneNote notebooks: {e}")
            return []

        if not isinstance(notebooks, list):
            notebooks = []

        # Filter notebooks by name if specified. "or ''" guards against a
        # JSON null displayName (key present, value None).
        if self.notebook_name:
            notebooks = [
                nb
                for nb in notebooks
                if self.notebook_name.lower() in (nb.get("displayName") or "").lower()
            ]

        for notebook in notebooks:
            notebook_id = notebook.get("id", "")
            notebook_name = notebook.get("displayName") or "Untitled Notebook"

            # Step 2: List sections in this notebook
            try:
                sections = _run_m365(
                    ["onenote", "section", "list", "--notebookId", notebook_id],
                    timeout=60,
                )
            except (RuntimeError, subprocess.TimeoutExpired) as e:
                logger.warning(f"Failed to list sections for notebook '{notebook_name}': {e}")
                continue

            if not isinstance(sections, list):
                sections = []

            # Filter sections by name if specified
            if self.section_name:
                sections = [
                    s
                    for s in sections
                    if self.section_name.lower() in (s.get("displayName") or "").lower()
                ]

            for section in sections:
                section_id = section.get("id", "")
                section_name = section.get("displayName") or "Untitled Section"

                # Step 3: List pages in this section
                try:
                    pages = _run_m365(
                        ["onenote", "page", "list", "--sectionId", section_id],
                        timeout=60,
                    )
                except (RuntimeError, subprocess.TimeoutExpired) as e:
                    logger.warning(f"Failed to list pages in section '{section_name}': {e}")
                    continue

                if not isinstance(pages, list):
                    pages = []

                for page in pages:
                    page_id = page.get("id", "")
                    # The API may return a null title; .strip() on None would
                    # raise AttributeError, so coalesce before stripping.
                    title = (page.get("title") or "").strip() or "Untitled Page"
                    modified = page.get("lastModifiedDateTime")
                    # Build a path for organizational context
                    page_path = f"{notebook_name}/{section_name}/{title}"

                    files.append(
                        SourceFile(
                            name=title,
                            id=str(page_id),
                            size_bytes=None,
                            mime_type="text/html",
                            modified_at=modified,
                            path=page_path,
                        )
                    )

        logger.info(f"Found {len(files)} page(s) in OneNote")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a OneNote page's content as a text file.

        Args:
            file: SourceFile describing the page (``id`` is the page id).
            destination: Target file path; parent directories are created.

        Returns:
            The destination path the plain-text content was written to.

        Raises:
            RuntimeError: if the page cannot be fetched (including timeout).
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = _run_m365(
                ["onenote", "page", "get", "--id", file.id],
                timeout=60,
            )
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            # Wrap timeouts too, so callers only ever see RuntimeError here.
            raise RuntimeError(f"Failed to fetch OneNote page {file.id}: {e}") from e

        # Extract HTML content from the result
        if isinstance(result, dict):
            html = result.get("content", result.get("body", {}).get("content", ""))
            if not html:
                # Fallback: serialize the whole response
                html = json.dumps(result, indent=2)
        elif isinstance(result, str):
            html = result
        else:
            html = str(result)

        text = _html_to_text(html)
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved page '{file.name}' to {destination}")
        return destination

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button