PlanOpticon

feat(sources): add Google Workspace (gws) and Microsoft 365 (m365) source connectors - GWSSource: fetch/collate Google Docs, Sheets, Slides via gws CLI - M365Source: fetch docs from SharePoint/OneDrive via m365 CLI - CLI groups: planopticon gws {list,fetch,ingest} and planopticon m365 {list,fetch,ingest} - Both support direct ingest into knowledge graphs - Text extraction for .docx, .xlsx, .pdf, .html, .txt, .md, .csv - 29 new tests (GWSSource, M365Source, CLI help)

lmata 2026-03-07 23:02 trunk
Commit 7d809ed2e1f41acb2eb046fb778e401a3e5bf1a65590f0a95002684322014355
--- tests/test_cli.py
+++ tests/test_cli.py
@@ -20,10 +20,12 @@
2020
assert "PlanOpticon" in result.output
2121
assert "analyze" in result.output
2222
assert "query" in result.output
2323
assert "agent" in result.output
2424
assert "kg" in result.output
25
+ assert "gws" in result.output
26
+ assert "m365" in result.output
2527
assert "ingest" in result.output
2628
assert "batch" in result.output
2729
2830
2931
class TestAnalyzeHelp:
@@ -140,13 +142,64 @@
140142
assert result.exit_code == 0
141143
assert "--cache-dir" in result.output
142144
assert "--older-than" in result.output
143145
assert "--all" in result.output
144146
147
+
148
+class TestGWSHelp:
149
+ def test_help(self):
150
+ runner = CliRunner()
151
+ result = runner.invoke(cli, ["gws", "--help"])
152
+ assert result.exit_code == 0
153
+ assert "list" in result.output
154
+ assert "fetch" in result.output
155
+ assert "ingest" in result.output
156
+
157
+ def test_list_help(self):
158
+ runner = CliRunner()
159
+ result = runner.invoke(cli, ["gws", "list", "--help"])
160
+ assert result.exit_code == 0
161
+ assert "--folder-id" in result.output
162
+ assert "--query" in result.output
163
+
164
+ def test_ingest_help(self):
165
+ runner = CliRunner()
166
+ result = runner.invoke(cli, ["gws", "ingest", "--help"])
167
+ assert result.exit_code == 0
168
+ assert "--folder-id" in result.output
169
+ assert "--doc-id" in result.output
170
+ assert "--db-path" in result.output
171
+
172
+
173
+class TestM365Help:
174
+ def test_help(self):
175
+ runner = CliRunner()
176
+ result = runner.invoke(cli, ["m365", "--help"])
177
+ assert result.exit_code == 0
178
+ assert "list" in result.output
179
+ assert "fetch" in result.output
180
+ assert "ingest" in result.output
181
+
182
+ def test_list_help(self):
183
+ runner = CliRunner()
184
+ result = runner.invoke(cli, ["m365", "list", "--help"])
185
+ assert result.exit_code == 0
186
+ assert "--web-url" in result.output
187
+ assert "--folder-url" in result.output
188
+ assert "--recursive" in result.output
189
+
190
+ def test_ingest_help(self):
191
+ runner = CliRunner()
192
+ result = runner.invoke(cli, ["m365", "ingest", "--help"])
193
+ assert result.exit_code == 0
194
+ assert "--web-url" in result.output
195
+ assert "--file-id" in result.output
196
+ assert "--db-path" in result.output
197
+
145198
146199
class TestAuthHelp:
147200
def test_help(self):
148201
runner = CliRunner()
149202
result = runner.invoke(cli, ["auth", "--help"])
150203
assert result.exit_code == 0
151204
assert "google" in result.output
152205
assert "dropbox" in result.output
153206
--- tests/test_cli.py
+++ tests/test_cli.py
@@ -20,10 +20,12 @@
20 assert "PlanOpticon" in result.output
21 assert "analyze" in result.output
22 assert "query" in result.output
23 assert "agent" in result.output
24 assert "kg" in result.output
 
 
25 assert "ingest" in result.output
26 assert "batch" in result.output
27
28
29 class TestAnalyzeHelp:
@@ -140,13 +142,64 @@
140 assert result.exit_code == 0
141 assert "--cache-dir" in result.output
142 assert "--older-than" in result.output
143 assert "--all" in result.output
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
146 class TestAuthHelp:
147 def test_help(self):
148 runner = CliRunner()
149 result = runner.invoke(cli, ["auth", "--help"])
150 assert result.exit_code == 0
151 assert "google" in result.output
152 assert "dropbox" in result.output
153
--- tests/test_cli.py
+++ tests/test_cli.py
@@ -20,10 +20,12 @@
20 assert "PlanOpticon" in result.output
21 assert "analyze" in result.output
22 assert "query" in result.output
23 assert "agent" in result.output
24 assert "kg" in result.output
25 assert "gws" in result.output
26 assert "m365" in result.output
27 assert "ingest" in result.output
28 assert "batch" in result.output
29
30
31 class TestAnalyzeHelp:
@@ -140,13 +142,64 @@
142 assert result.exit_code == 0
143 assert "--cache-dir" in result.output
144 assert "--older-than" in result.output
145 assert "--all" in result.output
146
147
148 class TestGWSHelp:
149 def test_help(self):
150 runner = CliRunner()
151 result = runner.invoke(cli, ["gws", "--help"])
152 assert result.exit_code == 0
153 assert "list" in result.output
154 assert "fetch" in result.output
155 assert "ingest" in result.output
156
157 def test_list_help(self):
158 runner = CliRunner()
159 result = runner.invoke(cli, ["gws", "list", "--help"])
160 assert result.exit_code == 0
161 assert "--folder-id" in result.output
162 assert "--query" in result.output
163
164 def test_ingest_help(self):
165 runner = CliRunner()
166 result = runner.invoke(cli, ["gws", "ingest", "--help"])
167 assert result.exit_code == 0
168 assert "--folder-id" in result.output
169 assert "--doc-id" in result.output
170 assert "--db-path" in result.output
171
172
173 class TestM365Help:
174 def test_help(self):
175 runner = CliRunner()
176 result = runner.invoke(cli, ["m365", "--help"])
177 assert result.exit_code == 0
178 assert "list" in result.output
179 assert "fetch" in result.output
180 assert "ingest" in result.output
181
182 def test_list_help(self):
183 runner = CliRunner()
184 result = runner.invoke(cli, ["m365", "list", "--help"])
185 assert result.exit_code == 0
186 assert "--web-url" in result.output
187 assert "--folder-url" in result.output
188 assert "--recursive" in result.output
189
190 def test_ingest_help(self):
191 runner = CliRunner()
192 result = runner.invoke(cli, ["m365", "ingest", "--help"])
193 assert result.exit_code == 0
194 assert "--web-url" in result.output
195 assert "--file-id" in result.output
196 assert "--db-path" in result.output
197
198
199 class TestAuthHelp:
200 def test_help(self):
201 runner = CliRunner()
202 result = runner.invoke(cli, ["auth", "--help"])
203 assert result.exit_code == 0
204 assert "google" in result.output
205 assert "dropbox" in result.output
206
--- tests/test_sources.py
+++ tests/test_sources.py
@@ -534,5 +534,329 @@
534534
# Only .mp4 and .mkv are video extensions
535535
assert len(files) == 2
536536
names = [f.name for f in files]
537537
assert "clip.mp4" in names
538538
assert "movie.mkv" in names
539
+
540
+
541
+# ---------------------------------------------------------------------------
542
+# GWSSource
543
+# ---------------------------------------------------------------------------
544
+
545
+
546
+class TestGWSSource:
547
+ def test_import(self):
548
+ from video_processor.sources.gws_source import GWSSource
549
+
550
+ assert GWSSource is not None
551
+
552
+ def test_constructor_defaults(self):
553
+ from video_processor.sources.gws_source import GWSSource
554
+
555
+ src = GWSSource()
556
+ assert src.folder_id is None
557
+ assert src.query is None
558
+ assert src.doc_ids == []
559
+
560
+ def test_constructor_with_folder(self):
561
+ from video_processor.sources.gws_source import GWSSource
562
+
563
+ src = GWSSource(folder_id="1abc", query="name contains 'spec'")
564
+ assert src.folder_id == "1abc"
565
+ assert src.query == "name contains 'spec'"
566
+
567
+ def test_constructor_with_doc_ids(self):
568
+ from video_processor.sources.gws_source import GWSSource
569
+
570
+ src = GWSSource(doc_ids=["doc1", "doc2"])
571
+ assert src.doc_ids == ["doc1", "doc2"]
572
+
573
+ @patch("shutil.which", return_value=None)
574
+ def test_authenticate_no_gws(self, _mock_which):
575
+ from video_processor.sources.gws_source import GWSSource
576
+
577
+ src = GWSSource()
578
+ assert src.authenticate() is False
579
+
580
+ @patch("video_processor.sources.gws_source._run_gws")
581
+ @patch("shutil.which", return_value="/usr/local/bin/gws")
582
+ def test_authenticate_success(self, _mock_which, mock_run):
583
+ from video_processor.sources.gws_source import GWSSource
584
+
585
+ mock_run.return_value = {"connectedAs": "[email protected]"}
586
+ src = GWSSource()
587
+ assert src.authenticate() is True
588
+
589
+ @patch("video_processor.sources.gws_source._run_gws")
590
+ @patch("shutil.which", return_value="/usr/local/bin/gws")
591
+ def test_list_videos(self, _mock_which, mock_run):
592
+ from video_processor.sources.gws_source import GWSSource
593
+
594
+ mock_run.return_value = {
595
+ "files": [
596
+ {
597
+ "id": "doc123",
598
+ "name": "Project Spec",
599
+ "mimeType": "application/vnd.google-apps.document",
600
+ "modifiedTime": "2026-01-01T00:00:00Z",
601
+ },
602
+ {
603
+ "id": "sheet456",
604
+ "name": "Budget",
605
+ "mimeType": "application/vnd.google-apps.spreadsheet",
606
+ },
607
+ ]
608
+ }
609
+ src = GWSSource(folder_id="folder1")
610
+ files = src.list_videos()
611
+ assert len(files) == 2
612
+ assert files[0].name == "Project Spec"
613
+ assert files[1].id == "sheet456"
614
+
615
+ @patch("video_processor.sources.gws_source._run_gws")
616
+ @patch("shutil.which", return_value="/usr/local/bin/gws")
617
+ def test_list_videos_with_doc_ids(self, _mock_which, mock_run):
618
+ from video_processor.sources.gws_source import GWSSource
619
+
620
+ mock_run.return_value = {
621
+ "id": "doc123",
622
+ "name": "My Doc",
623
+ "mimeType": "application/vnd.google-apps.document",
624
+ }
625
+ src = GWSSource(doc_ids=["doc123"])
626
+ files = src.list_videos()
627
+ assert len(files) == 1
628
+ assert files[0].name == "My Doc"
629
+
630
+ def test_result_to_source_file(self):
631
+ from video_processor.sources.gws_source import _result_to_source_file
632
+
633
+ sf = _result_to_source_file(
634
+ {
635
+ "id": "abc",
636
+ "name": "Test Doc",
637
+ "mimeType": "text/plain",
638
+ "size": "1024",
639
+ "modifiedTime": "2026-03-01",
640
+ }
641
+ )
642
+ assert sf.name == "Test Doc"
643
+ assert sf.id == "abc"
644
+ assert sf.size_bytes == 1024
645
+ assert sf.mime_type == "text/plain"
646
+
647
+ @patch("video_processor.sources.gws_source._run_gws")
648
+ def test_get_doc_text(self, mock_run):
649
+ from video_processor.sources.gws_source import GWSSource
650
+
651
+ mock_run.return_value = {
652
+ "body": {
653
+ "content": [
654
+ {
655
+ "paragraph": {
656
+ "elements": [
657
+ {"textRun": {"content": "Hello world\n"}},
658
+ ]
659
+ }
660
+ },
661
+ {
662
+ "paragraph": {
663
+ "elements": [
664
+ {"textRun": {"content": "Second paragraph\n"}},
665
+ ]
666
+ }
667
+ },
668
+ ]
669
+ }
670
+ }
671
+ src = GWSSource()
672
+ text = src._get_doc_text("doc123")
673
+ assert "Hello world" in text
674
+ assert "Second paragraph" in text
675
+
676
+ @patch("video_processor.sources.gws_source._run_gws")
677
+ def test_collate(self, mock_run):
678
+ from video_processor.sources.gws_source import GWSSource
679
+
680
+ # First call: list files, second+: export each
681
+ mock_run.side_effect = [
682
+ {
683
+ "files": [
684
+ {
685
+ "id": "d1",
686
+ "name": "Doc A",
687
+ "mimeType": "application/vnd.google-apps.document",
688
+ },
689
+ ]
690
+ },
691
+ {"raw": "Content of Doc A"},
692
+ ]
693
+ src = GWSSource(folder_id="f1")
694
+ result = src.collate()
695
+ assert "Doc A" in result
696
+ assert "Content of Doc A" in result
697
+
698
+
699
+# ---------------------------------------------------------------------------
700
+# M365Source
701
+# ---------------------------------------------------------------------------
702
+
703
+
704
+class TestM365Source:
705
+ def test_import(self):
706
+ from video_processor.sources.m365_source import M365Source
707
+
708
+ assert M365Source is not None
709
+
710
+ def test_constructor(self):
711
+ from video_processor.sources.m365_source import M365Source
712
+
713
+ src = M365Source(
714
+ web_url="https://contoso.sharepoint.com/sites/proj",
715
+ folder_url="/sites/proj/Shared Documents",
716
+ )
717
+ assert src.web_url == "https://contoso.sharepoint.com/sites/proj"
718
+ assert src.folder_url == "/sites/proj/Shared Documents"
719
+ assert src.file_ids == []
720
+ assert src.recursive is False
721
+
722
+ def test_constructor_with_file_ids(self):
723
+ from video_processor.sources.m365_source import M365Source
724
+
725
+ src = M365Source(
726
+ web_url="https://contoso.sharepoint.com",
727
+ file_ids=["id1", "id2"],
728
+ )
729
+ assert src.file_ids == ["id1", "id2"]
730
+
731
+ @patch("shutil.which", return_value=None)
732
+ def test_authenticate_no_m365(self, _mock_which):
733
+ from video_processor.sources.m365_source import M365Source
734
+
735
+ src = M365Source(web_url="https://contoso.sharepoint.com")
736
+ assert src.authenticate() is False
737
+
738
+ @patch("video_processor.sources.m365_source._run_m365")
739
+ @patch("shutil.which", return_value="/usr/local/bin/m365")
740
+ def test_authenticate_logged_in(self, _mock_which, mock_run):
741
+ from video_processor.sources.m365_source import M365Source
742
+
743
+ mock_run.return_value = {"connectedAs": "[email protected]"}
744
+ src = M365Source(web_url="https://contoso.sharepoint.com")
745
+ assert src.authenticate() is True
746
+
747
+ @patch("video_processor.sources.m365_source._run_m365")
748
+ @patch("shutil.which", return_value="/usr/local/bin/m365")
749
+ def test_authenticate_not_logged_in(self, _mock_which, mock_run):
750
+ from video_processor.sources.m365_source import M365Source
751
+
752
+ mock_run.return_value = {}
753
+ src = M365Source(web_url="https://contoso.sharepoint.com")
754
+ assert src.authenticate() is False
755
+
756
+ @patch("video_processor.sources.m365_source._run_m365")
757
+ @patch("shutil.which", return_value="/usr/local/bin/m365")
758
+ def test_list_videos(self, _mock_which, mock_run):
759
+ from video_processor.sources.m365_source import M365Source
760
+
761
+ mock_run.side_effect = [
762
+ {"connectedAs": "[email protected]"}, # authenticate
763
+ [
764
+ {
765
+ "Name": "spec.docx",
766
+ "UniqueId": "uid-1",
767
+ "Length": "20480",
768
+ "ServerRelativeUrl": "/sites/proj/docs/spec.docx",
769
+ },
770
+ {
771
+ "Name": "budget.xlsx",
772
+ "UniqueId": "uid-2",
773
+ "Length": "10240",
774
+ "ServerRelativeUrl": "/sites/proj/docs/budget.xlsx",
775
+ },
776
+ {
777
+ "Name": "image.png",
778
+ "UniqueId": "uid-3",
779
+ "Length": "5000",
780
+ "ServerRelativeUrl": "/sites/proj/docs/image.png",
781
+ },
782
+ ],
783
+ ]
784
+ src = M365Source(
785
+ web_url="https://contoso.sharepoint.com/sites/proj",
786
+ folder_url="/sites/proj/docs",
787
+ )
788
+ src.authenticate()
789
+ files = src.list_videos()
790
+ # Only .docx and .xlsx match _DOC_EXTENSIONS, not .png
791
+ assert len(files) == 2
792
+ names = [f.name for f in files]
793
+ assert "spec.docx" in names
794
+ assert "budget.xlsx" in names
795
+
796
+ @patch("video_processor.sources.m365_source._run_m365")
797
+ def test_list_videos_with_file_ids(self, mock_run):
798
+ from video_processor.sources.m365_source import M365Source
799
+
800
+ mock_run.return_value = {
801
+ "Name": "report.pdf",
802
+ "UniqueId": "uid-1",
803
+ "Length": "50000",
804
+ "ServerRelativeUrl": "/sites/proj/docs/report.pdf",
805
+ }
806
+ src = M365Source(
807
+ web_url="https://contoso.sharepoint.com",
808
+ file_ids=["uid-1"],
809
+ )
810
+ files = src.list_videos()
811
+ assert len(files) == 1
812
+ assert files[0].name == "report.pdf"
813
+
814
+ def test_result_to_source_file(self):
815
+ from video_processor.sources.m365_source import _result_to_source_file
816
+
817
+ sf = _result_to_source_file(
818
+ {
819
+ "Name": "notes.txt",
820
+ "UniqueId": "abc-123",
821
+ "Length": "512",
822
+ "ServerRelativeUrl": "/sites/proj/notes.txt",
823
+ "TimeLastModified": "2026-03-01T12:00:00Z",
824
+ }
825
+ )
826
+ assert sf.name == "notes.txt"
827
+ assert sf.id == "abc-123"
828
+ assert sf.size_bytes == 512
829
+ assert sf.path == "/sites/proj/notes.txt"
830
+ assert sf.modified_at == "2026-03-01T12:00:00Z"
831
+
832
+ def test_extract_text_txt(self, tmp_path):
833
+ from video_processor.sources.m365_source import _extract_text
834
+
835
+ f = tmp_path / "test.txt"
836
+ f.write_text("Hello from a text file")
837
+ result = _extract_text(f)
838
+ assert result == "Hello from a text file"
839
+
840
+ def test_extract_text_md(self, tmp_path):
841
+ from video_processor.sources.m365_source import _extract_text
842
+
843
+ f = tmp_path / "readme.md"
844
+ f.write_text("# Title\n\nSome content")
845
+ result = _extract_text(f)
846
+ assert "Title" in result
847
+ assert "Some content" in result
848
+
849
+ def test_extract_text_unsupported(self, tmp_path):
850
+ from video_processor.sources.m365_source import _extract_text
851
+
852
+ f = tmp_path / "data.bin"
853
+ f.write_bytes(b"\x00\x01\x02")
854
+ result = _extract_text(f)
855
+ assert "Unsupported" in result
856
+
857
+ def test_list_no_folder_url(self):
858
+ from video_processor.sources.m365_source import M365Source
859
+
860
+ src = M365Source(web_url="https://contoso.sharepoint.com")
861
+ files = src.list_videos()
862
+ assert files == []
539863
--- tests/test_sources.py
+++ tests/test_sources.py
@@ -534,5 +534,329 @@
534 # Only .mp4 and .mkv are video extensions
535 assert len(files) == 2
536 names = [f.name for f in files]
537 assert "clip.mp4" in names
538 assert "movie.mkv" in names
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
--- tests/test_sources.py
+++ tests/test_sources.py
@@ -534,5 +534,329 @@
534 # Only .mp4 and .mkv are video extensions
535 assert len(files) == 2
536 names = [f.name for f in files]
537 assert "clip.mp4" in names
538 assert "movie.mkv" in names
539
540
541 # ---------------------------------------------------------------------------
542 # GWSSource
543 # ---------------------------------------------------------------------------
544
545
546 class TestGWSSource:
547 def test_import(self):
548 from video_processor.sources.gws_source import GWSSource
549
550 assert GWSSource is not None
551
552 def test_constructor_defaults(self):
553 from video_processor.sources.gws_source import GWSSource
554
555 src = GWSSource()
556 assert src.folder_id is None
557 assert src.query is None
558 assert src.doc_ids == []
559
560 def test_constructor_with_folder(self):
561 from video_processor.sources.gws_source import GWSSource
562
563 src = GWSSource(folder_id="1abc", query="name contains 'spec'")
564 assert src.folder_id == "1abc"
565 assert src.query == "name contains 'spec'"
566
567 def test_constructor_with_doc_ids(self):
568 from video_processor.sources.gws_source import GWSSource
569
570 src = GWSSource(doc_ids=["doc1", "doc2"])
571 assert src.doc_ids == ["doc1", "doc2"]
572
573 @patch("shutil.which", return_value=None)
574 def test_authenticate_no_gws(self, _mock_which):
575 from video_processor.sources.gws_source import GWSSource
576
577 src = GWSSource()
578 assert src.authenticate() is False
579
580 @patch("video_processor.sources.gws_source._run_gws")
581 @patch("shutil.which", return_value="/usr/local/bin/gws")
582 def test_authenticate_success(self, _mock_which, mock_run):
583 from video_processor.sources.gws_source import GWSSource
584
585 mock_run.return_value = {"connectedAs": "[email protected]"}
586 src = GWSSource()
587 assert src.authenticate() is True
588
589 @patch("video_processor.sources.gws_source._run_gws")
590 @patch("shutil.which", return_value="/usr/local/bin/gws")
591 def test_list_videos(self, _mock_which, mock_run):
592 from video_processor.sources.gws_source import GWSSource
593
594 mock_run.return_value = {
595 "files": [
596 {
597 "id": "doc123",
598 "name": "Project Spec",
599 "mimeType": "application/vnd.google-apps.document",
600 "modifiedTime": "2026-01-01T00:00:00Z",
601 },
602 {
603 "id": "sheet456",
604 "name": "Budget",
605 "mimeType": "application/vnd.google-apps.spreadsheet",
606 },
607 ]
608 }
609 src = GWSSource(folder_id="folder1")
610 files = src.list_videos()
611 assert len(files) == 2
612 assert files[0].name == "Project Spec"
613 assert files[1].id == "sheet456"
614
615 @patch("video_processor.sources.gws_source._run_gws")
616 @patch("shutil.which", return_value="/usr/local/bin/gws")
617 def test_list_videos_with_doc_ids(self, _mock_which, mock_run):
618 from video_processor.sources.gws_source import GWSSource
619
620 mock_run.return_value = {
621 "id": "doc123",
622 "name": "My Doc",
623 "mimeType": "application/vnd.google-apps.document",
624 }
625 src = GWSSource(doc_ids=["doc123"])
626 files = src.list_videos()
627 assert len(files) == 1
628 assert files[0].name == "My Doc"
629
630 def test_result_to_source_file(self):
631 from video_processor.sources.gws_source import _result_to_source_file
632
633 sf = _result_to_source_file(
634 {
635 "id": "abc",
636 "name": "Test Doc",
637 "mimeType": "text/plain",
638 "size": "1024",
639 "modifiedTime": "2026-03-01",
640 }
641 )
642 assert sf.name == "Test Doc"
643 assert sf.id == "abc"
644 assert sf.size_bytes == 1024
645 assert sf.mime_type == "text/plain"
646
647 @patch("video_processor.sources.gws_source._run_gws")
648 def test_get_doc_text(self, mock_run):
649 from video_processor.sources.gws_source import GWSSource
650
651 mock_run.return_value = {
652 "body": {
653 "content": [
654 {
655 "paragraph": {
656 "elements": [
657 {"textRun": {"content": "Hello world\n"}},
658 ]
659 }
660 },
661 {
662 "paragraph": {
663 "elements": [
664 {"textRun": {"content": "Second paragraph\n"}},
665 ]
666 }
667 },
668 ]
669 }
670 }
671 src = GWSSource()
672 text = src._get_doc_text("doc123")
673 assert "Hello world" in text
674 assert "Second paragraph" in text
675
676 @patch("video_processor.sources.gws_source._run_gws")
677 def test_collate(self, mock_run):
678 from video_processor.sources.gws_source import GWSSource
679
680 # First call: list files, second+: export each
681 mock_run.side_effect = [
682 {
683 "files": [
684 {
685 "id": "d1",
686 "name": "Doc A",
687 "mimeType": "application/vnd.google-apps.document",
688 },
689 ]
690 },
691 {"raw": "Content of Doc A"},
692 ]
693 src = GWSSource(folder_id="f1")
694 result = src.collate()
695 assert "Doc A" in result
696 assert "Content of Doc A" in result
697
698
699 # ---------------------------------------------------------------------------
700 # M365Source
701 # ---------------------------------------------------------------------------
702
703
704 class TestM365Source:
705 def test_import(self):
706 from video_processor.sources.m365_source import M365Source
707
708 assert M365Source is not None
709
710 def test_constructor(self):
711 from video_processor.sources.m365_source import M365Source
712
713 src = M365Source(
714 web_url="https://contoso.sharepoint.com/sites/proj",
715 folder_url="/sites/proj/Shared Documents",
716 )
717 assert src.web_url == "https://contoso.sharepoint.com/sites/proj"
718 assert src.folder_url == "/sites/proj/Shared Documents"
719 assert src.file_ids == []
720 assert src.recursive is False
721
722 def test_constructor_with_file_ids(self):
723 from video_processor.sources.m365_source import M365Source
724
725 src = M365Source(
726 web_url="https://contoso.sharepoint.com",
727 file_ids=["id1", "id2"],
728 )
729 assert src.file_ids == ["id1", "id2"]
730
731 @patch("shutil.which", return_value=None)
732 def test_authenticate_no_m365(self, _mock_which):
733 from video_processor.sources.m365_source import M365Source
734
735 src = M365Source(web_url="https://contoso.sharepoint.com")
736 assert src.authenticate() is False
737
738 @patch("video_processor.sources.m365_source._run_m365")
739 @patch("shutil.which", return_value="/usr/local/bin/m365")
740 def test_authenticate_logged_in(self, _mock_which, mock_run):
741 from video_processor.sources.m365_source import M365Source
742
743 mock_run.return_value = {"connectedAs": "[email protected]"}
744 src = M365Source(web_url="https://contoso.sharepoint.com")
745 assert src.authenticate() is True
746
747 @patch("video_processor.sources.m365_source._run_m365")
748 @patch("shutil.which", return_value="/usr/local/bin/m365")
749 def test_authenticate_not_logged_in(self, _mock_which, mock_run):
750 from video_processor.sources.m365_source import M365Source
751
752 mock_run.return_value = {}
753 src = M365Source(web_url="https://contoso.sharepoint.com")
754 assert src.authenticate() is False
755
756 @patch("video_processor.sources.m365_source._run_m365")
757 @patch("shutil.which", return_value="/usr/local/bin/m365")
758 def test_list_videos(self, _mock_which, mock_run):
759 from video_processor.sources.m365_source import M365Source
760
761 mock_run.side_effect = [
762 {"connectedAs": "[email protected]"}, # authenticate
763 [
764 {
765 "Name": "spec.docx",
766 "UniqueId": "uid-1",
767 "Length": "20480",
768 "ServerRelativeUrl": "/sites/proj/docs/spec.docx",
769 },
770 {
771 "Name": "budget.xlsx",
772 "UniqueId": "uid-2",
773 "Length": "10240",
774 "ServerRelativeUrl": "/sites/proj/docs/budget.xlsx",
775 },
776 {
777 "Name": "image.png",
778 "UniqueId": "uid-3",
779 "Length": "5000",
780 "ServerRelativeUrl": "/sites/proj/docs/image.png",
781 },
782 ],
783 ]
784 src = M365Source(
785 web_url="https://contoso.sharepoint.com/sites/proj",
786 folder_url="/sites/proj/docs",
787 )
788 src.authenticate()
789 files = src.list_videos()
790 # Only .docx and .xlsx match _DOC_EXTENSIONS, not .png
791 assert len(files) == 2
792 names = [f.name for f in files]
793 assert "spec.docx" in names
794 assert "budget.xlsx" in names
795
796 @patch("video_processor.sources.m365_source._run_m365")
797 def test_list_videos_with_file_ids(self, mock_run):
798 from video_processor.sources.m365_source import M365Source
799
800 mock_run.return_value = {
801 "Name": "report.pdf",
802 "UniqueId": "uid-1",
803 "Length": "50000",
804 "ServerRelativeUrl": "/sites/proj/docs/report.pdf",
805 }
806 src = M365Source(
807 web_url="https://contoso.sharepoint.com",
808 file_ids=["uid-1"],
809 )
810 files = src.list_videos()
811 assert len(files) == 1
812 assert files[0].name == "report.pdf"
813
814 def test_result_to_source_file(self):
815 from video_processor.sources.m365_source import _result_to_source_file
816
817 sf = _result_to_source_file(
818 {
819 "Name": "notes.txt",
820 "UniqueId": "abc-123",
821 "Length": "512",
822 "ServerRelativeUrl": "/sites/proj/notes.txt",
823 "TimeLastModified": "2026-03-01T12:00:00Z",
824 }
825 )
826 assert sf.name == "notes.txt"
827 assert sf.id == "abc-123"
828 assert sf.size_bytes == 512
829 assert sf.path == "/sites/proj/notes.txt"
830 assert sf.modified_at == "2026-03-01T12:00:00Z"
831
832 def test_extract_text_txt(self, tmp_path):
833 from video_processor.sources.m365_source import _extract_text
834
835 f = tmp_path / "test.txt"
836 f.write_text("Hello from a text file")
837 result = _extract_text(f)
838 assert result == "Hello from a text file"
839
840 def test_extract_text_md(self, tmp_path):
841 from video_processor.sources.m365_source import _extract_text
842
843 f = tmp_path / "readme.md"
844 f.write_text("# Title\n\nSome content")
845 result = _extract_text(f)
846 assert "Title" in result
847 assert "Some content" in result
848
849 def test_extract_text_unsupported(self, tmp_path):
850 from video_processor.sources.m365_source import _extract_text
851
852 f = tmp_path / "data.bin"
853 f.write_bytes(b"\x00\x01\x02")
854 result = _extract_text(f)
855 assert "Unsupported" in result
856
857 def test_list_no_folder_url(self):
858 from video_processor.sources.m365_source import M365Source
859
860 src = M365Source(web_url="https://contoso.sharepoint.com")
861 files = src.list_videos()
862 assert files == []
863
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -1013,10 +1013,409 @@
10131013
click.echo("Dropbox authentication successful.")
10141014
else:
10151015
click.echo("Dropbox authentication failed.", err=True)
10161016
sys.exit(1)
10171017
1018
+
1019
+@cli.group()
1020
+def gws():
1021
+ """Google Workspace: fetch docs, sheets, and slides via the gws CLI."""
1022
+ pass
1023
+
1024
+
1025
+@gws.command("list")
1026
+@click.option("--folder-id", type=str, default=None, help="Drive folder ID to list")
1027
+@click.option("--query", "-q", type=str, default=None, help="Drive search query")
1028
+@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
1029
+def gws_list(folder_id, query, as_json):
1030
+ """List documents in Google Drive.
1031
+
1032
+ Examples:
1033
+
1034
+ planopticon gws list
1035
+
1036
+ planopticon gws list --folder-id 1abc...
1037
+
1038
+ planopticon gws list -q "name contains 'PRD'" --json
1039
+ """
1040
+ from video_processor.sources.gws_source import GWSSource
1041
+
1042
+ source = GWSSource(folder_id=folder_id, query=query)
1043
+ if not source.authenticate():
1044
+ click.echo("Error: gws CLI not available or not authenticated.", err=True)
1045
+ click.echo("Install: npm install -g @googleworkspace/cli", err=True)
1046
+ click.echo("Auth: gws auth login", err=True)
1047
+ sys.exit(1)
1048
+
1049
+ files = source.list_videos(folder_id=folder_id)
1050
+ if as_json:
1051
+ click.echo(json.dumps([f.model_dump() for f in files], indent=2, default=str))
1052
+ else:
1053
+ if not files:
1054
+ click.echo("No documents found.")
1055
+ return
1056
+ for f in files:
1057
+ size = f"{f.size_bytes / 1024:.0f}KB" if f.size_bytes else "—"
1058
+ click.echo(f" {f.id[:12]}… {size:>8s} {f.mime_type or ''} {f.name}")
1059
+
1060
+
1061
+@gws.command("fetch")
1062
+@click.argument("doc_ids", nargs=-1)
1063
+@click.option("--folder-id", type=str, default=None, help="Fetch all docs in a folder")
1064
+@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
1065
+def gws_fetch(doc_ids, folder_id, output):
1066
+ """Fetch Google Docs/Sheets/Slides as text files.
1067
+
1068
+ Examples:
1069
+
1070
+ planopticon gws fetch DOC_ID1 DOC_ID2 -o ./docs
1071
+
1072
+ planopticon gws fetch --folder-id 1abc... -o ./docs
1073
+ """
1074
+ from video_processor.sources.gws_source import GWSSource
1075
+
1076
+ source = GWSSource(folder_id=folder_id, doc_ids=list(doc_ids))
1077
+ if not source.authenticate():
1078
+ click.echo("Error: gws CLI not available or not authenticated.", err=True)
1079
+ sys.exit(1)
1080
+
1081
+ out_dir = Path(output) if output else Path.cwd() / "gws_docs"
1082
+ out_dir.mkdir(parents=True, exist_ok=True)
1083
+
1084
+ files = source.list_videos(folder_id=folder_id)
1085
+ if not files:
1086
+ click.echo("No documents found.")
1087
+ return
1088
+
1089
+ for f in files:
1090
+ safe_name = f.name.replace("/", "_").replace("\\", "_")
1091
+ dest = out_dir / f"{safe_name}.txt"
1092
+ try:
1093
+ source.download(f, dest)
1094
+ click.echo(f" ✓ {f.name} → {dest}")
1095
+ except Exception as e:
1096
+ click.echo(f" ✗ {f.name}: {e}", err=True)
1097
+
1098
+ click.echo(f"\nFetched {len(files)} document(s) to {out_dir}")
1099
+
1100
+
1101
+@gws.command("ingest")
1102
+@click.option("--folder-id", type=str, default=None, help="Drive folder ID")
1103
+@click.option("--doc-id", type=str, multiple=True, help="Specific doc IDs (repeatable)")
1104
+@click.option("--query", "-q", type=str, default=None, help="Drive search query")
1105
+@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
1106
+@click.option("--db-path", type=click.Path(), default=None, help="Existing DB to merge into")
1107
+@click.option(
1108
+ "-p",
1109
+ "--provider",
1110
+ type=click.Choice(
1111
+ [
1112
+ "auto",
1113
+ "openai",
1114
+ "anthropic",
1115
+ "gemini",
1116
+ "ollama",
1117
+ "azure",
1118
+ "together",
1119
+ "fireworks",
1120
+ "cerebras",
1121
+ "xai",
1122
+ ]
1123
+ ),
1124
+ default="auto",
1125
+ help="API provider",
1126
+)
1127
+@click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks")
1128
+@click.pass_context
1129
+def gws_ingest(ctx, folder_id, doc_id, query, output, db_path, provider, chat_model):
1130
+ """Fetch Google Workspace docs and ingest into a knowledge graph.
1131
+
1132
+ Combines gws fetch + planopticon ingest in one step.
1133
+
1134
+ Examples:
1135
+
1136
+ planopticon gws ingest --folder-id 1abc...
1137
+
1138
+ planopticon gws ingest --doc-id DOC1 --doc-id DOC2 -o ./results
1139
+
1140
+ planopticon gws ingest -q "name contains 'spec'" --db-path existing.db
1141
+ """
1142
+ import tempfile
1143
+
1144
+ from video_processor.integrators.knowledge_graph import KnowledgeGraph
1145
+ from video_processor.processors.ingest import ingest_file
1146
+ from video_processor.providers.manager import ProviderManager
1147
+ from video_processor.sources.gws_source import GWSSource
1148
+
1149
+ source = GWSSource(folder_id=folder_id, doc_ids=list(doc_id), query=query)
1150
+ if not source.authenticate():
1151
+ click.echo("Error: gws CLI not available or not authenticated.", err=True)
1152
+ click.echo("Install: npm install -g @googleworkspace/cli", err=True)
1153
+ click.echo("Auth: gws auth login", err=True)
1154
+ sys.exit(1)
1155
+
1156
+ # Fetch docs to temp dir
1157
+ files = source.list_videos(folder_id=folder_id)
1158
+ if not files:
1159
+ click.echo("No documents found.")
1160
+ return
1161
+
1162
+ click.echo(f"Found {len(files)} document(s), fetching...")
1163
+
1164
+ with tempfile.TemporaryDirectory() as tmp_dir:
1165
+ tmp_path = Path(tmp_dir)
1166
+ local_files = []
1167
+ for f in files:
1168
+ safe_name = f.name.replace("/", "_").replace("\\", "_")
1169
+ dest = tmp_path / f"{safe_name}.txt"
1170
+ try:
1171
+ source.download(f, dest)
1172
+ local_files.append(dest)
1173
+ click.echo(f" ✓ {f.name}")
1174
+ except Exception as e:
1175
+ click.echo(f" ✗ {f.name}: {e}", err=True)
1176
+
1177
+ if not local_files:
1178
+ click.echo("No documents fetched successfully.", err=True)
1179
+ sys.exit(1)
1180
+
1181
+ # Set up KG
1182
+ prov = None if provider == "auto" else provider
1183
+ pm = ProviderManager(chat_model=chat_model, provider=prov)
1184
+
1185
+ if db_path:
1186
+ kg_path = Path(db_path)
1187
+ elif output:
1188
+ out_dir = Path(output)
1189
+ out_dir.mkdir(parents=True, exist_ok=True)
1190
+ kg_path = out_dir / "knowledge_graph.db"
1191
+ else:
1192
+ kg_path = Path.cwd() / "knowledge_graph.db"
1193
+
1194
+ kg_path.parent.mkdir(parents=True, exist_ok=True)
1195
+ kg = KnowledgeGraph(provider_manager=pm, db_path=kg_path)
1196
+
1197
+ total_chunks = 0
1198
+ for lf in local_files:
1199
+ try:
1200
+ count = ingest_file(lf, kg)
1201
+ total_chunks += count
1202
+ click.echo(f" Ingested {lf.stem}: {count} chunks")
1203
+ except Exception as e:
1204
+ click.echo(f" Failed to ingest {lf.stem}: {e}", err=True)
1205
+
1206
+ kg.save(kg_path)
1207
+ kg.save(kg_path.with_suffix(".json"))
1208
+
1209
+ entity_count = kg._store.get_entity_count()
1210
+ rel_count = kg._store.get_relationship_count()
1211
+
1212
+ click.echo("\nIngestion complete:")
1213
+ click.echo(f" Documents: {len(local_files)}")
1214
+ click.echo(f" Chunks: {total_chunks}")
1215
+ click.echo(f" Entities: {entity_count}")
1216
+ click.echo(f" Relationships: {rel_count}")
1217
+ click.echo(f" Knowledge graph: {kg_path}")
1218
+
1219
+
1220
+@cli.group()
1221
+def m365():
1222
+ """Microsoft 365: fetch docs from SharePoint and OneDrive via the m365 CLI."""
1223
+ pass
1224
+
1225
+
1226
+@m365.command("list")
1227
+@click.option("--web-url", type=str, required=True, help="SharePoint site URL")
1228
+@click.option("--folder-url", type=str, required=True, help="Server-relative folder URL")
1229
+@click.option("--recursive", is_flag=True, help="Include subfolders")
1230
+@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
1231
+def m365_list(web_url, folder_url, recursive, as_json):
1232
+ """List documents in SharePoint or OneDrive.
1233
+
1234
+ Examples:
1235
+
1236
+ planopticon m365 list --web-url https://contoso.sharepoint.com/sites/proj \\
1237
+ --folder-url /sites/proj/Shared\\ Documents
1238
+
1239
+ planopticon m365 list --web-url URL --folder-url FOLDER --recursive --json
1240
+ """
1241
+ from video_processor.sources.m365_source import M365Source
1242
+
1243
+ source = M365Source(web_url=web_url, folder_url=folder_url, recursive=recursive)
1244
+ if not source.authenticate():
1245
+ click.echo("Error: m365 CLI not available or not logged in.", err=True)
1246
+ click.echo("Install: npm install -g @pnp/cli-microsoft365", err=True)
1247
+ click.echo("Auth: m365 login", err=True)
1248
+ sys.exit(1)
1249
+
1250
+ files = source.list_videos()
1251
+ if as_json:
1252
+ click.echo(json.dumps([f.model_dump() for f in files], indent=2, default=str))
1253
+ else:
1254
+ if not files:
1255
+ click.echo("No documents found.")
1256
+ return
1257
+ for f in files:
1258
+ size = f"{f.size_bytes / 1024:.0f}KB" if f.size_bytes else "—"
1259
+ click.echo(f" {f.id[:12]}… {size:>8s} {f.name}")
1260
+
1261
+
1262
+@m365.command("fetch")
1263
+@click.option("--web-url", type=str, required=True, help="SharePoint site URL")
1264
+@click.option("--folder-url", type=str, default=None, help="Server-relative folder URL")
1265
+@click.option("--file-id", type=str, multiple=True, help="Specific file IDs (repeatable)")
1266
+@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
1267
+def m365_fetch(web_url, folder_url, file_id, output):
1268
+ """Fetch SharePoint/OneDrive documents as local files.
1269
+
1270
+ Examples:
1271
+
1272
+ planopticon m365 fetch --web-url URL --folder-url FOLDER -o ./docs
1273
+
1274
+ planopticon m365 fetch --web-url URL --file-id ID1 --file-id ID2 -o ./docs
1275
+ """
1276
+ from video_processor.sources.m365_source import M365Source
1277
+
1278
+ source = M365Source(web_url=web_url, folder_url=folder_url, file_ids=list(file_id))
1279
+ if not source.authenticate():
1280
+ click.echo("Error: m365 CLI not available or not logged in.", err=True)
1281
+ sys.exit(1)
1282
+
1283
+ out_dir = Path(output) if output else Path.cwd() / "m365_docs"
1284
+ out_dir.mkdir(parents=True, exist_ok=True)
1285
+
1286
+ files = source.list_videos()
1287
+ if not files:
1288
+ click.echo("No documents found.")
1289
+ return
1290
+
1291
+ for f in files:
1292
+ dest = out_dir / f.name
1293
+ try:
1294
+ source.download(f, dest)
1295
+ click.echo(f" fetched {f.name}")
1296
+ except Exception as e:
1297
+ click.echo(f" failed {f.name}: {e}", err=True)
1298
+
1299
+ click.echo(f"\nFetched {len(files)} document(s) to {out_dir}")
1300
+
1301
+
1302
+@m365.command("ingest")
1303
+@click.option("--web-url", type=str, required=True, help="SharePoint site URL")
1304
+@click.option("--folder-url", type=str, default=None, help="Server-relative folder URL")
1305
+@click.option("--file-id", type=str, multiple=True, help="Specific file IDs (repeatable)")
1306
+@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
1307
+@click.option("--db-path", type=click.Path(), default=None, help="Existing DB to merge into")
1308
+@click.option(
1309
+ "-p",
1310
+ "--provider",
1311
+ type=click.Choice(
1312
+ [
1313
+ "auto",
1314
+ "openai",
1315
+ "anthropic",
1316
+ "gemini",
1317
+ "ollama",
1318
+ "azure",
1319
+ "together",
1320
+ "fireworks",
1321
+ "cerebras",
1322
+ "xai",
1323
+ ]
1324
+ ),
1325
+ default="auto",
1326
+ help="API provider",
1327
+)
1328
+@click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks")
1329
+@click.pass_context
1330
+def m365_ingest(ctx, web_url, folder_url, file_id, output, db_path, provider, chat_model):
1331
+ """Fetch SharePoint/OneDrive docs and ingest into a knowledge graph.
1332
+
1333
+ Examples:
1334
+
1335
+ planopticon m365 ingest --web-url URL --folder-url FOLDER
1336
+
1337
+ planopticon m365 ingest --web-url URL --file-id ID1 --file-id ID2 -o ./results
1338
+ """
1339
+ import tempfile
1340
+
1341
+ from video_processor.integrators.knowledge_graph import KnowledgeGraph
1342
+ from video_processor.processors.ingest import ingest_file
1343
+ from video_processor.providers.manager import ProviderManager
1344
+ from video_processor.sources.m365_source import M365Source
1345
+
1346
+ source = M365Source(web_url=web_url, folder_url=folder_url, file_ids=list(file_id))
1347
+ if not source.authenticate():
1348
+ click.echo("Error: m365 CLI not available or not logged in.", err=True)
1349
+ click.echo("Install: npm install -g @pnp/cli-microsoft365", err=True)
1350
+ click.echo("Auth: m365 login", err=True)
1351
+ sys.exit(1)
1352
+
1353
+ files = source.list_videos()
1354
+ if not files:
1355
+ click.echo("No documents found.")
1356
+ return
1357
+
1358
+ click.echo(f"Found {len(files)} document(s), fetching...")
1359
+
1360
+ with tempfile.TemporaryDirectory() as tmp_dir:
1361
+ tmp_path = Path(tmp_dir)
1362
+ local_files = []
1363
+ for f in files:
1364
+ dest = tmp_path / f.name
1365
+ try:
1366
+ source.download(f, dest)
1367
+ # Extract text for non-text formats
1368
+ text_dest = tmp_path / f"{Path(f.name).stem}.txt"
1369
+ text = source.download_as_text(f)
1370
+ text_dest.write_text(text, encoding="utf-8")
1371
+ local_files.append(text_dest)
1372
+ click.echo(f" fetched {f.name}")
1373
+ except Exception as e:
1374
+ click.echo(f" failed {f.name}: {e}", err=True)
1375
+
1376
+ if not local_files:
1377
+ click.echo("No documents fetched successfully.", err=True)
1378
+ sys.exit(1)
1379
+
1380
+ prov = None if provider == "auto" else provider
1381
+ pm = ProviderManager(chat_model=chat_model, provider=prov)
1382
+
1383
+ if db_path:
1384
+ kg_path = Path(db_path)
1385
+ elif output:
1386
+ out_dir = Path(output)
1387
+ out_dir.mkdir(parents=True, exist_ok=True)
1388
+ kg_path = out_dir / "knowledge_graph.db"
1389
+ else:
1390
+ kg_path = Path.cwd() / "knowledge_graph.db"
1391
+
1392
+ kg_path.parent.mkdir(parents=True, exist_ok=True)
1393
+ kg = KnowledgeGraph(provider_manager=pm, db_path=kg_path)
1394
+
1395
+ total_chunks = 0
1396
+ for lf in local_files:
1397
+ try:
1398
+ count = ingest_file(lf, kg)
1399
+ total_chunks += count
1400
+ click.echo(f" Ingested {lf.stem}: {count} chunks")
1401
+ except Exception as e:
1402
+ click.echo(f" Failed to ingest {lf.stem}: {e}", err=True)
1403
+
1404
+ kg.save(kg_path)
1405
+ kg.save(kg_path.with_suffix(".json"))
1406
+
1407
+ entity_count = kg._store.get_entity_count()
1408
+ rel_count = kg._store.get_relationship_count()
1409
+
1410
+ click.echo("\nIngestion complete:")
1411
+ click.echo(f" Documents: {len(local_files)}")
1412
+ click.echo(f" Chunks: {total_chunks}")
1413
+ click.echo(f" Entities: {entity_count}")
1414
+ click.echo(f" Relationships: {rel_count}")
1415
+ click.echo(f" Knowledge graph: {kg_path}")
1416
+
10181417
10191418
@cli.group()
10201419
def kg():
10211420
"""Knowledge graph utilities: convert, sync, and inspect."""
10221421
pass
10231422
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -1013,10 +1013,409 @@
1013 click.echo("Dropbox authentication successful.")
1014 else:
1015 click.echo("Dropbox authentication failed.", err=True)
1016 sys.exit(1)
1017
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1018
1019 @cli.group()
1020 def kg():
1021 """Knowledge graph utilities: convert, sync, and inspect."""
1022 pass
1023
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -1013,10 +1013,409 @@
1013 click.echo("Dropbox authentication successful.")
1014 else:
1015 click.echo("Dropbox authentication failed.", err=True)
1016 sys.exit(1)
1017
1018
1019 @cli.group()
1020 def gws():
1021 """Google Workspace: fetch docs, sheets, and slides via the gws CLI."""
1022 pass
1023
1024
1025 @gws.command("list")
1026 @click.option("--folder-id", type=str, default=None, help="Drive folder ID to list")
1027 @click.option("--query", "-q", type=str, default=None, help="Drive search query")
1028 @click.option("--json", "as_json", is_flag=True, help="Output as JSON")
1029 def gws_list(folder_id, query, as_json):
1030 """List documents in Google Drive.
1031
1032 Examples:
1033
1034 planopticon gws list
1035
1036 planopticon gws list --folder-id 1abc...
1037
1038 planopticon gws list -q "name contains 'PRD'" --json
1039 """
1040 from video_processor.sources.gws_source import GWSSource
1041
1042 source = GWSSource(folder_id=folder_id, query=query)
1043 if not source.authenticate():
1044 click.echo("Error: gws CLI not available or not authenticated.", err=True)
1045 click.echo("Install: npm install -g @googleworkspace/cli", err=True)
1046 click.echo("Auth: gws auth login", err=True)
1047 sys.exit(1)
1048
1049 files = source.list_videos(folder_id=folder_id)
1050 if as_json:
1051 click.echo(json.dumps([f.model_dump() for f in files], indent=2, default=str))
1052 else:
1053 if not files:
1054 click.echo("No documents found.")
1055 return
1056 for f in files:
1057 size = f"{f.size_bytes / 1024:.0f}KB" if f.size_bytes else "—"
1058 click.echo(f" {f.id[:12]}… {size:>8s} {f.mime_type or ''} {f.name}")
1059
1060
1061 @gws.command("fetch")
1062 @click.argument("doc_ids", nargs=-1)
1063 @click.option("--folder-id", type=str, default=None, help="Fetch all docs in a folder")
1064 @click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
1065 def gws_fetch(doc_ids, folder_id, output):
1066 """Fetch Google Docs/Sheets/Slides as text files.
1067
1068 Examples:
1069
1070 planopticon gws fetch DOC_ID1 DOC_ID2 -o ./docs
1071
1072 planopticon gws fetch --folder-id 1abc... -o ./docs
1073 """
1074 from video_processor.sources.gws_source import GWSSource
1075
1076 source = GWSSource(folder_id=folder_id, doc_ids=list(doc_ids))
1077 if not source.authenticate():
1078 click.echo("Error: gws CLI not available or not authenticated.", err=True)
1079 sys.exit(1)
1080
1081 out_dir = Path(output) if output else Path.cwd() / "gws_docs"
1082 out_dir.mkdir(parents=True, exist_ok=True)
1083
1084 files = source.list_videos(folder_id=folder_id)
1085 if not files:
1086 click.echo("No documents found.")
1087 return
1088
1089 for f in files:
1090 safe_name = f.name.replace("/", "_").replace("\\", "_")
1091 dest = out_dir / f"{safe_name}.txt"
1092 try:
1093 source.download(f, dest)
1094 click.echo(f" ✓ {f.name} → {dest}")
1095 except Exception as e:
1096 click.echo(f" ✗ {f.name}: {e}", err=True)
1097
1098 click.echo(f"\nFetched {len(files)} document(s) to {out_dir}")
1099
1100
1101 @gws.command("ingest")
1102 @click.option("--folder-id", type=str, default=None, help="Drive folder ID")
1103 @click.option("--doc-id", type=str, multiple=True, help="Specific doc IDs (repeatable)")
1104 @click.option("--query", "-q", type=str, default=None, help="Drive search query")
1105 @click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
1106 @click.option("--db-path", type=click.Path(), default=None, help="Existing DB to merge into")
1107 @click.option(
1108 "-p",
1109 "--provider",
1110 type=click.Choice(
1111 [
1112 "auto",
1113 "openai",
1114 "anthropic",
1115 "gemini",
1116 "ollama",
1117 "azure",
1118 "together",
1119 "fireworks",
1120 "cerebras",
1121 "xai",
1122 ]
1123 ),
1124 default="auto",
1125 help="API provider",
1126 )
1127 @click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks")
1128 @click.pass_context
1129 def gws_ingest(ctx, folder_id, doc_id, query, output, db_path, provider, chat_model):
1130 """Fetch Google Workspace docs and ingest into a knowledge graph.
1131
1132 Combines gws fetch + planopticon ingest in one step.
1133
1134 Examples:
1135
1136 planopticon gws ingest --folder-id 1abc...
1137
1138 planopticon gws ingest --doc-id DOC1 --doc-id DOC2 -o ./results
1139
1140 planopticon gws ingest -q "name contains 'spec'" --db-path existing.db
1141 """
1142 import tempfile
1143
1144 from video_processor.integrators.knowledge_graph import KnowledgeGraph
1145 from video_processor.processors.ingest import ingest_file
1146 from video_processor.providers.manager import ProviderManager
1147 from video_processor.sources.gws_source import GWSSource
1148
1149 source = GWSSource(folder_id=folder_id, doc_ids=list(doc_id), query=query)
1150 if not source.authenticate():
1151 click.echo("Error: gws CLI not available or not authenticated.", err=True)
1152 click.echo("Install: npm install -g @googleworkspace/cli", err=True)
1153 click.echo("Auth: gws auth login", err=True)
1154 sys.exit(1)
1155
1156 # Fetch docs to temp dir
1157 files = source.list_videos(folder_id=folder_id)
1158 if not files:
1159 click.echo("No documents found.")
1160 return
1161
1162 click.echo(f"Found {len(files)} document(s), fetching...")
1163
1164 with tempfile.TemporaryDirectory() as tmp_dir:
1165 tmp_path = Path(tmp_dir)
1166 local_files = []
1167 for f in files:
1168 safe_name = f.name.replace("/", "_").replace("\\", "_")
1169 dest = tmp_path / f"{safe_name}.txt"
1170 try:
1171 source.download(f, dest)
1172 local_files.append(dest)
1173 click.echo(f" ✓ {f.name}")
1174 except Exception as e:
1175 click.echo(f" ✗ {f.name}: {e}", err=True)
1176
1177 if not local_files:
1178 click.echo("No documents fetched successfully.", err=True)
1179 sys.exit(1)
1180
1181 # Set up KG
1182 prov = None if provider == "auto" else provider
1183 pm = ProviderManager(chat_model=chat_model, provider=prov)
1184
1185 if db_path:
1186 kg_path = Path(db_path)
1187 elif output:
1188 out_dir = Path(output)
1189 out_dir.mkdir(parents=True, exist_ok=True)
1190 kg_path = out_dir / "knowledge_graph.db"
1191 else:
1192 kg_path = Path.cwd() / "knowledge_graph.db"
1193
1194 kg_path.parent.mkdir(parents=True, exist_ok=True)
1195 kg = KnowledgeGraph(provider_manager=pm, db_path=kg_path)
1196
1197 total_chunks = 0
1198 for lf in local_files:
1199 try:
1200 count = ingest_file(lf, kg)
1201 total_chunks += count
1202 click.echo(f" Ingested {lf.stem}: {count} chunks")
1203 except Exception as e:
1204 click.echo(f" Failed to ingest {lf.stem}: {e}", err=True)
1205
1206 kg.save(kg_path)
1207 kg.save(kg_path.with_suffix(".json"))
1208
1209 entity_count = kg._store.get_entity_count()
1210 rel_count = kg._store.get_relationship_count()
1211
1212 click.echo("\nIngestion complete:")
1213 click.echo(f" Documents: {len(local_files)}")
1214 click.echo(f" Chunks: {total_chunks}")
1215 click.echo(f" Entities: {entity_count}")
1216 click.echo(f" Relationships: {rel_count}")
1217 click.echo(f" Knowledge graph: {kg_path}")
1218
1219
1220 @cli.group()
1221 def m365():
1222 """Microsoft 365: fetch docs from SharePoint and OneDrive via the m365 CLI."""
1223 pass
1224
1225
1226 @m365.command("list")
1227 @click.option("--web-url", type=str, required=True, help="SharePoint site URL")
1228 @click.option("--folder-url", type=str, required=True, help="Server-relative folder URL")
1229 @click.option("--recursive", is_flag=True, help="Include subfolders")
1230 @click.option("--json", "as_json", is_flag=True, help="Output as JSON")
1231 def m365_list(web_url, folder_url, recursive, as_json):
1232 """List documents in SharePoint or OneDrive.
1233
1234 Examples:
1235
1236 planopticon m365 list --web-url https://contoso.sharepoint.com/sites/proj \\
1237 --folder-url /sites/proj/Shared\\ Documents
1238
1239 planopticon m365 list --web-url URL --folder-url FOLDER --recursive --json
1240 """
1241 from video_processor.sources.m365_source import M365Source
1242
1243 source = M365Source(web_url=web_url, folder_url=folder_url, recursive=recursive)
1244 if not source.authenticate():
1245 click.echo("Error: m365 CLI not available or not logged in.", err=True)
1246 click.echo("Install: npm install -g @pnp/cli-microsoft365", err=True)
1247 click.echo("Auth: m365 login", err=True)
1248 sys.exit(1)
1249
1250 files = source.list_videos()
1251 if as_json:
1252 click.echo(json.dumps([f.model_dump() for f in files], indent=2, default=str))
1253 else:
1254 if not files:
1255 click.echo("No documents found.")
1256 return
1257 for f in files:
1258 size = f"{f.size_bytes / 1024:.0f}KB" if f.size_bytes else "—"
1259 click.echo(f" {f.id[:12]}… {size:>8s} {f.name}")
1260
1261
@m365.command("fetch")
@click.option("--web-url", type=str, required=True, help="SharePoint site URL")
@click.option("--folder-url", type=str, default=None, help="Server-relative folder URL")
@click.option("--file-id", type=str, multiple=True, help="Specific file IDs (repeatable)")
@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
def m365_fetch(web_url, folder_url, file_id, output):
    """Fetch SharePoint/OneDrive documents as local files.

    Examples:

        planopticon m365 fetch --web-url URL --folder-url FOLDER -o ./docs

        planopticon m365 fetch --web-url URL --file-id ID1 --file-id ID2 -o ./docs
    """
    from video_processor.sources.m365_source import M365Source

    source = M365Source(web_url=web_url, folder_url=folder_url, file_ids=list(file_id))
    if not source.authenticate():
        click.echo("Error: m365 CLI not available or not logged in.", err=True)
        sys.exit(1)

    # Default to ./m365_docs when no explicit output directory is given.
    out_dir = Path(output) if output else Path.cwd() / "m365_docs"
    out_dir.mkdir(parents=True, exist_ok=True)

    files = source.list_videos()
    if not files:
        click.echo("No documents found.")
        return

    for f in files:
        # Sanitize the remote name so a path separator embedded in it cannot
        # escape out_dir (same treatment as the gws fetch command).
        safe_name = f.name.replace("/", "_").replace("\\", "_")
        dest = out_dir / safe_name
        try:
            source.download(f, dest)
            click.echo(f" fetched {f.name}")
        except Exception as e:
            # Best-effort: report the failure and continue with the next file.
            click.echo(f" failed {f.name}: {e}", err=True)

    click.echo(f"\nFetched {len(files)} document(s) to {out_dir}")
1300
1301
1302 @m365.command("ingest")
1303 @click.option("--web-url", type=str, required=True, help="SharePoint site URL")
1304 @click.option("--folder-url", type=str, default=None, help="Server-relative folder URL")
1305 @click.option("--file-id", type=str, multiple=True, help="Specific file IDs (repeatable)")
1306 @click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
1307 @click.option("--db-path", type=click.Path(), default=None, help="Existing DB to merge into")
1308 @click.option(
1309 "-p",
1310 "--provider",
1311 type=click.Choice(
1312 [
1313 "auto",
1314 "openai",
1315 "anthropic",
1316 "gemini",
1317 "ollama",
1318 "azure",
1319 "together",
1320 "fireworks",
1321 "cerebras",
1322 "xai",
1323 ]
1324 ),
1325 default="auto",
1326 help="API provider",
1327 )
1328 @click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks")
1329 @click.pass_context
1330 def m365_ingest(ctx, web_url, folder_url, file_id, output, db_path, provider, chat_model):
1331 """Fetch SharePoint/OneDrive docs and ingest into a knowledge graph.
1332
1333 Examples:
1334
1335 planopticon m365 ingest --web-url URL --folder-url FOLDER
1336
1337 planopticon m365 ingest --web-url URL --file-id ID1 --file-id ID2 -o ./results
1338 """
1339 import tempfile
1340
1341 from video_processor.integrators.knowledge_graph import KnowledgeGraph
1342 from video_processor.processors.ingest import ingest_file
1343 from video_processor.providers.manager import ProviderManager
1344 from video_processor.sources.m365_source import M365Source
1345
1346 source = M365Source(web_url=web_url, folder_url=folder_url, file_ids=list(file_id))
1347 if not source.authenticate():
1348 click.echo("Error: m365 CLI not available or not logged in.", err=True)
1349 click.echo("Install: npm install -g @pnp/cli-microsoft365", err=True)
1350 click.echo("Auth: m365 login", err=True)
1351 sys.exit(1)
1352
1353 files = source.list_videos()
1354 if not files:
1355 click.echo("No documents found.")
1356 return
1357
1358 click.echo(f"Found {len(files)} document(s), fetching...")
1359
1360 with tempfile.TemporaryDirectory() as tmp_dir:
1361 tmp_path = Path(tmp_dir)
1362 local_files = []
1363 for f in files:
1364 dest = tmp_path / f.name
1365 try:
1366 source.download(f, dest)
1367 # Extract text for non-text formats
1368 text_dest = tmp_path / f"{Path(f.name).stem}.txt"
1369 text = source.download_as_text(f)
1370 text_dest.write_text(text, encoding="utf-8")
1371 local_files.append(text_dest)
1372 click.echo(f" fetched {f.name}")
1373 except Exception as e:
1374 click.echo(f" failed {f.name}: {e}", err=True)
1375
1376 if not local_files:
1377 click.echo("No documents fetched successfully.", err=True)
1378 sys.exit(1)
1379
1380 prov = None if provider == "auto" else provider
1381 pm = ProviderManager(chat_model=chat_model, provider=prov)
1382
1383 if db_path:
1384 kg_path = Path(db_path)
1385 elif output:
1386 out_dir = Path(output)
1387 out_dir.mkdir(parents=True, exist_ok=True)
1388 kg_path = out_dir / "knowledge_graph.db"
1389 else:
1390 kg_path = Path.cwd() / "knowledge_graph.db"
1391
1392 kg_path.parent.mkdir(parents=True, exist_ok=True)
1393 kg = KnowledgeGraph(provider_manager=pm, db_path=kg_path)
1394
1395 total_chunks = 0
1396 for lf in local_files:
1397 try:
1398 count = ingest_file(lf, kg)
1399 total_chunks += count
1400 click.echo(f" Ingested {lf.stem}: {count} chunks")
1401 except Exception as e:
1402 click.echo(f" Failed to ingest {lf.stem}: {e}", err=True)
1403
1404 kg.save(kg_path)
1405 kg.save(kg_path.with_suffix(".json"))
1406
1407 entity_count = kg._store.get_entity_count()
1408 rel_count = kg._store.get_relationship_count()
1409
1410 click.echo("\nIngestion complete:")
1411 click.echo(f" Documents: {len(local_files)}")
1412 click.echo(f" Chunks: {total_chunks}")
1413 click.echo(f" Entities: {entity_count}")
1414 click.echo(f" Relationships: {rel_count}")
1415 click.echo(f" Knowledge graph: {kg_path}")
1416
1417
1418 @cli.group()
1419 def kg():
1420 """Knowledge graph utilities: convert, sync, and inspect."""
1421 pass
1422
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -11,10 +11,12 @@
1111
"HackerNewsSource",
1212
"PodcastSource",
1313
"RedditSource",
1414
"RSSSource",
1515
"TwitterSource",
16
+ "GWSSource",
17
+ "M365Source",
1618
"WebSource",
1719
"YouTubeSource",
1820
]
1921
2022
@@ -22,10 +24,12 @@
2224
"""Lazy imports to avoid pulling in optional dependencies at import time."""
2325
_lazy_map = {
2426
"ArxivSource": "video_processor.sources.arxiv_source",
2527
"GitHubSource": "video_processor.sources.github_source",
2628
"GoogleDriveSource": "video_processor.sources.google_drive",
29
+ "GWSSource": "video_processor.sources.gws_source",
30
+ "M365Source": "video_processor.sources.m365_source",
2731
"HackerNewsSource": "video_processor.sources.hackernews_source",
2832
"PodcastSource": "video_processor.sources.podcast_source",
2933
"RedditSource": "video_processor.sources.reddit_source",
3034
"RSSSource": "video_processor.sources.rss_source",
3135
"TwitterSource": "video_processor.sources.twitter_source",
3236
3337
ADDED video_processor/sources/gws_source.py
3438
ADDED video_processor/sources/m365_source.py
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -11,10 +11,12 @@
11 "HackerNewsSource",
12 "PodcastSource",
13 "RedditSource",
14 "RSSSource",
15 "TwitterSource",
16 "GWSSource",
17 "M365Source",
16 "WebSource",
17 "YouTubeSource",
18 ]
19
20
@@ -22,10 +24,12 @@
22 """Lazy imports to avoid pulling in optional dependencies at import time."""
23 _lazy_map = {
24 "ArxivSource": "video_processor.sources.arxiv_source",
25 "GitHubSource": "video_processor.sources.github_source",
26 "GoogleDriveSource": "video_processor.sources.google_drive",
 
 
27 "HackerNewsSource": "video_processor.sources.hackernews_source",
28 "PodcastSource": "video_processor.sources.podcast_source",
29 "RedditSource": "video_processor.sources.reddit_source",
30 "RSSSource": "video_processor.sources.rss_source",
31 "TwitterSource": "video_processor.sources.twitter_source",
32
33 ADDED video_processor/sources/gws_source.py
34 ADDED video_processor/sources/m365_source.py
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -11,10 +11,12 @@
11 "HackerNewsSource",
12 "PodcastSource",
13 "RedditSource",
14 "RSSSource",
15 "TwitterSource",
16 "GWSSource",
17 "M365Source",
18 "WebSource",
19 "YouTubeSource",
20 ]
21
22
@@ -22,10 +24,12 @@
24 """Lazy imports to avoid pulling in optional dependencies at import time."""
25 _lazy_map = {
26 "ArxivSource": "video_processor.sources.arxiv_source",
27 "GitHubSource": "video_processor.sources.github_source",
28 "GoogleDriveSource": "video_processor.sources.google_drive",
29 "GWSSource": "video_processor.sources.gws_source",
30 "M365Source": "video_processor.sources.m365_source",
31 "HackerNewsSource": "video_processor.sources.hackernews_source",
32 "PodcastSource": "video_processor.sources.podcast_source",
33 "RedditSource": "video_processor.sources.reddit_source",
34 "RSSSource": "video_processor.sources.rss_source",
35 "TwitterSource": "video_processor.sources.twitter_source",
36
37 ADDED video_processor/sources/gws_source.py
38 ADDED video_processor/sources/m365_source.py
--- a/video_processor/sources/gws_source.py
+++ b/video_processor/sources/gws_source.py
@@ -0,0 +1,268 @@
1
+"""Google Workspace source connector using the gws CLI (googleworkspace/cli).
2
+
3
+Fetches and collates Google Docs, Sheets, Slides, and other Drive files
4
+via the `gws` CLI tool. Outputs plain text suitable for KG ingestion.
5
+
6
+Requires: npm install -g @googleworkspace/cli
7
+Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless)
8
+"""
9
+
10
+import json
11
+import logging
12
+import shutil
13
+import subprocess
14
+from pathlib import Path
15
+from typing import Any, Dict, List, Optional
16
+
17
+from video_processor.sources.base import BaseSource, SourceFile
18
+
19
+logger = logging.getLogger(__name__)
20
+
21
+# Google Workspace MIME types we can extract text from
22
+_DOC_MIMES = {
23
+ "application/vnd.google-apps.document",
24
+ "application/vnd.google-apps.spreadsheet",
25
+ "application/vnd.google-apps.presentation",
26
+ "application/pdf",
27
+ "text/plain",
28
+ "text/markdown",
29
+ "text/html",
30
+}
31
+
32
+# Export MIME mappings for native Google formats
33
+_EXPORT_MIMES = {
34
+ "application/vnd.google-apps.document": "text/plain",
35
+ "application/vnd.google-apps.spreadsheet": "text/csv",
36
+ "application/vnd.google-apps.presentation": "text/plain",
37
+}
38
+
39
+
40
+def _run_gws(args: List[str], timeout: int = 30) -> Dict[str, Any]:
41
+ """Run a gws CLI command and return parsed JSON output."""
42
+ cmd = ["gws"] + args
43
+ proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
44
+ if proc.returncode != 0:
45
+ raise RuntimeError(f"gws {' '.join(args)} failed: {proc.stderr.strip()}")
46
+ try:
47
+ return json.loads(proc.stdout)
48
+ except json.JSONDecodeError:
49
+ return {"raw": proc.stdout.strip()}
50
+
51
+
52
+class GWSSource(BaseSource):
53
+ """
54
+ Fetch documents from Google Workspace (Drive, Docs, Sheets, Slides) via gws CLI.
55
+
56
+ Usage:
57
+ source = GWSSource(folder_id="1abc...") # specific Drive folder
58
+ source = GWSSource(query="type:document") # Drive search query
59
+ files = source.list_videos() # lists docs, not just videos
60
+ source.download_all(files, Path("./docs"))
61
+ """
62
+
63
+ def __init__(
64
+ self,
65
+ folder_id: Optional[str] = None,
66
+ query: Optional[str] = None,
67
+ doc_ids: Optional[List[str]] = None,
68
+ mime_filter: Optional[List[str]] = None,
69
+ ):
70
+ self.folder_id = folder_id
71
+ self.query = query
72
+ self.doc_ids = doc_ids or []
73
+ self.mime_filter = set(mime_filter) if mime_filter else _DOC_MIMES
74
+
75
+ def authenticate(self) -> bool:
76
+ """Check if gws CLI is installed and authenticated."""
77
+ if not shutil.which("gws"):
78
+ logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
79
+ return False
80
+ try:
81
+ _run_gws(["auth", "status"], timeout=10)
82
+ return True
83
+ except (RuntimeError, subprocess.TimeoutExpired):
84
+ logger.error("gws not authenticated. Run: gws auth login")
85
+ return False
86
+
87
+ def list_videos(
88
+ self,
89
+ folder_id: Optional[str] = None,
90
+ folder_path: Optional[str] = None,
91
+ patterns: Optional[List[str]] = None,
92
+ ) -> List[SourceFile]:
93
+ """List documents in Drive. Despite the method name, returns docs not just videos."""
94
+ folder = folder_id or self.folder_id
95
+ files: List[SourceFile] = []
96
+
97
+ # If specific doc IDs were provided, fetch metadata for each
98
+ if self.doc_ids:
99
+ for doc_id in self.doc_ids:
100
+ try:
101
+ result = _run_gws(
102
+ [
103
+ "drive",
104
+ "files",
105
+ "get",
106
+ "--params",
107
+ json.dumps(
108
+ {"fileId": doc_id, "fields": "id,name,mimeType,size,modifiedTime"}
109
+ ),
110
+ ]
111
+ )
112
+ files.append(_result_to_source_file(result))
113
+ except RuntimeError as e:
114
+ logger.warning(f"Failed to fetch doc {doc_id}: {e}")
115
+ return files
116
+
117
+ # Build Drive files list query
118
+ params: Dict[str, Any] = {
119
+ "pageSize": 100,
120
+ "fields": "files(id,name,mimeType,size,modifiedTime)",
121
+ }
122
+
123
+ q_parts = []
124
+ if folder:
125
+ q_parts.append(f"'{folder}' in parents")
126
+ if self.query:
127
+ q_parts.append(self.query)
128
+ # Filter to document types
129
+ mime_clauses = [f"mimeType='{m}'" for m in self.mime_filter]
130
+ if mime_clauses:
131
+ q_parts.append(f"({' or '.join(mime_clauses)})")
132
+ if q_parts:
133
+ params["q"] = " and ".join(q_parts)
134
+
135
+ try:
136
+ result = _run_gws(
137
+ [
138
+ "drive",
139
+ "files",
140
+ "list",
141
+ "--params",
142
+ json.dumps(params),
143
+ ],
144
+ timeout=60,
145
+ )
146
+ except RuntimeError as e:
147
+ logger.error(f"Failed to list Drive files: {e}")
148
+ return []
149
+
150
+ for item in result.get("files", []):
151
+ files.append(_result_to_source_file(item))
152
+
153
+ logger.info(f"Found {len(files)} document(s) in Google Drive")
154
+ return files
155
+
156
+ def download(self, file: SourceFile, destination: Path) -> Path:
157
+ """Download/export a document to a local text file."""
158
+ destination = Path(destination)
159
+ destination.parent.mkdir(parents=True, exist_ok=True)
160
+
161
+ mime = file.mime_type or ""
162
+
163
+ # Native Google format — export as text
164
+ if mime in _EXPORT_MIMES:
165
+ content = self._export_doc(file.id, mime)
166
+ # Regular file — download directly
167
+ else:
168
+ content = self._download_file(file.id)
169
+
170
+ destination.write_text(content, encoding="utf-8")
171
+ logger.info(f"Saved {file.name} to {destination}")
172
+ return destination
173
+
174
+ def _export_doc(self, file_id: str, source_mime: str) -> str:
175
+ """Export a native Google doc to text via gws."""
176
+ export_mime = _EXPORT_MIMES.get(source_mime, "text/plain")
177
+ try:
178
+ result = _run_gws(
179
+ [
180
+ "drive",
181
+ "files",
182
+ "export",
183
+ "--params",
184
+ json.dumps({"fileId": file_id, "mimeType": export_mime}),
185
+ ],
186
+ timeout=60,
187
+ )
188
+ return result.get("raw", json.dumps(result, indent=2))
189
+ except RuntimeError:
190
+ # Fallback: try getting via Docs API for Google Docs
191
+ if source_mime == "application/vnd.google-apps.document":
192
+ return self._get_doc_text(file_id)
193
+ raise
194
+
195
+ def _get_doc_text(self, doc_id: str) -> str:
196
+ """Fetch Google Doc content via the Docs API and extract text."""
197
+ result = _run_gws(
198
+ [
199
+ "docs",
200
+ "documents",
201
+ "get",
202
+ "--params",
203
+ json.dumps({"documentId": doc_id}),
204
+ ],
205
+ timeout=60,
206
+ )
207
+
208
+ # Extract text from the Docs API structural response
209
+ body = result.get("body", {})
210
+ content_parts = []
211
+ for element in body.get("content", []):
212
+ paragraph = element.get("paragraph", {})
213
+ for pe in paragraph.get("elements", []):
214
+ text_run = pe.get("textRun", {})
215
+ text = text_run.get("content", "")
216
+ if text.strip():
217
+ content_parts.append(text)
218
+
219
+ return "".join(content_parts) if content_parts else json.dumps(result, indent=2)
220
+
221
+ def _download_file(self, file_id: str) -> str:
222
+ """Download a non-native file's content."""
223
+ result = _run_gws(
224
+ [
225
+ "drive",
226
+ "files",
227
+ "get",
228
+ "--params",
229
+ json.dumps({"fileId": file_id, "alt": "media"}),
230
+ ],
231
+ timeout=60,
232
+ )
233
+ return result.get("raw", json.dumps(result, indent=2))
234
+
235
+ def fetch_all_text(self, folder_id: Optional[str] = None) -> Dict[str, str]:
236
+ """Convenience: list all docs and return {filename: text_content} dict."""
237
+ files = self.list_videos(folder_id=folder_id)
238
+ results = {}
239
+ for f in files:
240
+ try:
241
+ if f.mime_type and f.mime_type in _EXPORT_MIMES:
242
+ results[f.name] = self._export_doc(f.id, f.mime_type)
243
+ else:
244
+ results[f.name] = self._download_file(f.id)
245
+ except Exception as e:
246
+ logger.warning(f"Failed to fetch {f.name}: {e}")
247
+ results[f.name] = f"[Error: {e}]"
248
+ return results
249
+
250
+ def collate(self, folder_id: Optional[str] = None, separator: str = "\n\n---\n\n") -> str:
251
+ """Fetch all docs and collate into a single text blob for ingestion."""
252
+ docs = self.fetch_all_text(folder_id=folder_id)
253
+ parts = []
254
+ for name, content in docs.items():
255
+ parts.append(f"# {name}\n\n{content}")
256
+ return separator.join(parts)
257
+
258
+
259
+def _result_to_source_file(item: dict) -> SourceFile:
260
+ """Convert a Drive API file result to SourceFile."""
261
+ size = item.get("size")
262
+ return SourceFile(
263
+ name=item.get("name", "Untitled"),
264
+ id=item.get("id", ""),
265
+ size_bytes=int(size) if size else None,
266
+ mime_type=item.get("mimeType"),
267
+ modified_at=item.get("modifiedTime"),
268
+ )
--- a/video_processor/sources/gws_source.py
+++ b/video_processor/sources/gws_source.py
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/gws_source.py
+++ b/video_processor/sources/gws_source.py
@@ -0,0 +1,268 @@
1 """Google Workspace source connector using the gws CLI (googleworkspace/cli).
2
3 Fetches and collates Google Docs, Sheets, Slides, and other Drive files
4 via the `gws` CLI tool. Outputs plain text suitable for KG ingestion.
5
6 Requires: npm install -g @googleworkspace/cli
7 Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless)
8 """
9
10 import json
11 import logging
12 import shutil
13 import subprocess
14 from pathlib import Path
15 from typing import Any, Dict, List, Optional
16
17 from video_processor.sources.base import BaseSource, SourceFile
18
19 logger = logging.getLogger(__name__)
20
21 # Google Workspace MIME types we can extract text from
22 _DOC_MIMES = {
23 "application/vnd.google-apps.document",
24 "application/vnd.google-apps.spreadsheet",
25 "application/vnd.google-apps.presentation",
26 "application/pdf",
27 "text/plain",
28 "text/markdown",
29 "text/html",
30 }
31
32 # Export MIME mappings for native Google formats
33 _EXPORT_MIMES = {
34 "application/vnd.google-apps.document": "text/plain",
35 "application/vnd.google-apps.spreadsheet": "text/csv",
36 "application/vnd.google-apps.presentation": "text/plain",
37 }
38
39
def _run_gws(args: List[str], timeout: int = 30) -> Dict[str, Any]:
    """Run a gws CLI command and return its parsed JSON output.

    Args:
        args: Arguments passed to the ``gws`` executable,
            e.g. ``["drive", "files", "list"]``.
        timeout: Seconds to wait before the subprocess is aborted.

    Returns:
        The decoded JSON payload, or ``{"raw": <stdout>}`` when the CLI
        emits non-JSON output (e.g. plain-text document exports).

    Raises:
        RuntimeError: If the CLI is not installed or exits non-zero.
        subprocess.TimeoutExpired: If the command exceeds *timeout*.
    """
    cmd = ["gws"] + args
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except FileNotFoundError as e:
        # Normalize a missing binary to RuntimeError so callers only need to
        # catch one failure type (authenticate() catches RuntimeError, not
        # FileNotFoundError).
        raise RuntimeError(
            "gws CLI not found. Install with: npm install -g @googleworkspace/cli"
        ) from e
    if proc.returncode != 0:
        raise RuntimeError(f"gws {' '.join(args)} failed: {proc.stderr.strip()}")
    try:
        return json.loads(proc.stdout)
    except json.JSONDecodeError:
        # Non-JSON output (e.g. exported document text) is wrapped under "raw".
        return {"raw": proc.stdout.strip()}
50
51
class GWSSource(BaseSource):
    """
    Fetch documents from Google Workspace (Drive, Docs, Sheets, Slides) via gws CLI.

    Usage:
        source = GWSSource(folder_id="1abc...")  # specific Drive folder
        source = GWSSource(query="type:document")  # Drive search query
        files = source.list_videos()  # lists docs, not just videos
        source.download_all(files, Path("./docs"))
    """

    def __init__(
        self,
        folder_id: Optional[str] = None,
        query: Optional[str] = None,
        doc_ids: Optional[List[str]] = None,
        mime_filter: Optional[List[str]] = None,
    ):
        # folder_id: restrict listing to this Drive folder.
        # query: extra Drive search clause ANDed into the listing query.
        # doc_ids: explicit file IDs to fetch instead of listing a folder.
        # mime_filter: MIME types to include when listing; defaults to _DOC_MIMES.
        self.folder_id = folder_id
        self.query = query
        self.doc_ids = doc_ids or []
        # Copy the module-level default so mutating self.mime_filter on one
        # instance can never corrupt _DOC_MIMES for every other instance
        # (the original assigned the shared set object directly).
        self.mime_filter = set(mime_filter) if mime_filter else set(_DOC_MIMES)

    def authenticate(self) -> bool:
        """Check if gws CLI is installed and authenticated."""
        if not shutil.which("gws"):
            logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
            return False
        try:
            _run_gws(["auth", "status"], timeout=10)
            return True
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("gws not authenticated. Run: gws auth login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List documents in Drive. Despite the method name, returns docs not just videos.

        folder_id overrides self.folder_id; folder_path and patterns are
        accepted for interface compatibility but unused here.
        """
        folder = folder_id or self.folder_id
        files: List[SourceFile] = []

        # If specific doc IDs were provided, fetch metadata for each and skip listing.
        if self.doc_ids:
            for doc_id in self.doc_ids:
                try:
                    result = _run_gws(
                        [
                            "drive",
                            "files",
                            "get",
                            "--params",
                            json.dumps(
                                {"fileId": doc_id, "fields": "id,name,mimeType,size,modifiedTime"}
                            ),
                        ]
                    )
                    files.append(_result_to_source_file(result))
                except RuntimeError as e:
                    logger.warning(f"Failed to fetch doc {doc_id}: {e}")
            return files

        # Build Drive files list query
        params: Dict[str, Any] = {
            "pageSize": 100,
            "fields": "files(id,name,mimeType,size,modifiedTime)",
        }

        q_parts = []
        if folder:
            q_parts.append(f"'{folder}' in parents")
        if self.query:
            q_parts.append(self.query)
        # Filter to document types
        mime_clauses = [f"mimeType='{m}'" for m in self.mime_filter]
        if mime_clauses:
            q_parts.append(f"({' or '.join(mime_clauses)})")
        if q_parts:
            params["q"] = " and ".join(q_parts)

        try:
            result = _run_gws(
                [
                    "drive",
                    "files",
                    "list",
                    "--params",
                    json.dumps(params),
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.error(f"Failed to list Drive files: {e}")
            return []

        for item in result.get("files", []):
            files.append(_result_to_source_file(item))

        logger.info(f"Found {len(files)} document(s) in Google Drive")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download/export a document to a local text file.

        NOTE(review): content is always written as UTF-8 text; non-text
        payloads (e.g. raw PDFs fetched via alt=media) will not round-trip
        as binary — confirm whether binary downloads are needed here.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        mime = file.mime_type or ""

        # Native Google format — export as text
        if mime in _EXPORT_MIMES:
            content = self._export_doc(file.id, mime)
        # Regular file — download directly
        else:
            content = self._download_file(file.id)

        destination.write_text(content, encoding="utf-8")
        logger.info(f"Saved {file.name} to {destination}")
        return destination

    def _export_doc(self, file_id: str, source_mime: str) -> str:
        """Export a native Google doc to text via gws."""
        export_mime = _EXPORT_MIMES.get(source_mime, "text/plain")
        try:
            result = _run_gws(
                [
                    "drive",
                    "files",
                    "export",
                    "--params",
                    json.dumps({"fileId": file_id, "mimeType": export_mime}),
                ],
                timeout=60,
            )
            return result.get("raw", json.dumps(result, indent=2))
        except RuntimeError:
            # Fallback: try getting via Docs API for Google Docs
            if source_mime == "application/vnd.google-apps.document":
                return self._get_doc_text(file_id)
            raise

    def _get_doc_text(self, doc_id: str) -> str:
        """Fetch Google Doc content via the Docs API and extract text."""
        result = _run_gws(
            [
                "docs",
                "documents",
                "get",
                "--params",
                json.dumps({"documentId": doc_id}),
            ],
            timeout=60,
        )

        # Extract text from the Docs API structural response: body.content is a
        # list of structural elements; paragraph text lives in textRun nodes.
        body = result.get("body", {})
        content_parts = []
        for element in body.get("content", []):
            paragraph = element.get("paragraph", {})
            for pe in paragraph.get("elements", []):
                text_run = pe.get("textRun", {})
                text = text_run.get("content", "")
                if text.strip():
                    content_parts.append(text)

        # Fall back to the raw JSON so the caller still gets something useful.
        return "".join(content_parts) if content_parts else json.dumps(result, indent=2)

    def _download_file(self, file_id: str) -> str:
        """Download a non-native file's content via alt=media."""
        result = _run_gws(
            [
                "drive",
                "files",
                "get",
                "--params",
                json.dumps({"fileId": file_id, "alt": "media"}),
            ],
            timeout=60,
        )
        return result.get("raw", json.dumps(result, indent=2))

    def fetch_all_text(self, folder_id: Optional[str] = None) -> Dict[str, str]:
        """Convenience: list all docs and return {filename: text_content} dict.

        Per-file failures are captured as "[Error: ...]" values rather than
        aborting the whole fetch.
        """
        files = self.list_videos(folder_id=folder_id)
        results = {}
        for f in files:
            try:
                if f.mime_type and f.mime_type in _EXPORT_MIMES:
                    results[f.name] = self._export_doc(f.id, f.mime_type)
                else:
                    results[f.name] = self._download_file(f.id)
            except Exception as e:
                logger.warning(f"Failed to fetch {f.name}: {e}")
                results[f.name] = f"[Error: {e}]"
        return results

    def collate(self, folder_id: Optional[str] = None, separator: str = "\n\n---\n\n") -> str:
        """Fetch all docs and collate into a single text blob for ingestion."""
        docs = self.fetch_all_text(folder_id=folder_id)
        parts = []
        for name, content in docs.items():
            parts.append(f"# {name}\n\n{content}")
        return separator.join(parts)
257
258
def _result_to_source_file(item: dict) -> SourceFile:
    """Build a SourceFile from a Drive API file resource dict."""
    raw_size = item.get("size")
    # Drive reports size as a string; absent/empty means size is unknown.
    size_bytes = int(raw_size) if raw_size else None
    return SourceFile(
        name=item.get("name", "Untitled"),
        id=item.get("id", ""),
        size_bytes=size_bytes,
        mime_type=item.get("mimeType"),
        modified_at=item.get("modifiedTime"),
    )
--- a/video_processor/sources/m365_source.py
+++ b/video_processor/sources/m365_source.py
@@ -0,0 +1,310 @@
1
+"""Microsoft 365 source connector using the m365 CLI (cli-microsoft365).
2
+
3
+Fetches documents from SharePoint and OneDrive via the `m365` CLI tool.
4
+Outputs plain text suitable for KG ingestion.
5
+
6
+Requires: npm install -g @pnp/cli-microsoft365
7
+Auth: m365 login (interactive)
8
+Docs: https://pnp.github.io/cli-microsoft365/
9
+"""
10
+
11
+import json
12
+import logging
13
+import shutil
14
+import subprocess
15
+import tempfile
16
+from pathlib import Path
17
+from typing import Any, Dict, List, Optional
18
+
19
+from video_processor.sources.base import BaseSource, SourceFile
20
+
21
+logger = logging.getLogger(__name__)
22
+
23
+# Document MIME types we can extract text from
24
+_DOC_EXTENSIONS = {
25
+ ".docx",
26
+ ".doc",
27
+ ".xlsx",
28
+ ".xls",
29
+ ".pptx",
30
+ ".ppt",
31
+ ".pdf",
32
+ ".txt",
33
+ ".md",
34
+ ".csv",
35
+ ".html",
36
+ ".htm",
37
+}
38
+
39
+
40
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Run an m365 CLI command and return parsed JSON output.

    Appends ``--output json`` so the CLI emits machine-readable results.

    Args:
        args: Arguments passed to the ``m365`` executable.
        timeout: Seconds to wait before the subprocess is aborted.

    Returns:
        The decoded JSON value (dict, list, ...), or the raw stdout string
        when the output is not valid JSON.

    Raises:
        RuntimeError: If the CLI is not installed or exits non-zero.
        subprocess.TimeoutExpired: If the command exceeds *timeout*.
    """
    cmd = ["m365"] + args + ["--output", "json"]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except FileNotFoundError as e:
        # Normalize a missing binary to RuntimeError so callers only need to
        # catch one failure type (authenticate() catches RuntimeError, not
        # FileNotFoundError).
        raise RuntimeError(
            "m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365"
        ) from e
    if proc.returncode != 0:
        raise RuntimeError(f"m365 {' '.join(args)} failed: {proc.stderr.strip()}")
    try:
        return json.loads(proc.stdout)
    except json.JSONDecodeError:
        return proc.stdout.strip()
50
+
51
+
52
class M365Source(BaseSource):
    """
    Fetch documents from SharePoint Online and OneDrive via the m365 CLI.

    Usage:
        # SharePoint site
        source = M365Source(
            web_url="https://contoso.sharepoint.com/sites/project-x",
            folder_url="/sites/project-x/Shared Documents"
        )

        # OneDrive
        source = M365Source(
            web_url="https://contoso-my.sharepoint.com/personal/user_contoso_com",
            folder_url="/personal/user_contoso_com/Documents"
        )

        files = source.list_videos()
        source.download_all(files, Path("./docs"))
    """

    def __init__(
        self,
        web_url: str,
        folder_url: Optional[str] = None,
        file_ids: Optional[List[str]] = None,
        recursive: bool = False,
    ):
        # web_url: SharePoint site (or OneDrive personal site) URL.
        # folder_url: server-relative folder to list; required unless file_ids given.
        # file_ids: explicit file IDs to fetch instead of listing a folder.
        # recursive: descend into subfolders when listing.
        self.web_url = web_url
        self.folder_url = folder_url
        self.file_ids = file_ids or []
        self.recursive = recursive

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in."""
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
            # m365 status returns connection info when logged in
            if isinstance(result, dict) and result.get("connectedAs"):
                return True
            if isinstance(result, str) and "Logged in" in result:
                return True
            logger.error("m365 not logged in. Run: m365 login")
            return False
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List documents in SharePoint/OneDrive. Returns docs, not just videos.

        folder_path overrides self.folder_url; folder_id and patterns are
        accepted for interface compatibility but unused here.
        """
        files: List[SourceFile] = []

        # Fetch specific files by ID
        if self.file_ids:
            for fid in self.file_ids:
                try:
                    result = _run_m365(
                        [
                            "spo",
                            "file",
                            "get",
                            "--webUrl",
                            self.web_url,
                            "--id",
                            fid,
                        ]
                    )
                    files.append(_result_to_source_file(result))
                except RuntimeError as e:
                    logger.warning(f"Failed to get file {fid}: {e}")
            return files

        # List files in folder
        folder = folder_path or self.folder_url
        if not folder:
            logger.error("No folder URL specified. Use --folder-url or folder_path parameter.")
            return []

        try:
            # BUG FIX: the "spo" command group was missing here, so the CLI was
            # invoked as `m365 file list ...`, which is not a valid command
            # (every other call in this class correctly uses `spo file ...`).
            args = [
                "spo",
                "file",
                "list",
                "--webUrl",
                self.web_url,
                "--folderUrl",
                folder,
            ]
            if self.recursive:
                args.append("--recursive")

            result = _run_m365(args, timeout=60)
        except RuntimeError as e:
            logger.error(f"Failed to list files: {e}")
            return []

        items = result if isinstance(result, list) else []
        for item in items:
            name = item.get("Name", item.get("name", ""))
            ext = Path(name).suffix.lower()
            # Keep only extensions we know how to turn into text.
            if ext in _DOC_EXTENSIONS:
                files.append(_result_to_source_file(item))

        logger.info(f"Found {len(files)} document(s) in {folder}")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a file from SharePoint/OneDrive to *destination*."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        args = [
            "spo",
            "file",
            "get",
            "--webUrl",
            self.web_url,
            "--asFile",
            "--path",
            str(destination),
        ]

        # Use URL if available in path field, otherwise use ID
        if file.path:
            args.extend(["--url", file.path])
        else:
            args.extend(["--id", file.id])

        _run_m365(args, timeout=120)
        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def download_as_text(self, file: SourceFile) -> str:
        """Download a file and attempt to extract text content.

        Text-based formats are fetched directly as a string; binary formats
        are downloaded to a temp file and passed to _extract_text().
        """
        # For text-based formats, get as string directly
        text_exts = {".txt", ".md", ".csv", ".html", ".htm"}
        ext = Path(file.name).suffix.lower()

        if ext in text_exts:
            try:
                args = [
                    "spo",
                    "file",
                    "get",
                    "--webUrl",
                    self.web_url,
                    "--asString",
                ]
                if file.path:
                    args.extend(["--url", file.path])
                else:
                    args.extend(["--id", file.id])

                result = _run_m365(args, timeout=60)
                return result if isinstance(result, str) else json.dumps(result)
            except RuntimeError:
                # Fall through to the binary download path below.
                pass

        # For binary formats, download to temp and extract
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp_path = Path(tmp.name)

        try:
            self.download(file, tmp_path)
            return _extract_text(tmp_path)
        finally:
            # Always remove the temp file, even if download/extraction failed.
            tmp_path.unlink(missing_ok=True)

    def fetch_all_text(self) -> Dict[str, str]:
        """List all docs and return {filename: text_content} dict.

        Per-file failures are captured as "[Error: ...]" values rather than
        aborting the whole fetch.
        """
        files = self.list_videos()
        results = {}
        for f in files:
            try:
                results[f.name] = self.download_as_text(f)
            except Exception as e:
                logger.warning(f"Failed to fetch {f.name}: {e}")
                results[f.name] = f"[Error: {e}]"
        return results

    def collate(self, separator: str = "\n\n---\n\n") -> str:
        """Fetch all docs and collate into a single text blob for ingestion."""
        docs = self.fetch_all_text()
        parts = []
        for name, content in docs.items():
            parts.append(f"# {name}\n\n{content}")
        return separator.join(parts)
246
+
247
+
248
def _result_to_source_file(item: dict) -> SourceFile:
    """Build a SourceFile from an m365 CLI file result dict."""
    # The CLI mixes SharePoint (PascalCase) and Graph (camelCase) field
    # names depending on the command, so probe both spellings.
    name = item.get("Name", item.get("name", "Untitled"))
    file_id = item.get("UniqueId", item.get("uniqueId", item.get("id", "")))
    raw_size = item.get("Length", item.get("length", item.get("size")))
    server_path = item.get("ServerRelativeUrl", item.get("serverRelativeUrl"))
    modified = item.get("TimeLastModified", item.get("lastModifiedDateTime"))

    return SourceFile(
        name=name,
        id=str(file_id),
        size_bytes=int(raw_size) if raw_size else None,
        mime_type=None,
        modified_at=modified,
        path=server_path,
    )
264
+
265
+
266
+def _extract_text(path: Path) -> str:
267
+ """Best-effort text extraction from a downloaded file."""
268
+ ext = path.suffix.lower()
269
+
270
+ if ext in {".txt", ".md", ".csv"}:
271
+ return path.read_text(encoding="utf-8", errors="replace")
272
+
273
+ if ext in {".html", ".htm"}:
274
+ from video_processor.sources.web_source import _strip_html_tags
275
+
276
+ return _strip_html_tags(path.read_text(encoding="utf-8", errors="replace"))
277
+
278
+ if ext == ".pdf":
279
+ try:
280
+ import fitz # pymupdf
281
+
282
+ doc = fitz.open(str(path))
283
+ return "\n\n".join(page.get_text() for page in doc)
284
+ except ImportError:
285
+ return f"[PDF file: {path.name} — install pymupdf to extract text]"
286
+
287
+ if ext in {".docx", ".pptx", ".xlsx"}:
288
+ # Try python-docx / openpyxl / python-pptx if available
289
+ try:
290
+ if ext == ".docx":
291
+ from docx import Document
292
+
293
+ doc = Document(str(path))
294
+ return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip())
295
+ elif ext == ".xlsx":
296
+ import openpyxl
297
+
298
+ wb = openpyxl.load_workbook(str(path), read_only=True)
299
+ rows = []
300
+ for sheet in wb.sheetnames:
301
+ ws = wb[sheet]
302
+ for row in ws.iter_rows(values_only=True):
303
+ cells = [str(c) if c is not None else "" for c in row]
304
+ if any(cells):
305
+ rows.append("\t".join(cells))
306
+ return "\n".join(rows)
307
+ except ImportError:
308
+ return f"[{ext} file: {path.name} — install python-docx/openpyxl to extract text]"
309
+
310
+ return f"[Unsupported format: {path.name}]"
--- a/video_processor/sources/m365_source.py
+++ b/video_processor/sources/m365_source.py
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/m365_source.py
+++ b/video_processor/sources/m365_source.py
@@ -0,0 +1,310 @@
1 """Microsoft 365 source connector using the m365 CLI (cli-microsoft365).
2
3 Fetches documents from SharePoint and OneDrive via the `m365` CLI tool.
4 Outputs plain text suitable for KG ingestion.
5
6 Requires: npm install -g @pnp/cli-microsoft365
7 Auth: m365 login (interactive)
8 Docs: https://pnp.github.io/cli-microsoft365/
9 """
10
11 import json
12 import logging
13 import shutil
14 import subprocess
15 import tempfile
16 from pathlib import Path
17 from typing import Any, Dict, List, Optional
18
19 from video_processor.sources.base import BaseSource, SourceFile
20
21 logger = logging.getLogger(__name__)
22
# File extensions (not MIME types) we can download and attempt to extract
# text from — see _extract_text; some formats need optional third-party libs.
_DOC_EXTENSIONS = {
    ".docx",
    ".doc",
    ".xlsx",
    ".xls",
    ".pptx",
    ".ppt",
    ".pdf",
    ".txt",
    ".md",
    ".csv",
    ".html",
    ".htm",
}
38
39
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Run an m365 CLI command and return parsed JSON output.

    Appends ``--output json`` to every invocation; raises RuntimeError with
    the CLI's stderr when the command exits non-zero. Falls back to the raw
    stdout string when the output is not valid JSON.
    """
    full_cmd = ["m365", *args, "--output", "json"]
    completed = subprocess.run(full_cmd, capture_output=True, text=True, timeout=timeout)
    if completed.returncode != 0:
        raise RuntimeError(f"m365 {' '.join(args)} failed: {completed.stderr.strip()}")
    try:
        return json.loads(completed.stdout)
    except json.JSONDecodeError:
        # Some commands emit plain text even with --output json requested.
        return completed.stdout.strip()
50
51
class M365Source(BaseSource):
    """
    Fetch documents from SharePoint Online and OneDrive via the m365 CLI.

    Usage:
        # SharePoint site
        source = M365Source(
            web_url="https://contoso.sharepoint.com/sites/project-x",
            folder_url="/sites/project-x/Shared Documents"
        )

        # OneDrive
        source = M365Source(
            web_url="https://contoso-my.sharepoint.com/personal/user_contoso_com",
            folder_url="/personal/user_contoso_com/Documents"
        )

        files = source.list_videos()
        source.download_all(files, Path("./docs"))
    """

    def __init__(
        self,
        web_url: str,
        folder_url: Optional[str] = None,
        file_ids: Optional[List[str]] = None,
        recursive: bool = False,
    ):
        # web_url: the SharePoint site (or OneDrive personal site) URL.
        # folder_url: server-relative folder to list; file_ids takes priority.
        self.web_url = web_url
        self.folder_url = folder_url
        self.file_ids = file_ids or []
        self.recursive = recursive

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in.

        Returns True only when the binary is on PATH and `m365 status`
        reports an active login; logs an actionable error otherwise.
        """
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
            # m365 status returns connection info (JSON) when logged in;
            # older versions may emit a plain "Logged in ..." string.
            if isinstance(result, dict) and result.get("connectedAs"):
                return True
            if isinstance(result, str) and "Logged in" in result:
                return True
            logger.error("m365 not logged in. Run: m365 login")
            return False
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List documents in SharePoint/OneDrive. Returns docs, not just videos.

        If file_ids were given at construction they are fetched individually
        and the folder listing is skipped entirely. Otherwise the folder
        (folder_path argument, falling back to self.folder_url) is listed and
        filtered to known document extensions.
        """
        files: List[SourceFile] = []

        # Fetch specific files by ID
        if self.file_ids:
            for fid in self.file_ids:
                try:
                    result = _run_m365(
                        [
                            "spo",
                            "file",
                            "get",
                            "--webUrl",
                            self.web_url,
                            "--id",
                            fid,
                        ]
                    )
                    files.append(_result_to_source_file(result))
                except RuntimeError as e:
                    logger.warning(f"Failed to get file {fid}: {e}")
            return files

        # List files in folder
        folder = folder_path or self.folder_url
        if not folder:
            logger.error("No folder URL specified. Use --folder-url or folder_path parameter.")
            return []

        try:
            # Full command is `m365 spo file list`; the "spo" group was
            # previously missing here, so every folder listing failed.
            args = [
                "spo",
                "file",
                "list",
                "--webUrl",
                self.web_url,
                "--folderUrl",
                folder,
            ]
            if self.recursive:
                args.append("--recursive")

            result = _run_m365(args, timeout=60)
        except RuntimeError as e:
            logger.error(f"Failed to list files: {e}")
            return []

        items = result if isinstance(result, list) else []
        for item in items:
            # m365 output casing varies by command/version (PascalCase vs camelCase).
            name = item.get("Name", item.get("name", ""))
            ext = Path(name).suffix.lower()
            if ext in _DOC_EXTENSIONS:
                files.append(_result_to_source_file(item))

        logger.info(f"Found {len(files)} document(s) in {folder}")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a file from SharePoint/OneDrive to *destination*.

        Raises RuntimeError (from _run_m365) if the CLI call fails.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        args = [
            "spo",
            "file",
            "get",
            "--webUrl",
            self.web_url,
            "--asFile",
            "--path",
            str(destination),
        ]

        # Use URL if available in path field, otherwise use ID
        if file.path:
            args.extend(["--url", file.path])
        else:
            args.extend(["--id", file.id])

        _run_m365(args, timeout=120)
        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def download_as_text(self, file: SourceFile) -> str:
        """Download a file and attempt to extract text content.

        Text-based formats are fetched as a string directly via --asString;
        binary formats are downloaded to a temp file and run through
        _extract_text. The temp file is always cleaned up.
        """
        # For text-based formats, get as string directly
        text_exts = {".txt", ".md", ".csv", ".html", ".htm"}
        ext = Path(file.name).suffix.lower()

        if ext in text_exts:
            try:
                args = [
                    "spo",
                    "file",
                    "get",
                    "--webUrl",
                    self.web_url,
                    "--asString",
                ]
                if file.path:
                    args.extend(["--url", file.path])
                else:
                    args.extend(["--id", file.id])

                result = _run_m365(args, timeout=60)
                return result if isinstance(result, str) else json.dumps(result)
            except RuntimeError:
                # Fall through to the binary download path below.
                pass

        # For binary formats, download to temp and extract
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp_path = Path(tmp.name)

        try:
            self.download(file, tmp_path)
            return _extract_text(tmp_path)
        finally:
            tmp_path.unlink(missing_ok=True)

    def fetch_all_text(self) -> Dict[str, str]:
        """List all docs and return {filename: text_content} dict.

        Best-effort: per-file failures are logged and recorded as an
        "[Error: ...]" value instead of aborting the whole fetch.
        NOTE(review): files with identical names in different folders
        overwrite each other in the returned dict — confirm acceptable.
        """
        files = self.list_videos()
        results = {}
        for f in files:
            try:
                results[f.name] = self.download_as_text(f)
            except Exception as e:
                logger.warning(f"Failed to fetch {f.name}: {e}")
                results[f.name] = f"[Error: {e}]"
        return results

    def collate(self, separator: str = "\n\n---\n\n") -> str:
        """Fetch all docs and collate into a single text blob for ingestion."""
        docs = self.fetch_all_text()
        parts = []
        for name, content in docs.items():
            parts.append(f"# {name}\n\n{content}")
        return separator.join(parts)
246
247
def _result_to_source_file(item: dict) -> SourceFile:
    """Convert an m365 file result to SourceFile.

    Handles both PascalCase (SPO REST) and camelCase (Graph-style) keys,
    since m365 output casing varies by command and version.
    """
    name = item.get("Name", item.get("name", "Untitled"))
    file_id = item.get("UniqueId", item.get("uniqueId", item.get("id", "")))
    size = item.get("Length", item.get("length", item.get("size")))
    path = item.get("ServerRelativeUrl", item.get("serverRelativeUrl"))
    modified = item.get("TimeLastModified", item.get("lastModifiedDateTime"))

    return SourceFile(
        name=name,
        id=str(file_id),
        # Explicit None/empty check (not truthiness): a legitimate 0-byte
        # file used to be reported as size-unknown (None). Length may also
        # arrive as a numeric string, which int() accepts.
        size_bytes=int(size) if size not in (None, "") else None,
        mime_type=None,  # file listings don't carry a MIME type
        modified_at=modified,
        path=path,
    )
264
265
266 def _extract_text(path: Path) -> str:
267 """Best-effort text extraction from a downloaded file."""
268 ext = path.suffix.lower()
269
270 if ext in {".txt", ".md", ".csv"}:
271 return path.read_text(encoding="utf-8", errors="replace")
272
273 if ext in {".html", ".htm"}:
274 from video_processor.sources.web_source import _strip_html_tags
275
276 return _strip_html_tags(path.read_text(encoding="utf-8", errors="replace"))
277
278 if ext == ".pdf":
279 try:
280 import fitz # pymupdf
281
282 doc = fitz.open(str(path))
283 return "\n\n".join(page.get_text() for page in doc)
284 except ImportError:
285 return f"[PDF file: {path.name} — install pymupdf to extract text]"
286
287 if ext in {".docx", ".pptx", ".xlsx"}:
288 # Try python-docx / openpyxl / python-pptx if available
289 try:
290 if ext == ".docx":
291 from docx import Document
292
293 doc = Document(str(path))
294 return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip())
295 elif ext == ".xlsx":
296 import openpyxl
297
298 wb = openpyxl.load_workbook(str(path), read_only=True)
299 rows = []
300 for sheet in wb.sheetnames:
301 ws = wb[sheet]
302 for row in ws.iter_rows(values_only=True):
303 cells = [str(c) if c is not None else "" for c in row]
304 if any(cells):
305 rows.append("\t".join(cells))
306 return "\n".join(rows)
307 except ImportError:
308 return f"[{ext} file: {path.name} — install python-docx/openpyxl to extract text]"
309
310 return f"[Unsupported format: {path.name}]"

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button