PlanOpticon

feat(sources): add Google Workspace (gws) and Microsoft 365 (m365) source connectors - GWSSource: fetch/collate Google Docs, Sheets, Slides via gws CLI - M365Source: fetch docs from SharePoint/OneDrive via m365 CLI - CLI groups: planopticon gws {list,fetch,ingest} and planopticon m365 {list,fetch,ingest} - Both support direct ingest into knowledge graphs - Text extraction for .docx, .xlsx, .pdf, .html, .txt, .md, .csv - 29 new tests (GWSSource, M365Source, CLI help)

lmata 2026-03-07 23:02 trunk
Commit 7d809ed2e1f41acb2eb046fb778e401a3e5bf1a65590f0a95002684322014355
--- tests/test_cli.py
+++ tests/test_cli.py
@@ -20,10 +20,12 @@
2020
assert "PlanOpticon" in result.output
2121
assert "analyze" in result.output
2222
assert "query" in result.output
2323
assert "agent" in result.output
2424
assert "kg" in result.output
25
+ assert "gws" in result.output
26
+ assert "m365" in result.output
2527
assert "ingest" in result.output
2628
assert "batch" in result.output
2729
2830
2931
class TestAnalyzeHelp:
@@ -140,13 +142,64 @@
140142
assert result.exit_code == 0
141143
assert "--cache-dir" in result.output
142144
assert "--older-than" in result.output
143145
assert "--all" in result.output
144146
147
+
148
+class TestGWSHelp:
149
+ def test_help(self):
150
+ runner = CliRunner()
151
+ result = runner.invoke(cli, ["gws", "--help"])
152
+ assert result.exit_code == 0
153
+ assert "list" in result.output
154
+ assert "fetch" in result.output
155
+ assert "ingest" in result.output
156
+
157
+ def test_list_help(self):
158
+ runner = CliRunner()
159
+ result = runner.invoke(cli, ["gws", "list", "--help"])
160
+ assert result.exit_code == 0
161
+ assert "--folder-id" in result.output
162
+ assert "--query" in result.output
163
+
164
+ def test_ingest_help(self):
165
+ runner = CliRunner()
166
+ result = runner.invoke(cli, ["gws", "ingest", "--help"])
167
+ assert result.exit_code == 0
168
+ assert "--folder-id" in result.output
169
+ assert "--doc-id" in result.output
170
+ assert "--db-path" in result.output
171
+
172
+
173
+class TestM365Help:
174
+ def test_help(self):
175
+ runner = CliRunner()
176
+ result = runner.invoke(cli, ["m365", "--help"])
177
+ assert result.exit_code == 0
178
+ assert "list" in result.output
179
+ assert "fetch" in result.output
180
+ assert "ingest" in result.output
181
+
182
+ def test_list_help(self):
183
+ runner = CliRunner()
184
+ result = runner.invoke(cli, ["m365", "list", "--help"])
185
+ assert result.exit_code == 0
186
+ assert "--web-url" in result.output
187
+ assert "--folder-url" in result.output
188
+ assert "--recursive" in result.output
189
+
190
+ def test_ingest_help(self):
191
+ runner = CliRunner()
192
+ result = runner.invoke(cli, ["m365", "ingest", "--help"])
193
+ assert result.exit_code == 0
194
+ assert "--web-url" in result.output
195
+ assert "--file-id" in result.output
196
+ assert "--db-path" in result.output
197
+
145198
146199
class TestAuthHelp:
147200
def test_help(self):
148201
runner = CliRunner()
149202
result = runner.invoke(cli, ["auth", "--help"])
150203
assert result.exit_code == 0
151204
assert "google" in result.output
152205
assert "dropbox" in result.output
153206
--- tests/test_cli.py
+++ tests/test_cli.py
@@ -20,10 +20,12 @@
20 assert "PlanOpticon" in result.output
21 assert "analyze" in result.output
22 assert "query" in result.output
23 assert "agent" in result.output
24 assert "kg" in result.output
 
 
25 assert "ingest" in result.output
26 assert "batch" in result.output
27
28
29 class TestAnalyzeHelp:
@@ -140,13 +142,64 @@
140 assert result.exit_code == 0
141 assert "--cache-dir" in result.output
142 assert "--older-than" in result.output
143 assert "--all" in result.output
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
146 class TestAuthHelp:
147 def test_help(self):
148 runner = CliRunner()
149 result = runner.invoke(cli, ["auth", "--help"])
150 assert result.exit_code == 0
151 assert "google" in result.output
152 assert "dropbox" in result.output
153
--- tests/test_cli.py
+++ tests/test_cli.py
@@ -20,10 +20,12 @@
20 assert "PlanOpticon" in result.output
21 assert "analyze" in result.output
22 assert "query" in result.output
23 assert "agent" in result.output
24 assert "kg" in result.output
25 assert "gws" in result.output
26 assert "m365" in result.output
27 assert "ingest" in result.output
28 assert "batch" in result.output
29
30
31 class TestAnalyzeHelp:
@@ -140,13 +142,64 @@
142 assert result.exit_code == 0
143 assert "--cache-dir" in result.output
144 assert "--older-than" in result.output
145 assert "--all" in result.output
146
147
148 class TestGWSHelp:
149 def test_help(self):
150 runner = CliRunner()
151 result = runner.invoke(cli, ["gws", "--help"])
152 assert result.exit_code == 0
153 assert "list" in result.output
154 assert "fetch" in result.output
155 assert "ingest" in result.output
156
157 def test_list_help(self):
158 runner = CliRunner()
159 result = runner.invoke(cli, ["gws", "list", "--help"])
160 assert result.exit_code == 0
161 assert "--folder-id" in result.output
162 assert "--query" in result.output
163
164 def test_ingest_help(self):
165 runner = CliRunner()
166 result = runner.invoke(cli, ["gws", "ingest", "--help"])
167 assert result.exit_code == 0
168 assert "--folder-id" in result.output
169 assert "--doc-id" in result.output
170 assert "--db-path" in result.output
171
172
173 class TestM365Help:
174 def test_help(self):
175 runner = CliRunner()
176 result = runner.invoke(cli, ["m365", "--help"])
177 assert result.exit_code == 0
178 assert "list" in result.output
179 assert "fetch" in result.output
180 assert "ingest" in result.output
181
182 def test_list_help(self):
183 runner = CliRunner()
184 result = runner.invoke(cli, ["m365", "list", "--help"])
185 assert result.exit_code == 0
186 assert "--web-url" in result.output
187 assert "--folder-url" in result.output
188 assert "--recursive" in result.output
189
190 def test_ingest_help(self):
191 runner = CliRunner()
192 result = runner.invoke(cli, ["m365", "ingest", "--help"])
193 assert result.exit_code == 0
194 assert "--web-url" in result.output
195 assert "--file-id" in result.output
196 assert "--db-path" in result.output
197
198
199 class TestAuthHelp:
200 def test_help(self):
201 runner = CliRunner()
202 result = runner.invoke(cli, ["auth", "--help"])
203 assert result.exit_code == 0
204 assert "google" in result.output
205 assert "dropbox" in result.output
206
--- tests/test_sources.py
+++ tests/test_sources.py
@@ -534,5 +534,329 @@
534534
# Only .mp4 and .mkv are video extensions
535535
assert len(files) == 2
536536
names = [f.name for f in files]
537537
assert "clip.mp4" in names
538538
assert "movie.mkv" in names
539
+
540
+
541
+# ---------------------------------------------------------------------------
542
+# GWSSource
543
+# ---------------------------------------------------------------------------
544
+
545
+
546
+class TestGWSSource:
547
+ def test_import(self):
548
+ from video_processor.sources.gws_source import GWSSource
549
+
550
+ assert GWSSource is not None
551
+
552
+ def test_constructor_defaults(self):
553
+ from video_processor.sources.gws_source import GWSSource
554
+
555
+ src = GWSSource()
556
+ assert src.folder_id is None
557
+ assert src.query is None
558
+ assert src.doc_ids == []
559
+
560
+ def test_constructor_with_folder(self):
561
+ from video_processor.sources.gws_source import GWSSource
562
+
563
+ src = GWSSource(folder_id="1abc", query="name contains 'spec'")
564
+ assert src.folder_id == "1abc"
565
+ assert src.query == "name contains 'spec'"
566
+
567
+ def test_constructor_with_doc_ids(self):
568
+ from video_processor.sources.gws_source import GWSSource
569
+
570
+ src = GWSSource(doc_ids=["doc1", "doc2"])
571
+ assert src.doc_ids == ["doc1", "doc2"]
572
+
573
+ @patch("shutil.which", return_value=None)
574
+ def test_authenticate_no_gws(self, _mock_which):
575
+ from video_processor.sources.gws_source import GWSSource
576
+
577
+ src = GWSSource()
578
+ assert src.authenticate() is False
579
+
580
+ @patch("video_processor.sources.gws_source._run_gws")
581
+ @patch("shutil.which", return_value="/usr/local/bin/gws")
582
+ def test_authenticate_success(self, _mock_which, mock_run):
583
+ from video_processor.sources.gws_source import GWSSource
584
+
585
+ mock_run.return_value = {"connectedAs": "[email protected]"}
586
+ src = GWSSource()
587
+ assert src.authenticate() is True
588
+
589
+ @patch("video_processor.sources.gws_source._run_gws")
590
+ @patch("shutil.which", return_value="/usr/local/bin/gws")
591
+ def test_list_videos(self, _mock_which, mock_run):
592
+ from video_processor.sources.gws_source import GWSSource
593
+
594
+ mock_run.return_value = {
595
+ "files": [
596
+ {
597
+ "id": "doc123",
598
+ "name": "Project Spec",
599
+ "mimeType": "application/vnd.google-apps.document",
600
+ "modifiedTime": "2026-01-01T00:00:00Z",
601
+ },
602
+ {
603
+ "id": "sheet456",
604
+ "name": "Budget",
605
+ "mimeType": "application/vnd.google-apps.spreadsheet",
606
+ },
607
+ ]
608
+ }
609
+ src = GWSSource(folder_id="folder1")
610
+ files = src.list_videos()
611
+ assert len(files) == 2
612
+ assert files[0].name == "Project Spec"
613
+ assert files[1].id == "sheet456"
614
+
615
+ @patch("video_processor.sources.gws_source._run_gws")
616
+ @patch("shutil.which", return_value="/usr/local/bin/gws")
617
+ def test_list_videos_with_doc_ids(self, _mock_which, mock_run):
618
+ from video_processor.sources.gws_source import GWSSource
619
+
620
+ mock_run.return_value = {
621
+ "id": "doc123",
622
+ "name": "My Doc",
623
+ "mimeType": "application/vnd.google-apps.document",
624
+ }
625
+ src = GWSSource(doc_ids=["doc123"])
626
+ files = src.list_videos()
627
+ assert len(files) == 1
628
+ assert files[0].name == "My Doc"
629
+
630
+ def test_result_to_source_file(self):
631
+ from video_processor.sources.gws_source import _result_to_source_file
632
+
633
+ sf = _result_to_source_file(
634
+ {
635
+ "id": "abc",
636
+ "name": "Test Doc",
637
+ "mimeType": "text/plain",
638
+ "size": "1024",
639
+ "modifiedTime": "2026-03-01",
640
+ }
641
+ )
642
+ assert sf.name == "Test Doc"
643
+ assert sf.id == "abc"
644
+ assert sf.size_bytes == 1024
645
+ assert sf.mime_type == "text/plain"
646
+
647
+ @patch("video_processor.sources.gws_source._run_gws")
648
+ def test_get_doc_text(self, mock_run):
649
+ from video_processor.sources.gws_source import GWSSource
650
+
651
+ mock_run.return_value = {
652
+ "body": {
653
+ "content": [
654
+ {
655
+ "paragraph": {
656
+ "elements": [
657
+ {"textRun": {"content": "Hello world\n"}},
658
+ ]
659
+ }
660
+ },
661
+ {
662
+ "paragraph": {
663
+ "elements": [
664
+ {"textRun": {"content": "Second paragraph\n"}},
665
+ ]
666
+ }
667
+ },
668
+ ]
669
+ }
670
+ }
671
+ src = GWSSource()
672
+ text = src._get_doc_text("doc123")
673
+ assert "Hello world" in text
674
+ assert "Second paragraph" in text
675
+
676
+ @patch("video_processor.sources.gws_source._run_gws")
677
+ def test_collate(self, mock_run):
678
+ from video_processor.sources.gws_source import GWSSource
679
+
680
+ # First call: list files, second+: export each
681
+ mock_run.side_effect = [
682
+ {
683
+ "files": [
684
+ {
685
+ "id": "d1",
686
+ "name": "Doc A",
687
+ "mimeType": "application/vnd.google-apps.document",
688
+ },
689
+ ]
690
+ },
691
+ {"raw": "Content of Doc A"},
692
+ ]
693
+ src = GWSSource(folder_id="f1")
694
+ result = src.collate()
695
+ assert "Doc A" in result
696
+ assert "Content of Doc A" in result
697
+
698
+
699
+# ---------------------------------------------------------------------------
700
+# M365Source
701
+# ---------------------------------------------------------------------------
702
+
703
+
704
+class TestM365Source:
705
+ def test_import(self):
706
+ from video_processor.sources.m365_source import M365Source
707
+
708
+ assert M365Source is not None
709
+
710
+ def test_constructor(self):
711
+ from video_processor.sources.m365_source import M365Source
712
+
713
+ src = M365Source(
714
+ web_url="https://contoso.sharepoint.com/sites/proj",
715
+ folder_url="/sites/proj/Shared Documents",
716
+ )
717
+ assert src.web_url == "https://contoso.sharepoint.com/sites/proj"
718
+ assert src.folder_url == "/sites/proj/Shared Documents"
719
+ assert src.file_ids == []
720
+ assert src.recursive is False
721
+
722
+ def test_constructor_with_file_ids(self):
723
+ from video_processor.sources.m365_source import M365Source
724
+
725
+ src = M365Source(
726
+ web_url="https://contoso.sharepoint.com",
727
+ file_ids=["id1", "id2"],
728
+ )
729
+ assert src.file_ids == ["id1", "id2"]
730
+
731
+ @patch("shutil.which", return_value=None)
732
+ def test_authenticate_no_m365(self, _mock_which):
733
+ from video_processor.sources.m365_source import M365Source
734
+
735
+ src = M365Source(web_url="https://contoso.sharepoint.com")
736
+ assert src.authenticate() is False
737
+
738
+ @patch("video_processor.sources.m365_source._run_m365")
739
+ @patch("shutil.which", return_value="/usr/local/bin/m365")
740
+ def test_authenticate_logged_in(self, _mock_which, mock_run):
741
+ from video_processor.sources.m365_source import M365Source
742
+
743
+ mock_run.return_value = {"connectedAs": "[email protected]"}
744
+ src = M365Source(web_url="https://contoso.sharepoint.com")
745
+ assert src.authenticate() is True
746
+
747
+ @patch("video_processor.sources.m365_source._run_m365")
748
+ @patch("shutil.which", return_value="/usr/local/bin/m365")
749
+ def test_authenticate_not_logged_in(self, _mock_which, mock_run):
750
+ from video_processor.sources.m365_source import M365Source
751
+
752
+ mock_run.return_value = {}
753
+ src = M365Source(web_url="https://contoso.sharepoint.com")
754
+ assert src.authenticate() is False
755
+
756
+ @patch("video_processor.sources.m365_source._run_m365")
757
+ @patch("shutil.which", return_value="/usr/local/bin/m365")
758
+ def test_list_videos(self, _mock_which, mock_run):
759
+ from video_processor.sources.m365_source import M365Source
760
+
761
+ mock_run.side_effect = [
762
+ {"connectedAs": "[email protected]"}, # authenticate
763
+ [
764
+ {
765
+ "Name": "spec.docx",
766
+ "UniqueId": "uid-1",
767
+ "Length": "20480",
768
+ "ServerRelativeUrl": "/sites/proj/docs/spec.docx",
769
+ },
770
+ {
771
+ "Name": "budget.xlsx",
772
+ "UniqueId": "uid-2",
773
+ "Length": "10240",
774
+ "ServerRelativeUrl": "/sites/proj/docs/budget.xlsx",
775
+ },
776
+ {
777
+ "Name": "image.png",
778
+ "UniqueId": "uid-3",
779
+ "Length": "5000",
780
+ "ServerRelativeUrl": "/sites/proj/docs/image.png",
781
+ },
782
+ ],
783
+ ]
784
+ src = M365Source(
785
+ web_url="https://contoso.sharepoint.com/sites/proj",
786
+ folder_url="/sites/proj/docs",
787
+ )
788
+ src.authenticate()
789
+ files = src.list_videos()
790
+ # Only .docx and .xlsx match _DOC_EXTENSIONS, not .png
791
+ assert len(files) == 2
792
+ names = [f.name for f in files]
793
+ assert "spec.docx" in names
794
+ assert "budget.xlsx" in names
795
+
796
+ @patch("video_processor.sources.m365_source._run_m365")
797
+ def test_list_videos_with_file_ids(self, mock_run):
798
+ from video_processor.sources.m365_source import M365Source
799
+
800
+ mock_run.return_value = {
801
+ "Name": "report.pdf",
802
+ "UniqueId": "uid-1",
803
+ "Length": "50000",
804
+ "ServerRelativeUrl": "/sites/proj/docs/report.pdf",
805
+ }
806
+ src = M365Source(
807
+ web_url="https://contoso.sharepoint.com",
808
+ file_ids=["uid-1"],
809
+ )
810
+ files = src.list_videos()
811
+ assert len(files) == 1
812
+ assert files[0].name == "report.pdf"
813
+
814
+ def test_result_to_source_file(self):
815
+ from video_processor.sources.m365_source import _result_to_source_file
816
+
817
+ sf = _result_to_source_file(
818
+ {
819
+ "Name": "notes.txt",
820
+ "UniqueId": "abc-123",
821
+ "Length": "512",
822
+ "ServerRelativeUrl": "/sites/proj/notes.txt",
823
+ "TimeLastModified": "2026-03-01T12:00:00Z",
824
+ }
825
+ )
826
+ assert sf.name == "notes.txt"
827
+ assert sf.id == "abc-123"
828
+ assert sf.size_bytes == 512
829
+ assert sf.path == "/sites/proj/notes.txt"
830
+ assert sf.modified_at == "2026-03-01T12:00:00Z"
831
+
832
+ def test_extract_text_txt(self, tmp_path):
833
+ from video_processor.sources.m365_source import _extract_text
834
+
835
+ f = tmp_path / "test.txt"
836
+ f.write_text("Hello from a text file")
837
+ result = _extract_text(f)
838
+ assert result == "Hello from a text file"
839
+
840
+ def test_extract_text_md(self, tmp_path):
841
+ from video_processor.sources.m365_source import _extract_text
842
+
843
+ f = tmp_path / "readme.md"
844
+ f.write_text("# Title\n\nSome content")
845
+ result = _extract_text(f)
846
+ assert "Title" in result
847
+ assert "Some content" in result
848
+
849
+ def test_extract_text_unsupported(self, tmp_path):
850
+ from video_processor.sources.m365_source import _extract_text
851
+
852
+ f = tmp_path / "data.bin"
853
+ f.write_bytes(b"\x00\x01\x02")
854
+ result = _extract_text(f)
855
+ assert "Unsupported" in result
856
+
857
+ def test_list_no_folder_url(self):
858
+ from video_processor.sources.m365_source import M365Source
859
+
860
+ src = M365Source(web_url="https://contoso.sharepoint.com")
861
+ files = src.list_videos()
862
+ assert files == []
539863
--- tests/test_sources.py
+++ tests/test_sources.py
@@ -534,5 +534,329 @@
534 # Only .mp4 and .mkv are video extensions
535 assert len(files) == 2
536 names = [f.name for f in files]
537 assert "clip.mp4" in names
538 assert "movie.mkv" in names
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
--- tests/test_sources.py
+++ tests/test_sources.py
@@ -534,5 +534,329 @@
534 # Only .mp4 and .mkv are video extensions
535 assert len(files) == 2
536 names = [f.name for f in files]
537 assert "clip.mp4" in names
538 assert "movie.mkv" in names
539
540
541 # ---------------------------------------------------------------------------
542 # GWSSource
543 # ---------------------------------------------------------------------------
544
545
546 class TestGWSSource:
547 def test_import(self):
548 from video_processor.sources.gws_source import GWSSource
549
550 assert GWSSource is not None
551
552 def test_constructor_defaults(self):
553 from video_processor.sources.gws_source import GWSSource
554
555 src = GWSSource()
556 assert src.folder_id is None
557 assert src.query is None
558 assert src.doc_ids == []
559
560 def test_constructor_with_folder(self):
561 from video_processor.sources.gws_source import GWSSource
562
563 src = GWSSource(folder_id="1abc", query="name contains 'spec'")
564 assert src.folder_id == "1abc"
565 assert src.query == "name contains 'spec'"
566
567 def test_constructor_with_doc_ids(self):
568 from video_processor.sources.gws_source import GWSSource
569
570 src = GWSSource(doc_ids=["doc1", "doc2"])
571 assert src.doc_ids == ["doc1", "doc2"]
572
573 @patch("shutil.which", return_value=None)
574 def test_authenticate_no_gws(self, _mock_which):
575 from video_processor.sources.gws_source import GWSSource
576
577 src = GWSSource()
578 assert src.authenticate() is False
579
580 @patch("video_processor.sources.gws_source._run_gws")
581 @patch("shutil.which", return_value="/usr/local/bin/gws")
582 def test_authenticate_success(self, _mock_which, mock_run):
583 from video_processor.sources.gws_source import GWSSource
584
585 mock_run.return_value = {"connectedAs": "[email protected]"}
586 src = GWSSource()
587 assert src.authenticate() is True
588
589 @patch("video_processor.sources.gws_source._run_gws")
590 @patch("shutil.which", return_value="/usr/local/bin/gws")
591 def test_list_videos(self, _mock_which, mock_run):
592 from video_processor.sources.gws_source import GWSSource
593
594 mock_run.return_value = {
595 "files": [
596 {
597 "id": "doc123",
598 "name": "Project Spec",
599 "mimeType": "application/vnd.google-apps.document",
600 "modifiedTime": "2026-01-01T00:00:00Z",
601 },
602 {
603 "id": "sheet456",
604 "name": "Budget",
605 "mimeType": "application/vnd.google-apps.spreadsheet",
606 },
607 ]
608 }
609 src = GWSSource(folder_id="folder1")
610 files = src.list_videos()
611 assert len(files) == 2
612 assert files[0].name == "Project Spec"
613 assert files[1].id == "sheet456"
614
615 @patch("video_processor.sources.gws_source._run_gws")
616 @patch("shutil.which", return_value="/usr/local/bin/gws")
617 def test_list_videos_with_doc_ids(self, _mock_which, mock_run):
618 from video_processor.sources.gws_source import GWSSource
619
620 mock_run.return_value = {
621 "id": "doc123",
622 "name": "My Doc",
623 "mimeType": "application/vnd.google-apps.document",
624 }
625 src = GWSSource(doc_ids=["doc123"])
626 files = src.list_videos()
627 assert len(files) == 1
628 assert files[0].name == "My Doc"
629
630 def test_result_to_source_file(self):
631 from video_processor.sources.gws_source import _result_to_source_file
632
633 sf = _result_to_source_file(
634 {
635 "id": "abc",
636 "name": "Test Doc",
637 "mimeType": "text/plain",
638 "size": "1024",
639 "modifiedTime": "2026-03-01",
640 }
641 )
642 assert sf.name == "Test Doc"
643 assert sf.id == "abc"
644 assert sf.size_bytes == 1024
645 assert sf.mime_type == "text/plain"
646
647 @patch("video_processor.sources.gws_source._run_gws")
648 def test_get_doc_text(self, mock_run):
649 from video_processor.sources.gws_source import GWSSource
650
651 mock_run.return_value = {
652 "body": {
653 "content": [
654 {
655 "paragraph": {
656 "elements": [
657 {"textRun": {"content": "Hello world\n"}},
658 ]
659 }
660 },
661 {
662 "paragraph": {
663 "elements": [
664 {"textRun": {"content": "Second paragraph\n"}},
665 ]
666 }
667 },
668 ]
669 }
670 }
671 src = GWSSource()
672 text = src._get_doc_text("doc123")
673 assert "Hello world" in text
674 assert "Second paragraph" in text
675
676 @patch("video_processor.sources.gws_source._run_gws")
677 def test_collate(self, mock_run):
678 from video_processor.sources.gws_source import GWSSource
679
680 # First call: list files, second+: export each
681 mock_run.side_effect = [
682 {
683 "files": [
684 {
685 "id": "d1",
686 "name": "Doc A",
687 "mimeType": "application/vnd.google-apps.document",
688 },
689 ]
690 },
691 {"raw": "Content of Doc A"},
692 ]
693 src = GWSSource(folder_id="f1")
694 result = src.collate()
695 assert "Doc A" in result
696 assert "Content of Doc A" in result
697
698
699 # ---------------------------------------------------------------------------
700 # M365Source
701 # ---------------------------------------------------------------------------
702
703
704 class TestM365Source:
705 def test_import(self):
706 from video_processor.sources.m365_source import M365Source
707
708 assert M365Source is not None
709
710 def test_constructor(self):
711 from video_processor.sources.m365_source import M365Source
712
713 src = M365Source(
714 web_url="https://contoso.sharepoint.com/sites/proj",
715 folder_url="/sites/proj/Shared Documents",
716 )
717 assert src.web_url == "https://contoso.sharepoint.com/sites/proj"
718 assert src.folder_url == "/sites/proj/Shared Documents"
719 assert src.file_ids == []
720 assert src.recursive is False
721
722 def test_constructor_with_file_ids(self):
723 from video_processor.sources.m365_source import M365Source
724
725 src = M365Source(
726 web_url="https://contoso.sharepoint.com",
727 file_ids=["id1", "id2"],
728 )
729 assert src.file_ids == ["id1", "id2"]
730
731 @patch("shutil.which", return_value=None)
732 def test_authenticate_no_m365(self, _mock_which):
733 from video_processor.sources.m365_source import M365Source
734
735 src = M365Source(web_url="https://contoso.sharepoint.com")
736 assert src.authenticate() is False
737
738 @patch("video_processor.sources.m365_source._run_m365")
739 @patch("shutil.which", return_value="/usr/local/bin/m365")
740 def test_authenticate_logged_in(self, _mock_which, mock_run):
741 from video_processor.sources.m365_source import M365Source
742
743 mock_run.return_value = {"connectedAs": "[email protected]"}
744 src = M365Source(web_url="https://contoso.sharepoint.com")
745 assert src.authenticate() is True
746
747 @patch("video_processor.sources.m365_source._run_m365")
748 @patch("shutil.which", return_value="/usr/local/bin/m365")
749 def test_authenticate_not_logged_in(self, _mock_which, mock_run):
750 from video_processor.sources.m365_source import M365Source
751
752 mock_run.return_value = {}
753 src = M365Source(web_url="https://contoso.sharepoint.com")
754 assert src.authenticate() is False
755
756 @patch("video_processor.sources.m365_source._run_m365")
757 @patch("shutil.which", return_value="/usr/local/bin/m365")
758 def test_list_videos(self, _mock_which, mock_run):
759 from video_processor.sources.m365_source import M365Source
760
761 mock_run.side_effect = [
762 {"connectedAs": "[email protected]"}, # authenticate
763 [
764 {
765 "Name": "spec.docx",
766 "UniqueId": "uid-1",
767 "Length": "20480",
768 "ServerRelativeUrl": "/sites/proj/docs/spec.docx",
769 },
770 {
771 "Name": "budget.xlsx",
772 "UniqueId": "uid-2",
773 "Length": "10240",
774 "ServerRelativeUrl": "/sites/proj/docs/budget.xlsx",
775 },
776 {
777 "Name": "image.png",
778 "UniqueId": "uid-3",
779 "Length": "5000",
780 "ServerRelativeUrl": "/sites/proj/docs/image.png",
781 },
782 ],
783 ]
784 src = M365Source(
785 web_url="https://contoso.sharepoint.com/sites/proj",
786 folder_url="/sites/proj/docs",
787 )
788 src.authenticate()
789 files = src.list_videos()
790 # Only .docx and .xlsx match _DOC_EXTENSIONS, not .png
791 assert len(files) == 2
792 names = [f.name for f in files]
793 assert "spec.docx" in names
794 assert "budget.xlsx" in names
795
796 @patch("video_processor.sources.m365_source._run_m365")
797 def test_list_videos_with_file_ids(self, mock_run):
798 from video_processor.sources.m365_source import M365Source
799
800 mock_run.return_value = {
801 "Name": "report.pdf",
802 "UniqueId": "uid-1",
803 "Length": "50000",
804 "ServerRelativeUrl": "/sites/proj/docs/report.pdf",
805 }
806 src = M365Source(
807 web_url="https://contoso.sharepoint.com",
808 file_ids=["uid-1"],
809 )
810 files = src.list_videos()
811 assert len(files) == 1
812 assert files[0].name == "report.pdf"
813
814 def test_result_to_source_file(self):
815 from video_processor.sources.m365_source import _result_to_source_file
816
817 sf = _result_to_source_file(
818 {
819 "Name": "notes.txt",
820 "UniqueId": "abc-123",
821 "Length": "512",
822 "ServerRelativeUrl": "/sites/proj/notes.txt",
823 "TimeLastModified": "2026-03-01T12:00:00Z",
824 }
825 )
826 assert sf.name == "notes.txt"
827 assert sf.id == "abc-123"
828 assert sf.size_bytes == 512
829 assert sf.path == "/sites/proj/notes.txt"
830 assert sf.modified_at == "2026-03-01T12:00:00Z"
831
832 def test_extract_text_txt(self, tmp_path):
833 from video_processor.sources.m365_source import _extract_text
834
835 f = tmp_path / "test.txt"
836 f.write_text("Hello from a text file")
837 result = _extract_text(f)
838 assert result == "Hello from a text file"
839
840 def test_extract_text_md(self, tmp_path):
841 from video_processor.sources.m365_source import _extract_text
842
843 f = tmp_path / "readme.md"
844 f.write_text("# Title\n\nSome content")
845 result = _extract_text(f)
846 assert "Title" in result
847 assert "Some content" in result
848
849 def test_extract_text_unsupported(self, tmp_path):
850 from video_processor.sources.m365_source import _extract_text
851
852 f = tmp_path / "data.bin"
853 f.write_bytes(b"\x00\x01\x02")
854 result = _extract_text(f)
855 assert "Unsupported" in result
856
857 def test_list_no_folder_url(self):
858 from video_processor.sources.m365_source import M365Source
859
860 src = M365Source(web_url="https://contoso.sharepoint.com")
861 files = src.list_videos()
862 assert files == []
863
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -1013,10 +1013,409 @@
10131013
click.echo("Dropbox authentication successful.")
10141014
else:
10151015
click.echo("Dropbox authentication failed.", err=True)
10161016
sys.exit(1)
10171017
1018
+
1019
+@cli.group()
1020
+def gws():
1021
+ """Google Workspace: fetch docs, sheets, and slides via the gws CLI."""
1022
+ pass
1023
+
1024
+
1025
+@gws.command("list")
1026
+@click.option("--folder-id", type=str, default=None, help="Drive folder ID to list")
1027
+@click.option("--query", "-q", type=str, default=None, help="Drive search query")
1028
+@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
1029
+def gws_list(folder_id, query, as_json):
1030
+ """List documents in Google Drive.
1031
+
1032
+ Examples:
1033
+
1034
+ planopticon gws list
1035
+
1036
+ planopticon gws list --folder-id 1abc...
1037
+
1038
+ planopticon gws list -q "name contains 'PRD'" --json
1039
+ """
1040
+ from video_processor.sources.gws_source import GWSSource
1041
+
1042
+ source = GWSSource(folder_id=folder_id, query=query)
1043
+ if not source.authenticate():
1044
+ click.echo("Error: gws CLI not available or not authenticated.", err=True)
1045
+ click.echo("Install: npm install -g @googleworkspace/cli", err=True)
1046
+ click.echo("Auth: gws auth login", err=True)
1047
+ sys.exit(1)
1048
+
1049
+ files = source.list_videos(folder_id=folder_id)
1050
+ if as_json:
1051
+ click.echo(json.dumps([f.model_dump() for f in files], indent=2, default=str))
1052
+ else:
1053
+ if not files:
1054
+ click.echo("No documents found.")
1055
+ return
1056
+ for f in files:
1057
+ size = f"{f.size_bytes / 1024:.0f}KB" if f.size_bytes else "—"
1058
+ click.echo(f" {f.id[:12]}… {size:>8s} {f.mime_type or ''} {f.name}")
1059
+
1060
+
1061
+@gws.command("fetch")
1062
+@click.argument("doc_ids", nargs=-1)
1063
+@click.option("--folder-id", type=str, default=None, help="Fetch all docs in a folder")
1064
+@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
1065
+def gws_fetch(doc_ids, folder_id, output):
1066
+ """Fetch Google Docs/Sheets/Slides as text files.
1067
+
1068
+ Examples:
1069
+
1070
+ planopticon gws fetch DOC_ID1 DOC_ID2 -o ./docs
1071
+
1072
+ planopticon gws fetch --folder-id 1abc... -o ./docs
1073
+ """
1074
+ from video_processor.sources.gws_source import GWSSource
1075
+
1076
+ source = GWSSource(folder_id=folder_id, doc_ids=list(doc_ids))
1077
+ if not source.authenticate():
1078
+ click.echo("Error: gws CLI not available or not authenticated.", err=True)
1079
+ sys.exit(1)
1080
+
1081
+ out_dir = Path(output) if output else Path.cwd() / "gws_docs"
1082
+ out_dir.mkdir(parents=True, exist_ok=True)
1083
+
1084
+ files = source.list_videos(folder_id=folder_id)
1085
+ if not files:
1086
+ click.echo("No documents found.")
1087
+ return
1088
+
1089
+ for f in files:
1090
+ safe_name = f.name.replace("/", "_").replace("\\", "_")
1091
+ dest = out_dir / f"{safe_name}.txt"
1092
+ try:
1093
+ source.download(f, dest)
1094
+ click.echo(f" ✓ {f.name} → {dest}")
1095
+ except Exception as e:
1096
+ click.echo(f" ✗ {f.name}: {e}", err=True)
1097
+
1098
+ click.echo(f"\nFetched {len(files)} document(s) to {out_dir}")
1099
+
1100
+
1101
+@gws.command("ingest")
1102
+@click.option("--folder-id", type=str, default=None, help="Drive folder ID")
1103
+@click.option("--doc-id", type=str, multiple=True, help="Specific doc IDs (repeatable)")
1104
+@click.option("--query", "-q", type=str, default=None, help="Drive search query")
1105
+@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
1106
+@click.option("--db-path", type=click.Path(), default=None, help="Existing DB to merge into")
1107
+@click.option(
1108
+ "-p",
1109
+ "--provider",
1110
+ type=click.Choice(
1111
+ [
1112
+ "auto",
1113
+ "openai",
1114
+ "anthropic",
1115
+ "gemini",
1116
+ "ollama",
1117
+ "azure",
1118
+ "together",
1119
+ "fireworks",
1120
+ "cerebras",
1121
+ "xai",
1122
+ ]
1123
+ ),
1124
+ default="auto",
1125
+ help="API provider",
1126
+)
1127
+@click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks")
1128
+@click.pass_context
1129
+def gws_ingest(ctx, folder_id, doc_id, query, output, db_path, provider, chat_model):
1130
+ """Fetch Google Workspace docs and ingest into a knowledge graph.
1131
+
1132
+ Combines gws fetch + planopticon ingest in one step.
1133
+
1134
+ Examples:
1135
+
1136
+ planopticon gws ingest --folder-id 1abc...
1137
+
1138
+ planopticon gws ingest --doc-id DOC1 --doc-id DOC2 -o ./results
1139
+
1140
+ planopticon gws ingest -q "name contains 'spec'" --db-path existing.db
1141
+ """
1142
+ import tempfile
1143
+
1144
+ from video_processor.integrators.knowledge_graph import KnowledgeGraph
1145
+ from video_processor.processors.ingest import ingest_file
1146
+ from video_processor.providers.manager import ProviderManager
1147
+ from video_processor.sources.gws_source import GWSSource
1148
+
1149
+ source = GWSSource(folder_id=folder_id, doc_ids=list(doc_id), query=query)
1150
+ if not source.authenticate():
1151
+ click.echo("Error: gws CLI not available or not authenticated.", err=True)
1152
+ click.echo("Install: npm install -g @googleworkspace/cli", err=True)
1153
+ click.echo("Auth: gws auth login", err=True)
1154
+ sys.exit(1)
1155
+
1156
+ # Fetch docs to temp dir
1157
+ files = source.list_videos(folder_id=folder_id)
1158
+ if not files:
1159
+ click.echo("No documents found.")
1160
+ return
1161
+
1162
+ click.echo(f"Found {len(files)} document(s), fetching...")
1163
+
1164
+ with tempfile.TemporaryDirectory() as tmp_dir:
1165
+ tmp_path = Path(tmp_dir)
1166
+ local_files = []
1167
+ for f in files:
1168
+ safe_name = f.name.replace("/", "_").replace("\\", "_")
1169
+ dest = tmp_path / f"{safe_name}.txt"
1170
+ try:
1171
+ source.download(f, dest)
1172
+ local_files.append(dest)
1173
+ click.echo(f" ✓ {f.name}")
1174
+ except Exception as e:
1175
+ click.echo(f" ✗ {f.name}: {e}", err=True)
1176
+
1177
+ if not local_files:
1178
+ click.echo("No documents fetched successfully.", err=True)
1179
+ sys.exit(1)
1180
+
1181
+ # Set up KG
1182
+ prov = None if provider == "auto" else provider
1183
+ pm = ProviderManager(chat_model=chat_model, provider=prov)
1184
+
1185
+ if db_path:
1186
+ kg_path = Path(db_path)
1187
+ elif output:
1188
+ out_dir = Path(output)
1189
+ out_dir.mkdir(parents=True, exist_ok=True)
1190
+ kg_path = out_dir / "knowledge_graph.db"
1191
+ else:
1192
+ kg_path = Path.cwd() / "knowledge_graph.db"
1193
+
1194
+ kg_path.parent.mkdir(parents=True, exist_ok=True)
1195
+ kg = KnowledgeGraph(provider_manager=pm, db_path=kg_path)
1196
+
1197
+ total_chunks = 0
1198
+ for lf in local_files:
1199
+ try:
1200
+ count = ingest_file(lf, kg)
1201
+ total_chunks += count
1202
+ click.echo(f" Ingested {lf.stem}: {count} chunks")
1203
+ except Exception as e:
1204
+ click.echo(f" Failed to ingest {lf.stem}: {e}", err=True)
1205
+
1206
+ kg.save(kg_path)
1207
+ kg.save(kg_path.with_suffix(".json"))
1208
+
1209
+ entity_count = kg._store.get_entity_count()
1210
+ rel_count = kg._store.get_relationship_count()
1211
+
1212
+ click.echo("\nIngestion complete:")
1213
+ click.echo(f" Documents: {len(local_files)}")
1214
+ click.echo(f" Chunks: {total_chunks}")
1215
+ click.echo(f" Entities: {entity_count}")
1216
+ click.echo(f" Relationships: {rel_count}")
1217
+ click.echo(f" Knowledge graph: {kg_path}")
1218
+
1219
+
1220
+@cli.group()
1221
+def m365():
1222
+ """Microsoft 365: fetch docs from SharePoint and OneDrive via the m365 CLI."""
1223
+ pass
1224
+
1225
+
1226
+@m365.command("list")
1227
+@click.option("--web-url", type=str, required=True, help="SharePoint site URL")
1228
+@click.option("--folder-url", type=str, required=True, help="Server-relative folder URL")
1229
+@click.option("--recursive", is_flag=True, help="Include subfolders")
1230
+@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
1231
+def m365_list(web_url, folder_url, recursive, as_json):
1232
+ """List documents in SharePoint or OneDrive.
1233
+
1234
+ Examples:
1235
+
1236
+ planopticon m365 list --web-url https://contoso.sharepoint.com/sites/proj \\
1237
+ --folder-url /sites/proj/Shared\\ Documents
1238
+
1239
+ planopticon m365 list --web-url URL --folder-url FOLDER --recursive --json
1240
+ """
1241
+ from video_processor.sources.m365_source import M365Source
1242
+
1243
+ source = M365Source(web_url=web_url, folder_url=folder_url, recursive=recursive)
1244
+ if not source.authenticate():
1245
+ click.echo("Error: m365 CLI not available or not logged in.", err=True)
1246
+ click.echo("Install: npm install -g @pnp/cli-microsoft365", err=True)
1247
+ click.echo("Auth: m365 login", err=True)
1248
+ sys.exit(1)
1249
+
1250
+ files = source.list_videos()
1251
+ if as_json:
1252
+ click.echo(json.dumps([f.model_dump() for f in files], indent=2, default=str))
1253
+ else:
1254
+ if not files:
1255
+ click.echo("No documents found.")
1256
+ return
1257
+ for f in files:
1258
+ size = f"{f.size_bytes / 1024:.0f}KB" if f.size_bytes else "—"
1259
+ click.echo(f" {f.id[:12]}… {size:>8s} {f.name}")
1260
+
1261
+
1262
+@m365.command("fetch")
1263
+@click.option("--web-url", type=str, required=True, help="SharePoint site URL")
1264
+@click.option("--folder-url", type=str, default=None, help="Server-relative folder URL")
1265
+@click.option("--file-id", type=str, multiple=True, help="Specific file IDs (repeatable)")
1266
+@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
1267
+def m365_fetch(web_url, folder_url, file_id, output):
1268
+ """Fetch SharePoint/OneDrive documents as local files.
1269
+
1270
+ Examples:
1271
+
1272
+ planopticon m365 fetch --web-url URL --folder-url FOLDER -o ./docs
1273
+
1274
+ planopticon m365 fetch --web-url URL --file-id ID1 --file-id ID2 -o ./docs
1275
+ """
1276
+ from video_processor.sources.m365_source import M365Source
1277
+
1278
+ source = M365Source(web_url=web_url, folder_url=folder_url, file_ids=list(file_id))
1279
+ if not source.authenticate():
1280
+ click.echo("Error: m365 CLI not available or not logged in.", err=True)
1281
+ sys.exit(1)
1282
+
1283
+ out_dir = Path(output) if output else Path.cwd() / "m365_docs"
1284
+ out_dir.mkdir(parents=True, exist_ok=True)
1285
+
1286
+ files = source.list_videos()
1287
+ if not files:
1288
+ click.echo("No documents found.")
1289
+ return
1290
+
1291
+ for f in files:
1292
+ dest = out_dir / f.name
1293
+ try:
1294
+ source.download(f, dest)
1295
+ click.echo(f" fetched {f.name}")
1296
+ except Exception as e:
1297
+ click.echo(f" failed {f.name}: {e}", err=True)
1298
+
1299
+ click.echo(f"\nFetched {len(files)} document(s) to {out_dir}")
1300
+
1301
+
1302
+@m365.command("ingest")
1303
+@click.option("--web-url", type=str, required=True, help="SharePoint site URL")
1304
+@click.option("--folder-url", type=str, default=None, help="Server-relative folder URL")
1305
+@click.option("--file-id", type=str, multiple=True, help="Specific file IDs (repeatable)")
1306
+@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
1307
+@click.option("--db-path", type=click.Path(), default=None, help="Existing DB to merge into")
1308
+@click.option(
1309
+ "-p",
1310
+ "--provider",
1311
+ type=click.Choice(
1312
+ [
1313
+ "auto",
1314
+ "openai",
1315
+ "anthropic",
1316
+ "gemini",
1317
+ "ollama",
1318
+ "azure",
1319
+ "together",
1320
+ "fireworks",
1321
+ "cerebras",
1322
+ "xai",
1323
+ ]
1324
+ ),
1325
+ default="auto",
1326
+ help="API provider",
1327
+)
1328
+@click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks")
1329
+@click.pass_context
1330
+def m365_ingest(ctx, web_url, folder_url, file_id, output, db_path, provider, chat_model):
1331
+ """Fetch SharePoint/OneDrive docs and ingest into a knowledge graph.
1332
+
1333
+ Examples:
1334
+
1335
+ planopticon m365 ingest --web-url URL --folder-url FOLDER
1336
+
1337
+ planopticon m365 ingest --web-url URL --file-id ID1 --file-id ID2 -o ./results
1338
+ """
1339
+ import tempfile
1340
+
1341
+ from video_processor.integrators.knowledge_graph import KnowledgeGraph
1342
+ from video_processor.processors.ingest import ingest_file
1343
+ from video_processor.providers.manager import ProviderManager
1344
+ from video_processor.sources.m365_source import M365Source
1345
+
1346
+ source = M365Source(web_url=web_url, folder_url=folder_url, file_ids=list(file_id))
1347
+ if not source.authenticate():
1348
+ click.echo("Error: m365 CLI not available or not logged in.", err=True)
1349
+ click.echo("Install: npm install -g @pnp/cli-microsoft365", err=True)
1350
+ click.echo("Auth: m365 login", err=True)
1351
+ sys.exit(1)
1352
+
1353
+ files = source.list_videos()
1354
+ if not files:
1355
+ click.echo("No documents found.")
1356
+ return
1357
+
1358
+ click.echo(f"Found {len(files)} document(s), fetching...")
1359
+
1360
+ with tempfile.TemporaryDirectory() as tmp_dir:
1361
+ tmp_path = Path(tmp_dir)
1362
+ local_files = []
1363
+ for f in files:
1364
+ dest = tmp_path / f.name
1365
+ try:
1366
+ source.download(f, dest)
1367
+ # Extract text for non-text formats
1368
+ text_dest = tmp_path / f"{Path(f.name).stem}.txt"
1369
+ text = source.download_as_text(f)
1370
+ text_dest.write_text(text, encoding="utf-8")
1371
+ local_files.append(text_dest)
1372
+ click.echo(f" fetched {f.name}")
1373
+ except Exception as e:
1374
+ click.echo(f" failed {f.name}: {e}", err=True)
1375
+
1376
+ if not local_files:
1377
+ click.echo("No documents fetched successfully.", err=True)
1378
+ sys.exit(1)
1379
+
1380
+ prov = None if provider == "auto" else provider
1381
+ pm = ProviderManager(chat_model=chat_model, provider=prov)
1382
+
1383
+ if db_path:
1384
+ kg_path = Path(db_path)
1385
+ elif output:
1386
+ out_dir = Path(output)
1387
+ out_dir.mkdir(parents=True, exist_ok=True)
1388
+ kg_path = out_dir / "knowledge_graph.db"
1389
+ else:
1390
+ kg_path = Path.cwd() / "knowledge_graph.db"
1391
+
1392
+ kg_path.parent.mkdir(parents=True, exist_ok=True)
1393
+ kg = KnowledgeGraph(provider_manager=pm, db_path=kg_path)
1394
+
1395
+ total_chunks = 0
1396
+ for lf in local_files:
1397
+ try:
1398
+ count = ingest_file(lf, kg)
1399
+ total_chunks += count
1400
+ click.echo(f" Ingested {lf.stem}: {count} chunks")
1401
+ except Exception as e:
1402
+ click.echo(f" Failed to ingest {lf.stem}: {e}", err=True)
1403
+
1404
+ kg.save(kg_path)
1405
+ kg.save(kg_path.with_suffix(".json"))
1406
+
1407
+ entity_count = kg._store.get_entity_count()
1408
+ rel_count = kg._store.get_relationship_count()
1409
+
1410
+ click.echo("\nIngestion complete:")
1411
+ click.echo(f" Documents: {len(local_files)}")
1412
+ click.echo(f" Chunks: {total_chunks}")
1413
+ click.echo(f" Entities: {entity_count}")
1414
+ click.echo(f" Relationships: {rel_count}")
1415
+ click.echo(f" Knowledge graph: {kg_path}")
1416
+
10181417
10191418
@cli.group()
10201419
def kg():
10211420
"""Knowledge graph utilities: convert, sync, and inspect."""
10221421
pass
10231422
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -1013,10 +1013,409 @@
1013 click.echo("Dropbox authentication successful.")
1014 else:
1015 click.echo("Dropbox authentication failed.", err=True)
1016 sys.exit(1)
1017
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1018
1019 @cli.group()
1020 def kg():
1021 """Knowledge graph utilities: convert, sync, and inspect."""
1022 pass
1023
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -1013,10 +1013,409 @@
1013 click.echo("Dropbox authentication successful.")
1014 else:
1015 click.echo("Dropbox authentication failed.", err=True)
1016 sys.exit(1)
1017
1018
1019 @cli.group()
1020 def gws():
1021 """Google Workspace: fetch docs, sheets, and slides via the gws CLI."""
1022 pass
1023
1024
1025 @gws.command("list")
1026 @click.option("--folder-id", type=str, default=None, help="Drive folder ID to list")
1027 @click.option("--query", "-q", type=str, default=None, help="Drive search query")
1028 @click.option("--json", "as_json", is_flag=True, help="Output as JSON")
1029 def gws_list(folder_id, query, as_json):
1030 """List documents in Google Drive.
1031
1032 Examples:
1033
1034 planopticon gws list
1035
1036 planopticon gws list --folder-id 1abc...
1037
1038 planopticon gws list -q "name contains 'PRD'" --json
1039 """
1040 from video_processor.sources.gws_source import GWSSource
1041
1042 source = GWSSource(folder_id=folder_id, query=query)
1043 if not source.authenticate():
1044 click.echo("Error: gws CLI not available or not authenticated.", err=True)
1045 click.echo("Install: npm install -g @googleworkspace/cli", err=True)
1046 click.echo("Auth: gws auth login", err=True)
1047 sys.exit(1)
1048
1049 files = source.list_videos(folder_id=folder_id)
1050 if as_json:
1051 click.echo(json.dumps([f.model_dump() for f in files], indent=2, default=str))
1052 else:
1053 if not files:
1054 click.echo("No documents found.")
1055 return
1056 for f in files:
1057 size = f"{f.size_bytes / 1024:.0f}KB" if f.size_bytes else "—"
1058 click.echo(f" {f.id[:12]}… {size:>8s} {f.mime_type or ''} {f.name}")
1059
1060
1061 @gws.command("fetch")
1062 @click.argument("doc_ids", nargs=-1)
1063 @click.option("--folder-id", type=str, default=None, help="Fetch all docs in a folder")
1064 @click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
1065 def gws_fetch(doc_ids, folder_id, output):
1066 """Fetch Google Docs/Sheets/Slides as text files.
1067
1068 Examples:
1069
1070 planopticon gws fetch DOC_ID1 DOC_ID2 -o ./docs
1071
1072 planopticon gws fetch --folder-id 1abc... -o ./docs
1073 """
1074 from video_processor.sources.gws_source import GWSSource
1075
1076 source = GWSSource(folder_id=folder_id, doc_ids=list(doc_ids))
1077 if not source.authenticate():
1078 click.echo("Error: gws CLI not available or not authenticated.", err=True)
1079 sys.exit(1)
1080
1081 out_dir = Path(output) if output else Path.cwd() / "gws_docs"
1082 out_dir.mkdir(parents=True, exist_ok=True)
1083
1084 files = source.list_videos(folder_id=folder_id)
1085 if not files:
1086 click.echo("No documents found.")
1087 return
1088
1089 for f in files:
1090 safe_name = f.name.replace("/", "_").replace("\\", "_")
1091 dest = out_dir / f"{safe_name}.txt"
1092 try:
1093 source.download(f, dest)
1094 click.echo(f" ✓ {f.name} → {dest}")
1095 except Exception as e:
1096 click.echo(f" ✗ {f.name}: {e}", err=True)
1097
1098 click.echo(f"\nFetched {len(files)} document(s) to {out_dir}")
1099
1100
1101 @gws.command("ingest")
1102 @click.option("--folder-id", type=str, default=None, help="Drive folder ID")
1103 @click.option("--doc-id", type=str, multiple=True, help="Specific doc IDs (repeatable)")
1104 @click.option("--query", "-q", type=str, default=None, help="Drive search query")
1105 @click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
1106 @click.option("--db-path", type=click.Path(), default=None, help="Existing DB to merge into")
1107 @click.option(
1108 "-p",
1109 "--provider",
1110 type=click.Choice(
1111 [
1112 "auto",
1113 "openai",
1114 "anthropic",
1115 "gemini",
1116 "ollama",
1117 "azure",
1118 "together",
1119 "fireworks",
1120 "cerebras",
1121 "xai",
1122 ]
1123 ),
1124 default="auto",
1125 help="API provider",
1126 )
1127 @click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks")
1128 @click.pass_context
1129 def gws_ingest(ctx, folder_id, doc_id, query, output, db_path, provider, chat_model):
1130 """Fetch Google Workspace docs and ingest into a knowledge graph.
1131
1132 Combines gws fetch + planopticon ingest in one step.
1133
1134 Examples:
1135
1136 planopticon gws ingest --folder-id 1abc...
1137
1138 planopticon gws ingest --doc-id DOC1 --doc-id DOC2 -o ./results
1139
1140 planopticon gws ingest -q "name contains 'spec'" --db-path existing.db
1141 """
1142 import tempfile
1143
1144 from video_processor.integrators.knowledge_graph import KnowledgeGraph
1145 from video_processor.processors.ingest import ingest_file
1146 from video_processor.providers.manager import ProviderManager
1147 from video_processor.sources.gws_source import GWSSource
1148
1149 source = GWSSource(folder_id=folder_id, doc_ids=list(doc_id), query=query)
1150 if not source.authenticate():
1151 click.echo("Error: gws CLI not available or not authenticated.", err=True)
1152 click.echo("Install: npm install -g @googleworkspace/cli", err=True)
1153 click.echo("Auth: gws auth login", err=True)
1154 sys.exit(1)
1155
1156 # Fetch docs to temp dir
1157 files = source.list_videos(folder_id=folder_id)
1158 if not files:
1159 click.echo("No documents found.")
1160 return
1161
1162 click.echo(f"Found {len(files)} document(s), fetching...")
1163
1164 with tempfile.TemporaryDirectory() as tmp_dir:
1165 tmp_path = Path(tmp_dir)
1166 local_files = []
1167 for f in files:
1168 safe_name = f.name.replace("/", "_").replace("\\", "_")
1169 dest = tmp_path / f"{safe_name}.txt"
1170 try:
1171 source.download(f, dest)
1172 local_files.append(dest)
1173 click.echo(f" ✓ {f.name}")
1174 except Exception as e:
1175 click.echo(f" ✗ {f.name}: {e}", err=True)
1176
1177 if not local_files:
1178 click.echo("No documents fetched successfully.", err=True)
1179 sys.exit(1)
1180
1181 # Set up KG
1182 prov = None if provider == "auto" else provider
1183 pm = ProviderManager(chat_model=chat_model, provider=prov)
1184
1185 if db_path:
1186 kg_path = Path(db_path)
1187 elif output:
1188 out_dir = Path(output)
1189 out_dir.mkdir(parents=True, exist_ok=True)
1190 kg_path = out_dir / "knowledge_graph.db"
1191 else:
1192 kg_path = Path.cwd() / "knowledge_graph.db"
1193
1194 kg_path.parent.mkdir(parents=True, exist_ok=True)
1195 kg = KnowledgeGraph(provider_manager=pm, db_path=kg_path)
1196
1197 total_chunks = 0
1198 for lf in local_files:
1199 try:
1200 count = ingest_file(lf, kg)
1201 total_chunks += count
1202 click.echo(f" Ingested {lf.stem}: {count} chunks")
1203 except Exception as e:
1204 click.echo(f" Failed to ingest {lf.stem}: {e}", err=True)
1205
1206 kg.save(kg_path)
1207 kg.save(kg_path.with_suffix(".json"))
1208
1209 entity_count = kg._store.get_entity_count()
1210 rel_count = kg._store.get_relationship_count()
1211
1212 click.echo("\nIngestion complete:")
1213 click.echo(f" Documents: {len(local_files)}")
1214 click.echo(f" Chunks: {total_chunks}")
1215 click.echo(f" Entities: {entity_count}")
1216 click.echo(f" Relationships: {rel_count}")
1217 click.echo(f" Knowledge graph: {kg_path}")
1218
1219
1220 @cli.group()
1221 def m365():
1222 """Microsoft 365: fetch docs from SharePoint and OneDrive via the m365 CLI."""
1223 pass
1224
1225
1226 @m365.command("list")
1227 @click.option("--web-url", type=str, required=True, help="SharePoint site URL")
1228 @click.option("--folder-url", type=str, required=True, help="Server-relative folder URL")
1229 @click.option("--recursive", is_flag=True, help="Include subfolders")
1230 @click.option("--json", "as_json", is_flag=True, help="Output as JSON")
1231 def m365_list(web_url, folder_url, recursive, as_json):
1232 """List documents in SharePoint or OneDrive.
1233
1234 Examples:
1235
1236 planopticon m365 list --web-url https://contoso.sharepoint.com/sites/proj \\
1237 --folder-url /sites/proj/Shared\\ Documents
1238
1239 planopticon m365 list --web-url URL --folder-url FOLDER --recursive --json
1240 """
1241 from video_processor.sources.m365_source import M365Source
1242
1243 source = M365Source(web_url=web_url, folder_url=folder_url, recursive=recursive)
1244 if not source.authenticate():
1245 click.echo("Error: m365 CLI not available or not logged in.", err=True)
1246 click.echo("Install: npm install -g @pnp/cli-microsoft365", err=True)
1247 click.echo("Auth: m365 login", err=True)
1248 sys.exit(1)
1249
1250 files = source.list_videos()
1251 if as_json:
1252 click.echo(json.dumps([f.model_dump() for f in files], indent=2, default=str))
1253 else:
1254 if not files:
1255 click.echo("No documents found.")
1256 return
1257 for f in files:
1258 size = f"{f.size_bytes / 1024:.0f}KB" if f.size_bytes else "—"
1259 click.echo(f" {f.id[:12]}… {size:>8s} {f.name}")
1260
1261
@m365.command("fetch")
@click.option("--web-url", type=str, required=True, help="SharePoint site URL")
@click.option("--folder-url", type=str, default=None, help="Server-relative folder URL")
@click.option("--file-id", type=str, multiple=True, help="Specific file IDs (repeatable)")
@click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
def m365_fetch(web_url, folder_url, file_id, output):
    """Fetch SharePoint/OneDrive documents as local files.

    Examples:

        planopticon m365 fetch --web-url URL --folder-url FOLDER -o ./docs

        planopticon m365 fetch --web-url URL --file-id ID1 --file-id ID2 -o ./docs
    """
    from video_processor.sources.m365_source import M365Source

    source = M365Source(web_url=web_url, folder_url=folder_url, file_ids=list(file_id))
    if not source.authenticate():
        click.echo("Error: m365 CLI not available or not logged in.", err=True)
        sys.exit(1)

    # Default to ./m365_docs when no explicit output directory is given.
    out_dir = Path(output) if output else Path.cwd() / "m365_docs"
    out_dir.mkdir(parents=True, exist_ok=True)

    files = source.list_videos()
    if not files:
        click.echo("No documents found.")
        return

    for f in files:
        # Sanitize the remote name so a path separator embedded in it cannot
        # escape out_dir (same treatment as the gws fetch command).
        safe_name = f.name.replace("/", "_").replace("\\", "_")
        dest = out_dir / safe_name
        try:
            source.download(f, dest)
            click.echo(f" fetched {f.name}")
        except Exception as e:
            # Best-effort: report the failure and continue with the next file.
            click.echo(f" failed {f.name}: {e}", err=True)

    click.echo(f"\nFetched {len(files)} document(s) to {out_dir}")
1300
1301
1302 @m365.command("ingest")
1303 @click.option("--web-url", type=str, required=True, help="SharePoint site URL")
1304 @click.option("--folder-url", type=str, default=None, help="Server-relative folder URL")
1305 @click.option("--file-id", type=str, multiple=True, help="Specific file IDs (repeatable)")
1306 @click.option("-o", "--output", type=click.Path(), default=None, help="Output directory")
1307 @click.option("--db-path", type=click.Path(), default=None, help="Existing DB to merge into")
1308 @click.option(
1309 "-p",
1310 "--provider",
1311 type=click.Choice(
1312 [
1313 "auto",
1314 "openai",
1315 "anthropic",
1316 "gemini",
1317 "ollama",
1318 "azure",
1319 "together",
1320 "fireworks",
1321 "cerebras",
1322 "xai",
1323 ]
1324 ),
1325 default="auto",
1326 help="API provider",
1327 )
1328 @click.option("--chat-model", type=str, default=None, help="Override model for LLM/chat tasks")
1329 @click.pass_context
1330 def m365_ingest(ctx, web_url, folder_url, file_id, output, db_path, provider, chat_model):
1331 """Fetch SharePoint/OneDrive docs and ingest into a knowledge graph.
1332
1333 Examples:
1334
1335 planopticon m365 ingest --web-url URL --folder-url FOLDER
1336
1337 planopticon m365 ingest --web-url URL --file-id ID1 --file-id ID2 -o ./results
1338 """
1339 import tempfile
1340
1341 from video_processor.integrators.knowledge_graph import KnowledgeGraph
1342 from video_processor.processors.ingest import ingest_file
1343 from video_processor.providers.manager import ProviderManager
1344 from video_processor.sources.m365_source import M365Source
1345
1346 source = M365Source(web_url=web_url, folder_url=folder_url, file_ids=list(file_id))
1347 if not source.authenticate():
1348 click.echo("Error: m365 CLI not available or not logged in.", err=True)
1349 click.echo("Install: npm install -g @pnp/cli-microsoft365", err=True)
1350 click.echo("Auth: m365 login", err=True)
1351 sys.exit(1)
1352
1353 files = source.list_videos()
1354 if not files:
1355 click.echo("No documents found.")
1356 return
1357
1358 click.echo(f"Found {len(files)} document(s), fetching...")
1359
1360 with tempfile.TemporaryDirectory() as tmp_dir:
1361 tmp_path = Path(tmp_dir)
1362 local_files = []
1363 for f in files:
1364 dest = tmp_path / f.name
1365 try:
1366 source.download(f, dest)
1367 # Extract text for non-text formats
1368 text_dest = tmp_path / f"{Path(f.name).stem}.txt"
1369 text = source.download_as_text(f)
1370 text_dest.write_text(text, encoding="utf-8")
1371 local_files.append(text_dest)
1372 click.echo(f" fetched {f.name}")
1373 except Exception as e:
1374 click.echo(f" failed {f.name}: {e}", err=True)
1375
1376 if not local_files:
1377 click.echo("No documents fetched successfully.", err=True)
1378 sys.exit(1)
1379
1380 prov = None if provider == "auto" else provider
1381 pm = ProviderManager(chat_model=chat_model, provider=prov)
1382
1383 if db_path:
1384 kg_path = Path(db_path)
1385 elif output:
1386 out_dir = Path(output)
1387 out_dir.mkdir(parents=True, exist_ok=True)
1388 kg_path = out_dir / "knowledge_graph.db"
1389 else:
1390 kg_path = Path.cwd() / "knowledge_graph.db"
1391
1392 kg_path.parent.mkdir(parents=True, exist_ok=True)
1393 kg = KnowledgeGraph(provider_manager=pm, db_path=kg_path)
1394
1395 total_chunks = 0
1396 for lf in local_files:
1397 try:
1398 count = ingest_file(lf, kg)
1399 total_chunks += count
1400 click.echo(f" Ingested {lf.stem}: {count} chunks")
1401 except Exception as e:
1402 click.echo(f" Failed to ingest {lf.stem}: {e}", err=True)
1403
1404 kg.save(kg_path)
1405 kg.save(kg_path.with_suffix(".json"))
1406
1407 entity_count = kg._store.get_entity_count()
1408 rel_count = kg._store.get_relationship_count()
1409
1410 click.echo("\nIngestion complete:")
1411 click.echo(f" Documents: {len(local_files)}")
1412 click.echo(f" Chunks: {total_chunks}")
1413 click.echo(f" Entities: {entity_count}")
1414 click.echo(f" Relationships: {rel_count}")
1415 click.echo(f" Knowledge graph: {kg_path}")
1416
1417
1418 @cli.group()
1419 def kg():
1420 """Knowledge graph utilities: convert, sync, and inspect."""
1421 pass
1422
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -11,10 +11,12 @@
1111
"HackerNewsSource",
1212
"PodcastSource",
1313
"RedditSource",
1414
"RSSSource",
1515
"TwitterSource",
16
+ "GWSSource",
17
+ "M365Source",
1618
"WebSource",
1719
"YouTubeSource",
1820
]
1921
2022
@@ -22,10 +24,12 @@
2224
"""Lazy imports to avoid pulling in optional dependencies at import time."""
2325
_lazy_map = {
2426
"ArxivSource": "video_processor.sources.arxiv_source",
2527
"GitHubSource": "video_processor.sources.github_source",
2628
"GoogleDriveSource": "video_processor.sources.google_drive",
29
+ "GWSSource": "video_processor.sources.gws_source",
30
+ "M365Source": "video_processor.sources.m365_source",
2731
"HackerNewsSource": "video_processor.sources.hackernews_source",
2832
"PodcastSource": "video_processor.sources.podcast_source",
2933
"RedditSource": "video_processor.sources.reddit_source",
3034
"RSSSource": "video_processor.sources.rss_source",
3135
"TwitterSource": "video_processor.sources.twitter_source",
3236
3337
ADDED video_processor/sources/gws_source.py
3438
ADDED video_processor/sources/m365_source.py
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -11,10 +11,12 @@
11 "HackerNewsSource",
12 "PodcastSource",
13 "RedditSource",
14 "RSSSource",
15 "TwitterSource",
16 "GWSSource",
17 "M365Source",
16 "WebSource",
17 "YouTubeSource",
18 ]
19
20
@@ -22,10 +24,12 @@
22 """Lazy imports to avoid pulling in optional dependencies at import time."""
23 _lazy_map = {
24 "ArxivSource": "video_processor.sources.arxiv_source",
25 "GitHubSource": "video_processor.sources.github_source",
26 "GoogleDriveSource": "video_processor.sources.google_drive",
 
 
27 "HackerNewsSource": "video_processor.sources.hackernews_source",
28 "PodcastSource": "video_processor.sources.podcast_source",
29 "RedditSource": "video_processor.sources.reddit_source",
30 "RSSSource": "video_processor.sources.rss_source",
31 "TwitterSource": "video_processor.sources.twitter_source",
32
33 ADDED video_processor/sources/gws_source.py
34 ADDED video_processor/sources/m365_source.py
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -11,10 +11,12 @@
11 "HackerNewsSource",
12 "PodcastSource",
13 "RedditSource",
14 "RSSSource",
15 "TwitterSource",
16 "GWSSource",
17 "M365Source",
18 "WebSource",
19 "YouTubeSource",
20 ]
21
22
@@ -22,10 +24,12 @@
24 """Lazy imports to avoid pulling in optional dependencies at import time."""
25 _lazy_map = {
26 "ArxivSource": "video_processor.sources.arxiv_source",
27 "GitHubSource": "video_processor.sources.github_source",
28 "GoogleDriveSource": "video_processor.sources.google_drive",
29 "GWSSource": "video_processor.sources.gws_source",
30 "M365Source": "video_processor.sources.m365_source",
31 "HackerNewsSource": "video_processor.sources.hackernews_source",
32 "PodcastSource": "video_processor.sources.podcast_source",
33 "RedditSource": "video_processor.sources.reddit_source",
34 "RSSSource": "video_processor.sources.rss_source",
35 "TwitterSource": "video_processor.sources.twitter_source",
36
37 ADDED video_processor/sources/gws_source.py
38 ADDED video_processor/sources/m365_source.py
--- a/video_processor/sources/gws_source.py
+++ b/video_processor/sources/gws_source.py
@@ -0,0 +1,268 @@
1
+"""Google Workspace source connector using the gws CLI (googleworkspace/cli).
2
+
3
+Fetches and collates Google Docs, Sheets, Slides, and other Drive files
4
+via the `gws` CLI tool. Outputs plain text suitable for KG ingestion.
5
+
6
+Requires: npm install -g @googleworkspace/cli
7
+Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless)
8
+"""
9
+
10
+import json
11
+import logging
12
+import shutil
13
+import subprocess
14
+from pathlib import Path
15
+from typing import Any, Dict, List, Optional
16
+
17
+from video_processor.sources.base import BaseSource, SourceFile
18
+
19
+logger = logging.getLogger(__name__)
20
+
21
+# Google Workspace MIME types we can extract text from
22
+_DOC_MIMES = {
23
+ "application/vnd.google-apps.document",
24
+ "application/vnd.google-apps.spreadsheet",
25
+ "application/vnd.google-apps.presentation",
26
+ "application/pdf",
27
+ "text/plain",
28
+ "text/markdown",
29
+ "text/html",
30
+}
31
+
32
+# Export MIME mappings for native Google formats
33
+_EXPORT_MIMES = {
34
+ "application/vnd.google-apps.document": "text/plain",
35
+ "application/vnd.google-apps.spreadsheet": "text/csv",
36
+ "application/vnd.google-apps.presentation": "text/plain",
37
+}
38
+
39
+
40
+def _run_gws(args: List[str], timeout: int = 30) -> Dict[str, Any]:
41
+ """Run a gws CLI command and return parsed JSON output."""
42
+ cmd = ["gws"] + args
43
+ proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
44
+ if proc.returncode != 0:
45
+ raise RuntimeError(f"gws {' '.join(args)} failed: {proc.stderr.strip()}")
46
+ try:
47
+ return json.loads(proc.stdout)
48
+ except json.JSONDecodeError:
49
+ return {"raw": proc.stdout.strip()}
50
+
51
+
52
+class GWSSource(BaseSource):
53
+ """
54
+ Fetch documents from Google Workspace (Drive, Docs, Sheets, Slides) via gws CLI.
55
+
56
+ Usage:
57
+ source = GWSSource(folder_id="1abc...") # specific Drive folder
58
+ source = GWSSource(query="type:document") # Drive search query
59
+ files = source.list_videos() # lists docs, not just videos
60
+ source.download_all(files, Path("./docs"))
61
+ """
62
+
63
+ def __init__(
64
+ self,
65
+ folder_id: Optional[str] = None,
66
+ query: Optional[str] = None,
67
+ doc_ids: Optional[List[str]] = None,
68
+ mime_filter: Optional[List[str]] = None,
69
+ ):
70
+ self.folder_id = folder_id
71
+ self.query = query
72
+ self.doc_ids = doc_ids or []
73
+ self.mime_filter = set(mime_filter) if mime_filter else _DOC_MIMES
74
+
75
+ def authenticate(self) -> bool:
76
+ """Check if gws CLI is installed and authenticated."""
77
+ if not shutil.which("gws"):
78
+ logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
79
+ return False
80
+ try:
81
+ _run_gws(["auth", "status"], timeout=10)
82
+ return True
83
+ except (RuntimeError, subprocess.TimeoutExpired):
84
+ logger.error("gws not authenticated. Run: gws auth login")
85
+ return False
86
+
87
+ def list_videos(
88
+ self,
89
+ folder_id: Optional[str] = None,
90
+ folder_path: Optional[str] = None,
91
+ patterns: Optional[List[str]] = None,
92
+ ) -> List[SourceFile]:
93
+ """List documents in Drive. Despite the method name, returns docs not just videos."""
94
+ folder = folder_id or self.folder_id
95
+ files: List[SourceFile] = []
96
+
97
+ # If specific doc IDs were provided, fetch metadata for each
98
+ if self.doc_ids:
99
+ for doc_id in self.doc_ids:
100
+ try:
101
+ result = _run_gws(
102
+ [
103
+ "drive",
104
+ "files",
105
+ "get",
106
+ "--params",
107
+ json.dumps(
108
+ {"fileId": doc_id, "fields": "id,name,mimeType,size,modifiedTime"}
109
+ ),
110
+ ]
111
+ )
112
+ files.append(_result_to_source_file(result))
113
+ except RuntimeError as e:
114
+ logger.warning(f"Failed to fetch doc {doc_id}: {e}")
115
+ return files
116
+
117
+ # Build Drive files list query
118
+ params: Dict[str, Any] = {
119
+ "pageSize": 100,
120
+ "fields": "files(id,name,mimeType,size,modifiedTime)",
121
+ }
122
+
123
+ q_parts = []
124
+ if folder:
125
+ q_parts.append(f"'{folder}' in parents")
126
+ if self.query:
127
+ q_parts.append(self.query)
128
+ # Filter to document types
129
+ mime_clauses = [f"mimeType='{m}'" for m in self.mime_filter]
130
+ if mime_clauses:
131
+ q_parts.append(f"({' or '.join(mime_clauses)})")
132
+ if q_parts:
133
+ params["q"] = " and ".join(q_parts)
134
+
135
+ try:
136
+ result = _run_gws(
137
+ [
138
+ "drive",
139
+ "files",
140
+ "list",
141
+ "--params",
142
+ json.dumps(params),
143
+ ],
144
+ timeout=60,
145
+ )
146
+ except RuntimeError as e:
147
+ logger.error(f"Failed to list Drive files: {e}")
148
+ return []
149
+
150
+ for item in result.get("files", []):
151
+ files.append(_result_to_source_file(item))
152
+
153
+ logger.info(f"Found {len(files)} document(s) in Google Drive")
154
+ return files
155
+
156
+ def download(self, file: SourceFile, destination: Path) -> Path:
157
+ """Download/export a document to a local text file."""
158
+ destination = Path(destination)
159
+ destination.parent.mkdir(parents=True, exist_ok=True)
160
+
161
+ mime = file.mime_type or ""
162
+
163
+ # Native Google format — export as text
164
+ if mime in _EXPORT_MIMES:
165
+ content = self._export_doc(file.id, mime)
166
+ # Regular file — download directly
167
+ else:
168
+ content = self._download_file(file.id)
169
+
170
+ destination.write_text(content, encoding="utf-8")
171
+ logger.info(f"Saved {file.name} to {destination}")
172
+ return destination
173
+
174
+ def _export_doc(self, file_id: str, source_mime: str) -> str:
175
+ """Export a native Google doc to text via gws."""
176
+ export_mime = _EXPORT_MIMES.get(source_mime, "text/plain")
177
+ try:
178
+ result = _run_gws(
179
+ [
180
+ "drive",
181
+ "files",
182
+ "export",
183
+ "--params",
184
+ json.dumps({"fileId": file_id, "mimeType": export_mime}),
185
+ ],
186
+ timeout=60,
187
+ )
188
+ return result.get("raw", json.dumps(result, indent=2))
189
+ except RuntimeError:
190
+ # Fallback: try getting via Docs API for Google Docs
191
+ if source_mime == "application/vnd.google-apps.document":
192
+ return self._get_doc_text(file_id)
193
+ raise
194
+
195
+ def _get_doc_text(self, doc_id: str) -> str:
196
+ """Fetch Google Doc content via the Docs API and extract text."""
197
+ result = _run_gws(
198
+ [
199
+ "docs",
200
+ "documents",
201
+ "get",
202
+ "--params",
203
+ json.dumps({"documentId": doc_id}),
204
+ ],
205
+ timeout=60,
206
+ )
207
+
208
+ # Extract text from the Docs API structural response
209
+ body = result.get("body", {})
210
+ content_parts = []
211
+ for element in body.get("content", []):
212
+ paragraph = element.get("paragraph", {})
213
+ for pe in paragraph.get("elements", []):
214
+ text_run = pe.get("textRun", {})
215
+ text = text_run.get("content", "")
216
+ if text.strip():
217
+ content_parts.append(text)
218
+
219
+ return "".join(content_parts) if content_parts else json.dumps(result, indent=2)
220
+
221
+ def _download_file(self, file_id: str) -> str:
222
+ """Download a non-native file's content."""
223
+ result = _run_gws(
224
+ [
225
+ "drive",
226
+ "files",
227
+ "get",
228
+ "--params",
229
+ json.dumps({"fileId": file_id, "alt": "media"}),
230
+ ],
231
+ timeout=60,
232
+ )
233
+ return result.get("raw", json.dumps(result, indent=2))
234
+
235
+ def fetch_all_text(self, folder_id: Optional[str] = None) -> Dict[str, str]:
236
+ """Convenience: list all docs and return {filename: text_content} dict."""
237
+ files = self.list_videos(folder_id=folder_id)
238
+ results = {}
239
+ for f in files:
240
+ try:
241
+ if f.mime_type and f.mime_type in _EXPORT_MIMES:
242
+ results[f.name] = self._export_doc(f.id, f.mime_type)
243
+ else:
244
+ results[f.name] = self._download_file(f.id)
245
+ except Exception as e:
246
+ logger.warning(f"Failed to fetch {f.name}: {e}")
247
+ results[f.name] = f"[Error: {e}]"
248
+ return results
249
+
250
+ def collate(self, folder_id: Optional[str] = None, separator: str = "\n\n---\n\n") -> str:
251
+ """Fetch all docs and collate into a single text blob for ingestion."""
252
+ docs = self.fetch_all_text(folder_id=folder_id)
253
+ parts = []
254
+ for name, content in docs.items():
255
+ parts.append(f"# {name}\n\n{content}")
256
+ return separator.join(parts)
257
+
258
+
259
+def _result_to_source_file(item: dict) -> SourceFile:
260
+ """Convert a Drive API file result to SourceFile."""
261
+ size = item.get("size")
262
+ return SourceFile(
263
+ name=item.get("name", "Untitled"),
264
+ id=item.get("id", ""),
265
+ size_bytes=int(size) if size else None,
266
+ mime_type=item.get("mimeType"),
267
+ modified_at=item.get("modifiedTime"),
268
+ )
--- a/video_processor/sources/gws_source.py
+++ b/video_processor/sources/gws_source.py
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/gws_source.py
+++ b/video_processor/sources/gws_source.py
@@ -0,0 +1,268 @@
1 """Google Workspace source connector using the gws CLI (googleworkspace/cli).
2
3 Fetches and collates Google Docs, Sheets, Slides, and other Drive files
4 via the `gws` CLI tool. Outputs plain text suitable for KG ingestion.
5
6 Requires: npm install -g @googleworkspace/cli
7 Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless)
8 """
9
10 import json
11 import logging
12 import shutil
13 import subprocess
14 from pathlib import Path
15 from typing import Any, Dict, List, Optional
16
17 from video_processor.sources.base import BaseSource, SourceFile
18
19 logger = logging.getLogger(__name__)
20
21 # Google Workspace MIME types we can extract text from
22 _DOC_MIMES = {
23 "application/vnd.google-apps.document",
24 "application/vnd.google-apps.spreadsheet",
25 "application/vnd.google-apps.presentation",
26 "application/pdf",
27 "text/plain",
28 "text/markdown",
29 "text/html",
30 }
31
32 # Export MIME mappings for native Google formats
33 _EXPORT_MIMES = {
34 "application/vnd.google-apps.document": "text/plain",
35 "application/vnd.google-apps.spreadsheet": "text/csv",
36 "application/vnd.google-apps.presentation": "text/plain",
37 }
38
39
def _run_gws(args: List[str], timeout: int = 30) -> Dict[str, Any]:
    """Run a gws CLI command and return its parsed JSON output.

    Args:
        args: Arguments passed to the ``gws`` executable,
            e.g. ``["drive", "files", "list"]``.
        timeout: Seconds to wait before the subprocess is aborted.

    Returns:
        The decoded JSON payload, or ``{"raw": <stdout>}`` when the CLI
        emits non-JSON output (e.g. plain-text document exports).

    Raises:
        RuntimeError: If the CLI is not installed or exits non-zero.
        subprocess.TimeoutExpired: If the command exceeds *timeout*.
    """
    cmd = ["gws"] + args
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except FileNotFoundError as e:
        # Normalize a missing binary to RuntimeError so callers only need to
        # catch one failure type (authenticate() catches RuntimeError, not
        # FileNotFoundError).
        raise RuntimeError(
            "gws CLI not found. Install with: npm install -g @googleworkspace/cli"
        ) from e
    if proc.returncode != 0:
        raise RuntimeError(f"gws {' '.join(args)} failed: {proc.stderr.strip()}")
    try:
        return json.loads(proc.stdout)
    except json.JSONDecodeError:
        # Non-JSON output (e.g. exported document text) is wrapped under "raw".
        return {"raw": proc.stdout.strip()}
50
51
class GWSSource(BaseSource):
    """
    Fetch documents from Google Workspace (Drive, Docs, Sheets, Slides) via gws CLI.

    Usage:
        source = GWSSource(folder_id="1abc...")  # specific Drive folder
        source = GWSSource(query="type:document")  # Drive search query
        files = source.list_videos()  # lists docs, not just videos
        source.download_all(files, Path("./docs"))
    """

    def __init__(
        self,
        folder_id: Optional[str] = None,
        query: Optional[str] = None,
        doc_ids: Optional[List[str]] = None,
        mime_filter: Optional[List[str]] = None,
    ):
        # folder_id: restrict listing to this Drive folder.
        # query: extra Drive search clause ANDed into the listing query.
        # doc_ids: explicit file IDs to fetch instead of listing a folder.
        # mime_filter: MIME types to include when listing; defaults to _DOC_MIMES.
        self.folder_id = folder_id
        self.query = query
        self.doc_ids = doc_ids or []
        # Copy the module-level default so mutating self.mime_filter on one
        # instance can never corrupt _DOC_MIMES for every other instance
        # (the original assigned the shared set object directly).
        self.mime_filter = set(mime_filter) if mime_filter else set(_DOC_MIMES)

    def authenticate(self) -> bool:
        """Check if gws CLI is installed and authenticated."""
        if not shutil.which("gws"):
            logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
            return False
        try:
            _run_gws(["auth", "status"], timeout=10)
            return True
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("gws not authenticated. Run: gws auth login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List documents in Drive. Despite the method name, returns docs not just videos.

        folder_id overrides self.folder_id; folder_path and patterns are
        accepted for interface compatibility but unused here.
        """
        folder = folder_id or self.folder_id
        files: List[SourceFile] = []

        # If specific doc IDs were provided, fetch metadata for each and skip listing.
        if self.doc_ids:
            for doc_id in self.doc_ids:
                try:
                    result = _run_gws(
                        [
                            "drive",
                            "files",
                            "get",
                            "--params",
                            json.dumps(
                                {"fileId": doc_id, "fields": "id,name,mimeType,size,modifiedTime"}
                            ),
                        ]
                    )
                    files.append(_result_to_source_file(result))
                except RuntimeError as e:
                    logger.warning(f"Failed to fetch doc {doc_id}: {e}")
            return files

        # Build Drive files list query
        params: Dict[str, Any] = {
            "pageSize": 100,
            "fields": "files(id,name,mimeType,size,modifiedTime)",
        }

        q_parts = []
        if folder:
            q_parts.append(f"'{folder}' in parents")
        if self.query:
            q_parts.append(self.query)
        # Filter to document types
        mime_clauses = [f"mimeType='{m}'" for m in self.mime_filter]
        if mime_clauses:
            q_parts.append(f"({' or '.join(mime_clauses)})")
        if q_parts:
            params["q"] = " and ".join(q_parts)

        try:
            result = _run_gws(
                [
                    "drive",
                    "files",
                    "list",
                    "--params",
                    json.dumps(params),
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.error(f"Failed to list Drive files: {e}")
            return []

        for item in result.get("files", []):
            files.append(_result_to_source_file(item))

        logger.info(f"Found {len(files)} document(s) in Google Drive")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download/export a document to a local text file.

        NOTE(review): content is always written as UTF-8 text; non-text
        payloads (e.g. raw PDFs fetched via alt=media) will not round-trip
        as binary — confirm whether binary downloads are needed here.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        mime = file.mime_type or ""

        # Native Google format — export as text
        if mime in _EXPORT_MIMES:
            content = self._export_doc(file.id, mime)
        # Regular file — download directly
        else:
            content = self._download_file(file.id)

        destination.write_text(content, encoding="utf-8")
        logger.info(f"Saved {file.name} to {destination}")
        return destination

    def _export_doc(self, file_id: str, source_mime: str) -> str:
        """Export a native Google doc to text via gws."""
        export_mime = _EXPORT_MIMES.get(source_mime, "text/plain")
        try:
            result = _run_gws(
                [
                    "drive",
                    "files",
                    "export",
                    "--params",
                    json.dumps({"fileId": file_id, "mimeType": export_mime}),
                ],
                timeout=60,
            )
            return result.get("raw", json.dumps(result, indent=2))
        except RuntimeError:
            # Fallback: try getting via Docs API for Google Docs
            if source_mime == "application/vnd.google-apps.document":
                return self._get_doc_text(file_id)
            raise

    def _get_doc_text(self, doc_id: str) -> str:
        """Fetch Google Doc content via the Docs API and extract text."""
        result = _run_gws(
            [
                "docs",
                "documents",
                "get",
                "--params",
                json.dumps({"documentId": doc_id}),
            ],
            timeout=60,
        )

        # Extract text from the Docs API structural response: body.content is a
        # list of structural elements; paragraph text lives in textRun nodes.
        body = result.get("body", {})
        content_parts = []
        for element in body.get("content", []):
            paragraph = element.get("paragraph", {})
            for pe in paragraph.get("elements", []):
                text_run = pe.get("textRun", {})
                text = text_run.get("content", "")
                if text.strip():
                    content_parts.append(text)

        # Fall back to the raw JSON so the caller still gets something useful.
        return "".join(content_parts) if content_parts else json.dumps(result, indent=2)

    def _download_file(self, file_id: str) -> str:
        """Download a non-native file's content via alt=media."""
        result = _run_gws(
            [
                "drive",
                "files",
                "get",
                "--params",
                json.dumps({"fileId": file_id, "alt": "media"}),
            ],
            timeout=60,
        )
        return result.get("raw", json.dumps(result, indent=2))

    def fetch_all_text(self, folder_id: Optional[str] = None) -> Dict[str, str]:
        """Convenience: list all docs and return {filename: text_content} dict.

        Per-file failures are captured as "[Error: ...]" values rather than
        aborting the whole fetch.
        """
        files = self.list_videos(folder_id=folder_id)
        results = {}
        for f in files:
            try:
                if f.mime_type and f.mime_type in _EXPORT_MIMES:
                    results[f.name] = self._export_doc(f.id, f.mime_type)
                else:
                    results[f.name] = self._download_file(f.id)
            except Exception as e:
                logger.warning(f"Failed to fetch {f.name}: {e}")
                results[f.name] = f"[Error: {e}]"
        return results

    def collate(self, folder_id: Optional[str] = None, separator: str = "\n\n---\n\n") -> str:
        """Fetch all docs and collate into a single text blob for ingestion."""
        docs = self.fetch_all_text(folder_id=folder_id)
        parts = []
        for name, content in docs.items():
            parts.append(f"# {name}\n\n{content}")
        return separator.join(parts)
257
258
def _result_to_source_file(item: dict) -> SourceFile:
    """Build a SourceFile from a Drive API file resource dict."""
    raw_size = item.get("size")
    # Drive reports size as a string; absent/empty means size is unknown.
    size_bytes = int(raw_size) if raw_size else None
    return SourceFile(
        name=item.get("name", "Untitled"),
        id=item.get("id", ""),
        size_bytes=size_bytes,
        mime_type=item.get("mimeType"),
        modified_at=item.get("modifiedTime"),
    )
--- a/video_processor/sources/m365_source.py
+++ b/video_processor/sources/m365_source.py
@@ -0,0 +1,310 @@
1
+"""Microsoft 365 source connector using the m365 CLI (cli-microsoft365).
2
+
3
+Fetches documents from SharePoint and OneDrive via the `m365` CLI tool.
4
+Outputs plain text suitable for KG ingestion.
5
+
6
+Requires: npm install -g @pnp/cli-microsoft365
7
+Auth: m365 login (interactive)
8
+Docs: https://pnp.github.io/cli-microsoft365/
9
+"""
10
+
11
+import json
12
+import logging
13
+import shutil
14
+import subprocess
15
+import tempfile
16
+from pathlib import Path
17
+from typing import Any, Dict, List, Optional
18
+
19
+from video_processor.sources.base import BaseSource, SourceFile
20
+
21
+logger = logging.getLogger(__name__)
22
+
23
+# Document MIME types we can extract text from
24
+_DOC_EXTENSIONS = {
25
+ ".docx",
26
+ ".doc",
27
+ ".xlsx",
28
+ ".xls",
29
+ ".pptx",
30
+ ".ppt",
31
+ ".pdf",
32
+ ".txt",
33
+ ".md",
34
+ ".csv",
35
+ ".html",
36
+ ".htm",
37
+}
38
+
39
+
40
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Run an m365 CLI command and return parsed JSON output.

    Appends ``--output json`` so the CLI emits machine-readable results.

    Args:
        args: Arguments passed to the ``m365`` executable.
        timeout: Seconds to wait before the subprocess is aborted.

    Returns:
        The decoded JSON value (dict, list, ...), or the raw stdout string
        when the output is not valid JSON.

    Raises:
        RuntimeError: If the CLI is not installed or exits non-zero.
        subprocess.TimeoutExpired: If the command exceeds *timeout*.
    """
    cmd = ["m365"] + args + ["--output", "json"]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except FileNotFoundError as e:
        # Normalize a missing binary to RuntimeError so callers only need to
        # catch one failure type (authenticate() catches RuntimeError, not
        # FileNotFoundError).
        raise RuntimeError(
            "m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365"
        ) from e
    if proc.returncode != 0:
        raise RuntimeError(f"m365 {' '.join(args)} failed: {proc.stderr.strip()}")
    try:
        return json.loads(proc.stdout)
    except json.JSONDecodeError:
        return proc.stdout.strip()
50
+
51
+
52
class M365Source(BaseSource):
    """
    Fetch documents from SharePoint Online and OneDrive via the m365 CLI.

    Usage:
        # SharePoint site
        source = M365Source(
            web_url="https://contoso.sharepoint.com/sites/project-x",
            folder_url="/sites/project-x/Shared Documents"
        )

        # OneDrive
        source = M365Source(
            web_url="https://contoso-my.sharepoint.com/personal/user_contoso_com",
            folder_url="/personal/user_contoso_com/Documents"
        )

        files = source.list_videos()
        source.download_all(files, Path("./docs"))
    """

    def __init__(
        self,
        web_url: str,
        folder_url: Optional[str] = None,
        file_ids: Optional[List[str]] = None,
        recursive: bool = False,
    ):
        # web_url: SharePoint site (or OneDrive personal site) URL.
        # folder_url: server-relative folder to list; required unless file_ids given.
        # file_ids: explicit file IDs to fetch instead of listing a folder.
        # recursive: descend into subfolders when listing.
        self.web_url = web_url
        self.folder_url = folder_url
        self.file_ids = file_ids or []
        self.recursive = recursive

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in."""
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
            # m365 status returns connection info when logged in
            if isinstance(result, dict) and result.get("connectedAs"):
                return True
            if isinstance(result, str) and "Logged in" in result:
                return True
            logger.error("m365 not logged in. Run: m365 login")
            return False
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List documents in SharePoint/OneDrive. Returns docs, not just videos.

        folder_path overrides self.folder_url; folder_id and patterns are
        accepted for interface compatibility but unused here.
        """
        files: List[SourceFile] = []

        # Fetch specific files by ID
        if self.file_ids:
            for fid in self.file_ids:
                try:
                    result = _run_m365(
                        [
                            "spo",
                            "file",
                            "get",
                            "--webUrl",
                            self.web_url,
                            "--id",
                            fid,
                        ]
                    )
                    files.append(_result_to_source_file(result))
                except RuntimeError as e:
                    logger.warning(f"Failed to get file {fid}: {e}")
            return files

        # List files in folder
        folder = folder_path or self.folder_url
        if not folder:
            logger.error("No folder URL specified. Use --folder-url or folder_path parameter.")
            return []

        try:
            # BUG FIX: the "spo" command group was missing here, so the CLI was
            # invoked as `m365 file list ...`, which is not a valid command
            # (every other call in this class correctly uses `spo file ...`).
            args = [
                "spo",
                "file",
                "list",
                "--webUrl",
                self.web_url,
                "--folderUrl",
                folder,
            ]
            if self.recursive:
                args.append("--recursive")

            result = _run_m365(args, timeout=60)
        except RuntimeError as e:
            logger.error(f"Failed to list files: {e}")
            return []

        items = result if isinstance(result, list) else []
        for item in items:
            name = item.get("Name", item.get("name", ""))
            ext = Path(name).suffix.lower()
            # Keep only extensions we know how to turn into text.
            if ext in _DOC_EXTENSIONS:
                files.append(_result_to_source_file(item))

        logger.info(f"Found {len(files)} document(s) in {folder}")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a file from SharePoint/OneDrive to *destination*."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        args = [
            "spo",
            "file",
            "get",
            "--webUrl",
            self.web_url,
            "--asFile",
            "--path",
            str(destination),
        ]

        # Use URL if available in path field, otherwise use ID
        if file.path:
            args.extend(["--url", file.path])
        else:
            args.extend(["--id", file.id])

        _run_m365(args, timeout=120)
        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def download_as_text(self, file: SourceFile) -> str:
        """Download a file and attempt to extract text content.

        Text-based formats are fetched directly as a string; binary formats
        are downloaded to a temp file and passed to _extract_text().
        """
        # For text-based formats, get as string directly
        text_exts = {".txt", ".md", ".csv", ".html", ".htm"}
        ext = Path(file.name).suffix.lower()

        if ext in text_exts:
            try:
                args = [
                    "spo",
                    "file",
                    "get",
                    "--webUrl",
                    self.web_url,
                    "--asString",
                ]
                if file.path:
                    args.extend(["--url", file.path])
                else:
                    args.extend(["--id", file.id])

                result = _run_m365(args, timeout=60)
                return result if isinstance(result, str) else json.dumps(result)
            except RuntimeError:
                # Fall through to the binary download path below.
                pass

        # For binary formats, download to temp and extract
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp_path = Path(tmp.name)

        try:
            self.download(file, tmp_path)
            return _extract_text(tmp_path)
        finally:
            # Always remove the temp file, even if download/extraction failed.
            tmp_path.unlink(missing_ok=True)

    def fetch_all_text(self) -> Dict[str, str]:
        """List all docs and return {filename: text_content} dict.

        Per-file failures are captured as "[Error: ...]" values rather than
        aborting the whole fetch.
        """
        files = self.list_videos()
        results = {}
        for f in files:
            try:
                results[f.name] = self.download_as_text(f)
            except Exception as e:
                logger.warning(f"Failed to fetch {f.name}: {e}")
                results[f.name] = f"[Error: {e}]"
        return results

    def collate(self, separator: str = "\n\n---\n\n") -> str:
        """Fetch all docs and collate into a single text blob for ingestion."""
        docs = self.fetch_all_text()
        parts = []
        for name, content in docs.items():
            parts.append(f"# {name}\n\n{content}")
        return separator.join(parts)
246
+
247
+
248
def _result_to_source_file(item: dict) -> SourceFile:
    """Build a SourceFile from an m365 CLI file result dict."""
    # The CLI mixes SharePoint (PascalCase) and Graph (camelCase) field
    # names depending on the command, so probe both spellings.
    name = item.get("Name", item.get("name", "Untitled"))
    file_id = item.get("UniqueId", item.get("uniqueId", item.get("id", "")))
    raw_size = item.get("Length", item.get("length", item.get("size")))
    server_path = item.get("ServerRelativeUrl", item.get("serverRelativeUrl"))
    modified = item.get("TimeLastModified", item.get("lastModifiedDateTime"))

    return SourceFile(
        name=name,
        id=str(file_id),
        size_bytes=int(raw_size) if raw_size else None,
        mime_type=None,
        modified_at=modified,
        path=server_path,
    )
264
+
265
+
266
+def _extract_text(path: Path) -> str:
267
+ """Best-effort text extraction from a downloaded file."""
268
+ ext = path.suffix.lower()
269
+
270
+ if ext in {".txt", ".md", ".csv"}:
271
+ return path.read_text(encoding="utf-8", errors="replace")
272
+
273
+ if ext in {".html", ".htm"}:
274
+ from video_processor.sources.web_source import _strip_html_tags
275
+
276
+ return _strip_html_tags(path.read_text(encoding="utf-8", errors="replace"))
277
+
278
+ if ext == ".pdf":
279
+ try:
280
+ import fitz # pymupdf
281
+
282
+ doc = fitz.open(str(path))
283
+ return "\n\n".join(page.get_text() for page in doc)
284
+ except ImportError:
285
+ return f"[PDF file: {path.name} — install pymupdf to extract text]"
286
+
287
+ if ext in {".docx", ".pptx", ".xlsx"}:
288
+ # Try python-docx / openpyxl / python-pptx if available
289
+ try:
290
+ if ext == ".docx":
291
+ from docx import Document
292
+
293
+ doc = Document(str(path))
294
+ return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip())
295
+ elif ext == ".xlsx":
296
+ import openpyxl
297
+
298
+ wb = openpyxl.load_workbook(str(path), read_only=True)
299
+ rows = []
300
+ for sheet in wb.sheetnames:
301
+ ws = wb[sheet]
302
+ for row in ws.iter_rows(values_only=True):
303
+ cells = [str(c) if c is not None else "" for c in row]
304
+ if any(cells):
305
+ rows.append("\t".join(cells))
306
+ return "\n".join(rows)
307
+ except ImportError:
308
+ return f"[{ext} file: {path.name} — install python-docx/openpyxl to extract text]"
309
+
310
+ return f"[Unsupported format: {path.name}]"
--- a/video_processor/sources/m365_source.py
+++ b/video_processor/sources/m365_source.py
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/m365_source.py
+++ b/video_processor/sources/m365_source.py
@@ -0,0 +1,310 @@
1 """Microsoft 365 source connector using the m365 CLI (cli-microsoft365).
2
3 Fetches documents from SharePoint and OneDrive via the `m365` CLI tool.
4 Outputs plain text suitable for KG ingestion.
5
6 Requires: npm install -g @pnp/cli-microsoft365
7 Auth: m365 login (interactive)
8 Docs: https://pnp.github.io/cli-microsoft365/
9 """
10
11 import json
12 import logging
13 import shutil
14 import subprocess
15 import tempfile
16 from pathlib import Path
17 from typing import Any, Dict, List, Optional
18
19 from video_processor.sources.base import BaseSource, SourceFile
20
21 logger = logging.getLogger(__name__)
22
# File extensions (not MIME types) we can download and attempt to extract
# text from — see _extract_text; some formats need optional third-party libs.
_DOC_EXTENSIONS = {
    ".docx",
    ".doc",
    ".xlsx",
    ".xls",
    ".pptx",
    ".ppt",
    ".pdf",
    ".txt",
    ".md",
    ".csv",
    ".html",
    ".htm",
}
38
39
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Run an m365 CLI command and return parsed JSON output.

    Appends ``--output json`` to every invocation; raises RuntimeError with
    the CLI's stderr when the command exits non-zero. Falls back to the raw
    stdout string when the output is not valid JSON.
    """
    full_cmd = ["m365", *args, "--output", "json"]
    completed = subprocess.run(full_cmd, capture_output=True, text=True, timeout=timeout)
    if completed.returncode != 0:
        raise RuntimeError(f"m365 {' '.join(args)} failed: {completed.stderr.strip()}")
    try:
        return json.loads(completed.stdout)
    except json.JSONDecodeError:
        # Some commands emit plain text even with --output json requested.
        return completed.stdout.strip()
50
51
class M365Source(BaseSource):
    """
    Fetch documents from SharePoint Online and OneDrive via the m365 CLI.

    Usage:
        # SharePoint site
        source = M365Source(
            web_url="https://contoso.sharepoint.com/sites/project-x",
            folder_url="/sites/project-x/Shared Documents"
        )

        # OneDrive
        source = M365Source(
            web_url="https://contoso-my.sharepoint.com/personal/user_contoso_com",
            folder_url="/personal/user_contoso_com/Documents"
        )

        files = source.list_videos()
        source.download_all(files, Path("./docs"))
    """

    def __init__(
        self,
        web_url: str,
        folder_url: Optional[str] = None,
        file_ids: Optional[List[str]] = None,
        recursive: bool = False,
    ):
        # web_url: the SharePoint site (or OneDrive personal site) URL.
        # folder_url: server-relative folder to list; file_ids takes priority.
        self.web_url = web_url
        self.folder_url = folder_url
        self.file_ids = file_ids or []
        self.recursive = recursive

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in.

        Returns True only when the binary is on PATH and `m365 status`
        reports an active login; logs an actionable error otherwise.
        """
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
            # m365 status returns connection info (JSON) when logged in;
            # older versions may emit a plain "Logged in ..." string.
            if isinstance(result, dict) and result.get("connectedAs"):
                return True
            if isinstance(result, str) and "Logged in" in result:
                return True
            logger.error("m365 not logged in. Run: m365 login")
            return False
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List documents in SharePoint/OneDrive. Returns docs, not just videos.

        If file_ids were given at construction they are fetched individually
        and the folder listing is skipped entirely. Otherwise the folder
        (folder_path argument, falling back to self.folder_url) is listed and
        filtered to known document extensions.
        """
        files: List[SourceFile] = []

        # Fetch specific files by ID
        if self.file_ids:
            for fid in self.file_ids:
                try:
                    result = _run_m365(
                        [
                            "spo",
                            "file",
                            "get",
                            "--webUrl",
                            self.web_url,
                            "--id",
                            fid,
                        ]
                    )
                    files.append(_result_to_source_file(result))
                except RuntimeError as e:
                    logger.warning(f"Failed to get file {fid}: {e}")
            return files

        # List files in folder
        folder = folder_path or self.folder_url
        if not folder:
            logger.error("No folder URL specified. Use --folder-url or folder_path parameter.")
            return []

        try:
            # Full command is `m365 spo file list`; the "spo" group was
            # previously missing here, so every folder listing failed.
            args = [
                "spo",
                "file",
                "list",
                "--webUrl",
                self.web_url,
                "--folderUrl",
                folder,
            ]
            if self.recursive:
                args.append("--recursive")

            result = _run_m365(args, timeout=60)
        except RuntimeError as e:
            logger.error(f"Failed to list files: {e}")
            return []

        items = result if isinstance(result, list) else []
        for item in items:
            # m365 output casing varies by command/version (PascalCase vs camelCase).
            name = item.get("Name", item.get("name", ""))
            ext = Path(name).suffix.lower()
            if ext in _DOC_EXTENSIONS:
                files.append(_result_to_source_file(item))

        logger.info(f"Found {len(files)} document(s) in {folder}")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a file from SharePoint/OneDrive to *destination*.

        Raises RuntimeError (from _run_m365) if the CLI call fails.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        args = [
            "spo",
            "file",
            "get",
            "--webUrl",
            self.web_url,
            "--asFile",
            "--path",
            str(destination),
        ]

        # Use URL if available in path field, otherwise use ID
        if file.path:
            args.extend(["--url", file.path])
        else:
            args.extend(["--id", file.id])

        _run_m365(args, timeout=120)
        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def download_as_text(self, file: SourceFile) -> str:
        """Download a file and attempt to extract text content.

        Text-based formats are fetched as a string directly via --asString;
        binary formats are downloaded to a temp file and run through
        _extract_text. The temp file is always cleaned up.
        """
        # For text-based formats, get as string directly
        text_exts = {".txt", ".md", ".csv", ".html", ".htm"}
        ext = Path(file.name).suffix.lower()

        if ext in text_exts:
            try:
                args = [
                    "spo",
                    "file",
                    "get",
                    "--webUrl",
                    self.web_url,
                    "--asString",
                ]
                if file.path:
                    args.extend(["--url", file.path])
                else:
                    args.extend(["--id", file.id])

                result = _run_m365(args, timeout=60)
                return result if isinstance(result, str) else json.dumps(result)
            except RuntimeError:
                # Fall through to the binary download path below.
                pass

        # For binary formats, download to temp and extract
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp_path = Path(tmp.name)

        try:
            self.download(file, tmp_path)
            return _extract_text(tmp_path)
        finally:
            tmp_path.unlink(missing_ok=True)

    def fetch_all_text(self) -> Dict[str, str]:
        """List all docs and return {filename: text_content} dict.

        Best-effort: per-file failures are logged and recorded as an
        "[Error: ...]" value instead of aborting the whole fetch.
        NOTE(review): files with identical names in different folders
        overwrite each other in the returned dict — confirm acceptable.
        """
        files = self.list_videos()
        results = {}
        for f in files:
            try:
                results[f.name] = self.download_as_text(f)
            except Exception as e:
                logger.warning(f"Failed to fetch {f.name}: {e}")
                results[f.name] = f"[Error: {e}]"
        return results

    def collate(self, separator: str = "\n\n---\n\n") -> str:
        """Fetch all docs and collate into a single text blob for ingestion."""
        docs = self.fetch_all_text()
        parts = []
        for name, content in docs.items():
            parts.append(f"# {name}\n\n{content}")
        return separator.join(parts)
246
247
def _result_to_source_file(item: dict) -> SourceFile:
    """Convert an m365 file result to SourceFile.

    Handles both PascalCase (SPO REST) and camelCase (Graph-style) keys,
    since m365 output casing varies by command and version.
    """
    name = item.get("Name", item.get("name", "Untitled"))
    file_id = item.get("UniqueId", item.get("uniqueId", item.get("id", "")))
    size = item.get("Length", item.get("length", item.get("size")))
    path = item.get("ServerRelativeUrl", item.get("serverRelativeUrl"))
    modified = item.get("TimeLastModified", item.get("lastModifiedDateTime"))

    return SourceFile(
        name=name,
        id=str(file_id),
        # Explicit None/empty check (not truthiness): a legitimate 0-byte
        # file used to be reported as size-unknown (None). Length may also
        # arrive as a numeric string, which int() accepts.
        size_bytes=int(size) if size not in (None, "") else None,
        mime_type=None,  # file listings don't carry a MIME type
        modified_at=modified,
        path=path,
    )
264
265
266 def _extract_text(path: Path) -> str:
267 """Best-effort text extraction from a downloaded file."""
268 ext = path.suffix.lower()
269
270 if ext in {".txt", ".md", ".csv"}:
271 return path.read_text(encoding="utf-8", errors="replace")
272
273 if ext in {".html", ".htm"}:
274 from video_processor.sources.web_source import _strip_html_tags
275
276 return _strip_html_tags(path.read_text(encoding="utf-8", errors="replace"))
277
278 if ext == ".pdf":
279 try:
280 import fitz # pymupdf
281
282 doc = fitz.open(str(path))
283 return "\n\n".join(page.get_text() for page in doc)
284 except ImportError:
285 return f"[PDF file: {path.name} — install pymupdf to extract text]"
286
287 if ext in {".docx", ".pptx", ".xlsx"}:
288 # Try python-docx / openpyxl / python-pptx if available
289 try:
290 if ext == ".docx":
291 from docx import Document
292
293 doc = Document(str(path))
294 return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip())
295 elif ext == ".xlsx":
296 import openpyxl
297
298 wb = openpyxl.load_workbook(str(path), read_only=True)
299 rows = []
300 for sheet in wb.sheetnames:
301 ws = wb[sheet]
302 for row in ws.iter_rows(values_only=True):
303 cells = [str(c) if c is not None else "" for c in row]
304 if any(cells):
305 rows.append("\t".join(cells))
306 return "\n".join(rows)
307 except ImportError:
308 return f"[{ext} file: {path.name} — install python-docx/openpyxl to extract text]"
309
310 return f"[Unsupported format: {path.name}]"

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button