"""Microsoft 365 source connector using the m365 CLI (cli-microsoft365).

Fetches documents from SharePoint and OneDrive via the `m365` CLI tool.
Outputs plain text suitable for KG ingestion.

Requires: npm install -g @pnp/cli-microsoft365
Auth: m365 login (interactive)
Docs: https://pnp.github.io/cli-microsoft365/
"""

import json
import logging
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional

from video_processor.sources.base import BaseSource, SourceFile

logger = logging.getLogger(__name__)

23
|
# Document MIME types we can extract text from |
|
24
|
_DOC_EXTENSIONS = { |
|
25
|
".docx", |
|
26
|
".doc", |
|
27
|
".xlsx", |
|
28
|
".xls", |
|
29
|
".pptx", |
|
30
|
".ppt", |
|
31
|
".pdf", |
|
32
|
".txt", |
|
33
|
".md", |
|
34
|
".csv", |
|
35
|
".html", |
|
36
|
".htm", |
|
37
|
} |
|
38
|
|
|
39
|
|
|
40
|
def _run_m365(args: List[str], timeout: int = 30) -> Any: |
|
41
|
"""Run an m365 CLI command and return parsed JSON output.""" |
|
42
|
cmd = ["m365"] + args + ["--output", "json"] |
|
43
|
proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) |
|
44
|
if proc.returncode != 0: |
|
45
|
raise RuntimeError(f"m365 {' '.join(args)} failed: {proc.stderr.strip()}") |
|
46
|
try: |
|
47
|
return json.loads(proc.stdout) |
|
48
|
except json.JSONDecodeError: |
|
49
|
return proc.stdout.strip() |
|
50
|
|
|
51
|
|
|
52
|
class M365Source(BaseSource): |
|
53
|
""" |
|
54
|
Fetch documents from SharePoint Online and OneDrive via the m365 CLI. |
|
55
|
|
|
56
|
Usage: |
|
57
|
# SharePoint site |
|
58
|
source = M365Source( |
|
59
|
web_url="https://contoso.sharepoint.com/sites/project-x", |
|
60
|
folder_url="/sites/project-x/Shared Documents" |
|
61
|
) |
|
62
|
|
|
63
|
# OneDrive |
|
64
|
source = M365Source( |
|
65
|
web_url="https://contoso-my.sharepoint.com/personal/user_contoso_com", |
|
66
|
folder_url="/personal/user_contoso_com/Documents" |
|
67
|
) |
|
68
|
|
|
69
|
files = source.list_videos() |
|
70
|
source.download_all(files, Path("./docs")) |
|
71
|
""" |
|
72
|
|
|
73
|
def __init__( |
|
74
|
self, |
|
75
|
web_url: str, |
|
76
|
folder_url: Optional[str] = None, |
|
77
|
file_ids: Optional[List[str]] = None, |
|
78
|
recursive: bool = False, |
|
79
|
): |
|
80
|
self.web_url = web_url |
|
81
|
self.folder_url = folder_url |
|
82
|
self.file_ids = file_ids or [] |
|
83
|
self.recursive = recursive |
|
84
|
|
|
85
|
def authenticate(self) -> bool: |
|
86
|
"""Check if m365 CLI is installed and logged in.""" |
|
87
|
if not shutil.which("m365"): |
|
88
|
logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365") |
|
89
|
return False |
|
90
|
try: |
|
91
|
result = _run_m365(["status"], timeout=10) |
|
92
|
# m365 status returns connection info when logged in |
|
93
|
if isinstance(result, dict) and result.get("connectedAs"): |
|
94
|
return True |
|
95
|
if isinstance(result, str) and "Logged in" in result: |
|
96
|
return True |
|
97
|
logger.error("m365 not logged in. Run: m365 login") |
|
98
|
return False |
|
99
|
except (RuntimeError, subprocess.TimeoutExpired): |
|
100
|
logger.error("m365 not logged in. Run: m365 login") |
|
101
|
return False |
|
102
|
|
|
103
|
def list_videos( |
|
104
|
self, |
|
105
|
folder_id: Optional[str] = None, |
|
106
|
folder_path: Optional[str] = None, |
|
107
|
patterns: Optional[List[str]] = None, |
|
108
|
) -> List[SourceFile]: |
|
109
|
"""List documents in SharePoint/OneDrive. Returns docs, not just videos.""" |
|
110
|
files: List[SourceFile] = [] |
|
111
|
|
|
112
|
# Fetch specific files by ID |
|
113
|
if self.file_ids: |
|
114
|
for fid in self.file_ids: |
|
115
|
try: |
|
116
|
result = _run_m365( |
|
117
|
[ |
|
118
|
"spo", |
|
119
|
"file", |
|
120
|
"get", |
|
121
|
"--webUrl", |
|
122
|
self.web_url, |
|
123
|
"--id", |
|
124
|
fid, |
|
125
|
] |
|
126
|
) |
|
127
|
files.append(_result_to_source_file(result)) |
|
128
|
except RuntimeError as e: |
|
129
|
logger.warning(f"Failed to get file {fid}: {e}") |
|
130
|
return files |
|
131
|
|
|
132
|
# List files in folder |
|
133
|
folder = folder_path or self.folder_url |
|
134
|
if not folder: |
|
135
|
logger.error("No folder URL specified. Use --folder-url or folder_path parameter.") |
|
136
|
return [] |
|
137
|
|
|
138
|
try: |
|
139
|
args = [ |
|
140
|
"file", |
|
141
|
"list", |
|
142
|
"--webUrl", |
|
143
|
self.web_url, |
|
144
|
"--folderUrl", |
|
145
|
folder, |
|
146
|
] |
|
147
|
if self.recursive: |
|
148
|
args.append("--recursive") |
|
149
|
|
|
150
|
result = _run_m365(args, timeout=60) |
|
151
|
except RuntimeError as e: |
|
152
|
logger.error(f"Failed to list files: {e}") |
|
153
|
return [] |
|
154
|
|
|
155
|
items = result if isinstance(result, list) else [] |
|
156
|
for item in items: |
|
157
|
name = item.get("Name", item.get("name", "")) |
|
158
|
ext = Path(name).suffix.lower() |
|
159
|
if ext in _DOC_EXTENSIONS: |
|
160
|
files.append(_result_to_source_file(item)) |
|
161
|
|
|
162
|
logger.info(f"Found {len(files)} document(s) in {folder}") |
|
163
|
return files |
|
164
|
|
|
165
|
def download(self, file: SourceFile, destination: Path) -> Path: |
|
166
|
"""Download a file from SharePoint/OneDrive.""" |
|
167
|
destination = Path(destination) |
|
168
|
destination.parent.mkdir(parents=True, exist_ok=True) |
|
169
|
|
|
170
|
args = [ |
|
171
|
"spo", |
|
172
|
"file", |
|
173
|
"get", |
|
174
|
"--webUrl", |
|
175
|
self.web_url, |
|
176
|
"--asFile", |
|
177
|
"--path", |
|
178
|
str(destination), |
|
179
|
] |
|
180
|
|
|
181
|
# Use URL if available in path field, otherwise use ID |
|
182
|
if file.path: |
|
183
|
args.extend(["--url", file.path]) |
|
184
|
else: |
|
185
|
args.extend(["--id", file.id]) |
|
186
|
|
|
187
|
_run_m365(args, timeout=120) |
|
188
|
logger.info(f"Downloaded {file.name} to {destination}") |
|
189
|
return destination |
|
190
|
|
|
191
|
def download_as_text(self, file: SourceFile) -> str: |
|
192
|
"""Download a file and attempt to extract text content.""" |
|
193
|
# For text-based formats, get as string directly |
|
194
|
text_exts = {".txt", ".md", ".csv", ".html", ".htm"} |
|
195
|
ext = Path(file.name).suffix.lower() |
|
196
|
|
|
197
|
if ext in text_exts: |
|
198
|
try: |
|
199
|
args = [ |
|
200
|
"spo", |
|
201
|
"file", |
|
202
|
"get", |
|
203
|
"--webUrl", |
|
204
|
self.web_url, |
|
205
|
"--asString", |
|
206
|
] |
|
207
|
if file.path: |
|
208
|
args.extend(["--url", file.path]) |
|
209
|
else: |
|
210
|
args.extend(["--id", file.id]) |
|
211
|
|
|
212
|
result = _run_m365(args, timeout=60) |
|
213
|
return result if isinstance(result, str) else json.dumps(result) |
|
214
|
except RuntimeError: |
|
215
|
pass |
|
216
|
|
|
217
|
# For binary formats, download to temp and extract |
|
218
|
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: |
|
219
|
tmp_path = Path(tmp.name) |
|
220
|
|
|
221
|
try: |
|
222
|
self.download(file, tmp_path) |
|
223
|
return _extract_text(tmp_path) |
|
224
|
finally: |
|
225
|
tmp_path.unlink(missing_ok=True) |
|
226
|
|
|
227
|
def fetch_all_text(self) -> Dict[str, str]: |
|
228
|
"""List all docs and return {filename: text_content} dict.""" |
|
229
|
files = self.list_videos() |
|
230
|
results = {} |
|
231
|
for f in files: |
|
232
|
try: |
|
233
|
results[f.name] = self.download_as_text(f) |
|
234
|
except Exception as e: |
|
235
|
logger.warning(f"Failed to fetch {f.name}: {e}") |
|
236
|
results[f.name] = f"[Error: {e}]" |
|
237
|
return results |
|
238
|
|
|
239
|
def collate(self, separator: str = "\n\n---\n\n") -> str: |
|
240
|
"""Fetch all docs and collate into a single text blob for ingestion.""" |
|
241
|
docs = self.fetch_all_text() |
|
242
|
parts = [] |
|
243
|
for name, content in docs.items(): |
|
244
|
parts.append(f"# {name}\n\n{content}") |
|
245
|
return separator.join(parts) |
|
246
|
|
|
247
|
|
|
248
|
def _result_to_source_file(item: dict) -> SourceFile: |
|
249
|
"""Convert an m365 file result to SourceFile.""" |
|
250
|
name = item.get("Name", item.get("name", "Untitled")) |
|
251
|
file_id = item.get("UniqueId", item.get("uniqueId", item.get("id", ""))) |
|
252
|
size = item.get("Length", item.get("length", item.get("size"))) |
|
253
|
path = item.get("ServerRelativeUrl", item.get("serverRelativeUrl")) |
|
254
|
modified = item.get("TimeLastModified", item.get("lastModifiedDateTime")) |
|
255
|
|
|
256
|
return SourceFile( |
|
257
|
name=name, |
|
258
|
id=str(file_id), |
|
259
|
size_bytes=int(size) if size else None, |
|
260
|
mime_type=None, |
|
261
|
modified_at=modified, |
|
262
|
path=path, |
|
263
|
) |
|
264
|
|
|
265
|
|
|
266
|
def _extract_text(path: Path) -> str: |
|
267
|
"""Best-effort text extraction from a downloaded file.""" |
|
268
|
ext = path.suffix.lower() |
|
269
|
|
|
270
|
if ext in {".txt", ".md", ".csv"}: |
|
271
|
return path.read_text(encoding="utf-8", errors="replace") |
|
272
|
|
|
273
|
if ext in {".html", ".htm"}: |
|
274
|
from video_processor.sources.web_source import _strip_html_tags |
|
275
|
|
|
276
|
return _strip_html_tags(path.read_text(encoding="utf-8", errors="replace")) |
|
277
|
|
|
278
|
if ext == ".pdf": |
|
279
|
try: |
|
280
|
import fitz # pymupdf |
|
281
|
|
|
282
|
doc = fitz.open(str(path)) |
|
283
|
return "\n\n".join(page.get_text() for page in doc) |
|
284
|
except ImportError: |
|
285
|
return f"[PDF file: {path.name} — install pymupdf to extract text]" |
|
286
|
|
|
287
|
if ext in {".docx", ".pptx", ".xlsx"}: |
|
288
|
# Try python-docx / openpyxl / python-pptx if available |
|
289
|
try: |
|
290
|
if ext == ".docx": |
|
291
|
from docx import Document |
|
292
|
|
|
293
|
doc = Document(str(path)) |
|
294
|
return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip()) |
|
295
|
elif ext == ".xlsx": |
|
296
|
import openpyxl |
|
297
|
|
|
298
|
wb = openpyxl.load_workbook(str(path), read_only=True) |
|
299
|
rows = [] |
|
300
|
for sheet in wb.sheetnames: |
|
301
|
ws = wb[sheet] |
|
302
|
for row in ws.iter_rows(values_only=True): |
|
303
|
cells = [str(c) if c is not None else "" for c in row] |
|
304
|
if any(cells): |
|
305
|
rows.append("\t".join(cells)) |
|
306
|
return "\n".join(rows) |
|
307
|
except ImportError: |
|
308
|
return f"[{ext} file: {path.name} — install python-docx/openpyxl to extract text]" |
|
309
|
|
|
310
|
return f"[Unsupported format: {path.name}]" |
|
311
|
|