PlanOpticon

planopticon / video_processor / sources / m365_source.py
Blame History Raw 311 lines
1
"""Microsoft 365 source connector using the m365 CLI (cli-microsoft365).
2
3
Fetches documents from SharePoint and OneDrive via the `m365` CLI tool.
4
Outputs plain text suitable for KG ingestion.
5
6
Requires: npm install -g @pnp/cli-microsoft365
7
Auth: m365 login (interactive)
8
Docs: https://pnp.github.io/cli-microsoft365/
9
"""
10
11
import json
12
import logging
13
import shutil
14
import subprocess
15
import tempfile
16
from pathlib import Path
17
from typing import Any, Dict, List, Optional
18
19
from video_processor.sources.base import BaseSource, SourceFile
20
21
logger = logging.getLogger(__name__)
22
23
# File extensions (lower-case, including the leading dot) treated as documents
# when listing a folder. These are extensions, not MIME types: the listing
# filter compares them against ``Path(name).suffix.lower()``.
# Frozen so the shared module-level constant cannot be mutated by accident.
_DOC_EXTENSIONS = frozenset(
    {
        ".docx",
        ".doc",
        ".xlsx",
        ".xls",
        ".pptx",
        ".ppt",
        ".pdf",
        ".txt",
        ".md",
        ".csv",
        ".html",
        ".htm",
    }
)
38
39
40
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Run an m365 CLI command and return its output.

    The command is invoked as ``m365 <args> --output json`` without a shell.

    Args:
        args: CLI arguments that follow the ``m365`` executable name.
        timeout: Seconds to wait for the process before giving up.

    Returns:
        The parsed JSON value when stdout is valid JSON, otherwise stdout as
        a stripped string (e.g. raw file content or a plain status message).

    Raises:
        RuntimeError: If the ``m365`` executable cannot be found or the
            command exits with a non-zero status.
        subprocess.TimeoutExpired: If the command exceeds ``timeout``.
    """
    cmd = ["m365", *args, "--output", "json"]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except FileNotFoundError as err:
        # Surface a missing binary as the same exception type callers already
        # handle for CLI failures instead of leaking a bare OSError.
        raise RuntimeError(
            "m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365"
        ) from err

    if proc.returncode != 0:
        # Fall back to stdout so the message is not empty when stderr is.
        detail = proc.stderr.strip() or proc.stdout.strip()
        raise RuntimeError(f"m365 {' '.join(args)} failed: {detail}")

    try:
        return json.loads(proc.stdout)
    except json.JSONDecodeError:
        # Not JSON (plain text output): hand back the raw text.
        return proc.stdout.strip()
50
51
52
class M365Source(BaseSource):
    """
    Fetch documents from SharePoint Online and OneDrive via the m365 CLI.

    Usage:
        # SharePoint site
        source = M365Source(
            web_url="https://contoso.sharepoint.com/sites/project-x",
            folder_url="/sites/project-x/Shared Documents"
        )

        # OneDrive
        source = M365Source(
            web_url="https://contoso-my.sharepoint.com/personal/user_contoso_com",
            folder_url="/personal/user_contoso_com/Documents"
        )

        files = source.list_videos()
        source.download_all(files, Path("./docs"))
    """

    def __init__(
        self,
        web_url: str,
        folder_url: Optional[str] = None,
        file_ids: Optional[List[str]] = None,
        recursive: bool = False,
    ):
        """Store the connection settings; no CLI call is made here.

        Args:
            web_url: Absolute URL of the site, passed to the CLI as ``--webUrl``.
            folder_url: Default folder to list, passed as ``--folderUrl``.
            file_ids: Explicit file IDs to fetch. When non-empty,
                ``list_videos`` fetches exactly these and ignores the folder.
            recursive: Whether folder listings pass ``--recursive``.
        """
        self.web_url = web_url
        self.folder_url = folder_url
        self.file_ids = file_ids or []
        self.recursive = recursive

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in.

        Returns:
            True when ``m365 status`` reports an active login, False when the
            CLI is missing, not logged in, fails, or times out.
        """
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False

        try:
            result = _run_m365(["status"], timeout=10)
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

        # m365 status returns connection info when logged in; accept both the
        # JSON form and the plain-text form.
        if isinstance(result, dict) and result.get("connectedAs"):
            return True
        if isinstance(result, str) and "Logged in" in result:
            return True

        logger.error("m365 not logged in. Run: m365 login")
        return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List documents in SharePoint/OneDrive. Returns docs, not just videos.

        Args:
            folder_id: Accepted for signature compatibility; not used here.
            folder_path: Folder to list; overrides the ``folder_url`` given
                at construction time.
            patterns: Accepted for signature compatibility; not used here.
                Folder results are filtered by ``_DOC_EXTENSIONS`` instead.

        Returns:
            The files found. CLI failures and timeouts are logged and yield
            an empty (or partial, in ``file_ids`` mode) list, never an
            exception.
        """
        files: List[SourceFile] = []

        # Fetch specific files by ID
        if self.file_ids:
            for fid in self.file_ids:
                try:
                    result = _run_m365(
                        ["spo", "file", "get", "--webUrl", self.web_url, "--id", fid]
                    )
                except (RuntimeError, subprocess.TimeoutExpired) as e:
                    logger.warning("Failed to get file %s: %s", fid, e)
                    continue
                if not isinstance(result, dict):
                    # _run_m365 returns a plain string when stdout is not
                    # JSON; there is no metadata to convert in that case.
                    logger.warning("Failed to get file %s: unexpected CLI output", fid)
                    continue
                files.append(_result_to_source_file(result))
            return files

        # List files in folder
        folder = folder_path or self.folder_url
        if not folder:
            logger.error("No folder URL specified. Use --folder-url or folder_path parameter.")
            return []

        args = ["file", "list", "--webUrl", self.web_url, "--folderUrl", folder]
        if self.recursive:
            args.append("--recursive")

        try:
            result = _run_m365(args, timeout=60)
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            # A timeout is treated like any other listing failure, matching
            # how authenticate() handles it.
            logger.error("Failed to list files: %s", e)
            return []

        items = result if isinstance(result, list) else []
        for item in items:
            if not isinstance(item, dict):
                continue
            # Both capitalised and lower-case key spellings are accepted.
            name = item.get("Name", item.get("name", "")) or ""
            if Path(name).suffix.lower() in _DOC_EXTENSIONS:
                files.append(_result_to_source_file(item))

        logger.info("Found %d document(s) in %s", len(files), folder)
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a file from SharePoint/OneDrive.

        Args:
            file: File to fetch; addressed by ``file.path`` when set,
                otherwise by ``file.id``.
            destination: Local path to write to. Parent directories are
                created if missing.

        Returns:
            ``destination`` as a ``Path``.

        Raises:
            RuntimeError: If the CLI command fails.
            subprocess.TimeoutExpired: If the download exceeds 120 seconds.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        args = [
            "spo",
            "file",
            "get",
            "--webUrl",
            self.web_url,
            "--asFile",
            "--path",
            str(destination),
        ]

        # Use URL if available in path field, otherwise use ID
        if file.path:
            args.extend(["--url", file.path])
        else:
            args.extend(["--id", file.id])

        _run_m365(args, timeout=120)
        logger.info("Downloaded %s to %s", file.name, destination)
        return destination

    def download_as_text(self, file: SourceFile) -> str:
        """Download a file and attempt to extract text content.

        Text-like files are first requested inline (``--asString``). If that
        fails, or for any other extension, the file is downloaded to a
        temporary path, passed to ``_extract_text`` and the temporary file
        is removed afterwards.

        Raises:
            RuntimeError: If the fallback download fails.
            subprocess.TimeoutExpired: If the fallback download times out.
        """
        # For text-based formats, get as string directly
        text_exts = {".txt", ".md", ".csv", ".html", ".htm"}
        ext = Path(file.name).suffix.lower()

        if ext in text_exts:
            args = ["spo", "file", "get", "--webUrl", self.web_url, "--asString"]
            if file.path:
                args.extend(["--url", file.path])
            else:
                args.extend(["--id", file.id])

            try:
                result = _run_m365(args, timeout=60)
            except (RuntimeError, subprocess.TimeoutExpired) as e:
                # Best effort: record why the inline fetch failed, then fall
                # through to the download-and-extract path below.
                logger.debug(
                    "Inline fetch of %s failed (%s); falling back to download", file.name, e
                )
            else:
                # _run_m365 parses stdout as JSON when it can, so content
                # that happens to be valid JSON comes back as an object and
                # is re-serialised here.
                return result if isinstance(result, str) else json.dumps(result)

        # For binary formats, download to temp and extract
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp_path = Path(tmp.name)

        try:
            self.download(file, tmp_path)
            return _extract_text(tmp_path)
        finally:
            tmp_path.unlink(missing_ok=True)

    def fetch_all_text(self) -> Dict[str, str]:
        """List all docs and return {filename: text_content} dict.

        A file whose name is already present in the result (possible with
        recursive listings) is stored under its server path, or under
        ``"<name> (<id>)"`` when it has no path, so it does not overwrite
        the earlier entry. A file that cannot be fetched maps to an
        ``"[Error: ...]"`` placeholder instead of aborting the run.
        """
        results: Dict[str, str] = {}
        for f in self.list_videos():
            key = f.name
            if key in results:
                # Same filename seen before: disambiguate rather than drop
                # the earlier document.
                key = f.path or f"{f.name} ({f.id})"
            try:
                results[key] = self.download_as_text(f)
            except Exception as e:
                # Deliberately broad: one bad document must not stop the rest.
                logger.warning("Failed to fetch %s: %s", f.name, e)
                results[key] = f"[Error: {e}]"
        return results

    def collate(self, separator: str = "\n\n---\n\n") -> str:
        """Fetch all docs and collate into a single text blob for ingestion.

        Each document becomes ``"# <name>\\n\\n<content>"``; documents are
        joined with ``separator``.
        """
        docs = self.fetch_all_text()
        return separator.join(f"# {name}\n\n{content}" for name, content in docs.items())
246
247
248
def _result_to_source_file(item: dict) -> SourceFile:
    """Convert an m365 file result to SourceFile.

    Both capitalised keys (``Name``, ``UniqueId``, ``Length``,
    ``ServerRelativeUrl``, ``TimeLastModified``) and lower-case keys
    (``name``, ``id``, ``size``, ``lastModifiedDateTime``, ...) are accepted.

    Args:
        item: One file record as returned by the CLI.

    Returns:
        A ``SourceFile`` with ``mime_type`` left as ``None``. ``size_bytes``
        is ``None`` when the size is missing or not a valid integer.
    """
    name = item.get("Name", item.get("name", "Untitled"))
    file_id = item.get("UniqueId", item.get("uniqueId", item.get("id", "")))
    raw_size = item.get("Length", item.get("length", item.get("size")))
    path = item.get("ServerRelativeUrl", item.get("serverRelativeUrl"))
    modified = item.get("TimeLastModified", item.get("lastModifiedDateTime"))

    # Test against None/"" rather than truthiness so an integer size of 0
    # (an empty file) is reported as 0 instead of "unknown"; a malformed
    # value degrades to None rather than raising.
    size_bytes: Optional[int] = None
    if raw_size is not None and raw_size != "":
        try:
            size_bytes = int(raw_size)
        except (TypeError, ValueError):
            size_bytes = None

    return SourceFile(
        name=name,
        id=str(file_id),
        size_bytes=size_bytes,
        mime_type=None,
        modified_at=modified,
        path=path,
    )
264
265
266
def _extract_text(path: Path) -> str:
    """Best-effort text extraction from a downloaded file.

    The format is chosen from the file extension (case-insensitive):

    * ``.txt`` / ``.md`` / ``.csv``: read as UTF-8, undecodable bytes replaced.
    * ``.html`` / ``.htm``: read as UTF-8, then passed to ``_strip_html_tags``.
    * ``.pdf``: page text via PyMuPDF, pages separated by blank lines.
    * ``.docx``: non-empty paragraphs via python-docx.
    * ``.xlsx``: every non-empty row of every sheet, cells tab-separated.

    Args:
        path: Local file to read.

    Returns:
        The extracted text, or a bracketed placeholder string when the
        optional extraction library is not installed or the format has no
        extractor (including ``.pptx``).
    """
    ext = path.suffix.lower()

    if ext in {".txt", ".md", ".csv"}:
        return path.read_text(encoding="utf-8", errors="replace")

    if ext in {".html", ".htm"}:
        from video_processor.sources.web_source import _strip_html_tags

        return _strip_html_tags(path.read_text(encoding="utf-8", errors="replace"))

    if ext == ".pdf":
        try:
            import fitz  # pymupdf
        except ImportError:
            return f"[PDF file: {path.name} — install pymupdf to extract text]"

        pdf = fitz.open(str(path))
        try:
            return "\n\n".join(page.get_text() for page in pdf)
        finally:
            # Release the file handle so the caller can delete the file.
            pdf.close()

    if ext == ".docx":
        try:
            from docx import Document
        except ImportError:
            return f"[{ext} file: {path.name} — install python-docx/openpyxl to extract text]"

        document = Document(str(path))
        return "\n\n".join(p.text for p in document.paragraphs if p.text.strip())

    if ext == ".xlsx":
        try:
            import openpyxl
        except ImportError:
            return f"[{ext} file: {path.name} — install python-docx/openpyxl to extract text]"

        wb = openpyxl.load_workbook(str(path), read_only=True)
        try:
            rows = []
            for sheet in wb.sheetnames:
                for row in wb[sheet].iter_rows(values_only=True):
                    cells = [str(c) if c is not None else "" for c in row]
                    if any(cells):  # skip rows where every cell is empty
                        rows.append("\t".join(cells))
            return "\n".join(rows)
        finally:
            # Read-only workbooks keep the file open until closed explicitly.
            wb.close()

    # No extractor for this extension (.pptx, .doc, .xls, .ppt, ...).
    return f"[Unsupported format: {path.name}]"
311

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button