"""Microsoft 365 source connector using the m365 CLI (cli-microsoft365).

Fetches documents from SharePoint and OneDrive via the `m365` CLI tool.
Outputs plain text suitable for KG ingestion.

Requires: npm install -g @pnp/cli-microsoft365
Auth: m365 login (interactive)
Docs: https://pnp.github.io/cli-microsoft365/
"""

import json
import logging
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional

from video_processor.sources.base import BaseSource, SourceFile

logger = logging.getLogger(__name__)

23
|
# Document MIME types we can extract text from |
|
24
|
_DOC_EXTENSIONS = { |
|
25
|
".docx", |
|
26
|
".doc", |
|
27
|
".xlsx", |
|
28
|
".xls", |
|
29
|
".pptx", |
|
30
|
".ppt", |
|
31
|
".pdf", |
|
32
|
".txt", |
|
33
|
".md", |
|
34
|
".csv", |
|
35
|
".html", |
|
36
|
".htm", |
|
37
|
} |
|
38
|
|
|
39
|
|
|
40
|
def _run_m365(args: List[str], timeout: int = 30) -> Any: |
|
41
|
"""Run an m365 CLI command and return parsed JSON output.""" |
|
42
|
cmd = ["m365"] + args + ["--output", "json"] |
|
43
|
proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) |
|
44
|
if proc.returncode != 0: |
|
45
|
raise RuntimeError(f"m365 {' '.join(args)} failed: {proc.stderr.strip()}") |
|
46
|
try: |
|
47
|
return json.loads(proc.stdout) |
|
48
|
except json.JSONDecodeError: |
|
49
|
return proc.stdout.strip() |
|
50
|
|
|
51
|
|
|
52
|
class M365Source(BaseSource): |
|
53
|
""" |
|
54
|
Fetch documents from SharePoint Online and OneDrive via the m365 CLI. |
|
55
|
|
|
56
|
Usage: |
|
57
|
# SharePoint site |
|
58
|
source = M365Source( |
|
59
|
web_url="https://contoso.sharepoint.com/sites/project-x", |
|
60
|
folder_url="/sites/project-x/Shared Documents" |
|
61
|
) |
|
62
|
|
|
63
|
# OneDrive |
|
64
|
source = M365Source( |
|
65
|
web_url="https://contoso-my.sharepoint.com/personal/user_contoso_com", |
|
66
|
folder_url="/personal/user_contoso_com/Documents" |
|
67
|
) |
|
68
|
|
|
69
|
files = source.list_videos() |
|
70
|
source.download_all(files, Path("./docs")) |
|
71
|
""" |
|
72
|
|
|
73
|
def __init__( |
|
74
|
self, |
|
75
|
web_url: str, |
|
76
|
folder_url: Optional[str] = None, |
|
77
|
file_ids: Optional[List[str]] = None, |
|
78
|
recursive: bool = False, |
|
79
|
): |
|
80
|
self.web_url = web_url |
|
81
|
self.folder_url = folder_url |
|
82
|
self.file_ids = file_ids or [] |
|
83
|
self.recursive = recursive |
|
84
|
|
|
85
|
def authenticate(self) -> bool: |
|
86
|
"""Check if m365 CLI is installed and logged in.""" |
|
87
|
if not shutil.which("m365"): |
|
88
|
logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365") |
|
89
|
return False |
|
90
|
try: |
|
91
|
result = _run_m365(["status"], timeout=10) |
|
92
|
# m365 status returns connection info when logged in |
|
93
|
if isinstance(result, dict) and result.get("connectedAs"): |
|
94
|
return True |
|
95
|
if isinstance(result, str) and "Logged in" in result: |
|
96
|
return True |
|
97
|
logger.error("m365 not logged in. Run: m365 login") |
|
98
|
return False |
|
99
|
except (RuntimeError, subprocess.TimeoutExpired): |
|
100
|
logger.error("m365 not logged in. Run: m365 login") |
|
101
|
return False |
|
102
|
|
|
103
|
def list_videos( |
|
104
|
self, |
|
105
|
folder_id: Optional[str] = None, |
|
106
|
folder_path: Optional[str] = None, |
|
107
|
patterns: Optional[List[str]] = None, |
|
108
|
) -> List[SourceFile]: |
|
109
|
"""List documents in SharePoint/OneDrive. Returns docs, not just videos.""" |
|
110
|
files: List[SourceFile] = [] |
|
111
|
|
|
112
|
# Fetch specific files by ID |
|
113
|
if self.file_ids: |
|
114
|
for fid in self.file_ids: |
|
115
|
try: |
|
116
|
result = _run_m365( |
|
117
|
[ |
|
118
|
"spo", |
|
119
|
"file", |
|
120
|
"get", |
|
121
|
"--webUrl", |
|
122
|
self.web_url, |
|
123
|
"--id", |
|
124
|
fid, |
|
125
|
] |
|
126
|
) |
|
127
|
files.append(_result_to_source_file(result)) |
|
128
|
except RuntimeError as e: |
|
129
|
logger.warning(f"Failed to get file {fid}: {e}") |
|
130
|
return files |
|
131
|
|
|
132
|
# List files in folder |
|
133
|
folder = folder_path or self.folder_url |
|
134
|
if not folder: |
|
135
|
logger.error("No folder URL specified. Use --folder-url or folder_path parameter.") |
|
136
|
return [] |
|
137
|
|
|
138
|
try: |
|
139
|
args = [ |
|
140
|
"file", |
|
141
|
"list", |
|
142
|
"--webUrl", |
|
143
|
self.web_url, |
|
144
|
"--folderUrl", |
|
145
|
folder, |
|
146
|
] |
|
147
|
if self.recursive: |
|
148
|
args.append("--recursive") |
|
149
|
|
|
150
|
result = _run_m365(args, timeout=60) |
|
151
|
except RuntimeError as e: |
|
152
|
logger.error(f"Failed to list files: {e}") |
|
153
|
return [] |
|
154
|
|
|
155
|
items = result if isinstance(result, list) else [] |
|
156
|
for item in items: |
|
157
|
name = item.get("Name", item.get("name", "")) |
|
158
|
ext = Path(name).suffix.lower() |
|
159
|
if ext in _DOC_EXTENSIONS: |
|
160
|
files.append(_result_to_source_file(item)) |
|
161
|
|
|
162
|
logger.info(f"Found {len(files)} document(s) in {folder}") |
|
163
|
return files |
|
164
|
|
|
165
|
def download(self, file: SourceFile, destination: Path) -> Path: |
|
166
|
"""Download a file from SharePoint/OneDrive.""" |
|
167
|
destination = Path(destination) |
|
168
|
destination.parent.mkdir(parents=True, exist_ok=True) |
|
169
|
|
|
170
|
args = [ |
|
171
|
"spo", |
|
172
|
"file", |
|
173
|
"get", |
|
174
|
"--webUrl", |
|
175
|
self.web_url, |
|
176
|
"--asFile", |
|
177
|
"--path", |
|
178
|
str(destination), |
|
179
|
] |
|
180
|
|
|
181
|
# Use URL if available in path field, otherwise use ID |
|
182
|
if file.path: |
|
183
|
args.extend(["--url", file.path]) |
|
184
|
else: |
|
185
|
args.extend(["--id", file.id]) |
|
186
|
|
|
187
|
_run_m365(args, timeout=120) |
|
188
|
logger.info(f"Downloaded {file.name} to {destination}") |
|
189
|
return destination |
|
190
|
|
|
191
|
def download_as_text(self, file: SourceFile) -> str: |
|
192
|
"""Download a file and attempt to extract text content.""" |
|
193
|
# For text-based formats, get as string directly |
|
194
|
text_exts = {".txt", ".md", ".csv", ".html", ".htm"} |
|
195
|
ext = Path(file.name).suffix.lower() |
|
196
|
|
|
197
|
if ext in text_exts: |
|
198
|
try: |
|
199
|
args = [ |
|
200
|
"spo", |
|
201
|
"file", |
|
202
|
"get", |
|
203
|
"--webUrl", |
|
204
|
self.web_url, |
|
205
|
"--asString", |
|
206
|
] |
|
207
|
if file.path: |
|
208
|
args.extend(["--url", file.path]) |
|
209
|
else: |
|
210
|
args.extend(["--id", file.id]) |
|
211
|
|
|
212
|
result = _run_m365(args, timeout=60) |
|
213
|
return result if isinstance(result, str) else json.dumps(result) |
|
214
|
except RuntimeError: |
|
215
|
pass |
|
216
|
|
|
217
|
# For binary formats, download to temp and extract |
|
218
|
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: |
|
219
|
tmp_path = Path(tmp.name) |
|
220
|
|
|
221
|
try: |
|
222
|
self.download(file, tmp_path) |
|
223
|
return _extract_text(tmp_path) |
|
224
|
finally: |
|
225
|
tmp_path.unlink(missing_ok=True) |
|
226
|
|
|
227
|
def fetch_all_text(self) -> Dict[str, str]: |
|
228
|
"""List all docs and return {filename: text_content} dict.""" |
|
229
|
files = self.list_videos() |
|
230
|
results = {} |
|
231
|
for f in files: |
|
232
|
try: |
|
233
|
results[f.name] = self.download_as_text(f) |
|
234
|
except Exception as e: |
|
235
|
logger.warning(f"Failed to fetch {f.name}: {e}") |
|
236
|
results[f.name] = f"[Error: {e}]" |
|
237
|
return results |
|
238
|
|
|
239
|
def collate(self, separator: str = "\n\n---\n\n") -> str: |
|
240
|
"""Fetch all docs and collate into a single text blob for ingestion.""" |
|
241
|
docs = self.fetch_all_text() |
|
242
|
parts = [] |
|
243
|
for name, content in docs.items(): |
|
244
|
parts.append(f"# {name}\n\n{content}") |
|
245
|
return separator.join(parts) |
|
246
|
|
|
247
|
|
|
248
|
def _result_to_source_file(item: dict) -> SourceFile: |
|
249
|
"""Convert an m365 file result to SourceFile.""" |
|
250
|
name = item.get("Name", item.get("name", "Untitled")) |
|
251
|
file_id = item.get("UniqueId", item.get("uniqueId", item.get("id", ""))) |
|
252
|
size = item.get("Length", item.get("length", item.get("size"))) |
|
253
|
path = item.get("ServerRelativeUrl", item.get("serverRelativeUrl")) |
|
254
|
modified = item.get("TimeLastModified", item.get("lastModifiedDateTime")) |
|
255
|
|
|
256
|
return SourceFile( |
|
257
|
name=name, |
|
258
|
id=str(file_id), |
|
259
|
size_bytes=int(size) if size else None, |
|
260
|
mime_type=None, |
|
261
|
modified_at=modified, |
|
262
|
path=path, |
|
263
|
) |
|
264
|
|
|
265
|
|
|
266
|
def _extract_text(path: Path) -> str: |
|
267
|
"""Best-effort text extraction from a downloaded file.""" |
|
268
|
ext = path.suffix.lower() |
|
269
|
|
|
270
|
if ext in {".txt", ".md", ".csv"}: |
|
271
|
return path.read_text(encoding="utf-8", errors="replace") |
|
272
|
|
|
273
|
if ext in {".html", ".htm"}: |
|
274
|
from video_processor.sources.web_source import _strip_html_tags |
|
275
|
|
|
276
|
return _strip_html_tags(path.read_text(encoding="utf-8", errors="replace")) |
|
277
|
|
|
278
|
if ext == ".pdf": |
|
279
|
try: |
|
280
|
import fitz # pymupdf |
|
281
|
|
|
282
|
doc = fitz.open(str(path)) |
|
283
|
return "\n\n".join(page.get_text() for page in doc) |
|
284
|
except ImportError: |
|
285
|
return f"[PDF file: {path.name} — install pymupdf to extract text]" |
|
286
|
|
|
287
|
if ext in {".docx", ".pptx", ".xlsx"}: |
|
288
|
# Try python-docx / openpyxl / python-pptx if available |
|
289
|
try: |
|
290
|
if ext == ".docx": |
|
291
|
from docx import Document |
|
292
|
|
|
293
|
doc = Document(str(path)) |
|
294
|
return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip()) |
|
295
|
elif ext == ".xlsx": |
|
296
|
import openpyxl |
|
297
|
|
|
298
|
wb = openpyxl.load_workbook(str(path), read_only=True) |
|
299
|
rows = [] |
|
300
|
for sheet in wb.sheetnames: |
|
301
|
ws = wb[sheet] |
|
302
|
for row in ws.iter_rows(values_only=True): |
|
303
|
cells = [str(c) if c is not None else "" for c in row] |
|
304
|
if any(cells): |
|
305
|
rows.append("\t".join(cells)) |
|
306
|
return "\n".join(rows) |
|
307
|
except ImportError: |
|
308
|
return f"[{ext} file: {path.name} — install python-docx/openpyxl to extract text]" |
|
309
|
|
|
310
|
return f"[Unsupported format: {path.name}]" |
|
311
|
|