|
0981a08…
|
noreply
|
1 |
"""Microsoft 365 source connector using the m365 CLI (cli-microsoft365). |
|
0981a08…
|
noreply
|
2 |
|
|
0981a08…
|
noreply
|
3 |
Fetches documents from SharePoint and OneDrive via the `m365` CLI tool. |
|
0981a08…
|
noreply
|
4 |
Outputs plain text suitable for KG ingestion. |
|
0981a08…
|
noreply
|
5 |
|
|
0981a08…
|
noreply
|
6 |
Requires: npm install -g @pnp/cli-microsoft365 |
|
0981a08…
|
noreply
|
7 |
Auth: m365 login (interactive) |
|
0981a08…
|
noreply
|
8 |
Docs: https://pnp.github.io/cli-microsoft365/ |
|
0981a08…
|
noreply
|
9 |
""" |
|
0981a08…
|
noreply
|
10 |
|
|
0981a08…
|
noreply
|
11 |
import json |
|
0981a08…
|
noreply
|
12 |
import logging |
|
0981a08…
|
noreply
|
13 |
import shutil |
|
0981a08…
|
noreply
|
14 |
import subprocess |
|
0981a08…
|
noreply
|
15 |
import tempfile |
|
0981a08…
|
noreply
|
16 |
from pathlib import Path |
|
0981a08…
|
noreply
|
17 |
from typing import Any, Dict, List, Optional |
|
0981a08…
|
noreply
|
18 |
|
|
0981a08…
|
noreply
|
19 |
from video_processor.sources.base import BaseSource, SourceFile |
|
0981a08…
|
noreply
|
20 |
|
|
0981a08…
|
noreply
|
21 |
logger = logging.getLogger(__name__) |
|
0981a08…
|
noreply
|
22 |
|
|
0981a08…
|
noreply
|
23 |
# Lowercase file suffixes (leading dot included) that are treated as documents
# when a folder listing is filtered; files with any other suffix are skipped.
_DOC_EXTENSIONS = {
    # Microsoft Office, modern and legacy formats
    ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
    # Portable and plain-text formats
    ".pdf", ".txt", ".md", ".csv",
    # Web pages
    ".html", ".htm",
}
|
0981a08…
|
noreply
|
38 |
|
|
0981a08…
|
noreply
|
39 |
|
|
0981a08…
|
noreply
|
40 |
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Run an m365 CLI command and return its output, parsed as JSON when possible.

    ``--output json`` is appended to *args* before the command is executed.

    Args:
        args: Arguments that follow the ``m365`` executable name.
        timeout: Seconds to wait for the process before giving up.

    Returns:
        The decoded JSON value, or the stripped raw stdout when stdout is not
        valid JSON (including when it is empty).

    Raises:
        RuntimeError: If the executable cannot be started or exits non-zero.
        subprocess.TimeoutExpired: If the command runs longer than *timeout*.
    """
    cmd = ["m365", *args, "--output", "json"]
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # Decode explicitly instead of relying on the locale codec, so
            # non-ASCII file names/content are not mangled on non-UTF-8 locales.
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
        )
    except OSError as exc:
        # A missing or non-executable binary raises OSError; callers in this
        # module only handle RuntimeError, so report it the same way as any
        # other command failure.
        raise RuntimeError(f"m365 {' '.join(args)} failed: {exc}") from exc

    if proc.returncode != 0:
        # Fall back to stdout so the message is not empty when stderr is.
        detail = proc.stderr.strip() or proc.stdout.strip()
        raise RuntimeError(f"m365 {' '.join(args)} failed: {detail}")

    try:
        return json.loads(proc.stdout)
    except json.JSONDecodeError:
        return proc.stdout.strip()
|
0981a08…
|
noreply
|
50 |
|
|
0981a08…
|
noreply
|
51 |
|
|
0981a08…
|
noreply
|
52 |
class M365Source(BaseSource):
    """
    Fetch documents from SharePoint Online and OneDrive via the m365 CLI.

    Usage:
        # SharePoint site
        source = M365Source(
            web_url="https://contoso.sharepoint.com/sites/project-x",
            folder_url="/sites/project-x/Shared Documents"
        )

        # OneDrive
        source = M365Source(
            web_url="https://contoso-my.sharepoint.com/personal/user_contoso_com",
            folder_url="/personal/user_contoso_com/Documents"
        )

        files = source.list_videos()
        source.download_all(files, Path("./docs"))
    """

    def __init__(
        self,
        web_url: str,
        folder_url: Optional[str] = None,
        file_ids: Optional[List[str]] = None,
        recursive: bool = False,
    ):
        """Store the connection settings; no CLI command is run here.

        Args:
            web_url: Site URL passed to the CLI as ``--webUrl``.
            folder_url: Default folder passed as ``--folderUrl`` when listing.
            file_ids: When non-empty, ``list_videos`` fetches exactly these
                files by id instead of listing a folder.
            recursive: Add ``--recursive`` to folder listings.
        """
        self.web_url = web_url
        self.folder_url = folder_url
        self.file_ids = file_ids or []
        self.recursive = recursive

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in.

        Returns:
            True when ``m365 status`` reports a connection, False otherwise
            (the reason is logged).
        """
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False

        try:
            result = _run_m365(["status"], timeout=10)
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            logger.debug(f"m365 status failed: {e}")
            logger.error("m365 not logged in. Run: m365 login")
            return False

        # m365 status returns connection info when logged in
        if isinstance(result, dict) and result.get("connectedAs"):
            return True
        if isinstance(result, str) and "Logged in" in result:
            return True

        logger.error("m365 not logged in. Run: m365 login")
        return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List documents in SharePoint/OneDrive. Returns docs, not just videos.

        When ``file_ids`` was given to the constructor, each id is fetched
        individually and the folder arguments are ignored. Otherwise the folder
        is listed and filtered down to names whose suffix is in
        ``_DOC_EXTENSIONS``.

        Args:
            folder_id: Accepted but not used by this implementation.
            folder_path: Folder to list; overrides the constructor's
                ``folder_url`` when given.
            patterns: Accepted but not used by this implementation.

        Returns:
            The matching files. CLI failures and timeouts are logged and yield
            a shorter (possibly empty) list rather than an exception.
        """
        files: List[SourceFile] = []

        # Fetch specific files by ID
        if self.file_ids:
            for fid in self.file_ids:
                try:
                    result = _run_m365(
                        [
                            "spo",
                            "file",
                            "get",
                            "--webUrl",
                            self.web_url,
                            "--id",
                            fid,
                        ]
                    )
                except (RuntimeError, subprocess.TimeoutExpired) as e:
                    logger.warning(f"Failed to get file {fid}: {e}")
                    continue
                if not isinstance(result, dict):
                    # _run_m365 returns raw text when stdout is not JSON;
                    # there is no metadata to convert in that case.
                    logger.warning(f"Failed to get file {fid}: unexpected m365 output")
                    continue
                files.append(_result_to_source_file(result))
            return files

        # List files in folder
        folder = folder_path or self.folder_url
        if not folder:
            logger.error("No folder URL specified. Use --folder-url or folder_path parameter.")
            return []

        args = [
            "file",
            "list",
            "--webUrl",
            self.web_url,
            "--folderUrl",
            folder,
        ]
        if self.recursive:
            args.append("--recursive")

        try:
            result = _run_m365(args, timeout=60)
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            logger.error(f"Failed to list files: {e}")
            return []

        items = result if isinstance(result, list) else []
        for item in items:
            if not isinstance(item, dict):
                continue  # skip entries that are not metadata objects
            name = item.get("Name", item.get("name", ""))
            ext = Path(name).suffix.lower()
            if ext in _DOC_EXTENSIONS:
                files.append(_result_to_source_file(item))

        logger.info(f"Found {len(files)} document(s) in {folder}")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a file from SharePoint/OneDrive.

        Args:
            file: File to fetch; addressed by ``file.path`` when set,
                otherwise by ``file.id``.
            destination: Target file path; its parent directory is created
                if needed.

        Returns:
            *destination* as a ``Path``.

        Raises:
            RuntimeError: If the CLI command fails.
            subprocess.TimeoutExpired: If the download exceeds 120 seconds.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        args = [
            "spo",
            "file",
            "get",
            "--webUrl",
            self.web_url,
            "--asFile",
            "--path",
            str(destination),
        ]

        # Use URL if available in path field, otherwise use ID
        if file.path:
            args.extend(["--url", file.path])
        else:
            args.extend(["--id", file.id])

        _run_m365(args, timeout=120)
        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def download_as_text(self, file: SourceFile) -> str:
        """Download a file and attempt to extract text content.

        Text-like formats are requested from the CLI as a string and returned
        as received (HTML markup is not stripped on this path). If that request
        fails, or for any other format, the file is downloaded to a temporary
        path, passed to ``_extract_text`` and the temporary file is removed.

        Raises:
            RuntimeError: If the fallback download fails.
            subprocess.TimeoutExpired: If a CLI call exceeds its timeout.
        """
        # For text-based formats, get as string directly
        text_exts = {".txt", ".md", ".csv", ".html", ".htm"}
        ext = Path(file.name).suffix.lower()

        if ext in text_exts:
            args = [
                "spo",
                "file",
                "get",
                "--webUrl",
                self.web_url,
                "--asString",
            ]
            if file.path:
                args.extend(["--url", file.path])
            else:
                args.extend(["--id", file.id])

            try:
                result = _run_m365(args, timeout=60)
                return result if isinstance(result, str) else json.dumps(result)
            except RuntimeError as e:
                # Best effort: fall through to the download-and-extract path.
                logger.debug(f"--asString fetch of {file.name} failed, downloading instead: {e}")

        # For binary formats, download to temp and extract
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp_path = Path(tmp.name)

        try:
            self.download(file, tmp_path)
            return _extract_text(tmp_path)
        finally:
            tmp_path.unlink(missing_ok=True)

    def fetch_all_text(self) -> Dict[str, str]:
        """List all docs and return {filename: text_content} dict.

        A file whose name is already present in the result (e.g. the same name
        in two folders of a recursive listing) is keyed by its path, or by
        ``"name (id)"`` when it has no path, so it does not overwrite the
        earlier document. A file that cannot be fetched is kept in the result
        with an ``[Error: ...]`` placeholder as its content.
        """
        results: Dict[str, str] = {}
        for doc_file in self.list_videos():
            key = doc_file.name
            if key in results:
                key = doc_file.path or f"{doc_file.name} ({doc_file.id})"
            try:
                results[key] = self.download_as_text(doc_file)
            except Exception as e:
                logger.warning(f"Failed to fetch {doc_file.name}: {e}")
                results[key] = f"[Error: {e}]"
        return results

    def collate(self, separator: str = "\n\n---\n\n") -> str:
        """Fetch all docs and collate into a single text blob for ingestion.

        Each document becomes a ``# <name>`` heading followed by its text;
        documents are joined with *separator*.
        """
        docs = self.fetch_all_text()
        return separator.join(f"# {name}\n\n{content}" for name, content in docs.items())
|
0981a08…
|
noreply
|
246 |
|
|
0981a08…
|
noreply
|
247 |
|
|
0981a08…
|
noreply
|
248 |
def _result_to_source_file(item: dict) -> SourceFile:
    """Convert an m365 file result to SourceFile.

    Both capitalised keys (``Name``, ``UniqueId``, ``Length``,
    ``ServerRelativeUrl``, ``TimeLastModified``) and lower-case alternatives
    (``name``, ``uniqueId``/``id``, ``length``/``size``, ``serverRelativeUrl``,
    ``lastModifiedDateTime``) are recognised; the capitalised key wins when
    both are present.

    Args:
        item: One file metadata object as returned by the CLI.

    Returns:
        A SourceFile with ``mime_type`` left unset. ``size_bytes`` is None only
        when no size was reported; a reported size of 0 is kept as 0.

    Raises:
        ValueError: If the reported size cannot be converted to an integer.
    """
    name = item.get("Name", item.get("name", "Untitled"))
    file_id = item.get("UniqueId", item.get("uniqueId", item.get("id", "")))
    size = item.get("Length", item.get("length", item.get("size")))
    path = item.get("ServerRelativeUrl", item.get("serverRelativeUrl"))
    modified = item.get("TimeLastModified", item.get("lastModifiedDateTime"))

    # Only a missing or blank value means "unknown"; a truthiness check would
    # also discard a legitimate numeric size of 0 for empty files.
    size_bytes = None if size is None or size == "" else int(size)

    return SourceFile(
        name=name,
        id=str(file_id),
        size_bytes=size_bytes,
        mime_type=None,
        modified_at=modified,
        path=path,
    )
|
0981a08…
|
noreply
|
264 |
|
|
0981a08…
|
noreply
|
265 |
|
|
0981a08…
|
noreply
|
266 |
def _extract_text(path: Path) -> str:
    """Best-effort text extraction from a downloaded file.

    The format is chosen from the file suffix (case-insensitive):

    * ``.txt`` / ``.md`` / ``.csv``: file content decoded as UTF-8, with
      undecodable bytes replaced.
    * ``.html`` / ``.htm``: same decoding, then passed through
      ``_strip_html_tags``.
    * ``.pdf``: text of every page joined by blank lines (needs pymupdf).
    * ``.docx``: non-empty paragraphs joined by blank lines (needs python-docx).
    * ``.xlsx``: one tab-separated line per non-empty row across all sheets
      (needs openpyxl).

    Args:
        path: Location of the downloaded file on disk.

    Returns:
        The extracted text, or a bracketed placeholder string when the format
        is unsupported or the optional library for it is not installed.
    """
    ext = path.suffix.lower()

    if ext in {".txt", ".md", ".csv"}:
        return path.read_text(encoding="utf-8", errors="replace")

    if ext in {".html", ".htm"}:
        from video_processor.sources.web_source import _strip_html_tags

        return _strip_html_tags(path.read_text(encoding="utf-8", errors="replace"))

    if ext == ".pdf":
        try:
            import fitz  # pymupdf
        except ImportError:
            return f"[PDF file: {path.name} — install pymupdf to extract text]"

        doc = fitz.open(str(path))
        try:
            return "\n\n".join(page.get_text() for page in doc)
        finally:
            # Release the file handle so the caller can delete the file.
            doc.close()

    if ext == ".docx":
        try:
            from docx import Document
        except ImportError:
            return f"[{ext} file: {path.name} — install python-docx/openpyxl to extract text]"

        doc = Document(str(path))
        return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip())

    if ext == ".xlsx":
        try:
            import openpyxl
        except ImportError:
            return f"[{ext} file: {path.name} — install python-docx/openpyxl to extract text]"

        wb = openpyxl.load_workbook(str(path), read_only=True)
        try:
            rows = []
            for sheet in wb.sheetnames:
                ws = wb[sheet]
                for row in ws.iter_rows(values_only=True):
                    cells = [str(c) if c is not None else "" for c in row]
                    if any(cells):
                        rows.append("\t".join(cells))
            return "\n".join(rows)
        finally:
            # Read-only workbooks keep the file open until closed explicitly.
            wb.close()

    # Everything else, including .pptx and the legacy Office formats, has no
    # extractor here.
    return f"[Unsupported format: {path.name}]"