PlanOpticon

planopticon / video_processor / sources / m365_source.py

Source Blame History 310 lines

0981a08…	noreply	1	"""Microsoft 365 source connector using the m365 CLI (cli-microsoft365).
0981a08…	noreply	2
0981a08…	noreply	3	Fetches documents from SharePoint and OneDrive via the `m365` CLI tool.
0981a08…	noreply	4	Outputs plain text suitable for KG ingestion.
0981a08…	noreply	5
0981a08…	noreply	6	Requires: npm install -g @pnp/cli-microsoft365
0981a08…	noreply	7	Auth: m365 login (interactive)
0981a08…	noreply	8	Docs: https://pnp.github.io/cli-microsoft365/
0981a08…	noreply	9	"""
0981a08…	noreply	10
0981a08…	noreply	11	import json
0981a08…	noreply	12	import logging
0981a08…	noreply	13	import shutil
0981a08…	noreply	14	import subprocess
0981a08…	noreply	15	import tempfile
0981a08…	noreply	16	from pathlib import Path
0981a08…	noreply	17	from typing import Any, Dict, List, Optional
0981a08…	noreply	18
0981a08…	noreply	19	from video_processor.sources.base import BaseSource, SourceFile
0981a08…	noreply	20
0981a08…	noreply	21	logger = logging.getLogger(__name__)
0981a08…	noreply	22
0981a08…	noreply	23	# Document MIME types we can extract text from
0981a08…	noreply	24	_DOC_EXTENSIONS = {
0981a08…	noreply	25	".docx",
0981a08…	noreply	26	".doc",
0981a08…	noreply	27	".xlsx",
0981a08…	noreply	28	".xls",
0981a08…	noreply	29	".pptx",
0981a08…	noreply	30	".ppt",
0981a08…	noreply	31	".pdf",
0981a08…	noreply	32	".txt",
0981a08…	noreply	33	".md",
0981a08…	noreply	34	".csv",
0981a08…	noreply	35	".html",
0981a08…	noreply	36	".htm",
0981a08…	noreply	37	}
0981a08…	noreply	38
0981a08…	noreply	39
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Invoke the ``m365`` CLI with JSON output and decode what it prints.

    Args:
        args: Arguments placed after the ``m365`` executable name;
            ``--output json`` is appended automatically.
        timeout: Seconds passed to ``subprocess.run`` as its timeout.

    Returns:
        The decoded JSON value, or the stripped raw stdout when stdout is
        not valid JSON.

    Raises:
        RuntimeError: If the command exits with a non-zero status; the
            message carries the command's stripped stderr.
    """
    command = ["m365"] + args + ["--output", "json"]
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if completed.returncode != 0:
        raise RuntimeError(f"m365 {' '.join(args)} failed: {completed.stderr.strip()}")

    raw_output = completed.stdout
    try:
        decoded = json.loads(raw_output)
    except json.JSONDecodeError:
        # Not JSON: hand back the text as-is (minus surrounding whitespace).
        return raw_output.strip()
    return decoded
0981a08…	noreply	50
0981a08…	noreply	51
class M365Source(BaseSource):
    """
    Fetch documents from SharePoint Online and OneDrive via the m365 CLI.

    Usage:
        # SharePoint site
        source = M365Source(
            web_url="https://contoso.sharepoint.com/sites/project-x",
            folder_url="/sites/project-x/Shared Documents"
        )

        # OneDrive
        source = M365Source(
            web_url="https://contoso-my.sharepoint.com/personal/user_contoso_com",
            folder_url="/personal/user_contoso_com/Documents"
        )

        files = source.list_videos()
        source.download_all(files, Path("./docs"))
    """

    def __init__(
        self,
        web_url: str,
        folder_url: Optional[str] = None,
        file_ids: Optional[List[str]] = None,
        recursive: bool = False,
    ):
        """Store the site and selection settings; no CLI call is made here.

        Args:
            web_url: Site URL, passed to the CLI as ``--webUrl``.
            folder_url: Default folder to list, passed as ``--folderUrl``.
            file_ids: File IDs to fetch individually. When non-empty,
                ``list_videos`` fetches these instead of listing a folder.
            recursive: Whether to add ``--recursive`` to folder listings.
        """
        self.web_url = web_url
        self.folder_url = folder_url
        self.file_ids = file_ids or []
        self.recursive = recursive

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in.

        Returns:
            True when ``m365 status`` reports a connection, False when the
            executable is missing, the command fails or times out, or the
            output does not indicate a logged-in session.
        """
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

        # m365 status returns connection info when logged in
        if isinstance(result, dict) and result.get("connectedAs"):
            return True
        if isinstance(result, str) and "Logged in" in result:
            return True
        logger.error("m365 not logged in. Run: m365 login")
        return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List documents in SharePoint/OneDrive. Returns docs, not just videos.

        Args:
            folder_id: Accepted but not used by this implementation.
            folder_path: Folder to list; falls back to ``self.folder_url``.
            patterns: Accepted but not used by this implementation.

        Returns:
            The files named by ``self.file_ids`` when that list is non-empty
            (files that cannot be fetched are skipped with a warning);
            otherwise the entries of the folder whose extension is in
            ``_DOC_EXTENSIONS``. An empty list is returned when no folder is
            configured or the listing command fails or times out.
        """
        files: List[SourceFile] = []

        # Fetch specific files by ID
        if self.file_ids:
            for fid in self.file_ids:
                try:
                    result = _run_m365(
                        [
                            "spo",
                            "file",
                            "get",
                            "--webUrl",
                            self.web_url,
                            "--id",
                            fid,
                        ]
                    )
                except (RuntimeError, subprocess.TimeoutExpired) as e:
                    # A timeout is handled like any other per-file failure
                    # so one slow file does not abort the whole listing.
                    logger.warning(f"Failed to get file {fid}: {e}")
                    continue
                if not isinstance(result, dict):
                    # _run_m365 returns a plain string when stdout is not
                    # JSON; there is no metadata to convert in that case.
                    logger.warning(f"Failed to get file {fid}: unexpected m365 output")
                    continue
                files.append(_result_to_source_file(result))
            return files

        # List files in folder
        folder = folder_path or self.folder_url
        if not folder:
            logger.error("No folder URL specified. Use --folder-url or folder_path parameter.")
            return []

        args = [
            "file",
            "list",
            "--webUrl",
            self.web_url,
            "--folderUrl",
            folder,
        ]
        if self.recursive:
            args.append("--recursive")

        try:
            result = _run_m365(args, timeout=60)
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            logger.error(f"Failed to list files: {e}")
            return []

        items = result if isinstance(result, list) else []
        for item in items:
            if not isinstance(item, dict):
                continue
            name = item.get("Name") or item.get("name") or ""
            if Path(name).suffix.lower() in _DOC_EXTENSIONS:
                files.append(_result_to_source_file(item))

        logger.info(f"Found {len(files)} document(s) in {folder}")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a file from SharePoint/OneDrive.

        Args:
            file: The file to fetch; addressed by ``file.path`` when set,
                otherwise by ``file.id``.
            destination: Local path to write to; parent directories are
                created if missing.

        Returns:
            ``destination`` as a ``Path``.

        Raises:
            RuntimeError: If the m365 command exits with a non-zero status.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        args = [
            "spo",
            "file",
            "get",
            "--webUrl",
            self.web_url,
            "--asFile",
            "--path",
            str(destination),
        ]

        # Use URL if available in path field, otherwise use ID
        if file.path:
            args.extend(["--url", file.path])
        else:
            args.extend(["--id", file.id])

        _run_m365(args, timeout=120)
        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def download_as_text(self, file: SourceFile) -> str:
        """Download a file and attempt to extract text content.

        Text-like files are first requested from the CLI as a string. If
        that command fails, or the file is not text-like, the file is
        downloaded to a temporary path, passed to ``_extract_text``, and the
        temporary file is removed afterwards.

        Args:
            file: The file to fetch.

        Returns:
            The file's text, or the placeholder produced by
            ``_extract_text`` when no text can be extracted.
        """
        # For text-based formats, get as string directly
        text_exts = {".txt", ".md", ".csv", ".html", ".htm"}
        ext = Path(file.name).suffix.lower()

        if ext in text_exts:
            args = [
                "spo",
                "file",
                "get",
                "--webUrl",
                self.web_url,
                "--asString",
            ]
            if file.path:
                args.extend(["--url", file.path])
            else:
                args.extend(["--id", file.id])

            try:
                result = _run_m365(args, timeout=60)
            except RuntimeError as e:
                # Best effort: record why the direct fetch failed, then fall
                # through to the download-and-extract path below.
                logger.debug(f"Direct text fetch failed for {file.name}: {e}")
            else:
                return result if isinstance(result, str) else json.dumps(result)

        # For binary formats, download to temp and extract
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp_path = Path(tmp.name)

        try:
            self.download(file, tmp_path)
            return _extract_text(tmp_path)
        finally:
            tmp_path.unlink(missing_ok=True)

    def fetch_all_text(self) -> Dict[str, str]:
        """List all docs and return {filename: text_content} dict.

        Files are keyed by name. When a name has already been used (for
        example the same file name in two folders of a recursive listing),
        the later file is keyed by its path, or by name plus ID when it has
        no path, so it does not overwrite the earlier entry. A file that
        fails to download or extract is kept with an ``[Error: ...]``
        placeholder as its content.
        """
        results: Dict[str, str] = {}
        for f in self.list_videos():
            key = f.name
            if key in results:
                key = f.path or f"{f.name} ({f.id})"
            try:
                results[key] = self.download_as_text(f)
            except Exception as e:
                logger.warning(f"Failed to fetch {f.name}: {e}")
                results[key] = f"[Error: {e}]"
        return results

    def collate(self, separator: str = "\n\n---\n\n") -> str:
        """Fetch all docs and collate into a single text blob for ingestion.

        Args:
            separator: Text placed between consecutive documents.

        Returns:
            Every document rendered as ``# <name>`` followed by its text,
            joined with ``separator``.
        """
        docs = self.fetch_all_text()
        return separator.join(f"# {name}\n\n{content}" for name, content in docs.items())
0981a08…	noreply	246
0981a08…	noreply	247
def _result_to_source_file(item: dict) -> SourceFile:
    """Convert an m365 file result to SourceFile.

    Each field is read from the first of several alternative keys, since
    the records handled here may use either PascalCase keys (``Name``,
    ``UniqueId``, ``Length``, ``ServerRelativeUrl``, ``TimeLastModified``)
    or lower-case ones (``name``, ``uniqueId``/``id``, ``length``/``size``,
    ``serverRelativeUrl``, ``lastModifiedDateTime``).

    Args:
        item: One file record as decoded from the CLI's JSON output.

    Returns:
        A ``SourceFile`` with ``mime_type`` left as ``None``. ``size_bytes``
        is ``None`` only when no size was reported; a reported size of zero
        is preserved as ``0``.
    """
    name = item.get("Name", item.get("name", "Untitled"))
    file_id = item.get("UniqueId", item.get("uniqueId", item.get("id", "")))
    size = item.get("Length", item.get("length", item.get("size")))
    path = item.get("ServerRelativeUrl", item.get("serverRelativeUrl"))
    modified = item.get("TimeLastModified", item.get("lastModifiedDateTime"))

    # Test for "missing" explicitly rather than by truthiness: an integer
    # size of 0 (an empty file) is a real value and must not become None.
    size_bytes = None if size is None or size == "" else int(size)

    return SourceFile(
        name=name,
        id=str(file_id),
        size_bytes=size_bytes,
        mime_type=None,
        modified_at=modified,
        path=path,
    )
0981a08…	noreply	264
0981a08…	noreply	265
0981a08…	noreply	266	def _extract_text(path: Path) -> str:
0981a08…	noreply	267	"""Best-effort text extraction from a downloaded file."""
0981a08…	noreply	268	ext = path.suffix.lower()
0981a08…	noreply	269
0981a08…	noreply	270	if ext in {".txt", ".md", ".csv"}:
0981a08…	noreply	271	return path.read_text(encoding="utf-8", errors="replace")
0981a08…	noreply	272
0981a08…	noreply	273	if ext in {".html", ".htm"}:
0981a08…	noreply	274	from video_processor.sources.web_source import _strip_html_tags
0981a08…	noreply	275
0981a08…	noreply	276	return _strip_html_tags(path.read_text(encoding="utf-8", errors="replace"))
0981a08…	noreply	277
0981a08…	noreply	278	if ext == ".pdf":
0981a08…	noreply	279	try:
0981a08…	noreply	280	import fitz # pymupdf
0981a08…	noreply	281
0981a08…	noreply	282	doc = fitz.open(str(path))
0981a08…	noreply	283	return "\n\n".join(page.get_text() for page in doc)
0981a08…	noreply	284	except ImportError:
0981a08…	noreply	285	return f"[PDF file: {path.name} — install pymupdf to extract text]"
0981a08…	noreply	286
0981a08…	noreply	287	if ext in {".docx", ".pptx", ".xlsx"}:
0981a08…	noreply	288	# Try python-docx / openpyxl / python-pptx if available
0981a08…	noreply	289	try:
0981a08…	noreply	290	if ext == ".docx":
0981a08…	noreply	291	from docx import Document
0981a08…	noreply	292
0981a08…	noreply	293	doc = Document(str(path))
0981a08…	noreply	294	return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip())
0981a08…	noreply	295	elif ext == ".xlsx":
0981a08…	noreply	296	import openpyxl
0981a08…	noreply	297
0981a08…	noreply	298	wb = openpyxl.load_workbook(str(path), read_only=True)
0981a08…	noreply	299	rows = []
0981a08…	noreply	300	for sheet in wb.sheetnames:
0981a08…	noreply	301	ws = wb[sheet]
0981a08…	noreply	302	for row in ws.iter_rows(values_only=True):
0981a08…	noreply	303	cells = [str(c) if c is not None else "" for c in row]
0981a08…	noreply	304	if any(cells):
0981a08…	noreply	305	rows.append("\t".join(cells))
0981a08…	noreply	306	return "\n".join(rows)
0981a08…	noreply	307	except ImportError:
0981a08…	noreply	308	return f"[{ext} file: {path.name} — install python-docx/openpyxl to extract text]"
0981a08…	noreply	309
0981a08…	noreply	310	return f"[Unsupported format: {path.name}]"

PlanOpticon

Keyboard Shortcuts