PlanOpticon

planopticon / video_processor / sources / m365_source.py
Source Blame History 310 lines
0981a08… noreply 1 """Microsoft 365 source connector using the m365 CLI (cli-microsoft365).
0981a08… noreply 2
0981a08… noreply 3 Fetches documents from SharePoint and OneDrive via the `m365` CLI tool.
0981a08… noreply 4 Outputs plain text suitable for KG ingestion.
0981a08… noreply 5
0981a08… noreply 6 Requires: npm install -g @pnp/cli-microsoft365
0981a08… noreply 7 Auth: m365 login (interactive)
0981a08… noreply 8 Docs: https://pnp.github.io/cli-microsoft365/
0981a08… noreply 9 """
0981a08… noreply 10
0981a08… noreply 11 import json
0981a08… noreply 12 import logging
0981a08… noreply 13 import shutil
0981a08… noreply 14 import subprocess
0981a08… noreply 15 import tempfile
0981a08… noreply 16 from pathlib import Path
0981a08… noreply 17 from typing import Any, Dict, List, Optional
0981a08… noreply 18
0981a08… noreply 19 from video_processor.sources.base import BaseSource, SourceFile
0981a08… noreply 20
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# File extensions (lower-case, with the leading dot) that list_videos() accepts
# as documents when filtering a folder listing. These are extensions, not MIME
# types.
# NOTE(review): _extract_text() has no branch for the legacy binary formats
# (.doc, .xls, .ppt); such files are listed but yield an "[Unsupported format]"
# placeholder instead of text.
_DOC_EXTENSIONS = {
    ".docx",
    ".doc",
    ".xlsx",
    ".xls",
    ".pptx",
    ".ppt",
    ".pdf",
    ".txt",
    ".md",
    ".csv",
    ".html",
    ".htm",
}
0981a08… noreply 38
0981a08… noreply 39
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Run an m365 CLI command and return its output.

    ``--output json`` is always appended to the command line.

    Args:
        args: Arguments placed after the ``m365`` executable name.
        timeout: Seconds to wait for the process before giving up.

    Returns:
        The parsed JSON document when stdout is valid JSON, otherwise the
        stripped stdout text.

    Raises:
        RuntimeError: The process exited with a non-zero status; the message
            contains the stripped stderr.
        subprocess.TimeoutExpired: The process did not finish within
            ``timeout`` seconds.
    """
    cmd = ["m365", *args, "--output", "json"]
    proc = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        # Decode explicitly instead of relying on the locale: the CLI is a
        # Node.js tool (assumed to emit UTF-8), so a non-UTF-8 locale would
        # otherwise garble or reject non-ASCII file names and contents.
        encoding="utf-8",
        errors="replace",
        timeout=timeout,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"m365 {' '.join(args)} failed: {proc.stderr.strip()}")
    try:
        return json.loads(proc.stdout)
    except json.JSONDecodeError:
        # Some commands print plain text; hand it back as-is.
        return proc.stdout.strip()
0981a08… noreply 50
0981a08… noreply 51
class M365Source(BaseSource):
    """
    Fetch documents from SharePoint Online and OneDrive via the m365 CLI.

    Usage:
        # SharePoint site
        source = M365Source(
            web_url="https://contoso.sharepoint.com/sites/project-x",
            folder_url="/sites/project-x/Shared Documents"
        )

        # OneDrive
        source = M365Source(
            web_url="https://contoso-my.sharepoint.com/personal/user_contoso_com",
            folder_url="/personal/user_contoso_com/Documents"
        )

        files = source.list_videos()
        source.download_all(files, Path("./docs"))
    """

    def __init__(
        self,
        web_url: str,
        folder_url: Optional[str] = None,
        file_ids: Optional[List[str]] = None,
        recursive: bool = False,
    ):
        """Store the connection settings; no CLI call is made here.

        Args:
            web_url: Site URL passed to the CLI as ``--webUrl``.
            folder_url: Default folder passed as ``--folderUrl`` when listing.
            file_ids: When non-empty, list_videos() fetches exactly these file
                IDs instead of listing a folder.
            recursive: Append ``--recursive`` to the folder listing.
        """
        self.web_url = web_url
        self.folder_url = folder_url
        self.file_ids = file_ids or []
        self.recursive = recursive

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in.

        Returns:
            True when ``m365 status`` reports a connection, False otherwise
            (including when the executable is missing or the call fails).
        """
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
            # m365 status returns connection info when logged in
            if isinstance(result, dict) and result.get("connectedAs"):
                return True
            if isinstance(result, str) and "Logged in" in result:
                return True
            logger.error("m365 not logged in. Run: m365 login")
            return False
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List documents in SharePoint/OneDrive. Returns docs, not just videos.

        Args:
            folder_id: Accepted but not used by this connector.
            folder_path: Folder to list; falls back to ``self.folder_url``.
            patterns: Accepted but not used by this connector; results are
                filtered by ``_DOC_EXTENSIONS`` instead.

        Returns:
            The files that could be resolved. CLI failures and timeouts are
            logged and produce a shorter (possibly empty) list rather than an
            exception.
        """
        files: List[SourceFile] = []

        # Fetch specific files by ID
        if self.file_ids:
            for fid in self.file_ids:
                try:
                    result = _run_m365(
                        [
                            "spo",
                            "file",
                            "get",
                            "--webUrl",
                            self.web_url,
                            "--id",
                            fid,
                        ]
                    )
                except (RuntimeError, subprocess.TimeoutExpired) as e:
                    # A timeout is as recoverable as a CLI error: skip this ID.
                    logger.warning("Failed to get file %s: %s", fid, e)
                    continue
                if not isinstance(result, dict):
                    # _run_m365 returns raw text when stdout is not JSON.
                    logger.warning("Unexpected m365 output for file %s; skipping", fid)
                    continue
                files.append(_result_to_source_file(result))
            return files

        # List files in folder
        folder = folder_path or self.folder_url
        if not folder:
            logger.error("No folder URL specified. Use --folder-url or folder_path parameter.")
            return []

        args = [
            "file",
            "list",
            "--webUrl",
            self.web_url,
            "--folderUrl",
            folder,
        ]
        if self.recursive:
            args.append("--recursive")

        try:
            result = _run_m365(args, timeout=60)
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            logger.error("Failed to list files: %s", e)
            return []

        items = result if isinstance(result, list) else []
        for item in items:
            if not isinstance(item, dict):
                continue
            name = item.get("Name", item.get("name", "")) or ""
            if Path(name).suffix.lower() in _DOC_EXTENSIONS:
                files.append(_result_to_source_file(item))

        logger.info("Found %d document(s) in %s", len(files), folder)
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a file from SharePoint/OneDrive.

        Args:
            file: The file to fetch; addressed by ``file.path`` when set,
                otherwise by ``file.id``.
            destination: Local path to write to; parent directories are
                created as needed.

        Returns:
            ``destination`` as a ``Path``.

        Raises:
            RuntimeError: The CLI reported a failure.
            subprocess.TimeoutExpired: The download exceeded 120 seconds.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        args = [
            "spo",
            "file",
            "get",
            "--webUrl",
            self.web_url,
            "--asFile",
            "--path",
            str(destination),
        ]

        # Use URL if available in path field, otherwise use ID
        if file.path:
            args.extend(["--url", file.path])
        else:
            args.extend(["--id", file.id])

        _run_m365(args, timeout=120)
        logger.info("Downloaded %s to %s", file.name, destination)
        return destination

    def download_as_text(self, file: SourceFile) -> str:
        """Download a file and attempt to extract text content.

        Text-based formats are first requested directly as a string. If that
        fails, or for any other format, the file is downloaded to a temporary
        path, run through ``_extract_text`` and the temporary file is removed.

        Raises:
            RuntimeError: The fallback download failed.
            subprocess.TimeoutExpired: A CLI call timed out.
        """
        # For text-based formats, get as string directly
        text_exts = {".txt", ".md", ".csv", ".html", ".htm"}
        ext = Path(file.name).suffix.lower()

        if ext in text_exts:
            args = [
                "spo",
                "file",
                "get",
                "--webUrl",
                self.web_url,
                "--asString",
            ]
            if file.path:
                args.extend(["--url", file.path])
            else:
                args.extend(["--id", file.id])

            try:
                result = _run_m365(args, timeout=60)
            except RuntimeError as e:
                # Deliberate best-effort: record why and fall back to a
                # regular download below instead of failing outright.
                logger.debug(
                    "Direct text fetch of %s failed (%s); falling back to download",
                    file.name,
                    e,
                )
            else:
                return result if isinstance(result, str) else json.dumps(result)

        # For binary formats, download to temp and extract
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp_path = Path(tmp.name)

        try:
            self.download(file, tmp_path)
            return _extract_text(tmp_path)
        finally:
            tmp_path.unlink(missing_ok=True)

    def fetch_all_text(self) -> Dict[str, str]:
        """List all docs and return {filename: text_content} dict.

        A file that cannot be fetched maps to an ``"[Error: ...]"`` placeholder
        instead of aborting the whole run. When two listed files share a name
        (possible with ``recursive=True``), the later one is keyed by its
        server path (or ``"name (id)"`` when no path is known) so it does not
        overwrite the earlier entry.
        """
        results: Dict[str, str] = {}
        for f in self.list_videos():
            key = f.name
            if key in results:
                key = f.path or f"{f.name} ({f.id})"
            try:
                results[key] = self.download_as_text(f)
            except Exception as e:
                logger.warning("Failed to fetch %s: %s", f.name, e)
                results[key] = f"[Error: {e}]"
        return results

    def collate(self, separator: str = "\n\n---\n\n") -> str:
        """Fetch all docs and collate into a single text blob for ingestion.

        Args:
            separator: Text placed between consecutive documents.

        Returns:
            Each document rendered as ``# <name>`` followed by its text,
            joined with ``separator``.
        """
        docs = self.fetch_all_text()
        return separator.join(f"# {name}\n\n{content}" for name, content in docs.items())
0981a08… noreply 246
0981a08… noreply 247
def _result_to_source_file(item: dict) -> SourceFile:
    """Convert an m365 file result to SourceFile.

    Two key spellings are accepted for each field (for example ``Name`` /
    ``name`` and ``Length`` / ``length`` / ``size``); the first one present
    wins.

    Args:
        item: One decoded JSON object describing a file.

    Returns:
        A SourceFile with ``mime_type`` left as None. ``size_bytes`` is None
        when the size is missing or not numeric; a size of 0 is kept as 0.
    """
    name = item.get("Name", item.get("name", "Untitled"))
    file_id = item.get("UniqueId", item.get("uniqueId", item.get("id", "")))
    size = item.get("Length", item.get("length", item.get("size")))
    path = item.get("ServerRelativeUrl", item.get("serverRelativeUrl"))
    modified = item.get("TimeLastModified", item.get("lastModifiedDateTime"))

    # A truthiness test would turn an empty (0-byte) file into "unknown size",
    # so convert unconditionally and treat only unusable values as unknown.
    size_bytes: Optional[int]
    try:
        size_bytes = int(size)
    except (TypeError, ValueError):
        size_bytes = None

    return SourceFile(
        name=name,
        id=str(file_id),
        size_bytes=size_bytes,
        mime_type=None,
        modified_at=modified,
        path=path,
    )
0981a08… noreply 264
0981a08… noreply 265
def _extract_pptx_text(path: Path) -> str:
    """Return the slide text of a .pptx file using only the standard library.

    A .pptx is a zip archive whose slides live at ``ppt/slides/slideN.xml``;
    visible text sits in DrawingML ``t`` elements grouped into ``p``
    paragraphs. Slides are emitted in numeric file-name order, which
    approximates (but is not guaranteed to equal) presentation order.
    """
    import re
    import zipfile
    from xml.etree import ElementTree

    slide_name = re.compile(r"ppt/slides/slide(\d+)\.xml")
    # Transitional and Strict OOXML use different DrawingML namespaces.
    namespaces = (
        "http://schemas.openxmlformats.org/drawingml/2006/main",
        "http://purl.oclc.org/ooxml/drawingml/main",
    )
    para_tags = {f"{{{ns}}}p" for ns in namespaces}
    text_tags = {f"{{{ns}}}t" for ns in namespaces}

    slides = []
    with zipfile.ZipFile(path) as archive:
        numbered = []
        for member in archive.namelist():
            match = slide_name.fullmatch(member)
            if match:
                numbered.append((int(match.group(1)), member))

        for _, member in sorted(numbered):
            root = ElementTree.fromstring(archive.read(member))
            paragraphs = []
            for para in root.iter():
                if para.tag not in para_tags:
                    continue
                text = "".join(
                    run.text or "" for run in para.iter() if run.tag in text_tags
                )
                if text.strip():
                    paragraphs.append(text)
            if paragraphs:
                slides.append("\n".join(paragraphs))

    return "\n\n".join(slides)


def _extract_text(path: Path) -> str:
    """Best-effort text extraction from a downloaded file.

    The format is chosen from the file extension (case-insensitive):

    * ``.txt`` / ``.md`` / ``.csv``: read as UTF-8 with undecodable bytes
      replaced.
    * ``.html`` / ``.htm``: read the same way, then passed through
      ``_strip_html_tags``.
    * ``.pdf``: page text via pymupdf, when installed.
    * ``.docx``: non-empty paragraphs via python-docx, when installed.
    * ``.xlsx``: tab-separated rows of every sheet via openpyxl, when
      installed.
    * ``.pptx``: slide text via the standard library.

    Args:
        path: Local file to read.

    Returns:
        The extracted text, or a bracketed placeholder when the optional
        library is missing or the format is not supported.
    """
    ext = path.suffix.lower()

    if ext in {".txt", ".md", ".csv"}:
        return path.read_text(encoding="utf-8", errors="replace")

    if ext in {".html", ".htm"}:
        from video_processor.sources.web_source import _strip_html_tags

        return _strip_html_tags(path.read_text(encoding="utf-8", errors="replace"))

    if ext == ".pdf":
        try:
            import fitz  # pymupdf
        except ImportError:
            return f"[PDF file: {path.name} — install pymupdf to extract text]"

        # Close the document so the file handle is released before the caller
        # deletes the temporary file.
        with fitz.open(str(path)) as doc:
            return "\n\n".join(page.get_text() for page in doc)

    if ext == ".docx":
        try:
            from docx import Document
        except ImportError:
            return f"[{ext} file: {path.name} — install python-docx/openpyxl to extract text]"

        doc = Document(str(path))
        return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip())

    if ext == ".xlsx":
        try:
            import openpyxl
        except ImportError:
            return f"[{ext} file: {path.name} — install python-docx/openpyxl to extract text]"

        wb = openpyxl.load_workbook(str(path), read_only=True)
        try:
            rows = []
            for sheet in wb.sheetnames:
                ws = wb[sheet]
                for row in ws.iter_rows(values_only=True):
                    cells = [str(c) if c is not None else "" for c in row]
                    if any(cells):
                        rows.append("\t".join(cells))
            return "\n".join(rows)
        finally:
            # Read-only workbooks keep the file open until closed explicitly.
            wb.close()

    if ext == ".pptx":
        return _extract_pptx_text(path)

    return f"[Unsupported format: {path.name}]"

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button