PlanOpticon

planopticon / video_processor / sources / onenote_source.py
Source Blame History 222 lines
0981a08… noreply 1 """Microsoft OneNote source connector using the m365 CLI (cli-microsoft365).
0981a08… noreply 2
0981a08… noreply 3 Fetches pages from OneNote notebooks via the `m365` CLI tool.
0981a08… noreply 4 Outputs plain text suitable for KG ingestion.
0981a08… noreply 5
0981a08… noreply 6 Requires: npm install -g @pnp/cli-microsoft365
0981a08… noreply 7 Auth: m365 login (interactive)
0981a08… noreply 8 Docs: https://pnp.github.io/cli-microsoft365/
0981a08… noreply 9 """
0981a08… noreply 10
0981a08… noreply 11 import json
0981a08… noreply 12 import logging
0981a08… noreply 13 import re
0981a08… noreply 14 import shutil
0981a08… noreply 15 import subprocess
0981a08… noreply 16 from pathlib import Path
0981a08… noreply 17 from typing import Any, List, Optional
0981a08… noreply 18
0981a08… noreply 19 from video_processor.sources.base import BaseSource, SourceFile
0981a08… noreply 20
0981a08… noreply 21 logger = logging.getLogger(__name__)
0981a08… noreply 22
0981a08… noreply 23
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Invoke the ``m365`` CLI with JSON output and return the decoded result.

    ``--output json`` is appended to ``args`` before the command is run.

    Args:
        args: Sub-command and options to pass after the ``m365`` executable.
        timeout: Seconds to wait for the process before
            ``subprocess.TimeoutExpired`` is raised by ``subprocess.run``.

    Returns:
        The value decoded from stdout when it is valid JSON; otherwise the
        raw stdout with surrounding whitespace removed.

    Raises:
        RuntimeError: If the process exits with a non-zero status. The
            message carries the joined arguments and the stripped stderr.
    """
    command = ["m365"] + args + ["--output", "json"]
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        timeout=timeout,
    )

    if completed.returncode != 0:
        joined_args = " ".join(args)
        raise RuntimeError(f"m365 {joined_args} failed: {completed.stderr.strip()}")

    raw_output = completed.stdout
    try:
        decoded = json.loads(raw_output)
    except json.JSONDecodeError:
        # Some commands print plain text even when JSON was requested.
        return raw_output.strip()
    return decoded
0981a08… noreply 34
0981a08… noreply 35
def _html_to_text(html: str) -> str:
    """Strip HTML tags and decode character references to produce plain text.

    Uses only the standard library (``re`` and ``html.unescape``) — no
    external dependencies.

    Processing order:

    1. ``<script>`` and ``<style>`` elements are removed with their contents.
    2. ``<br>`` and the closing tags of ``p``, ``div``, ``li``, ``tr`` and
       ``h1``-``h6`` become newlines.
    3. Every remaining tag is removed.
    4. Character references are decoded in a single pass, so text that was
       escaped on purpose (``&amp;lt;``) yields ``&lt;`` rather than ``<``.
    5. Runs of three or more newlines collapse to a single blank line and the
       result is stripped of leading/trailing whitespace.

    Args:
        html: Markup to convert.

    Returns:
        The plain-text rendering of ``html``.
    """
    # Imported by name inside the function: the ``html`` parameter shadows
    # the ``html`` module in this scope.
    from html import unescape

    def _decode_reference(match: "re.Match[str]") -> str:
        reference = match.group(0)
        # Keep the historical mapping of &nbsp; to a regular space instead of
        # U+00A0 so downstream text stays free of non-breaking spaces.
        if reference == "&nbsp;":
            return " "
        # unescape() handles named, decimal and hex references, and maps
        # out-of-range code points to U+FFFD instead of raising.
        return unescape(reference)

    # Remove script/style blocks entirely
    text = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
    # Replace <br> and block-level closing tags with newlines for readability
    text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
    text = re.sub(r"</(p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE)
    # Strip remaining tags
    text = re.sub(r"<[^>]+>", "", text)
    # Decode every character reference exactly once. Decoding in one pass
    # (rather than one replace per entity) prevents "&amp;lt;" from being
    # turned into "&lt;" and then into "<".
    text = re.sub(
        r"&(?:#[xX][0-9a-fA-F]+|#[0-9]+|[A-Za-z][A-Za-z0-9]*);",
        _decode_reference,
        text,
    )
    # Collapse excessive blank lines
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
0981a08… noreply 66
0981a08… noreply 67
class OneNoteSource(BaseSource):
    """
    Fetch pages from OneNote notebooks via the m365 CLI.

    Notebook and section filters are case-insensitive substring matches
    against each item's ``displayName``.

    Usage:
        source = OneNoteSource()  # all notebooks
        source = OneNoteSource(notebook_name="Work Notes")  # specific notebook
        source = OneNoteSource(notebook_name="Work", section_name="Meetings")
        files = source.list_videos()
        source.download_all(files, Path("./notes"))
    """

    def __init__(
        self,
        notebook_name: Optional[str] = None,
        section_name: Optional[str] = None,
    ):
        """Store the optional notebook/section name filters.

        Args:
            notebook_name: Only notebooks whose display name contains this
                text are listed; ``None`` or empty means all notebooks.
            section_name: Only sections whose display name contains this
                text are listed; ``None`` or empty means all sections.
        """
        self.notebook_name = notebook_name
        self.section_name = section_name

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in.

        Returns:
            ``True`` when ``m365 status`` reports a connected account,
            ``False`` otherwise (an error is logged in that case).
        """
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

        # The status output is accepted either as parsed JSON or as raw text.
        if isinstance(result, dict) and result.get("connectedAs"):
            return True
        if isinstance(result, str) and "Logged in" in result:
            return True
        logger.error("m365 not logged in. Run: m365 login")
        return False

    @staticmethod
    def _list_items(args: List[str]) -> List[dict]:
        """Run an m365 list command and return only its dict entries.

        A response that is not a JSON list yields an empty list. Errors from
        ``_run_m365`` propagate to the caller.
        """
        result = _run_m365(args, timeout=60)
        if not isinstance(result, list):
            return []
        return [item for item in result if isinstance(item, dict)]

    @staticmethod
    def _name_matches(wanted: Optional[str], item: dict) -> bool:
        """Return True if ``wanted`` is unset or is contained in the item's displayName."""
        if not wanted:
            return True
        # ``or ""`` guards against a JSON null displayName.
        display_name = str(item.get("displayName") or "")
        return wanted.lower() in display_name.lower()

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List OneNote pages across notebooks/sections. Returns SourceFile per page.

        Args:
            folder_id: Accepted but not used by this implementation.
            folder_path: Accepted but not used by this implementation.
            patterns: Accepted but not used by this implementation.

        Returns:
            One ``SourceFile`` per page found. If the notebook listing fails
            an empty list is returned; a failure while listing the sections
            of one notebook or the pages of one section is logged and that
            notebook/section is skipped.
        """
        # A CLI timeout is treated like any other CLI failure so that one
        # slow call cannot abort the whole listing.
        cli_errors = (RuntimeError, subprocess.TimeoutExpired)
        files: List[SourceFile] = []

        # Step 1: List notebooks
        try:
            notebooks = self._list_items(["onenote", "notebook", "list"])
        except cli_errors as e:
            logger.error(f"Failed to list OneNote notebooks: {e}")
            return []

        for notebook in notebooks:
            if not self._name_matches(self.notebook_name, notebook):
                continue
            notebook_id = notebook.get("id", "")
            notebook_name = notebook.get("displayName") or "Untitled Notebook"

            # Step 2: List sections in this notebook
            try:
                sections = self._list_items(
                    ["onenote", "section", "list", "--notebookId", notebook_id]
                )
            except cli_errors as e:
                logger.warning(f"Failed to list sections for notebook '{notebook_name}': {e}")
                continue

            for section in sections:
                if not self._name_matches(self.section_name, section):
                    continue
                section_id = section.get("id", "")
                section_name = section.get("displayName") or "Untitled Section"

                # Step 3: List pages in this section
                try:
                    pages = self._list_items(
                        ["onenote", "page", "list", "--sectionId", section_id]
                    )
                except cli_errors as e:
                    logger.warning(f"Failed to list pages in section '{section_name}': {e}")
                    continue

                for page in pages:
                    page_id = page.get("id", "")
                    # A missing, null or blank title falls back to a placeholder.
                    title = str(page.get("title") or "").strip() or "Untitled Page"
                    modified = page.get("lastModifiedDateTime")
                    # Build a path for organizational context
                    page_path = f"{notebook_name}/{section_name}/{title}"

                    files.append(
                        SourceFile(
                            name=title,
                            id=str(page_id),
                            size_bytes=None,
                            mime_type="text/html",
                            modified_at=modified,
                            path=page_path,
                        )
                    )

        logger.info(f"Found {len(files)} page(s) in OneNote")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a OneNote page's content as a text file.

        Args:
            file: Page to fetch; its ``id`` is passed to ``m365 onenote page get``.
            destination: File path to write; parent directories are created.

        Returns:
            ``destination`` as a ``Path``.

        Raises:
            RuntimeError: If the m365 command exits with an error.
            subprocess.TimeoutExpired: If the m365 command does not finish
                within 60 seconds.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = _run_m365(
                ["onenote", "page", "get", "--id", file.id],
                timeout=60,
            )
        except RuntimeError as e:
            raise RuntimeError(f"Failed to fetch OneNote page {file.id}: {e}") from e

        # Extract HTML content from the result
        if isinstance(result, dict):
            html = result.get("content")
            if not html:
                # Only look inside "body" when it is actually a mapping.
                body = result.get("body")
                html = body.get("content") if isinstance(body, dict) else None
            if not html or not isinstance(html, str):
                # Fallback: serialize the whole response
                html = json.dumps(result, indent=2)
        elif isinstance(result, str):
            html = result
        else:
            html = str(result)

        text = _html_to_text(html)
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved page '{file.name}' to {destination}")
        return destination

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button