PlanOpticon

planopticon / video_processor / sources / onenote_source.py

Blame History Raw 223 lines

1	`"""Microsoft OneNote source connector using the m365 CLI (cli-microsoft365).`
2
3	Fetches pages from OneNote notebooks via the `m365` CLI tool.
4	`Outputs plain text suitable for KG ingestion.`
5
6	`Requires: npm install -g @pnp/cli-microsoft365`
7	`Auth: m365 login (interactive)`
8	`Docs: https://pnp.github.io/cli-microsoft365/`
9	`"""`
10
11	`import json`
12	`import logging`
13	`import re`
14	`import shutil`
15	`import subprocess`
16	`from pathlib import Path`
17	`from typing import Any, List, Optional`
18
19	`from video_processor.sources.base import BaseSource, SourceFile`
20
21	`logger = logging.getLogger(__name__)`
22
23
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Invoke the ``m365`` CLI with JSON output and decode what it prints.

    Args:
        args: Sub-command and options, without the leading ``m365``.
        timeout: Seconds passed to ``subprocess.run`` as its timeout.

    Returns:
        The decoded JSON document, or the stripped raw stdout when the
        output is not valid JSON.

    Raises:
        RuntimeError: If the command exits with a non-zero status; the
            message carries the command's stripped stderr.
    """
    output_flags = ["--output", "json"]
    command = ["m365"] + args + output_flags
    completed = subprocess.run(command, capture_output=True, text=True, timeout=timeout)

    if completed.returncode != 0:
        raise RuntimeError(f"m365 {' '.join(args)} failed: {completed.stderr.strip()}")

    raw_output = completed.stdout
    try:
        parsed = json.loads(raw_output)
    except json.JSONDecodeError:
        # Some commands print plain text even when JSON output is requested.
        return raw_output.strip()
    return parsed
34
35
36	`def _html_to_text(html: str) -> str:`
37	`"""Strip HTML tags and decode entities to produce plain text.`
38
39	Uses only stdlib ``re`` — no external dependencies.
40	`"""`
41	`# Remove script/style blocks entirely`
42	`text = re.sub(r"<(script\|style)[^>]>.?</\1>", "", html, flags=re.DOTALL \| re.IGNORECASE)`
43	`# Replace <br>, <p>, <div>, <li>, <tr> with newlines for readability`
44	`text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)`
45	`text = re.sub(r"</(p\|div\|li\|tr\|h[1-6])>", "\n", text, flags=re.IGNORECASE)`
46	`# Strip remaining tags`
47	`text = re.sub(r"<[^>]+>", "", text)`
48	`# Decode common HTML entities`
49	`entity_map = {`
50	`"&": "&",`
51	`"<": "<",`
52	`">": ">",`
53	`""": '"',`
54	`"'": "'",`
55	`"'": "'",`
56	`" ": " ",`
57	`}`
58	`for entity, char in entity_map.items():`
59	`text = text.replace(entity, char)`
60	`# Decode numeric entities ({ and )`
61	`text = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), text)`
62	`text = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), text)`
63	`# Collapse excessive blank lines`
64	`text = re.sub(r"\n{3,}", "\n\n", text)`
65	`return text.strip()`
66
67
class OneNoteSource(BaseSource):
    """
    Fetch pages from OneNote notebooks via the m365 CLI.

    Usage:
        source = OneNoteSource()  # all notebooks
        source = OneNoteSource(notebook_name="Work Notes")  # specific notebook
        source = OneNoteSource(notebook_name="Work", section_name="Meetings")
        files = source.list_videos()
        source.download_all(files, Path("./notes"))
    """

    def __init__(
        self,
        notebook_name: Optional[str] = None,
        section_name: Optional[str] = None,
    ):
        """Store the optional notebook/section name filters.

        Args:
            notebook_name: Case-insensitive substring a notebook's display name
                must contain; ``None`` keeps every notebook.
            section_name: Case-insensitive substring a section's display name
                must contain; ``None`` keeps every section.
        """
        self.notebook_name = notebook_name
        self.section_name = section_name

    @staticmethod
    def _as_records(result: Any) -> List[dict]:
        """Return the dict entries of a CLI list response, or ``[]`` for any other shape."""
        if not isinstance(result, list):
            return []
        return [item for item in result if isinstance(item, dict)]

    @staticmethod
    def _name_matches(wanted: Optional[str], record: dict) -> bool:
        """Tell whether ``record['displayName']`` contains *wanted*, ignoring case."""
        if not wanted:
            return True
        # ``or ""`` guards against an explicit JSON null display name.
        return wanted.lower() in (record.get("displayName") or "").lower()

    @staticmethod
    def _extract_html(result: Any) -> str:
        """Pull the page markup out of an ``onenote page get`` response."""
        if isinstance(result, str):
            return result
        if not isinstance(result, dict):
            return str(result)
        content = result.get("content")
        if not content:
            body = result.get("body")
            if isinstance(body, dict):
                content = body.get("content")
        if isinstance(content, str) and content:
            return content
        # Fallback: serialize the whole response so nothing is silently lost.
        return json.dumps(result, indent=2)

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in.

        Returns:
            ``True`` when ``m365 status`` reports a connected account, ``False``
            when the executable is missing, the status check fails or times
            out, or no login is reported.
        """
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False
        if isinstance(result, dict) and result.get("connectedAs"):
            return True
        if isinstance(result, str) and "Logged in" in result:
            return True
        logger.error("m365 not logged in. Run: m365 login")
        return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List OneNote pages across notebooks/sections. Returns SourceFile per page.

        Args:
            folder_id: Not used by this implementation.
            folder_path: Not used by this implementation.
            patterns: Not used by this implementation.
                (These three are presumably kept for the ``BaseSource``
                signature - confirm against the base class.)

        Returns:
            One ``SourceFile`` per page found in the notebooks and sections that
            pass the configured name filters. A failure to list notebooks
            yields ``[]``; a failure on one notebook or section is logged and
            that notebook or section is skipped.
        """
        files: List[SourceFile] = []

        # Step 1: List notebooks
        try:
            notebooks = self._as_records(_run_m365(["onenote", "notebook", "list"], timeout=60))
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            logger.error("Failed to list OneNote notebooks: %s", e)
            return []

        for notebook in notebooks:
            if not self._name_matches(self.notebook_name, notebook):
                continue
            notebook_id = str(notebook.get("id") or "")
            notebook_title = notebook.get("displayName") or "Untitled Notebook"

            # Step 2: List sections in this notebook
            try:
                sections = self._as_records(
                    _run_m365(
                        ["onenote", "section", "list", "--notebookId", notebook_id],
                        timeout=60,
                    )
                )
            except (RuntimeError, subprocess.TimeoutExpired) as e:
                logger.warning(
                    "Failed to list sections for notebook '%s': %s", notebook_title, e
                )
                continue

            for section in sections:
                if not self._name_matches(self.section_name, section):
                    continue
                section_id = str(section.get("id") or "")
                section_title = section.get("displayName") or "Untitled Section"

                # Step 3: List pages in this section
                try:
                    pages = self._as_records(
                        _run_m365(
                            ["onenote", "page", "list", "--sectionId", section_id],
                            timeout=60,
                        )
                    )
                except (RuntimeError, subprocess.TimeoutExpired) as e:
                    logger.warning("Failed to list pages in section '%s': %s", section_title, e)
                    continue

                for page in pages:
                    # A null or blank title falls back to a placeholder.
                    title = str(page.get("title") or "").strip() or "Untitled Page"
                    files.append(
                        SourceFile(
                            name=title,
                            id=str(page.get("id") or ""),
                            size_bytes=None,
                            mime_type="text/html",
                            modified_at=page.get("lastModifiedDateTime"),
                            # Build a path for organizational context
                            path=f"{notebook_title}/{section_title}/{title}",
                        )
                    )

        logger.info("Found %d page(s) in OneNote", len(files))
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a OneNote page's content as a text file.

        Args:
            file: Page descriptor; ``file.id`` is passed to ``m365 onenote page get``.
            destination: File path to write; missing parent directories are created.

        Returns:
            ``destination`` as a ``Path``, after the plain-text rendering of the
            page has been written to it as UTF-8.

        Raises:
            RuntimeError: If the CLI call fails or times out.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = _run_m365(
                ["onenote", "page", "get", "--id", file.id],
                timeout=60,
            )
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            raise RuntimeError(f"Failed to fetch OneNote page {file.id}: {e}") from e

        text = _html_to_text(self._extract_html(result))
        destination.write_text(text, encoding="utf-8")
        logger.info("Saved page '%s' to %s", file.name, destination)
        return destination
223

PlanOpticon

Keyboard Shortcuts