PlanOpticon

planopticon / video_processor / sources / onenote_source.py
Blame History Raw 223 lines
1
"""Microsoft OneNote source connector using the m365 CLI (cli-microsoft365).
2
3
Fetches pages from OneNote notebooks via the `m365` CLI tool.
4
Outputs plain text suitable for KG ingestion.
5
6
Requires: npm install -g @pnp/cli-microsoft365
7
Auth: m365 login (interactive)
8
Docs: https://pnp.github.io/cli-microsoft365/
9
"""
10
11
import json
12
import logging
13
import re
14
import shutil
15
import subprocess
16
from pathlib import Path
17
from typing import Any, List, Optional
18
19
from video_processor.sources.base import BaseSource, SourceFile
20
21
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
22
23
24
def _run_m365(args: List[str], timeout: int = 30) -> Any:
    """Invoke the ``m365`` CLI with JSON output and return its decoded result.

    Args:
        args: CLI arguments placed after the ``m365`` executable name;
            ``--output json`` is appended automatically.
        timeout: Seconds passed to ``subprocess.run`` as its timeout.

    Returns:
        The value decoded from stdout when stdout is valid JSON, otherwise
        stdout as a whitespace-stripped string.

    Raises:
        RuntimeError: If the command exits with a non-zero return code; the
            message carries the stripped stderr.
    """
    completed = subprocess.run(
        ["m365", *args, "--output", "json"],
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if completed.returncode != 0:
        raise RuntimeError(f"m365 {' '.join(args)} failed: {completed.stderr.strip()}")

    raw_output = completed.stdout
    try:
        parsed = json.loads(raw_output)
    except json.JSONDecodeError:
        # Not JSON: hand the raw text back to the caller instead.
        return raw_output.strip()
    return parsed
34
35
36
def _html_to_text(html: str) -> str:
37
"""Strip HTML tags and decode entities to produce plain text.
38
39
Uses only stdlib ``re`` — no external dependencies.
40
"""
41
# Remove script/style blocks entirely
42
text = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
43
# Replace <br>, <p>, <div>, <li>, <tr> with newlines for readability
44
text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
45
text = re.sub(r"</(p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE)
46
# Strip remaining tags
47
text = re.sub(r"<[^>]+>", "", text)
48
# Decode common HTML entities
49
entity_map = {
50
"&amp;": "&",
51
"&lt;": "<",
52
"&gt;": ">",
53
"&quot;": '"',
54
"&#39;": "'",
55
"&apos;": "'",
56
"&nbsp;": " ",
57
}
58
for entity, char in entity_map.items():
59
text = text.replace(entity, char)
60
# Decode numeric entities (&#123; and &#x1a;)
61
text = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), text)
62
text = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), text)
63
# Collapse excessive blank lines
64
text = re.sub(r"\n{3,}", "\n\n", text)
65
return text.strip()
66
67
68
class OneNoteSource(BaseSource):
    """
    Fetch pages from OneNote notebooks via the m365 CLI.

    Usage:
        source = OneNoteSource()                            # all notebooks
        source = OneNoteSource(notebook_name="Work Notes")  # specific notebook
        source = OneNoteSource(notebook_name="Work", section_name="Meetings")
        files = source.list_videos()
        source.download_all(files, Path("./notes"))
    """

    def __init__(
        self,
        notebook_name: Optional[str] = None,
        section_name: Optional[str] = None,
    ):
        """Store the optional notebook/section name filters.

        Args:
            notebook_name: If set, only notebooks whose display name contains
                this text (case-insensitive) are listed.
            section_name: If set, only sections whose display name contains
                this text (case-insensitive) are listed.
        """
        self.notebook_name = notebook_name
        self.section_name = section_name

    @staticmethod
    def _filter_by_name(items: Any, name_filter: Optional[str]) -> List[dict]:
        """Return the dict entries of *items* whose displayName contains *name_filter*.

        A non-list *items* yields an empty list. A missing or null
        ``displayName`` is treated as an empty string, so such entries are
        kept only when no filter is set.
        """
        if not isinstance(items, list):
            return []
        entries = [item for item in items if isinstance(item, dict)]
        if not name_filter:
            return entries
        needle = name_filter.lower()
        return [item for item in entries if needle in (item.get("displayName") or "").lower()]

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in.

        Returns:
            True when ``m365 status`` reports a connected account, otherwise
            False (an error is logged describing what is missing).
        """
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

        # The status result may be a JSON object or plain text.
        if isinstance(result, dict) and result.get("connectedAs"):
            return True
        if isinstance(result, str) and "Logged in" in result:
            return True
        logger.error("m365 not logged in. Run: m365 login")
        return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List OneNote pages across notebooks/sections. Returns SourceFile per page.

        ``folder_id``, ``folder_path`` and ``patterns`` are accepted but not
        used by this method; filtering is driven by the ``notebook_name`` and
        ``section_name`` given at construction.

        A failure (CLI error or timeout) while listing notebooks yields an
        empty list; a failure for a single notebook or section is logged and
        that notebook/section is skipped.
        """
        files: List[SourceFile] = []

        # Step 1: List notebooks
        try:
            notebooks = _run_m365(["onenote", "notebook", "list"], timeout=60)
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            logger.error(f"Failed to list OneNote notebooks: {e}")
            return []

        for notebook in self._filter_by_name(notebooks, self.notebook_name):
            notebook_id = notebook.get("id", "")
            notebook_name = notebook.get("displayName") or "Untitled Notebook"

            # Step 2: List sections in this notebook
            try:
                sections = _run_m365(
                    ["onenote", "section", "list", "--notebookId", notebook_id],
                    timeout=60,
                )
            except (RuntimeError, subprocess.TimeoutExpired) as e:
                logger.warning(f"Failed to list sections for notebook '{notebook_name}': {e}")
                continue

            for section in self._filter_by_name(sections, self.section_name):
                section_id = section.get("id", "")
                section_name = section.get("displayName") or "Untitled Section"

                # Step 3: List pages in this section
                try:
                    pages = _run_m365(
                        ["onenote", "page", "list", "--sectionId", section_id],
                        timeout=60,
                    )
                except (RuntimeError, subprocess.TimeoutExpired) as e:
                    logger.warning(f"Failed to list pages in section '{section_name}': {e}")
                    continue

                if not isinstance(pages, list):
                    pages = []

                for page in pages:
                    if not isinstance(page, dict):
                        continue
                    page_id = page.get("id", "")
                    # A null, empty or whitespace-only title falls back to a placeholder.
                    title = (page.get("title") or "").strip() or "Untitled Page"
                    modified = page.get("lastModifiedDateTime")
                    # Build a path for organizational context
                    page_path = f"{notebook_name}/{section_name}/{title}"

                    files.append(
                        SourceFile(
                            name=title,
                            id=str(page_id),
                            size_bytes=None,
                            mime_type="text/html",
                            modified_at=modified,
                            path=page_path,
                        )
                    )

        logger.info(f"Found {len(files)} page(s) in OneNote")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a OneNote page's content as a text file.

        Args:
            file: The page to fetch; its ``id`` is passed to
                ``m365 onenote page get``.
            destination: Output file path; parent directories are created.

        Returns:
            ``destination`` as a ``Path``, after the text has been written.

        Raises:
            RuntimeError: If the m365 command fails.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = _run_m365(
                ["onenote", "page", "get", "--id", file.id],
                timeout=60,
            )
        except RuntimeError as e:
            raise RuntimeError(f"Failed to fetch OneNote page {file.id}: {e}") from e

        # Extract HTML content from the result
        if isinstance(result, dict):
            body = result.get("body")
            nested = body.get("content") if isinstance(body, dict) else None
            html = result.get("content") or nested
            if not html or not isinstance(html, str):
                # Fallback: serialize the whole response
                html = json.dumps(result, indent=2)
        elif isinstance(result, str):
            html = result
        else:
            html = str(result)

        text = _html_to_text(html)
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved page '{file.name}' to {destination}")
        return destination
223

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button