"""Microsoft OneNote source connector using the m365 CLI (cli-microsoft365).

Fetches pages from OneNote notebooks via the `m365` CLI tool.
Outputs plain text suitable for KG ingestion.

Requires: npm install -g @pnp/cli-microsoft365
Auth: m365 login (interactive)
Docs: https://pnp.github.io/cli-microsoft365/
"""
|
10
|
|
|
11
|
import json |
|
12
|
import logging |
|
13
|
import re |
|
14
|
import shutil |
|
15
|
import subprocess |
|
16
|
from pathlib import Path |
|
17
|
from typing import Any, List, Optional |
|
18
|
|
|
19
|
from video_processor.sources.base import BaseSource, SourceFile |
|
20
|
|
|
21
|
logger = logging.getLogger(__name__) |
|
22
|
|
|
23
|
|
|
24
|
def _run_m365(args: List[str], timeout: int = 30) -> Any: |
|
25
|
"""Run an m365 CLI command and return parsed JSON output.""" |
|
26
|
cmd = ["m365"] + args + ["--output", "json"] |
|
27
|
proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) |
|
28
|
if proc.returncode != 0: |
|
29
|
raise RuntimeError(f"m365 {' '.join(args)} failed: {proc.stderr.strip()}") |
|
30
|
try: |
|
31
|
return json.loads(proc.stdout) |
|
32
|
except json.JSONDecodeError: |
|
33
|
return proc.stdout.strip() |
|
34
|
|
|
35
|
|
|
36
|
def _html_to_text(html: str) -> str: |
|
37
|
"""Strip HTML tags and decode entities to produce plain text. |
|
38
|
|
|
39
|
Uses only stdlib ``re`` — no external dependencies. |
|
40
|
""" |
|
41
|
# Remove script/style blocks entirely |
|
42
|
text = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE) |
|
43
|
# Replace <br>, <p>, <div>, <li>, <tr> with newlines for readability |
|
44
|
text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE) |
|
45
|
text = re.sub(r"</(p|div|li|tr|h[1-6])>", "\n", text, flags=re.IGNORECASE) |
|
46
|
# Strip remaining tags |
|
47
|
text = re.sub(r"<[^>]+>", "", text) |
|
48
|
# Decode common HTML entities |
|
49
|
entity_map = { |
|
50
|
"&": "&", |
|
51
|
"<": "<", |
|
52
|
">": ">", |
|
53
|
""": '"', |
|
54
|
"'": "'", |
|
55
|
"'": "'", |
|
56
|
" ": " ", |
|
57
|
} |
|
58
|
for entity, char in entity_map.items(): |
|
59
|
text = text.replace(entity, char) |
|
60
|
# Decode numeric entities ({ and ) |
|
61
|
text = re.sub(r"&#x([0-9a-fA-F]+);", lambda m: chr(int(m.group(1), 16)), text) |
|
62
|
text = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), text) |
|
63
|
# Collapse excessive blank lines |
|
64
|
text = re.sub(r"\n{3,}", "\n\n", text) |
|
65
|
return text.strip() |
|
66
|
|
|
67
|
|
|
68
|
class OneNoteSource(BaseSource):
    """
    Fetch pages from OneNote notebooks via the m365 CLI.

    Pages are discovered by walking notebooks, then sections, then pages
    with ``m365 onenote ... list`` commands; each page is saved as plain text.

    Usage:
        source = OneNoteSource()  # all notebooks
        source = OneNoteSource(notebook_name="Work Notes")  # specific notebook
        source = OneNoteSource(notebook_name="Work", section_name="Meetings")
        files = source.list_videos()
        source.download_all(files, Path("./notes"))
    """

    def __init__(
        self,
        notebook_name: Optional[str] = None,
        section_name: Optional[str] = None,
    ):
        """Store the optional notebook/section name filters.

        Args:
            notebook_name: Case-insensitive substring a notebook's display
                name must contain; ``None`` selects every notebook.
            section_name: Case-insensitive substring a section's display
                name must contain; ``None`` selects every section.
        """
        self.notebook_name = notebook_name
        self.section_name = section_name

    @staticmethod
    def _dict_items(value: Any) -> List[dict]:
        """Return the dict entries of *value* if it is a list, else ``[]``."""
        if not isinstance(value, list):
            return []
        return [item for item in value if isinstance(item, dict)]

    @staticmethod
    def _filter_by_name(items: List[dict], needle: Optional[str]) -> List[dict]:
        """Keep items whose ``displayName`` contains *needle*, ignoring case."""
        if not needle:
            return items
        wanted = needle.lower()
        # ``or ""`` guards against a JSON null displayName.
        return [item for item in items if wanted in str(item.get("displayName") or "").lower()]

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in.

        Returns:
            ``True`` when ``m365 status`` reports a connected account,
            ``False`` otherwise (the reason is logged).
        """
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

        # The status command may yield a JSON object or plain text.
        if isinstance(result, dict) and result.get("connectedAs"):
            return True
        if isinstance(result, str) and "Logged in" in result:
            return True
        logger.error("m365 not logged in. Run: m365 login")
        return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List OneNote pages across notebooks/sections. Returns SourceFile per page.

        Args:
            folder_id: Not used by this implementation.
            folder_path: Not used by this implementation.
            patterns: Not used by this implementation.

        Returns:
            One ``SourceFile`` per page found. A failure (error or timeout)
            while listing notebooks yields an empty list; a failure while
            listing one notebook's sections or one section's pages is logged
            and that notebook/section is skipped.
        """
        files: List[SourceFile] = []

        # Step 1: List notebooks
        try:
            notebooks = _run_m365(["onenote", "notebook", "list"], timeout=60)
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            logger.error(f"Failed to list OneNote notebooks: {e}")
            return []

        notebooks = self._filter_by_name(self._dict_items(notebooks), self.notebook_name)

        for notebook in notebooks:
            # ``or`` (rather than a .get default) also covers JSON null values.
            notebook_id = notebook.get("id") or ""
            notebook_name = notebook.get("displayName") or "Untitled Notebook"

            # Step 2: List sections in this notebook
            try:
                sections = _run_m365(
                    ["onenote", "section", "list", "--notebookId", str(notebook_id)],
                    timeout=60,
                )
            except (RuntimeError, subprocess.TimeoutExpired) as e:
                logger.warning(f"Failed to list sections for notebook '{notebook_name}': {e}")
                continue

            sections = self._filter_by_name(self._dict_items(sections), self.section_name)

            for section in sections:
                section_id = section.get("id") or ""
                section_name = section.get("displayName") or "Untitled Section"

                # Step 3: List pages in this section
                try:
                    pages = _run_m365(
                        ["onenote", "page", "list", "--sectionId", str(section_id)],
                        timeout=60,
                    )
                except (RuntimeError, subprocess.TimeoutExpired) as e:
                    logger.warning(f"Failed to list pages in section '{section_name}': {e}")
                    continue

                for page in self._dict_items(pages):
                    page_id = page.get("id") or ""
                    title = str(page.get("title") or "").strip() or "Untitled Page"
                    modified = page.get("lastModifiedDateTime")
                    # Build a path for organizational context
                    page_path = f"{notebook_name}/{section_name}/{title}"

                    files.append(
                        SourceFile(
                            name=title,
                            id=str(page_id),
                            size_bytes=None,
                            mime_type="text/html",
                            modified_at=modified,
                            path=page_path,
                        )
                    )

        logger.info(f"Found {len(files)} page(s) in OneNote")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a OneNote page's content as a text file.

        Args:
            file: Page to fetch; its ``id`` is passed to ``m365 onenote page get``.
            destination: File path to write; parent directories are created.

        Returns:
            The path that was written.

        Raises:
            RuntimeError: If the m365 command fails.
            subprocess.TimeoutExpired: If the m365 command times out.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = _run_m365(
                ["onenote", "page", "get", "--id", file.id],
                timeout=60,
            )
        except RuntimeError as e:
            raise RuntimeError(f"Failed to fetch OneNote page {file.id}: {e}") from e

        # Extract HTML content from the result
        if isinstance(result, dict):
            html = result.get("content")
            if not html:
                # "body" may be absent or null, so only index into a real dict.
                body = result.get("body")
                html = body.get("content") if isinstance(body, dict) else None
            if not html or not isinstance(html, str):
                # Fallback: serialize the whole response
                html = json.dumps(result, indent=2)
        elif isinstance(result, str):
            html = result
        else:
            html = str(result)

        text = _html_to_text(html)
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved page '{file.name}' to {destination}")
        return destination
|
223
|
|