|
0981a08…
|
noreply
|
1 |
"""Obsidian vault source connector for ingesting markdown notes.""" |
|
0981a08…
|
noreply
|
2 |
|
|
0981a08…
|
noreply
|
3 |
import logging |
|
0981a08…
|
noreply
|
4 |
import re |
|
0981a08…
|
noreply
|
5 |
import shutil |
|
0981a08…
|
noreply
|
6 |
from datetime import datetime, timezone |
|
0981a08…
|
noreply
|
7 |
from pathlib import Path |
|
0981a08…
|
noreply
|
8 |
from typing import List, Optional, Tuple |
|
0981a08…
|
noreply
|
9 |
|
|
0981a08…
|
noreply
|
10 |
from video_processor.sources.base import BaseSource, SourceFile |
|
0981a08…
|
noreply
|
11 |
|
|
0981a08…
|
noreply
|
12 |
logger = logging.getLogger(__name__) |
|
0981a08…
|
noreply
|
13 |
|
|
0981a08…
|
noreply
|
14 |
|
|
0981a08…
|
noreply
|
15 |
def parse_note(path: Path) -> dict:
    """Parse an Obsidian markdown note and extract structured content.

    Args:
        path: Filesystem path to the ``.md`` note; read as UTF-8.

    Returns a dict with:
        - frontmatter: dict of YAML frontmatter metadata (flat ``key: value``
          pairs and single-line ``[a, b]`` lists only — this is a stdlib-only
          parser, not full YAML)
        - links: list of linked page names from [[wiki-links]]
        - tags: list of tags from #tag occurrences (may contain duplicates)
        - headings: list of dicts with level and text
        - body: markdown text without frontmatter

    Raises:
        OSError: if the file cannot be read.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    text = path.read_text(encoding="utf-8")

    # Extract YAML frontmatter (simple key: value parser, stdlib only)
    frontmatter: dict = {}
    body = text
    fm_match = re.match(r"\A---\n(.*?\n)---\n?(.*)", text, re.DOTALL)
    if fm_match:
        fm_text = fm_match.group(1)
        for line in fm_text.strip().splitlines():
            kv = re.match(r"^([A-Za-z_][A-Za-z0-9_ -]*):\s*(.*)", line)
            if kv:
                key = kv.group(1).strip()
                value = kv.group(2).strip()
                # Strip surrounding quotes
                if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                    value = value[1:-1]
                # Handle YAML-style lists on a single line [a, b, c]
                list_match = re.match(r"^\[(.+)\]$", value)
                if list_match:
                    value = [v.strip().strip("\"'") for v in list_match.group(1).split(",")]
                frontmatter[key] = value
        body = fm_match.group(2)

    # Scan for links and tags on a copy with fenced code blocks and inline
    # code spans removed, so e.g. "#include" in a code sample is not counted
    # as a tag. The returned body is left untouched.
    scan_text = re.sub(r"```.*?```", "", body, flags=re.DOTALL)
    scan_text = re.sub(r"`[^`\n]*`", "", scan_text)

    # Extract wiki-links: [[page]] and [[page|alias]] (alias discarded)
    link_pattern = re.compile(r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]")
    links = link_pattern.findall(scan_text)

    # Extract tags: #tag (code spans removed above; frontmatter already
    # stripped from body). Does not match #[[tag]] (Logseq style) or
    # "## heading" markers, since the char after '#' must be a letter.
    tag_pattern = re.compile(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)")
    tags = tag_pattern.findall(scan_text)

    # Extract headings hierarchy (levels 1-6, from the full body)
    heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
    headings = [
        {"level": len(m.group(1)), "text": m.group(2).strip()}
        for m in heading_pattern.finditer(body)
    ]

    return {
        "frontmatter": frontmatter,
        "links": links,
        "tags": tags,
        "headings": headings,
        "body": body,
    }
|
0981a08…
|
noreply
|
71 |
|
|
0981a08…
|
noreply
|
72 |
|
|
0981a08…
|
noreply
|
73 |
def ingest_vault(vault_path: Path) -> dict:
    """Ingest an entire Obsidian vault and return structured data.

    Walks the vault recursively for ``*.md`` files (sorted for deterministic
    order), parses each with :func:`parse_note`, and skips — with a logged
    traceback — any note that fails to parse.

    Args:
        vault_path: Root directory of the vault; coerced to ``Path``.

    Returns a dict with:
        - notes: list of dicts with name, tags, frontmatter, text
        - links: list of (source, target) tuples from wiki-links
    """
    vault_path = Path(vault_path)
    notes: List[dict] = []
    links: List[Tuple[str, str]] = []

    md_files = sorted(vault_path.rglob("*.md"))
    logger.info("Found %d markdown files in vault %s", len(md_files), vault_path)

    for md_file in md_files:
        # Note identity is the filename stem. NOTE(review): two files with
        # the same stem in different subfolders will share a name — confirm
        # downstream consumers tolerate that.
        note_name = md_file.stem
        try:
            parsed = parse_note(md_file)
        except Exception:
            # Best-effort ingestion: skip unparseable notes, but keep the
            # traceback so the failure is diagnosable (logger.exception
            # records exc_info; a bare warning would discard it).
            logger.exception("Failed to parse note %s", md_file)
            continue

        notes.append(
            {
                "name": note_name,
                "tags": parsed["tags"],
                "frontmatter": parsed["frontmatter"],
                "text": parsed["body"],
            }
        )
        # Link tuples are (source note, linked page) in document order.
        links.extend((note_name, linked_page) for linked_page in parsed["links"])

    logger.info(
        "Ingested %d notes with %d links from vault %s",
        len(notes),
        len(links),
        vault_path,
    )
    return {"notes": notes, "links": links}
|
0981a08…
|
noreply
|
114 |
|
|
0981a08…
|
noreply
|
115 |
|
|
0981a08…
|
noreply
|
116 |
class ObsidianSource(BaseSource):
    """Source connector that treats a local Obsidian vault as a file source.

    Notes are plain markdown files on disk, so "downloading" is simply a
    metadata-preserving copy out of the vault.
    """

    def __init__(self, vault_path: str) -> None:
        # Root directory of the vault; all file ids are relative to it.
        self.vault_path = Path(vault_path)

    def authenticate(self) -> bool:
        """Check that the vault path exists and contains .md files."""
        root = self.vault_path
        if not root.is_dir():
            logger.error("Vault path does not exist: %s", root)
            return False
        note_count = len(list(root.rglob("*.md")))
        if note_count == 0:
            logger.error("No markdown files found in vault: %s", root)
            return False
        logger.info(
            "Obsidian vault authenticated: %s (%d .md files)",
            root,
            note_count,
        )
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List all .md files in the vault recursively as SourceFile objects."""
        # folder_path narrows the search to a subdirectory of the vault;
        # folder_id and patterns are accepted for interface compatibility
        # but unused here — presumably only relevant to remote sources.
        root = self.vault_path / folder_path if folder_path else self.vault_path
        results = [self._as_source_file(p) for p in sorted(root.rglob("*.md"))]
        logger.info("Listed %d files from vault %s", len(results), self.vault_path)
        return results

    def _as_source_file(self, md_file: Path) -> SourceFile:
        """Build a SourceFile record for one markdown file on disk."""
        relative = str(md_file.relative_to(self.vault_path))
        info = md_file.stat()
        # Use a timezone-aware UTC timestamp for the ISO-8601 modified time.
        mtime = datetime.fromtimestamp(info.st_mtime, tz=timezone.utc)
        return SourceFile(
            name=md_file.name,
            id=relative,
            size_bytes=info.st_size,
            mime_type="text/markdown",
            modified_at=mtime.isoformat(),
            path=relative,
        )

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Copy a vault file to the destination path."""
        src = self.vault_path / file.id
        dest = Path(destination)
        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src, dest)
        logger.info("Copied %s -> %s", src, dest)
        return dest