PlanOpticon

planopticon / video_processor / sources / obsidian_source.py
Source Blame History 178 lines
0981a08… noreply 1 """Obsidian vault source connector for ingesting markdown notes."""
0981a08… noreply 2
0981a08… noreply 3 import logging
0981a08… noreply 4 import re
0981a08… noreply 5 import shutil
0981a08… noreply 6 from datetime import datetime, timezone
0981a08… noreply 7 from pathlib import Path
0981a08… noreply 8 from typing import List, Optional, Tuple
0981a08… noreply 9
0981a08… noreply 10 from video_processor.sources.base import BaseSource, SourceFile
0981a08… noreply 11
0981a08… noreply 12 logger = logging.getLogger(__name__)
0981a08… noreply 13
0981a08… noreply 14
# Leading YAML frontmatter: an opening ``---`` line, the (possibly empty)
# metadata lines, and a closing line that consists solely of ``---``.
_FRONTMATTER_RE = re.compile(
    r"\A---[ \t]*\n(.*?)^---[ \t]*(?:\n|\Z)(.*)",
    re.DOTALL | re.MULTILINE,
)
# ``key: value`` line; keys may contain letters, digits, ``_``, ``-`` and spaces.
_FM_KEY_VALUE_RE = re.compile(r"^([A-Za-z_][A-Za-z0-9_ -]*):\s*(.*)")
# ``- item`` line of a block-style list.
_FM_LIST_ITEM_RE = re.compile(r"^\s*-\s+(.*)$")
# Single-line list ``[a, b, c]`` (possibly empty).
_FM_INLINE_LIST_RE = re.compile(r"^\[(.*)\]$")
# Fenced code block (``` or ~~~) up to its closing fence or, when the fence is
# never closed, the end of the text.
_FENCED_CODE_RE = re.compile(
    r"^[ \t]*(`{3,}|~{3,})[^\n]*\n.*?(?:^[ \t]*\1[`~]*[ \t]*$|\Z)",
    re.MULTILINE | re.DOTALL,
)
# Inline code span on a single line.
_INLINE_CODE_RE = re.compile(r"`[^`\n]*`")
# [[page]] and [[page|alias]]; only the page part is captured.
_WIKI_LINK_RE = re.compile(r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]")
# #tag, but not #[[tag]] (Logseq style) and not the hashes of ## headings.
_TAG_RE = re.compile(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)")
# ATX heading. The separator is restricted to spaces/tabs so that an empty
# heading line cannot swallow the following line as its text.
_HEADING_RE = re.compile(r"^(#{1,6})[ \t]+(\S.*)$", re.MULTILINE)


def _parse_frontmatter_value(raw: str):
    """Convert one raw frontmatter value into a ``str`` or a list of ``str``."""
    # A quoted scalar is always a string, even if it looks like ``"[x]"``.
    if len(raw) >= 2 and raw[0] == raw[-1] and raw[0] in ('"', "'"):
        return raw[1:-1]
    inline_list = _FM_INLINE_LIST_RE.match(raw)
    if inline_list:
        inner = inline_list.group(1).strip()
        if not inner:
            return []
        return [item.strip().strip("\"'") for item in inner.split(",")]
    return raw


def _parse_frontmatter(fm_text: str) -> dict:
    """Parse simple ``key: value`` frontmatter lines (stdlib only, no YAML lib).

    Supports plain and quoted scalars, single-line lists (``[a, b]``) and
    block-style lists (``key:`` followed by ``- item`` lines). Any other line
    is ignored.
    """
    frontmatter: dict = {}
    # Key whose value was left empty and may therefore continue as a block list.
    open_key: Optional[str] = None
    for line in fm_text.strip().splitlines():
        kv = _FM_KEY_VALUE_RE.match(line)
        if kv:
            key = kv.group(1).strip()
            raw = kv.group(2).strip()
            frontmatter[key] = _parse_frontmatter_value(raw)
            open_key = key if not raw else None
            continue
        item = _FM_LIST_ITEM_RE.match(line)
        if item and open_key is not None:
            if not isinstance(frontmatter[open_key], list):
                frontmatter[open_key] = []
            frontmatter[open_key].append(item.group(1).strip().strip("\"'"))
    return frontmatter


def parse_note(path: Path) -> dict:
    """Parse an Obsidian markdown note and extract structured content.

    The file is read as UTF-8. A leading ``---`` ... ``---`` block is treated
    as frontmatter and removed from the body.

    Returns a dict with:
    - frontmatter: dict of frontmatter metadata (values are str or list of str)
    - links: list of linked page names from [[wiki-links]] in the body
    - tags: list of tags from #tag occurrences outside code blocks/spans
    - headings: list of dicts with level and text, outside fenced code blocks
    - body: markdown text without frontmatter

    Raises whatever ``Path.read_text`` raises (e.g. ``OSError``,
    ``UnicodeDecodeError``).
    """
    text = path.read_text(encoding="utf-8")

    frontmatter: dict = {}
    body = text
    fm_match = _FRONTMATTER_RE.match(text)
    if fm_match:
        frontmatter = _parse_frontmatter(fm_match.group(1))
        body = fm_match.group(2)

    # Wiki-links are collected from the whole body.
    links = _WIKI_LINK_RE.findall(body)

    # Code must not contribute tags or headings: "#include" or a shell
    # comment inside a fence is neither a tag nor a heading.
    without_fences = _FENCED_CODE_RE.sub("", body)

    # Inline code is removed for tags only, so heading text that contains a
    # code span is reported unchanged.
    tags = _TAG_RE.findall(_INLINE_CODE_RE.sub(" ", without_fences))

    headings = [
        {"level": len(m.group(1)), "text": m.group(2).strip()}
        for m in _HEADING_RE.finditer(without_fences)
    ]

    return {
        "frontmatter": frontmatter,
        "links": links,
        "tags": tags,
        "headings": headings,
        "body": body,
    }
0981a08… noreply 71
0981a08… noreply 72
def ingest_vault(vault_path: Path) -> dict:
    """Ingest an entire Obsidian vault and return structured data.

    Every ``*.md`` file under ``vault_path`` (searched recursively, processed
    in sorted path order) is parsed with ``parse_note``. Ingestion is
    best-effort: a note that fails to parse is logged and skipped.

    Notes are identified by file stem only, so two files with the same stem
    in different folders produce entries with the same name.

    Returns a dict with:
    - notes: list of dicts with name, tags, frontmatter, text
    - links: list of (source, target) tuples from wiki-links
    """
    vault_path = Path(vault_path)
    notes: List[dict] = []
    links: List[Tuple[str, str]] = []

    md_files = sorted(vault_path.rglob("*.md"))
    logger.info("Found %d markdown files in vault %s", len(md_files), vault_path)

    for md_file in md_files:
        note_name = md_file.stem
        try:
            parsed = parse_note(md_file)
        except Exception:
            # Deliberately broad: one unreadable note must not abort the whole
            # vault. The traceback is attached so the cause is not lost.
            logger.warning("Failed to parse note %s", md_file, exc_info=True)
            continue

        notes.append(
            {
                "name": note_name,
                "tags": parsed["tags"],
                "frontmatter": parsed["frontmatter"],
                "text": parsed["body"],
            }
        )
        links.extend((note_name, linked_page) for linked_page in parsed["links"])

    logger.info(
        "Ingested %d notes with %d links from vault %s",
        len(notes),
        len(links),
        vault_path,
    )
    return {"notes": notes, "links": links}
0981a08… noreply 114
0981a08… noreply 115
class ObsidianSource(BaseSource):
    """Source connector for Obsidian vaults.

    The vault is a local directory; its notes are the regular ``*.md`` files
    found anywhere beneath it.
    """

    def __init__(self, vault_path: str) -> None:
        """Remember the vault location; no filesystem access happens here."""
        self.vault_path = Path(vault_path)

    @staticmethod
    def _markdown_files(root: Path) -> List[Path]:
        """Return the regular ``*.md`` files under *root*, sorted by path.

        ``rglob`` matches on name only, so directories called ``*.md`` and
        broken symlinks are filtered out here rather than listed as notes or
        allowed to fail later in ``stat()``.
        """
        return sorted(p for p in root.rglob("*.md") if p.is_file())

    def authenticate(self) -> bool:
        """Check that the vault path exists and contains .md files.

        Returns True when the path is a directory holding at least one
        markdown file; otherwise logs an error and returns False.
        """
        if not self.vault_path.is_dir():
            logger.error("Vault path does not exist: %s", self.vault_path)
            return False
        md_files = self._markdown_files(self.vault_path)
        if not md_files:
            logger.error("No markdown files found in vault: %s", self.vault_path)
            return False
        logger.info(
            "Obsidian vault authenticated: %s (%d .md files)",
            self.vault_path,
            len(md_files),
        )
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List all .md files in the vault recursively as SourceFile objects.

        ``folder_path``, when given, is joined onto the vault path and limits
        the search to that subfolder. ``folder_id`` and ``patterns`` are
        accepted but not used by this connector.

        Each result uses the vault-relative path as both ``id`` and ``path``
        and reports the modification time as an ISO-8601 UTC timestamp.
        """
        search_root = self.vault_path
        if folder_path:
            search_root = self.vault_path / folder_path

        results: List[SourceFile] = []
        for md_file in self._markdown_files(search_root):
            relative = md_file.relative_to(self.vault_path)
            stat = md_file.stat()
            modified_dt = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc)

            results.append(
                SourceFile(
                    name=md_file.name,
                    id=str(relative),
                    size_bytes=stat.st_size,
                    mime_type="text/markdown",
                    modified_at=modified_dt.isoformat(),
                    path=str(relative),
                )
            )

        logger.info("Listed %d files from vault %s", len(results), self.vault_path)
        return results

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Copy a vault file to the destination path.

        ``file.id`` is resolved relative to the vault path. Missing parent
        directories of ``destination`` are created. Returns ``destination``
        as a ``Path``.
        """
        source = self.vault_path / file.id
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        # copy2 also carries over file metadata such as timestamps.
        shutil.copy2(source, destination)
        logger.info("Copied %s -> %s", source, destination)
        return destination

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button