|
1
|
"""Logseq graph source connector for ingesting markdown pages and journals.""" |
|
2
|
|
|
3
|
import logging |
|
4
|
import re |
|
5
|
import shutil |
|
6
|
from datetime import datetime, timezone |
|
7
|
from pathlib import Path |
|
8
|
from typing import List, Optional, Tuple |
|
9
|
|
|
10
|
from video_processor.sources.base import BaseSource, SourceFile |
|
11
|
|
|
12
|
logger = logging.getLogger(__name__) |
|
13
|
|
|
14
|
|
|
15
|
def parse_page(path: Path) -> dict:
    """Parse a single Logseq markdown page into its structured parts.

    Returns a dict with:
    - properties: dict of page-level properties (key:: value lines at top)
    - links: list of linked page names from [[wiki-links]]
    - tags: list of tags from #tag and #[[tag]] occurrences
    - block_refs: list of block reference IDs from ((block-id))
    - body: full text content
    """
    text = path.read_text(encoding="utf-8")
    lines = text.split("\n")

    # Page properties are a run of "key:: value" lines at the very top;
    # the first non-property line ends the property block.
    properties: dict = {}
    body_start = 0
    prop_re = re.compile(r"^([A-Za-z][A-Za-z0-9_-]*)::\ ?(.*)")
    for index, raw_line in enumerate(lines):
        match = prop_re.match(raw_line)
        if not match:
            break
        properties[match.group(1)] = match.group(2).strip()
        body_start = index + 1

    body = "\n".join(lines[body_start:])

    # Wiki-links: [[page]] — collected from the body first, then from
    # property values (e.g. tags:: [[alpha]], [[beta]]).
    wiki_link_re = re.compile(r"\[\[([^\]]+)\]\]")
    links = wiki_link_re.findall(body)
    for prop_value in properties.values():
        links += wiki_link_re.findall(str(prop_value))

    # Tags: bracketed #[[multi word]] first, then plain #tag on a copy of
    # the text with bracketed tags stripped so nothing is matched twice.
    bracket_tag_re = re.compile(r"#\[\[([^\]]+)\]\]")
    tags = bracket_tag_re.findall(text)
    stripped_text = bracket_tag_re.sub("", text)
    simple_tag_re = re.compile(r"(?<!\w)#([A-Za-z][A-Za-z0-9_/-]*)")
    tags += simple_tag_re.findall(stripped_text)

    # Block references: ((block-id)) anywhere in the file.
    block_refs = re.findall(r"\(\(([a-f0-9-]+)\)\)", text)

    return {
        "properties": properties,
        "links": links,
        "tags": tags,
        "block_refs": block_refs,
        "body": body,
    }
|
71
|
|
|
72
|
|
|
73
|
def ingest_graph(graph_path: Path) -> dict:
    """Ingest an entire Logseq graph and return structured data.

    Scans the graph's ``pages/`` and ``journals/`` directories for markdown
    files and parses each one with ``parse_page``. Pages that fail to parse
    are logged (with traceback) and skipped rather than aborting the run.

    Args:
        graph_path: Root directory of the Logseq graph.

    Returns a dict with:
    - notes: list of dicts with name, tags, frontmatter (properties), text
    - links: list of (source, target) tuples from wiki-links
    """
    graph_path = Path(graph_path)
    notes: List[dict] = []
    links: List[Tuple[str, str]] = []

    # Only pages/ and journals/ hold user-authored markdown; other graph
    # directories (assets/, logseq/, ...) are intentionally skipped.
    md_files: List[Path] = []
    for subdir_name in ("pages", "journals"):
        subdir = graph_path / subdir_name
        if subdir.is_dir():
            md_files.extend(sorted(subdir.rglob("*.md")))

    logger.info("Found %d markdown files in graph %s", len(md_files), graph_path)

    for md_file in md_files:
        page_name = md_file.stem
        try:
            parsed = parse_page(md_file)
        except Exception:
            # Best-effort: skip the bad page, but record the traceback so
            # the failure is diagnosable (message alone loses the cause).
            logger.warning("Failed to parse page %s", md_file, exc_info=True)
            continue

        notes.append(
            {
                "name": page_name,
                "tags": parsed["tags"],
                "frontmatter": parsed["properties"],
                "text": parsed["body"],
            }
        )
        links.extend((page_name, target) for target in parsed["links"])

    logger.info(
        "Ingested %d notes with %d links from graph %s",
        len(notes),
        len(links),
        graph_path,
    )
    return {"notes": notes, "links": links}
|
122
|
|
|
123
|
|
|
124
|
class LogseqSource(BaseSource):
    """Source connector that exposes a local Logseq graph as a file source."""

    def __init__(self, graph_path: str) -> None:
        # Root directory of the graph (the folder containing pages/ and
        # journals/); normalized to a Path so callers may pass a string.
        self.graph_path = Path(graph_path)

    def authenticate(self) -> bool:
        """Check that the graph path exists and has pages/ or journals/ dirs.

        Returns:
            True when the graph root exists and contains at least one of the
            standard content directories; False otherwise (errors are logged).
        """
        if not self.graph_path.is_dir():
            logger.error("Graph path does not exist: %s", self.graph_path)
            return False
        has_pages = (self.graph_path / "pages").is_dir()
        has_journals = (self.graph_path / "journals").is_dir()
        if not has_pages and not has_journals:
            logger.error(
                "No pages/ or journals/ directory found in graph: %s",
                self.graph_path,
            )
            return False
        logger.info(
            "Logseq graph authenticated: %s (pages=%s, journals=%s)",
            self.graph_path,
            has_pages,
            has_journals,
        )
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List .md files in pages/ and journals/ as SourceFile objects.

        Args:
            folder_id: Unused; a local graph directory has no folder IDs.
            folder_path: Optional subdirectory (relative to the graph root)
                to search instead of the default pages/ and journals/.
            patterns: Optional glob-style patterns (e.g. ``["2024*.md"]``);
                when given, only files matching at least one pattern are
                returned. Previously this argument was silently ignored.

        Returns:
            One SourceFile per markdown file, with ``id`` and ``path`` set to
            the file's path relative to the graph root.
        """
        md_files: List[Path] = []

        if folder_path:
            search_root = self.graph_path / folder_path
            if search_root.is_dir():
                md_files.extend(sorted(search_root.rglob("*.md")))
        else:
            for subdir_name in ("pages", "journals"):
                subdir = self.graph_path / subdir_name
                if subdir.is_dir():
                    md_files.extend(sorted(subdir.rglob("*.md")))

        if patterns:
            # Honor the patterns filter: keep a file when any pattern
            # matches (pathlib glob semantics, matched from the right).
            md_files = [
                f for f in md_files if any(f.match(pat) for pat in patterns)
            ]

        results: List[SourceFile] = []
        for md_file in md_files:
            relative = md_file.relative_to(self.graph_path)
            stat = md_file.stat()
            # Timezone-aware UTC timestamp; a naive datetime would be
            # ambiguous across machines.
            modified_dt = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc)

            results.append(
                SourceFile(
                    name=md_file.name,
                    id=str(relative),
                    size_bytes=stat.st_size,
                    mime_type="text/markdown",
                    modified_at=modified_dt.isoformat(),
                    path=str(relative),
                )
            )

        logger.info("Listed %d files from graph %s", len(results), self.graph_path)
        return results

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Copy a graph file to the destination path.

        Args:
            file: A SourceFile previously returned by ``list_videos``; its
                ``id`` is the file path relative to the graph root.
            destination: Target path; missing parent directories are created.

        Returns:
            The destination path.
        """
        source = self.graph_path / file.id
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        # copy2 preserves mtime so modified_at stays meaningful downstream.
        shutil.copy2(source, destination)
        logger.info("Copied %s -> %s", source, destination)
        return destination
|
201
|
|