|
1
|
"""Auto-detect knowledge graph files in the filesystem.""" |
|
2
|
|
|
3
|
import logging |
|
4
|
from pathlib import Path |
|
5
|
from typing import Dict, List, Optional |
|
6
|
|
|
7
|
logger = logging.getLogger(__name__) |
|
8
|
|
|
9
|
# Common output subdirectories where generated graphs may live.
_OUTPUT_SUBDIRS = ["results", "output", "knowledge-base"]

# Filenames we look for, in preference order.
_DB_FILENAMES = ["knowledge_graph.db"]
_JSON_FILENAMES = ["knowledge_graph.json"]


def find_knowledge_graphs(
    start_dir: Optional[Path] = None,
    walk_up: bool = True,
    max_depth_down: int = 4,
) -> List[Path]:
    """Find knowledge graph files near *start_dir*, sorted by proximity.

    Search order:
    1. start_dir itself
    2. Common output subdirs (results/, output/, knowledge-base/)
    3. Recursive walk downward (up to *max_depth_down* levels)
    4. Walk upward through parent directories (if *walk_up* is True)

    Args:
        start_dir: Directory to anchor the search; defaults to the CWD.
        walk_up: Also probe ancestor directories and their output subdirs.
        max_depth_down: Maximum recursion depth for the downward walk.

    Returns:
        .db files first, then .json, each group sorted closest-first.
    """
    start_dir = Path(start_dir or Path.cwd()).resolve()
    # Hoisted: this list is consulted for every directory visited.
    candidate_names = _DB_FILENAMES + _JSON_FILENAMES
    found_db: List[tuple] = []  # (distance, path)
    found_json: List[tuple] = []
    seen: set = set()

    def _record(path: Path, distance: int) -> None:
        # Deduplicate on the resolved path so a file reached by several
        # routes (direct probe + downward walk) is counted only once, at
        # the distance of its first discovery.
        rp = path.resolve()
        if rp in seen or not rp.is_file():
            return
        seen.add(rp)
        bucket = found_db if rp.suffix == ".db" else found_json
        bucket.append((distance, rp))

    # 1. Direct check in start_dir
    for name in candidate_names:
        _record(start_dir / name, 0)

    # 2. Common output subdirs
    for subdir in _OUTPUT_SUBDIRS:
        for name in candidate_names:
            _record(start_dir / subdir / name, 1)

    # 3. Walk downward (hidden directories are skipped)
    def _walk_down(directory: Path, depth: int) -> None:
        if depth > max_depth_down:
            return
        try:
            for child in sorted(directory.iterdir()):
                if child.is_file() and child.name in candidate_names:
                    _record(child, depth)
                elif child.is_dir() and not child.name.startswith("."):
                    _walk_down(child, depth + 1)
        except PermissionError:
            # Best-effort search: unreadable directories are skipped.
            pass

    _walk_down(start_dir, 1)

    # 4. Walk upward.
    # BUG FIX: the previous loop condition (`while parent != parent.parent`)
    # exited *before* examining the filesystem root, so a graph stored at
    # the root was never found.  Now each ancestor is probed first and the
    # loop stops only after the root itself has been processed.
    if walk_up:
        parent = start_dir.parent
        distance = 1
        while True:
            for name in candidate_names:
                _record(parent / name, distance)
            for subdir in _OUTPUT_SUBDIRS:
                for name in candidate_names:
                    _record(parent / subdir / name, distance + 1)
            if parent == parent.parent:  # reached the filesystem root
                break
            parent = parent.parent
            distance += 1

    # Sort each group by distance (stable sort, so discovery order breaks
    # ties), then combine db-first.
    found_db.sort(key=lambda x: x[0])
    found_json.sort(key=lambda x: x[0])
    return [p for _, p in found_db] + [p for _, p in found_json]


def find_nearest_graph(start_dir: Optional[Path] = None) -> Optional[Path]:
    """Return the closest knowledge graph file, or None."""
    matches = find_knowledge_graphs(start_dir)
    return next(iter(matches), None)


def describe_graph(db_path: Path) -> Dict:
    """Return summary stats for a knowledge graph file.

    Returns dict with: entity_count, relationship_count, entity_types, store_type.
    """
    # Imported lazily to keep module import cheap when stats aren't needed.
    from video_processor.integrators.graph_store import (
        InMemoryStore,
        SQLiteStore,
        create_store,
    )

    path = Path(db_path)

    if path.suffix == ".json":
        import json

        # JSON export: rebuild an in-memory store from nodes + relationships.
        payload = json.loads(path.read_text())
        store = InMemoryStore()
        for node in payload.get("nodes", []):
            store.merge_entity(
                node.get("name", ""),
                node.get("type", "concept"),
                node.get("descriptions", []),
            )
        for edge in payload.get("relationships", []):
            store.add_relationship(
                edge.get("source", ""),
                edge.get("target", ""),
                edge.get("type", "related_to"),
            )
        store_type = "json"
    else:
        store = create_store(path)
        store_type = "sqlite" if isinstance(store, SQLiteStore) else "inmemory"

    # Tally entities per type; entities lacking a type count as "concept".
    type_counts: Dict = {}
    for entity in store.get_all_entities():
        kind = entity.get("type", "concept")
        type_counts[kind] = type_counts.get(kind, 0) + 1

    return {
        "entity_count": store.get_entity_count(),
        "relationship_count": store.get_relationship_count(),
        "entity_types": type_counts,
        "store_type": store_type,
    }