|
b363c5b…
|
noreply
|
1 |
"""Auto-detect knowledge graph files in the filesystem.""" |
|
b363c5b…
|
noreply
|
2 |
|
|
b363c5b…
|
noreply
|
3 |
import logging |
|
b363c5b…
|
noreply
|
4 |
from pathlib import Path |
|
b363c5b…
|
noreply
|
5 |
from typing import Dict, List, Optional |
|
b363c5b…
|
noreply
|
6 |
|
|
b363c5b…
|
noreply
|
7 |
logger = logging.getLogger(__name__)

# Common output subdirectories where graphs may live
_OUTPUT_SUBDIRS = ["results", "output", "knowledge-base"]

# Filenames we look for, in preference order
_DB_FILENAMES = ["knowledge_graph.db"]
_JSON_FILENAMES = ["knowledge_graph.json"]


def find_knowledge_graphs(
    start_dir: Optional[Path] = None,
    walk_up: bool = True,
    max_depth_down: int = 4,
) -> List[Path]:
    """Find knowledge graph files near *start_dir*, sorted by proximity.

    Search order:
    1. start_dir itself (distance 0)
    2. Common output subdirs (results/, output/, knowledge-base/) (distance 1)
    3. Recursive walk downward; start_dir counts as depth 1 and directories
       deeper than *max_depth_down* are not entered. Directories whose name
       starts with "." are skipped.
    4. Walk upward through parent directories (if *walk_up* is True),
       checking each parent and its common output subdirs.

    The scan is best-effort: candidates and directories that cannot be
    inspected (permission errors, entries that vanish mid-scan, symlink
    loops) are skipped and logged at DEBUG level instead of aborting.

    Args:
        start_dir: Directory to search from; defaults to the current working
            directory when None (or otherwise falsy).
        walk_up: Whether to also check parent directories.
        max_depth_down: Depth limit for the downward walk.

    Returns:
        Resolved paths, .db files first, then .json, each group sorted
        closest-first. Each file appears at most once.
    """
    start_dir = Path(start_dir or Path.cwd()).resolve()
    # Build the candidate name collections once instead of per directory entry.
    candidate_names = _DB_FILENAMES + _JSON_FILENAMES
    candidate_set = frozenset(candidate_names)

    found_db: List[tuple] = []  # (distance, path)
    found_json: List[tuple] = []
    seen: set = set()

    def _record(path: Path, distance: int) -> None:
        """Register *path* at *distance* if it is a not-yet-seen regular file."""
        try:
            rp = path.resolve()
            if rp in seen or not rp.is_file():
                return
        except (OSError, RuntimeError) as exc:
            # OSError: e.g. an unsearchable directory on the way to the file.
            # RuntimeError: symlink loop reported by resolve() on older Pythons.
            logger.debug("Skipping unreadable candidate %s: %s", path, exc)
            return
        seen.add(rp)
        bucket = found_db if rp.suffix == ".db" else found_json
        bucket.append((distance, rp))

    # 1. Direct check in start_dir
    for name in candidate_names:
        _record(start_dir / name, 0)

    # 2. Common output subdirs
    for subdir in _OUTPUT_SUBDIRS:
        for name in candidate_names:
            _record(start_dir / subdir / name, 1)

    # 3. Walk downward
    def _walk_down(directory: Path, depth: int) -> None:
        """Record candidates in *directory* and recurse into its subdirs."""
        if depth > max_depth_down:
            return
        try:
            # sorted() keeps the traversal order deterministic.
            children = sorted(directory.iterdir())
        except OSError as exc:
            logger.debug("Skipping unreadable directory %s: %s", directory, exc)
            return
        for child in children:
            try:
                is_match = child.name in candidate_set and child.is_file()
                descend = (
                    not is_match
                    and not child.name.startswith(".")
                    and child.is_dir()
                )
            except OSError as exc:
                # One bad entry must not hide its siblings.
                logger.debug("Skipping unreadable entry %s: %s", child, exc)
                continue
            if is_match:
                _record(child, depth)
            elif descend:
                _walk_down(child, depth + 1)

    _walk_down(start_dir, 1)

    # 4. Walk upward
    if walk_up:
        parent = start_dir.parent
        distance = 1
        # The loop stops once parent is its own parent, so the filesystem
        # root itself is never inspected.
        while parent != parent.parent:
            for name in candidate_names:
                _record(parent / name, distance)
            for subdir in _OUTPUT_SUBDIRS:
                for name in candidate_names:
                    _record(parent / subdir / name, distance + 1)
            parent = parent.parent
            distance += 1

    # Sort each group by distance (stable, so discovery order breaks ties),
    # then combine db-first
    found_db.sort(key=lambda x: x[0])
    found_json.sort(key=lambda x: x[0])
    return [p for _, p in found_db] + [p for _, p in found_json]
|
b363c5b…
|
noreply
|
86 |
|
|
b363c5b…
|
noreply
|
87 |
|
|
b363c5b…
|
noreply
|
88 |
def find_nearest_graph(start_dir: Optional[Path] = None) -> Optional[Path]:
    """Return the top-ranked knowledge graph file near *start_dir*, or None.

    This is the first entry of ``find_knowledge_graphs(start_dir)``, called
    with its default search options; None is returned when that search
    finds nothing.
    """
    candidates = find_knowledge_graphs(start_dir)
    if not candidates:
        return None
    return candidates[0]
|
b363c5b…
|
noreply
|
92 |
|
|
b363c5b…
|
noreply
|
93 |
|
|
b363c5b…
|
noreply
|
94 |
def describe_graph(db_path: Path) -> Dict:
    """Return summary stats for a knowledge graph file.

    A path with a ``.json`` suffix is parsed and loaded into an
    ``InMemoryStore``: every item under ``"nodes"`` is merged as an entity
    (name, type, descriptions) and every item under ``"relationships"`` is
    added as a relationship (source, target, type). Any other path is handed
    to ``create_store``.

    Args:
        db_path: Path to the graph file (anything ``Path()`` accepts).

    Returns:
        dict with:
            entity_count: value of ``store.get_entity_count()``.
            relationship_count: value of ``store.get_relationship_count()``.
            entity_types: mapping of entity type to number of entities of
                that type (entities without a type count as "concept").
            store_type: "json" for JSON files; otherwise "sqlite" when
                ``create_store`` returned a ``SQLiteStore``, else "inmemory".
    """
    from collections import Counter

    from video_processor.integrators.graph_store import (
        InMemoryStore,
        SQLiteStore,
        create_store,
    )

    db_path = Path(db_path)

    if db_path.suffix == ".json":
        import json

        # Hand raw bytes to json.loads so it detects UTF-8/16/32 itself
        # instead of decoding with the platform's locale encoding.
        data = json.loads(db_path.read_bytes())
        store = InMemoryStore()
        for node in data.get("nodes", []):
            store.merge_entity(
                node.get("name", ""),
                node.get("type", "concept"),
                node.get("descriptions", []),
            )
        for rel in data.get("relationships", []):
            store.add_relationship(
                rel.get("source", ""),
                rel.get("target", ""),
                rel.get("type", "related_to"),
            )
        store_type = "json"
    else:
        store = create_store(db_path)
        store_type = "sqlite" if isinstance(store, SQLiteStore) else "inmemory"

    # Tally entities per type; converted back to a plain dict so the return
    # value keeps its original type.
    entity_types = dict(
        Counter(e.get("type", "concept") for e in store.get_all_entities())
    )

    return {
        "entity_count": store.get_entity_count(),
        "relationship_count": store.get_relationship_count(),
        "entity_types": entity_types,
        "store_type": store_type,
    }