|
b363c5b…
|
noreply
|
1 |
"""Query engine for PlanOpticon knowledge graphs.""" |
|
b363c5b…
|
noreply
|
2 |
|
|
b363c5b…
|
noreply
|
3 |
import json
import logging
from collections import deque
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Optional

from video_processor.integrators.graph_store import (
    GraphStore,
    InMemoryStore,
    create_store,
)
|
b363c5b…
|
noreply
|
14 |
|
|
b363c5b…
|
noreply
|
15 |
logger = logging.getLogger(__name__) |
|
b363c5b…
|
noreply
|
16 |
|
|
b363c5b…
|
noreply
|
17 |
|
|
b363c5b…
|
noreply
|
18 |
@dataclass
class QueryResult:
    """Uniform wrapper for query results.

    Attributes:
        data: Result payload — a dict (stats / single entity), a list of
            entity/relationship dicts, or any other value.
        query_type: One of "cypher", "filter", "agentic" (also "sql").
        raw_query: The literal query or call that produced this result.
        explanation: Human-readable summary of what the query found.
    """

    data: Any
    query_type: str  # "cypher", "filter", "agentic"
    raw_query: str = ""
    explanation: str = ""

    def to_text(self) -> str:
        """Human-readable text output."""
        lines = []
        if self.explanation:
            lines.append(self.explanation)
            lines.append("")

        if isinstance(self.data, dict):
            # Stats or single entity: render key/value pairs, one level deep.
            for key, value in self.data.items():
                if isinstance(value, dict):
                    lines.append(f"{key}:")
                    for k, v in value.items():
                        lines.append(f"  {k}: {v}")
                else:
                    lines.append(f"{key}: {value}")
        elif isinstance(self.data, list):
            if not self.data:
                lines.append("No results found.")
            for item in self.data:
                if isinstance(item, dict):
                    if "source" in item and "target" in item:
                        # Relationship record.
                        rtype = item.get("type", "related_to")
                        lines.append(f"  {item['source']} --[{rtype}]--> {item['target']}")
                    elif item.get("name") and item.get("type"):
                        # Entity record; show at most three descriptions.
                        descs = item.get("descriptions", [])
                        if isinstance(descs, set):
                            # Sets don't support slicing; normalize first.
                            descs = list(descs)
                        desc_str = "; ".join(descs[:3]) if descs else ""
                        line = f"  [{item['type']}] {item['name']}"
                        if desc_str:
                            line += f" — {desc_str}"
                        lines.append(line)
                    else:
                        lines.append(f"  {item}")
                else:
                    lines.append(f"  {item}")
        else:
            lines.append(str(self.data))

        return "\n".join(lines)

    def to_json(self) -> str:
        """JSON string output."""
        payload = {
            "query_type": self.query_type,
            "raw_query": self.raw_query,
            "explanation": self.explanation,
            "data": self.data,
        }
        # default=str handles non-JSON-native values (e.g. sets of
        # descriptions) by stringifying them.
        return json.dumps(payload, indent=2, default=str)

    def to_mermaid(self) -> str:
        """Mermaid diagram output from result data."""
        lines = ["graph LR"]
        seen_nodes = set()
        edges = []

        items = self.data if isinstance(self.data, list) else [self.data]

        for item in items:
            if not isinstance(item, dict):
                continue
            # Entity node
            if "name" in item and "type" in item:
                name = item["name"]
                if name not in seen_nodes:
                    safe_id = _mermaid_id(name)
                    safe_name = name.replace('"', "'")
                    # The type is used as a Mermaid class name (`:::type`),
                    # which cannot contain spaces or punctuation — sanitize it
                    # the same way as node ids.
                    ntype = _mermaid_id(item.get("type", "concept"))
                    lines.append(f'    {safe_id}["{safe_name}"]:::{ntype}')
                    seen_nodes.add(name)
            # Relationship edge
            if "source" in item and "target" in item:
                src = item["source"]
                tgt = item["target"]
                rtype = item.get("type", "related_to")
                for n in (src, tgt):
                    if n not in seen_nodes:
                        safe_id = _mermaid_id(n)
                        safe_label = n.replace('"', "'")
                        lines.append(f'    {safe_id}["{safe_label}"]')
                        seen_nodes.add(n)
                edges.append((src, tgt, rtype))

        for src, tgt, rtype in edges:
            lines.append(f'    {_mermaid_id(src)} -- "{rtype}" --> {_mermaid_id(tgt)}')

        lines.append("    classDef person fill:#f9d5e5,stroke:#333")
        lines.append("    classDef concept fill:#eeeeee,stroke:#333")
        lines.append("    classDef technology fill:#d5e5f9,stroke:#333")
        lines.append("    classDef organization fill:#f9e5d5,stroke:#333")

        return "\n".join(lines)
|
b363c5b…
|
noreply
|
120 |
|
|
b363c5b…
|
noreply
|
121 |
|
|
b363c5b…
|
noreply
|
122 |
def _mermaid_id(name: str) -> str: |
|
b363c5b…
|
noreply
|
123 |
return "".join(c if c.isalnum() or c == "_" else "_" for c in name) |
|
b363c5b…
|
noreply
|
124 |
|
|
b363c5b…
|
noreply
|
125 |
|
|
b363c5b…
|
noreply
|
126 |
class GraphQueryEngine:
    """Query engine with direct (no-LLM) and agentic (LLM) modes.

    Direct-mode methods run purely against the store; :meth:`ask` uses an
    LLM to pick one direct action and synthesize a natural-language answer.
    """

    def __init__(self, store: GraphStore, provider_manager=None):
        """Create an engine over *store*.

        Args:
            store: Backing graph store implementation.
            provider_manager: Optional LLM provider manager; required only
                for agentic mode (:meth:`ask`).
        """
        self.store = store
        self.pm = provider_manager

    @classmethod
    def from_db_path(cls, path: Path, provider_manager=None) -> "GraphQueryEngine":
        """Open a .db file and create a query engine."""
        store = create_store(path)
        return cls(store, provider_manager)

    @classmethod
    def from_json_path(cls, path: Path, provider_manager=None) -> "GraphQueryEngine":
        """Load a .json knowledge graph file and create a query engine."""
        data = json.loads(Path(path).read_text())
        store = InMemoryStore()
        for node in data.get("nodes", []):
            store.merge_entity(
                node.get("name", ""),
                node.get("type", "concept"),
                node.get("descriptions", []),
            )
            for occ in node.get("occurrences", []):
                store.add_occurrence(
                    node.get("name", ""),
                    occ.get("source", ""),
                    occ.get("timestamp"),
                    occ.get("text"),
                )
        for rel in data.get("relationships", []):
            store.add_relationship(
                rel.get("source", ""),
                rel.get("target", ""),
                rel.get("type", "related_to"),
                content_source=rel.get("content_source"),
                timestamp=rel.get("timestamp"),
            )
        return cls(store, provider_manager)

    # ── Direct mode methods (no LLM required) ──

    def entities(
        self,
        name: Optional[str] = None,
        entity_type: Optional[str] = None,
        limit: int = 50,
    ) -> QueryResult:
        """Filter entities by name substring and/or type.

        Args:
            name: Case-insensitive substring to match against entity names.
            entity_type: Case-insensitive exact type match.
            limit: Maximum number of results returned.
        """
        all_entities = self.store.get_all_entities()
        results = []
        for e in all_entities:
            if name and name.lower() not in e.get("name", "").lower():
                continue
            if entity_type and entity_type.lower() != e.get("type", "").lower():
                continue
            results.append(e)
            if len(results) >= limit:
                break

        raw = f"entities(name={name!r}, entity_type={entity_type!r}, limit={limit})"
        return QueryResult(
            data=results,
            query_type="filter",
            raw_query=raw,
            explanation=f"Found {len(results)} entities",
        )

    def relationships(
        self,
        source: Optional[str] = None,
        target: Optional[str] = None,
        rel_type: Optional[str] = None,
        limit: int = 50,
    ) -> QueryResult:
        """Filter relationships by source, target, and/or type.

        All three filters are case-insensitive substring matches (NOTE:
        unlike ``entities``, ``rel_type`` is also a substring match, not an
        exact match).
        """
        all_rels = self.store.get_all_relationships()
        results = []
        for r in all_rels:
            if source and source.lower() not in r.get("source", "").lower():
                continue
            if target and target.lower() not in r.get("target", "").lower():
                continue
            if rel_type and rel_type.lower() not in r.get("type", "").lower():
                continue
            results.append(r)
            if len(results) >= limit:
                break

        raw = f"relationships(source={source!r}, target={target!r}, rel_type={rel_type!r})"
        return QueryResult(
            data=results,
            query_type="filter",
            raw_query=raw,
            explanation=f"Found {len(results)} relationships",
        )

    def neighbors(self, entity_name: str, depth: int = 1) -> QueryResult:
        """Get an entity and its connected nodes (up to *depth* hops)."""
        entity = self.store.get_entity(entity_name)
        if not entity:
            return QueryResult(
                data=[],
                query_type="filter",
                raw_query=f"neighbors({entity_name!r}, depth={depth})",
                explanation=f"Entity '{entity_name}' not found",
            )

        visited = {entity_name.lower()}
        result_entities = [entity]
        result_rels = []
        frontier = {entity_name.lower()}

        all_rels = self.store.get_all_relationships()

        # Expand one hop per iteration; frontier holds the nodes discovered
        # in the previous hop (lowercase names).
        for _ in range(depth):
            next_frontier = set()
            for rel in all_rels:
                src_lower = rel["source"].lower()
                tgt_lower = rel["target"].lower()
                if src_lower in frontier or tgt_lower in frontier:
                    result_rels.append(rel)
                    for n in (src_lower, tgt_lower):
                        if n not in visited:
                            visited.add(n)
                            next_frontier.add(n)
                            # NOTE(review): looks up by lowercased name —
                            # assumes store lookups are case-insensitive;
                            # confirm against the GraphStore implementation.
                            e = self.store.get_entity(n)
                            if e:
                                result_entities.append(e)
            frontier = next_frontier

        # Combine entities + relationships into output
        combined = result_entities + result_rels
        return QueryResult(
            data=combined,
            query_type="filter",
            raw_query=f"neighbors({entity_name!r}, depth={depth})",
            explanation=(
                f"Found {len(result_entities)} entities and {len(result_rels)} relationships"
            ),
        )

    def stats(self) -> QueryResult:
        """Return entity count, relationship count, type breakdown."""
        all_entities = self.store.get_all_entities()
        type_breakdown = {}
        for e in all_entities:
            t = e.get("type", "concept")
            type_breakdown[t] = type_breakdown.get(t, 0) + 1

        data = {
            "entity_count": self.store.get_entity_count(),
            "relationship_count": self.store.get_relationship_count(),
            "entity_types": type_breakdown,
        }
        return QueryResult(
            data=data,
            query_type="filter",
            raw_query="stats()",
            explanation="Knowledge graph statistics",
        )

    def sources(self) -> QueryResult:
        """Return all registered content sources."""
        all_sources = self.store.get_sources()
        return QueryResult(
            data=all_sources,
            query_type="filter",
            raw_query="sources()",
            explanation=f"Found {len(all_sources)} registered sources",
        )

    def provenance(self, entity_name: str) -> QueryResult:
        """Return source locations for a given entity."""
        locations = self.store.get_entity_provenance(entity_name)
        if not locations:
            return QueryResult(
                data=[],
                query_type="filter",
                raw_query=f"provenance({entity_name!r})",
                explanation=f"No provenance records found for '{entity_name}'",
            )
        return QueryResult(
            data=locations,
            query_type="filter",
            raw_query=f"provenance({entity_name!r})",
            explanation=f"Found {len(locations)} provenance records for '{entity_name}'",
        )

    def shortest_path(self, start: str, end: str, max_depth: int = 6) -> QueryResult:
        """Find the shortest path between two entities via BFS.

        Returns a QueryResult whose data interleaves the entities and
        relationships along the path, or is empty when an endpoint is
        missing or no path exists within *max_depth* hops.
        """
        start_entity = self.store.get_entity(start)
        end_entity = self.store.get_entity(end)
        if not start_entity:
            return QueryResult(
                data=[],
                query_type="filter",
                raw_query=f"shortest_path({start!r}, {end!r})",
                explanation=f"Entity '{start}' not found",
            )
        if not end_entity:
            return QueryResult(
                data=[],
                query_type="filter",
                raw_query=f"shortest_path({start!r}, {end!r})",
                explanation=f"Entity '{end}' not found",
            )

        all_rels = self.store.get_all_relationships()
        # Build an undirected adjacency list keyed by lowercase name.
        adj: dict[str, list[tuple[str, dict]]] = {}
        for rel in all_rels:
            src_l = rel["source"].lower()
            tgt_l = rel["target"].lower()
            adj.setdefault(src_l, []).append((tgt_l, rel))
            adj.setdefault(tgt_l, []).append((src_l, rel))

        # BFS
        start_l = start.lower()
        end_l = end.lower()
        if start_l == end_l:
            return QueryResult(
                data=[start_entity],
                query_type="filter",
                raw_query=f"shortest_path({start!r}, {end!r})",
                explanation="Start and end are the same entity",
            )

        queue: deque[tuple[str, list[dict]]] = deque([(start_l, [])])
        visited = {start_l}

        while queue:
            current, path = queue.popleft()
            if len(path) >= max_depth:
                continue
            for neighbor, rel in adj.get(current, []):
                if neighbor in visited:
                    continue
                new_path = path + [rel]
                if neighbor == end_l:
                    # Build result: entities + relationships along the path.
                    # BUGFIX: walk hop by hop from the start so each
                    # relationship is paired with the entity it leads to.
                    # The old code compared every hop against the BFS
                    # `current` node, which fetched the wrong intermediate
                    # entities on multi-hop paths.
                    path_entities = [start_entity]
                    walk = start_l
                    for r in new_path:
                        path_entities.append(r)
                        next_name = r["target"] if r["source"].lower() == walk else r["source"]
                        walk = next_name.lower()
                        e = self.store.get_entity(next_name)
                        if e:
                            path_entities.append(e)
                    path_entities.append(end_entity)
                    # Deduplicate while preserving order.
                    seen = set()
                    deduped = []
                    for item in path_entities:
                        key = str(item)
                        if key not in seen:
                            seen.add(key)
                            deduped.append(item)
                    return QueryResult(
                        data=deduped,
                        query_type="filter",
                        raw_query=f"shortest_path({start!r}, {end!r})",
                        explanation=f"Path found: {len(new_path)} hops",
                    )
                visited.add(neighbor)
                queue.append((neighbor, new_path))

        return QueryResult(
            data=[],
            query_type="filter",
            raw_query=f"shortest_path({start!r}, {end!r})",
            explanation=f"No path found between '{start}' and '{end}' within {max_depth} hops",
        )

    def clusters(self) -> QueryResult:
        """Find connected components (clusters) in the graph."""
        all_entities = self.store.get_all_entities()
        all_rels = self.store.get_all_relationships()

        # Build adjacency (undirected, lowercase names); isolated entities
        # get an empty neighbor set so they form singleton clusters.
        adj: dict[str, set[str]] = {}
        for e in all_entities:
            adj.setdefault(e["name"].lower(), set())
        for r in all_rels:
            adj.setdefault(r["source"].lower(), set()).add(r["target"].lower())
            adj.setdefault(r["target"].lower(), set()).add(r["source"].lower())

        visited: set[str] = set()
        components: list[list[str]] = []

        # Iterative DFS per unvisited node.
        for node in adj:
            if node in visited:
                continue
            component: list[str] = []
            stack = [node]
            while stack:
                n = stack.pop()
                if n in visited:
                    continue
                visited.add(n)
                component.append(n)
                stack.extend(adj.get(n, set()) - visited)
            components.append(sorted(component))

        # Sort by size descending
        components.sort(key=len, reverse=True)

        result = [{"cluster_id": i, "size": len(c), "members": c} for i, c in enumerate(components)]

        return QueryResult(
            data=result,
            query_type="filter",
            raw_query="clusters()",
            explanation=f"Found {len(components)} clusters",
        )

    def sql(self, query: str) -> QueryResult:
        """Execute a raw SQL query (SQLite only)."""
        result = self.store.raw_query(query)
        return QueryResult(
            data=result,
            query_type="sql",
            raw_query=query,
            explanation=(
                f"SQL query returned {len(result) if isinstance(result, list) else 1} rows"
            ),
        )

    # ── Agentic mode (requires LLM) ──

    def ask(self, question: str) -> QueryResult:
        """Answer a natural language question using LLM-guided query planning.

        The LLM picks from known direct-mode actions (never generates arbitrary code),
        the engine executes them, then the LLM synthesizes a natural language answer.
        """
        if not self.pm:
            return QueryResult(
                data=None,
                query_type="agentic",
                raw_query=question,
                explanation="Agentic mode requires a configured LLM provider. "
                "Pass --provider/--chat-model or set an API key.",
            )

        # Step 1: Ask LLM to generate a query plan.
        # BUGFIX: the action templates below are plain (non-f) strings, so
        # the old doubled braces `{{...}}` reached the LLM literally;
        # single braces are the intended JSON examples.
        stats = self.stats().data
        plan_prompt = (
            "You are a knowledge graph query planner. Given a user question and graph stats, "
            "choose ONE action to answer it.\n\n"
            f"Graph stats: {json.dumps(stats)}\n\n"
            "Available actions (pick exactly one):\n"
            '- {"action": "entities", "name": "...", "entity_type": "..."}\n'
            '- {"action": "relationships", "source": "...", "target": "...", "rel_type": "..."}\n'
            '- {"action": "neighbors", "entity_name": "...", "depth": 1}\n'
            '- {"action": "shortest_path", "start": "...", "end": "..."}\n'
            '- {"action": "clusters"}\n'
            '- {"action": "stats"}\n\n'
            f"User question: {question}\n\n"
            "Return ONLY a JSON object with the action. Omit optional fields you don't need."
        )

        try:
            plan_raw = self.pm.chat(
                [{"role": "user", "content": plan_prompt}],
                max_tokens=256,
                temperature=0.1,
            )
        except Exception as e:
            return QueryResult(
                data=None,
                query_type="agentic",
                raw_query=question,
                explanation=f"LLM query planning failed: {e}",
            )

        # Parse the plan
        plan = _parse_json(plan_raw)
        if not plan or "action" not in plan:
            return QueryResult(
                data=None,
                query_type="agentic",
                raw_query=question,
                explanation="Could not parse LLM query plan from response.",
            )

        # Step 2: Execute the planned action
        action = plan["action"]
        try:
            if action == "entities":
                result = self.entities(
                    name=plan.get("name"),
                    entity_type=plan.get("entity_type"),
                )
            elif action == "relationships":
                result = self.relationships(
                    source=plan.get("source"),
                    target=plan.get("target"),
                    rel_type=plan.get("rel_type"),
                )
            elif action == "neighbors":
                result = self.neighbors(
                    entity_name=plan.get("entity_name", ""),
                    depth=plan.get("depth", 1),
                )
            elif action == "shortest_path":
                result = self.shortest_path(
                    start=plan.get("start", ""),
                    end=plan.get("end", ""),
                )
            elif action == "clusters":
                result = self.clusters()
            elif action == "stats":
                result = self.stats()
            else:
                return QueryResult(
                    data=None,
                    query_type="agentic",
                    raw_query=question,
                    explanation=f"Unknown action in plan: {action}",
                )
        except Exception as e:
            return QueryResult(
                data=None,
                query_type="agentic",
                raw_query=question,
                explanation=f"Action execution failed: {e}",
            )

        # Step 3: Synthesize a natural language answer
        synth_prompt = (
            "You are a helpful assistant answering questions about a knowledge graph.\n\n"
            f"User question: {question}\n\n"
            f"Query result:\n{result.to_text()}\n\n"
            "Provide a concise, natural language answer based on the data above."
        )

        try:
            answer = self.pm.chat(
                [{"role": "user", "content": synth_prompt}],
                max_tokens=1024,
                temperature=0.3,
            )
        except Exception as e:
            # Return the raw result if synthesis fails
            result.query_type = "agentic"
            result.explanation = f"LLM synthesis failed ({e}), showing raw results"
            return result

        return QueryResult(
            data=result.data,
            query_type="agentic",
            raw_query=question,
            explanation=answer.strip(),
        )
|
b363c5b…
|
noreply
|
583 |
|
|
b363c5b…
|
noreply
|
584 |
|
|
b363c5b…
|
noreply
|
585 |
def _parse_json(text: str) -> Optional[Dict]: |
|
b363c5b…
|
noreply
|
586 |
"""Try to extract a JSON object from LLM output.""" |
|
b363c5b…
|
noreply
|
587 |
text = text.strip() |
|
b363c5b…
|
noreply
|
588 |
# Try direct parse first |
|
b363c5b…
|
noreply
|
589 |
try: |
|
b363c5b…
|
noreply
|
590 |
return json.loads(text) |
|
b363c5b…
|
noreply
|
591 |
except json.JSONDecodeError: |
|
b363c5b…
|
noreply
|
592 |
pass |
|
b363c5b…
|
noreply
|
593 |
# Try to find JSON between braces |
|
b363c5b…
|
noreply
|
594 |
start = text.find("{") |
|
b363c5b…
|
noreply
|
595 |
end = text.rfind("}") |
|
b363c5b…
|
noreply
|
596 |
if start >= 0 and end > start: |
|
b363c5b…
|
noreply
|
597 |
try: |
|
b363c5b…
|
noreply
|
598 |
return json.loads(text[start : end + 1]) |
|
b363c5b…
|
noreply
|
599 |
except json.JSONDecodeError: |
|
b363c5b…
|
noreply
|
600 |
pass |
|
b363c5b…
|
noreply
|
601 |
return None |