|
321f2f5…
|
leo
|
1 |
"""Knowledge graph integration for organizing extracted content.""" |
|
321f2f5…
|
leo
|
2 |
|
|
321f2f5…
|
leo
|
3 |
import logging |
|
321f2f5…
|
leo
|
4 |
from pathlib import Path |
|
321f2f5…
|
leo
|
5 |
from typing import Dict, List, Optional, Union |
|
321f2f5…
|
leo
|
6 |
|
|
287a3bb…
|
leo
|
7 |
from tqdm import tqdm |
|
287a3bb…
|
leo
|
8 |
|
|
0ad36b7…
|
noreply
|
9 |
from video_processor.integrators.graph_store import GraphStore, create_store |
|
0981a08…
|
noreply
|
10 |
from video_processor.models import Entity, KnowledgeGraphData, Relationship, SourceRecord |
|
321f2f5…
|
leo
|
11 |
from video_processor.providers.manager import ProviderManager |
|
321f2f5…
|
leo
|
12 |
from video_processor.utils.json_parsing import parse_json_from_response |
|
321f2f5…
|
leo
|
13 |
|
|
321f2f5…
|
leo
|
14 |
logger = logging.getLogger(__name__) |
|
321f2f5…
|
leo
|
15 |
|
|
321f2f5…
|
leo
|
16 |
|
|
321f2f5…
|
leo
|
17 |
class KnowledgeGraph: |
|
321f2f5…
|
leo
|
18 |
"""Integrates extracted content into a structured knowledge graph.""" |
|
321f2f5…
|
leo
|
19 |
|
|
321f2f5…
|
leo
|
20 |
    def __init__(
        self,
        provider_manager: Optional[ProviderManager] = None,
        db_path: Optional[Path] = None,
        store: Optional[GraphStore] = None,
    ):
        """Create a knowledge graph backed by a pluggable graph store.

        Args:
            provider_manager: LLM provider used for entity/relationship
                extraction. When None, extraction calls return empty results.
            db_path: Optional path handed to ``create_store`` when no explicit
                store is supplied.
            store: Pre-built GraphStore; takes precedence over ``db_path``.
        """
        self.pm = provider_manager
        # `store or ...` — any falsy store value falls back to a fresh store.
        self._store = store or create_store(db_path)
|
0981a08…
|
noreply
|
28 |
|
|
0981a08…
|
noreply
|
29 |
    def register_source(self, source: Dict) -> None:
        """Register a content source for provenance tracking.

        Pure delegation to the backing store; the expected dict shape is
        whatever ``GraphStore.register_source`` accepts (not validated here).
        """
        self._store.register_source(source)
|
0ad36b7…
|
noreply
|
32 |
|
|
0ad36b7…
|
noreply
|
33 |
@property |
|
0ad36b7…
|
noreply
|
34 |
def nodes(self) -> Dict[str, dict]: |
|
0ad36b7…
|
noreply
|
35 |
"""Backward-compatible read access to nodes as a dict keyed by entity name.""" |
|
0ad36b7…
|
noreply
|
36 |
result = {} |
|
0ad36b7…
|
noreply
|
37 |
for entity in self._store.get_all_entities(): |
|
0ad36b7…
|
noreply
|
38 |
name = entity["name"] |
|
0ad36b7…
|
noreply
|
39 |
descs = entity.get("descriptions", []) |
|
0ad36b7…
|
noreply
|
40 |
result[name] = { |
|
0ad36b7…
|
noreply
|
41 |
"id": entity.get("id", name), |
|
0ad36b7…
|
noreply
|
42 |
"name": name, |
|
0ad36b7…
|
noreply
|
43 |
"type": entity.get("type", "concept"), |
|
0ad36b7…
|
noreply
|
44 |
"descriptions": set(descs) if isinstance(descs, list) else descs, |
|
0ad36b7…
|
noreply
|
45 |
"occurrences": entity.get("occurrences", []), |
|
0ad36b7…
|
noreply
|
46 |
} |
|
0ad36b7…
|
noreply
|
47 |
return result |
|
0ad36b7…
|
noreply
|
48 |
|
|
0ad36b7…
|
noreply
|
49 |
    @property
    def relationships(self) -> List[dict]:
        """Backward-compatible read access to relationships.

        Returns whatever list the backing store produces; callers should not
        rely on mutating it to affect the store.
        """
        return self._store.get_all_relationships()
|
321f2f5…
|
leo
|
53 |
|
|
321f2f5…
|
leo
|
54 |
def _chat(self, prompt: str, temperature: float = 0.3) -> str: |
|
321f2f5…
|
leo
|
55 |
"""Send a chat message through ProviderManager (or return empty if none).""" |
|
321f2f5…
|
leo
|
56 |
if not self.pm: |
|
321f2f5…
|
leo
|
57 |
return "" |
|
321f2f5…
|
leo
|
58 |
return self.pm.chat( |
|
321f2f5…
|
leo
|
59 |
[{"role": "user", "content": prompt}], |
|
321f2f5…
|
leo
|
60 |
max_tokens=4096, |
|
321f2f5…
|
leo
|
61 |
temperature=temperature, |
|
321f2f5…
|
leo
|
62 |
) |
|
321f2f5…
|
leo
|
63 |
|
|
829e24a…
|
leo
|
64 |
def extract_entities_and_relationships( |
|
829e24a…
|
leo
|
65 |
self, text: str |
|
829e24a…
|
leo
|
66 |
) -> tuple[List[Entity], List[Relationship]]: |
|
287a3bb…
|
leo
|
67 |
"""Extract entities and relationships in a single LLM call.""" |
|
321f2f5…
|
leo
|
68 |
prompt = ( |
|
287a3bb…
|
leo
|
69 |
"Extract all notable entities and relationships from the following content.\n\n" |
|
321f2f5…
|
leo
|
70 |
f"CONTENT:\n{text}\n\n" |
|
287a3bb…
|
leo
|
71 |
"Return a JSON object with two keys:\n" |
|
829e24a…
|
leo
|
72 |
'- "entities": array of {"name": "...", ' |
|
829e24a…
|
leo
|
73 |
'"type": "person|concept|technology|organization|time", ' |
|
829e24a…
|
leo
|
74 |
'"description": "brief description"}\n' |
|
829e24a…
|
leo
|
75 |
'- "relationships": array of {"source": "entity name", ' |
|
829e24a…
|
leo
|
76 |
'"target": "entity name", ' |
|
829e24a…
|
leo
|
77 |
'"type": "relationship description"}\n\n' |
|
287a3bb…
|
leo
|
78 |
"Return ONLY the JSON object." |
|
321f2f5…
|
leo
|
79 |
) |
|
321f2f5…
|
leo
|
80 |
raw = self._chat(prompt) |
|
321f2f5…
|
leo
|
81 |
parsed = parse_json_from_response(raw) |
|
321f2f5…
|
leo
|
82 |
|
|
321f2f5…
|
leo
|
83 |
entities = [] |
|
287a3bb…
|
leo
|
84 |
rels = [] |
|
287a3bb…
|
leo
|
85 |
|
|
287a3bb…
|
leo
|
86 |
if isinstance(parsed, dict): |
|
287a3bb…
|
leo
|
87 |
for item in parsed.get("entities", []): |
|
287a3bb…
|
leo
|
88 |
if isinstance(item, dict) and "name" in item: |
|
829e24a…
|
leo
|
89 |
entities.append( |
|
829e24a…
|
leo
|
90 |
Entity( |
|
829e24a…
|
leo
|
91 |
name=item["name"], |
|
829e24a…
|
leo
|
92 |
type=item.get("type", "concept"), |
|
829e24a…
|
leo
|
93 |
descriptions=[item["description"]] if item.get("description") else [], |
|
829e24a…
|
leo
|
94 |
) |
|
829e24a…
|
leo
|
95 |
) |
|
829e24a…
|
leo
|
96 |
{e.name for e in entities} |
|
287a3bb…
|
leo
|
97 |
for item in parsed.get("relationships", []): |
|
287a3bb…
|
leo
|
98 |
if isinstance(item, dict) and "source" in item and "target" in item: |
|
829e24a…
|
leo
|
99 |
rels.append( |
|
829e24a…
|
leo
|
100 |
Relationship( |
|
829e24a…
|
leo
|
101 |
source=item["source"], |
|
829e24a…
|
leo
|
102 |
target=item["target"], |
|
829e24a…
|
leo
|
103 |
type=item.get("type", "related_to"), |
|
829e24a…
|
leo
|
104 |
) |
|
829e24a…
|
leo
|
105 |
) |
|
287a3bb…
|
leo
|
106 |
elif isinstance(parsed, list): |
|
287a3bb…
|
leo
|
107 |
# Fallback: if model returns a flat entity list |
|
321f2f5…
|
leo
|
108 |
for item in parsed: |
|
321f2f5…
|
leo
|
109 |
if isinstance(item, dict) and "name" in item: |
|
829e24a…
|
leo
|
110 |
entities.append( |
|
829e24a…
|
leo
|
111 |
Entity( |
|
829e24a…
|
leo
|
112 |
name=item["name"], |
|
829e24a…
|
leo
|
113 |
type=item.get("type", "concept"), |
|
829e24a…
|
leo
|
114 |
descriptions=[item["description"]] if item.get("description") else [], |
|
829e24a…
|
leo
|
115 |
) |
|
829e24a…
|
leo
|
116 |
) |
|
287a3bb…
|
leo
|
117 |
|
|
287a3bb…
|
leo
|
118 |
return entities, rels |
|
287a3bb…
|
leo
|
119 |
|
|
0981a08…
|
noreply
|
120 |
    def add_content(
        self,
        text: str,
        source: str,
        timestamp: Optional[float] = None,
        source_id: Optional[str] = None,
    ) -> None:
        """Add content to knowledge graph by extracting entities and relationships.

        Args:
            text: Raw content to mine via the LLM.
            source: Human-readable provenance label stored on entities/edges.
            timestamp: Optional media timestamp attached to occurrences/edges.
            source_id: Optional registered-source id; when given, a precise
                source-location record is stored per entity as well.
        """
        entities, relationships = self.extract_entities_and_relationships(text)

        # Short preview of the content, stored alongside each occurrence.
        snippet = text[:100] + "..." if len(text) > 100 else text

        for entity in entities:
            self._store.merge_entity(entity.name, entity.type, entity.descriptions, source=source)
            self._store.add_occurrence(entity.name, source, timestamp, snippet)
            if source_id:
                self._store.add_source_location(
                    source_id,
                    entity_name_lower=entity.name.lower(),
                    timestamp=timestamp,
                    text_snippet=snippet,
                )

        # Only keep edges whose endpoints both exist as entities; this drops
        # relationships the model invented between unextracted names.
        for rel in relationships:
            if self._store.has_entity(rel.source) and self._store.has_entity(rel.target):
                self._store.add_relationship(
                    rel.source,
                    rel.target,
                    rel.type,
                    content_source=source,
                    timestamp=timestamp,
                )
|
287a3bb…
|
leo
|
152 |
|
|
287a3bb…
|
leo
|
153 |
def process_transcript(self, transcript: Dict, batch_size: int = 10) -> None: |
|
287a3bb…
|
leo
|
154 |
"""Process transcript segments into knowledge graph, batching for efficiency.""" |
|
321f2f5…
|
leo
|
155 |
if "segments" not in transcript: |
|
321f2f5…
|
leo
|
156 |
logger.warning("Transcript missing segments") |
|
321f2f5…
|
leo
|
157 |
return |
|
321f2f5…
|
leo
|
158 |
|
|
287a3bb…
|
leo
|
159 |
segments = transcript["segments"] |
|
287a3bb…
|
leo
|
160 |
|
|
287a3bb…
|
leo
|
161 |
# Register speakers first |
|
287a3bb…
|
leo
|
162 |
for i, segment in enumerate(segments): |
|
287a3bb…
|
leo
|
163 |
speaker = segment.get("speaker", None) |
|
0ad36b7…
|
noreply
|
164 |
if speaker and not self._store.has_entity(speaker): |
|
0ad36b7…
|
noreply
|
165 |
self._store.merge_entity(speaker, "person", ["Speaker in transcript"]) |
|
287a3bb…
|
leo
|
166 |
|
|
287a3bb…
|
leo
|
167 |
# Batch segments together for fewer API calls |
|
287a3bb…
|
leo
|
168 |
batches = [] |
|
287a3bb…
|
leo
|
169 |
for start in range(0, len(segments), batch_size): |
|
829e24a…
|
leo
|
170 |
batches.append(segments[start : start + batch_size]) |
|
287a3bb…
|
leo
|
171 |
|
|
287a3bb…
|
leo
|
172 |
for batch in tqdm(batches, desc="Building knowledge graph", unit="batch"): |
|
287a3bb…
|
leo
|
173 |
# Combine batch text |
|
829e24a…
|
leo
|
174 |
combined_text = " ".join(seg["text"] for seg in batch if "text" in seg) |
|
287a3bb…
|
leo
|
175 |
if not combined_text.strip(): |
|
287a3bb…
|
leo
|
176 |
continue |
|
287a3bb…
|
leo
|
177 |
|
|
287a3bb…
|
leo
|
178 |
# Use first segment's timestamp as batch timestamp |
|
287a3bb…
|
leo
|
179 |
batch_start_idx = segments.index(batch[0]) |
|
287a3bb…
|
leo
|
180 |
timestamp = batch[0].get("start", None) |
|
287a3bb…
|
leo
|
181 |
source = f"transcript_batch_{batch_start_idx}" |
|
287a3bb…
|
leo
|
182 |
|
|
287a3bb…
|
leo
|
183 |
self.add_content(combined_text, source, timestamp) |
|
321f2f5…
|
leo
|
184 |
|
|
321f2f5…
|
leo
|
185 |
def process_diagrams(self, diagrams: List[Dict]) -> None: |
|
321f2f5…
|
leo
|
186 |
"""Process diagram results into knowledge graph.""" |
|
287a3bb…
|
leo
|
187 |
for i, diagram in enumerate(tqdm(diagrams, desc="Processing diagrams for KG", unit="diag")): |
|
321f2f5…
|
leo
|
188 |
text_content = diagram.get("text_content", "") |
|
0ad36b7…
|
noreply
|
189 |
source = f"diagram_{i}" |
|
321f2f5…
|
leo
|
190 |
if text_content: |
|
321f2f5…
|
leo
|
191 |
self.add_content(text_content, source) |
|
321f2f5…
|
leo
|
192 |
|
|
321f2f5…
|
leo
|
193 |
diagram_id = f"diagram_{i}" |
|
0ad36b7…
|
noreply
|
194 |
if not self._store.has_entity(diagram_id): |
|
0ad36b7…
|
noreply
|
195 |
self._store.merge_entity(diagram_id, "diagram", ["Visual diagram from video"]) |
|
0ad36b7…
|
noreply
|
196 |
self._store.add_occurrence( |
|
0ad36b7…
|
noreply
|
197 |
diagram_id, |
|
0ad36b7…
|
noreply
|
198 |
source if text_content else diagram_id, |
|
0ad36b7…
|
noreply
|
199 |
text=f"frame_index={diagram.get('frame_index')}", |
|
2a1b11a…
|
noreply
|
200 |
) |
|
2a1b11a…
|
noreply
|
201 |
|
|
2a1b11a…
|
noreply
|
202 |
def process_screenshots(self, screenshots: List[Dict]) -> None: |
|
2a1b11a…
|
noreply
|
203 |
"""Process screenshot captures into knowledge graph. |
|
2a1b11a…
|
noreply
|
204 |
|
|
2a1b11a…
|
noreply
|
205 |
Extracts entities from text_content and adds screenshot-specific |
|
2a1b11a…
|
noreply
|
206 |
entities from the entities list. |
|
2a1b11a…
|
noreply
|
207 |
""" |
|
2a1b11a…
|
noreply
|
208 |
for i, capture in enumerate(screenshots): |
|
2a1b11a…
|
noreply
|
209 |
text_content = capture.get("text_content", "") |
|
2a1b11a…
|
noreply
|
210 |
source = f"screenshot_{i}" |
|
2a1b11a…
|
noreply
|
211 |
content_type = capture.get("content_type", "screenshot") |
|
2a1b11a…
|
noreply
|
212 |
|
|
2a1b11a…
|
noreply
|
213 |
# Extract entities from visible text via LLM |
|
2a1b11a…
|
noreply
|
214 |
if text_content: |
|
2a1b11a…
|
noreply
|
215 |
self.add_content(text_content, source) |
|
2a1b11a…
|
noreply
|
216 |
|
|
2a1b11a…
|
noreply
|
217 |
# Add explicitly identified entities from vision extraction |
|
2a1b11a…
|
noreply
|
218 |
for entity_name in capture.get("entities", []): |
|
2a1b11a…
|
noreply
|
219 |
if not entity_name or len(entity_name) < 2: |
|
2a1b11a…
|
noreply
|
220 |
continue |
|
2a1b11a…
|
noreply
|
221 |
if not self._store.has_entity(entity_name): |
|
2a1b11a…
|
noreply
|
222 |
self._store.merge_entity( |
|
2a1b11a…
|
noreply
|
223 |
entity_name, |
|
2a1b11a…
|
noreply
|
224 |
"concept", |
|
2a1b11a…
|
noreply
|
225 |
[f"Identified in {content_type} screenshot"], |
|
2a1b11a…
|
noreply
|
226 |
source=source, |
|
2a1b11a…
|
noreply
|
227 |
) |
|
2a1b11a…
|
noreply
|
228 |
self._store.add_occurrence( |
|
2a1b11a…
|
noreply
|
229 |
entity_name, |
|
2a1b11a…
|
noreply
|
230 |
source, |
|
2a1b11a…
|
noreply
|
231 |
text=f"Visible in {content_type} (frame {capture.get('frame_index', '?')})", |
|
0981a08…
|
noreply
|
232 |
) |
|
0981a08…
|
noreply
|
233 |
|
|
321f2f5…
|
leo
|
234 |
def to_data(self) -> KnowledgeGraphData: |
|
321f2f5…
|
leo
|
235 |
"""Convert to pydantic KnowledgeGraphData model.""" |
|
321f2f5…
|
leo
|
236 |
nodes = [] |
|
0ad36b7…
|
noreply
|
237 |
for entity in self._store.get_all_entities(): |
|
0ad36b7…
|
noreply
|
238 |
descs = entity.get("descriptions", []) |
|
321f2f5…
|
leo
|
239 |
if isinstance(descs, set): |
|
321f2f5…
|
leo
|
240 |
descs = list(descs) |
|
829e24a…
|
leo
|
241 |
nodes.append( |
|
829e24a…
|
leo
|
242 |
Entity( |
|
0ad36b7…
|
noreply
|
243 |
name=entity["name"], |
|
0ad36b7…
|
noreply
|
244 |
type=entity.get("type", "concept"), |
|
829e24a…
|
leo
|
245 |
descriptions=descs, |
|
0ad36b7…
|
noreply
|
246 |
occurrences=entity.get("occurrences", []), |
|
829e24a…
|
leo
|
247 |
) |
|
829e24a…
|
leo
|
248 |
) |
|
321f2f5…
|
leo
|
249 |
|
|
321f2f5…
|
leo
|
250 |
rels = [ |
|
321f2f5…
|
leo
|
251 |
Relationship( |
|
321f2f5…
|
leo
|
252 |
source=r["source"], |
|
321f2f5…
|
leo
|
253 |
target=r["target"], |
|
321f2f5…
|
leo
|
254 |
type=r.get("type", "related_to"), |
|
321f2f5…
|
leo
|
255 |
content_source=r.get("content_source"), |
|
321f2f5…
|
leo
|
256 |
timestamp=r.get("timestamp"), |
|
321f2f5…
|
leo
|
257 |
) |
|
0ad36b7…
|
noreply
|
258 |
for r in self._store.get_all_relationships() |
|
321f2f5…
|
leo
|
259 |
] |
|
0981a08…
|
noreply
|
260 |
|
|
0981a08…
|
noreply
|
261 |
sources = [SourceRecord(**s) for s in self._store.get_sources()] |
|
0981a08…
|
noreply
|
262 |
|
|
0981a08…
|
noreply
|
263 |
return KnowledgeGraphData(nodes=nodes, relationships=rels, sources=sources) |
|
321f2f5…
|
leo
|
264 |
|
|
321f2f5…
|
leo
|
265 |
    def to_dict(self) -> Dict:
        """Convert knowledge graph to dictionary (backward-compatible).

        Pure delegation; the dict layout is defined by the store's ``to_dict``.
        """
        return self._store.to_dict()
|
321f2f5…
|
leo
|
268 |
|
|
321f2f5…
|
leo
|
269 |
def save(self, output_path: Union[str, Path]) -> Path: |
|
0981a08…
|
noreply
|
270 |
"""Save knowledge graph. Defaults to .db (SQLite), also supports .json.""" |
|
321f2f5…
|
leo
|
271 |
output_path = Path(output_path) |
|
321f2f5…
|
leo
|
272 |
if not output_path.suffix: |
|
0981a08…
|
noreply
|
273 |
output_path = output_path.with_suffix(".db") |
|
321f2f5…
|
leo
|
274 |
output_path.parent.mkdir(parents=True, exist_ok=True) |
|
321f2f5…
|
leo
|
275 |
|
|
0981a08…
|
noreply
|
276 |
if output_path.suffix == ".json": |
|
0981a08…
|
noreply
|
277 |
data = self.to_data() |
|
0981a08…
|
noreply
|
278 |
output_path.write_text(data.model_dump_json(indent=2)) |
|
0981a08…
|
noreply
|
279 |
elif output_path.suffix == ".db": |
|
0981a08…
|
noreply
|
280 |
# If the backing store is already SQLite at this path, it's already persisted. |
|
0981a08…
|
noreply
|
281 |
# Otherwise, create a new SQLite store and copy data into it. |
|
0981a08…
|
noreply
|
282 |
from video_processor.integrators.graph_store import SQLiteStore |
|
0981a08…
|
noreply
|
283 |
|
|
0981a08…
|
noreply
|
284 |
if not isinstance(self._store, SQLiteStore) or self._store._db_path != str(output_path): |
|
0981a08…
|
noreply
|
285 |
target = SQLiteStore(output_path) |
|
0981a08…
|
noreply
|
286 |
for source in self._store.get_sources(): |
|
0981a08…
|
noreply
|
287 |
target.register_source(source) |
|
0981a08…
|
noreply
|
288 |
for entity in self._store.get_all_entities(): |
|
0981a08…
|
noreply
|
289 |
descs = entity.get("descriptions", []) |
|
0981a08…
|
noreply
|
290 |
if isinstance(descs, set): |
|
0981a08…
|
noreply
|
291 |
descs = list(descs) |
|
0981a08…
|
noreply
|
292 |
target.merge_entity( |
|
0981a08…
|
noreply
|
293 |
entity["name"], |
|
0981a08…
|
noreply
|
294 |
entity.get("type", "concept"), |
|
0981a08…
|
noreply
|
295 |
descs, |
|
0981a08…
|
noreply
|
296 |
source=entity.get("source"), |
|
0981a08…
|
noreply
|
297 |
) |
|
0981a08…
|
noreply
|
298 |
for occ in entity.get("occurrences", []): |
|
0981a08…
|
noreply
|
299 |
target.add_occurrence( |
|
0981a08…
|
noreply
|
300 |
entity["name"], |
|
0981a08…
|
noreply
|
301 |
occ.get("source", ""), |
|
0981a08…
|
noreply
|
302 |
occ.get("timestamp"), |
|
0981a08…
|
noreply
|
303 |
occ.get("text"), |
|
0981a08…
|
noreply
|
304 |
) |
|
0981a08…
|
noreply
|
305 |
for rel in self._store.get_all_relationships(): |
|
0981a08…
|
noreply
|
306 |
target.add_relationship( |
|
0981a08…
|
noreply
|
307 |
rel.get("source", ""), |
|
0981a08…
|
noreply
|
308 |
rel.get("target", ""), |
|
0981a08…
|
noreply
|
309 |
rel.get("type", "related_to"), |
|
0981a08…
|
noreply
|
310 |
content_source=rel.get("content_source"), |
|
0981a08…
|
noreply
|
311 |
timestamp=rel.get("timestamp"), |
|
0981a08…
|
noreply
|
312 |
) |
|
0981a08…
|
noreply
|
313 |
target.close() |
|
0981a08…
|
noreply
|
314 |
else: |
|
0981a08…
|
noreply
|
315 |
# Unknown suffix — fall back to JSON |
|
0981a08…
|
noreply
|
316 |
data = self.to_data() |
|
0981a08…
|
noreply
|
317 |
output_path.write_text(data.model_dump_json(indent=2)) |
|
0981a08…
|
noreply
|
318 |
|
|
321f2f5…
|
leo
|
319 |
logger.info( |
|
0ad36b7…
|
noreply
|
320 |
f"Saved knowledge graph with {self._store.get_entity_count()} nodes " |
|
0ad36b7…
|
noreply
|
321 |
f"and {self._store.get_relationship_count()} relationships to {output_path}" |
|
321f2f5…
|
leo
|
322 |
) |
|
321f2f5…
|
leo
|
323 |
return output_path |
|
321f2f5…
|
leo
|
324 |
|
|
321f2f5…
|
leo
|
325 |
@classmethod |
|
0ad36b7…
|
noreply
|
326 |
def from_dict(cls, data: Dict, db_path: Optional[Path] = None) -> "KnowledgeGraph": |
|
321f2f5…
|
leo
|
327 |
"""Reconstruct a KnowledgeGraph from saved JSON dict.""" |
|
0ad36b7…
|
noreply
|
328 |
kg = cls(db_path=db_path) |
|
0981a08…
|
noreply
|
329 |
for source in data.get("sources", []): |
|
0981a08…
|
noreply
|
330 |
kg._store.register_source(source) |
|
321f2f5…
|
leo
|
331 |
for node in data.get("nodes", []): |
|
0ad36b7…
|
noreply
|
332 |
name = node.get("name", node.get("id", "")) |
|
321f2f5…
|
leo
|
333 |
descs = node.get("descriptions", []) |
|
0ad36b7…
|
noreply
|
334 |
if isinstance(descs, set): |
|
0ad36b7…
|
noreply
|
335 |
descs = list(descs) |
|
0ad36b7…
|
noreply
|
336 |
kg._store.merge_entity( |
|
0ad36b7…
|
noreply
|
337 |
name, node.get("type", "concept"), descs, source=node.get("source") |
|
0ad36b7…
|
noreply
|
338 |
) |
|
0ad36b7…
|
noreply
|
339 |
for occ in node.get("occurrences", []): |
|
0ad36b7…
|
noreply
|
340 |
kg._store.add_occurrence( |
|
0ad36b7…
|
noreply
|
341 |
name, |
|
0ad36b7…
|
noreply
|
342 |
occ.get("source", ""), |
|
0ad36b7…
|
noreply
|
343 |
occ.get("timestamp"), |
|
0ad36b7…
|
noreply
|
344 |
occ.get("text"), |
|
0ad36b7…
|
noreply
|
345 |
) |
|
0ad36b7…
|
noreply
|
346 |
for rel in data.get("relationships", []): |
|
0ad36b7…
|
noreply
|
347 |
kg._store.add_relationship( |
|
0ad36b7…
|
noreply
|
348 |
rel.get("source", ""), |
|
0ad36b7…
|
noreply
|
349 |
rel.get("target", ""), |
|
0ad36b7…
|
noreply
|
350 |
rel.get("type", "related_to"), |
|
0ad36b7…
|
noreply
|
351 |
content_source=rel.get("content_source"), |
|
0ad36b7…
|
noreply
|
352 |
timestamp=rel.get("timestamp"), |
|
0ad36b7…
|
noreply
|
353 |
) |
|
321f2f5…
|
leo
|
354 |
return kg |
|
321f2f5…
|
leo
|
355 |
|
|
0981a08…
|
noreply
|
356 |
    # Type specificity ranking for conflict resolution during merge.
    # Higher rank = more specific type wins when two entities match.
    # Unlisted types default to rank 1 (see _more_specific_type).
    _TYPE_SPECIFICITY = {
        "concept": 0,
        "time": 1,
        "diagram": 1,
        "organization": 2,
        "person": 3,
        "technology": 3,
    }
|
0981a08…
|
noreply
|
366 |
|
|
0981a08…
|
noreply
|
367 |
@staticmethod |
|
0981a08…
|
noreply
|
368 |
def _fuzzy_match(name_a: str, name_b: str, threshold: float = 0.85) -> bool: |
|
0981a08…
|
noreply
|
369 |
"""Return True if two names are similar enough to be considered the same entity.""" |
|
0981a08…
|
noreply
|
370 |
from difflib import SequenceMatcher |
|
0981a08…
|
noreply
|
371 |
|
|
0981a08…
|
noreply
|
372 |
return SequenceMatcher(None, name_a.lower(), name_b.lower()).ratio() >= threshold |
|
0981a08…
|
noreply
|
373 |
|
|
0981a08…
|
noreply
|
374 |
def _more_specific_type(self, type_a: str, type_b: str) -> str: |
|
0981a08…
|
noreply
|
375 |
"""Return the more specific of two entity types.""" |
|
0981a08…
|
noreply
|
376 |
rank_a = self._TYPE_SPECIFICITY.get(type_a, 1) |
|
0981a08…
|
noreply
|
377 |
rank_b = self._TYPE_SPECIFICITY.get(type_b, 1) |
|
0981a08…
|
noreply
|
378 |
return type_a if rank_a >= rank_b else type_b |
|
0981a08…
|
noreply
|
379 |
|
|
321f2f5…
|
leo
|
380 |
def merge(self, other: "KnowledgeGraph") -> None: |
|
0981a08…
|
noreply
|
381 |
"""Merge another KnowledgeGraph into this one. |
|
0981a08…
|
noreply
|
382 |
|
|
0981a08…
|
noreply
|
383 |
Improvements over naive merge: |
|
0981a08…
|
noreply
|
384 |
- Fuzzy name matching (SequenceMatcher >= 0.85) to unify near-duplicate entities |
|
0981a08…
|
noreply
|
385 |
- Type conflict resolution: prefer more specific types (e.g. technology > concept) |
|
0981a08…
|
noreply
|
386 |
- Provenance: merged entities get a ``merged_from`` description entry |
|
0981a08…
|
noreply
|
387 |
""" |
|
0981a08…
|
noreply
|
388 |
for source in other._store.get_sources(): |
|
0981a08…
|
noreply
|
389 |
self._store.register_source(source) |
|
0981a08…
|
noreply
|
390 |
|
|
0981a08…
|
noreply
|
391 |
# Build a lookup of existing entity names for fuzzy matching |
|
0981a08…
|
noreply
|
392 |
existing_entities = self._store.get_all_entities() |
|
0981a08…
|
noreply
|
393 |
existing_names = {e["name"]: e for e in existing_entities} |
|
0981a08…
|
noreply
|
394 |
# Cache lowercase -> canonical name for fast lookup |
|
0981a08…
|
noreply
|
395 |
name_index: dict[str, str] = {n.lower(): n for n in existing_names} |
|
0981a08…
|
noreply
|
396 |
|
|
0ad36b7…
|
noreply
|
397 |
for entity in other._store.get_all_entities(): |
|
0981a08…
|
noreply
|
398 |
incoming_name = entity["name"] |
|
0ad36b7…
|
noreply
|
399 |
descs = entity.get("descriptions", []) |
|
0ad36b7…
|
noreply
|
400 |
if isinstance(descs, set): |
|
0ad36b7…
|
noreply
|
401 |
descs = list(descs) |
|
0981a08…
|
noreply
|
402 |
incoming_type = entity.get("type", "concept") |
|
0981a08…
|
noreply
|
403 |
|
|
0981a08…
|
noreply
|
404 |
# Try exact match first (case-insensitive), then fuzzy |
|
0981a08…
|
noreply
|
405 |
matched_name: Optional[str] = None |
|
0981a08…
|
noreply
|
406 |
if incoming_name.lower() in name_index: |
|
0981a08…
|
noreply
|
407 |
matched_name = name_index[incoming_name.lower()] |
|
0981a08…
|
noreply
|
408 |
else: |
|
0981a08…
|
noreply
|
409 |
for existing_name in existing_names: |
|
0981a08…
|
noreply
|
410 |
if self._fuzzy_match(incoming_name, existing_name): |
|
0981a08…
|
noreply
|
411 |
matched_name = existing_name |
|
0981a08…
|
noreply
|
412 |
break |
|
0981a08…
|
noreply
|
413 |
|
|
0981a08…
|
noreply
|
414 |
if matched_name is not None: |
|
0981a08…
|
noreply
|
415 |
# Resolve type conflict |
|
0981a08…
|
noreply
|
416 |
existing_type = existing_names[matched_name].get("type", "concept") |
|
0981a08…
|
noreply
|
417 |
resolved_type = self._more_specific_type(existing_type, incoming_type) |
|
0981a08…
|
noreply
|
418 |
|
|
0981a08…
|
noreply
|
419 |
# Add merge provenance |
|
0981a08…
|
noreply
|
420 |
merge_note = f"merged_from:{incoming_name}" |
|
0981a08…
|
noreply
|
421 |
merged_descs = descs if incoming_name == matched_name else descs + [merge_note] |
|
0981a08…
|
noreply
|
422 |
|
|
0981a08…
|
noreply
|
423 |
self._store.merge_entity( |
|
0981a08…
|
noreply
|
424 |
matched_name, resolved_type, merged_descs, source=entity.get("source") |
|
0981a08…
|
noreply
|
425 |
) |
|
0981a08…
|
noreply
|
426 |
target_name = matched_name |
|
0981a08…
|
noreply
|
427 |
else: |
|
0981a08…
|
noreply
|
428 |
self._store.merge_entity( |
|
0981a08…
|
noreply
|
429 |
incoming_name, incoming_type, descs, source=entity.get("source") |
|
0981a08…
|
noreply
|
430 |
) |
|
0981a08…
|
noreply
|
431 |
# Update indexes for subsequent fuzzy matches within this merge |
|
0981a08…
|
noreply
|
432 |
existing_names[incoming_name] = entity |
|
0981a08…
|
noreply
|
433 |
name_index[incoming_name.lower()] = incoming_name |
|
0981a08…
|
noreply
|
434 |
target_name = incoming_name |
|
0981a08…
|
noreply
|
435 |
|
|
0ad36b7…
|
noreply
|
436 |
for occ in entity.get("occurrences", []): |
|
0ad36b7…
|
noreply
|
437 |
self._store.add_occurrence( |
|
0981a08…
|
noreply
|
438 |
target_name, |
|
0ad36b7…
|
noreply
|
439 |
occ.get("source", ""), |
|
0ad36b7…
|
noreply
|
440 |
occ.get("timestamp"), |
|
0ad36b7…
|
noreply
|
441 |
occ.get("text"), |
|
0ad36b7…
|
noreply
|
442 |
) |
|
0ad36b7…
|
noreply
|
443 |
|
|
0ad36b7…
|
noreply
|
444 |
for rel in other._store.get_all_relationships(): |
|
0ad36b7…
|
noreply
|
445 |
self._store.add_relationship( |
|
0ad36b7…
|
noreply
|
446 |
rel.get("source", ""), |
|
0ad36b7…
|
noreply
|
447 |
rel.get("target", ""), |
|
0ad36b7…
|
noreply
|
448 |
rel.get("type", "related_to"), |
|
0ad36b7…
|
noreply
|
449 |
content_source=rel.get("content_source"), |
|
0ad36b7…
|
noreply
|
450 |
timestamp=rel.get("timestamp"), |
|
0ad36b7…
|
noreply
|
451 |
) |
|
0981a08…
|
noreply
|
452 |
|
|
0981a08…
|
noreply
|
453 |
def classify_for_planning(self): |
|
0981a08…
|
noreply
|
454 |
"""Classify entities in this knowledge graph into planning taxonomy types.""" |
|
0981a08…
|
noreply
|
455 |
from video_processor.integrators.taxonomy import TaxonomyClassifier |
|
0981a08…
|
noreply
|
456 |
|
|
0981a08…
|
noreply
|
457 |
classifier = TaxonomyClassifier(provider_manager=self.pm) |
|
0981a08…
|
noreply
|
458 |
entities = self._store.get_all_entities() |
|
0981a08…
|
noreply
|
459 |
relationships = self._store.get_all_relationships() |
|
0981a08…
|
noreply
|
460 |
return classifier.classify_entities(entities, relationships) |
|
321f2f5…
|
leo
|
461 |
|
|
321f2f5…
|
leo
|
462 |
def generate_mermaid(self, max_nodes: int = 30) -> str: |
|
321f2f5…
|
leo
|
463 |
"""Generate Mermaid visualization code.""" |
|
0ad36b7…
|
noreply
|
464 |
nodes = self.nodes |
|
0ad36b7…
|
noreply
|
465 |
rels = self.relationships |
|
0ad36b7…
|
noreply
|
466 |
|
|
321f2f5…
|
leo
|
467 |
node_importance = {} |
|
0ad36b7…
|
noreply
|
468 |
for node_id in nodes: |
|
0ad36b7…
|
noreply
|
469 |
count = sum(1 for rel in rels if rel["source"] == node_id or rel["target"] == node_id) |
|
321f2f5…
|
leo
|
470 |
node_importance[node_id] = count |
|
321f2f5…
|
leo
|
471 |
|
|
321f2f5…
|
leo
|
472 |
important = sorted(node_importance.items(), key=lambda x: x[1], reverse=True) |
|
321f2f5…
|
leo
|
473 |
important_ids = [n[0] for n in important[:max_nodes]] |
|
321f2f5…
|
leo
|
474 |
|
|
321f2f5…
|
leo
|
475 |
mermaid = ["graph LR"] |
|
321f2f5…
|
leo
|
476 |
|
|
321f2f5…
|
leo
|
477 |
for nid in important_ids: |
|
0ad36b7…
|
noreply
|
478 |
node = nodes[nid] |
|
321f2f5…
|
leo
|
479 |
ntype = node.get("type", "concept") |
|
321f2f5…
|
leo
|
480 |
# Sanitize id for mermaid (alphanumeric + underscore only) |
|
321f2f5…
|
leo
|
481 |
safe_id = "".join(c if c.isalnum() or c == "_" else "_" for c in nid) |
|
321f2f5…
|
leo
|
482 |
safe_name = node["name"].replace('"', "'") |
|
321f2f5…
|
leo
|
483 |
mermaid.append(f' {safe_id}["{safe_name}"]:::{ntype}') |
|
321f2f5…
|
leo
|
484 |
|
|
321f2f5…
|
leo
|
485 |
added = set() |
|
0ad36b7…
|
noreply
|
486 |
for rel in rels: |
|
321f2f5…
|
leo
|
487 |
src, tgt = rel["source"], rel["target"] |
|
321f2f5…
|
leo
|
488 |
if src in important_ids and tgt in important_ids: |
|
321f2f5…
|
leo
|
489 |
rtype = rel.get("type", "related_to") |
|
321f2f5…
|
leo
|
490 |
key = f"{src}|{tgt}|{rtype}" |
|
321f2f5…
|
leo
|
491 |
if key not in added: |
|
321f2f5…
|
leo
|
492 |
safe_src = "".join(c if c.isalnum() or c == "_" else "_" for c in src) |
|
321f2f5…
|
leo
|
493 |
safe_tgt = "".join(c if c.isalnum() or c == "_" else "_" for c in tgt) |
|
321f2f5…
|
leo
|
494 |
mermaid.append(f' {safe_src} -- "{rtype}" --> {safe_tgt}') |
|
321f2f5…
|
leo
|
495 |
added.add(key) |
|
321f2f5…
|
leo
|
496 |
|
|
321f2f5…
|
leo
|
497 |
mermaid.append(" classDef person fill:#f9d5e5,stroke:#333,stroke-width:1px") |
|
321f2f5…
|
leo
|
498 |
mermaid.append(" classDef concept fill:#eeeeee,stroke:#333,stroke-width:1px") |
|
321f2f5…
|
leo
|
499 |
mermaid.append(" classDef diagram fill:#d5f9e5,stroke:#333,stroke-width:1px") |
|
321f2f5…
|
leo
|
500 |
mermaid.append(" classDef time fill:#e5d5f9,stroke:#333,stroke-width:1px") |
|
321f2f5…
|
leo
|
501 |
|
|
321f2f5…
|
leo
|
502 |
return "\n".join(mermaid) |