|
ccf32cc…
|
leo
|
1 |
"""Content cross-referencing between transcript and diagram entities.""" |
|
ccf32cc…
|
leo
|
2 |
|
|
ccf32cc…
|
leo
|
3 |
import logging |
|
ccf32cc…
|
leo
|
4 |
from typing import List, Optional |
|
ccf32cc…
|
leo
|
5 |
|
|
ccf32cc…
|
leo
|
6 |
from video_processor.models import Entity, KeyPoint |
|
ccf32cc…
|
leo
|
7 |
from video_processor.providers.manager import ProviderManager |
|
ccf32cc…
|
leo
|
8 |
from video_processor.utils.json_parsing import parse_json_from_response |
|
ccf32cc…
|
leo
|
9 |
|
|
ccf32cc…
|
leo
|
10 |
logger = logging.getLogger(__name__) |
|
ccf32cc…
|
leo
|
11 |
|
|
ccf32cc…
|
leo
|
12 |
|
|
ccf32cc…
|
leo
|
13 |
class ContentAnalyzer:
    """Cross-references transcript and diagram entities for richer knowledge."""

    def __init__(self, provider_manager: Optional[ProviderManager] = None):
        # Provider manager is optional: without it, LLM-based fuzzy matching
        # is skipped and only exact name merging is performed.
        self.pm = provider_manager

    def cross_reference(
        self,
        transcript_entities: List[Entity],
        diagram_entities: List[Entity],
    ) -> List[Entity]:
        """
        Merge entities from transcripts and diagrams.

        Merges by exact (case-insensitive) name overlap first, then uses the
        LLM for fuzzy matching of remaining entities. Each merged entity gets
        source attribution: "transcript", "diagram", or "both".

        Returns a new list; input Entity objects are not mutated (list fields
        are copied into fresh Entity instances).
        """
        merged: dict[str, Entity] = {}

        # Index transcript entities by lowercased name. Copy list fields so
        # later merging never mutates the caller's objects.
        for e in transcript_entities:
            merged[e.name.lower()] = Entity(
                name=e.name,
                type=e.type,
                descriptions=list(e.descriptions),
                source="transcript",
                occurrences=list(e.occurrences),
            )

        # Fold diagram entities in: exact name hits merge into the existing
        # entry; everything else becomes its own "diagram"-sourced entry.
        for e in diagram_entities:
            key = e.name.lower()
            if key in merged:
                existing = merged[key]
                existing.source = "both"
                # set() de-duplicates descriptions (note: order is not preserved).
                existing.descriptions = list(set(existing.descriptions + e.descriptions))
                existing.occurrences.extend(e.occurrences)
            else:
                merged[key] = Entity(
                    name=e.name,
                    type=e.type,
                    descriptions=list(e.descriptions),
                    source="diagram",
                    occurrences=list(e.occurrences),
                )

        # LLM fuzzy matching for entities with no exact-name counterpart.
        if self.pm:
            # Build each lookup set once; the original rebuilt the set inside
            # the comprehension condition for every element (O(n*m)).
            d_keys = {d.name.lower() for d in diagram_entities}
            t_keys = {t.name.lower() for t in transcript_entities}
            unmatched_t = [e for e in transcript_entities if e.name.lower() not in d_keys]
            unmatched_d = [e for e in diagram_entities if e.name.lower() not in t_keys]

            if unmatched_t and unmatched_d:
                for t_name, d_name in self._fuzzy_match(unmatched_t, unmatched_d):
                    t_key = t_name.lower()
                    d_key = d_name.lower()
                    # Only merge names actually present in the index (the LLM
                    # may hallucinate names). The t_key != d_key guard fixes a
                    # data-loss bug: when both names map to the same key, the
                    # original popped the very entry it was merging into,
                    # silently deleting the entity from the result.
                    if t_key in merged and d_key in merged and t_key != d_key:
                        t_entity = merged[t_key]
                        d_entity = merged.pop(d_key)
                        t_entity.source = "both"
                        t_entity.descriptions = list(
                            set(t_entity.descriptions + d_entity.descriptions)
                        )
                        t_entity.occurrences.extend(d_entity.occurrences)

        return list(merged.values())

    def _fuzzy_match(
        self,
        transcript_entities: List[Entity],
        diagram_entities: List[Entity],
    ) -> List[tuple[str, str]]:
        """Use LLM to fuzzy-match entity names across sources.

        Returns (transcript_name, diagram_name) pairs. Returns an empty list
        on any failure (no provider, request error, unparseable output) —
        fuzzy matching is best-effort and never fatal.
        """
        if not self.pm:
            return []

        t_names = [e.name for e in transcript_entities]
        d_names = [e.name for e in diagram_entities]

        prompt = (
            "Match entities that refer to the same thing across these two lists.\n\n"
            f"Transcript entities: {t_names}\n"
            f"Diagram entities: {d_names}\n\n"
            "Return a JSON array of matched pairs:\n"
            '[{"transcript": "name from list 1", "diagram": "name from list 2"}]\n\n'
            "Only include confident matches. Return empty array if no matches.\n"
            "Return ONLY the JSON array."
        )

        try:
            raw = self.pm.chat([{"role": "user", "content": prompt}], temperature=0.2)
            parsed = parse_json_from_response(raw)
            if isinstance(parsed, list):
                # Keep only well-formed pair objects; ignore anything else the
                # model emitted.
                return [
                    (item["transcript"], item["diagram"])
                    for item in parsed
                    if isinstance(item, dict) and "transcript" in item and "diagram" in item
                ]
        except Exception as e:
            # Broad catch is deliberate (network/provider/parse errors);
            # lazy %-formatting avoids building the message unless emitted.
            logger.warning("Fuzzy matching failed: %s", e)

        return []

    def enrich_key_points(
        self,
        key_points: List[KeyPoint],
        diagrams: list,
        transcript_text: str,
    ) -> List[KeyPoint]:
        """
        Link key points to relevant diagrams by word/entity overlap.

        A diagram is related to a key point when they share at least two
        words (diagram element names, plus words longer than 3 characters
        from the diagram's text content, vs. the key point's text/details).

        Mutates matching KeyPoint objects in place (sets ``related_diagrams``
        to a list of diagram indices) and returns ``key_points``.

        ``transcript_text`` is accepted for interface compatibility but is
        currently unused.
        """
        if not diagrams:
            return key_points

        # Build a per-diagram word/entity index. Diagrams may arrive as dicts
        # or as objects; support both access styles.
        diagram_entities: dict[int, set[str]] = {}
        for i, d in enumerate(diagrams):
            elements = d.get("elements", []) if isinstance(d, dict) else getattr(d, "elements", [])
            text = (
                d.get("text_content", "") if isinstance(d, dict) else getattr(d, "text_content", "")
            )
            entities = {str(e).lower() for e in elements}
            if text:
                # len > 3 filters out short stopword-like tokens.
                entities.update(word.lower() for word in text.split() if len(word) > 3)
            diagram_entities[i] = entities

        # Match key points to diagrams: >= 2 shared words counts as related.
        for kp in key_points:
            kp_words = set(kp.point.lower().split())
            if kp.details:
                kp_words.update(kp.details.lower().split())

            related = [
                idx
                for idx, d_entities in diagram_entities.items()
                if len(kp_words & d_entities) >= 2
            ]
            if related:
                kp.related_diagrams = related

        return key_points