"""Content cross-referencing between transcript and diagram entities."""

import logging
from typing import List, Optional

from video_processor.models import Entity, KeyPoint
from video_processor.providers.manager import ProviderManager
from video_processor.utils.json_parsing import parse_json_from_response

logger = logging.getLogger(__name__)
class ContentAnalyzer:
    """Cross-references transcript and diagram entities for richer knowledge."""

    def __init__(self, provider_manager: Optional[ProviderManager] = None):
        # Optional LLM access; without a provider manager only exact-name
        # merging is performed (no fuzzy matching).
        self.pm = provider_manager

    def cross_reference(
        self,
        transcript_entities: List[Entity],
        diagram_entities: List[Entity],
    ) -> List[Entity]:
        """
        Merge entities from transcripts and diagrams.

        Merges by exact (case-insensitive) name overlap first, then uses the
        LLM for fuzzy matching of remaining entities. Adds source attribution
        ("transcript", "diagram", or "both") to every returned entity.

        Args:
            transcript_entities: Entities extracted from the transcript.
            diagram_entities: Entities extracted from diagrams.

        Returns:
            A new list of merged Entity objects; inputs are not mutated.
        """
        merged: dict[str, Entity] = {}

        # Index transcript entities by lowercased name. Copies are made so the
        # caller's Entity objects are never mutated.
        for e in transcript_entities:
            merged[e.name.lower()] = Entity(
                name=e.name,
                type=e.type,
                descriptions=list(e.descriptions),
                source="transcript",
                occurrences=list(e.occurrences),
            )

        # Merge diagram entities: exact name hits become "both"; the rest are
        # added as diagram-only entries.
        for e in diagram_entities:
            key = e.name.lower()
            if key in merged:
                existing = merged[key]
                existing.source = "both"
                existing.descriptions = list(set(existing.descriptions + e.descriptions))
                existing.occurrences.extend(e.occurrences)
            else:
                merged[key] = Entity(
                    name=e.name,
                    type=e.type,
                    descriptions=list(e.descriptions),
                    source="diagram",
                    occurrences=list(e.occurrences),
                )

        # LLM fuzzy matching for entities with no exact-name counterpart.
        if self.pm:
            # Hoist the lowered-name sets out of the comprehensions — the
            # original rebuilt them once per element (accidental O(n*m)).
            diagram_keys = {d.name.lower() for d in diagram_entities}
            transcript_keys = {t.name.lower() for t in transcript_entities}
            unmatched_t = [e for e in transcript_entities if e.name.lower() not in diagram_keys]
            unmatched_d = [e for e in diagram_entities if e.name.lower() not in transcript_keys]

            if unmatched_t and unmatched_d:
                for t_name, d_name in self._fuzzy_match(unmatched_t, unmatched_d):
                    t_key = t_name.lower()
                    d_key = d_name.lower()
                    # t_key != d_key guards against the LLM echoing the same
                    # name on both sides: without it, merged.pop(d_key) would
                    # remove the entity from the result entirely and double
                    # its occurrences onto the popped object.
                    if t_key != d_key and t_key in merged and d_key in merged:
                        t_entity = merged[t_key]
                        d_entity = merged.pop(d_key)
                        t_entity.source = "both"
                        t_entity.descriptions = list(
                            set(t_entity.descriptions + d_entity.descriptions)
                        )
                        t_entity.occurrences.extend(d_entity.occurrences)

        return list(merged.values())

    def _fuzzy_match(
        self,
        transcript_entities: List[Entity],
        diagram_entities: List[Entity],
    ) -> List[tuple[str, str]]:
        """
        Use the LLM to fuzzy-match entity names across sources.

        Returns:
            (transcript_name, diagram_name) pairs the model is confident refer
            to the same thing. Empty list when no provider is configured, the
            model returns no usable JSON, or the call fails.
        """
        if not self.pm:
            return []

        t_names = [e.name for e in transcript_entities]
        d_names = [e.name for e in diagram_entities]

        prompt = (
            "Match entities that refer to the same thing across these two lists.\n\n"
            f"Transcript entities: {t_names}\n"
            f"Diagram entities: {d_names}\n\n"
            "Return a JSON array of matched pairs:\n"
            '[{"transcript": "name from list 1", "diagram": "name from list 2"}]\n\n'
            "Only include confident matches. Return empty array if no matches.\n"
            "Return ONLY the JSON array."
        )

        try:
            # Low temperature: we want deterministic, conservative matching.
            raw = self.pm.chat([{"role": "user", "content": prompt}], temperature=0.2)
            parsed = parse_json_from_response(raw)
            if isinstance(parsed, list):
                # Keep only well-formed pairs; malformed items are dropped.
                return [
                    (item["transcript"], item["diagram"])
                    for item in parsed
                    if isinstance(item, dict) and "transcript" in item and "diagram" in item
                ]
        except Exception as e:
            # Best-effort feature: a failed LLM call degrades to "no matches"
            # rather than aborting the merge. Lazy %-args per logging convention.
            logger.warning("Fuzzy matching failed: %s", e)

        return []

    def enrich_key_points(
        self,
        key_points: List[KeyPoint],
        diagrams: list,
        transcript_text: str,
    ) -> List[KeyPoint]:
        """
        Link key points to relevant diagrams by word overlap.

        Each diagram contributes a bag of terms (its element names plus words
        longer than 3 chars from its text content); a key point is linked to a
        diagram when at least 2 of its words appear in that bag.

        Args:
            key_points: Key points to enrich; mutated in place and returned.
            diagrams: Diagram records (dicts or objects with ``elements`` /
                ``text_content`` attributes).
            transcript_text: Currently unused; kept for interface stability.

        Returns:
            The same list, with ``related_diagrams`` set on matched points.
        """
        if not diagrams:
            return key_points

        # Build a lowercase term set per diagram index.
        diagram_entities: dict[int, set[str]] = {}
        for i, d in enumerate(diagrams):
            elements = d.get("elements", []) if isinstance(d, dict) else getattr(d, "elements", [])
            text = (
                d.get("text_content", "") if isinstance(d, dict) else getattr(d, "text_content", "")
            )
            entities = set(str(e).lower() for e in elements)
            if text:
                # Short words (<= 3 chars) are skipped as too generic to match on.
                entities.update(word.lower() for word in text.split() if len(word) > 3)
            diagram_entities[i] = entities

        # Match key points to diagrams by word overlap.
        for kp in key_points:
            kp_words = set(kp.point.lower().split())
            if kp.details:
                kp_words.update(kp.details.lower().split())

            related = []
            for idx, d_entities in diagram_entities.items():
                overlap = kp_words & d_entities
                # Require >= 2 shared words to avoid spurious single-word links.
                if len(overlap) >= 2:
                    related.append(idx)

            if related:
                kp.related_diagrams = related

        return key_points