"""Content cross-referencing between transcript and diagram entities."""

import logging
from typing import List, Optional

from video_processor.models import Entity, KeyPoint
from video_processor.providers.manager import ProviderManager
from video_processor.utils.json_parsing import parse_json_from_response

logger = logging.getLogger(__name__)
class ContentAnalyzer:
    """Cross-references transcript and diagram entities for richer knowledge."""

    def __init__(self, provider_manager: Optional[ProviderManager] = None):
        # Optional LLM access; without a provider manager only exact-name
        # merging is performed (no fuzzy matching).
        self.pm = provider_manager

    def cross_reference(
        self,
        transcript_entities: List[Entity],
        diagram_entities: List[Entity],
    ) -> List[Entity]:
        """
        Merge entities from transcripts and diagrams.

        Merges by exact (case-insensitive) name overlap first, then uses the
        LLM for fuzzy matching of remaining entities. Adds source attribution
        ("transcript", "diagram", or "both") to every returned entity.

        Args:
            transcript_entities: Entities extracted from the transcript.
            diagram_entities: Entities extracted from diagrams.

        Returns:
            A new list of merged Entity objects; inputs are not mutated.
        """
        merged: dict[str, Entity] = {}

        # Index transcript entities by lowercased name. Copies are made so the
        # caller's Entity objects are never mutated.
        for e in transcript_entities:
            merged[e.name.lower()] = Entity(
                name=e.name,
                type=e.type,
                descriptions=list(e.descriptions),
                source="transcript",
                occurrences=list(e.occurrences),
            )

        # Merge diagram entities: exact name hits become "both"; the rest are
        # added as diagram-only entries.
        for e in diagram_entities:
            key = e.name.lower()
            if key in merged:
                existing = merged[key]
                existing.source = "both"
                existing.descriptions = list(set(existing.descriptions + e.descriptions))
                existing.occurrences.extend(e.occurrences)
            else:
                merged[key] = Entity(
                    name=e.name,
                    type=e.type,
                    descriptions=list(e.descriptions),
                    source="diagram",
                    occurrences=list(e.occurrences),
                )

        # LLM fuzzy matching for entities with no exact-name counterpart.
        if self.pm:
            # Hoist the lowered-name sets out of the comprehensions — the
            # original rebuilt them once per element (accidental O(n*m)).
            diagram_keys = {d.name.lower() for d in diagram_entities}
            transcript_keys = {t.name.lower() for t in transcript_entities}
            unmatched_t = [e for e in transcript_entities if e.name.lower() not in diagram_keys]
            unmatched_d = [e for e in diagram_entities if e.name.lower() not in transcript_keys]

            if unmatched_t and unmatched_d:
                for t_name, d_name in self._fuzzy_match(unmatched_t, unmatched_d):
                    t_key = t_name.lower()
                    d_key = d_name.lower()
                    # t_key != d_key guards against the LLM echoing the same
                    # name on both sides: without it, merged.pop(d_key) would
                    # remove the entity from the result entirely and double
                    # its occurrences onto the popped object.
                    if t_key != d_key and t_key in merged and d_key in merged:
                        t_entity = merged[t_key]
                        d_entity = merged.pop(d_key)
                        t_entity.source = "both"
                        t_entity.descriptions = list(
                            set(t_entity.descriptions + d_entity.descriptions)
                        )
                        t_entity.occurrences.extend(d_entity.occurrences)

        return list(merged.values())

    def _fuzzy_match(
        self,
        transcript_entities: List[Entity],
        diagram_entities: List[Entity],
    ) -> List[tuple[str, str]]:
        """
        Use the LLM to fuzzy-match entity names across sources.

        Returns:
            (transcript_name, diagram_name) pairs the model is confident refer
            to the same thing. Empty list when no provider is configured, the
            model returns no usable JSON, or the call fails.
        """
        if not self.pm:
            return []

        t_names = [e.name for e in transcript_entities]
        d_names = [e.name for e in diagram_entities]

        prompt = (
            "Match entities that refer to the same thing across these two lists.\n\n"
            f"Transcript entities: {t_names}\n"
            f"Diagram entities: {d_names}\n\n"
            "Return a JSON array of matched pairs:\n"
            '[{"transcript": "name from list 1", "diagram": "name from list 2"}]\n\n'
            "Only include confident matches. Return empty array if no matches.\n"
            "Return ONLY the JSON array."
        )

        try:
            # Low temperature: we want deterministic, conservative matching.
            raw = self.pm.chat([{"role": "user", "content": prompt}], temperature=0.2)
            parsed = parse_json_from_response(raw)
            if isinstance(parsed, list):
                # Keep only well-formed pairs; malformed items are dropped.
                return [
                    (item["transcript"], item["diagram"])
                    for item in parsed
                    if isinstance(item, dict) and "transcript" in item and "diagram" in item
                ]
        except Exception as e:
            # Best-effort feature: a failed LLM call degrades to "no matches"
            # rather than aborting the merge. Lazy %-args per logging convention.
            logger.warning("Fuzzy matching failed: %s", e)

        return []

    def enrich_key_points(
        self,
        key_points: List[KeyPoint],
        diagrams: list,
        transcript_text: str,
    ) -> List[KeyPoint]:
        """
        Link key points to relevant diagrams by word overlap.

        Each diagram contributes a bag of terms (its element names plus words
        longer than 3 chars from its text content); a key point is linked to a
        diagram when at least 2 of its words appear in that bag.

        Args:
            key_points: Key points to enrich; mutated in place and returned.
            diagrams: Diagram records (dicts or objects with ``elements`` /
                ``text_content`` attributes).
            transcript_text: Currently unused; kept for interface stability.

        Returns:
            The same list, with ``related_diagrams`` set on matched points.
        """
        if not diagrams:
            return key_points

        # Build a lowercase term set per diagram index.
        diagram_entities: dict[int, set[str]] = {}
        for i, d in enumerate(diagrams):
            elements = d.get("elements", []) if isinstance(d, dict) else getattr(d, "elements", [])
            text = (
                d.get("text_content", "") if isinstance(d, dict) else getattr(d, "text_content", "")
            )
            entities = set(str(e).lower() for e in elements)
            if text:
                # Short words (<= 3 chars) are skipped as too generic to match on.
                entities.update(word.lower() for word in text.split() if len(word) > 3)
            diagram_entities[i] = entities

        # Match key points to diagrams by word overlap.
        for kp in key_points:
            kp_words = set(kp.point.lower().split())
            if kp.details:
                kp_words.update(kp.details.lower().split())

            related = []
            for idx, d_entities in diagram_entities.items():
                overlap = kp_words & d_entities
                # Require >= 2 shared words to avoid spurious single-word links.
                if len(overlap) >= 2:
                    related.append(idx)

            if related:
                kp.related_diagrams = related

        return key_points