"""Tests for content cross-referencing between transcript and diagram entities."""

import json
from unittest.mock import MagicMock

from video_processor.analyzers.content_analyzer import ContentAnalyzer
from video_processor.models import Entity, KeyPoint

class TestCrossReference:
    """Exercise ContentAnalyzer.cross_reference without an LLM provider."""

    def test_exact_match_merges(self):
        """Identically named entities from both sources collapse into one merged entity."""
        analyzer = ContentAnalyzer()
        from_transcript = [
            Entity(name="Python", type="concept", descriptions=["A language"]),
        ]
        from_diagram = [
            Entity(name="Python", type="concept", descriptions=["A snake-named lang"]),
        ]

        merged = analyzer.cross_reference(from_transcript, from_diagram)

        assert len(merged) == 1
        assert merged[0].source == "both"
        assert "A language" in merged[0].descriptions
        assert "A snake-named lang" in merged[0].descriptions

    def test_case_insensitive_merge(self):
        """Name matching ignores letter case when merging."""
        analyzer = ContentAnalyzer()
        merged = analyzer.cross_reference(
            [Entity(name="Docker", type="technology", descriptions=["Containers"])],
            [Entity(name="docker", type="technology", descriptions=["Container runtime"])],
        )
        assert len(merged) == 1
        assert merged[0].source == "both"

    def test_no_overlap_keeps_both(self):
        """Entities with different names are both preserved."""
        analyzer = ContentAnalyzer()
        merged = analyzer.cross_reference(
            [Entity(name="Python", type="concept", descriptions=["Lang"])],
            [Entity(name="Rust", type="concept", descriptions=["Systems"])],
        )
        assert len(merged) == 2
        assert {entity.name for entity in merged} == {"Python", "Rust"}

    def test_transcript_only(self):
        """An entity seen only in the transcript is tagged with source 'transcript'."""
        analyzer = ContentAnalyzer()
        merged = analyzer.cross_reference([Entity(name="Foo", type="concept")], [])
        assert len(merged) == 1
        assert merged[0].source == "transcript"

    def test_diagram_only(self):
        """An entity seen only in a diagram is tagged with source 'diagram'."""
        analyzer = ContentAnalyzer()
        merged = analyzer.cross_reference([], [Entity(name="Bar", type="concept")])
        assert len(merged) == 1
        assert merged[0].source == "diagram"

    def test_empty_inputs(self):
        """Two empty entity lists produce an empty result."""
        analyzer = ContentAnalyzer()
        assert analyzer.cross_reference([], []) == []

    def test_occurrences_merged(self):
        """Occurrence records from both sources are concatenated on the merged entity."""
        analyzer = ContentAnalyzer()
        from_transcript = [
            Entity(name="API", type="concept", occurrences=[{"source": "transcript", "ts": 10}]),
        ]
        from_diagram = [
            Entity(name="API", type="concept", occurrences=[{"source": "diagram", "ts": 20}]),
        ]

        merged = analyzer.cross_reference(from_transcript, from_diagram)

        assert len(merged) == 1
        assert len(merged[0].occurrences) == 2
74 |
class TestFuzzyMatch:
    """Exercise the LLM-assisted fuzzy-matching path of cross_reference."""

    def test_fuzzy_match_with_llm(self):
        """An alias pair reported by the LLM merges the two entities under the transcript name."""
        provider = MagicMock()
        provider.chat.return_value = json.dumps(
            [{"transcript": "K8s", "diagram": "Kubernetes"}]
        )
        analyzer = ContentAnalyzer(provider_manager=provider)

        merged = analyzer.cross_reference(
            [Entity(name="K8s", type="technology", descriptions=["Container orchestration"])],
            [Entity(name="Kubernetes", type="technology", descriptions=["K8s system"])],
        )

        # Fuzzy match should merge these
        assert len(merged) == 1
        assert merged[0].source == "both"
        assert merged[0].name == "K8s"

    def test_fuzzy_match_no_matches(self):
        """An empty JSON array from the LLM leaves both entities separate."""
        provider = MagicMock()
        provider.chat.return_value = "[]"
        analyzer = ContentAnalyzer(provider_manager=provider)

        merged = analyzer.cross_reference(
            [Entity(name="Alpha", type="concept")],
            [Entity(name="Beta", type="concept")],
        )
        assert len(merged) == 2

    def test_fuzzy_match_llm_error(self):
        """A provider exception degrades gracefully instead of losing entities."""
        provider = MagicMock()
        provider.chat.side_effect = Exception("API error")
        analyzer = ContentAnalyzer(provider_manager=provider)

        merged = analyzer.cross_reference(
            [Entity(name="X", type="concept")],
            [Entity(name="Y", type="concept")],
        )
        # Should still return both entities despite error
        assert len(merged) == 2

    def test_fuzzy_match_bad_json(self):
        """Unparseable LLM output is treated the same as no matches."""
        provider = MagicMock()
        provider.chat.return_value = "not json at all"
        analyzer = ContentAnalyzer(provider_manager=provider)

        merged = analyzer.cross_reference(
            [Entity(name="A", type="concept")],
            [Entity(name="B", type="concept")],
        )
        assert len(merged) == 2

    def test_fuzzy_match_skipped_without_provider(self):
        """Without a provider there is no fuzzy pass, so aliases stay distinct."""
        analyzer = ContentAnalyzer()
        merged = analyzer.cross_reference(
            [Entity(name="ML", type="concept")],
            [Entity(name="Machine Learning", type="concept")],
        )
        # No LLM so no fuzzy matching — both remain separate
        assert len(merged) == 2

    def test_fuzzy_match_skipped_when_all_exact(self):
        """When every entity matches exactly, the LLM is never consulted."""
        provider = MagicMock()
        analyzer = ContentAnalyzer(provider_manager=provider)

        merged = analyzer.cross_reference(
            [Entity(name="Same", type="concept")],
            [Entity(name="Same", type="concept")],
        )
        # All matched exactly — no fuzzy match call needed
        provider.chat.assert_not_called()
        assert len(merged) == 1
class TestEnrichKeyPoints:
    """Exercise ContentAnalyzer.enrich_key_points linking key points to diagram indices."""

    def test_enriches_with_matching_diagrams(self):
        """A key point sharing terms with a diagram is linked to that diagram's index."""
        analyzer = ContentAnalyzer()
        points = [KeyPoint(point="The deployment pipeline uses Docker containers")]
        diagrams = [
            {"elements": ["Docker", "Pipeline", "Build"], "text_content": "CI/CD flow"},
        ]

        enriched = analyzer.enrich_key_points(points, diagrams, "")

        assert len(enriched) == 1
        assert enriched[0].related_diagrams == [0]

    def test_no_match_below_threshold(self):
        """An unrelated key point is linked to no diagrams."""
        analyzer = ContentAnalyzer()
        points = [KeyPoint(point="Meeting scheduled for Friday")]
        diagrams = [
            {"elements": ["Docker", "Pipeline"], "text_content": "Architecture diagram"},
        ]

        enriched = analyzer.enrich_key_points(points, diagrams, "")
        assert enriched[0].related_diagrams == []

    def test_empty_diagrams_returns_unchanged(self):
        """With no diagrams the key points pass through with empty links."""
        analyzer = ContentAnalyzer()
        enriched = analyzer.enrich_key_points([KeyPoint(point="Test point")], [], "")
        assert len(enriched) == 1
        assert enriched[0].related_diagrams == []

    def test_multiple_diagram_matches(self):
        """A key point overlapping several diagrams links to each of them."""
        analyzer = ContentAnalyzer()
        points = [KeyPoint(point="Database migration requires testing schema changes")]
        diagrams = [
            {"elements": ["Database", "Schema", "Migration"], "text_content": ""},
            {"elements": ["Testing", "Schema", "Validation"], "text_content": ""},
        ]

        enriched = analyzer.enrich_key_points(points, diagrams, "")
        assert len(enriched[0].related_diagrams) == 2

    def test_details_used_for_matching(self):
        """Matching considers the key point's details text, not just its summary."""
        analyzer = ContentAnalyzer()
        points = [
            KeyPoint(
                point="Architecture overview", details="Uses Docker and Kubernetes for deployment"
            ),
        ]
        diagrams = [
            {"elements": ["Docker", "Kubernetes"], "text_content": "deployment infrastructure"},
        ]

        enriched = analyzer.enrich_key_points(points, diagrams, "")
        assert 0 in enriched[0].related_diagrams

    def test_diagram_as_object_with_attrs(self):
        """Diagrams may be objects exposing elements/text_content attributes, not just dicts."""
        analyzer = ContentAnalyzer()

        class FakeDiagram:
            elements = ["Alpha", "Beta"]
            text_content = "some relevant content"

        points = [KeyPoint(point="Alpha Beta interaction patterns")]
        enriched = analyzer.enrich_key_points(points, [FakeDiagram()], "")
        assert enriched[0].related_diagrams == [0]