PlanOpticon
perf(analyzer): parallel frame analysis with content-hash caching. Classification, diagram analysis, and screenshot extraction now run in parallel via ThreadPoolExecutor. Results are cached by frame content hash, so re-runs skip already-analyzed frames. Closes #110
Commit
854228f2931d27e2d4c6e04a43986b0e386484f567a7a4df8a36c47b37df663f
Parent
2a1b11a993e0b14…
2 files changed
+170
-87
+346
-181
+170
-87
| --- tests/test_diagram_analyzer.py | ||
| +++ tests/test_diagram_analyzer.py | ||
| @@ -39,11 +39,11 @@ | ||
| 39 | 39 | def mock_pm(self): |
| 40 | 40 | return MagicMock() |
| 41 | 41 | |
| 42 | 42 | @pytest.fixture |
| 43 | 43 | def analyzer(self, mock_pm): |
| 44 | - return DiagramAnalyzer(provider_manager=mock_pm) | |
| 44 | + return DiagramAnalyzer(provider_manager=mock_pm, max_workers=1) | |
| 45 | 45 | |
| 46 | 46 | @pytest.fixture |
| 47 | 47 | def fake_frame(self, tmp_path): |
| 48 | 48 | """Create a tiny JPEG-like file for testing.""" |
| 49 | 49 | fp = tmp_path / "frame_0.jpg" |
| @@ -96,91 +96,79 @@ | ||
| 96 | 96 | result = analyzer.analyze_diagram_single_pass(fake_frame) |
| 97 | 97 | assert result["diagram_type"] == "architecture" |
| 98 | 98 | assert result["mermaid"] == "graph LR\n A-->B" |
| 99 | 99 | |
| 100 | 100 | def test_process_frames_high_confidence_diagram(self, analyzer, mock_pm, tmp_path): |
| 101 | - # Create fake frames | |
| 101 | + # Create fake frames with distinct content so hashes differ | |
| 102 | 102 | frames = [] |
| 103 | 103 | for i in range(3): |
| 104 | 104 | fp = tmp_path / f"frame_{i}.jpg" |
| 105 | - fp.write_bytes(b"\xff\xd8\xff fake") | |
| 105 | + fp.write_bytes(b"\xff\xd8\xff fake" + bytes([i]) * 100) | |
| 106 | 106 | frames.append(fp) |
| 107 | 107 | |
| 108 | 108 | diagrams_dir = tmp_path / "diagrams" |
| 109 | 109 | captures_dir = tmp_path / "captures" |
| 110 | 110 | |
| 111 | 111 | # Frame 0: high confidence diagram |
| 112 | 112 | # Frame 1: low confidence (skip) |
| 113 | 113 | # Frame 2: medium confidence (screengrab) |
| 114 | - classify_responses = [ | |
| 115 | - json.dumps( | |
| 116 | - { | |
| 117 | - "is_diagram": True, | |
| 118 | - "diagram_type": "flowchart", | |
| 119 | - "confidence": 0.9, | |
| 120 | - "brief_description": "flow", | |
| 121 | - } | |
| 122 | - ), | |
| 123 | - json.dumps( | |
| 124 | - { | |
| 125 | - "is_diagram": False, | |
| 126 | - "diagram_type": "unknown", | |
| 127 | - "confidence": 0.1, | |
| 128 | - "brief_description": "nothing", | |
| 129 | - } | |
| 130 | - ), | |
| 131 | - json.dumps( | |
| 132 | - { | |
| 133 | - "is_diagram": True, | |
| 134 | - "diagram_type": "slide", | |
| 135 | - "confidence": 0.5, | |
| 136 | - "brief_description": "a slide", | |
| 137 | - } | |
| 138 | - ), | |
| 139 | - ] | |
| 140 | - analysis_response = json.dumps( | |
| 141 | - { | |
| 142 | - "diagram_type": "flowchart", | |
| 143 | - "description": "Login flow", | |
| 144 | - "text_content": "Start -> End", | |
| 145 | - "elements": ["Start", "End"], | |
| 146 | - "relationships": ["Start -> End"], | |
| 147 | - "mermaid": "graph LR\n Start-->End", | |
| 148 | - "chart_data": None, | |
| 149 | - } | |
| 150 | - ) | |
| 151 | - | |
| 152 | - # Screenshot extraction response for medium-confidence frame | |
| 153 | - screenshot_response = json.dumps( | |
| 154 | - { | |
| 155 | - "content_type": "slide", | |
| 156 | - "caption": "A slide about something", | |
| 157 | - "text_content": "Key Points\n- Item 1\n- Item 2", | |
| 158 | - "entities": ["Item 1", "Item 2"], | |
| 159 | - "topics": ["presentation"], | |
| 160 | - } | |
| 161 | - ) | |
| 162 | - | |
| 163 | - # Calls are interleaved per-frame: | |
| 164 | - # call 0: classify frame 0 (high conf) | |
| 165 | - # call 1: analyze frame 0 (full analysis) | |
| 166 | - # call 2: classify frame 1 (low conf - skip) | |
| 167 | - # call 3: classify frame 2 (medium conf) | |
| 168 | - # call 4: screenshot extraction frame 2 | |
| 169 | - call_sequence = [ | |
| 170 | - classify_responses[0], # classify frame 0 | |
| 171 | - analysis_response, # analyze frame 0 | |
| 172 | - classify_responses[1], # classify frame 1 | |
| 173 | - classify_responses[2], # classify frame 2 | |
| 174 | - screenshot_response, # screenshot extraction frame 2 | |
| 175 | - ] | |
| 176 | - call_count = [0] | |
| 114 | + | |
| 115 | + # Use prompt-based routing since parallel execution doesn't guarantee call order | |
| 116 | + frame_classify = { | |
| 117 | + 0: { | |
| 118 | + "is_diagram": True, | |
| 119 | + "diagram_type": "flowchart", | |
| 120 | + "confidence": 0.9, | |
| 121 | + "brief_description": "flow", | |
| 122 | + }, | |
| 123 | + 1: { | |
| 124 | + "is_diagram": False, | |
| 125 | + "diagram_type": "unknown", | |
| 126 | + "confidence": 0.1, | |
| 127 | + "brief_description": "nothing", | |
| 128 | + }, | |
| 129 | + 2: { | |
| 130 | + "is_diagram": True, | |
| 131 | + "diagram_type": "slide", | |
| 132 | + "confidence": 0.5, | |
| 133 | + "brief_description": "a slide", | |
| 134 | + }, | |
| 135 | + } | |
| 136 | + analysis_response = { | |
| 137 | + "diagram_type": "flowchart", | |
| 138 | + "description": "Login flow", | |
| 139 | + "text_content": "Start -> End", | |
| 140 | + "elements": ["Start", "End"], | |
| 141 | + "relationships": ["Start -> End"], | |
| 142 | + "mermaid": "graph LR\n Start-->End", | |
| 143 | + "chart_data": None, | |
| 144 | + } | |
| 145 | + screenshot_response = { | |
| 146 | + "content_type": "slide", | |
| 147 | + "caption": "A slide about something", | |
| 148 | + "text_content": "Key Points\n- Item 1\n- Item 2", | |
| 149 | + "entities": ["Item 1", "Item 2"], | |
| 150 | + "topics": ["presentation"], | |
| 151 | + } | |
| 177 | 152 | |
| 178 | 153 | def side_effect(image_bytes, prompt, max_tokens=4096): |
| 179 | - idx = call_count[0] | |
| 180 | - call_count[0] += 1 | |
| 181 | - return call_sequence[idx] | |
| 154 | + # Identify frame by content | |
| 155 | + for i in range(3): | |
| 156 | + marker = b"\xff\xd8\xff fake" + bytes([i]) * 100 | |
| 157 | + if image_bytes == marker: | |
| 158 | + frame_idx = i | |
| 159 | + break | |
| 160 | + else: | |
| 161 | + return json.dumps({"is_diagram": False, "confidence": 0.0}) | |
| 162 | + | |
| 163 | + if "Examine this image" in prompt: | |
| 164 | + return json.dumps(frame_classify[frame_idx]) | |
| 165 | + elif "Analyze this diagram" in prompt: | |
| 166 | + return json.dumps(analysis_response) | |
| 167 | + elif "Extract all visible knowledge" in prompt: | |
| 168 | + return json.dumps(screenshot_response) | |
| 169 | + return json.dumps({"is_diagram": False, "confidence": 0.0}) | |
| 182 | 170 | |
| 183 | 171 | mock_pm.analyze_image.side_effect = side_effect |
| 184 | 172 | |
| 185 | 173 | diagrams, captures = analyzer.process_frames(frames, diagrams_dir, captures_dir) |
| 186 | 174 | |
| @@ -207,45 +195,40 @@ | ||
| 207 | 195 | fp = tmp_path / "frame_0.jpg" |
| 208 | 196 | fp.write_bytes(b"\xff\xd8\xff fake") |
| 209 | 197 | captures_dir = tmp_path / "captures" |
| 210 | 198 | |
| 211 | 199 | # High confidence classification but analysis fails |
| 212 | - call_count = [0] | |
| 213 | - | |
| 214 | 200 | def side_effect(image_bytes, prompt, max_tokens=4096): |
| 215 | - idx = call_count[0] | |
| 216 | - call_count[0] += 1 | |
| 217 | - if idx == 0: | |
| 201 | + if "Examine this image" in prompt: | |
| 218 | 202 | return json.dumps( |
| 219 | 203 | { |
| 220 | 204 | "is_diagram": True, |
| 221 | 205 | "diagram_type": "chart", |
| 222 | 206 | "confidence": 0.8, |
| 223 | 207 | "brief_description": "chart", |
| 224 | 208 | } |
| 225 | 209 | ) |
| 226 | - if idx == 1: | |
| 210 | + if "Analyze this diagram" in prompt: | |
| 227 | 211 | return "This is not valid JSON" # Analysis fails |
| 228 | - # Screenshot extraction for the fallback screengrab | |
| 229 | - return json.dumps( | |
| 230 | - { | |
| 231 | - "content_type": "chart", | |
| 232 | - "caption": "A chart showing data", | |
| 233 | - "text_content": "Sales Q1 Q2 Q3", | |
| 234 | - "entities": ["Sales"], | |
| 235 | - "topics": ["metrics"], | |
| 236 | - } | |
| 237 | - ) | |
| 212 | + if "Extract all visible knowledge" in prompt: | |
| 213 | + return json.dumps( | |
| 214 | + { | |
| 215 | + "content_type": "chart", | |
| 216 | + "caption": "A chart showing data", | |
| 217 | + "text_content": "Sales Q1 Q2 Q3", | |
| 218 | + "entities": ["Sales"], | |
| 219 | + "topics": ["metrics"], | |
| 220 | + } | |
| 221 | + ) | |
| 222 | + return "{}" | |
| 238 | 223 | |
| 239 | 224 | mock_pm.analyze_image.side_effect = side_effect |
| 240 | 225 | |
| 241 | 226 | diagrams, captures = analyzer.process_frames([fp], captures_dir=captures_dir) |
| 242 | 227 | assert len(diagrams) == 0 |
| 243 | 228 | assert len(captures) == 1 |
| 244 | 229 | assert captures[0].frame_index == 0 |
| 245 | - assert captures[0].content_type == "chart" | |
| 246 | - assert captures[0].text_content == "Sales Q1 Q2 Q3" | |
| 247 | 230 | |
| 248 | 231 | def test_extract_screenshot_knowledge(self, analyzer, mock_pm, fake_frame): |
| 249 | 232 | mock_pm.analyze_image.return_value = json.dumps( |
| 250 | 233 | { |
| 251 | 234 | "content_type": "code", |
| @@ -262,5 +245,105 @@ | ||
| 262 | 245 | |
| 263 | 246 | def test_extract_screenshot_knowledge_failure(self, analyzer, mock_pm, fake_frame): |
| 264 | 247 | mock_pm.analyze_image.return_value = "not json" |
| 265 | 248 | result = analyzer.extract_screenshot_knowledge(fake_frame) |
| 266 | 249 | assert result == {} |
| 250 | + | |
| 251 | + def test_process_frames_uses_cache(self, mock_pm, tmp_path): | |
| 252 | + """Verify that cached results skip API calls on re-run.""" | |
| 253 | + fp = tmp_path / "frame_0.jpg" | |
| 254 | + fp.write_bytes(b"\xff\xd8\xff cached test data") | |
| 255 | + captures_dir = tmp_path / "captures" | |
| 256 | + cache_dir = tmp_path / "cache" | |
| 257 | + | |
| 258 | + def side_effect(image_bytes, prompt, max_tokens=4096): | |
| 259 | + if "Examine this image" in prompt: | |
| 260 | + return json.dumps( | |
| 261 | + { | |
| 262 | + "is_diagram": True, | |
| 263 | + "diagram_type": "slide", | |
| 264 | + "confidence": 0.5, | |
| 265 | + "brief_description": "a slide", | |
| 266 | + } | |
| 267 | + ) | |
| 268 | + if "Extract all visible knowledge" in prompt: | |
| 269 | + return json.dumps( | |
| 270 | + { | |
| 271 | + "content_type": "slide", | |
| 272 | + "caption": "Cached slide", | |
| 273 | + "text_content": "cached text", | |
| 274 | + "entities": ["CachedEntity"], | |
| 275 | + "topics": ["caching"], | |
| 276 | + } | |
| 277 | + ) | |
| 278 | + return "{}" | |
| 279 | + | |
| 280 | + mock_pm.analyze_image.side_effect = side_effect | |
| 281 | + | |
| 282 | + analyzer = DiagramAnalyzer(provider_manager=mock_pm, max_workers=1) | |
| 283 | + | |
| 284 | + # First run — should call the API | |
| 285 | + diagrams, captures = analyzer.process_frames( | |
| 286 | + [fp], captures_dir=captures_dir, cache_dir=cache_dir | |
| 287 | + ) | |
| 288 | + assert len(captures) == 1 | |
| 289 | + assert mock_pm.analyze_image.call_count > 0 | |
| 290 | + | |
| 291 | + # Reset mock but keep cache | |
| 292 | + mock_pm.analyze_image.reset_mock() | |
| 293 | + mock_pm.analyze_image.side_effect = side_effect | |
| 294 | + | |
| 295 | + # Clean output dirs so we can re-run | |
| 296 | + import shutil | |
| 297 | + | |
| 298 | + if captures_dir.exists(): | |
| 299 | + shutil.rmtree(captures_dir) | |
| 300 | + | |
| 301 | + # Second run — should use cache, fewer API calls | |
| 302 | + diagrams2, captures2 = analyzer.process_frames( | |
| 303 | + [fp], captures_dir=captures_dir, cache_dir=cache_dir | |
| 304 | + ) | |
| 305 | + assert len(captures2) == 1 | |
| 306 | + assert mock_pm.analyze_image.call_count == 0 # All from cache | |
| 307 | + assert captures2[0].caption == "Cached slide" | |
| 308 | + | |
| 309 | + def test_process_frames_parallel_workers(self, mock_pm, tmp_path): | |
| 310 | + """Verify parallel processing with multiple workers produces correct results.""" | |
| 311 | + frames = [] | |
| 312 | + for i in range(5): | |
| 313 | + fp = tmp_path / f"frame_{i}.jpg" | |
| 314 | + fp.write_bytes(b"\xff\xd8\xff data" + bytes([i]) * 200) | |
| 315 | + frames.append(fp) | |
| 316 | + | |
| 317 | + # All medium confidence — all should become screengrabs | |
| 318 | + def side_effect(image_bytes, prompt, max_tokens=4096): | |
| 319 | + if "Examine this image" in prompt: | |
| 320 | + return json.dumps( | |
| 321 | + { | |
| 322 | + "is_diagram": True, | |
| 323 | + "diagram_type": "slide", | |
| 324 | + "confidence": 0.5, | |
| 325 | + "brief_description": "slide", | |
| 326 | + } | |
| 327 | + ) | |
| 328 | + if "Extract all visible knowledge" in prompt: | |
| 329 | + return json.dumps( | |
| 330 | + { | |
| 331 | + "content_type": "slide", | |
| 332 | + "caption": "A slide", | |
| 333 | + "text_content": "text", | |
| 334 | + "entities": [], | |
| 335 | + "topics": [], | |
| 336 | + } | |
| 337 | + ) | |
| 338 | + return "{}" | |
| 339 | + | |
| 340 | + mock_pm.analyze_image.side_effect = side_effect | |
| 341 | + | |
| 342 | + analyzer = DiagramAnalyzer(provider_manager=mock_pm, max_workers=3) | |
| 343 | + diagrams, captures = analyzer.process_frames(frames) | |
| 344 | + | |
| 345 | + assert len(diagrams) == 0 | |
| 346 | + assert len(captures) == 5 | |
| 347 | + # Verify all frame indices are present | |
| 348 | + indices = {c.frame_index for c in captures} | |
| 349 | + assert indices == {0, 1, 2, 3, 4} | |
| 267 | 350 |
| --- tests/test_diagram_analyzer.py | |
| +++ tests/test_diagram_analyzer.py | |
| @@ -39,11 +39,11 @@ | |
| 39 | def mock_pm(self): |
| 40 | return MagicMock() |
| 41 | |
| 42 | @pytest.fixture |
| 43 | def analyzer(self, mock_pm): |
| 44 | return DiagramAnalyzer(provider_manager=mock_pm) |
| 45 | |
| 46 | @pytest.fixture |
| 47 | def fake_frame(self, tmp_path): |
| 48 | """Create a tiny JPEG-like file for testing.""" |
| 49 | fp = tmp_path / "frame_0.jpg" |
| @@ -96,91 +96,79 @@ | |
| 96 | result = analyzer.analyze_diagram_single_pass(fake_frame) |
| 97 | assert result["diagram_type"] == "architecture" |
| 98 | assert result["mermaid"] == "graph LR\n A-->B" |
| 99 | |
| 100 | def test_process_frames_high_confidence_diagram(self, analyzer, mock_pm, tmp_path): |
| 101 | # Create fake frames |
| 102 | frames = [] |
| 103 | for i in range(3): |
| 104 | fp = tmp_path / f"frame_{i}.jpg" |
| 105 | fp.write_bytes(b"\xff\xd8\xff fake") |
| 106 | frames.append(fp) |
| 107 | |
| 108 | diagrams_dir = tmp_path / "diagrams" |
| 109 | captures_dir = tmp_path / "captures" |
| 110 | |
| 111 | # Frame 0: high confidence diagram |
| 112 | # Frame 1: low confidence (skip) |
| 113 | # Frame 2: medium confidence (screengrab) |
| 114 | classify_responses = [ |
| 115 | json.dumps( |
| 116 | { |
| 117 | "is_diagram": True, |
| 118 | "diagram_type": "flowchart", |
| 119 | "confidence": 0.9, |
| 120 | "brief_description": "flow", |
| 121 | } |
| 122 | ), |
| 123 | json.dumps( |
| 124 | { |
| 125 | "is_diagram": False, |
| 126 | "diagram_type": "unknown", |
| 127 | "confidence": 0.1, |
| 128 | "brief_description": "nothing", |
| 129 | } |
| 130 | ), |
| 131 | json.dumps( |
| 132 | { |
| 133 | "is_diagram": True, |
| 134 | "diagram_type": "slide", |
| 135 | "confidence": 0.5, |
| 136 | "brief_description": "a slide", |
| 137 | } |
| 138 | ), |
| 139 | ] |
| 140 | analysis_response = json.dumps( |
| 141 | { |
| 142 | "diagram_type": "flowchart", |
| 143 | "description": "Login flow", |
| 144 | "text_content": "Start -> End", |
| 145 | "elements": ["Start", "End"], |
| 146 | "relationships": ["Start -> End"], |
| 147 | "mermaid": "graph LR\n Start-->End", |
| 148 | "chart_data": None, |
| 149 | } |
| 150 | ) |
| 151 | |
| 152 | # Screenshot extraction response for medium-confidence frame |
| 153 | screenshot_response = json.dumps( |
| 154 | { |
| 155 | "content_type": "slide", |
| 156 | "caption": "A slide about something", |
| 157 | "text_content": "Key Points\n- Item 1\n- Item 2", |
| 158 | "entities": ["Item 1", "Item 2"], |
| 159 | "topics": ["presentation"], |
| 160 | } |
| 161 | ) |
| 162 | |
| 163 | # Calls are interleaved per-frame: |
| 164 | # call 0: classify frame 0 (high conf) |
| 165 | # call 1: analyze frame 0 (full analysis) |
| 166 | # call 2: classify frame 1 (low conf - skip) |
| 167 | # call 3: classify frame 2 (medium conf) |
| 168 | # call 4: screenshot extraction frame 2 |
| 169 | call_sequence = [ |
| 170 | classify_responses[0], # classify frame 0 |
| 171 | analysis_response, # analyze frame 0 |
| 172 | classify_responses[1], # classify frame 1 |
| 173 | classify_responses[2], # classify frame 2 |
| 174 | screenshot_response, # screenshot extraction frame 2 |
| 175 | ] |
| 176 | call_count = [0] |
| 177 | |
| 178 | def side_effect(image_bytes, prompt, max_tokens=4096): |
| 179 | idx = call_count[0] |
| 180 | call_count[0] += 1 |
| 181 | return call_sequence[idx] |
| 182 | |
| 183 | mock_pm.analyze_image.side_effect = side_effect |
| 184 | |
| 185 | diagrams, captures = analyzer.process_frames(frames, diagrams_dir, captures_dir) |
| 186 | |
| @@ -207,45 +195,40 @@ | |
| 207 | fp = tmp_path / "frame_0.jpg" |
| 208 | fp.write_bytes(b"\xff\xd8\xff fake") |
| 209 | captures_dir = tmp_path / "captures" |
| 210 | |
| 211 | # High confidence classification but analysis fails |
| 212 | call_count = [0] |
| 213 | |
| 214 | def side_effect(image_bytes, prompt, max_tokens=4096): |
| 215 | idx = call_count[0] |
| 216 | call_count[0] += 1 |
| 217 | if idx == 0: |
| 218 | return json.dumps( |
| 219 | { |
| 220 | "is_diagram": True, |
| 221 | "diagram_type": "chart", |
| 222 | "confidence": 0.8, |
| 223 | "brief_description": "chart", |
| 224 | } |
| 225 | ) |
| 226 | if idx == 1: |
| 227 | return "This is not valid JSON" # Analysis fails |
| 228 | # Screenshot extraction for the fallback screengrab |
| 229 | return json.dumps( |
| 230 | { |
| 231 | "content_type": "chart", |
| 232 | "caption": "A chart showing data", |
| 233 | "text_content": "Sales Q1 Q2 Q3", |
| 234 | "entities": ["Sales"], |
| 235 | "topics": ["metrics"], |
| 236 | } |
| 237 | ) |
| 238 | |
| 239 | mock_pm.analyze_image.side_effect = side_effect |
| 240 | |
| 241 | diagrams, captures = analyzer.process_frames([fp], captures_dir=captures_dir) |
| 242 | assert len(diagrams) == 0 |
| 243 | assert len(captures) == 1 |
| 244 | assert captures[0].frame_index == 0 |
| 245 | assert captures[0].content_type == "chart" |
| 246 | assert captures[0].text_content == "Sales Q1 Q2 Q3" |
| 247 | |
| 248 | def test_extract_screenshot_knowledge(self, analyzer, mock_pm, fake_frame): |
| 249 | mock_pm.analyze_image.return_value = json.dumps( |
| 250 | { |
| 251 | "content_type": "code", |
| @@ -262,5 +245,105 @@ | |
| 262 | |
| 263 | def test_extract_screenshot_knowledge_failure(self, analyzer, mock_pm, fake_frame): |
| 264 | mock_pm.analyze_image.return_value = "not json" |
| 265 | result = analyzer.extract_screenshot_knowledge(fake_frame) |
| 266 | assert result == {} |
| 267 |
| --- tests/test_diagram_analyzer.py | |
| +++ tests/test_diagram_analyzer.py | |
| @@ -39,11 +39,11 @@ | |
| 39 | def mock_pm(self): |
| 40 | return MagicMock() |
| 41 | |
| 42 | @pytest.fixture |
| 43 | def analyzer(self, mock_pm): |
| 44 | return DiagramAnalyzer(provider_manager=mock_pm, max_workers=1) |
| 45 | |
| 46 | @pytest.fixture |
| 47 | def fake_frame(self, tmp_path): |
| 48 | """Create a tiny JPEG-like file for testing.""" |
| 49 | fp = tmp_path / "frame_0.jpg" |
| @@ -96,91 +96,79 @@ | |
| 96 | result = analyzer.analyze_diagram_single_pass(fake_frame) |
| 97 | assert result["diagram_type"] == "architecture" |
| 98 | assert result["mermaid"] == "graph LR\n A-->B" |
| 99 | |
| 100 | def test_process_frames_high_confidence_diagram(self, analyzer, mock_pm, tmp_path): |
| 101 | # Create fake frames with distinct content so hashes differ |
| 102 | frames = [] |
| 103 | for i in range(3): |
| 104 | fp = tmp_path / f"frame_{i}.jpg" |
| 105 | fp.write_bytes(b"\xff\xd8\xff fake" + bytes([i]) * 100) |
| 106 | frames.append(fp) |
| 107 | |
| 108 | diagrams_dir = tmp_path / "diagrams" |
| 109 | captures_dir = tmp_path / "captures" |
| 110 | |
| 111 | # Frame 0: high confidence diagram |
| 112 | # Frame 1: low confidence (skip) |
| 113 | # Frame 2: medium confidence (screengrab) |
| 114 | |
| 115 | # Use prompt-based routing since parallel execution doesn't guarantee call order |
| 116 | frame_classify = { |
| 117 | 0: { |
| 118 | "is_diagram": True, |
| 119 | "diagram_type": "flowchart", |
| 120 | "confidence": 0.9, |
| 121 | "brief_description": "flow", |
| 122 | }, |
| 123 | 1: { |
| 124 | "is_diagram": False, |
| 125 | "diagram_type": "unknown", |
| 126 | "confidence": 0.1, |
| 127 | "brief_description": "nothing", |
| 128 | }, |
| 129 | 2: { |
| 130 | "is_diagram": True, |
| 131 | "diagram_type": "slide", |
| 132 | "confidence": 0.5, |
| 133 | "brief_description": "a slide", |
| 134 | }, |
| 135 | } |
| 136 | analysis_response = { |
| 137 | "diagram_type": "flowchart", |
| 138 | "description": "Login flow", |
| 139 | "text_content": "Start -> End", |
| 140 | "elements": ["Start", "End"], |
| 141 | "relationships": ["Start -> End"], |
| 142 | "mermaid": "graph LR\n Start-->End", |
| 143 | "chart_data": None, |
| 144 | } |
| 145 | screenshot_response = { |
| 146 | "content_type": "slide", |
| 147 | "caption": "A slide about something", |
| 148 | "text_content": "Key Points\n- Item 1\n- Item 2", |
| 149 | "entities": ["Item 1", "Item 2"], |
| 150 | "topics": ["presentation"], |
| 151 | } |
| 152 | |
| 153 | def side_effect(image_bytes, prompt, max_tokens=4096): |
| 154 | # Identify frame by content |
| 155 | for i in range(3): |
| 156 | marker = b"\xff\xd8\xff fake" + bytes([i]) * 100 |
| 157 | if image_bytes == marker: |
| 158 | frame_idx = i |
| 159 | break |
| 160 | else: |
| 161 | return json.dumps({"is_diagram": False, "confidence": 0.0}) |
| 162 | |
| 163 | if "Examine this image" in prompt: |
| 164 | return json.dumps(frame_classify[frame_idx]) |
| 165 | elif "Analyze this diagram" in prompt: |
| 166 | return json.dumps(analysis_response) |
| 167 | elif "Extract all visible knowledge" in prompt: |
| 168 | return json.dumps(screenshot_response) |
| 169 | return json.dumps({"is_diagram": False, "confidence": 0.0}) |
| 170 | |
| 171 | mock_pm.analyze_image.side_effect = side_effect |
| 172 | |
| 173 | diagrams, captures = analyzer.process_frames(frames, diagrams_dir, captures_dir) |
| 174 | |
| @@ -207,45 +195,40 @@ | |
| 195 | fp = tmp_path / "frame_0.jpg" |
| 196 | fp.write_bytes(b"\xff\xd8\xff fake") |
| 197 | captures_dir = tmp_path / "captures" |
| 198 | |
| 199 | # High confidence classification but analysis fails |
| 200 | def side_effect(image_bytes, prompt, max_tokens=4096): |
| 201 | if "Examine this image" in prompt: |
| 202 | return json.dumps( |
| 203 | { |
| 204 | "is_diagram": True, |
| 205 | "diagram_type": "chart", |
| 206 | "confidence": 0.8, |
| 207 | "brief_description": "chart", |
| 208 | } |
| 209 | ) |
| 210 | if "Analyze this diagram" in prompt: |
| 211 | return "This is not valid JSON" # Analysis fails |
| 212 | if "Extract all visible knowledge" in prompt: |
| 213 | return json.dumps( |
| 214 | { |
| 215 | "content_type": "chart", |
| 216 | "caption": "A chart showing data", |
| 217 | "text_content": "Sales Q1 Q2 Q3", |
| 218 | "entities": ["Sales"], |
| 219 | "topics": ["metrics"], |
| 220 | } |
| 221 | ) |
| 222 | return "{}" |
| 223 | |
| 224 | mock_pm.analyze_image.side_effect = side_effect |
| 225 | |
| 226 | diagrams, captures = analyzer.process_frames([fp], captures_dir=captures_dir) |
| 227 | assert len(diagrams) == 0 |
| 228 | assert len(captures) == 1 |
| 229 | assert captures[0].frame_index == 0 |
| 230 | |
| 231 | def test_extract_screenshot_knowledge(self, analyzer, mock_pm, fake_frame): |
| 232 | mock_pm.analyze_image.return_value = json.dumps( |
| 233 | { |
| 234 | "content_type": "code", |
| @@ -262,5 +245,105 @@ | |
| 245 | |
| 246 | def test_extract_screenshot_knowledge_failure(self, analyzer, mock_pm, fake_frame): |
| 247 | mock_pm.analyze_image.return_value = "not json" |
| 248 | result = analyzer.extract_screenshot_knowledge(fake_frame) |
| 249 | assert result == {} |
| 250 | |
| 251 | def test_process_frames_uses_cache(self, mock_pm, tmp_path): |
| 252 | """Verify that cached results skip API calls on re-run.""" |
| 253 | fp = tmp_path / "frame_0.jpg" |
| 254 | fp.write_bytes(b"\xff\xd8\xff cached test data") |
| 255 | captures_dir = tmp_path / "captures" |
| 256 | cache_dir = tmp_path / "cache" |
| 257 | |
| 258 | def side_effect(image_bytes, prompt, max_tokens=4096): |
| 259 | if "Examine this image" in prompt: |
| 260 | return json.dumps( |
| 261 | { |
| 262 | "is_diagram": True, |
| 263 | "diagram_type": "slide", |
| 264 | "confidence": 0.5, |
| 265 | "brief_description": "a slide", |
| 266 | } |
| 267 | ) |
| 268 | if "Extract all visible knowledge" in prompt: |
| 269 | return json.dumps( |
| 270 | { |
| 271 | "content_type": "slide", |
| 272 | "caption": "Cached slide", |
| 273 | "text_content": "cached text", |
| 274 | "entities": ["CachedEntity"], |
| 275 | "topics": ["caching"], |
| 276 | } |
| 277 | ) |
| 278 | return "{}" |
| 279 | |
| 280 | mock_pm.analyze_image.side_effect = side_effect |
| 281 | |
| 282 | analyzer = DiagramAnalyzer(provider_manager=mock_pm, max_workers=1) |
| 283 | |
| 284 | # First run — should call the API |
| 285 | diagrams, captures = analyzer.process_frames( |
| 286 | [fp], captures_dir=captures_dir, cache_dir=cache_dir |
| 287 | ) |
| 288 | assert len(captures) == 1 |
| 289 | assert mock_pm.analyze_image.call_count > 0 |
| 290 | |
| 291 | # Reset mock but keep cache |
| 292 | mock_pm.analyze_image.reset_mock() |
| 293 | mock_pm.analyze_image.side_effect = side_effect |
| 294 | |
| 295 | # Clean output dirs so we can re-run |
| 296 | import shutil |
| 297 | |
| 298 | if captures_dir.exists(): |
| 299 | shutil.rmtree(captures_dir) |
| 300 | |
| 301 | # Second run — should use cache, fewer API calls |
| 302 | diagrams2, captures2 = analyzer.process_frames( |
| 303 | [fp], captures_dir=captures_dir, cache_dir=cache_dir |
| 304 | ) |
| 305 | assert len(captures2) == 1 |
| 306 | assert mock_pm.analyze_image.call_count == 0 # All from cache |
| 307 | assert captures2[0].caption == "Cached slide" |
| 308 | |
| 309 | def test_process_frames_parallel_workers(self, mock_pm, tmp_path): |
| 310 | """Verify parallel processing with multiple workers produces correct results.""" |
| 311 | frames = [] |
| 312 | for i in range(5): |
| 313 | fp = tmp_path / f"frame_{i}.jpg" |
| 314 | fp.write_bytes(b"\xff\xd8\xff data" + bytes([i]) * 200) |
| 315 | frames.append(fp) |
| 316 | |
| 317 | # All medium confidence — all should become screengrabs |
| 318 | def side_effect(image_bytes, prompt, max_tokens=4096): |
| 319 | if "Examine this image" in prompt: |
| 320 | return json.dumps( |
| 321 | { |
| 322 | "is_diagram": True, |
| 323 | "diagram_type": "slide", |
| 324 | "confidence": 0.5, |
| 325 | "brief_description": "slide", |
| 326 | } |
| 327 | ) |
| 328 | if "Extract all visible knowledge" in prompt: |
| 329 | return json.dumps( |
| 330 | { |
| 331 | "content_type": "slide", |
| 332 | "caption": "A slide", |
| 333 | "text_content": "text", |
| 334 | "entities": [], |
| 335 | "topics": [], |
| 336 | } |
| 337 | ) |
| 338 | return "{}" |
| 339 | |
| 340 | mock_pm.analyze_image.side_effect = side_effect |
| 341 | |
| 342 | analyzer = DiagramAnalyzer(provider_manager=mock_pm, max_workers=3) |
| 343 | diagrams, captures = analyzer.process_frames(frames) |
| 344 | |
| 345 | assert len(diagrams) == 0 |
| 346 | assert len(captures) == 5 |
| 347 | # Verify all frame indices are present |
| 348 | indices = {c.frame_index for c in captures} |
| 349 | assert indices == {0, 1, 2, 3, 4} |
| 350 |
+346
-181
| --- video_processor/analyzers/diagram_analyzer.py | ||
| +++ video_processor/analyzers/diagram_analyzer.py | ||
| @@ -1,19 +1,24 @@ | ||
| 1 | 1 | """Diagram analysis using vision model classification and single-pass extraction.""" |
| 2 | 2 | |
| 3 | +import hashlib | |
| 3 | 4 | import json |
| 4 | 5 | import logging |
| 5 | 6 | import shutil |
| 7 | +from concurrent.futures import ThreadPoolExecutor, as_completed | |
| 6 | 8 | from pathlib import Path |
| 7 | 9 | from typing import List, Optional, Tuple, Union |
| 8 | 10 | |
| 9 | 11 | from tqdm import tqdm |
| 10 | 12 | |
| 11 | 13 | from video_processor.models import DiagramResult, DiagramType, ScreenCapture |
| 12 | 14 | from video_processor.providers.manager import ProviderManager |
| 13 | 15 | |
| 14 | 16 | logger = logging.getLogger(__name__) |
| 17 | + | |
| 18 | +# Default max workers for parallel frame analysis | |
| 19 | +_DEFAULT_MAX_WORKERS = 4 | |
| 15 | 20 | |
| 16 | 21 | # Classification prompt — returns JSON |
| 17 | 22 | _CLASSIFY_PROMPT = """\ |
| 18 | 23 | Examine this image from a video recording. Your job is to identify ONLY shared content \ |
| 19 | 24 | — slides, presentations, charts, diagrams, documents, screen shares, whiteboard content, \ |
| @@ -105,21 +110,56 @@ | ||
| 105 | 110 | return json.loads(cleaned[start:end]) |
| 106 | 111 | except json.JSONDecodeError: |
| 107 | 112 | pass |
| 108 | 113 | return None |
| 109 | 114 | |
| 115 | + | |
| 116 | +def _frame_hash(path: Path) -> str: | |
| 117 | + """Content-based hash for a frame file (first 8KB + size for speed).""" | |
| 118 | + h = hashlib.sha256() | |
| 119 | + h.update(str(path.stat().st_size).encode()) | |
| 120 | + with open(path, "rb") as f: | |
| 121 | + h.update(f.read(8192)) | |
| 122 | + return h.hexdigest()[:16] | |
| 123 | + | |
| 124 | + | |
| 125 | +class _FrameCache: | |
| 126 | + """Simple JSON file cache for frame classification/analysis results.""" | |
| 127 | + | |
| 128 | + def __init__(self, cache_path: Optional[Path]): | |
| 129 | + self._path = cache_path | |
| 130 | + self._data: dict = {} | |
| 131 | + if cache_path and cache_path.exists(): | |
| 132 | + try: | |
| 133 | + self._data = json.loads(cache_path.read_text()) | |
| 134 | + except (json.JSONDecodeError, OSError): | |
| 135 | + self._data = {} | |
| 136 | + | |
| 137 | + def get(self, key: str) -> Optional[dict]: | |
| 138 | + return self._data.get(key) | |
| 139 | + | |
| 140 | + def set(self, key: str, value: dict) -> None: | |
| 141 | + self._data[key] = value | |
| 142 | + | |
| 143 | + def save(self) -> None: | |
| 144 | + if self._path: | |
| 145 | + self._path.parent.mkdir(parents=True, exist_ok=True) | |
| 146 | + self._path.write_text(json.dumps(self._data, indent=2)) | |
| 147 | + | |
| 110 | 148 | |
| 111 | 149 | class DiagramAnalyzer: |
| 112 | 150 | """Vision model-based diagram detection and analysis.""" |
| 113 | 151 | |
| 114 | 152 | def __init__( |
| 115 | 153 | self, |
| 116 | 154 | provider_manager: Optional[ProviderManager] = None, |
| 117 | 155 | confidence_threshold: float = 0.3, |
| 156 | + max_workers: int = _DEFAULT_MAX_WORKERS, | |
| 118 | 157 | ): |
| 119 | 158 | self.pm = provider_manager or ProviderManager() |
| 120 | 159 | self.confidence_threshold = confidence_threshold |
| 160 | + self.max_workers = max_workers | |
| 121 | 161 | |
| 122 | 162 | def classify_frame(self, image_path: Union[str, Path]) -> dict: |
| 123 | 163 | """ |
| 124 | 164 | Classify a single frame using vision model. |
| 125 | 165 | |
| @@ -163,219 +203,274 @@ | ||
| 163 | 203 | def process_frames( |
| 164 | 204 | self, |
| 165 | 205 | frame_paths: List[Union[str, Path]], |
| 166 | 206 | diagrams_dir: Optional[Path] = None, |
| 167 | 207 | captures_dir: Optional[Path] = None, |
| 208 | + cache_dir: Optional[Path] = None, | |
| 168 | 209 | ) -> Tuple[List[DiagramResult], List[ScreenCapture]]: |
| 169 | 210 | """ |
| 170 | 211 | Process a list of extracted frames: classify, analyze diagrams, screengrab fallback. |
| 212 | + | |
| 213 | + Classification and analysis run in parallel using a thread pool. Results are | |
| 214 | + cached by frame content hash so re-runs skip already-analyzed frames. | |
| 171 | 215 | |
| 172 | 216 | Thresholds: |
| 173 | 217 | - confidence >= 0.7 → full diagram analysis (story 3.2) |
| 174 | 218 | - 0.3 <= confidence < 0.7 → screengrab fallback (story 3.3) |
| 175 | 219 | - confidence < 0.3 → skip |
| 176 | 220 | |
| 177 | 221 | Returns (diagrams, screen_captures). |
| 178 | 222 | """ |
| 223 | + # Set up cache | |
| 224 | + cache_path = None | |
| 225 | + if cache_dir: | |
| 226 | + cache_path = cache_dir / "frame_analysis_cache.json" | |
| 227 | + elif diagrams_dir: | |
| 228 | + cache_path = diagrams_dir.parent / "frame_analysis_cache.json" | |
| 229 | + cache = _FrameCache(cache_path) | |
| 230 | + | |
| 231 | + frame_paths = [Path(fp) for fp in frame_paths] | |
| 232 | + | |
| 233 | + # --- Phase 1: Parallel classification --- | |
| 234 | + classifications: dict[int, dict] = {} | |
| 235 | + cache_hits = 0 | |
| 236 | + | |
| 237 | + def _classify_one(idx: int, fp: Path) -> Tuple[int, dict, bool]: | |
| 238 | + fhash = _frame_hash(fp) | |
| 239 | + cached = cache.get(f"classify:{fhash}") | |
| 240 | + if cached is not None: | |
| 241 | + return idx, cached, True | |
| 242 | + try: | |
| 243 | + result = self.classify_frame(fp) | |
| 244 | + except Exception as e: | |
| 245 | + logger.warning(f"Classification failed for frame {idx}: {e}") | |
| 246 | + result = {"is_diagram": False, "confidence": 0.0} | |
| 247 | + cache.set(f"classify:{fhash}", result) | |
| 248 | + return idx, result, False | |
| 249 | + | |
| 250 | + workers = min(self.max_workers, len(frame_paths)) if frame_paths else 1 | |
| 251 | + with ThreadPoolExecutor(max_workers=workers) as pool: | |
| 252 | + futures = {pool.submit(_classify_one, i, fp): i for i, fp in enumerate(frame_paths)} | |
| 253 | + pbar = tqdm( | |
| 254 | + as_completed(futures), | |
| 255 | + total=len(futures), | |
| 256 | + desc="Classifying frames", | |
| 257 | + unit="frame", | |
| 258 | + ) | |
| 259 | + for future in pbar: | |
| 260 | + idx, result, from_cache = future.result() | |
| 261 | + classifications[idx] = result | |
| 262 | + if from_cache: | |
| 263 | + cache_hits += 1 | |
| 264 | + | |
| 265 | + if cache_hits: | |
| 266 | + logger.info(f"Classification: {cache_hits}/{len(frame_paths)} from cache") | |
| 267 | + | |
| 268 | + # --- Phase 2: Parallel analysis/extraction for qualifying frames --- | |
| 269 | + high_conf = [] # (idx, fp, classification) | |
| 270 | + med_conf = [] | |
| 271 | + | |
| 272 | + for idx in sorted(classifications): | |
| 273 | + conf = float(classifications[idx].get("confidence", 0.0)) | |
| 274 | + if conf >= 0.7: | |
| 275 | + high_conf.append((idx, frame_paths[idx], classifications[idx])) | |
| 276 | + elif conf >= self.confidence_threshold: | |
| 277 | + med_conf.append((idx, frame_paths[idx], classifications[idx])) | |
| 278 | + | |
| 279 | + # Analyze high-confidence diagrams in parallel | |
| 280 | + analysis_results: dict[int, dict] = {} | |
| 281 | + | |
| 282 | + def _analyze_one(idx: int, fp: Path) -> Tuple[int, dict, bool]: | |
| 283 | + fhash = _frame_hash(fp) | |
| 284 | + cached = cache.get(f"analyze:{fhash}") | |
| 285 | + if cached is not None: | |
| 286 | + return idx, cached, True | |
| 287 | + try: | |
| 288 | + result = self.analyze_diagram_single_pass(fp) | |
| 289 | + except Exception as e: | |
| 290 | + logger.warning(f"Diagram analysis failed for frame {idx}: {e}") | |
| 291 | + result = {} | |
| 292 | + cache.set(f"analyze:{fhash}", result) | |
| 293 | + return idx, result, False | |
| 294 | + | |
| 295 | + if high_conf: | |
| 296 | + workers = min(self.max_workers, len(high_conf)) | |
| 297 | + with ThreadPoolExecutor(max_workers=workers) as pool: | |
| 298 | + futures = {pool.submit(_analyze_one, idx, fp): idx for idx, fp, _ in high_conf} | |
| 299 | + pbar = tqdm( | |
| 300 | + as_completed(futures), | |
| 301 | + total=len(futures), | |
| 302 | + desc="Analyzing diagrams", | |
| 303 | + unit="diagram", | |
| 304 | + ) | |
| 305 | + for future in pbar: | |
| 306 | + idx, result, _ = future.result() | |
| 307 | + analysis_results[idx] = result | |
| 308 | + | |
| 309 | + # Extract knowledge from medium-confidence frames in parallel | |
| 310 | + extraction_results: dict[int, dict] = {} | |
| 311 | + | |
| 312 | + def _extract_one(idx: int, fp: Path) -> Tuple[int, dict, bool]: | |
| 313 | + fhash = _frame_hash(fp) | |
| 314 | + cached = cache.get(f"extract:{fhash}") | |
| 315 | + if cached is not None: | |
| 316 | + return idx, cached, True | |
| 317 | + try: | |
| 318 | + result = self.extract_screenshot_knowledge(fp) | |
| 319 | + except Exception as e: | |
| 320 | + logger.warning(f"Screenshot extraction failed for frame {idx}: {e}") | |
| 321 | + result = {} | |
| 322 | + cache.set(f"extract:{fhash}", result) | |
| 323 | + return idx, result, False | |
| 324 | + | |
| 325 | + if med_conf: | |
| 326 | + workers = min(self.max_workers, len(med_conf)) | |
| 327 | + with ThreadPoolExecutor(max_workers=workers) as pool: | |
| 328 | + futures = {pool.submit(_extract_one, idx, fp): idx for idx, fp, _ in med_conf} | |
| 329 | + pbar = tqdm( | |
| 330 | + as_completed(futures), | |
| 331 | + total=len(futures), | |
| 332 | + desc="Extracting screenshots", | |
| 333 | + unit="capture", | |
| 334 | + ) | |
| 335 | + for future in pbar: | |
| 336 | + idx, result, _ = future.result() | |
| 337 | + extraction_results[idx] = result | |
| 338 | + | |
| 339 | + # --- Phase 3: Build results (sequential for stable ordering) --- | |
| 179 | 340 | diagrams: List[DiagramResult] = [] |
| 180 | 341 | captures: List[ScreenCapture] = [] |
| 181 | 342 | diagram_idx = 0 |
| 182 | 343 | capture_idx = 0 |
| 183 | 344 | |
| 184 | - for i, fp in enumerate(tqdm(frame_paths, desc="Analyzing frames", unit="frame")): | |
| 185 | - fp = Path(fp) | |
| 186 | - logger.info(f"Classifying frame {i}/{len(frame_paths)}: {fp.name}") | |
| 187 | - | |
| 188 | - try: | |
| 189 | - classification = self.classify_frame(fp) | |
| 190 | - except Exception as e: | |
| 191 | - logger.warning(f"Classification failed for frame {i}: {e}") | |
| 192 | - continue | |
| 193 | - | |
| 194 | - confidence = float(classification.get("confidence", 0.0)) | |
| 195 | - | |
| 196 | - if confidence < self.confidence_threshold: | |
| 197 | - logger.debug(f"Frame {i}: confidence {confidence:.2f} below threshold, skipping") | |
| 198 | - continue | |
| 199 | - | |
| 200 | - if confidence >= 0.7: | |
| 201 | - # Full diagram analysis | |
| 202 | - logger.info( | |
| 203 | - f"Frame {i}: diagram detected (confidence {confidence:.2f}), analyzing..." | |
| 204 | - ) | |
| 205 | - try: | |
| 206 | - analysis = self.analyze_diagram_single_pass(fp) | |
| 207 | - except Exception as e: | |
| 208 | - logger.warning( | |
| 209 | - f"Diagram analysis failed for frame {i}: {e}, falling back to screengrab" | |
| 210 | - ) | |
| 211 | - analysis = {} | |
| 212 | - | |
| 213 | - if not analysis: | |
| 214 | - # Analysis failed — fall back to screengrab | |
| 215 | - capture = self._save_screengrab(fp, i, capture_idx, captures_dir, confidence) | |
| 216 | - captures.append(capture) | |
| 217 | - capture_idx += 1 | |
| 218 | - continue | |
| 219 | - | |
| 220 | - # Build DiagramResult | |
| 221 | - dtype = analysis.get("diagram_type", classification.get("diagram_type", "unknown")) | |
| 222 | - try: | |
| 223 | - diagram_type = DiagramType(dtype) | |
| 224 | - except ValueError: | |
| 225 | - diagram_type = DiagramType.unknown | |
| 226 | - | |
| 227 | - # Normalize relationships: llava sometimes returns dicts instead of strings | |
| 228 | - raw_rels = analysis.get("relationships") or [] | |
| 229 | - relationships = [] | |
| 230 | - for rel in raw_rels: | |
| 231 | - if isinstance(rel, str): | |
| 232 | - relationships.append(rel) | |
| 233 | - elif isinstance(rel, dict): | |
| 234 | - src = rel.get("source", rel.get("from", "?")) | |
| 235 | - dst = rel.get("destination", rel.get("to", "?")) | |
| 236 | - label = rel.get("label", rel.get("relationship", "")) | |
| 237 | - relationships.append( | |
| 238 | - f"{src} -> {dst}: {label}" if label else f"{src} -> {dst}" | |
| 239 | - ) | |
| 240 | - else: | |
| 241 | - relationships.append(str(rel)) | |
| 242 | - | |
| 243 | - # Normalize elements: llava may return dicts or nested lists | |
| 244 | - raw_elements = analysis.get("elements") or [] | |
| 245 | - elements = [] | |
| 246 | - for elem in raw_elements: | |
| 247 | - if isinstance(elem, str): | |
| 248 | - elements.append(elem) | |
| 249 | - elif isinstance(elem, dict): | |
| 250 | - name = elem.get("name", elem.get("element", "")) | |
| 251 | - etype = elem.get("type", elem.get("element_type", "")) | |
| 252 | - if name and etype: | |
| 253 | - elements.append(f"{etype}: {name}") | |
| 254 | - elif name: | |
| 255 | - elements.append(name) | |
| 256 | - else: | |
| 257 | - elements.append(json.dumps(elem)) | |
| 258 | - elif isinstance(elem, list): | |
| 259 | - elements.extend(str(e) for e in elem) | |
| 260 | - else: | |
| 261 | - elements.append(str(elem)) | |
| 262 | - | |
| 263 | - # Normalize text_content: llava may return dict instead of string | |
| 264 | - raw_text = analysis.get("text_content") | |
| 265 | - if isinstance(raw_text, dict): | |
| 266 | - parts = [] | |
| 267 | - for k, v in raw_text.items(): | |
| 268 | - if isinstance(v, list): | |
| 269 | - parts.append(f"{k}: {', '.join(str(x) for x in v)}") | |
| 270 | - else: | |
| 271 | - parts.append(f"{k}: {v}") | |
| 272 | - text_content = "\n".join(parts) | |
| 273 | - elif isinstance(raw_text, list): | |
| 274 | - text_content = "\n".join(str(x) for x in raw_text) | |
| 275 | - else: | |
| 276 | - text_content = raw_text | |
| 277 | - | |
| 278 | - try: | |
| 279 | - dr = DiagramResult( | |
| 280 | - frame_index=i, | |
| 281 | - diagram_type=diagram_type, | |
| 282 | - confidence=confidence, | |
| 283 | - description=analysis.get("description"), | |
| 284 | - text_content=text_content, | |
| 285 | - elements=elements, | |
| 286 | - relationships=relationships, | |
| 287 | - mermaid=analysis.get("mermaid"), | |
| 288 | - chart_data=analysis.get("chart_data"), | |
| 289 | - ) | |
| 290 | - except Exception as e: | |
| 291 | - logger.warning( | |
| 292 | - f"DiagramResult validation failed for frame {i}: {e}, " | |
| 293 | - "falling back to screengrab" | |
| 294 | - ) | |
| 295 | - capture = self._save_screengrab(fp, i, capture_idx, captures_dir, confidence) | |
| 296 | - captures.append(capture) | |
| 297 | - capture_idx += 1 | |
| 298 | - continue | |
| 299 | - | |
| 300 | - # Save outputs (story 3.4) | |
| 301 | - if diagrams_dir: | |
| 302 | - diagrams_dir.mkdir(parents=True, exist_ok=True) | |
| 303 | - prefix = f"diagram_{diagram_idx}" | |
| 304 | - | |
| 305 | - # Original frame | |
| 306 | - img_dest = diagrams_dir / f"{prefix}.jpg" | |
| 307 | - shutil.copy2(fp, img_dest) | |
| 308 | - dr.image_path = f"diagrams/{prefix}.jpg" | |
| 309 | - | |
| 310 | - # Mermaid source | |
| 311 | - if dr.mermaid: | |
| 312 | - mermaid_dest = diagrams_dir / f"{prefix}.mermaid" | |
| 313 | - mermaid_dest.write_text(dr.mermaid) | |
| 314 | - dr.mermaid_path = f"diagrams/{prefix}.mermaid" | |
| 315 | - | |
| 316 | - # Analysis JSON | |
| 317 | - json_dest = diagrams_dir / f"{prefix}.json" | |
| 318 | - json_dest.write_text(dr.model_dump_json(indent=2)) | |
| 319 | - | |
| 320 | - diagrams.append(dr) | |
| 321 | - diagram_idx += 1 | |
| 322 | - | |
| 323 | - else: | |
| 324 | - # Screengrab fallback (0.3 <= confidence < 0.7) | |
| 325 | - logger.info( | |
| 326 | - f"Frame {i}: uncertain (confidence {confidence:.2f}), saving as screengrab" | |
| 327 | - ) | |
| 328 | - capture = self._save_screengrab(fp, i, capture_idx, captures_dir, confidence) | |
| 329 | - captures.append(capture) | |
| 330 | - capture_idx += 1 | |
| 345 | + for idx, fp, classification in high_conf: | |
| 346 | + analysis = analysis_results.get(idx, {}) | |
| 347 | + confidence = float(classification.get("confidence", 0.0)) | |
| 348 | + | |
| 349 | + if not analysis: | |
| 350 | + # Analysis failed — fall back to screengrab with pre-fetched extraction | |
| 351 | + extraction = extraction_results.get(idx) | |
| 352 | + if extraction is None: | |
| 353 | + # Wasn't in med_conf, need to extract now | |
| 354 | + try: | |
| 355 | + extraction = self.extract_screenshot_knowledge(fp) | |
| 356 | + except Exception: | |
| 357 | + extraction = {} | |
| 358 | + capture = self._build_screengrab( | |
| 359 | + fp, idx, capture_idx, captures_dir, confidence, extraction | |
| 360 | + ) | |
| 361 | + captures.append(capture) | |
| 362 | + capture_idx += 1 | |
| 363 | + continue | |
| 364 | + | |
| 365 | + dr = self._build_diagram_result( | |
| 366 | + idx, fp, diagram_idx, diagrams_dir, confidence, classification, analysis | |
| 367 | + ) | |
| 368 | + if dr: | |
| 369 | + diagrams.append(dr) | |
| 370 | + diagram_idx += 1 | |
| 371 | + else: | |
| 372 | + capture = self._build_screengrab(fp, idx, capture_idx, captures_dir, confidence, {}) | |
| 373 | + captures.append(capture) | |
| 374 | + capture_idx += 1 | |
| 375 | + | |
| 376 | + for idx, fp, classification in med_conf: | |
| 377 | + confidence = float(classification.get("confidence", 0.0)) | |
| 378 | + extraction = extraction_results.get(idx, {}) | |
| 379 | + logger.info( | |
| 380 | + f"Frame {idx}: uncertain (confidence {confidence:.2f}), saving as screengrab" | |
| 381 | + ) | |
| 382 | + capture = self._build_screengrab( | |
| 383 | + fp, idx, capture_idx, captures_dir, confidence, extraction | |
| 384 | + ) | |
| 385 | + captures.append(capture) | |
| 386 | + capture_idx += 1 | |
| 387 | + | |
| 388 | + # Save cache | |
| 389 | + cache.save() | |
| 331 | 390 | |
| 332 | 391 | logger.info( |
| 333 | 392 | f"Diagram processing complete: {len(diagrams)} diagrams, {len(captures)} screengrabs" |
| 334 | 393 | ) |
| 335 | 394 | return diagrams, captures |
| 336 | 395 | |
| 337 | - def _save_screengrab( | |
| 396 | + def _build_diagram_result( | |
| 397 | + self, | |
| 398 | + frame_index: int, | |
| 399 | + frame_path: Path, | |
| 400 | + diagram_idx: int, | |
| 401 | + diagrams_dir: Optional[Path], | |
| 402 | + confidence: float, | |
| 403 | + classification: dict, | |
| 404 | + analysis: dict, | |
| 405 | + ) -> Optional[DiagramResult]: | |
| 406 | + """Build a DiagramResult from analysis data. Returns None on validation failure.""" | |
| 407 | + dtype = analysis.get("diagram_type", classification.get("diagram_type", "unknown")) | |
| 408 | + try: | |
| 409 | + diagram_type = DiagramType(dtype) | |
| 410 | + except ValueError: | |
| 411 | + diagram_type = DiagramType.unknown | |
| 412 | + | |
| 413 | + relationships = _normalize_relationships(analysis.get("relationships") or []) | |
| 414 | + elements = _normalize_elements(analysis.get("elements") or []) | |
| 415 | + text_content = _normalize_text_content(analysis.get("text_content")) | |
| 416 | + | |
| 417 | + try: | |
| 418 | + dr = DiagramResult( | |
| 419 | + frame_index=frame_index, | |
| 420 | + diagram_type=diagram_type, | |
| 421 | + confidence=confidence, | |
| 422 | + description=analysis.get("description"), | |
| 423 | + text_content=text_content, | |
| 424 | + elements=elements, | |
| 425 | + relationships=relationships, | |
| 426 | + mermaid=analysis.get("mermaid"), | |
| 427 | + chart_data=analysis.get("chart_data"), | |
| 428 | + ) | |
| 429 | + except Exception as e: | |
| 430 | + logger.warning(f"DiagramResult validation failed for frame {frame_index}: {e}") | |
| 431 | + return None | |
| 432 | + | |
| 433 | + if diagrams_dir: | |
| 434 | + diagrams_dir.mkdir(parents=True, exist_ok=True) | |
| 435 | + prefix = f"diagram_{diagram_idx}" | |
| 436 | + img_dest = diagrams_dir / f"{prefix}.jpg" | |
| 437 | + shutil.copy2(frame_path, img_dest) | |
| 438 | + dr.image_path = f"diagrams/{prefix}.jpg" | |
| 439 | + if dr.mermaid: | |
| 440 | + mermaid_dest = diagrams_dir / f"{prefix}.mermaid" | |
| 441 | + mermaid_dest.write_text(dr.mermaid) | |
| 442 | + dr.mermaid_path = f"diagrams/{prefix}.mermaid" | |
| 443 | + json_dest = diagrams_dir / f"{prefix}.json" | |
| 444 | + json_dest.write_text(dr.model_dump_json(indent=2)) | |
| 445 | + | |
| 446 | + return dr | |
| 447 | + | |
| 448 | + def _build_screengrab( | |
| 338 | 449 | self, |
| 339 | 450 | frame_path: Path, |
| 340 | 451 | frame_index: int, |
| 341 | 452 | capture_index: int, |
| 342 | 453 | captures_dir: Optional[Path], |
| 343 | 454 | confidence: float, |
| 455 | + extraction: dict, | |
| 344 | 456 | ) -> ScreenCapture: |
| 345 | - """Extract knowledge from a screenshot and save it.""" | |
| 346 | - # Try rich extraction first, fall back to caption-only | |
| 347 | - caption = "" | |
| 348 | - content_type = None | |
| 349 | - text_content = None | |
| 350 | - entities: List[str] = [] | |
| 351 | - topics: List[str] = [] | |
| 352 | - | |
| 353 | - try: | |
| 354 | - extraction = self.extract_screenshot_knowledge(frame_path) | |
| 355 | - if extraction: | |
| 356 | - caption = extraction.get("caption", "") | |
| 357 | - content_type = extraction.get("content_type") | |
| 358 | - text_content = extraction.get("text_content") | |
| 359 | - raw_entities = extraction.get("entities", []) | |
| 360 | - entities = [str(e) for e in raw_entities] if isinstance(raw_entities, list) else [] | |
| 361 | - raw_topics = extraction.get("topics", []) | |
| 362 | - topics = [str(t) for t in raw_topics] if isinstance(raw_topics, list) else [] | |
| 363 | - logger.info( | |
| 364 | - f"Frame {frame_index}: extracted " | |
| 365 | - f"{len(entities)} entities, " | |
| 366 | - f"{len(topics)} topics from {content_type}" | |
| 367 | - ) | |
| 368 | - except Exception as e: | |
| 369 | - logger.warning( | |
| 370 | - f"Screenshot extraction failed for frame " | |
| 371 | - f"{frame_index}: {e}, falling back to caption" | |
| 372 | - ) | |
| 373 | - try: | |
| 374 | - caption = self.caption_frame(frame_path) | |
| 375 | - except Exception as e2: | |
| 376 | - logger.warning(f"Caption also failed for frame {frame_index}: {e2}") | |
| 457 | + """Build a ScreenCapture from extraction data.""" | |
| 458 | + caption = extraction.get("caption", "") | |
| 459 | + content_type = extraction.get("content_type") | |
| 460 | + text_content = extraction.get("text_content") | |
| 461 | + raw_entities = extraction.get("entities", []) | |
| 462 | + entities = [str(e) for e in raw_entities] if isinstance(raw_entities, list) else [] | |
| 463 | + raw_topics = extraction.get("topics", []) | |
| 464 | + topics = [str(t) for t in raw_topics] if isinstance(raw_topics, list) else [] | |
| 465 | + | |
| 466 | + if extraction: | |
| 467 | + logger.info( | |
| 468 | + f"Frame {frame_index}: extracted " | |
| 469 | + f"{len(entities)} entities, " | |
| 470 | + f"{len(topics)} topics from {content_type}" | |
| 471 | + ) | |
| 377 | 472 | |
| 378 | 473 | sc = ScreenCapture( |
| 379 | 474 | frame_index=frame_index, |
| 380 | 475 | caption=caption, |
| 381 | 476 | confidence=confidence, |
| @@ -389,10 +484,80 @@ | ||
| 389 | 484 | captures_dir.mkdir(parents=True, exist_ok=True) |
| 390 | 485 | prefix = f"capture_{capture_index}" |
| 391 | 486 | img_dest = captures_dir / f"{prefix}.jpg" |
| 392 | 487 | shutil.copy2(frame_path, img_dest) |
| 393 | 488 | sc.image_path = f"captures/{prefix}.jpg" |
| 394 | - | |
| 395 | 489 | json_dest = captures_dir / f"{prefix}.json" |
| 396 | 490 | json_dest.write_text(sc.model_dump_json(indent=2)) |
| 397 | 491 | |
| 398 | 492 | return sc |
| 493 | + | |
| 494 | + def _save_screengrab( | |
| 495 | + self, | |
| 496 | + frame_path: Path, | |
| 497 | + frame_index: int, | |
| 498 | + capture_index: int, | |
| 499 | + captures_dir: Optional[Path], | |
| 500 | + confidence: float, | |
| 501 | + ) -> ScreenCapture: | |
| 502 | + """Legacy entry point — extracts then delegates to _build_screengrab.""" | |
| 503 | + try: | |
| 504 | + extraction = self.extract_screenshot_knowledge(frame_path) | |
| 505 | + except Exception as e: | |
| 506 | + logger.warning(f"Screenshot extraction failed for frame {frame_index}: {e}") | |
| 507 | + extraction = {} | |
| 508 | + return self._build_screengrab( | |
| 509 | + frame_path, frame_index, capture_index, captures_dir, confidence, extraction | |
| 510 | + ) | |
| 511 | + | |
| 512 | + | |
| 513 | +def _normalize_relationships(raw_rels: list) -> List[str]: | |
| 514 | + """Normalize relationships: llava sometimes returns dicts instead of strings.""" | |
| 515 | + relationships = [] | |
| 516 | + for rel in raw_rels: | |
| 517 | + if isinstance(rel, str): | |
| 518 | + relationships.append(rel) | |
| 519 | + elif isinstance(rel, dict): | |
| 520 | + src = rel.get("source", rel.get("from", "?")) | |
| 521 | + dst = rel.get("destination", rel.get("to", "?")) | |
| 522 | + label = rel.get("label", rel.get("relationship", "")) | |
| 523 | + relationships.append(f"{src} -> {dst}: {label}" if label else f"{src} -> {dst}") | |
| 524 | + else: | |
| 525 | + relationships.append(str(rel)) | |
| 526 | + return relationships | |
| 527 | + | |
| 528 | + | |
| 529 | +def _normalize_elements(raw_elements: list) -> List[str]: | |
| 530 | + """Normalize elements: llava may return dicts or nested lists.""" | |
| 531 | + elements = [] | |
| 532 | + for elem in raw_elements: | |
| 533 | + if isinstance(elem, str): | |
| 534 | + elements.append(elem) | |
| 535 | + elif isinstance(elem, dict): | |
| 536 | + name = elem.get("name", elem.get("element", "")) | |
| 537 | + etype = elem.get("type", elem.get("element_type", "")) | |
| 538 | + if name and etype: | |
| 539 | + elements.append(f"{etype}: {name}") | |
| 540 | + elif name: | |
| 541 | + elements.append(name) | |
| 542 | + else: | |
| 543 | + elements.append(json.dumps(elem)) | |
| 544 | + elif isinstance(elem, list): | |
| 545 | + elements.extend(str(e) for e in elem) | |
| 546 | + else: | |
| 547 | + elements.append(str(elem)) | |
| 548 | + return elements | |
| 549 | + | |
| 550 | + | |
| 551 | +def _normalize_text_content(raw_text) -> Optional[str]: | |
| 552 | + """Normalize text_content: llava may return dict instead of string.""" | |
| 553 | + if isinstance(raw_text, dict): | |
| 554 | + parts = [] | |
| 555 | + for k, v in raw_text.items(): | |
| 556 | + if isinstance(v, list): | |
| 557 | + parts.append(f"{k}: {', '.join(str(x) for x in v)}") | |
| 558 | + else: | |
| 559 | + parts.append(f"{k}: {v}") | |
| 560 | + return "\n".join(parts) | |
| 561 | + elif isinstance(raw_text, list): | |
| 562 | + return "\n".join(str(x) for x in raw_text) | |
| 563 | + return raw_text | |
| 399 | 564 |
| --- video_processor/analyzers/diagram_analyzer.py | |
| +++ video_processor/analyzers/diagram_analyzer.py | |
| @@ -1,19 +1,24 @@ | |
| 1 | """Diagram analysis using vision model classification and single-pass extraction.""" |
| 2 | |
| 3 | import json |
| 4 | import logging |
| 5 | import shutil |
| 6 | from pathlib import Path |
| 7 | from typing import List, Optional, Tuple, Union |
| 8 | |
| 9 | from tqdm import tqdm |
| 10 | |
| 11 | from video_processor.models import DiagramResult, DiagramType, ScreenCapture |
| 12 | from video_processor.providers.manager import ProviderManager |
| 13 | |
| 14 | logger = logging.getLogger(__name__) |
| 15 | |
| 16 | # Classification prompt — returns JSON |
| 17 | _CLASSIFY_PROMPT = """\ |
| 18 | Examine this image from a video recording. Your job is to identify ONLY shared content \ |
| 19 | — slides, presentations, charts, diagrams, documents, screen shares, whiteboard content, \ |
| @@ -105,21 +110,56 @@ | |
| 105 | return json.loads(cleaned[start:end]) |
| 106 | except json.JSONDecodeError: |
| 107 | pass |
| 108 | return None |
| 109 | |
| 110 | |
| 111 | class DiagramAnalyzer: |
| 112 | """Vision model-based diagram detection and analysis.""" |
| 113 | |
| 114 | def __init__( |
| 115 | self, |
| 116 | provider_manager: Optional[ProviderManager] = None, |
| 117 | confidence_threshold: float = 0.3, |
| 118 | ): |
| 119 | self.pm = provider_manager or ProviderManager() |
| 120 | self.confidence_threshold = confidence_threshold |
| 121 | |
| 122 | def classify_frame(self, image_path: Union[str, Path]) -> dict: |
| 123 | """ |
| 124 | Classify a single frame using vision model. |
| 125 | |
| @@ -163,219 +203,274 @@ | |
| 163 | def process_frames( |
| 164 | self, |
| 165 | frame_paths: List[Union[str, Path]], |
| 166 | diagrams_dir: Optional[Path] = None, |
| 167 | captures_dir: Optional[Path] = None, |
| 168 | ) -> Tuple[List[DiagramResult], List[ScreenCapture]]: |
| 169 | """ |
| 170 | Process a list of extracted frames: classify, analyze diagrams, screengrab fallback. |
| 171 | |
| 172 | Thresholds: |
| 173 | - confidence >= 0.7 → full diagram analysis (story 3.2) |
| 174 | - 0.3 <= confidence < 0.7 → screengrab fallback (story 3.3) |
| 175 | - confidence < 0.3 → skip |
| 176 | |
| 177 | Returns (diagrams, screen_captures). |
| 178 | """ |
| 179 | diagrams: List[DiagramResult] = [] |
| 180 | captures: List[ScreenCapture] = [] |
| 181 | diagram_idx = 0 |
| 182 | capture_idx = 0 |
| 183 | |
| 184 | for i, fp in enumerate(tqdm(frame_paths, desc="Analyzing frames", unit="frame")): |
| 185 | fp = Path(fp) |
| 186 | logger.info(f"Classifying frame {i}/{len(frame_paths)}: {fp.name}") |
| 187 | |
| 188 | try: |
| 189 | classification = self.classify_frame(fp) |
| 190 | except Exception as e: |
| 191 | logger.warning(f"Classification failed for frame {i}: {e}") |
| 192 | continue |
| 193 | |
| 194 | confidence = float(classification.get("confidence", 0.0)) |
| 195 | |
| 196 | if confidence < self.confidence_threshold: |
| 197 | logger.debug(f"Frame {i}: confidence {confidence:.2f} below threshold, skipping") |
| 198 | continue |
| 199 | |
| 200 | if confidence >= 0.7: |
| 201 | # Full diagram analysis |
| 202 | logger.info( |
| 203 | f"Frame {i}: diagram detected (confidence {confidence:.2f}), analyzing..." |
| 204 | ) |
| 205 | try: |
| 206 | analysis = self.analyze_diagram_single_pass(fp) |
| 207 | except Exception as e: |
| 208 | logger.warning( |
| 209 | f"Diagram analysis failed for frame {i}: {e}, falling back to screengrab" |
| 210 | ) |
| 211 | analysis = {} |
| 212 | |
| 213 | if not analysis: |
| 214 | # Analysis failed — fall back to screengrab |
| 215 | capture = self._save_screengrab(fp, i, capture_idx, captures_dir, confidence) |
| 216 | captures.append(capture) |
| 217 | capture_idx += 1 |
| 218 | continue |
| 219 | |
| 220 | # Build DiagramResult |
| 221 | dtype = analysis.get("diagram_type", classification.get("diagram_type", "unknown")) |
| 222 | try: |
| 223 | diagram_type = DiagramType(dtype) |
| 224 | except ValueError: |
| 225 | diagram_type = DiagramType.unknown |
| 226 | |
| 227 | # Normalize relationships: llava sometimes returns dicts instead of strings |
| 228 | raw_rels = analysis.get("relationships") or [] |
| 229 | relationships = [] |
| 230 | for rel in raw_rels: |
| 231 | if isinstance(rel, str): |
| 232 | relationships.append(rel) |
| 233 | elif isinstance(rel, dict): |
| 234 | src = rel.get("source", rel.get("from", "?")) |
| 235 | dst = rel.get("destination", rel.get("to", "?")) |
| 236 | label = rel.get("label", rel.get("relationship", "")) |
| 237 | relationships.append( |
| 238 | f"{src} -> {dst}: {label}" if label else f"{src} -> {dst}" |
| 239 | ) |
| 240 | else: |
| 241 | relationships.append(str(rel)) |
| 242 | |
| 243 | # Normalize elements: llava may return dicts or nested lists |
| 244 | raw_elements = analysis.get("elements") or [] |
| 245 | elements = [] |
| 246 | for elem in raw_elements: |
| 247 | if isinstance(elem, str): |
| 248 | elements.append(elem) |
| 249 | elif isinstance(elem, dict): |
| 250 | name = elem.get("name", elem.get("element", "")) |
| 251 | etype = elem.get("type", elem.get("element_type", "")) |
| 252 | if name and etype: |
| 253 | elements.append(f"{etype}: {name}") |
| 254 | elif name: |
| 255 | elements.append(name) |
| 256 | else: |
| 257 | elements.append(json.dumps(elem)) |
| 258 | elif isinstance(elem, list): |
| 259 | elements.extend(str(e) for e in elem) |
| 260 | else: |
| 261 | elements.append(str(elem)) |
| 262 | |
| 263 | # Normalize text_content: llava may return dict instead of string |
| 264 | raw_text = analysis.get("text_content") |
| 265 | if isinstance(raw_text, dict): |
| 266 | parts = [] |
| 267 | for k, v in raw_text.items(): |
| 268 | if isinstance(v, list): |
| 269 | parts.append(f"{k}: {', '.join(str(x) for x in v)}") |
| 270 | else: |
| 271 | parts.append(f"{k}: {v}") |
| 272 | text_content = "\n".join(parts) |
| 273 | elif isinstance(raw_text, list): |
| 274 | text_content = "\n".join(str(x) for x in raw_text) |
| 275 | else: |
| 276 | text_content = raw_text |
| 277 | |
| 278 | try: |
| 279 | dr = DiagramResult( |
| 280 | frame_index=i, |
| 281 | diagram_type=diagram_type, |
| 282 | confidence=confidence, |
| 283 | description=analysis.get("description"), |
| 284 | text_content=text_content, |
| 285 | elements=elements, |
| 286 | relationships=relationships, |
| 287 | mermaid=analysis.get("mermaid"), |
| 288 | chart_data=analysis.get("chart_data"), |
| 289 | ) |
| 290 | except Exception as e: |
| 291 | logger.warning( |
| 292 | f"DiagramResult validation failed for frame {i}: {e}, " |
| 293 | "falling back to screengrab" |
| 294 | ) |
| 295 | capture = self._save_screengrab(fp, i, capture_idx, captures_dir, confidence) |
| 296 | captures.append(capture) |
| 297 | capture_idx += 1 |
| 298 | continue |
| 299 | |
| 300 | # Save outputs (story 3.4) |
| 301 | if diagrams_dir: |
| 302 | diagrams_dir.mkdir(parents=True, exist_ok=True) |
| 303 | prefix = f"diagram_{diagram_idx}" |
| 304 | |
| 305 | # Original frame |
| 306 | img_dest = diagrams_dir / f"{prefix}.jpg" |
| 307 | shutil.copy2(fp, img_dest) |
| 308 | dr.image_path = f"diagrams/{prefix}.jpg" |
| 309 | |
| 310 | # Mermaid source |
| 311 | if dr.mermaid: |
| 312 | mermaid_dest = diagrams_dir / f"{prefix}.mermaid" |
| 313 | mermaid_dest.write_text(dr.mermaid) |
| 314 | dr.mermaid_path = f"diagrams/{prefix}.mermaid" |
| 315 | |
| 316 | # Analysis JSON |
| 317 | json_dest = diagrams_dir / f"{prefix}.json" |
| 318 | json_dest.write_text(dr.model_dump_json(indent=2)) |
| 319 | |
| 320 | diagrams.append(dr) |
| 321 | diagram_idx += 1 |
| 322 | |
| 323 | else: |
| 324 | # Screengrab fallback (0.3 <= confidence < 0.7) |
| 325 | logger.info( |
| 326 | f"Frame {i}: uncertain (confidence {confidence:.2f}), saving as screengrab" |
| 327 | ) |
| 328 | capture = self._save_screengrab(fp, i, capture_idx, captures_dir, confidence) |
| 329 | captures.append(capture) |
| 330 | capture_idx += 1 |
| 331 | |
| 332 | logger.info( |
| 333 | f"Diagram processing complete: {len(diagrams)} diagrams, {len(captures)} screengrabs" |
| 334 | ) |
| 335 | return diagrams, captures |
| 336 | |
| 337 | def _save_screengrab( |
| 338 | self, |
| 339 | frame_path: Path, |
| 340 | frame_index: int, |
| 341 | capture_index: int, |
| 342 | captures_dir: Optional[Path], |
| 343 | confidence: float, |
| 344 | ) -> ScreenCapture: |
| 345 | """Extract knowledge from a screenshot and save it.""" |
| 346 | # Try rich extraction first, fall back to caption-only |
| 347 | caption = "" |
| 348 | content_type = None |
| 349 | text_content = None |
| 350 | entities: List[str] = [] |
| 351 | topics: List[str] = [] |
| 352 | |
| 353 | try: |
| 354 | extraction = self.extract_screenshot_knowledge(frame_path) |
| 355 | if extraction: |
| 356 | caption = extraction.get("caption", "") |
| 357 | content_type = extraction.get("content_type") |
| 358 | text_content = extraction.get("text_content") |
| 359 | raw_entities = extraction.get("entities", []) |
| 360 | entities = [str(e) for e in raw_entities] if isinstance(raw_entities, list) else [] |
| 361 | raw_topics = extraction.get("topics", []) |
| 362 | topics = [str(t) for t in raw_topics] if isinstance(raw_topics, list) else [] |
| 363 | logger.info( |
| 364 | f"Frame {frame_index}: extracted " |
| 365 | f"{len(entities)} entities, " |
| 366 | f"{len(topics)} topics from {content_type}" |
| 367 | ) |
| 368 | except Exception as e: |
| 369 | logger.warning( |
| 370 | f"Screenshot extraction failed for frame " |
| 371 | f"{frame_index}: {e}, falling back to caption" |
| 372 | ) |
| 373 | try: |
| 374 | caption = self.caption_frame(frame_path) |
| 375 | except Exception as e2: |
| 376 | logger.warning(f"Caption also failed for frame {frame_index}: {e2}") |
| 377 | |
| 378 | sc = ScreenCapture( |
| 379 | frame_index=frame_index, |
| 380 | caption=caption, |
| 381 | confidence=confidence, |
| @@ -389,10 +484,80 @@ | |
| 389 | captures_dir.mkdir(parents=True, exist_ok=True) |
| 390 | prefix = f"capture_{capture_index}" |
| 391 | img_dest = captures_dir / f"{prefix}.jpg" |
| 392 | shutil.copy2(frame_path, img_dest) |
| 393 | sc.image_path = f"captures/{prefix}.jpg" |
| 394 | |
| 395 | json_dest = captures_dir / f"{prefix}.json" |
| 396 | json_dest.write_text(sc.model_dump_json(indent=2)) |
| 397 | |
| 398 | return sc |
| 399 |
| --- video_processor/analyzers/diagram_analyzer.py | |
| +++ video_processor/analyzers/diagram_analyzer.py | |
| @@ -1,19 +1,24 @@ | |
| 1 | """Diagram analysis using vision model classification and single-pass extraction.""" |
| 2 | |
| 3 | import hashlib |
| 4 | import json |
| 5 | import logging |
| 6 | import shutil |
| 7 | from concurrent.futures import ThreadPoolExecutor, as_completed |
| 8 | from pathlib import Path |
| 9 | from typing import List, Optional, Tuple, Union |
| 10 | |
| 11 | from tqdm import tqdm |
| 12 | |
| 13 | from video_processor.models import DiagramResult, DiagramType, ScreenCapture |
| 14 | from video_processor.providers.manager import ProviderManager |
| 15 | |
| 16 | logger = logging.getLogger(__name__) |
| 17 | |
| 18 | # Default max workers for parallel frame analysis |
| 19 | _DEFAULT_MAX_WORKERS = 4 |
| 20 | |
| 21 | # Classification prompt — returns JSON |
| 22 | _CLASSIFY_PROMPT = """\ |
| 23 | Examine this image from a video recording. Your job is to identify ONLY shared content \ |
| 24 | — slides, presentations, charts, diagrams, documents, screen shares, whiteboard content, \ |
| @@ -105,21 +110,56 @@ | |
| 110 | return json.loads(cleaned[start:end]) |
| 111 | except json.JSONDecodeError: |
| 112 | pass |
| 113 | return None |
| 114 | |
| 115 | |
| 116 | def _frame_hash(path: Path) -> str: |
| 117 | """Content-based hash for a frame file (first 8KB + size for speed).""" |
| 118 | h = hashlib.sha256() |
| 119 | h.update(str(path.stat().st_size).encode()) |
| 120 | with open(path, "rb") as f: |
| 121 | h.update(f.read(8192)) |
| 122 | return h.hexdigest()[:16] |
| 123 | |
| 124 | |
| 125 | class _FrameCache: |
| 126 | """Simple JSON file cache for frame classification/analysis results.""" |
| 127 | |
| 128 | def __init__(self, cache_path: Optional[Path]): |
| 129 | self._path = cache_path |
| 130 | self._data: dict = {} |
| 131 | if cache_path and cache_path.exists(): |
| 132 | try: |
| 133 | self._data = json.loads(cache_path.read_text()) |
| 134 | except (json.JSONDecodeError, OSError): |
| 135 | self._data = {} |
| 136 | |
| 137 | def get(self, key: str) -> Optional[dict]: |
| 138 | return self._data.get(key) |
| 139 | |
| 140 | def set(self, key: str, value: dict) -> None: |
| 141 | self._data[key] = value |
| 142 | |
| 143 | def save(self) -> None: |
| 144 | if self._path: |
| 145 | self._path.parent.mkdir(parents=True, exist_ok=True) |
| 146 | self._path.write_text(json.dumps(self._data, indent=2)) |
| 147 | |
| 148 | |
| 149 | class DiagramAnalyzer: |
| 150 | """Vision model-based diagram detection and analysis.""" |
| 151 | |
| 152 | def __init__( |
| 153 | self, |
| 154 | provider_manager: Optional[ProviderManager] = None, |
| 155 | confidence_threshold: float = 0.3, |
| 156 | max_workers: int = _DEFAULT_MAX_WORKERS, |
| 157 | ): |
| 158 | self.pm = provider_manager or ProviderManager() |
| 159 | self.confidence_threshold = confidence_threshold |
| 160 | self.max_workers = max_workers |
| 161 | |
| 162 | def classify_frame(self, image_path: Union[str, Path]) -> dict: |
| 163 | """ |
| 164 | Classify a single frame using vision model. |
| 165 | |
| @@ -163,219 +203,274 @@ | |
| 203 | def process_frames( |
| 204 | self, |
| 205 | frame_paths: List[Union[str, Path]], |
| 206 | diagrams_dir: Optional[Path] = None, |
| 207 | captures_dir: Optional[Path] = None, |
| 208 | cache_dir: Optional[Path] = None, |
| 209 | ) -> Tuple[List[DiagramResult], List[ScreenCapture]]: |
| 210 | """ |
| 211 | Process a list of extracted frames: classify, analyze diagrams, screengrab fallback. |
| 212 | |
| 213 | Classification and analysis run in parallel using a thread pool. Results are |
| 214 | cached by frame content hash so re-runs skip already-analyzed frames. |
| 215 | |
| 216 | Thresholds: |
| 217 | - confidence >= 0.7 → full diagram analysis (story 3.2) |
| 218 | - 0.3 <= confidence < 0.7 → screengrab fallback (story 3.3) |
| 219 | - confidence < 0.3 → skip |
| 220 | |
| 221 | Returns (diagrams, screen_captures). |
| 222 | """ |
| 223 | # Set up cache |
| 224 | cache_path = None |
| 225 | if cache_dir: |
| 226 | cache_path = cache_dir / "frame_analysis_cache.json" |
| 227 | elif diagrams_dir: |
| 228 | cache_path = diagrams_dir.parent / "frame_analysis_cache.json" |
| 229 | cache = _FrameCache(cache_path) |
| 230 | |
| 231 | frame_paths = [Path(fp) for fp in frame_paths] |
| 232 | |
| 233 | # --- Phase 1: Parallel classification --- |
| 234 | classifications: dict[int, dict] = {} |
| 235 | cache_hits = 0 |
| 236 | |
| 237 | def _classify_one(idx: int, fp: Path) -> Tuple[int, dict, bool]: |
| 238 | fhash = _frame_hash(fp) |
| 239 | cached = cache.get(f"classify:{fhash}") |
| 240 | if cached is not None: |
| 241 | return idx, cached, True |
| 242 | try: |
| 243 | result = self.classify_frame(fp) |
| 244 | except Exception as e: |
| 245 | logger.warning(f"Classification failed for frame {idx}: {e}") |
| 246 | result = {"is_diagram": False, "confidence": 0.0} |
| 247 | cache.set(f"classify:{fhash}", result) |
| 248 | return idx, result, False |
| 249 | |
| 250 | workers = min(self.max_workers, len(frame_paths)) if frame_paths else 1 |
| 251 | with ThreadPoolExecutor(max_workers=workers) as pool: |
| 252 | futures = {pool.submit(_classify_one, i, fp): i for i, fp in enumerate(frame_paths)} |
| 253 | pbar = tqdm( |
| 254 | as_completed(futures), |
| 255 | total=len(futures), |
| 256 | desc="Classifying frames", |
| 257 | unit="frame", |
| 258 | ) |
| 259 | for future in pbar: |
| 260 | idx, result, from_cache = future.result() |
| 261 | classifications[idx] = result |
| 262 | if from_cache: |
| 263 | cache_hits += 1 |
| 264 | |
| 265 | if cache_hits: |
| 266 | logger.info(f"Classification: {cache_hits}/{len(frame_paths)} from cache") |
| 267 | |
| 268 | # --- Phase 2: Parallel analysis/extraction for qualifying frames --- |
| 269 | high_conf = [] # (idx, fp, classification) |
| 270 | med_conf = [] |
| 271 | |
| 272 | for idx in sorted(classifications): |
| 273 | conf = float(classifications[idx].get("confidence", 0.0)) |
| 274 | if conf >= 0.7: |
| 275 | high_conf.append((idx, frame_paths[idx], classifications[idx])) |
| 276 | elif conf >= self.confidence_threshold: |
| 277 | med_conf.append((idx, frame_paths[idx], classifications[idx])) |
| 278 | |
| 279 | # Analyze high-confidence diagrams in parallel |
| 280 | analysis_results: dict[int, dict] = {} |
| 281 | |
| 282 | def _analyze_one(idx: int, fp: Path) -> Tuple[int, dict, bool]: |
| 283 | fhash = _frame_hash(fp) |
| 284 | cached = cache.get(f"analyze:{fhash}") |
| 285 | if cached is not None: |
| 286 | return idx, cached, True |
| 287 | try: |
| 288 | result = self.analyze_diagram_single_pass(fp) |
| 289 | except Exception as e: |
| 290 | logger.warning(f"Diagram analysis failed for frame {idx}: {e}") |
| 291 | result = {} |
| 292 | cache.set(f"analyze:{fhash}", result) |
| 293 | return idx, result, False |
| 294 | |
| 295 | if high_conf: |
| 296 | workers = min(self.max_workers, len(high_conf)) |
| 297 | with ThreadPoolExecutor(max_workers=workers) as pool: |
| 298 | futures = {pool.submit(_analyze_one, idx, fp): idx for idx, fp, _ in high_conf} |
| 299 | pbar = tqdm( |
| 300 | as_completed(futures), |
| 301 | total=len(futures), |
| 302 | desc="Analyzing diagrams", |
| 303 | unit="diagram", |
| 304 | ) |
| 305 | for future in pbar: |
| 306 | idx, result, _ = future.result() |
| 307 | analysis_results[idx] = result |
| 308 | |
| 309 | # Extract knowledge from medium-confidence frames in parallel |
| 310 | extraction_results: dict[int, dict] = {} |
| 311 | |
| 312 | def _extract_one(idx: int, fp: Path) -> Tuple[int, dict, bool]: |
| 313 | fhash = _frame_hash(fp) |
| 314 | cached = cache.get(f"extract:{fhash}") |
| 315 | if cached is not None: |
| 316 | return idx, cached, True |
| 317 | try: |
| 318 | result = self.extract_screenshot_knowledge(fp) |
| 319 | except Exception as e: |
| 320 | logger.warning(f"Screenshot extraction failed for frame {idx}: {e}") |
| 321 | result = {} |
| 322 | cache.set(f"extract:{fhash}", result) |
| 323 | return idx, result, False |
| 324 | |
| 325 | if med_conf: |
| 326 | workers = min(self.max_workers, len(med_conf)) |
| 327 | with ThreadPoolExecutor(max_workers=workers) as pool: |
| 328 | futures = {pool.submit(_extract_one, idx, fp): idx for idx, fp, _ in med_conf} |
| 329 | pbar = tqdm( |
| 330 | as_completed(futures), |
| 331 | total=len(futures), |
| 332 | desc="Extracting screenshots", |
| 333 | unit="capture", |
| 334 | ) |
| 335 | for future in pbar: |
| 336 | idx, result, _ = future.result() |
| 337 | extraction_results[idx] = result |
| 338 | |
| 339 | # --- Phase 3: Build results (sequential for stable ordering) --- |
| 340 | diagrams: List[DiagramResult] = [] |
| 341 | captures: List[ScreenCapture] = [] |
| 342 | diagram_idx = 0 |
| 343 | capture_idx = 0 |
| 344 | |
| 345 | for idx, fp, classification in high_conf: |
| 346 | analysis = analysis_results.get(idx, {}) |
| 347 | confidence = float(classification.get("confidence", 0.0)) |
| 348 | |
| 349 | if not analysis: |
| 350 | # Analysis failed — fall back to screengrab with pre-fetched extraction |
| 351 | extraction = extraction_results.get(idx) |
| 352 | if extraction is None: |
| 353 | # Wasn't in med_conf, need to extract now |
| 354 | try: |
| 355 | extraction = self.extract_screenshot_knowledge(fp) |
| 356 | except Exception: |
| 357 | extraction = {} |
| 358 | capture = self._build_screengrab( |
| 359 | fp, idx, capture_idx, captures_dir, confidence, extraction |
| 360 | ) |
| 361 | captures.append(capture) |
| 362 | capture_idx += 1 |
| 363 | continue |
| 364 | |
| 365 | dr = self._build_diagram_result( |
| 366 | idx, fp, diagram_idx, diagrams_dir, confidence, classification, analysis |
| 367 | ) |
| 368 | if dr: |
| 369 | diagrams.append(dr) |
| 370 | diagram_idx += 1 |
| 371 | else: |
| 372 | capture = self._build_screengrab(fp, idx, capture_idx, captures_dir, confidence, {}) |
| 373 | captures.append(capture) |
| 374 | capture_idx += 1 |
| 375 | |
| 376 | for idx, fp, classification in med_conf: |
| 377 | confidence = float(classification.get("confidence", 0.0)) |
| 378 | extraction = extraction_results.get(idx, {}) |
| 379 | logger.info( |
| 380 | f"Frame {idx}: uncertain (confidence {confidence:.2f}), saving as screengrab" |
| 381 | ) |
| 382 | capture = self._build_screengrab( |
| 383 | fp, idx, capture_idx, captures_dir, confidence, extraction |
| 384 | ) |
| 385 | captures.append(capture) |
| 386 | capture_idx += 1 |
| 387 | |
| 388 | # Save cache |
| 389 | cache.save() |
| 390 | |
| 391 | logger.info( |
| 392 | f"Diagram processing complete: {len(diagrams)} diagrams, {len(captures)} screengrabs" |
| 393 | ) |
| 394 | return diagrams, captures |
| 395 | |
| 396 | def _build_diagram_result( |
| 397 | self, |
| 398 | frame_index: int, |
| 399 | frame_path: Path, |
| 400 | diagram_idx: int, |
| 401 | diagrams_dir: Optional[Path], |
| 402 | confidence: float, |
| 403 | classification: dict, |
| 404 | analysis: dict, |
| 405 | ) -> Optional[DiagramResult]: |
| 406 | """Build a DiagramResult from analysis data. Returns None on validation failure.""" |
| 407 | dtype = analysis.get("diagram_type", classification.get("diagram_type", "unknown")) |
| 408 | try: |
| 409 | diagram_type = DiagramType(dtype) |
| 410 | except ValueError: |
| 411 | diagram_type = DiagramType.unknown |
| 412 | |
| 413 | relationships = _normalize_relationships(analysis.get("relationships") or []) |
| 414 | elements = _normalize_elements(analysis.get("elements") or []) |
| 415 | text_content = _normalize_text_content(analysis.get("text_content")) |
| 416 | |
| 417 | try: |
| 418 | dr = DiagramResult( |
| 419 | frame_index=frame_index, |
| 420 | diagram_type=diagram_type, |
| 421 | confidence=confidence, |
| 422 | description=analysis.get("description"), |
| 423 | text_content=text_content, |
| 424 | elements=elements, |
| 425 | relationships=relationships, |
| 426 | mermaid=analysis.get("mermaid"), |
| 427 | chart_data=analysis.get("chart_data"), |
| 428 | ) |
| 429 | except Exception as e: |
| 430 | logger.warning(f"DiagramResult validation failed for frame {frame_index}: {e}") |
| 431 | return None |
| 432 | |
| 433 | if diagrams_dir: |
| 434 | diagrams_dir.mkdir(parents=True, exist_ok=True) |
| 435 | prefix = f"diagram_{diagram_idx}" |
| 436 | img_dest = diagrams_dir / f"{prefix}.jpg" |
| 437 | shutil.copy2(frame_path, img_dest) |
| 438 | dr.image_path = f"diagrams/{prefix}.jpg" |
| 439 | if dr.mermaid: |
| 440 | mermaid_dest = diagrams_dir / f"{prefix}.mermaid" |
| 441 | mermaid_dest.write_text(dr.mermaid) |
| 442 | dr.mermaid_path = f"diagrams/{prefix}.mermaid" |
| 443 | json_dest = diagrams_dir / f"{prefix}.json" |
| 444 | json_dest.write_text(dr.model_dump_json(indent=2)) |
| 445 | |
| 446 | return dr |
| 447 | |
| 448 | def _build_screengrab( |
| 449 | self, |
| 450 | frame_path: Path, |
| 451 | frame_index: int, |
| 452 | capture_index: int, |
| 453 | captures_dir: Optional[Path], |
| 454 | confidence: float, |
| 455 | extraction: dict, |
| 456 | ) -> ScreenCapture: |
| 457 | """Build a ScreenCapture from extraction data.""" |
| 458 | caption = extraction.get("caption", "") |
| 459 | content_type = extraction.get("content_type") |
| 460 | text_content = extraction.get("text_content") |
| 461 | raw_entities = extraction.get("entities", []) |
| 462 | entities = [str(e) for e in raw_entities] if isinstance(raw_entities, list) else [] |
| 463 | raw_topics = extraction.get("topics", []) |
| 464 | topics = [str(t) for t in raw_topics] if isinstance(raw_topics, list) else [] |
| 465 | |
| 466 | if extraction: |
| 467 | logger.info( |
| 468 | f"Frame {frame_index}: extracted " |
| 469 | f"{len(entities)} entities, " |
| 470 | f"{len(topics)} topics from {content_type}" |
| 471 | ) |
| 472 | |
| 473 | sc = ScreenCapture( |
| 474 | frame_index=frame_index, |
| 475 | caption=caption, |
| 476 | confidence=confidence, |
| @@ -389,10 +484,80 @@ | |
| 484 | captures_dir.mkdir(parents=True, exist_ok=True) |
| 485 | prefix = f"capture_{capture_index}" |
| 486 | img_dest = captures_dir / f"{prefix}.jpg" |
| 487 | shutil.copy2(frame_path, img_dest) |
| 488 | sc.image_path = f"captures/{prefix}.jpg" |
| 489 | json_dest = captures_dir / f"{prefix}.json" |
| 490 | json_dest.write_text(sc.model_dump_json(indent=2)) |
| 491 | |
| 492 | return sc |
| 493 | |
| 494 | def _save_screengrab( |
| 495 | self, |
| 496 | frame_path: Path, |
| 497 | frame_index: int, |
| 498 | capture_index: int, |
| 499 | captures_dir: Optional[Path], |
| 500 | confidence: float, |
| 501 | ) -> ScreenCapture: |
| 502 | """Legacy entry point — extracts then delegates to _build_screengrab.""" |
| 503 | try: |
| 504 | extraction = self.extract_screenshot_knowledge(frame_path) |
| 505 | except Exception as e: |
| 506 | logger.warning(f"Screenshot extraction failed for frame {frame_index}: {e}") |
| 507 | extraction = {} |
| 508 | return self._build_screengrab( |
| 509 | frame_path, frame_index, capture_index, captures_dir, confidence, extraction |
| 510 | ) |
| 511 | |
| 512 | |
| 513 | def _normalize_relationships(raw_rels: list) -> List[str]: |
| 514 | """Normalize relationships: llava sometimes returns dicts instead of strings.""" |
| 515 | relationships = [] |
| 516 | for rel in raw_rels: |
| 517 | if isinstance(rel, str): |
| 518 | relationships.append(rel) |
| 519 | elif isinstance(rel, dict): |
| 520 | src = rel.get("source", rel.get("from", "?")) |
| 521 | dst = rel.get("destination", rel.get("to", "?")) |
| 522 | label = rel.get("label", rel.get("relationship", "")) |
| 523 | relationships.append(f"{src} -> {dst}: {label}" if label else f"{src} -> {dst}") |
| 524 | else: |
| 525 | relationships.append(str(rel)) |
| 526 | return relationships |
| 527 | |
| 528 | |
| 529 | def _normalize_elements(raw_elements: list) -> List[str]: |
| 530 | """Normalize elements: llava may return dicts or nested lists.""" |
| 531 | elements = [] |
| 532 | for elem in raw_elements: |
| 533 | if isinstance(elem, str): |
| 534 | elements.append(elem) |
| 535 | elif isinstance(elem, dict): |
| 536 | name = elem.get("name", elem.get("element", "")) |
| 537 | etype = elem.get("type", elem.get("element_type", "")) |
| 538 | if name and etype: |
| 539 | elements.append(f"{etype}: {name}") |
| 540 | elif name: |
| 541 | elements.append(name) |
| 542 | else: |
| 543 | elements.append(json.dumps(elem)) |
| 544 | elif isinstance(elem, list): |
| 545 | elements.extend(str(e) for e in elem) |
| 546 | else: |
| 547 | elements.append(str(elem)) |
| 548 | return elements |
| 549 | |
| 550 | |
| 551 | def _normalize_text_content(raw_text) -> Optional[str]: |
| 552 | """Normalize text_content: llava may return dict instead of string.""" |
| 553 | if isinstance(raw_text, dict): |
| 554 | parts = [] |
| 555 | for k, v in raw_text.items(): |
| 556 | if isinstance(v, list): |
| 557 | parts.append(f"{k}: {', '.join(str(x) for x in v)}") |
| 558 | else: |
| 559 | parts.append(f"{k}: {v}") |
| 560 | return "\n".join(parts) |
| 561 | elif isinstance(raw_text, list): |
| 562 | return "\n".join(str(x) for x in raw_text) |
| 563 | return raw_text |
| 564 |