|
ccf1b1a…
|
leo
|
1 |
"""Diagram analysis using vision model classification and single-pass extraction.""" |
|
ccf1b1a…
|
leo
|
2 |
|
|
6febc3f…
|
noreply
|
3 |
import hashlib |
|
ccf1b1a…
|
leo
|
4 |
import json |
|
ccf1b1a…
|
leo
|
5 |
import logging |
|
ccf1b1a…
|
leo
|
6 |
import shutil |
|
6febc3f…
|
noreply
|
7 |
from concurrent.futures import ThreadPoolExecutor, as_completed |
|
ccf1b1a…
|
leo
|
8 |
from pathlib import Path |
|
ccf1b1a…
|
leo
|
9 |
from typing import List, Optional, Tuple, Union |
|
287a3bb…
|
leo
|
10 |
|
|
287a3bb…
|
leo
|
11 |
from tqdm import tqdm |
|
ccf1b1a…
|
leo
|
12 |
|
|
ccf1b1a…
|
leo
|
13 |
from video_processor.models import DiagramResult, DiagramType, ScreenCapture |
|
ccf1b1a…
|
leo
|
14 |
from video_processor.providers.manager import ProviderManager |
|
ccf1b1a…
|
leo
|
15 |
|
|
ccf1b1a…
|
leo
|
16 |
# Module-level logger; configured by the host application.
logger = logging.getLogger(__name__)

# Default max workers for parallel frame analysis (thread pool size used by
# DiagramAnalyzer when the caller does not override max_workers).
_DEFAULT_MAX_WORKERS = 4
|
287a3bb…
|
leo
|
20 |
|
|
ccf1b1a…
|
leo
|
21 |
# Classification prompt — asks the vision model whether the frame contains
# shared content (vs. people/webcam views) and returns a JSON verdict.
_CLASSIFY_PROMPT = """\
Examine this image from a video recording. Your job is to identify ONLY shared content \
— slides, presentations, charts, diagrams, documents, screen shares, whiteboard content, \
architecture drawings, tables, or other structured visual information worth capturing.

IMPORTANT: If the image primarily shows a person, people, webcam feeds, faces, or a \
video conference participant view, return confidence 0.0. We are ONLY interested in \
shared/presented content, NOT people or camera views.

Return ONLY a JSON object (no markdown fences):
{
"is_diagram": true/false,
"diagram_type": "flowchart"|"sequence"|"architecture"
|"whiteboard"|"chart"|"table"|"slide"|"screenshot"|"unknown",
"confidence": 0.0 to 1.0,
"content_type": "slide"|"diagram"|"document"|"screen_share"|"whiteboard"|"chart"|"person"|"other",
"brief_description": "one-sentence description of what you see"
}
"""
|
ccf1b1a…
|
leo
|
41 |
|
|
ccf1b1a…
|
leo
|
42 |
# Single-pass analysis prompt — extracts everything (description, text,
# elements, relationships, mermaid, chart data) in one model call.
_ANALYSIS_PROMPT = """\
Analyze this diagram/visual content comprehensively. Extract ALL of the
following in a single JSON response (no markdown fences):
{
"diagram_type": "flowchart"|"sequence"|"architecture"
|"whiteboard"|"chart"|"table"|"slide"|"screenshot"|"unknown",
"description": "detailed description of the visual content",
"text_content": "all visible text, preserving structure",
"elements": ["list", "of", "identified", "elements/components"],
"relationships": ["element A -> element B: relationship", ...],
"mermaid": "mermaid diagram syntax representing this visual (graph LR, sequenceDiagram, etc.)",
"chart_data": null or {"labels": [...], "values": [...], "chart_type": "bar|line|pie|scatter"}
}

For the mermaid field: generate valid mermaid syntax that best represents the visual structure.
For chart_data: only populate if this is a chart/graph with extractable numeric data.
If any field cannot be determined, use null or empty list.
"""
|
ccf1b1a…
|
leo
|
61 |
|
|
ccf1b1a…
|
leo
|
62 |
# Caption prompt for the screengrab fallback path (frames kept without a
# full diagram analysis).
_CAPTION_PROMPT = "Briefly describe what this image shows in 1-2 sentences."
|
ccf1b1a…
|
leo
|
64 |
|
|
2a1b11a…
|
noreply
|
65 |
# Rich screenshot extraction prompt — extracts knowledge (text, entities,
# topics) from shared-screen frames that are not full diagrams.
_SCREENSHOT_EXTRACT_PROMPT = """\
Analyze this screenshot from a video recording. Extract all visible knowledge.
This is shared screen content (slides, code, documents, browser, terminal, etc.).

Return ONLY a JSON object (no markdown fences):
{
"content_type": "slide"|"code"|"document"|"terminal"|"browser"|"chat"|"other",
"caption": "one-sentence description of what is shown",
"text_content": "all visible text, preserving structure and line breaks",
"entities": ["named things visible: people, technologies, tools, services, \
projects, libraries, APIs, error codes, URLs, file paths"],
"topics": ["concepts or subjects this content is about"]
}

For text_content: extract ALL readable text — code, titles, bullet points, URLs,
error messages, terminal output, chat messages, file names. Be thorough.
For entities: extract specific named things, not generic words.
For topics: extract 2-5 high-level topics this content relates to.
"""
|
2a1b11a…
|
noreply
|
85 |
|
|
ccf1b1a…
|
leo
|
86 |
|
|
ccf1b1a…
|
leo
|
87 |
def _read_image_bytes(image_path: Union[str, Path]) -> bytes: |
|
ccf1b1a…
|
leo
|
88 |
"""Read image file as bytes.""" |
|
ccf1b1a…
|
leo
|
89 |
return Path(image_path).read_bytes() |
|
ccf1b1a…
|
leo
|
90 |
|
|
ccf1b1a…
|
leo
|
91 |
|
|
ccf1b1a…
|
leo
|
92 |
def _parse_json_response(text: str) -> Optional[dict]: |
|
ccf1b1a…
|
leo
|
93 |
"""Try to parse JSON from an LLM response, handling markdown fences.""" |
|
ccf1b1a…
|
leo
|
94 |
if not text: |
|
ccf1b1a…
|
leo
|
95 |
return None |
|
ccf1b1a…
|
leo
|
96 |
# Strip markdown fences |
|
ccf1b1a…
|
leo
|
97 |
cleaned = text.strip() |
|
ccf1b1a…
|
leo
|
98 |
if cleaned.startswith("```"): |
|
ccf1b1a…
|
leo
|
99 |
lines = cleaned.split("\n") |
|
ccf1b1a…
|
leo
|
100 |
# Remove first and last fence lines |
|
829e24a…
|
leo
|
101 |
lines = [line for line in lines if not line.strip().startswith("```")] |
|
ccf1b1a…
|
leo
|
102 |
try: |
|
ccf1b1a…
|
leo
|
103 |
return json.loads(cleaned) |
|
ccf1b1a…
|
leo
|
104 |
except json.JSONDecodeError: |
|
ccf1b1a…
|
leo
|
105 |
# Try to find JSON object in the text |
|
ccf1b1a…
|
leo
|
106 |
start = cleaned.find("{") |
|
ccf1b1a…
|
leo
|
107 |
end = cleaned.rfind("}") + 1 |
|
ccf1b1a…
|
leo
|
108 |
if start >= 0 and end > start: |
|
ccf1b1a…
|
leo
|
109 |
try: |
|
ccf1b1a…
|
leo
|
110 |
return json.loads(cleaned[start:end]) |
|
ccf1b1a…
|
leo
|
111 |
except json.JSONDecodeError: |
|
ccf1b1a…
|
leo
|
112 |
pass |
|
ccf1b1a…
|
leo
|
113 |
return None |
|
ccf1b1a…
|
leo
|
114 |
|
|
ccf1b1a…
|
leo
|
115 |
|
|
6febc3f…
|
noreply
|
116 |
def _frame_hash(path: Path) -> str: |
|
6febc3f…
|
noreply
|
117 |
"""Content-based hash for a frame file (first 8KB + size for speed).""" |
|
6febc3f…
|
noreply
|
118 |
h = hashlib.sha256() |
|
6febc3f…
|
noreply
|
119 |
h.update(str(path.stat().st_size).encode()) |
|
6febc3f…
|
noreply
|
120 |
with open(path, "rb") as f: |
|
6febc3f…
|
noreply
|
121 |
h.update(f.read(8192)) |
|
6febc3f…
|
noreply
|
122 |
return h.hexdigest()[:16] |
|
6febc3f…
|
noreply
|
123 |
|
|
6febc3f…
|
noreply
|
124 |
|
|
6febc3f…
|
noreply
|
125 |
class _FrameCache: |
|
6febc3f…
|
noreply
|
126 |
"""Simple JSON file cache for frame classification/analysis results.""" |
|
6febc3f…
|
noreply
|
127 |
|
|
6febc3f…
|
noreply
|
128 |
def __init__(self, cache_path: Optional[Path]): |
|
6febc3f…
|
noreply
|
129 |
self._path = cache_path |
|
6febc3f…
|
noreply
|
130 |
self._data: dict = {} |
|
6febc3f…
|
noreply
|
131 |
if cache_path and cache_path.exists(): |
|
6febc3f…
|
noreply
|
132 |
try: |
|
6febc3f…
|
noreply
|
133 |
self._data = json.loads(cache_path.read_text()) |
|
6febc3f…
|
noreply
|
134 |
except (json.JSONDecodeError, OSError): |
|
6febc3f…
|
noreply
|
135 |
self._data = {} |
|
6febc3f…
|
noreply
|
136 |
|
|
6febc3f…
|
noreply
|
137 |
def get(self, key: str) -> Optional[dict]: |
|
6febc3f…
|
noreply
|
138 |
return self._data.get(key) |
|
6febc3f…
|
noreply
|
139 |
|
|
6febc3f…
|
noreply
|
140 |
def set(self, key: str, value: dict) -> None: |
|
6febc3f…
|
noreply
|
141 |
self._data[key] = value |
|
6febc3f…
|
noreply
|
142 |
|
|
6febc3f…
|
noreply
|
143 |
def save(self) -> None: |
|
6febc3f…
|
noreply
|
144 |
if self._path: |
|
6febc3f…
|
noreply
|
145 |
self._path.parent.mkdir(parents=True, exist_ok=True) |
|
6febc3f…
|
noreply
|
146 |
self._path.write_text(json.dumps(self._data, indent=2)) |
|
6febc3f…
|
noreply
|
147 |
|
|
6febc3f…
|
noreply
|
148 |
|
|
ccf1b1a…
|
leo
|
149 |
class DiagramAnalyzer:
    """Vision-model-based detection and analysis of diagram/screen-share frames."""

    def __init__(
        self,
        provider_manager: Optional[ProviderManager] = None,
        confidence_threshold: float = 0.3,
        max_workers: int = _DEFAULT_MAX_WORKERS,
    ):
        """
        Args:
            provider_manager: vision/LLM provider facade; a fresh
                ProviderManager is constructed when omitted.
            confidence_threshold: minimum classification confidence for a
                frame to be kept at all (screengrab fallback floor).
            max_workers: thread-pool size for parallel frame analysis.
        """
        self.confidence_threshold = confidence_threshold
        self.max_workers = max_workers
        self.pm = provider_manager or ProviderManager()
|
ccf1b1a…
|
leo
|
161 |
|
|
ccf1b1a…
|
leo
|
162 |
def classify_frame(self, image_path: Union[str, Path]) -> dict: |
|
ccf1b1a…
|
leo
|
163 |
""" |
|
ccf1b1a…
|
leo
|
164 |
Classify a single frame using vision model. |
|
ccf1b1a…
|
leo
|
165 |
|
|
ccf1b1a…
|
leo
|
166 |
Returns dict with is_diagram, diagram_type, confidence, brief_description. |
|
ccf1b1a…
|
leo
|
167 |
""" |
|
ccf1b1a…
|
leo
|
168 |
image_bytes = _read_image_bytes(image_path) |
|
ccf1b1a…
|
leo
|
169 |
raw = self.pm.analyze_image(image_bytes, _CLASSIFY_PROMPT, max_tokens=512) |
|
ccf1b1a…
|
leo
|
170 |
result = _parse_json_response(raw) |
|
ccf1b1a…
|
leo
|
171 |
if result is None: |
|
829e24a…
|
leo
|
172 |
return { |
|
829e24a…
|
leo
|
173 |
"is_diagram": False, |
|
829e24a…
|
leo
|
174 |
"diagram_type": "unknown", |
|
829e24a…
|
leo
|
175 |
"confidence": 0.0, |
|
829e24a…
|
leo
|
176 |
"brief_description": "", |
|
829e24a…
|
leo
|
177 |
} |
|
ccf1b1a…
|
leo
|
178 |
return result |
|
ccf1b1a…
|
leo
|
179 |
|
|
ccf1b1a…
|
leo
|
180 |
def analyze_diagram_single_pass(self, image_path: Union[str, Path]) -> dict: |
|
ccf1b1a…
|
leo
|
181 |
""" |
|
ccf1b1a…
|
leo
|
182 |
Full single-pass diagram analysis — description, text, mermaid, chart data. |
|
ccf1b1a…
|
leo
|
183 |
|
|
ccf1b1a…
|
leo
|
184 |
Returns parsed dict or empty dict on failure. |
|
ccf1b1a…
|
leo
|
185 |
""" |
|
ccf1b1a…
|
leo
|
186 |
image_bytes = _read_image_bytes(image_path) |
|
ccf1b1a…
|
leo
|
187 |
raw = self.pm.analyze_image(image_bytes, _ANALYSIS_PROMPT, max_tokens=4096) |
|
ccf1b1a…
|
leo
|
188 |
result = _parse_json_response(raw) |
|
ccf1b1a…
|
leo
|
189 |
return result or {} |
|
ccf1b1a…
|
leo
|
190 |
|
|
ccf1b1a…
|
leo
|
191 |
def caption_frame(self, image_path: Union[str, Path]) -> str: |
|
ccf1b1a…
|
leo
|
192 |
"""Get a brief caption for a screengrab fallback.""" |
|
ccf1b1a…
|
leo
|
193 |
image_bytes = _read_image_bytes(image_path) |
|
ccf1b1a…
|
leo
|
194 |
return self.pm.analyze_image(image_bytes, _CAPTION_PROMPT, max_tokens=256) |
|
ccf1b1a…
|
leo
|
195 |
|
|
2a1b11a…
|
noreply
|
196 |
def extract_screenshot_knowledge(self, image_path: Union[str, Path]) -> dict: |
|
2a1b11a…
|
noreply
|
197 |
"""Extract knowledge from a screenshot — text, entities, topics.""" |
|
2a1b11a…
|
noreply
|
198 |
image_bytes = _read_image_bytes(image_path) |
|
2a1b11a…
|
noreply
|
199 |
raw = self.pm.analyze_image(image_bytes, _SCREENSHOT_EXTRACT_PROMPT, max_tokens=2048) |
|
2a1b11a…
|
noreply
|
200 |
result = _parse_json_response(raw) |
|
2a1b11a…
|
noreply
|
201 |
return result or {} |
|
2a1b11a…
|
noreply
|
202 |
|
|
ccf1b1a…
|
leo
|
203 |
    def process_frames(
        self,
        frame_paths: List[Union[str, Path]],
        diagrams_dir: Optional[Path] = None,
        captures_dir: Optional[Path] = None,
        cache_dir: Optional[Path] = None,
    ) -> Tuple[List[DiagramResult], List[ScreenCapture]]:
        """
        Process a list of extracted frames: classify, analyze diagrams, screengrab fallback.

        Classification and analysis run in parallel using a thread pool. Results are
        cached by frame content hash so re-runs skip already-analyzed frames.

        Thresholds:
        - confidence >= 0.7 → full diagram analysis (story 3.2)
        - 0.3 <= confidence < 0.7 → screengrab fallback (story 3.3)
        - confidence < 0.3 → skip

        Args:
            frame_paths: image files to process, indexed in frame order.
            diagrams_dir: where diagram artifacts (.jpg/.mermaid/.json) are written.
            captures_dir: where screengrab artifacts are written.
            cache_dir: explicit cache location; otherwise the cache is
                co-located next to diagrams_dir when one is given.

        Returns (diagrams, screen_captures).
        """
        # Set up cache
        cache_path = None
        if cache_dir:
            cache_path = cache_dir / "frame_analysis_cache.json"
        elif diagrams_dir:
            cache_path = diagrams_dir.parent / "frame_analysis_cache.json"
        cache = _FrameCache(cache_path)

        frame_paths = [Path(fp) for fp in frame_paths]

        # --- Phase 1: Parallel classification ---
        classifications: dict[int, dict] = {}
        cache_hits = 0

        def _classify_one(idx: int, fp: Path) -> Tuple[int, dict, bool]:
            # Returns (frame index, classification dict, served-from-cache flag).
            fhash = _frame_hash(fp)
            cached = cache.get(f"classify:{fhash}")
            if cached is not None:
                return idx, cached, True
            try:
                result = self.classify_frame(fp)
            except Exception as e:
                # One bad frame must not abort the batch — treat as "not a diagram".
                logger.warning(f"Classification failed for frame {idx}: {e}")
                result = {"is_diagram": False, "confidence": 0.0}
            cache.set(f"classify:{fhash}", result)
            return idx, result, False

        workers = min(self.max_workers, len(frame_paths)) if frame_paths else 1
        with ThreadPoolExecutor(max_workers=workers) as pool:
            futures = {pool.submit(_classify_one, i, fp): i for i, fp in enumerate(frame_paths)}
            pbar = tqdm(
                as_completed(futures),
                total=len(futures),
                desc="Classifying frames",
                unit="frame",
            )
            for future in pbar:
                idx, result, from_cache = future.result()
                classifications[idx] = result
                if from_cache:
                    cache_hits += 1

        if cache_hits:
            logger.info(f"Classification: {cache_hits}/{len(frame_paths)} from cache")

        # --- Phase 2: Parallel analysis/extraction for qualifying frames ---
        # Split frames into the two work lists by confidence band.
        high_conf = []  # (idx, fp, classification)
        med_conf = []

        for idx in sorted(classifications):
            conf = float(classifications[idx].get("confidence", 0.0))
            if conf >= 0.7:
                high_conf.append((idx, frame_paths[idx], classifications[idx]))
            elif conf >= self.confidence_threshold:
                med_conf.append((idx, frame_paths[idx], classifications[idx]))

        # Analyze high-confidence diagrams in parallel
        analysis_results: dict[int, dict] = {}

        def _analyze_one(idx: int, fp: Path) -> Tuple[int, dict, bool]:
            # Full single-pass diagram analysis; an empty dict marks failure.
            fhash = _frame_hash(fp)
            cached = cache.get(f"analyze:{fhash}")
            if cached is not None:
                return idx, cached, True
            try:
                result = self.analyze_diagram_single_pass(fp)
            except Exception as e:
                logger.warning(f"Diagram analysis failed for frame {idx}: {e}")
                result = {}
            cache.set(f"analyze:{fhash}", result)
            return idx, result, False

        if high_conf:
            workers = min(self.max_workers, len(high_conf))
            with ThreadPoolExecutor(max_workers=workers) as pool:
                futures = {pool.submit(_analyze_one, idx, fp): idx for idx, fp, _ in high_conf}
                pbar = tqdm(
                    as_completed(futures),
                    total=len(futures),
                    desc="Analyzing diagrams",
                    unit="diagram",
                )
                for future in pbar:
                    idx, result, _ = future.result()
                    analysis_results[idx] = result

        # Extract knowledge from medium-confidence frames in parallel
        extraction_results: dict[int, dict] = {}

        def _extract_one(idx: int, fp: Path) -> Tuple[int, dict, bool]:
            # Screenshot knowledge extraction; an empty dict marks failure.
            fhash = _frame_hash(fp)
            cached = cache.get(f"extract:{fhash}")
            if cached is not None:
                return idx, cached, True
            try:
                result = self.extract_screenshot_knowledge(fp)
            except Exception as e:
                logger.warning(f"Screenshot extraction failed for frame {idx}: {e}")
                result = {}
            cache.set(f"extract:{fhash}", result)
            return idx, result, False

        if med_conf:
            workers = min(self.max_workers, len(med_conf))
            with ThreadPoolExecutor(max_workers=workers) as pool:
                futures = {pool.submit(_extract_one, idx, fp): idx for idx, fp, _ in med_conf}
                pbar = tqdm(
                    as_completed(futures),
                    total=len(futures),
                    desc="Extracting screenshots",
                    unit="capture",
                )
                for future in pbar:
                    idx, result, _ = future.result()
                    extraction_results[idx] = result

        # --- Phase 3: Build results (sequential for stable ordering) ---
        diagrams: List[DiagramResult] = []
        captures: List[ScreenCapture] = []
        diagram_idx = 0
        capture_idx = 0

        for idx, fp, classification in high_conf:
            analysis = analysis_results.get(idx, {})
            confidence = float(classification.get("confidence", 0.0))

            if not analysis:
                # Analysis failed — fall back to screengrab with pre-fetched extraction
                extraction = extraction_results.get(idx)
                if extraction is None:
                    # Wasn't in med_conf, need to extract now
                    try:
                        extraction = self.extract_screenshot_knowledge(fp)
                    except Exception:
                        extraction = {}
                capture = self._build_screengrab(
                    fp, idx, capture_idx, captures_dir, confidence, extraction
                )
                captures.append(capture)
                capture_idx += 1
                continue

            dr = self._build_diagram_result(
                idx, fp, diagram_idx, diagrams_dir, confidence, classification, analysis
            )
            if dr:
                diagrams.append(dr)
                diagram_idx += 1
            else:
                # Validation failed — keep the frame as a plain screengrab.
                capture = self._build_screengrab(fp, idx, capture_idx, captures_dir, confidence, {})
                captures.append(capture)
                capture_idx += 1

        for idx, fp, classification in med_conf:
            confidence = float(classification.get("confidence", 0.0))
            extraction = extraction_results.get(idx, {})
            logger.info(
                f"Frame {idx}: uncertain (confidence {confidence:.2f}), saving as screengrab"
            )
            capture = self._build_screengrab(
                fp, idx, capture_idx, captures_dir, confidence, extraction
            )
            captures.append(capture)
            capture_idx += 1

        # Save cache
        cache.save()

        logger.info(
            f"Diagram processing complete: {len(diagrams)} diagrams, {len(captures)} screengrabs"
        )
        return diagrams, captures
|
2a1b11a…
|
noreply
|
395 |
|
|
6febc3f…
|
noreply
|
396 |
    def _build_diagram_result(
        self,
        frame_index: int,
        frame_path: Path,
        diagram_idx: int,
        diagrams_dir: Optional[Path],
        confidence: float,
        classification: dict,
        analysis: dict,
    ) -> Optional[DiagramResult]:
        """Build a DiagramResult from analysis data. Returns None on validation failure.

        When diagrams_dir is provided, also persists the frame image, the
        mermaid source (if any), and the result's JSON alongside it, storing
        output-root-relative paths on the result.
        """
        # Prefer the type from the detailed analysis; fall back to the classifier's.
        dtype = analysis.get("diagram_type", classification.get("diagram_type", "unknown"))
        try:
            diagram_type = DiagramType(dtype)
        except ValueError:
            # Model produced a type outside the enum — coerce to unknown.
            diagram_type = DiagramType.unknown

        # llava-style models sometimes return dicts/nested lists here; normalize
        # everything to flat lists of strings / a plain string first.
        relationships = _normalize_relationships(analysis.get("relationships") or [])
        elements = _normalize_elements(analysis.get("elements") or [])
        text_content = _normalize_text_content(analysis.get("text_content"))

        try:
            dr = DiagramResult(
                frame_index=frame_index,
                diagram_type=diagram_type,
                confidence=confidence,
                description=analysis.get("description"),
                text_content=text_content,
                elements=elements,
                relationships=relationships,
                mermaid=analysis.get("mermaid"),
                chart_data=analysis.get("chart_data"),
            )
        except Exception as e:
            # Model output that survives normalization can still fail schema
            # validation; the caller downgrades such frames to screengrabs.
            logger.warning(f"DiagramResult validation failed for frame {frame_index}: {e}")
            return None

        if diagrams_dir:
            diagrams_dir.mkdir(parents=True, exist_ok=True)
            prefix = f"diagram_{diagram_idx}"
            img_dest = diagrams_dir / f"{prefix}.jpg"
            shutil.copy2(frame_path, img_dest)
            # Paths are stored relative to the output root, not absolute.
            dr.image_path = f"diagrams/{prefix}.jpg"
            if dr.mermaid:
                mermaid_dest = diagrams_dir / f"{prefix}.mermaid"
                mermaid_dest.write_text(dr.mermaid)
                dr.mermaid_path = f"diagrams/{prefix}.mermaid"
            # JSON is written last so it captures the image/mermaid paths set above.
            json_dest = diagrams_dir / f"{prefix}.json"
            json_dest.write_text(dr.model_dump_json(indent=2))

        return dr
|
6febc3f…
|
noreply
|
447 |
|
|
6febc3f…
|
noreply
|
448 |
def _build_screengrab( |
|
2a1b11a…
|
noreply
|
449 |
self, |
|
2a1b11a…
|
noreply
|
450 |
frame_path: Path, |
|
2a1b11a…
|
noreply
|
451 |
frame_index: int, |
|
2a1b11a…
|
noreply
|
452 |
capture_index: int, |
|
2a1b11a…
|
noreply
|
453 |
captures_dir: Optional[Path], |
|
2a1b11a…
|
noreply
|
454 |
confidence: float, |
|
6febc3f…
|
noreply
|
455 |
extraction: dict, |
|
2a1b11a…
|
noreply
|
456 |
) -> ScreenCapture: |
|
6febc3f…
|
noreply
|
457 |
"""Build a ScreenCapture from extraction data.""" |
|
6febc3f…
|
noreply
|
458 |
caption = extraction.get("caption", "") |
|
6febc3f…
|
noreply
|
459 |
content_type = extraction.get("content_type") |
|
6febc3f…
|
noreply
|
460 |
text_content = extraction.get("text_content") |
|
6febc3f…
|
noreply
|
461 |
raw_entities = extraction.get("entities", []) |
|
6febc3f…
|
noreply
|
462 |
entities = [str(e) for e in raw_entities] if isinstance(raw_entities, list) else [] |
|
6febc3f…
|
noreply
|
463 |
raw_topics = extraction.get("topics", []) |
|
6febc3f…
|
noreply
|
464 |
topics = [str(t) for t in raw_topics] if isinstance(raw_topics, list) else [] |
|
6febc3f…
|
noreply
|
465 |
|
|
6febc3f…
|
noreply
|
466 |
if extraction: |
|
6febc3f…
|
noreply
|
467 |
logger.info( |
|
6febc3f…
|
noreply
|
468 |
f"Frame {frame_index}: extracted " |
|
6febc3f…
|
noreply
|
469 |
f"{len(entities)} entities, " |
|
6febc3f…
|
noreply
|
470 |
f"{len(topics)} topics from {content_type}" |
|
6febc3f…
|
noreply
|
471 |
) |
|
2a1b11a…
|
noreply
|
472 |
|
|
2a1b11a…
|
noreply
|
473 |
sc = ScreenCapture( |
|
2a1b11a…
|
noreply
|
474 |
frame_index=frame_index, |
|
2a1b11a…
|
noreply
|
475 |
caption=caption, |
|
2a1b11a…
|
noreply
|
476 |
confidence=confidence, |
|
2a1b11a…
|
noreply
|
477 |
content_type=content_type, |
|
2a1b11a…
|
noreply
|
478 |
text_content=text_content, |
|
2a1b11a…
|
noreply
|
479 |
entities=entities, |
|
2a1b11a…
|
noreply
|
480 |
topics=topics, |
|
2a1b11a…
|
noreply
|
481 |
) |
|
2a1b11a…
|
noreply
|
482 |
|
|
2a1b11a…
|
noreply
|
483 |
if captures_dir: |
|
2a1b11a…
|
noreply
|
484 |
captures_dir.mkdir(parents=True, exist_ok=True) |
|
2a1b11a…
|
noreply
|
485 |
prefix = f"capture_{capture_index}" |
|
2a1b11a…
|
noreply
|
486 |
img_dest = captures_dir / f"{prefix}.jpg" |
|
2a1b11a…
|
noreply
|
487 |
shutil.copy2(frame_path, img_dest) |
|
2a1b11a…
|
noreply
|
488 |
sc.image_path = f"captures/{prefix}.jpg" |
|
2a1b11a…
|
noreply
|
489 |
json_dest = captures_dir / f"{prefix}.json" |
|
2a1b11a…
|
noreply
|
490 |
json_dest.write_text(sc.model_dump_json(indent=2)) |
|
2a1b11a…
|
noreply
|
491 |
|
|
2a1b11a…
|
noreply
|
492 |
return sc |
|
6febc3f…
|
noreply
|
493 |
|
|
6febc3f…
|
noreply
|
494 |
def _save_screengrab( |
|
6febc3f…
|
noreply
|
495 |
self, |
|
6febc3f…
|
noreply
|
496 |
frame_path: Path, |
|
6febc3f…
|
noreply
|
497 |
frame_index: int, |
|
6febc3f…
|
noreply
|
498 |
capture_index: int, |
|
6febc3f…
|
noreply
|
499 |
captures_dir: Optional[Path], |
|
6febc3f…
|
noreply
|
500 |
confidence: float, |
|
6febc3f…
|
noreply
|
501 |
) -> ScreenCapture: |
|
6febc3f…
|
noreply
|
502 |
"""Legacy entry point — extracts then delegates to _build_screengrab.""" |
|
6febc3f…
|
noreply
|
503 |
try: |
|
6febc3f…
|
noreply
|
504 |
extraction = self.extract_screenshot_knowledge(frame_path) |
|
6febc3f…
|
noreply
|
505 |
except Exception as e: |
|
6febc3f…
|
noreply
|
506 |
logger.warning(f"Screenshot extraction failed for frame {frame_index}: {e}") |
|
6febc3f…
|
noreply
|
507 |
extraction = {} |
|
6febc3f…
|
noreply
|
508 |
return self._build_screengrab( |
|
6febc3f…
|
noreply
|
509 |
frame_path, frame_index, capture_index, captures_dir, confidence, extraction |
|
6febc3f…
|
noreply
|
510 |
) |
|
6febc3f…
|
noreply
|
511 |
|
|
6febc3f…
|
noreply
|
512 |
|
|
6febc3f…
|
noreply
|
513 |
def _normalize_relationships(raw_rels: list) -> List[str]: |
|
6febc3f…
|
noreply
|
514 |
"""Normalize relationships: llava sometimes returns dicts instead of strings.""" |
|
6febc3f…
|
noreply
|
515 |
relationships = [] |
|
6febc3f…
|
noreply
|
516 |
for rel in raw_rels: |
|
6febc3f…
|
noreply
|
517 |
if isinstance(rel, str): |
|
6febc3f…
|
noreply
|
518 |
relationships.append(rel) |
|
6febc3f…
|
noreply
|
519 |
elif isinstance(rel, dict): |
|
6febc3f…
|
noreply
|
520 |
src = rel.get("source", rel.get("from", "?")) |
|
6febc3f…
|
noreply
|
521 |
dst = rel.get("destination", rel.get("to", "?")) |
|
6febc3f…
|
noreply
|
522 |
label = rel.get("label", rel.get("relationship", "")) |
|
6febc3f…
|
noreply
|
523 |
relationships.append(f"{src} -> {dst}: {label}" if label else f"{src} -> {dst}") |
|
6febc3f…
|
noreply
|
524 |
else: |
|
6febc3f…
|
noreply
|
525 |
relationships.append(str(rel)) |
|
6febc3f…
|
noreply
|
526 |
return relationships |
|
6febc3f…
|
noreply
|
527 |
|
|
6febc3f…
|
noreply
|
528 |
|
|
6febc3f…
|
noreply
|
529 |
def _normalize_elements(raw_elements: list) -> List[str]: |
|
6febc3f…
|
noreply
|
530 |
"""Normalize elements: llava may return dicts or nested lists.""" |
|
6febc3f…
|
noreply
|
531 |
elements = [] |
|
6febc3f…
|
noreply
|
532 |
for elem in raw_elements: |
|
6febc3f…
|
noreply
|
533 |
if isinstance(elem, str): |
|
6febc3f…
|
noreply
|
534 |
elements.append(elem) |
|
6febc3f…
|
noreply
|
535 |
elif isinstance(elem, dict): |
|
6febc3f…
|
noreply
|
536 |
name = elem.get("name", elem.get("element", "")) |
|
6febc3f…
|
noreply
|
537 |
etype = elem.get("type", elem.get("element_type", "")) |
|
6febc3f…
|
noreply
|
538 |
if name and etype: |
|
6febc3f…
|
noreply
|
539 |
elements.append(f"{etype}: {name}") |
|
6febc3f…
|
noreply
|
540 |
elif name: |
|
6febc3f…
|
noreply
|
541 |
elements.append(name) |
|
6febc3f…
|
noreply
|
542 |
else: |
|
6febc3f…
|
noreply
|
543 |
elements.append(json.dumps(elem)) |
|
6febc3f…
|
noreply
|
544 |
elif isinstance(elem, list): |
|
6febc3f…
|
noreply
|
545 |
elements.extend(str(e) for e in elem) |
|
6febc3f…
|
noreply
|
546 |
else: |
|
6febc3f…
|
noreply
|
547 |
elements.append(str(elem)) |
|
6febc3f…
|
noreply
|
548 |
return elements |
|
6febc3f…
|
noreply
|
549 |
|
|
6febc3f…
|
noreply
|
550 |
|
|
6febc3f…
|
noreply
|
551 |
def _normalize_text_content(raw_text) -> Optional[str]: |
|
6febc3f…
|
noreply
|
552 |
"""Normalize text_content: llava may return dict instead of string.""" |
|
6febc3f…
|
noreply
|
553 |
if isinstance(raw_text, dict): |
|
6febc3f…
|
noreply
|
554 |
parts = [] |
|
6febc3f…
|
noreply
|
555 |
for k, v in raw_text.items(): |
|
6febc3f…
|
noreply
|
556 |
if isinstance(v, list): |
|
6febc3f…
|
noreply
|
557 |
parts.append(f"{k}: {', '.join(str(x) for x in v)}") |
|
6febc3f…
|
noreply
|
558 |
else: |
|
6febc3f…
|
noreply
|
559 |
parts.append(f"{k}: {v}") |
|
6febc3f…
|
noreply
|
560 |
return "\n".join(parts) |
|
6febc3f…
|
noreply
|
561 |
elif isinstance(raw_text, list): |
|
6febc3f…
|
noreply
|
562 |
return "\n".join(str(x) for x in raw_text) |
|
6febc3f…
|
noreply
|
563 |
return raw_text |