PlanOpticon

planopticon / video_processor / analyzers / diagram_analyzer.py
Source Blame History 563 lines
ccf1b1a… leo 1 """Diagram analysis using vision model classification and single-pass extraction."""
ccf1b1a… leo 2
6febc3f… noreply 3 import hashlib
ccf1b1a… leo 4 import json
ccf1b1a… leo 5 import logging
ccf1b1a… leo 6 import shutil
6febc3f… noreply 7 from concurrent.futures import ThreadPoolExecutor, as_completed
ccf1b1a… leo 8 from pathlib import Path
ccf1b1a… leo 9 from typing import List, Optional, Tuple, Union
287a3bb… leo 10
287a3bb… leo 11 from tqdm import tqdm
ccf1b1a… leo 12
ccf1b1a… leo 13 from video_processor.models import DiagramResult, DiagramType, ScreenCapture
ccf1b1a… leo 14 from video_processor.providers.manager import ProviderManager
ccf1b1a… leo 15
ccf1b1a… leo 16 logger = logging.getLogger(__name__)
6febc3f… noreply 17
6febc3f… noreply 18 # Default max workers for parallel frame analysis
6febc3f… noreply 19 _DEFAULT_MAX_WORKERS = 4
287a3bb… leo 20
ccf1b1a… leo 21 # Classification prompt — returns JSON
ccf1b1a… leo 22 _CLASSIFY_PROMPT = """\
287a3bb… leo 23 Examine this image from a video recording. Your job is to identify ONLY shared content \
287a3bb… leo 24 — slides, presentations, charts, diagrams, documents, screen shares, whiteboard content, \
287a3bb… leo 25 architecture drawings, tables, or other structured visual information worth capturing.
287a3bb… leo 26
287a3bb… leo 27 IMPORTANT: If the image primarily shows a person, people, webcam feeds, faces, or a \
287a3bb… leo 28 video conference participant view, return confidence 0.0. We are ONLY interested in \
287a3bb… leo 29 shared/presented content, NOT people or camera views.
ccf1b1a… leo 30
ccf1b1a… leo 31 Return ONLY a JSON object (no markdown fences):
ccf1b1a… leo 32 {
ccf1b1a… leo 33 "is_diagram": true/false,
829e24a… leo 34 "diagram_type": "flowchart"|"sequence"|"architecture"
829e24a… leo 35 |"whiteboard"|"chart"|"table"|"slide"|"screenshot"|"unknown",
ccf1b1a… leo 36 "confidence": 0.0 to 1.0,
287a3bb… leo 37 "content_type": "slide"|"diagram"|"document"|"screen_share"|"whiteboard"|"chart"|"person"|"other",
ccf1b1a… leo 38 "brief_description": "one-sentence description of what you see"
ccf1b1a… leo 39 }
ccf1b1a… leo 40 """
ccf1b1a… leo 41
ccf1b1a… leo 42 # Single-pass analysis prompt — extracts everything in one call
ccf1b1a… leo 43 _ANALYSIS_PROMPT = """\
829e24a… leo 44 Analyze this diagram/visual content comprehensively. Extract ALL of the
829e24a… leo 45 following in a single JSON response (no markdown fences):
ccf1b1a… leo 46 {
829e24a… leo 47 "diagram_type": "flowchart"|"sequence"|"architecture"
829e24a… leo 48 |"whiteboard"|"chart"|"table"|"slide"|"screenshot"|"unknown",
ccf1b1a… leo 49 "description": "detailed description of the visual content",
ccf1b1a… leo 50 "text_content": "all visible text, preserving structure",
ccf1b1a… leo 51 "elements": ["list", "of", "identified", "elements/components"],
ccf1b1a… leo 52 "relationships": ["element A -> element B: relationship", ...],
ccf1b1a… leo 53 "mermaid": "mermaid diagram syntax representing this visual (graph LR, sequenceDiagram, etc.)",
ccf1b1a… leo 54 "chart_data": null or {"labels": [...], "values": [...], "chart_type": "bar|line|pie|scatter"}
ccf1b1a… leo 55 }
ccf1b1a… leo 56
ccf1b1a… leo 57 For the mermaid field: generate valid mermaid syntax that best represents the visual structure.
ccf1b1a… leo 58 For chart_data: only populate if this is a chart/graph with extractable numeric data.
ccf1b1a… leo 59 If any field cannot be determined, use null or empty list.
ccf1b1a… leo 60 """
ccf1b1a… leo 61
ccf1b1a… leo 62 # Caption prompt for screengrab fallback
ccf1b1a… leo 63 _CAPTION_PROMPT = "Briefly describe what this image shows in 1-2 sentences."
ccf1b1a… leo 64
2a1b11a… noreply 65 # Rich screenshot extraction prompt — extracts knowledge from shared screens
2a1b11a… noreply 66 _SCREENSHOT_EXTRACT_PROMPT = """\
2a1b11a… noreply 67 Analyze this screenshot from a video recording. Extract all visible knowledge.
2a1b11a… noreply 68 This is shared screen content (slides, code, documents, browser, terminal, etc.).
2a1b11a… noreply 69
2a1b11a… noreply 70 Return ONLY a JSON object (no markdown fences):
2a1b11a… noreply 71 {
2a1b11a… noreply 72 "content_type": "slide"|"code"|"document"|"terminal"|"browser"|"chat"|"other",
2a1b11a… noreply 73 "caption": "one-sentence description of what is shown",
2a1b11a… noreply 74 "text_content": "all visible text, preserving structure and line breaks",
2a1b11a… noreply 75 "entities": ["named things visible: people, technologies, tools, services, \
2a1b11a… noreply 76 projects, libraries, APIs, error codes, URLs, file paths"],
2a1b11a… noreply 77 "topics": ["concepts or subjects this content is about"]
2a1b11a… noreply 78 }
2a1b11a… noreply 79
2a1b11a… noreply 80 For text_content: extract ALL readable text — code, titles, bullet points, URLs,
2a1b11a… noreply 81 error messages, terminal output, chat messages, file names. Be thorough.
2a1b11a… noreply 82 For entities: extract specific named things, not generic words.
2a1b11a… noreply 83 For topics: extract 2-5 high-level topics this content relates to.
2a1b11a… noreply 84 """
2a1b11a… noreply 85
ccf1b1a… leo 86
ccf1b1a… leo 87 def _read_image_bytes(image_path: Union[str, Path]) -> bytes:
ccf1b1a… leo 88 """Read image file as bytes."""
ccf1b1a… leo 89 return Path(image_path).read_bytes()
ccf1b1a… leo 90
ccf1b1a… leo 91
ccf1b1a… leo 92 def _parse_json_response(text: str) -> Optional[dict]:
ccf1b1a… leo 93 """Try to parse JSON from an LLM response, handling markdown fences."""
ccf1b1a… leo 94 if not text:
ccf1b1a… leo 95 return None
ccf1b1a… leo 96 # Strip markdown fences
ccf1b1a… leo 97 cleaned = text.strip()
ccf1b1a… leo 98 if cleaned.startswith("```"):
ccf1b1a… leo 99 lines = cleaned.split("\n")
ccf1b1a… leo 100 # Remove first and last fence lines
829e24a… leo 101 lines = [line for line in lines if not line.strip().startswith("```")]
ccf1b1a… leo 102 try:
ccf1b1a… leo 103 return json.loads(cleaned)
ccf1b1a… leo 104 except json.JSONDecodeError:
ccf1b1a… leo 105 # Try to find JSON object in the text
ccf1b1a… leo 106 start = cleaned.find("{")
ccf1b1a… leo 107 end = cleaned.rfind("}") + 1
ccf1b1a… leo 108 if start >= 0 and end > start:
ccf1b1a… leo 109 try:
ccf1b1a… leo 110 return json.loads(cleaned[start:end])
ccf1b1a… leo 111 except json.JSONDecodeError:
ccf1b1a… leo 112 pass
ccf1b1a… leo 113 return None
ccf1b1a… leo 114
ccf1b1a… leo 115
6febc3f… noreply 116 def _frame_hash(path: Path) -> str:
6febc3f… noreply 117 """Content-based hash for a frame file (first 8KB + size for speed)."""
6febc3f… noreply 118 h = hashlib.sha256()
6febc3f… noreply 119 h.update(str(path.stat().st_size).encode())
6febc3f… noreply 120 with open(path, "rb") as f:
6febc3f… noreply 121 h.update(f.read(8192))
6febc3f… noreply 122 return h.hexdigest()[:16]
6febc3f… noreply 123
6febc3f… noreply 124
6febc3f… noreply 125 class _FrameCache:
6febc3f… noreply 126 """Simple JSON file cache for frame classification/analysis results."""
6febc3f… noreply 127
6febc3f… noreply 128 def __init__(self, cache_path: Optional[Path]):
6febc3f… noreply 129 self._path = cache_path
6febc3f… noreply 130 self._data: dict = {}
6febc3f… noreply 131 if cache_path and cache_path.exists():
6febc3f… noreply 132 try:
6febc3f… noreply 133 self._data = json.loads(cache_path.read_text())
6febc3f… noreply 134 except (json.JSONDecodeError, OSError):
6febc3f… noreply 135 self._data = {}
6febc3f… noreply 136
6febc3f… noreply 137 def get(self, key: str) -> Optional[dict]:
6febc3f… noreply 138 return self._data.get(key)
6febc3f… noreply 139
6febc3f… noreply 140 def set(self, key: str, value: dict) -> None:
6febc3f… noreply 141 self._data[key] = value
6febc3f… noreply 142
6febc3f… noreply 143 def save(self) -> None:
6febc3f… noreply 144 if self._path:
6febc3f… noreply 145 self._path.parent.mkdir(parents=True, exist_ok=True)
6febc3f… noreply 146 self._path.write_text(json.dumps(self._data, indent=2))
6febc3f… noreply 147
6febc3f… noreply 148
class DiagramAnalyzer:
    """Vision model-based diagram detection and analysis.

    Pipeline: frames are classified with a cheap vision prompt; frames that
    classify confidently as diagrams get a full single-pass analysis, while
    uncertain frames fall back to a knowledge-extraction screengrab.
    """

    def __init__(
        self,
        provider_manager: Optional[ProviderManager] = None,
        confidence_threshold: float = 0.3,
        max_workers: int = _DEFAULT_MAX_WORKERS,
    ):
        """
        Args:
            provider_manager: Vision/LLM provider facade; a default
                ProviderManager is constructed when omitted.
            confidence_threshold: Minimum classification confidence for a
                frame to be kept at all (below it the frame is skipped).
            max_workers: Thread-pool size for parallel frame processing.
        """
        self.pm = provider_manager or ProviderManager()
        self.confidence_threshold = confidence_threshold
        self.max_workers = max_workers

    def classify_frame(self, image_path: Union[str, Path]) -> dict:
        """
        Classify a single frame using vision model.

        Returns dict with is_diagram, diagram_type, confidence, brief_description.
        On an unparseable model response, returns a safe "not a diagram"
        default with confidence 0.0.
        """
        image_bytes = _read_image_bytes(image_path)
        raw = self.pm.analyze_image(image_bytes, _CLASSIFY_PROMPT, max_tokens=512)
        result = _parse_json_response(raw)
        if result is None:
            return {
                "is_diagram": False,
                "diagram_type": "unknown",
                "confidence": 0.0,
                "brief_description": "",
            }
        return result

    def analyze_diagram_single_pass(self, image_path: Union[str, Path]) -> dict:
        """
        Full single-pass diagram analysis — description, text, mermaid, chart data.

        Returns parsed dict or empty dict on failure.
        """
        image_bytes = _read_image_bytes(image_path)
        raw = self.pm.analyze_image(image_bytes, _ANALYSIS_PROMPT, max_tokens=4096)
        result = _parse_json_response(raw)
        return result or {}

    def caption_frame(self, image_path: Union[str, Path]) -> str:
        """Get a brief caption for a screengrab fallback (raw model text)."""
        image_bytes = _read_image_bytes(image_path)
        return self.pm.analyze_image(image_bytes, _CAPTION_PROMPT, max_tokens=256)

    def extract_screenshot_knowledge(self, image_path: Union[str, Path]) -> dict:
        """Extract knowledge from a screenshot — text, entities, topics.

        Returns the parsed JSON dict, or an empty dict when the model
        response cannot be parsed.
        """
        image_bytes = _read_image_bytes(image_path)
        raw = self.pm.analyze_image(image_bytes, _SCREENSHOT_EXTRACT_PROMPT, max_tokens=2048)
        result = _parse_json_response(raw)
        return result or {}

    def process_frames(
        self,
        frame_paths: List[Union[str, Path]],
        diagrams_dir: Optional[Path] = None,
        captures_dir: Optional[Path] = None,
        cache_dir: Optional[Path] = None,
    ) -> Tuple[List[DiagramResult], List[ScreenCapture]]:
        """
        Process a list of extracted frames: classify, analyze diagrams, screengrab fallback.

        Classification and analysis run in parallel using a thread pool. Results are
        cached by frame content hash so re-runs skip already-analyzed frames.

        Thresholds:
        - confidence >= 0.7 → full diagram analysis (story 3.2)
        - 0.3 <= confidence < 0.7 → screengrab fallback (story 3.3)
        - confidence < 0.3 → skip

        Args:
            frame_paths: Frame image files, indexed positionally.
            diagrams_dir: Where diagram artifacts (.jpg/.mermaid/.json) go.
            captures_dir: Where screengrab artifacts (.jpg/.json) go.
            cache_dir: Explicit cache location; defaults to diagrams_dir's
                parent when omitted.

        Returns (diagrams, screen_captures).
        """
        # Set up cache — lives next to diagrams_dir unless cache_dir is given.
        cache_path = None
        if cache_dir:
            cache_path = cache_dir / "frame_analysis_cache.json"
        elif diagrams_dir:
            cache_path = diagrams_dir.parent / "frame_analysis_cache.json"
        cache = _FrameCache(cache_path)

        frame_paths = [Path(fp) for fp in frame_paths]

        # --- Phase 1: Parallel classification ---
        classifications: dict[int, dict] = {}
        cache_hits = 0

        # Worker: returns (frame index, classification dict, served-from-cache).
        # Failures degrade to a zero-confidence result so the frame is skipped.
        # NOTE(review): cache.set is called concurrently from worker threads;
        # this relies on CPython dict assignment being atomic — confirm acceptable.
        def _classify_one(idx: int, fp: Path) -> Tuple[int, dict, bool]:
            fhash = _frame_hash(fp)
            cached = cache.get(f"classify:{fhash}")
            if cached is not None:
                return idx, cached, True
            try:
                result = self.classify_frame(fp)
            except Exception as e:
                logger.warning(f"Classification failed for frame {idx}: {e}")
                result = {"is_diagram": False, "confidence": 0.0}
            cache.set(f"classify:{fhash}", result)
            return idx, result, False

        workers = min(self.max_workers, len(frame_paths)) if frame_paths else 1
        with ThreadPoolExecutor(max_workers=workers) as pool:
            futures = {pool.submit(_classify_one, i, fp): i for i, fp in enumerate(frame_paths)}
            pbar = tqdm(
                as_completed(futures),
                total=len(futures),
                desc="Classifying frames",
                unit="frame",
            )
            for future in pbar:
                idx, result, from_cache = future.result()
                classifications[idx] = result
                if from_cache:
                    cache_hits += 1

        if cache_hits:
            logger.info(f"Classification: {cache_hits}/{len(frame_paths)} from cache")

        # --- Phase 2: Parallel analysis/extraction for qualifying frames ---
        # Bucket frames by confidence; sorted() keeps frame order deterministic.
        high_conf: List[Tuple[int, Path, dict]] = []  # (idx, fp, classification)
        med_conf: List[Tuple[int, Path, dict]] = []

        for idx in sorted(classifications):
            conf = float(classifications[idx].get("confidence", 0.0))
            if conf >= 0.7:
                high_conf.append((idx, frame_paths[idx], classifications[idx]))
            elif conf >= self.confidence_threshold:
                med_conf.append((idx, frame_paths[idx], classifications[idx]))

        # Analyze high-confidence diagrams in parallel
        analysis_results: dict[int, dict] = {}

        # Worker: full diagram analysis; failures yield {} and trigger the
        # screengrab fallback in Phase 3.
        def _analyze_one(idx: int, fp: Path) -> Tuple[int, dict, bool]:
            fhash = _frame_hash(fp)
            cached = cache.get(f"analyze:{fhash}")
            if cached is not None:
                return idx, cached, True
            try:
                result = self.analyze_diagram_single_pass(fp)
            except Exception as e:
                logger.warning(f"Diagram analysis failed for frame {idx}: {e}")
                result = {}
            cache.set(f"analyze:{fhash}", result)
            return idx, result, False

        if high_conf:
            workers = min(self.max_workers, len(high_conf))
            with ThreadPoolExecutor(max_workers=workers) as pool:
                futures = {pool.submit(_analyze_one, idx, fp): idx for idx, fp, _ in high_conf}
                pbar = tqdm(
                    as_completed(futures),
                    total=len(futures),
                    desc="Analyzing diagrams",
                    unit="diagram",
                )
                for future in pbar:
                    idx, result, _ = future.result()
                    analysis_results[idx] = result

        # Extract knowledge from medium-confidence frames in parallel
        extraction_results: dict[int, dict] = {}

        # Worker: screenshot knowledge extraction; failures yield {}.
        def _extract_one(idx: int, fp: Path) -> Tuple[int, dict, bool]:
            fhash = _frame_hash(fp)
            cached = cache.get(f"extract:{fhash}")
            if cached is not None:
                return idx, cached, True
            try:
                result = self.extract_screenshot_knowledge(fp)
            except Exception as e:
                logger.warning(f"Screenshot extraction failed for frame {idx}: {e}")
                result = {}
            cache.set(f"extract:{fhash}", result)
            return idx, result, False

        if med_conf:
            workers = min(self.max_workers, len(med_conf))
            with ThreadPoolExecutor(max_workers=workers) as pool:
                futures = {pool.submit(_extract_one, idx, fp): idx for idx, fp, _ in med_conf}
                pbar = tqdm(
                    as_completed(futures),
                    total=len(futures),
                    desc="Extracting screenshots",
                    unit="capture",
                )
                for future in pbar:
                    idx, result, _ = future.result()
                    extraction_results[idx] = result

        # --- Phase 3: Build results (sequential for stable ordering) ---
        diagrams: List[DiagramResult] = []
        captures: List[ScreenCapture] = []
        diagram_idx = 0
        capture_idx = 0

        for idx, fp, classification in high_conf:
            analysis = analysis_results.get(idx, {})
            confidence = float(classification.get("confidence", 0.0))

            if not analysis:
                # Analysis failed — fall back to screengrab with pre-fetched extraction
                extraction = extraction_results.get(idx)
                if extraction is None:
                    # Wasn't in med_conf, need to extract now
                    try:
                        extraction = self.extract_screenshot_knowledge(fp)
                    except Exception:
                        extraction = {}
                capture = self._build_screengrab(
                    fp, idx, capture_idx, captures_dir, confidence, extraction
                )
                captures.append(capture)
                capture_idx += 1
                continue

            dr = self._build_diagram_result(
                idx, fp, diagram_idx, diagrams_dir, confidence, classification, analysis
            )
            if dr:
                diagrams.append(dr)
                diagram_idx += 1
            else:
                # Model validation failed — keep the frame as a bare screengrab.
                capture = self._build_screengrab(fp, idx, capture_idx, captures_dir, confidence, {})
                captures.append(capture)
                capture_idx += 1

        for idx, fp, classification in med_conf:
            confidence = float(classification.get("confidence", 0.0))
            extraction = extraction_results.get(idx, {})
            logger.info(
                f"Frame {idx}: uncertain (confidence {confidence:.2f}), saving as screengrab"
            )
            capture = self._build_screengrab(
                fp, idx, capture_idx, captures_dir, confidence, extraction
            )
            captures.append(capture)
            capture_idx += 1

        # Save cache
        cache.save()

        logger.info(
            f"Diagram processing complete: {len(diagrams)} diagrams, {len(captures)} screengrabs"
        )
        return diagrams, captures

    def _build_diagram_result(
        self,
        frame_index: int,
        frame_path: Path,
        diagram_idx: int,
        diagrams_dir: Optional[Path],
        confidence: float,
        classification: dict,
        analysis: dict,
    ) -> Optional[DiagramResult]:
        """Build a DiagramResult from analysis data. Returns None on validation failure.

        When diagrams_dir is set, also writes the frame image, optional
        .mermaid file, and a .json dump alongside it, and records relative
        paths ("diagrams/...") on the result.
        """
        # Prefer the analysis's type; fall back to the classifier's guess.
        dtype = analysis.get("diagram_type", classification.get("diagram_type", "unknown"))
        try:
            diagram_type = DiagramType(dtype)
        except ValueError:
            # Model invented a type outside the enum — degrade to unknown.
            diagram_type = DiagramType.unknown

        relationships = _normalize_relationships(analysis.get("relationships") or [])
        elements = _normalize_elements(analysis.get("elements") or [])
        text_content = _normalize_text_content(analysis.get("text_content"))

        try:
            dr = DiagramResult(
                frame_index=frame_index,
                diagram_type=diagram_type,
                confidence=confidence,
                description=analysis.get("description"),
                text_content=text_content,
                elements=elements,
                relationships=relationships,
                mermaid=analysis.get("mermaid"),
                chart_data=analysis.get("chart_data"),
            )
        except Exception as e:
            logger.warning(f"DiagramResult validation failed for frame {frame_index}: {e}")
            return None

        if diagrams_dir:
            diagrams_dir.mkdir(parents=True, exist_ok=True)
            prefix = f"diagram_{diagram_idx}"
            img_dest = diagrams_dir / f"{prefix}.jpg"
            shutil.copy2(frame_path, img_dest)
            dr.image_path = f"diagrams/{prefix}.jpg"
            if dr.mermaid:
                mermaid_dest = diagrams_dir / f"{prefix}.mermaid"
                mermaid_dest.write_text(dr.mermaid)
                dr.mermaid_path = f"diagrams/{prefix}.mermaid"
            json_dest = diagrams_dir / f"{prefix}.json"
            json_dest.write_text(dr.model_dump_json(indent=2))

        return dr

    def _build_screengrab(
        self,
        frame_path: Path,
        frame_index: int,
        capture_index: int,
        captures_dir: Optional[Path],
        confidence: float,
        extraction: dict,
    ) -> ScreenCapture:
        """Build a ScreenCapture from extraction data.

        An empty *extraction* dict yields a capture with empty/None fields.
        When captures_dir is set, copies the frame image and writes a .json
        dump, recording relative paths ("captures/...") on the result.
        """
        caption = extraction.get("caption", "")
        content_type = extraction.get("content_type")
        text_content = extraction.get("text_content")
        # Guard against the model returning non-list shapes for entities/topics.
        raw_entities = extraction.get("entities", [])
        entities = [str(e) for e in raw_entities] if isinstance(raw_entities, list) else []
        raw_topics = extraction.get("topics", [])
        topics = [str(t) for t in raw_topics] if isinstance(raw_topics, list) else []

        if extraction:
            logger.info(
                f"Frame {frame_index}: extracted "
                f"{len(entities)} entities, "
                f"{len(topics)} topics from {content_type}"
            )

        sc = ScreenCapture(
            frame_index=frame_index,
            caption=caption,
            confidence=confidence,
            content_type=content_type,
            text_content=text_content,
            entities=entities,
            topics=topics,
        )

        if captures_dir:
            captures_dir.mkdir(parents=True, exist_ok=True)
            prefix = f"capture_{capture_index}"
            img_dest = captures_dir / f"{prefix}.jpg"
            shutil.copy2(frame_path, img_dest)
            sc.image_path = f"captures/{prefix}.jpg"
            json_dest = captures_dir / f"{prefix}.json"
            json_dest.write_text(sc.model_dump_json(indent=2))

        return sc

    def _save_screengrab(
        self,
        frame_path: Path,
        frame_index: int,
        capture_index: int,
        captures_dir: Optional[Path],
        confidence: float,
    ) -> ScreenCapture:
        """Legacy entry point — extracts then delegates to _build_screengrab."""
        try:
            extraction = self.extract_screenshot_knowledge(frame_path)
        except Exception as e:
            logger.warning(f"Screenshot extraction failed for frame {frame_index}: {e}")
            extraction = {}
        return self._build_screengrab(
            frame_path, frame_index, capture_index, captures_dir, confidence, extraction
        )
6febc3f… noreply 511
6febc3f… noreply 512
6febc3f… noreply 513 def _normalize_relationships(raw_rels: list) -> List[str]:
6febc3f… noreply 514 """Normalize relationships: llava sometimes returns dicts instead of strings."""
6febc3f… noreply 515 relationships = []
6febc3f… noreply 516 for rel in raw_rels:
6febc3f… noreply 517 if isinstance(rel, str):
6febc3f… noreply 518 relationships.append(rel)
6febc3f… noreply 519 elif isinstance(rel, dict):
6febc3f… noreply 520 src = rel.get("source", rel.get("from", "?"))
6febc3f… noreply 521 dst = rel.get("destination", rel.get("to", "?"))
6febc3f… noreply 522 label = rel.get("label", rel.get("relationship", ""))
6febc3f… noreply 523 relationships.append(f"{src} -> {dst}: {label}" if label else f"{src} -> {dst}")
6febc3f… noreply 524 else:
6febc3f… noreply 525 relationships.append(str(rel))
6febc3f… noreply 526 return relationships
6febc3f… noreply 527
6febc3f… noreply 528
6febc3f… noreply 529 def _normalize_elements(raw_elements: list) -> List[str]:
6febc3f… noreply 530 """Normalize elements: llava may return dicts or nested lists."""
6febc3f… noreply 531 elements = []
6febc3f… noreply 532 for elem in raw_elements:
6febc3f… noreply 533 if isinstance(elem, str):
6febc3f… noreply 534 elements.append(elem)
6febc3f… noreply 535 elif isinstance(elem, dict):
6febc3f… noreply 536 name = elem.get("name", elem.get("element", ""))
6febc3f… noreply 537 etype = elem.get("type", elem.get("element_type", ""))
6febc3f… noreply 538 if name and etype:
6febc3f… noreply 539 elements.append(f"{etype}: {name}")
6febc3f… noreply 540 elif name:
6febc3f… noreply 541 elements.append(name)
6febc3f… noreply 542 else:
6febc3f… noreply 543 elements.append(json.dumps(elem))
6febc3f… noreply 544 elif isinstance(elem, list):
6febc3f… noreply 545 elements.extend(str(e) for e in elem)
6febc3f… noreply 546 else:
6febc3f… noreply 547 elements.append(str(elem))
6febc3f… noreply 548 return elements
6febc3f… noreply 549
6febc3f… noreply 550
6febc3f… noreply 551 def _normalize_text_content(raw_text) -> Optional[str]:
6febc3f… noreply 552 """Normalize text_content: llava may return dict instead of string."""
6febc3f… noreply 553 if isinstance(raw_text, dict):
6febc3f… noreply 554 parts = []
6febc3f… noreply 555 for k, v in raw_text.items():
6febc3f… noreply 556 if isinstance(v, list):
6febc3f… noreply 557 parts.append(f"{k}: {', '.join(str(x) for x in v)}")
6febc3f… noreply 558 else:
6febc3f… noreply 559 parts.append(f"{k}: {v}")
6febc3f… noreply 560 return "\n".join(parts)
6febc3f… noreply 561 elif isinstance(raw_text, list):
6febc3f… noreply 562 return "\n".join(str(x) for x in raw_text)
6febc3f… noreply 563 return raw_text

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button