|
ccf1b1a…
|
leo
|
1 |
"""Diagram analysis using vision model classification and single-pass extraction.""" |
|
ccf1b1a…
|
leo
|
2 |
|
|
6febc3f…
|
noreply
|
3 |
import hashlib |
|
ccf1b1a…
|
leo
|
4 |
import json |
|
ccf1b1a…
|
leo
|
5 |
import logging |
|
ccf1b1a…
|
leo
|
6 |
import shutil |
|
6febc3f…
|
noreply
|
7 |
from concurrent.futures import ThreadPoolExecutor, as_completed |
|
ccf1b1a…
|
leo
|
8 |
from pathlib import Path |
|
ccf1b1a…
|
leo
|
9 |
from typing import List, Optional, Tuple, Union |
|
287a3bb…
|
leo
|
10 |
|
|
287a3bb…
|
leo
|
11 |
from tqdm import tqdm |
|
ccf1b1a…
|
leo
|
12 |
|
|
ccf1b1a…
|
leo
|
13 |
from video_processor.models import DiagramResult, DiagramType, ScreenCapture |
|
ccf1b1a…
|
leo
|
14 |
from video_processor.providers.manager import ProviderManager |
|
ccf1b1a…
|
leo
|
15 |
|
|
ccf1b1a…
|
leo
|
16 |
# Module-level logger; configured by the host application.
logger = logging.getLogger(__name__)

# Default max workers for parallel frame analysis (thread pool size used by
# DiagramAnalyzer when the caller does not override max_workers).
_DEFAULT_MAX_WORKERS = 4
|
287a3bb…
|
leo
|
20 |
|
|
ccf1b1a…
|
leo
|
21 |
# Classification prompt — asks the vision model whether the frame contains
# shared content (vs. people/webcam views) and returns a JSON verdict.
_CLASSIFY_PROMPT = """\
Examine this image from a video recording. Your job is to identify ONLY shared content \
— slides, presentations, charts, diagrams, documents, screen shares, whiteboard content, \
architecture drawings, tables, or other structured visual information worth capturing.

IMPORTANT: If the image primarily shows a person, people, webcam feeds, faces, or a \
video conference participant view, return confidence 0.0. We are ONLY interested in \
shared/presented content, NOT people or camera views.

Return ONLY a JSON object (no markdown fences):
{
"is_diagram": true/false,
"diagram_type": "flowchart"|"sequence"|"architecture"
|"whiteboard"|"chart"|"table"|"slide"|"screenshot"|"unknown",
"confidence": 0.0 to 1.0,
"content_type": "slide"|"diagram"|"document"|"screen_share"|"whiteboard"|"chart"|"person"|"other",
"brief_description": "one-sentence description of what you see"
}
"""
|
ccf1b1a…
|
leo
|
41 |
|
|
ccf1b1a…
|
leo
|
42 |
# Single-pass analysis prompt — extracts everything (description, text,
# elements, relationships, mermaid, chart data) in one model call.
_ANALYSIS_PROMPT = """\
Analyze this diagram/visual content comprehensively. Extract ALL of the
following in a single JSON response (no markdown fences):
{
"diagram_type": "flowchart"|"sequence"|"architecture"
|"whiteboard"|"chart"|"table"|"slide"|"screenshot"|"unknown",
"description": "detailed description of the visual content",
"text_content": "all visible text, preserving structure",
"elements": ["list", "of", "identified", "elements/components"],
"relationships": ["element A -> element B: relationship", ...],
"mermaid": "mermaid diagram syntax representing this visual (graph LR, sequenceDiagram, etc.)",
"chart_data": null or {"labels": [...], "values": [...], "chart_type": "bar|line|pie|scatter"}
}

For the mermaid field: generate valid mermaid syntax that best represents the visual structure.
For chart_data: only populate if this is a chart/graph with extractable numeric data.
If any field cannot be determined, use null or empty list.
"""
|
ccf1b1a…
|
leo
|
61 |
|
|
ccf1b1a…
|
leo
|
62 |
# Caption prompt for the screengrab fallback path (frames kept without a
# full diagram analysis).
_CAPTION_PROMPT = "Briefly describe what this image shows in 1-2 sentences."
|
ccf1b1a…
|
leo
|
64 |
|
|
2a1b11a…
|
noreply
|
65 |
# Rich screenshot extraction prompt — extracts knowledge (text, entities,
# topics) from shared-screen frames that are not full diagrams.
_SCREENSHOT_EXTRACT_PROMPT = """\
Analyze this screenshot from a video recording. Extract all visible knowledge.
This is shared screen content (slides, code, documents, browser, terminal, etc.).

Return ONLY a JSON object (no markdown fences):
{
"content_type": "slide"|"code"|"document"|"terminal"|"browser"|"chat"|"other",
"caption": "one-sentence description of what is shown",
"text_content": "all visible text, preserving structure and line breaks",
"entities": ["named things visible: people, technologies, tools, services, \
projects, libraries, APIs, error codes, URLs, file paths"],
"topics": ["concepts or subjects this content is about"]
}

For text_content: extract ALL readable text — code, titles, bullet points, URLs,
error messages, terminal output, chat messages, file names. Be thorough.
For entities: extract specific named things, not generic words.
For topics: extract 2-5 high-level topics this content relates to.
"""
|
2a1b11a…
|
noreply
|
85 |
|
|
ccf1b1a…
|
leo
|
86 |
|
|
ccf1b1a…
|
leo
|
87 |
def _read_image_bytes(image_path: Union[str, Path]) -> bytes: |
|
ccf1b1a…
|
leo
|
88 |
"""Read image file as bytes.""" |
|
ccf1b1a…
|
leo
|
89 |
return Path(image_path).read_bytes() |
|
ccf1b1a…
|
leo
|
90 |
|
|
ccf1b1a…
|
leo
|
91 |
|
|
ccf1b1a…
|
leo
|
92 |
def _parse_json_response(text: str) -> Optional[dict]: |
|
ccf1b1a…
|
leo
|
93 |
"""Try to parse JSON from an LLM response, handling markdown fences.""" |
|
ccf1b1a…
|
leo
|
94 |
if not text: |
|
ccf1b1a…
|
leo
|
95 |
return None |
|
ccf1b1a…
|
leo
|
96 |
# Strip markdown fences |
|
ccf1b1a…
|
leo
|
97 |
cleaned = text.strip() |
|
ccf1b1a…
|
leo
|
98 |
if cleaned.startswith("```"): |
|
ccf1b1a…
|
leo
|
99 |
lines = cleaned.split("\n") |
|
ccf1b1a…
|
leo
|
100 |
# Remove first and last fence lines |
|
829e24a…
|
leo
|
101 |
lines = [line for line in lines if not line.strip().startswith("```")] |
|
ccf1b1a…
|
leo
|
102 |
try: |
|
ccf1b1a…
|
leo
|
103 |
return json.loads(cleaned) |
|
ccf1b1a…
|
leo
|
104 |
except json.JSONDecodeError: |
|
ccf1b1a…
|
leo
|
105 |
# Try to find JSON object in the text |
|
ccf1b1a…
|
leo
|
106 |
start = cleaned.find("{") |
|
ccf1b1a…
|
leo
|
107 |
end = cleaned.rfind("}") + 1 |
|
ccf1b1a…
|
leo
|
108 |
if start >= 0 and end > start: |
|
ccf1b1a…
|
leo
|
109 |
try: |
|
ccf1b1a…
|
leo
|
110 |
return json.loads(cleaned[start:end]) |
|
ccf1b1a…
|
leo
|
111 |
except json.JSONDecodeError: |
|
ccf1b1a…
|
leo
|
112 |
pass |
|
ccf1b1a…
|
leo
|
113 |
return None |
|
ccf1b1a…
|
leo
|
114 |
|
|
ccf1b1a…
|
leo
|
115 |
|
|
6febc3f…
|
noreply
|
116 |
def _frame_hash(path: Path) -> str: |
|
6febc3f…
|
noreply
|
117 |
"""Content-based hash for a frame file (first 8KB + size for speed).""" |
|
6febc3f…
|
noreply
|
118 |
h = hashlib.sha256() |
|
6febc3f…
|
noreply
|
119 |
h.update(str(path.stat().st_size).encode()) |
|
6febc3f…
|
noreply
|
120 |
with open(path, "rb") as f: |
|
6febc3f…
|
noreply
|
121 |
h.update(f.read(8192)) |
|
6febc3f…
|
noreply
|
122 |
return h.hexdigest()[:16] |
|
6febc3f…
|
noreply
|
123 |
|
|
6febc3f…
|
noreply
|
124 |
|
|
6febc3f…
|
noreply
|
125 |
class _FrameCache: |
|
6febc3f…
|
noreply
|
126 |
"""Simple JSON file cache for frame classification/analysis results.""" |
|
6febc3f…
|
noreply
|
127 |
|
|
6febc3f…
|
noreply
|
128 |
def __init__(self, cache_path: Optional[Path]): |
|
6febc3f…
|
noreply
|
129 |
self._path = cache_path |
|
6febc3f…
|
noreply
|
130 |
self._data: dict = {} |
|
6febc3f…
|
noreply
|
131 |
if cache_path and cache_path.exists(): |
|
6febc3f…
|
noreply
|
132 |
try: |
|
6febc3f…
|
noreply
|
133 |
self._data = json.loads(cache_path.read_text()) |
|
6febc3f…
|
noreply
|
134 |
except (json.JSONDecodeError, OSError): |
|
6febc3f…
|
noreply
|
135 |
self._data = {} |
|
6febc3f…
|
noreply
|
136 |
|
|
6febc3f…
|
noreply
|
137 |
def get(self, key: str) -> Optional[dict]: |
|
6febc3f…
|
noreply
|
138 |
return self._data.get(key) |
|
6febc3f…
|
noreply
|
139 |
|
|
6febc3f…
|
noreply
|
140 |
def set(self, key: str, value: dict) -> None: |
|
6febc3f…
|
noreply
|
141 |
self._data[key] = value |
|
6febc3f…
|
noreply
|
142 |
|
|
6febc3f…
|
noreply
|
143 |
def save(self) -> None: |
|
6febc3f…
|
noreply
|
144 |
if self._path: |
|
6febc3f…
|
noreply
|
145 |
self._path.parent.mkdir(parents=True, exist_ok=True) |
|
6febc3f…
|
noreply
|
146 |
self._path.write_text(json.dumps(self._data, indent=2)) |
|
6febc3f…
|
noreply
|
147 |
|
|
6febc3f…
|
noreply
|
148 |
|
|
ccf1b1a…
|
leo
|
149 |
class DiagramAnalyzer:
    """Vision-model-based detection and analysis of diagram/screen-share frames."""

    def __init__(
        self,
        provider_manager: Optional[ProviderManager] = None,
        confidence_threshold: float = 0.3,
        max_workers: int = _DEFAULT_MAX_WORKERS,
    ):
        """
        Args:
            provider_manager: vision/LLM provider facade; a fresh
                ProviderManager is constructed when omitted.
            confidence_threshold: minimum classification confidence for a
                frame to be kept at all (screengrab fallback floor).
            max_workers: thread-pool size for parallel frame analysis.
        """
        self.confidence_threshold = confidence_threshold
        self.max_workers = max_workers
        self.pm = provider_manager or ProviderManager()
|
ccf1b1a…
|
leo
|
161 |
|
|
ccf1b1a…
|
leo
|
162 |
def classify_frame(self, image_path: Union[str, Path]) -> dict: |
|
ccf1b1a…
|
leo
|
163 |
""" |
|
ccf1b1a…
|
leo
|
164 |
Classify a single frame using vision model. |
|
ccf1b1a…
|
leo
|
165 |
|
|
ccf1b1a…
|
leo
|
166 |
Returns dict with is_diagram, diagram_type, confidence, brief_description. |
|
ccf1b1a…
|
leo
|
167 |
""" |
|
ccf1b1a…
|
leo
|
168 |
image_bytes = _read_image_bytes(image_path) |
|
ccf1b1a…
|
leo
|
169 |
raw = self.pm.analyze_image(image_bytes, _CLASSIFY_PROMPT, max_tokens=512) |
|
ccf1b1a…
|
leo
|
170 |
result = _parse_json_response(raw) |
|
ccf1b1a…
|
leo
|
171 |
if result is None: |
|
829e24a…
|
leo
|
172 |
return { |
|
829e24a…
|
leo
|
173 |
"is_diagram": False, |
|
829e24a…
|
leo
|
174 |
"diagram_type": "unknown", |
|
829e24a…
|
leo
|
175 |
"confidence": 0.0, |
|
829e24a…
|
leo
|
176 |
"brief_description": "", |
|
829e24a…
|
leo
|
177 |
} |
|
ccf1b1a…
|
leo
|
178 |
return result |
|
ccf1b1a…
|
leo
|
179 |
|
|
ccf1b1a…
|
leo
|
180 |
def analyze_diagram_single_pass(self, image_path: Union[str, Path]) -> dict: |
|
ccf1b1a…
|
leo
|
181 |
""" |
|
ccf1b1a…
|
leo
|
182 |
Full single-pass diagram analysis — description, text, mermaid, chart data. |
|
ccf1b1a…
|
leo
|
183 |
|
|
ccf1b1a…
|
leo
|
184 |
Returns parsed dict or empty dict on failure. |
|
ccf1b1a…
|
leo
|
185 |
""" |
|
ccf1b1a…
|
leo
|
186 |
image_bytes = _read_image_bytes(image_path) |
|
ccf1b1a…
|
leo
|
187 |
raw = self.pm.analyze_image(image_bytes, _ANALYSIS_PROMPT, max_tokens=4096) |
|
ccf1b1a…
|
leo
|
188 |
result = _parse_json_response(raw) |
|
ccf1b1a…
|
leo
|
189 |
return result or {} |
|
ccf1b1a…
|
leo
|
190 |
|
|
ccf1b1a…
|
leo
|
191 |
def caption_frame(self, image_path: Union[str, Path]) -> str: |
|
ccf1b1a…
|
leo
|
192 |
"""Get a brief caption for a screengrab fallback.""" |
|
ccf1b1a…
|
leo
|
193 |
image_bytes = _read_image_bytes(image_path) |
|
ccf1b1a…
|
leo
|
194 |
return self.pm.analyze_image(image_bytes, _CAPTION_PROMPT, max_tokens=256) |
|
ccf1b1a…
|
leo
|
195 |
|
|
2a1b11a…
|
noreply
|
196 |
def extract_screenshot_knowledge(self, image_path: Union[str, Path]) -> dict: |
|
2a1b11a…
|
noreply
|
197 |
"""Extract knowledge from a screenshot — text, entities, topics.""" |
|
2a1b11a…
|
noreply
|
198 |
image_bytes = _read_image_bytes(image_path) |
|
2a1b11a…
|
noreply
|
199 |
raw = self.pm.analyze_image(image_bytes, _SCREENSHOT_EXTRACT_PROMPT, max_tokens=2048) |
|
2a1b11a…
|
noreply
|
200 |
result = _parse_json_response(raw) |
|
2a1b11a…
|
noreply
|
201 |
return result or {} |
|
2a1b11a…
|
noreply
|
202 |
|
|
ccf1b1a…
|
leo
|
203 |
    def process_frames(
        self,
        frame_paths: List[Union[str, Path]],
        diagrams_dir: Optional[Path] = None,
        captures_dir: Optional[Path] = None,
        cache_dir: Optional[Path] = None,
    ) -> Tuple[List[DiagramResult], List[ScreenCapture]]:
        """
        Process a list of extracted frames: classify, analyze diagrams, screengrab fallback.

        Classification and analysis run in parallel using a thread pool. Results are
        cached by frame content hash so re-runs skip already-analyzed frames.

        Thresholds:
        - confidence >= 0.7 → full diagram analysis (story 3.2)
        - 0.3 <= confidence < 0.7 → screengrab fallback (story 3.3)
        - confidence < 0.3 → skip

        Args:
            frame_paths: image files to process, indexed in frame order.
            diagrams_dir: where diagram artifacts (.jpg/.mermaid/.json) are written.
            captures_dir: where screengrab artifacts are written.
            cache_dir: explicit cache location; otherwise the cache is
                co-located next to diagrams_dir when one is given.

        Returns (diagrams, screen_captures).
        """
        # Set up cache
        cache_path = None
        if cache_dir:
            cache_path = cache_dir / "frame_analysis_cache.json"
        elif diagrams_dir:
            cache_path = diagrams_dir.parent / "frame_analysis_cache.json"
        cache = _FrameCache(cache_path)

        frame_paths = [Path(fp) for fp in frame_paths]

        # --- Phase 1: Parallel classification ---
        classifications: dict[int, dict] = {}
        cache_hits = 0

        def _classify_one(idx: int, fp: Path) -> Tuple[int, dict, bool]:
            # Returns (frame index, classification dict, served-from-cache flag).
            fhash = _frame_hash(fp)
            cached = cache.get(f"classify:{fhash}")
            if cached is not None:
                return idx, cached, True
            try:
                result = self.classify_frame(fp)
            except Exception as e:
                # One bad frame must not abort the batch — treat as "not a diagram".
                logger.warning(f"Classification failed for frame {idx}: {e}")
                result = {"is_diagram": False, "confidence": 0.0}
            cache.set(f"classify:{fhash}", result)
            return idx, result, False

        workers = min(self.max_workers, len(frame_paths)) if frame_paths else 1
        with ThreadPoolExecutor(max_workers=workers) as pool:
            futures = {pool.submit(_classify_one, i, fp): i for i, fp in enumerate(frame_paths)}
            pbar = tqdm(
                as_completed(futures),
                total=len(futures),
                desc="Classifying frames",
                unit="frame",
            )
            for future in pbar:
                idx, result, from_cache = future.result()
                classifications[idx] = result
                if from_cache:
                    cache_hits += 1

        if cache_hits:
            logger.info(f"Classification: {cache_hits}/{len(frame_paths)} from cache")

        # --- Phase 2: Parallel analysis/extraction for qualifying frames ---
        # Split frames into the two work lists by confidence band.
        high_conf = []  # (idx, fp, classification)
        med_conf = []

        for idx in sorted(classifications):
            conf = float(classifications[idx].get("confidence", 0.0))
            if conf >= 0.7:
                high_conf.append((idx, frame_paths[idx], classifications[idx]))
            elif conf >= self.confidence_threshold:
                med_conf.append((idx, frame_paths[idx], classifications[idx]))

        # Analyze high-confidence diagrams in parallel
        analysis_results: dict[int, dict] = {}

        def _analyze_one(idx: int, fp: Path) -> Tuple[int, dict, bool]:
            # Full single-pass diagram analysis; an empty dict marks failure.
            fhash = _frame_hash(fp)
            cached = cache.get(f"analyze:{fhash}")
            if cached is not None:
                return idx, cached, True
            try:
                result = self.analyze_diagram_single_pass(fp)
            except Exception as e:
                logger.warning(f"Diagram analysis failed for frame {idx}: {e}")
                result = {}
            cache.set(f"analyze:{fhash}", result)
            return idx, result, False

        if high_conf:
            workers = min(self.max_workers, len(high_conf))
            with ThreadPoolExecutor(max_workers=workers) as pool:
                futures = {pool.submit(_analyze_one, idx, fp): idx for idx, fp, _ in high_conf}
                pbar = tqdm(
                    as_completed(futures),
                    total=len(futures),
                    desc="Analyzing diagrams",
                    unit="diagram",
                )
                for future in pbar:
                    idx, result, _ = future.result()
                    analysis_results[idx] = result

        # Extract knowledge from medium-confidence frames in parallel
        extraction_results: dict[int, dict] = {}

        def _extract_one(idx: int, fp: Path) -> Tuple[int, dict, bool]:
            # Screenshot knowledge extraction; an empty dict marks failure.
            fhash = _frame_hash(fp)
            cached = cache.get(f"extract:{fhash}")
            if cached is not None:
                return idx, cached, True
            try:
                result = self.extract_screenshot_knowledge(fp)
            except Exception as e:
                logger.warning(f"Screenshot extraction failed for frame {idx}: {e}")
                result = {}
            cache.set(f"extract:{fhash}", result)
            return idx, result, False

        if med_conf:
            workers = min(self.max_workers, len(med_conf))
            with ThreadPoolExecutor(max_workers=workers) as pool:
                futures = {pool.submit(_extract_one, idx, fp): idx for idx, fp, _ in med_conf}
                pbar = tqdm(
                    as_completed(futures),
                    total=len(futures),
                    desc="Extracting screenshots",
                    unit="capture",
                )
                for future in pbar:
                    idx, result, _ = future.result()
                    extraction_results[idx] = result

        # --- Phase 3: Build results (sequential for stable ordering) ---
        diagrams: List[DiagramResult] = []
        captures: List[ScreenCapture] = []
        diagram_idx = 0
        capture_idx = 0

        for idx, fp, classification in high_conf:
            analysis = analysis_results.get(idx, {})
            confidence = float(classification.get("confidence", 0.0))

            if not analysis:
                # Analysis failed — fall back to screengrab with pre-fetched extraction
                extraction = extraction_results.get(idx)
                if extraction is None:
                    # Wasn't in med_conf, need to extract now
                    try:
                        extraction = self.extract_screenshot_knowledge(fp)
                    except Exception:
                        extraction = {}
                capture = self._build_screengrab(
                    fp, idx, capture_idx, captures_dir, confidence, extraction
                )
                captures.append(capture)
                capture_idx += 1
                continue

            dr = self._build_diagram_result(
                idx, fp, diagram_idx, diagrams_dir, confidence, classification, analysis
            )
            if dr:
                diagrams.append(dr)
                diagram_idx += 1
            else:
                # Validation failed — keep the frame as a plain screengrab.
                capture = self._build_screengrab(fp, idx, capture_idx, captures_dir, confidence, {})
                captures.append(capture)
                capture_idx += 1

        for idx, fp, classification in med_conf:
            confidence = float(classification.get("confidence", 0.0))
            extraction = extraction_results.get(idx, {})
            logger.info(
                f"Frame {idx}: uncertain (confidence {confidence:.2f}), saving as screengrab"
            )
            capture = self._build_screengrab(
                fp, idx, capture_idx, captures_dir, confidence, extraction
            )
            captures.append(capture)
            capture_idx += 1

        # Save cache
        cache.save()

        logger.info(
            f"Diagram processing complete: {len(diagrams)} diagrams, {len(captures)} screengrabs"
        )
        return diagrams, captures
|
2a1b11a…
|
noreply
|
395 |
|
|
6febc3f…
|
noreply
|
396 |
    def _build_diagram_result(
        self,
        frame_index: int,
        frame_path: Path,
        diagram_idx: int,
        diagrams_dir: Optional[Path],
        confidence: float,
        classification: dict,
        analysis: dict,
    ) -> Optional[DiagramResult]:
        """Build a DiagramResult from analysis data. Returns None on validation failure.

        When diagrams_dir is provided, also persists the frame image, the
        mermaid source (if any), and the result's JSON alongside it, storing
        output-root-relative paths on the result.
        """
        # Prefer the type from the detailed analysis; fall back to the classifier's.
        dtype = analysis.get("diagram_type", classification.get("diagram_type", "unknown"))
        try:
            diagram_type = DiagramType(dtype)
        except ValueError:
            # Model produced a type outside the enum — coerce to unknown.
            diagram_type = DiagramType.unknown

        # llava-style models sometimes return dicts/nested lists here; normalize
        # everything to flat lists of strings / a plain string first.
        relationships = _normalize_relationships(analysis.get("relationships") or [])
        elements = _normalize_elements(analysis.get("elements") or [])
        text_content = _normalize_text_content(analysis.get("text_content"))

        try:
            dr = DiagramResult(
                frame_index=frame_index,
                diagram_type=diagram_type,
                confidence=confidence,
                description=analysis.get("description"),
                text_content=text_content,
                elements=elements,
                relationships=relationships,
                mermaid=analysis.get("mermaid"),
                chart_data=analysis.get("chart_data"),
            )
        except Exception as e:
            # Model output that survives normalization can still fail schema
            # validation; the caller downgrades such frames to screengrabs.
            logger.warning(f"DiagramResult validation failed for frame {frame_index}: {e}")
            return None

        if diagrams_dir:
            diagrams_dir.mkdir(parents=True, exist_ok=True)
            prefix = f"diagram_{diagram_idx}"
            img_dest = diagrams_dir / f"{prefix}.jpg"
            shutil.copy2(frame_path, img_dest)
            # Paths are stored relative to the output root, not absolute.
            dr.image_path = f"diagrams/{prefix}.jpg"
            if dr.mermaid:
                mermaid_dest = diagrams_dir / f"{prefix}.mermaid"
                mermaid_dest.write_text(dr.mermaid)
                dr.mermaid_path = f"diagrams/{prefix}.mermaid"
            # JSON is written last so it captures the image/mermaid paths set above.
            json_dest = diagrams_dir / f"{prefix}.json"
            json_dest.write_text(dr.model_dump_json(indent=2))

        return dr
|
6febc3f…
|
noreply
|
447 |
|
|
6febc3f…
|
noreply
|
448 |
def _build_screengrab( |
|
2a1b11a…
|
noreply
|
449 |
self, |
|
2a1b11a…
|
noreply
|
450 |
frame_path: Path, |
|
2a1b11a…
|
noreply
|
451 |
frame_index: int, |
|
2a1b11a…
|
noreply
|
452 |
capture_index: int, |
|
2a1b11a…
|
noreply
|
453 |
captures_dir: Optional[Path], |
|
2a1b11a…
|
noreply
|
454 |
confidence: float, |
|
6febc3f…
|
noreply
|
455 |
extraction: dict, |
|
2a1b11a…
|
noreply
|
456 |
) -> ScreenCapture: |
|
6febc3f…
|
noreply
|
457 |
"""Build a ScreenCapture from extraction data.""" |
|
6febc3f…
|
noreply
|
458 |
caption = extraction.get("caption", "") |
|
6febc3f…
|
noreply
|
459 |
content_type = extraction.get("content_type") |
|
6febc3f…
|
noreply
|
460 |
text_content = extraction.get("text_content") |
|
6febc3f…
|
noreply
|
461 |
raw_entities = extraction.get("entities", []) |
|
6febc3f…
|
noreply
|
462 |
entities = [str(e) for e in raw_entities] if isinstance(raw_entities, list) else [] |
|
6febc3f…
|
noreply
|
463 |
raw_topics = extraction.get("topics", []) |
|
6febc3f…
|
noreply
|
464 |
topics = [str(t) for t in raw_topics] if isinstance(raw_topics, list) else [] |
|
6febc3f…
|
noreply
|
465 |
|
|
6febc3f…
|
noreply
|
466 |
if extraction: |
|
6febc3f…
|
noreply
|
467 |
logger.info( |
|
6febc3f…
|
noreply
|
468 |
f"Frame {frame_index}: extracted " |
|
6febc3f…
|
noreply
|
469 |
f"{len(entities)} entities, " |
|
6febc3f…
|
noreply
|
470 |
f"{len(topics)} topics from {content_type}" |
|
6febc3f…
|
noreply
|
471 |
) |
|
2a1b11a…
|
noreply
|
472 |
|
|
2a1b11a…
|
noreply
|
473 |
sc = ScreenCapture( |
|
2a1b11a…
|
noreply
|
474 |
frame_index=frame_index, |
|
2a1b11a…
|
noreply
|
475 |
caption=caption, |
|
2a1b11a…
|
noreply
|
476 |
confidence=confidence, |
|
2a1b11a…
|
noreply
|
477 |
content_type=content_type, |
|
2a1b11a…
|
noreply
|
478 |
text_content=text_content, |
|
2a1b11a…
|
noreply
|
479 |
entities=entities, |
|
2a1b11a…
|
noreply
|
480 |
topics=topics, |
|
2a1b11a…
|
noreply
|
481 |
) |
|
2a1b11a…
|
noreply
|
482 |
|
|
2a1b11a…
|
noreply
|
483 |
if captures_dir: |
|
2a1b11a…
|
noreply
|
484 |
captures_dir.mkdir(parents=True, exist_ok=True) |
|
2a1b11a…
|
noreply
|
485 |
prefix = f"capture_{capture_index}" |
|
2a1b11a…
|
noreply
|
486 |
img_dest = captures_dir / f"{prefix}.jpg" |
|
2a1b11a…
|
noreply
|
487 |
shutil.copy2(frame_path, img_dest) |
|
2a1b11a…
|
noreply
|
488 |
sc.image_path = f"captures/{prefix}.jpg" |
|
2a1b11a…
|
noreply
|
489 |
json_dest = captures_dir / f"{prefix}.json" |
|
2a1b11a…
|
noreply
|
490 |
json_dest.write_text(sc.model_dump_json(indent=2)) |
|
2a1b11a…
|
noreply
|
491 |
|
|
2a1b11a…
|
noreply
|
492 |
return sc |
|
6febc3f…
|
noreply
|
493 |
|
|
6febc3f…
|
noreply
|
494 |
def _save_screengrab( |
|
6febc3f…
|
noreply
|
495 |
self, |
|
6febc3f…
|
noreply
|
496 |
frame_path: Path, |
|
6febc3f…
|
noreply
|
497 |
frame_index: int, |
|
6febc3f…
|
noreply
|
498 |
capture_index: int, |
|
6febc3f…
|
noreply
|
499 |
captures_dir: Optional[Path], |
|
6febc3f…
|
noreply
|
500 |
confidence: float, |
|
6febc3f…
|
noreply
|
501 |
) -> ScreenCapture: |
|
6febc3f…
|
noreply
|
502 |
"""Legacy entry point — extracts then delegates to _build_screengrab.""" |
|
6febc3f…
|
noreply
|
503 |
try: |
|
6febc3f…
|
noreply
|
504 |
extraction = self.extract_screenshot_knowledge(frame_path) |
|
6febc3f…
|
noreply
|
505 |
except Exception as e: |
|
6febc3f…
|
noreply
|
506 |
logger.warning(f"Screenshot extraction failed for frame {frame_index}: {e}") |
|
6febc3f…
|
noreply
|
507 |
extraction = {} |
|
6febc3f…
|
noreply
|
508 |
return self._build_screengrab( |
|
6febc3f…
|
noreply
|
509 |
frame_path, frame_index, capture_index, captures_dir, confidence, extraction |
|
6febc3f…
|
noreply
|
510 |
) |
|
6febc3f…
|
noreply
|
511 |
|
|
6febc3f…
|
noreply
|
512 |
|
|
6febc3f…
|
noreply
|
513 |
def _normalize_relationships(raw_rels: list) -> List[str]: |
|
6febc3f…
|
noreply
|
514 |
"""Normalize relationships: llava sometimes returns dicts instead of strings.""" |
|
6febc3f…
|
noreply
|
515 |
relationships = [] |
|
6febc3f…
|
noreply
|
516 |
for rel in raw_rels: |
|
6febc3f…
|
noreply
|
517 |
if isinstance(rel, str): |
|
6febc3f…
|
noreply
|
518 |
relationships.append(rel) |
|
6febc3f…
|
noreply
|
519 |
elif isinstance(rel, dict): |
|
6febc3f…
|
noreply
|
520 |
src = rel.get("source", rel.get("from", "?")) |
|
6febc3f…
|
noreply
|
521 |
dst = rel.get("destination", rel.get("to", "?")) |
|
6febc3f…
|
noreply
|
522 |
label = rel.get("label", rel.get("relationship", "")) |
|
6febc3f…
|
noreply
|
523 |
relationships.append(f"{src} -> {dst}: {label}" if label else f"{src} -> {dst}") |
|
6febc3f…
|
noreply
|
524 |
else: |
|
6febc3f…
|
noreply
|
525 |
relationships.append(str(rel)) |
|
6febc3f…
|
noreply
|
526 |
return relationships |
|
6febc3f…
|
noreply
|
527 |
|
|
6febc3f…
|
noreply
|
528 |
|
|
6febc3f…
|
noreply
|
529 |
def _normalize_elements(raw_elements: list) -> List[str]: |
|
6febc3f…
|
noreply
|
530 |
"""Normalize elements: llava may return dicts or nested lists.""" |
|
6febc3f…
|
noreply
|
531 |
elements = [] |
|
6febc3f…
|
noreply
|
532 |
for elem in raw_elements: |
|
6febc3f…
|
noreply
|
533 |
if isinstance(elem, str): |
|
6febc3f…
|
noreply
|
534 |
elements.append(elem) |
|
6febc3f…
|
noreply
|
535 |
elif isinstance(elem, dict): |
|
6febc3f…
|
noreply
|
536 |
name = elem.get("name", elem.get("element", "")) |
|
6febc3f…
|
noreply
|
537 |
etype = elem.get("type", elem.get("element_type", "")) |
|
6febc3f…
|
noreply
|
538 |
if name and etype: |
|
6febc3f…
|
noreply
|
539 |
elements.append(f"{etype}: {name}") |
|
6febc3f…
|
noreply
|
540 |
elif name: |
|
6febc3f…
|
noreply
|
541 |
elements.append(name) |
|
6febc3f…
|
noreply
|
542 |
else: |
|
6febc3f…
|
noreply
|
543 |
elements.append(json.dumps(elem)) |
|
6febc3f…
|
noreply
|
544 |
elif isinstance(elem, list): |
|
6febc3f…
|
noreply
|
545 |
elements.extend(str(e) for e in elem) |
|
6febc3f…
|
noreply
|
546 |
else: |
|
6febc3f…
|
noreply
|
547 |
elements.append(str(elem)) |
|
6febc3f…
|
noreply
|
548 |
return elements |
|
6febc3f…
|
noreply
|
549 |
|
|
6febc3f…
|
noreply
|
550 |
|
|
6febc3f…
|
noreply
|
551 |
def _normalize_text_content(raw_text) -> Optional[str]: |
|
6febc3f…
|
noreply
|
552 |
"""Normalize text_content: llava may return dict instead of string.""" |
|
6febc3f…
|
noreply
|
553 |
if isinstance(raw_text, dict): |
|
6febc3f…
|
noreply
|
554 |
parts = [] |
|
6febc3f…
|
noreply
|
555 |
for k, v in raw_text.items(): |
|
6febc3f…
|
noreply
|
556 |
if isinstance(v, list): |
|
6febc3f…
|
noreply
|
557 |
parts.append(f"{k}: {', '.join(str(x) for x in v)}") |
|
6febc3f…
|
noreply
|
558 |
else: |
|
6febc3f…
|
noreply
|
559 |
parts.append(f"{k}: {v}") |
|
6febc3f…
|
noreply
|
560 |
return "\n".join(parts) |
|
6febc3f…
|
noreply
|
561 |
elif isinstance(raw_text, list): |
|
6febc3f…
|
noreply
|
562 |
return "\n".join(str(x) for x in raw_text) |
|
6febc3f…
|
noreply
|
563 |
return raw_text |