PlanOpticon

Merge pull request #109 from ConflictHQ/feat/screenshot-knowledge-extraction feat: screenshot knowledge extraction

noreply 2026-03-08 00:31 trunk merge
Commit 2a1b11a993e0b1409089c89718da3b09fc44b35c1fa1b22355ddb5103cb69f72
--- tests/test_diagram_analyzer.py
+++ tests/test_diagram_analyzer.py
@@ -146,23 +146,34 @@
146146
"relationships": ["Start -> End"],
147147
"mermaid": "graph LR\n Start-->End",
148148
"chart_data": None,
149149
}
150150
)
151
+
152
+ # Screenshot extraction response for medium-confidence frame
153
+ screenshot_response = json.dumps(
154
+ {
155
+ "content_type": "slide",
156
+ "caption": "A slide about something",
157
+ "text_content": "Key Points\n- Item 1\n- Item 2",
158
+ "entities": ["Item 1", "Item 2"],
159
+ "topics": ["presentation"],
160
+ }
161
+ )
151162
152163
# Calls are interleaved per-frame:
153164
# call 0: classify frame 0 (high conf)
154165
# call 1: analyze frame 0 (full analysis)
155166
# call 2: classify frame 1 (low conf - skip)
156167
# call 3: classify frame 2 (medium conf)
157
- # call 4: caption frame 2 (screengrab)
168
+ # call 4: screenshot extraction frame 2
158169
call_sequence = [
159170
classify_responses[0], # classify frame 0
160171
analysis_response, # analyze frame 0
161172
classify_responses[1], # classify frame 1
162173
classify_responses[2], # classify frame 2
163
- "A slide about something", # caption frame 2
174
+ screenshot_response, # screenshot extraction frame 2
164175
]
165176
call_count = [0]
166177
167178
def side_effect(image_bytes, prompt, max_tokens=4096):
168179
idx = call_count[0]
@@ -178,10 +189,14 @@
178189
assert diagrams[0].diagram_type == DiagramType.flowchart
179190
assert diagrams[0].mermaid == "graph LR\n Start-->End"
180191
181192
assert len(captures) == 1
182193
assert captures[0].frame_index == 2
194
+ assert captures[0].content_type == "slide"
195
+ assert captures[0].text_content == "Key Points\n- Item 1\n- Item 2"
196
+ assert "Item 1" in captures[0].entities
197
+ assert "presentation" in captures[0].topics
183198
184199
# Check files were saved
185200
assert (diagrams_dir / "diagram_0.jpg").exists()
186201
assert (diagrams_dir / "diagram_0.mermaid").exists()
187202
assert (diagrams_dir / "diagram_0.json").exists()
@@ -208,13 +223,44 @@
208223
"brief_description": "chart",
209224
}
210225
)
211226
if idx == 1:
212227
return "This is not valid JSON" # Analysis fails
213
- return "A chart showing data" # Caption
228
+ # Screenshot extraction for the fallback screengrab
229
+ return json.dumps(
230
+ {
231
+ "content_type": "chart",
232
+ "caption": "A chart showing data",
233
+ "text_content": "Sales Q1 Q2 Q3",
234
+ "entities": ["Sales"],
235
+ "topics": ["metrics"],
236
+ }
237
+ )
214238
215239
mock_pm.analyze_image.side_effect = side_effect
216240
217241
diagrams, captures = analyzer.process_frames([fp], captures_dir=captures_dir)
218242
assert len(diagrams) == 0
219243
assert len(captures) == 1
220244
assert captures[0].frame_index == 0
245
+ assert captures[0].content_type == "chart"
246
+ assert captures[0].text_content == "Sales Q1 Q2 Q3"
247
+
248
+ def test_extract_screenshot_knowledge(self, analyzer, mock_pm, fake_frame):
249
+ mock_pm.analyze_image.return_value = json.dumps(
250
+ {
251
+ "content_type": "code",
252
+ "caption": "Python source code",
253
+ "text_content": "def main():\n print('hello')",
254
+ "entities": ["Python", "main function"],
255
+ "topics": ["programming", "source code"],
256
+ }
257
+ )
258
+ result = analyzer.extract_screenshot_knowledge(fake_frame)
259
+ assert result["content_type"] == "code"
260
+ assert "Python" in result["entities"]
261
+ assert "def main" in result["text_content"]
262
+
263
+ def test_extract_screenshot_knowledge_failure(self, analyzer, mock_pm, fake_frame):
264
+ mock_pm.analyze_image.return_value = "not json"
265
+ result = analyzer.extract_screenshot_knowledge(fake_frame)
266
+ assert result == {}
221267
--- tests/test_diagram_analyzer.py
+++ tests/test_diagram_analyzer.py
@@ -146,23 +146,34 @@
146 "relationships": ["Start -> End"],
147 "mermaid": "graph LR\n Start-->End",
148 "chart_data": None,
149 }
150 )
 
 
 
 
 
 
 
 
 
 
 
151
152 # Calls are interleaved per-frame:
153 # call 0: classify frame 0 (high conf)
154 # call 1: analyze frame 0 (full analysis)
155 # call 2: classify frame 1 (low conf - skip)
156 # call 3: classify frame 2 (medium conf)
157 # call 4: caption frame 2 (screengrab)
158 call_sequence = [
159 classify_responses[0], # classify frame 0
160 analysis_response, # analyze frame 0
161 classify_responses[1], # classify frame 1
162 classify_responses[2], # classify frame 2
163 "A slide about something", # caption frame 2
164 ]
165 call_count = [0]
166
167 def side_effect(image_bytes, prompt, max_tokens=4096):
168 idx = call_count[0]
@@ -178,10 +189,14 @@
178 assert diagrams[0].diagram_type == DiagramType.flowchart
179 assert diagrams[0].mermaid == "graph LR\n Start-->End"
180
181 assert len(captures) == 1
182 assert captures[0].frame_index == 2
 
 
 
 
183
184 # Check files were saved
185 assert (diagrams_dir / "diagram_0.jpg").exists()
186 assert (diagrams_dir / "diagram_0.mermaid").exists()
187 assert (diagrams_dir / "diagram_0.json").exists()
@@ -208,13 +223,44 @@
208 "brief_description": "chart",
209 }
210 )
211 if idx == 1:
212 return "This is not valid JSON" # Analysis fails
213 return "A chart showing data" # Caption
 
 
 
 
 
 
 
 
 
214
215 mock_pm.analyze_image.side_effect = side_effect
216
217 diagrams, captures = analyzer.process_frames([fp], captures_dir=captures_dir)
218 assert len(diagrams) == 0
219 assert len(captures) == 1
220 assert captures[0].frame_index == 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
--- tests/test_diagram_analyzer.py
+++ tests/test_diagram_analyzer.py
@@ -146,23 +146,34 @@
146 "relationships": ["Start -> End"],
147 "mermaid": "graph LR\n Start-->End",
148 "chart_data": None,
149 }
150 )
151
152 # Screenshot extraction response for medium-confidence frame
153 screenshot_response = json.dumps(
154 {
155 "content_type": "slide",
156 "caption": "A slide about something",
157 "text_content": "Key Points\n- Item 1\n- Item 2",
158 "entities": ["Item 1", "Item 2"],
159 "topics": ["presentation"],
160 }
161 )
162
163 # Calls are interleaved per-frame:
164 # call 0: classify frame 0 (high conf)
165 # call 1: analyze frame 0 (full analysis)
166 # call 2: classify frame 1 (low conf - skip)
167 # call 3: classify frame 2 (medium conf)
168 # call 4: screenshot extraction frame 2
169 call_sequence = [
170 classify_responses[0], # classify frame 0
171 analysis_response, # analyze frame 0
172 classify_responses[1], # classify frame 1
173 classify_responses[2], # classify frame 2
174 screenshot_response, # screenshot extraction frame 2
175 ]
176 call_count = [0]
177
178 def side_effect(image_bytes, prompt, max_tokens=4096):
179 idx = call_count[0]
@@ -178,10 +189,14 @@
189 assert diagrams[0].diagram_type == DiagramType.flowchart
190 assert diagrams[0].mermaid == "graph LR\n Start-->End"
191
192 assert len(captures) == 1
193 assert captures[0].frame_index == 2
194 assert captures[0].content_type == "slide"
195 assert captures[0].text_content == "Key Points\n- Item 1\n- Item 2"
196 assert "Item 1" in captures[0].entities
197 assert "presentation" in captures[0].topics
198
199 # Check files were saved
200 assert (diagrams_dir / "diagram_0.jpg").exists()
201 assert (diagrams_dir / "diagram_0.mermaid").exists()
202 assert (diagrams_dir / "diagram_0.json").exists()
@@ -208,13 +223,44 @@
223 "brief_description": "chart",
224 }
225 )
226 if idx == 1:
227 return "This is not valid JSON" # Analysis fails
228 # Screenshot extraction for the fallback screengrab
229 return json.dumps(
230 {
231 "content_type": "chart",
232 "caption": "A chart showing data",
233 "text_content": "Sales Q1 Q2 Q3",
234 "entities": ["Sales"],
235 "topics": ["metrics"],
236 }
237 )
238
239 mock_pm.analyze_image.side_effect = side_effect
240
241 diagrams, captures = analyzer.process_frames([fp], captures_dir=captures_dir)
242 assert len(diagrams) == 0
243 assert len(captures) == 1
244 assert captures[0].frame_index == 0
245 assert captures[0].content_type == "chart"
246 assert captures[0].text_content == "Sales Q1 Q2 Q3"
247
248 def test_extract_screenshot_knowledge(self, analyzer, mock_pm, fake_frame):
249 mock_pm.analyze_image.return_value = json.dumps(
250 {
251 "content_type": "code",
252 "caption": "Python source code",
253 "text_content": "def main():\n print('hello')",
254 "entities": ["Python", "main function"],
255 "topics": ["programming", "source code"],
256 }
257 )
258 result = analyzer.extract_screenshot_knowledge(fake_frame)
259 assert result["content_type"] == "code"
260 assert "Python" in result["entities"]
261 assert "def main" in result["text_content"]
262
263 def test_extract_screenshot_knowledge_failure(self, analyzer, mock_pm, fake_frame):
264 mock_pm.analyze_image.return_value = "not json"
265 result = analyzer.extract_screenshot_knowledge(fake_frame)
266 assert result == {}
267
--- tests/test_knowledge_graph.py
+++ tests/test_knowledge_graph.py
@@ -133,10 +133,84 @@
133133
]
134134
kg_with_provider.process_diagrams(diagrams)
135135
assert kg_with_provider._store.has_entity("diagram_0")
136136
assert kg_with_provider._store.has_entity("diagram_1")
137137
138
+
139
+class TestProcessScreenshots:
140
+ @pytest.fixture
141
+ def mock_pm(self):
142
+ pm = MagicMock()
143
+ pm.chat.return_value = json.dumps(
144
+ [
145
+ {"name": "Python", "type": "technology", "description": "Language"},
146
+ {"name": "Flask", "type": "technology", "description": "Framework"},
147
+ ]
148
+ )
149
+ return pm
150
+
151
+ @pytest.fixture
152
+ def kg_with_provider(self, mock_pm):
153
+ return KnowledgeGraph(provider_manager=mock_pm)
154
+
155
+ def test_process_screenshots_with_text(self, kg_with_provider, mock_pm):
156
+ screenshots = [
157
+ {
158
+ "text_content": "import flask\napp = Flask(__name__)",
159
+ "content_type": "code",
160
+ "entities": ["Flask", "Python"],
161
+ "frame_index": 3,
162
+ },
163
+ ]
164
+ kg_with_provider.process_screenshots(screenshots)
165
+ # LLM extraction from text_content
166
+ mock_pm.chat.assert_called()
167
+ # Explicitly listed entities should be added
168
+ assert kg_with_provider._store.has_entity("Flask")
169
+ assert kg_with_provider._store.has_entity("Python")
170
+
171
+ def test_process_screenshots_without_text(self, kg_with_provider, mock_pm):
172
+ screenshots = [
173
+ {
174
+ "text_content": "",
175
+ "content_type": "other",
176
+ "entities": ["Docker"],
177
+ "frame_index": 5,
178
+ },
179
+ ]
180
+ kg_with_provider.process_screenshots(screenshots)
181
+ # No chat call for empty text
182
+ mock_pm.chat.assert_not_called()
183
+ # But explicit entities still added
184
+ assert kg_with_provider._store.has_entity("Docker")
185
+
186
+ def test_process_screenshots_empty_entities(self, kg_with_provider):
187
+ screenshots = [
188
+ {
189
+ "text_content": "",
190
+ "content_type": "slide",
191
+ "entities": [],
192
+ "frame_index": 0,
193
+ },
194
+ ]
195
+ kg_with_provider.process_screenshots(screenshots)
196
+ # No crash, no entities added
197
+
198
+ def test_process_screenshots_filters_short_names(self, kg_with_provider):
199
+ screenshots = [
200
+ {
201
+ "text_content": "",
202
+ "entities": ["A", "Go", "Python"],
203
+ "frame_index": 0,
204
+ },
205
+ ]
206
+ kg_with_provider.process_screenshots(screenshots)
207
+ # "A" is too short (< 2 chars), filtered out
208
+ assert not kg_with_provider._store.has_entity("A")
209
+ assert kg_with_provider._store.has_entity("Go")
210
+ assert kg_with_provider._store.has_entity("Python")
211
+
138212
139213
class TestToDictFromDict:
140214
def test_round_trip_empty(self):
141215
kg = KnowledgeGraph()
142216
data = kg.to_dict()
143217
--- tests/test_knowledge_graph.py
+++ tests/test_knowledge_graph.py
@@ -133,10 +133,84 @@
133 ]
134 kg_with_provider.process_diagrams(diagrams)
135 assert kg_with_provider._store.has_entity("diagram_0")
136 assert kg_with_provider._store.has_entity("diagram_1")
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
139 class TestToDictFromDict:
140 def test_round_trip_empty(self):
141 kg = KnowledgeGraph()
142 data = kg.to_dict()
143
--- tests/test_knowledge_graph.py
+++ tests/test_knowledge_graph.py
@@ -133,10 +133,84 @@
133 ]
134 kg_with_provider.process_diagrams(diagrams)
135 assert kg_with_provider._store.has_entity("diagram_0")
136 assert kg_with_provider._store.has_entity("diagram_1")
137
138
139 class TestProcessScreenshots:
140 @pytest.fixture
141 def mock_pm(self):
142 pm = MagicMock()
143 pm.chat.return_value = json.dumps(
144 [
145 {"name": "Python", "type": "technology", "description": "Language"},
146 {"name": "Flask", "type": "technology", "description": "Framework"},
147 ]
148 )
149 return pm
150
151 @pytest.fixture
152 def kg_with_provider(self, mock_pm):
153 return KnowledgeGraph(provider_manager=mock_pm)
154
155 def test_process_screenshots_with_text(self, kg_with_provider, mock_pm):
156 screenshots = [
157 {
158 "text_content": "import flask\napp = Flask(__name__)",
159 "content_type": "code",
160 "entities": ["Flask", "Python"],
161 "frame_index": 3,
162 },
163 ]
164 kg_with_provider.process_screenshots(screenshots)
165 # LLM extraction from text_content
166 mock_pm.chat.assert_called()
167 # Explicitly listed entities should be added
168 assert kg_with_provider._store.has_entity("Flask")
169 assert kg_with_provider._store.has_entity("Python")
170
171 def test_process_screenshots_without_text(self, kg_with_provider, mock_pm):
172 screenshots = [
173 {
174 "text_content": "",
175 "content_type": "other",
176 "entities": ["Docker"],
177 "frame_index": 5,
178 },
179 ]
180 kg_with_provider.process_screenshots(screenshots)
181 # No chat call for empty text
182 mock_pm.chat.assert_not_called()
183 # But explicit entities still added
184 assert kg_with_provider._store.has_entity("Docker")
185
186 def test_process_screenshots_empty_entities(self, kg_with_provider):
187 screenshots = [
188 {
189 "text_content": "",
190 "content_type": "slide",
191 "entities": [],
192 "frame_index": 0,
193 },
194 ]
195 kg_with_provider.process_screenshots(screenshots)
196 # No crash, no entities added
197
198 def test_process_screenshots_filters_short_names(self, kg_with_provider):
199 screenshots = [
200 {
201 "text_content": "",
202 "entities": ["A", "Go", "Python"],
203 "frame_index": 0,
204 },
205 ]
206 kg_with_provider.process_screenshots(screenshots)
207 # "A" is too short (< 2 chars), filtered out
208 assert not kg_with_provider._store.has_entity("A")
209 assert kg_with_provider._store.has_entity("Go")
210 assert kg_with_provider._store.has_entity("Python")
211
212
213 class TestToDictFromDict:
214 def test_round_trip_empty(self):
215 kg = KnowledgeGraph()
216 data = kg.to_dict()
217
--- tests/test_models.py
+++ tests/test_models.py
@@ -115,18 +115,40 @@
115115
116116
class TestScreenCapture:
117117
def test_basic(self):
118118
sc = ScreenCapture(frame_index=10, caption="Architecture overview slide", confidence=0.5)
119119
assert sc.image_path is None
120
+ assert sc.content_type is None
121
+ assert sc.text_content is None
122
+ assert sc.entities == []
123
+ assert sc.topics == []
124
+
125
+ def test_with_extraction(self):
126
+ sc = ScreenCapture(
127
+ frame_index=5,
128
+ caption="Code editor showing Python",
129
+ confidence=0.5,
130
+ content_type="code",
131
+ text_content="def main():\n print('hello')",
132
+ entities=["Python", "main function"],
133
+ topics=["programming"],
134
+ )
135
+ assert sc.content_type == "code"
136
+ assert "Python" in sc.entities
137
+ assert sc.text_content is not None
120138
121139
def test_round_trip(self):
122140
sc = ScreenCapture(
123141
frame_index=7,
124142
timestamp=30.0,
125143
caption="Timeline",
126144
image_path="captures/capture_0.jpg",
127145
confidence=0.45,
146
+ content_type="slide",
147
+ text_content="Q4 Roadmap",
148
+ entities=["Roadmap"],
149
+ topics=["planning"],
128150
)
129151
restored = ScreenCapture.model_validate_json(sc.model_dump_json())
130152
assert restored == sc
131153
132154
133155
--- tests/test_models.py
+++ tests/test_models.py
@@ -115,18 +115,40 @@
115
116 class TestScreenCapture:
117 def test_basic(self):
118 sc = ScreenCapture(frame_index=10, caption="Architecture overview slide", confidence=0.5)
119 assert sc.image_path is None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
121 def test_round_trip(self):
122 sc = ScreenCapture(
123 frame_index=7,
124 timestamp=30.0,
125 caption="Timeline",
126 image_path="captures/capture_0.jpg",
127 confidence=0.45,
 
 
 
 
128 )
129 restored = ScreenCapture.model_validate_json(sc.model_dump_json())
130 assert restored == sc
131
132
133
--- tests/test_models.py
+++ tests/test_models.py
@@ -115,18 +115,40 @@
115
116 class TestScreenCapture:
117 def test_basic(self):
118 sc = ScreenCapture(frame_index=10, caption="Architecture overview slide", confidence=0.5)
119 assert sc.image_path is None
120 assert sc.content_type is None
121 assert sc.text_content is None
122 assert sc.entities == []
123 assert sc.topics == []
124
125 def test_with_extraction(self):
126 sc = ScreenCapture(
127 frame_index=5,
128 caption="Code editor showing Python",
129 confidence=0.5,
130 content_type="code",
131 text_content="def main():\n print('hello')",
132 entities=["Python", "main function"],
133 topics=["programming"],
134 )
135 assert sc.content_type == "code"
136 assert "Python" in sc.entities
137 assert sc.text_content is not None
138
139 def test_round_trip(self):
140 sc = ScreenCapture(
141 frame_index=7,
142 timestamp=30.0,
143 caption="Timeline",
144 image_path="captures/capture_0.jpg",
145 confidence=0.45,
146 content_type="slide",
147 text_content="Q4 Roadmap",
148 entities=["Roadmap"],
149 topics=["planning"],
150 )
151 restored = ScreenCapture.model_validate_json(sc.model_dump_json())
152 assert restored == sc
153
154
155
--- video_processor/analyzers/diagram_analyzer.py
+++ video_processor/analyzers/diagram_analyzer.py
@@ -55,10 +55,31 @@
5555
"""
5656
5757
# Caption prompt for screengrab fallback
5858
_CAPTION_PROMPT = "Briefly describe what this image shows in 1-2 sentences."
5959
60
+# Rich screenshot extraction prompt — extracts knowledge from shared screens
61
+_SCREENSHOT_EXTRACT_PROMPT = """\
62
+Analyze this screenshot from a video recording. Extract all visible knowledge.
63
+This is shared screen content (slides, code, documents, browser, terminal, etc.).
64
+
65
+Return ONLY a JSON object (no markdown fences):
66
+{
67
+ "content_type": "slide"|"code"|"document"|"terminal"|"browser"|"chat"|"other",
68
+ "caption": "one-sentence description of what is shown",
69
+ "text_content": "all visible text, preserving structure and line breaks",
70
+ "entities": ["named things visible: people, technologies, tools, services, \
71
+projects, libraries, APIs, error codes, URLs, file paths"],
72
+ "topics": ["concepts or subjects this content is about"]
73
+}
74
+
75
+For text_content: extract ALL readable text — code, titles, bullet points, URLs,
76
+error messages, terminal output, chat messages, file names. Be thorough.
77
+For entities: extract specific named things, not generic words.
78
+For topics: extract 2-5 high-level topics this content relates to.
79
+"""
80
+
6081
6182
def _read_image_bytes(image_path: Union[str, Path]) -> bytes:
6283
"""Read image file as bytes."""
6384
return Path(image_path).read_bytes()
6485
@@ -129,10 +150,17 @@
129150
130151
def caption_frame(self, image_path: Union[str, Path]) -> str:
131152
"""Get a brief caption for a screengrab fallback."""
132153
image_bytes = _read_image_bytes(image_path)
133154
return self.pm.analyze_image(image_bytes, _CAPTION_PROMPT, max_tokens=256)
155
+
156
+ def extract_screenshot_knowledge(self, image_path: Union[str, Path]) -> dict:
157
+ """Extract knowledge from a screenshot — text, entities, topics."""
158
+ image_bytes = _read_image_bytes(image_path)
159
+ raw = self.pm.analyze_image(image_bytes, _SCREENSHOT_EXTRACT_PROMPT, max_tokens=2048)
160
+ result = _parse_json_response(raw)
161
+ return result or {}
134162
135163
def process_frames(
136164
self,
137165
frame_paths: List[Union[str, Path]],
138166
diagrams_dir: Optional[Path] = None,
@@ -312,21 +340,51 @@
312340
frame_index: int,
313341
capture_index: int,
314342
captures_dir: Optional[Path],
315343
confidence: float,
316344
) -> ScreenCapture:
317
- """Save a frame as a captioned screengrab."""
345
+ """Extract knowledge from a screenshot and save it."""
346
+ # Try rich extraction first, fall back to caption-only
318347
caption = ""
348
+ content_type = None
349
+ text_content = None
350
+ entities: List[str] = []
351
+ topics: List[str] = []
352
+
319353
try:
320
- caption = self.caption_frame(frame_path)
354
+ extraction = self.extract_screenshot_knowledge(frame_path)
355
+ if extraction:
356
+ caption = extraction.get("caption", "")
357
+ content_type = extraction.get("content_type")
358
+ text_content = extraction.get("text_content")
359
+ raw_entities = extraction.get("entities", [])
360
+ entities = [str(e) for e in raw_entities] if isinstance(raw_entities, list) else []
361
+ raw_topics = extraction.get("topics", [])
362
+ topics = [str(t) for t in raw_topics] if isinstance(raw_topics, list) else []
363
+ logger.info(
364
+ f"Frame {frame_index}: extracted "
365
+ f"{len(entities)} entities, "
366
+ f"{len(topics)} topics from {content_type}"
367
+ )
321368
except Exception as e:
322
- logger.warning(f"Caption failed for frame {frame_index}: {e}")
369
+ logger.warning(
370
+ f"Screenshot extraction failed for frame "
371
+ f"{frame_index}: {e}, falling back to caption"
372
+ )
373
+ try:
374
+ caption = self.caption_frame(frame_path)
375
+ except Exception as e2:
376
+ logger.warning(f"Caption also failed for frame {frame_index}: {e2}")
323377
324378
sc = ScreenCapture(
325379
frame_index=frame_index,
326380
caption=caption,
327381
confidence=confidence,
382
+ content_type=content_type,
383
+ text_content=text_content,
384
+ entities=entities,
385
+ topics=topics,
328386
)
329387
330388
if captures_dir:
331389
captures_dir.mkdir(parents=True, exist_ok=True)
332390
prefix = f"capture_{capture_index}"
333391
--- video_processor/analyzers/diagram_analyzer.py
+++ video_processor/analyzers/diagram_analyzer.py
@@ -55,10 +55,31 @@
55 """
56
57 # Caption prompt for screengrab fallback
58 _CAPTION_PROMPT = "Briefly describe what this image shows in 1-2 sentences."
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
61 def _read_image_bytes(image_path: Union[str, Path]) -> bytes:
62 """Read image file as bytes."""
63 return Path(image_path).read_bytes()
64
@@ -129,10 +150,17 @@
129
130 def caption_frame(self, image_path: Union[str, Path]) -> str:
131 """Get a brief caption for a screengrab fallback."""
132 image_bytes = _read_image_bytes(image_path)
133 return self.pm.analyze_image(image_bytes, _CAPTION_PROMPT, max_tokens=256)
 
 
 
 
 
 
 
134
135 def process_frames(
136 self,
137 frame_paths: List[Union[str, Path]],
138 diagrams_dir: Optional[Path] = None,
@@ -312,21 +340,51 @@
312 frame_index: int,
313 capture_index: int,
314 captures_dir: Optional[Path],
315 confidence: float,
316 ) -> ScreenCapture:
317 """Save a frame as a captioned screengrab."""
 
318 caption = ""
 
 
 
 
 
319 try:
320 caption = self.caption_frame(frame_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
321 except Exception as e:
322 logger.warning(f"Caption failed for frame {frame_index}: {e}")
 
 
 
 
 
 
 
323
324 sc = ScreenCapture(
325 frame_index=frame_index,
326 caption=caption,
327 confidence=confidence,
 
 
 
 
328 )
329
330 if captures_dir:
331 captures_dir.mkdir(parents=True, exist_ok=True)
332 prefix = f"capture_{capture_index}"
333
--- video_processor/analyzers/diagram_analyzer.py
+++ video_processor/analyzers/diagram_analyzer.py
@@ -55,10 +55,31 @@
55 """
56
57 # Caption prompt for screengrab fallback
58 _CAPTION_PROMPT = "Briefly describe what this image shows in 1-2 sentences."
59
60 # Rich screenshot extraction prompt — extracts knowledge from shared screens
61 _SCREENSHOT_EXTRACT_PROMPT = """\
62 Analyze this screenshot from a video recording. Extract all visible knowledge.
63 This is shared screen content (slides, code, documents, browser, terminal, etc.).
64
65 Return ONLY a JSON object (no markdown fences):
66 {
67 "content_type": "slide"|"code"|"document"|"terminal"|"browser"|"chat"|"other",
68 "caption": "one-sentence description of what is shown",
69 "text_content": "all visible text, preserving structure and line breaks",
70 "entities": ["named things visible: people, technologies, tools, services, \
71 projects, libraries, APIs, error codes, URLs, file paths"],
72 "topics": ["concepts or subjects this content is about"]
73 }
74
75 For text_content: extract ALL readable text — code, titles, bullet points, URLs,
76 error messages, terminal output, chat messages, file names. Be thorough.
77 For entities: extract specific named things, not generic words.
78 For topics: extract 2-5 high-level topics this content relates to.
79 """
80
81
82 def _read_image_bytes(image_path: Union[str, Path]) -> bytes:
83 """Read image file as bytes."""
84 return Path(image_path).read_bytes()
85
@@ -129,10 +150,17 @@
150
151 def caption_frame(self, image_path: Union[str, Path]) -> str:
152 """Get a brief caption for a screengrab fallback."""
153 image_bytes = _read_image_bytes(image_path)
154 return self.pm.analyze_image(image_bytes, _CAPTION_PROMPT, max_tokens=256)
155
156 def extract_screenshot_knowledge(self, image_path: Union[str, Path]) -> dict:
157 """Extract knowledge from a screenshot — text, entities, topics."""
158 image_bytes = _read_image_bytes(image_path)
159 raw = self.pm.analyze_image(image_bytes, _SCREENSHOT_EXTRACT_PROMPT, max_tokens=2048)
160 result = _parse_json_response(raw)
161 return result or {}
162
163 def process_frames(
164 self,
165 frame_paths: List[Union[str, Path]],
166 diagrams_dir: Optional[Path] = None,
@@ -312,21 +340,51 @@
340 frame_index: int,
341 capture_index: int,
342 captures_dir: Optional[Path],
343 confidence: float,
344 ) -> ScreenCapture:
345 """Extract knowledge from a screenshot and save it."""
346 # Try rich extraction first, fall back to caption-only
347 caption = ""
348 content_type = None
349 text_content = None
350 entities: List[str] = []
351 topics: List[str] = []
352
353 try:
354 extraction = self.extract_screenshot_knowledge(frame_path)
355 if extraction:
356 caption = extraction.get("caption", "")
357 content_type = extraction.get("content_type")
358 text_content = extraction.get("text_content")
359 raw_entities = extraction.get("entities", [])
360 entities = [str(e) for e in raw_entities] if isinstance(raw_entities, list) else []
361 raw_topics = extraction.get("topics", [])
362 topics = [str(t) for t in raw_topics] if isinstance(raw_topics, list) else []
363 logger.info(
364 f"Frame {frame_index}: extracted "
365 f"{len(entities)} entities, "
366 f"{len(topics)} topics from {content_type}"
367 )
368 except Exception as e:
369 logger.warning(
370 f"Screenshot extraction failed for frame "
371 f"{frame_index}: {e}, falling back to caption"
372 )
373 try:
374 caption = self.caption_frame(frame_path)
375 except Exception as e2:
376 logger.warning(f"Caption also failed for frame {frame_index}: {e2}")
377
378 sc = ScreenCapture(
379 frame_index=frame_index,
380 caption=caption,
381 confidence=confidence,
382 content_type=content_type,
383 text_content=text_content,
384 entities=entities,
385 topics=topics,
386 )
387
388 if captures_dir:
389 captures_dir.mkdir(parents=True, exist_ok=True)
390 prefix = f"capture_{capture_index}"
391
--- video_processor/integrators/knowledge_graph.py
+++ video_processor/integrators/knowledge_graph.py
@@ -196,10 +196,42 @@
196196
self._store.add_occurrence(
197197
diagram_id,
198198
source if text_content else diagram_id,
199199
text=f"frame_index={diagram.get('frame_index')}",
200200
)
201
+
202
+ def process_screenshots(self, screenshots: List[Dict]) -> None:
203
+ """Process screenshot captures into knowledge graph.
204
+
205
+ Extracts entities from text_content and adds screenshot-specific
206
+ entities from the entities list.
207
+ """
208
+ for i, capture in enumerate(screenshots):
209
+ text_content = capture.get("text_content", "")
210
+ source = f"screenshot_{i}"
211
+ content_type = capture.get("content_type", "screenshot")
212
+
213
+ # Extract entities from visible text via LLM
214
+ if text_content:
215
+ self.add_content(text_content, source)
216
+
217
+ # Add explicitly identified entities from vision extraction
218
+ for entity_name in capture.get("entities", []):
219
+ if not entity_name or len(entity_name) < 2:
220
+ continue
221
+ if not self._store.has_entity(entity_name):
222
+ self._store.merge_entity(
223
+ entity_name,
224
+ "concept",
225
+ [f"Identified in {content_type} screenshot"],
226
+ source=source,
227
+ )
228
+ self._store.add_occurrence(
229
+ entity_name,
230
+ source,
231
+ text=f"Visible in {content_type} (frame {capture.get('frame_index', '?')})",
232
+ )
201233
202234
def to_data(self) -> KnowledgeGraphData:
203235
"""Convert to pydantic KnowledgeGraphData model."""
204236
nodes = []
205237
for entity in self._store.get_all_entities():
206238
--- video_processor/integrators/knowledge_graph.py
+++ video_processor/integrators/knowledge_graph.py
@@ -196,10 +196,42 @@
196 self._store.add_occurrence(
197 diagram_id,
198 source if text_content else diagram_id,
199 text=f"frame_index={diagram.get('frame_index')}",
200 )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
202 def to_data(self) -> KnowledgeGraphData:
203 """Convert to pydantic KnowledgeGraphData model."""
204 nodes = []
205 for entity in self._store.get_all_entities():
206
--- video_processor/integrators/knowledge_graph.py
+++ video_processor/integrators/knowledge_graph.py
@@ -196,10 +196,42 @@
196 self._store.add_occurrence(
197 diagram_id,
198 source if text_content else diagram_id,
199 text=f"frame_index={diagram.get('frame_index')}",
200 )
201
202 def process_screenshots(self, screenshots: List[Dict]) -> None:
203 """Process screenshot captures into knowledge graph.
204
205 Extracts entities from text_content and adds screenshot-specific
206 entities from the entities list.
207 """
208 for i, capture in enumerate(screenshots):
209 text_content = capture.get("text_content", "")
210 source = f"screenshot_{i}"
211 content_type = capture.get("content_type", "screenshot")
212
213 # Extract entities from visible text via LLM
214 if text_content:
215 self.add_content(text_content, source)
216
217 # Add explicitly identified entities from vision extraction
218 for entity_name in capture.get("entities", []):
219 if not entity_name or len(entity_name) < 2:
220 continue
221 if not self._store.has_entity(entity_name):
222 self._store.merge_entity(
223 entity_name,
224 "concept",
225 [f"Identified in {content_type} screenshot"],
226 source=source,
227 )
228 self._store.add_occurrence(
229 entity_name,
230 source,
231 text=f"Visible in {content_type} (frame {capture.get('frame_index', '?')})",
232 )
233
234 def to_data(self) -> KnowledgeGraphData:
235 """Convert to pydantic KnowledgeGraphData model."""
236 nodes = []
237 for entity in self._store.get_all_entities():
238
--- video_processor/models.py
+++ video_processor/models.py
@@ -97,19 +97,32 @@
9797
png_path: Optional[str] = Field(default=None, description="Relative path to rendered PNG")
9898
mermaid_path: Optional[str] = Field(default=None, description="Relative path to mermaid source")
9999
100100
101101
class ScreenCapture(BaseModel):
102
- """A screengrab fallback when diagram extraction fails or is uncertain."""
102
+ """A screen capture with knowledge extraction from shared content."""
103103
104104
frame_index: int = Field(description="Index of the source frame")
105105
timestamp: Optional[float] = Field(default=None, description="Timestamp in video (seconds)")
106106
caption: Optional[str] = Field(default=None, description="Brief description of the content")
107107
image_path: Optional[str] = Field(default=None, description="Relative path to screenshot")
108108
confidence: float = Field(
109109
default=0.0, description="Detection confidence that triggered fallback"
110110
)
111
+ content_type: Optional[str] = Field(
112
+ default=None,
113
+ description="Content type: slide, code, document, terminal, browser, chat, other",
114
+ )
115
+ text_content: Optional[str] = Field(
116
+ default=None, description="All visible text extracted from the screenshot"
117
+ )
118
+ entities: List[str] = Field(
119
+ default_factory=list, description="Entities identified in the screenshot"
120
+ )
121
+ topics: List[str] = Field(
122
+ default_factory=list, description="Topics or concepts visible in the screenshot"
123
+ )
111124
112125
113126
class SourceRecord(BaseModel):
114127
"""A content source registered in the knowledge graph for provenance tracking."""
115128
116129
--- video_processor/models.py
+++ video_processor/models.py
@@ -97,19 +97,32 @@
97 png_path: Optional[str] = Field(default=None, description="Relative path to rendered PNG")
98 mermaid_path: Optional[str] = Field(default=None, description="Relative path to mermaid source")
99
100
101 class ScreenCapture(BaseModel):
102 """A screengrab fallback when diagram extraction fails or is uncertain."""
103
104 frame_index: int = Field(description="Index of the source frame")
105 timestamp: Optional[float] = Field(default=None, description="Timestamp in video (seconds)")
106 caption: Optional[str] = Field(default=None, description="Brief description of the content")
107 image_path: Optional[str] = Field(default=None, description="Relative path to screenshot")
108 confidence: float = Field(
109 default=0.0, description="Detection confidence that triggered fallback"
110 )
 
 
 
 
 
 
 
 
 
 
 
 
 
111
112
113 class SourceRecord(BaseModel):
114 """A content source registered in the knowledge graph for provenance tracking."""
115
116
--- video_processor/models.py
+++ video_processor/models.py
@@ -97,19 +97,32 @@
97 png_path: Optional[str] = Field(default=None, description="Relative path to rendered PNG")
98 mermaid_path: Optional[str] = Field(default=None, description="Relative path to mermaid source")
99
100
class ScreenCapture(BaseModel):
    """A screen capture with knowledge extraction from shared content.

    Produced for frames whose diagram classification confidence was too low
    for full diagram analysis; carries the vision model's screenshot-level
    extraction (content type, visible text, entities, topics) so the
    knowledge graph can still ingest the frame.
    """

    frame_index: int = Field(description="Index of the source frame")
    timestamp: Optional[float] = Field(default=None, description="Timestamp in video (seconds)")
    caption: Optional[str] = Field(default=None, description="Brief description of the content")
    image_path: Optional[str] = Field(default=None, description="Relative path to screenshot")
    # Confidence from the classify pass that routed this frame to the
    # screenshot path instead of full diagram analysis.
    confidence: float = Field(
        default=0.0, description="Detection confidence that triggered fallback"
    )
    content_type: Optional[str] = Field(
        default=None,
        description="Content type: slide, code, document, terminal, browser, chat, other",
    )
    text_content: Optional[str] = Field(
        default=None, description="All visible text extracted from the screenshot"
    )
    entities: List[str] = Field(
        default_factory=list, description="Entities identified in the screenshot"
    )
    topics: List[str] = Field(
        default_factory=list, description="Topics or concepts visible in the screenshot"
    )
124
125
126 class SourceRecord(BaseModel):
127 """A content source registered in the knowledge graph for provenance tracking."""
128
129
--- video_processor/pipeline.py
+++ video_processor/pipeline.py
@@ -243,10 +243,13 @@
243243
)
244244
kg.process_transcript(transcript_data)
245245
if diagrams:
246246
diagram_dicts = [d.model_dump() for d in diagrams]
247247
kg.process_diagrams(diagram_dicts)
248
+ if screen_captures:
249
+ capture_dicts = [sc.model_dump() for sc in screen_captures]
250
+ kg.process_screenshots(capture_dicts)
248251
# Export JSON copy alongside the SQLite db
249252
kg.save(kg_json_path)
250253
pipeline_bar.update(1)
251254
_notify(progress_callback, "on_step_complete", steps[4], 5, total_steps)
252255
253256
--- video_processor/pipeline.py
+++ video_processor/pipeline.py
@@ -243,10 +243,13 @@
243 )
244 kg.process_transcript(transcript_data)
245 if diagrams:
246 diagram_dicts = [d.model_dump() for d in diagrams]
247 kg.process_diagrams(diagram_dicts)
 
 
 
248 # Export JSON copy alongside the SQLite db
249 kg.save(kg_json_path)
250 pipeline_bar.update(1)
251 _notify(progress_callback, "on_step_complete", steps[4], 5, total_steps)
252
253
--- video_processor/pipeline.py
+++ video_processor/pipeline.py
@@ -243,10 +243,13 @@
243 )
244 kg.process_transcript(transcript_data)
245 if diagrams:
246 diagram_dicts = [d.model_dump() for d in diagrams]
247 kg.process_diagrams(diagram_dicts)
248 if screen_captures:
249 capture_dicts = [sc.model_dump() for sc in screen_captures]
250 kg.process_screenshots(capture_dicts)
251 # Export JSON copy alongside the SQLite db
252 kg.save(kg_json_path)
253 pipeline_bar.update(1)
254 _notify(progress_callback, "on_step_complete", steps[4], 5, total_steps)
255
256

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button