PlanOpticon

Merge pull request #109 from ConflictHQ/feat/screenshot-knowledge-extraction feat: screenshot knowledge extraction

noreply 2026-03-08 00:31 trunk merge
Commit 2a1b11a993e0b1409089c89718da3b09fc44b35c1fa1b22355ddb5103cb69f72
--- tests/test_diagram_analyzer.py
+++ tests/test_diagram_analyzer.py
@@ -146,23 +146,34 @@
146146
"relationships": ["Start -> End"],
147147
"mermaid": "graph LR\n Start-->End",
148148
"chart_data": None,
149149
}
150150
)
151
+
152
+ # Screenshot extraction response for medium-confidence frame
153
+ screenshot_response = json.dumps(
154
+ {
155
+ "content_type": "slide",
156
+ "caption": "A slide about something",
157
+ "text_content": "Key Points\n- Item 1\n- Item 2",
158
+ "entities": ["Item 1", "Item 2"],
159
+ "topics": ["presentation"],
160
+ }
161
+ )
151162
152163
# Calls are interleaved per-frame:
153164
# call 0: classify frame 0 (high conf)
154165
# call 1: analyze frame 0 (full analysis)
155166
# call 2: classify frame 1 (low conf - skip)
156167
# call 3: classify frame 2 (medium conf)
157
- # call 4: caption frame 2 (screengrab)
168
+ # call 4: screenshot extraction frame 2
158169
call_sequence = [
159170
classify_responses[0], # classify frame 0
160171
analysis_response, # analyze frame 0
161172
classify_responses[1], # classify frame 1
162173
classify_responses[2], # classify frame 2
163
- "A slide about something", # caption frame 2
174
+ screenshot_response, # screenshot extraction frame 2
164175
]
165176
call_count = [0]
166177
167178
def side_effect(image_bytes, prompt, max_tokens=4096):
168179
idx = call_count[0]
@@ -178,10 +189,14 @@
178189
assert diagrams[0].diagram_type == DiagramType.flowchart
179190
assert diagrams[0].mermaid == "graph LR\n Start-->End"
180191
181192
assert len(captures) == 1
182193
assert captures[0].frame_index == 2
194
+ assert captures[0].content_type == "slide"
195
+ assert captures[0].text_content == "Key Points\n- Item 1\n- Item 2"
196
+ assert "Item 1" in captures[0].entities
197
+ assert "presentation" in captures[0].topics
183198
184199
# Check files were saved
185200
assert (diagrams_dir / "diagram_0.jpg").exists()
186201
assert (diagrams_dir / "diagram_0.mermaid").exists()
187202
assert (diagrams_dir / "diagram_0.json").exists()
@@ -208,13 +223,44 @@
208223
"brief_description": "chart",
209224
}
210225
)
211226
if idx == 1:
212227
return "This is not valid JSON" # Analysis fails
213
- return "A chart showing data" # Caption
228
+ # Screenshot extraction for the fallback screengrab
229
+ return json.dumps(
230
+ {
231
+ "content_type": "chart",
232
+ "caption": "A chart showing data",
233
+ "text_content": "Sales Q1 Q2 Q3",
234
+ "entities": ["Sales"],
235
+ "topics": ["metrics"],
236
+ }
237
+ )
214238
215239
mock_pm.analyze_image.side_effect = side_effect
216240
217241
diagrams, captures = analyzer.process_frames([fp], captures_dir=captures_dir)
218242
assert len(diagrams) == 0
219243
assert len(captures) == 1
220244
assert captures[0].frame_index == 0
245
+ assert captures[0].content_type == "chart"
246
+ assert captures[0].text_content == "Sales Q1 Q2 Q3"
247
+
248
+ def test_extract_screenshot_knowledge(self, analyzer, mock_pm, fake_frame):
249
+ mock_pm.analyze_image.return_value = json.dumps(
250
+ {
251
+ "content_type": "code",
252
+ "caption": "Python source code",
253
+ "text_content": "def main():\n print('hello')",
254
+ "entities": ["Python", "main function"],
255
+ "topics": ["programming", "source code"],
256
+ }
257
+ )
258
+ result = analyzer.extract_screenshot_knowledge(fake_frame)
259
+ assert result["content_type"] == "code"
260
+ assert "Python" in result["entities"]
261
+ assert "def main" in result["text_content"]
262
+
263
+ def test_extract_screenshot_knowledge_failure(self, analyzer, mock_pm, fake_frame):
264
+ mock_pm.analyze_image.return_value = "not json"
265
+ result = analyzer.extract_screenshot_knowledge(fake_frame)
266
+ assert result == {}
221267
--- tests/test_diagram_analyzer.py
+++ tests/test_diagram_analyzer.py
@@ -146,23 +146,34 @@
146 "relationships": ["Start -> End"],
147 "mermaid": "graph LR\n Start-->End",
148 "chart_data": None,
149 }
150 )
 
 
 
 
 
 
 
 
 
 
 
151
152 # Calls are interleaved per-frame:
153 # call 0: classify frame 0 (high conf)
154 # call 1: analyze frame 0 (full analysis)
155 # call 2: classify frame 1 (low conf - skip)
156 # call 3: classify frame 2 (medium conf)
157 # call 4: caption frame 2 (screengrab)
158 call_sequence = [
159 classify_responses[0], # classify frame 0
160 analysis_response, # analyze frame 0
161 classify_responses[1], # classify frame 1
162 classify_responses[2], # classify frame 2
163 "A slide about something", # caption frame 2
164 ]
165 call_count = [0]
166
167 def side_effect(image_bytes, prompt, max_tokens=4096):
168 idx = call_count[0]
@@ -178,10 +189,14 @@
178 assert diagrams[0].diagram_type == DiagramType.flowchart
179 assert diagrams[0].mermaid == "graph LR\n Start-->End"
180
181 assert len(captures) == 1
182 assert captures[0].frame_index == 2
 
 
 
 
183
184 # Check files were saved
185 assert (diagrams_dir / "diagram_0.jpg").exists()
186 assert (diagrams_dir / "diagram_0.mermaid").exists()
187 assert (diagrams_dir / "diagram_0.json").exists()
@@ -208,13 +223,44 @@
208 "brief_description": "chart",
209 }
210 )
211 if idx == 1:
212 return "This is not valid JSON" # Analysis fails
213 return "A chart showing data" # Caption
 
 
 
 
 
 
 
 
 
214
215 mock_pm.analyze_image.side_effect = side_effect
216
217 diagrams, captures = analyzer.process_frames([fp], captures_dir=captures_dir)
218 assert len(diagrams) == 0
219 assert len(captures) == 1
220 assert captures[0].frame_index == 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
--- tests/test_diagram_analyzer.py
+++ tests/test_diagram_analyzer.py
@@ -146,23 +146,34 @@
146 "relationships": ["Start -> End"],
147 "mermaid": "graph LR\n Start-->End",
148 "chart_data": None,
149 }
150 )
151
152 # Screenshot extraction response for medium-confidence frame
153 screenshot_response = json.dumps(
154 {
155 "content_type": "slide",
156 "caption": "A slide about something",
157 "text_content": "Key Points\n- Item 1\n- Item 2",
158 "entities": ["Item 1", "Item 2"],
159 "topics": ["presentation"],
160 }
161 )
162
163 # Calls are interleaved per-frame:
164 # call 0: classify frame 0 (high conf)
165 # call 1: analyze frame 0 (full analysis)
166 # call 2: classify frame 1 (low conf - skip)
167 # call 3: classify frame 2 (medium conf)
168 # call 4: screenshot extraction frame 2
169 call_sequence = [
170 classify_responses[0], # classify frame 0
171 analysis_response, # analyze frame 0
172 classify_responses[1], # classify frame 1
173 classify_responses[2], # classify frame 2
174 screenshot_response, # screenshot extraction frame 2
175 ]
176 call_count = [0]
177
178 def side_effect(image_bytes, prompt, max_tokens=4096):
179 idx = call_count[0]
@@ -178,10 +189,14 @@
189 assert diagrams[0].diagram_type == DiagramType.flowchart
190 assert diagrams[0].mermaid == "graph LR\n Start-->End"
191
192 assert len(captures) == 1
193 assert captures[0].frame_index == 2
194 assert captures[0].content_type == "slide"
195 assert captures[0].text_content == "Key Points\n- Item 1\n- Item 2"
196 assert "Item 1" in captures[0].entities
197 assert "presentation" in captures[0].topics
198
199 # Check files were saved
200 assert (diagrams_dir / "diagram_0.jpg").exists()
201 assert (diagrams_dir / "diagram_0.mermaid").exists()
202 assert (diagrams_dir / "diagram_0.json").exists()
@@ -208,13 +223,44 @@
223 "brief_description": "chart",
224 }
225 )
226 if idx == 1:
227 return "This is not valid JSON" # Analysis fails
228 # Screenshot extraction for the fallback screengrab
229 return json.dumps(
230 {
231 "content_type": "chart",
232 "caption": "A chart showing data",
233 "text_content": "Sales Q1 Q2 Q3",
234 "entities": ["Sales"],
235 "topics": ["metrics"],
236 }
237 )
238
239 mock_pm.analyze_image.side_effect = side_effect
240
241 diagrams, captures = analyzer.process_frames([fp], captures_dir=captures_dir)
242 assert len(diagrams) == 0
243 assert len(captures) == 1
244 assert captures[0].frame_index == 0
245 assert captures[0].content_type == "chart"
246 assert captures[0].text_content == "Sales Q1 Q2 Q3"
247
248 def test_extract_screenshot_knowledge(self, analyzer, mock_pm, fake_frame):
249 mock_pm.analyze_image.return_value = json.dumps(
250 {
251 "content_type": "code",
252 "caption": "Python source code",
253 "text_content": "def main():\n print('hello')",
254 "entities": ["Python", "main function"],
255 "topics": ["programming", "source code"],
256 }
257 )
258 result = analyzer.extract_screenshot_knowledge(fake_frame)
259 assert result["content_type"] == "code"
260 assert "Python" in result["entities"]
261 assert "def main" in result["text_content"]
262
263 def test_extract_screenshot_knowledge_failure(self, analyzer, mock_pm, fake_frame):
264 mock_pm.analyze_image.return_value = "not json"
265 result = analyzer.extract_screenshot_knowledge(fake_frame)
266 assert result == {}
267
--- tests/test_knowledge_graph.py
+++ tests/test_knowledge_graph.py
@@ -133,10 +133,84 @@
133133
]
134134
kg_with_provider.process_diagrams(diagrams)
135135
assert kg_with_provider._store.has_entity("diagram_0")
136136
assert kg_with_provider._store.has_entity("diagram_1")
137137
138
+
139
+class TestProcessScreenshots:
140
+ @pytest.fixture
141
+ def mock_pm(self):
142
+ pm = MagicMock()
143
+ pm.chat.return_value = json.dumps(
144
+ [
145
+ {"name": "Python", "type": "technology", "description": "Language"},
146
+ {"name": "Flask", "type": "technology", "description": "Framework"},
147
+ ]
148
+ )
149
+ return pm
150
+
151
+ @pytest.fixture
152
+ def kg_with_provider(self, mock_pm):
153
+ return KnowledgeGraph(provider_manager=mock_pm)
154
+
155
+ def test_process_screenshots_with_text(self, kg_with_provider, mock_pm):
156
+ screenshots = [
157
+ {
158
+ "text_content": "import flask\napp = Flask(__name__)",
159
+ "content_type": "code",
160
+ "entities": ["Flask", "Python"],
161
+ "frame_index": 3,
162
+ },
163
+ ]
164
+ kg_with_provider.process_screenshots(screenshots)
165
+ # LLM extraction from text_content
166
+ mock_pm.chat.assert_called()
167
+ # Explicitly listed entities should be added
168
+ assert kg_with_provider._store.has_entity("Flask")
169
+ assert kg_with_provider._store.has_entity("Python")
170
+
171
+ def test_process_screenshots_without_text(self, kg_with_provider, mock_pm):
172
+ screenshots = [
173
+ {
174
+ "text_content": "",
175
+ "content_type": "other",
176
+ "entities": ["Docker"],
177
+ "frame_index": 5,
178
+ },
179
+ ]
180
+ kg_with_provider.process_screenshots(screenshots)
181
+ # No chat call for empty text
182
+ mock_pm.chat.assert_not_called()
183
+ # But explicit entities still added
184
+ assert kg_with_provider._store.has_entity("Docker")
185
+
186
+ def test_process_screenshots_empty_entities(self, kg_with_provider):
187
+ screenshots = [
188
+ {
189
+ "text_content": "",
190
+ "content_type": "slide",
191
+ "entities": [],
192
+ "frame_index": 0,
193
+ },
194
+ ]
195
+ kg_with_provider.process_screenshots(screenshots)
196
+ # No crash, no entities added
197
+
198
+ def test_process_screenshots_filters_short_names(self, kg_with_provider):
199
+ screenshots = [
200
+ {
201
+ "text_content": "",
202
+ "entities": ["A", "Go", "Python"],
203
+ "frame_index": 0,
204
+ },
205
+ ]
206
+ kg_with_provider.process_screenshots(screenshots)
207
+ # "A" is too short (< 2 chars), filtered out
208
+ assert not kg_with_provider._store.has_entity("A")
209
+ assert kg_with_provider._store.has_entity("Go")
210
+ assert kg_with_provider._store.has_entity("Python")
211
+
138212
139213
class TestToDictFromDict:
140214
def test_round_trip_empty(self):
141215
kg = KnowledgeGraph()
142216
data = kg.to_dict()
143217
--- tests/test_knowledge_graph.py
+++ tests/test_knowledge_graph.py
@@ -133,10 +133,84 @@
133 ]
134 kg_with_provider.process_diagrams(diagrams)
135 assert kg_with_provider._store.has_entity("diagram_0")
136 assert kg_with_provider._store.has_entity("diagram_1")
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
139 class TestToDictFromDict:
140 def test_round_trip_empty(self):
141 kg = KnowledgeGraph()
142 data = kg.to_dict()
143
--- tests/test_knowledge_graph.py
+++ tests/test_knowledge_graph.py
@@ -133,10 +133,84 @@
133 ]
134 kg_with_provider.process_diagrams(diagrams)
135 assert kg_with_provider._store.has_entity("diagram_0")
136 assert kg_with_provider._store.has_entity("diagram_1")
137
138
139 class TestProcessScreenshots:
140 @pytest.fixture
141 def mock_pm(self):
142 pm = MagicMock()
143 pm.chat.return_value = json.dumps(
144 [
145 {"name": "Python", "type": "technology", "description": "Language"},
146 {"name": "Flask", "type": "technology", "description": "Framework"},
147 ]
148 )
149 return pm
150
151 @pytest.fixture
152 def kg_with_provider(self, mock_pm):
153 return KnowledgeGraph(provider_manager=mock_pm)
154
155 def test_process_screenshots_with_text(self, kg_with_provider, mock_pm):
156 screenshots = [
157 {
158 "text_content": "import flask\napp = Flask(__name__)",
159 "content_type": "code",
160 "entities": ["Flask", "Python"],
161 "frame_index": 3,
162 },
163 ]
164 kg_with_provider.process_screenshots(screenshots)
165 # LLM extraction from text_content
166 mock_pm.chat.assert_called()
167 # Explicitly listed entities should be added
168 assert kg_with_provider._store.has_entity("Flask")
169 assert kg_with_provider._store.has_entity("Python")
170
171 def test_process_screenshots_without_text(self, kg_with_provider, mock_pm):
172 screenshots = [
173 {
174 "text_content": "",
175 "content_type": "other",
176 "entities": ["Docker"],
177 "frame_index": 5,
178 },
179 ]
180 kg_with_provider.process_screenshots(screenshots)
181 # No chat call for empty text
182 mock_pm.chat.assert_not_called()
183 # But explicit entities still added
184 assert kg_with_provider._store.has_entity("Docker")
185
186 def test_process_screenshots_empty_entities(self, kg_with_provider):
187 screenshots = [
188 {
189 "text_content": "",
190 "content_type": "slide",
191 "entities": [],
192 "frame_index": 0,
193 },
194 ]
195 kg_with_provider.process_screenshots(screenshots)
196 # No crash, no entities added
197
198 def test_process_screenshots_filters_short_names(self, kg_with_provider):
199 screenshots = [
200 {
201 "text_content": "",
202 "entities": ["A", "Go", "Python"],
203 "frame_index": 0,
204 },
205 ]
206 kg_with_provider.process_screenshots(screenshots)
207 # "A" is too short (< 2 chars), filtered out
208 assert not kg_with_provider._store.has_entity("A")
209 assert kg_with_provider._store.has_entity("Go")
210 assert kg_with_provider._store.has_entity("Python")
211
212
213 class TestToDictFromDict:
214 def test_round_trip_empty(self):
215 kg = KnowledgeGraph()
216 data = kg.to_dict()
217
--- tests/test_models.py
+++ tests/test_models.py
@@ -115,18 +115,40 @@
115115
116116
class TestScreenCapture:
117117
def test_basic(self):
118118
sc = ScreenCapture(frame_index=10, caption="Architecture overview slide", confidence=0.5)
119119
assert sc.image_path is None
120
+ assert sc.content_type is None
121
+ assert sc.text_content is None
122
+ assert sc.entities == []
123
+ assert sc.topics == []
124
+
125
+ def test_with_extraction(self):
126
+ sc = ScreenCapture(
127
+ frame_index=5,
128
+ caption="Code editor showing Python",
129
+ confidence=0.5,
130
+ content_type="code",
131
+ text_content="def main():\n print('hello')",
132
+ entities=["Python", "main function"],
133
+ topics=["programming"],
134
+ )
135
+ assert sc.content_type == "code"
136
+ assert "Python" in sc.entities
137
+ assert sc.text_content is not None
120138
121139
def test_round_trip(self):
122140
sc = ScreenCapture(
123141
frame_index=7,
124142
timestamp=30.0,
125143
caption="Timeline",
126144
image_path="captures/capture_0.jpg",
127145
confidence=0.45,
146
+ content_type="slide",
147
+ text_content="Q4 Roadmap",
148
+ entities=["Roadmap"],
149
+ topics=["planning"],
128150
)
129151
restored = ScreenCapture.model_validate_json(sc.model_dump_json())
130152
assert restored == sc
131153
132154
133155
--- tests/test_models.py
+++ tests/test_models.py
@@ -115,18 +115,40 @@
115
116 class TestScreenCapture:
117 def test_basic(self):
118 sc = ScreenCapture(frame_index=10, caption="Architecture overview slide", confidence=0.5)
119 assert sc.image_path is None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
121 def test_round_trip(self):
122 sc = ScreenCapture(
123 frame_index=7,
124 timestamp=30.0,
125 caption="Timeline",
126 image_path="captures/capture_0.jpg",
127 confidence=0.45,
 
 
 
 
128 )
129 restored = ScreenCapture.model_validate_json(sc.model_dump_json())
130 assert restored == sc
131
132
133
--- tests/test_models.py
+++ tests/test_models.py
@@ -115,18 +115,40 @@
115
116 class TestScreenCapture:
117 def test_basic(self):
118 sc = ScreenCapture(frame_index=10, caption="Architecture overview slide", confidence=0.5)
119 assert sc.image_path is None
120 assert sc.content_type is None
121 assert sc.text_content is None
122 assert sc.entities == []
123 assert sc.topics == []
124
125 def test_with_extraction(self):
126 sc = ScreenCapture(
127 frame_index=5,
128 caption="Code editor showing Python",
129 confidence=0.5,
130 content_type="code",
131 text_content="def main():\n print('hello')",
132 entities=["Python", "main function"],
133 topics=["programming"],
134 )
135 assert sc.content_type == "code"
136 assert "Python" in sc.entities
137 assert sc.text_content is not None
138
139 def test_round_trip(self):
140 sc = ScreenCapture(
141 frame_index=7,
142 timestamp=30.0,
143 caption="Timeline",
144 image_path="captures/capture_0.jpg",
145 confidence=0.45,
146 content_type="slide",
147 text_content="Q4 Roadmap",
148 entities=["Roadmap"],
149 topics=["planning"],
150 )
151 restored = ScreenCapture.model_validate_json(sc.model_dump_json())
152 assert restored == sc
153
154
155
--- video_processor/analyzers/diagram_analyzer.py
+++ video_processor/analyzers/diagram_analyzer.py
@@ -55,10 +55,31 @@
5555
"""
5656
5757
# Caption prompt for screengrab fallback
5858
_CAPTION_PROMPT = "Briefly describe what this image shows in 1-2 sentences."
5959
60
+# Rich screenshot extraction prompt — extracts knowledge from shared screens
61
+_SCREENSHOT_EXTRACT_PROMPT = """\
62
+Analyze this screenshot from a video recording. Extract all visible knowledge.
63
+This is shared screen content (slides, code, documents, browser, terminal, etc.).
64
+
65
+Return ONLY a JSON object (no markdown fences):
66
+{
67
+ "content_type": "slide"|"code"|"document"|"terminal"|"browser"|"chat"|"other",
68
+ "caption": "one-sentence description of what is shown",
69
+ "text_content": "all visible text, preserving structure and line breaks",
70
+ "entities": ["named things visible: people, technologies, tools, services, \
71
+projects, libraries, APIs, error codes, URLs, file paths"],
72
+ "topics": ["concepts or subjects this content is about"]
73
+}
74
+
75
+For text_content: extract ALL readable text — code, titles, bullet points, URLs,
76
+error messages, terminal output, chat messages, file names. Be thorough.
77
+For entities: extract specific named things, not generic words.
78
+For topics: extract 2-5 high-level topics this content relates to.
79
+"""
80
+
6081
6182
def _read_image_bytes(image_path: Union[str, Path]) -> bytes:
6283
"""Read image file as bytes."""
6384
return Path(image_path).read_bytes()
6485
@@ -129,10 +150,17 @@
129150
130151
def caption_frame(self, image_path: Union[str, Path]) -> str:
131152
"""Get a brief caption for a screengrab fallback."""
132153
image_bytes = _read_image_bytes(image_path)
133154
return self.pm.analyze_image(image_bytes, _CAPTION_PROMPT, max_tokens=256)
155
+
156
+ def extract_screenshot_knowledge(self, image_path: Union[str, Path]) -> dict:
157
+ """Extract knowledge from a screenshot — text, entities, topics."""
158
+ image_bytes = _read_image_bytes(image_path)
159
+ raw = self.pm.analyze_image(image_bytes, _SCREENSHOT_EXTRACT_PROMPT, max_tokens=2048)
160
+ result = _parse_json_response(raw)
161
+ return result or {}
134162
135163
def process_frames(
136164
self,
137165
frame_paths: List[Union[str, Path]],
138166
diagrams_dir: Optional[Path] = None,
@@ -312,21 +340,51 @@
312340
frame_index: int,
313341
capture_index: int,
314342
captures_dir: Optional[Path],
315343
confidence: float,
316344
) -> ScreenCapture:
317
- """Save a frame as a captioned screengrab."""
345
+ """Extract knowledge from a screenshot and save it."""
346
+ # Try rich extraction first, fall back to caption-only
318347
caption = ""
348
+ content_type = None
349
+ text_content = None
350
+ entities: List[str] = []
351
+ topics: List[str] = []
352
+
319353
try:
320
- caption = self.caption_frame(frame_path)
354
+ extraction = self.extract_screenshot_knowledge(frame_path)
355
+ if extraction:
356
+ caption = extraction.get("caption", "")
357
+ content_type = extraction.get("content_type")
358
+ text_content = extraction.get("text_content")
359
+ raw_entities = extraction.get("entities", [])
360
+ entities = [str(e) for e in raw_entities] if isinstance(raw_entities, list) else []
361
+ raw_topics = extraction.get("topics", [])
362
+ topics = [str(t) for t in raw_topics] if isinstance(raw_topics, list) else []
363
+ logger.info(
364
+ f"Frame {frame_index}: extracted "
365
+ f"{len(entities)} entities, "
366
+ f"{len(topics)} topics from {content_type}"
367
+ )
321368
except Exception as e:
322
- logger.warning(f"Caption failed for frame {frame_index}: {e}")
369
+ logger.warning(
370
+ f"Screenshot extraction failed for frame "
371
+ f"{frame_index}: {e}, falling back to caption"
372
+ )
373
+ try:
374
+ caption = self.caption_frame(frame_path)
375
+ except Exception as e2:
376
+ logger.warning(f"Caption also failed for frame {frame_index}: {e2}")
323377
324378
sc = ScreenCapture(
325379
frame_index=frame_index,
326380
caption=caption,
327381
confidence=confidence,
382
+ content_type=content_type,
383
+ text_content=text_content,
384
+ entities=entities,
385
+ topics=topics,
328386
)
329387
330388
if captures_dir:
331389
captures_dir.mkdir(parents=True, exist_ok=True)
332390
prefix = f"capture_{capture_index}"
333391
--- video_processor/analyzers/diagram_analyzer.py
+++ video_processor/analyzers/diagram_analyzer.py
@@ -55,10 +55,31 @@
55 """
56
57 # Caption prompt for screengrab fallback
58 _CAPTION_PROMPT = "Briefly describe what this image shows in 1-2 sentences."
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
61 def _read_image_bytes(image_path: Union[str, Path]) -> bytes:
62 """Read image file as bytes."""
63 return Path(image_path).read_bytes()
64
@@ -129,10 +150,17 @@
129
130 def caption_frame(self, image_path: Union[str, Path]) -> str:
131 """Get a brief caption for a screengrab fallback."""
132 image_bytes = _read_image_bytes(image_path)
133 return self.pm.analyze_image(image_bytes, _CAPTION_PROMPT, max_tokens=256)
 
 
 
 
 
 
 
134
135 def process_frames(
136 self,
137 frame_paths: List[Union[str, Path]],
138 diagrams_dir: Optional[Path] = None,
@@ -312,21 +340,51 @@
312 frame_index: int,
313 capture_index: int,
314 captures_dir: Optional[Path],
315 confidence: float,
316 ) -> ScreenCapture:
317 """Save a frame as a captioned screengrab."""
 
318 caption = ""
 
 
 
 
 
319 try:
320 caption = self.caption_frame(frame_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
321 except Exception as e:
322 logger.warning(f"Caption failed for frame {frame_index}: {e}")
 
 
 
 
 
 
 
323
324 sc = ScreenCapture(
325 frame_index=frame_index,
326 caption=caption,
327 confidence=confidence,
 
 
 
 
328 )
329
330 if captures_dir:
331 captures_dir.mkdir(parents=True, exist_ok=True)
332 prefix = f"capture_{capture_index}"
333
--- video_processor/analyzers/diagram_analyzer.py
+++ video_processor/analyzers/diagram_analyzer.py
@@ -55,10 +55,31 @@
55 """
56
57 # Caption prompt for screengrab fallback
58 _CAPTION_PROMPT = "Briefly describe what this image shows in 1-2 sentences."
59
60 # Rich screenshot extraction prompt — extracts knowledge from shared screens
61 _SCREENSHOT_EXTRACT_PROMPT = """\
62 Analyze this screenshot from a video recording. Extract all visible knowledge.
63 This is shared screen content (slides, code, documents, browser, terminal, etc.).
64
65 Return ONLY a JSON object (no markdown fences):
66 {
67 "content_type": "slide"|"code"|"document"|"terminal"|"browser"|"chat"|"other",
68 "caption": "one-sentence description of what is shown",
69 "text_content": "all visible text, preserving structure and line breaks",
70 "entities": ["named things visible: people, technologies, tools, services, \
71 projects, libraries, APIs, error codes, URLs, file paths"],
72 "topics": ["concepts or subjects this content is about"]
73 }
74
75 For text_content: extract ALL readable text — code, titles, bullet points, URLs,
76 error messages, terminal output, chat messages, file names. Be thorough.
77 For entities: extract specific named things, not generic words.
78 For topics: extract 2-5 high-level topics this content relates to.
79 """
80
81
82 def _read_image_bytes(image_path: Union[str, Path]) -> bytes:
83 """Read image file as bytes."""
84 return Path(image_path).read_bytes()
85
@@ -129,10 +150,17 @@
150
151 def caption_frame(self, image_path: Union[str, Path]) -> str:
152 """Get a brief caption for a screengrab fallback."""
153 image_bytes = _read_image_bytes(image_path)
154 return self.pm.analyze_image(image_bytes, _CAPTION_PROMPT, max_tokens=256)
155
156 def extract_screenshot_knowledge(self, image_path: Union[str, Path]) -> dict:
157 """Extract knowledge from a screenshot — text, entities, topics."""
158 image_bytes = _read_image_bytes(image_path)
159 raw = self.pm.analyze_image(image_bytes, _SCREENSHOT_EXTRACT_PROMPT, max_tokens=2048)
160 result = _parse_json_response(raw)
161 return result or {}
162
163 def process_frames(
164 self,
165 frame_paths: List[Union[str, Path]],
166 diagrams_dir: Optional[Path] = None,
@@ -312,21 +340,51 @@
340 frame_index: int,
341 capture_index: int,
342 captures_dir: Optional[Path],
343 confidence: float,
344 ) -> ScreenCapture:
345 """Extract knowledge from a screenshot and save it."""
346 # Try rich extraction first, fall back to caption-only
347 caption = ""
348 content_type = None
349 text_content = None
350 entities: List[str] = []
351 topics: List[str] = []
352
353 try:
354 extraction = self.extract_screenshot_knowledge(frame_path)
355 if extraction:
356 caption = extraction.get("caption", "")
357 content_type = extraction.get("content_type")
358 text_content = extraction.get("text_content")
359 raw_entities = extraction.get("entities", [])
360 entities = [str(e) for e in raw_entities] if isinstance(raw_entities, list) else []
361 raw_topics = extraction.get("topics", [])
362 topics = [str(t) for t in raw_topics] if isinstance(raw_topics, list) else []
363 logger.info(
364 f"Frame {frame_index}: extracted "
365 f"{len(entities)} entities, "
366 f"{len(topics)} topics from {content_type}"
367 )
368 except Exception as e:
369 logger.warning(
370 f"Screenshot extraction failed for frame "
371 f"{frame_index}: {e}, falling back to caption"
372 )
373 try:
374 caption = self.caption_frame(frame_path)
375 except Exception as e2:
376 logger.warning(f"Caption also failed for frame {frame_index}: {e2}")
377
378 sc = ScreenCapture(
379 frame_index=frame_index,
380 caption=caption,
381 confidence=confidence,
382 content_type=content_type,
383 text_content=text_content,
384 entities=entities,
385 topics=topics,
386 )
387
388 if captures_dir:
389 captures_dir.mkdir(parents=True, exist_ok=True)
390 prefix = f"capture_{capture_index}"
391
--- video_processor/integrators/knowledge_graph.py
+++ video_processor/integrators/knowledge_graph.py
@@ -196,10 +196,42 @@
196196
self._store.add_occurrence(
197197
diagram_id,
198198
source if text_content else diagram_id,
199199
text=f"frame_index={diagram.get('frame_index')}",
200200
)
201
+
202
+ def process_screenshots(self, screenshots: List[Dict]) -> None:
203
+ """Process screenshot captures into knowledge graph.
204
+
205
+ Extracts entities from text_content and adds screenshot-specific
206
+ entities from the entities list.
207
+ """
208
+ for i, capture in enumerate(screenshots):
209
+ text_content = capture.get("text_content", "")
210
+ source = f"screenshot_{i}"
211
+ content_type = capture.get("content_type", "screenshot")
212
+
213
+ # Extract entities from visible text via LLM
214
+ if text_content:
215
+ self.add_content(text_content, source)
216
+
217
+ # Add explicitly identified entities from vision extraction
218
+ for entity_name in capture.get("entities", []):
219
+ if not entity_name or len(entity_name) < 2:
220
+ continue
221
+ if not self._store.has_entity(entity_name):
222
+ self._store.merge_entity(
223
+ entity_name,
224
+ "concept",
225
+ [f"Identified in {content_type} screenshot"],
226
+ source=source,
227
+ )
228
+ self._store.add_occurrence(
229
+ entity_name,
230
+ source,
231
+ text=f"Visible in {content_type} (frame {capture.get('frame_index', '?')})",
232
+ )
201233
202234
def to_data(self) -> KnowledgeGraphData:
203235
"""Convert to pydantic KnowledgeGraphData model."""
204236
nodes = []
205237
for entity in self._store.get_all_entities():
206238
--- video_processor/integrators/knowledge_graph.py
+++ video_processor/integrators/knowledge_graph.py
@@ -196,10 +196,42 @@
196 self._store.add_occurrence(
197 diagram_id,
198 source if text_content else diagram_id,
199 text=f"frame_index={diagram.get('frame_index')}",
200 )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
202 def to_data(self) -> KnowledgeGraphData:
203 """Convert to pydantic KnowledgeGraphData model."""
204 nodes = []
205 for entity in self._store.get_all_entities():
206
--- video_processor/integrators/knowledge_graph.py
+++ video_processor/integrators/knowledge_graph.py
@@ -196,10 +196,42 @@
196 self._store.add_occurrence(
197 diagram_id,
198 source if text_content else diagram_id,
199 text=f"frame_index={diagram.get('frame_index')}",
200 )
201
202 def process_screenshots(self, screenshots: List[Dict]) -> None:
203 """Process screenshot captures into knowledge graph.
204
205 Extracts entities from text_content and adds screenshot-specific
206 entities from the entities list.
207 """
208 for i, capture in enumerate(screenshots):
209 text_content = capture.get("text_content", "")
210 source = f"screenshot_{i}"
211 content_type = capture.get("content_type", "screenshot")
212
213 # Extract entities from visible text via LLM
214 if text_content:
215 self.add_content(text_content, source)
216
217 # Add explicitly identified entities from vision extraction
218 for entity_name in capture.get("entities", []):
219 if not entity_name or len(entity_name) < 2:
220 continue
221 if not self._store.has_entity(entity_name):
222 self._store.merge_entity(
223 entity_name,
224 "concept",
225 [f"Identified in {content_type} screenshot"],
226 source=source,
227 )
228 self._store.add_occurrence(
229 entity_name,
230 source,
231 text=f"Visible in {content_type} (frame {capture.get('frame_index', '?')})",
232 )
233
234 def to_data(self) -> KnowledgeGraphData:
235 """Convert to pydantic KnowledgeGraphData model."""
236 nodes = []
237 for entity in self._store.get_all_entities():
238
--- video_processor/models.py
+++ video_processor/models.py
@@ -97,19 +97,32 @@
9797
png_path: Optional[str] = Field(default=None, description="Relative path to rendered PNG")
9898
mermaid_path: Optional[str] = Field(default=None, description="Relative path to mermaid source")
9999
100100
101101
class ScreenCapture(BaseModel):
102
- """A screengrab fallback when diagram extraction fails or is uncertain."""
102
+ """A screen capture with knowledge extraction from shared content."""
103103
104104
frame_index: int = Field(description="Index of the source frame")
105105
timestamp: Optional[float] = Field(default=None, description="Timestamp in video (seconds)")
106106
caption: Optional[str] = Field(default=None, description="Brief description of the content")
107107
image_path: Optional[str] = Field(default=None, description="Relative path to screenshot")
108108
confidence: float = Field(
109109
default=0.0, description="Detection confidence that triggered fallback"
110110
)
111
+ content_type: Optional[str] = Field(
112
+ default=None,
113
+ description="Content type: slide, code, document, terminal, browser, chat, other",
114
+ )
115
+ text_content: Optional[str] = Field(
116
+ default=None, description="All visible text extracted from the screenshot"
117
+ )
118
+ entities: List[str] = Field(
119
+ default_factory=list, description="Entities identified in the screenshot"
120
+ )
121
+ topics: List[str] = Field(
122
+ default_factory=list, description="Topics or concepts visible in the screenshot"
123
+ )
111124
112125
113126
class SourceRecord(BaseModel):
114127
"""A content source registered in the knowledge graph for provenance tracking."""
115128
116129
--- video_processor/models.py
+++ video_processor/models.py
@@ -97,19 +97,32 @@
97 png_path: Optional[str] = Field(default=None, description="Relative path to rendered PNG")
98 mermaid_path: Optional[str] = Field(default=None, description="Relative path to mermaid source")
99
100
101 class ScreenCapture(BaseModel):
102 """A screengrab fallback when diagram extraction fails or is uncertain."""
103
104 frame_index: int = Field(description="Index of the source frame")
105 timestamp: Optional[float] = Field(default=None, description="Timestamp in video (seconds)")
106 caption: Optional[str] = Field(default=None, description="Brief description of the content")
107 image_path: Optional[str] = Field(default=None, description="Relative path to screenshot")
108 confidence: float = Field(
109 default=0.0, description="Detection confidence that triggered fallback"
110 )
 
 
 
 
 
 
 
 
 
 
 
 
 
111
112
113 class SourceRecord(BaseModel):
114 """A content source registered in the knowledge graph for provenance tracking."""
115
116
--- video_processor/models.py
+++ video_processor/models.py
@@ -97,19 +97,32 @@
97 png_path: Optional[str] = Field(default=None, description="Relative path to rendered PNG")
98 mermaid_path: Optional[str] = Field(default=None, description="Relative path to mermaid source")
99
100
class ScreenCapture(BaseModel):
    """A screen capture with knowledge extraction from shared content.

    Produced for frames whose diagram classification confidence was too low
    for full diagram analysis; carries the vision model's screenshot-level
    extraction (content type, visible text, entities, topics) so the
    knowledge graph can still ingest the frame.
    """

    frame_index: int = Field(description="Index of the source frame")
    timestamp: Optional[float] = Field(default=None, description="Timestamp in video (seconds)")
    caption: Optional[str] = Field(default=None, description="Brief description of the content")
    image_path: Optional[str] = Field(default=None, description="Relative path to screenshot")
    # Confidence from the classify pass that routed this frame to the
    # screenshot path instead of full diagram analysis.
    confidence: float = Field(
        default=0.0, description="Detection confidence that triggered fallback"
    )
    content_type: Optional[str] = Field(
        default=None,
        description="Content type: slide, code, document, terminal, browser, chat, other",
    )
    text_content: Optional[str] = Field(
        default=None, description="All visible text extracted from the screenshot"
    )
    entities: List[str] = Field(
        default_factory=list, description="Entities identified in the screenshot"
    )
    topics: List[str] = Field(
        default_factory=list, description="Topics or concepts visible in the screenshot"
    )
124
125
126 class SourceRecord(BaseModel):
127 """A content source registered in the knowledge graph for provenance tracking."""
128
129
--- video_processor/pipeline.py
+++ video_processor/pipeline.py
@@ -243,10 +243,13 @@
243243
)
244244
kg.process_transcript(transcript_data)
245245
if diagrams:
246246
diagram_dicts = [d.model_dump() for d in diagrams]
247247
kg.process_diagrams(diagram_dicts)
248
+ if screen_captures:
249
+ capture_dicts = [sc.model_dump() for sc in screen_captures]
250
+ kg.process_screenshots(capture_dicts)
248251
# Export JSON copy alongside the SQLite db
249252
kg.save(kg_json_path)
250253
pipeline_bar.update(1)
251254
_notify(progress_callback, "on_step_complete", steps[4], 5, total_steps)
252255
253256
--- video_processor/pipeline.py
+++ video_processor/pipeline.py
@@ -243,10 +243,13 @@
243 )
244 kg.process_transcript(transcript_data)
245 if diagrams:
246 diagram_dicts = [d.model_dump() for d in diagrams]
247 kg.process_diagrams(diagram_dicts)
 
 
 
248 # Export JSON copy alongside the SQLite db
249 kg.save(kg_json_path)
250 pipeline_bar.update(1)
251 _notify(progress_callback, "on_step_complete", steps[4], 5, total_steps)
252
253
--- video_processor/pipeline.py
+++ video_processor/pipeline.py
@@ -243,10 +243,13 @@
243 )
244 kg.process_transcript(transcript_data)
245 if diagrams:
246 diagram_dicts = [d.model_dump() for d in diagrams]
247 kg.process_diagrams(diagram_dicts)
248 if screen_captures:
249 capture_dicts = [sc.model_dump() for sc in screen_captures]
250 kg.process_screenshots(capture_dicts)
251 # Export JSON copy alongside the SQLite db
252 kg.save(kg_json_path)
253 pipeline_bar.update(1)
254 _notify(progress_callback, "on_step_complete", steps[4], 5, total_steps)
255
256

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button