|
1
|
"""Core video processing pipeline — the reusable function both CLI commands call.""" |
|
2
|
|
|
3
|
import hashlib |
|
4
|
import json |
|
5
|
import logging |
|
6
|
import mimetypes |
|
7
|
import time |
|
8
|
from datetime import datetime |
|
9
|
from pathlib import Path |
|
10
|
from typing import Optional |
|
11
|
|
|
12
|
from tqdm import tqdm |
|
13
|
|
|
14
|
from video_processor.analyzers.diagram_analyzer import DiagramAnalyzer |
|
15
|
from video_processor.extractors.audio_extractor import AudioExtractor |
|
16
|
from video_processor.extractors.frame_extractor import ( |
|
17
|
extract_frames, |
|
18
|
filter_people_frames, |
|
19
|
save_frames, |
|
20
|
) |
|
21
|
from video_processor.integrators.knowledge_graph import KnowledgeGraph |
|
22
|
from video_processor.integrators.plan_generator import PlanGenerator |
|
23
|
from video_processor.models import ( |
|
24
|
ActionItem, |
|
25
|
KeyPoint, |
|
26
|
ProcessingStats, |
|
27
|
ProgressCallback, |
|
28
|
VideoManifest, |
|
29
|
VideoMetadata, |
|
30
|
) |
|
31
|
from video_processor.output_structure import create_video_output_dirs, write_video_manifest |
|
32
|
from video_processor.providers.manager import ProviderManager |
|
33
|
from video_processor.utils.export import export_all_formats |
|
34
|
|
|
35
|
logger = logging.getLogger(__name__) |
|
36
|
|
|
37
|
|
|
38
|
def _notify(cb: Optional[ProgressCallback], method: str, *args, **kwargs) -> None:
    """Invoke *method* on the progress callback, if any, swallowing and logging errors.

    Callback failures must never abort the pipeline, so every exception
    (including a missing method on the callback object) is logged as a
    warning and otherwise ignored.
    """
    if cb is None:
        return
    try:
        # getattr stays inside the try so a missing method is also
        # downgraded to a warning rather than raised to the caller.
        handler = getattr(cb, method)
        handler(*args, **kwargs)
    except Exception as e:
        logger.warning(f"Progress callback {method} failed: {e}")
|
46
|
|
|
47
|
|
|
48
|
def process_single_video(
    input_path: str | Path,
    output_dir: str | Path,
    provider_manager: Optional[ProviderManager] = None,
    depth: str = "standard",
    focus_areas: Optional[list[str]] = None,
    sampling_rate: float = 0.5,
    change_threshold: float = 0.15,
    periodic_capture_seconds: float = 30.0,
    use_gpu: bool = False,
    title: Optional[str] = None,
    progress_callback: Optional[ProgressCallback] = None,
    speaker_hints: Optional[list[str]] = None,
) -> VideoManifest:
    """
    Full pipeline: frames -> audio -> transcription -> diagrams -> KG -> report -> export.

    Each step is resumable: if its output artifacts already exist in the
    output directory the step is skipped and the artifacts are loaded
    from disk instead.

    Args:
        input_path: Path to the source video file.
        output_dir: Root directory for all generated artifacts.
        provider_manager: LLM/transcription provider; a default
            ProviderManager is created when None.
        depth: Analysis depth; "basic" skips visual analysis and
            "standard" caps visual analysis at 10 frames (else 20).
        focus_areas: Restricts analysis (e.g. "diagrams"); empty/None
            means all areas.
        sampling_rate: Frame-extraction sampling rate (passed through to
            extract_frames).
        change_threshold: Scene-change threshold for frame extraction.
        periodic_capture_seconds: Forced periodic capture interval.
        use_gpu: Enables GPU frame extraction when True.
        title: Report title; defaults to "Analysis of <video stem>".
        progress_callback: Optional ProgressCallback notified via
            on_step_start/on_step_complete for each of the 8 steps.
        speaker_hints: Optional speaker names forwarded to transcription.

    Returns:
        A populated VideoManifest.
    """
    start_time = time.time()
    input_path = Path(input_path)
    output_dir = Path(output_dir)
    pm = provider_manager or ProviderManager()
    focus_areas = focus_areas or []

    video_name = input_path.stem
    if not title:
        title = f"Analysis of {video_name}"

    # Create standardized directory structure.
    # NOTE(review): assumes dirs exposes keys "root", "frames", "transcript",
    # "diagrams", "captures", "results" — confirm against output_structure.
    dirs = create_video_output_dirs(output_dir, video_name)

    logger.info(f"Processing: {input_path}")
    logger.info(f"Depth: {depth}, Focus: {focus_areas or 'all'}")

    steps = [
        "Extract frames",
        "Extract audio",
        "Transcribe",
        "Analyze visuals",
        "Build knowledge graph",
        "Extract key points",
        "Generate report",
        "Export formats",
    ]
    # The bar is advanced manually with update(1); the iterable only seeds total.
    pipeline_bar = tqdm(steps, desc="Pipeline", unit="step", position=0)

    total_steps = len(steps)

    # --- Step 1: Extract frames ---
    _notify(progress_callback, "on_step_start", steps[0], 1, total_steps)
    pm.usage.start_step("Frame extraction")
    pipeline_bar.set_description("Pipeline: extracting frames")
    existing_frames = sorted(dirs["frames"].glob("frame_*.jpg"))
    people_removed = 0
    if existing_frames:
        # Resume: reuse frames already on disk (people filtering count is
        # unknown in this path, so people_removed stays 0).
        frame_paths = existing_frames
        logger.info(f"Resuming: found {len(frame_paths)} frames on disk, skipping extraction")
    else:
        logger.info("Extracting video frames...")
        frames = extract_frames(
            input_path,
            sampling_rate=sampling_rate,
            change_threshold=change_threshold,
            periodic_capture_seconds=periodic_capture_seconds,
            disable_gpu=not use_gpu,
        )
        logger.info(f"Extracted {len(frames)} raw frames")

        # Filter out people/webcam frames before saving
        frames, people_removed = filter_people_frames(frames)
        frame_paths = save_frames(frames, dirs["frames"], "frame")
        logger.info(f"Saved {len(frames)} content frames ({people_removed} people frames filtered)")
    pipeline_bar.update(1)
    _notify(progress_callback, "on_step_complete", steps[0], 1, total_steps)

    # --- Step 2: Extract audio ---
    _notify(progress_callback, "on_step_start", steps[1], 2, total_steps)
    pm.usage.start_step("Audio extraction")
    pipeline_bar.set_description("Pipeline: extracting audio")
    audio_path = dirs["root"] / "audio" / f"{video_name}.wav"
    audio_extractor = AudioExtractor()
    if audio_path.exists():
        logger.info(f"Resuming: found audio at {audio_path}, skipping extraction")
    else:
        logger.info("Extracting audio...")
        audio_path = audio_extractor.extract_audio(input_path, output_path=audio_path)
    # Audio properties (duration etc.) are needed by later steps in both
    # the fresh and resumed paths.
    audio_props = audio_extractor.get_audio_properties(audio_path)
    pipeline_bar.update(1)
    _notify(progress_callback, "on_step_complete", steps[1], 2, total_steps)

    # --- Step 3: Transcribe ---
    _notify(progress_callback, "on_step_start", steps[2], 3, total_steps)
    pm.usage.start_step("Transcription")
    pipeline_bar.set_description("Pipeline: transcribing audio")
    transcript_json = dirs["transcript"] / "transcript.json"
    if transcript_json.exists():
        logger.info("Resuming: found transcript on disk, skipping transcription")
        transcript_data = json.loads(transcript_json.read_text())
        transcript_text = transcript_data.get("text", "")
        segments = transcript_data.get("segments", [])
    else:
        logger.info("Transcribing audio...")
        transcription = pm.transcribe_audio(audio_path, speaker_hints=speaker_hints)
        transcript_text = transcription.get("text", "")
        segments = transcription.get("segments", [])

        # Save transcript files
        transcript_data = {
            "text": transcript_text,
            "segments": segments,
            # Fall back to ffprobe-derived duration when the provider
            # does not report one.
            "duration": transcription.get("duration") or audio_props.get("duration"),
            "language": transcription.get("language"),
            "provider": transcription.get("provider"),
            "model": transcription.get("model"),
        }
        transcript_json.write_text(json.dumps(transcript_data, indent=2))

        transcript_txt = dirs["transcript"] / "transcript.txt"
        transcript_txt.write_text(transcript_text)

        # SRT: 4 lines per cue — index, time range, text, blank separator.
        transcript_srt = dirs["transcript"] / "transcript.srt"
        srt_lines = []
        for i, seg in enumerate(segments):
            start = seg.get("start", 0)
            end = seg.get("end", 0)
            srt_lines.append(str(i + 1))
            srt_lines.append(f"{_format_srt_time(start)} --> {_format_srt_time(end)}")
            srt_lines.append(seg.get("text", "").strip())
            srt_lines.append("")
        transcript_srt.write_text("\n".join(srt_lines))
    pipeline_bar.update(1)
    _notify(progress_callback, "on_step_complete", steps[2], 3, total_steps)

    # --- Step 4: Diagram extraction ---
    _notify(progress_callback, "on_step_start", steps[3], 4, total_steps)
    pm.usage.start_step("Visual analysis")
    pipeline_bar.set_description("Pipeline: analyzing visuals")
    diagrams = []
    screen_captures = []
    existing_diagrams = (
        sorted(dirs["diagrams"].glob("diagram_*.json")) if dirs["diagrams"].exists() else []
    )
    if existing_diagrams:
        # Resume: load previously analyzed diagrams. screen_captures are
        # not reloaded in this path and remain empty.
        logger.info(f"Resuming: found {len(existing_diagrams)} diagrams on disk, skipping analysis")
        from video_processor.models import DiagramResult

        for dj in existing_diagrams:
            try:
                diagrams.append(DiagramResult.model_validate_json(dj.read_text()))
            except Exception as e:
                logger.warning(f"Failed to load diagram {dj}: {e}")
    elif depth != "basic" and (not focus_areas or "diagrams" in focus_areas):
        logger.info("Analyzing visual elements...")
        analyzer = DiagramAnalyzer(provider_manager=pm)
        max_frames = 10 if depth == "standard" else 20
        # Evenly sample across all frames rather than just taking the first N
        if len(frame_paths) <= max_frames:
            subset = frame_paths
        else:
            step = len(frame_paths) / max_frames
            subset = [frame_paths[int(i * step)] for i in range(max_frames)]
        diagrams, screen_captures = analyzer.process_frames(
            subset, diagrams_dir=dirs["diagrams"], captures_dir=dirs["captures"]
        )
    pipeline_bar.update(1)
    _notify(progress_callback, "on_step_complete", steps[3], 4, total_steps)

    # --- Step 5: Knowledge graph ---
    _notify(progress_callback, "on_step_start", steps[4], 5, total_steps)
    pm.usage.start_step("Knowledge graph")
    pipeline_bar.set_description("Pipeline: building knowledge graph")
    kg_db_path = dirs["results"] / "knowledge_graph.db"
    kg_json_path = dirs["results"] / "knowledge_graph.json"
    # Generate a stable source ID from the input path
    source_id = hashlib.sha256(str(input_path).encode()).hexdigest()[:12]
    mime_type = mimetypes.guess_type(str(input_path))[0] or "video/mp4"

    if kg_db_path.exists():
        # Resume: KnowledgeGraph presumably loads existing state from the
        # db_path — TODO confirm against KnowledgeGraph's constructor.
        logger.info("Resuming: found knowledge graph on disk, loading")
        kg = KnowledgeGraph(provider_manager=pm, db_path=kg_db_path)
    else:
        logger.info("Building knowledge graph...")
        kg = KnowledgeGraph(provider_manager=pm, db_path=kg_db_path)
        kg.register_source(
            {
                "source_id": source_id,
                "source_type": "video",
                "title": title,
                "path": str(input_path),
                "mime_type": mime_type,
                "ingested_at": datetime.now().isoformat(),
                "metadata": {"duration_seconds": audio_props.get("duration")},
            }
        )
        kg.process_transcript(transcript_data)
        if diagrams:
            diagram_dicts = [d.model_dump() for d in diagrams]
            kg.process_diagrams(diagram_dicts)
        if screen_captures:
            capture_dicts = [sc.model_dump() for sc in screen_captures]
            kg.process_screenshots(capture_dicts)
        # Export JSON copy alongside the SQLite db
        kg.save(kg_json_path)
    pipeline_bar.update(1)
    _notify(progress_callback, "on_step_complete", steps[4], 5, total_steps)

    # --- Step 6: Extract key points & action items ---
    _notify(progress_callback, "on_step_start", steps[5], 6, total_steps)
    pm.usage.start_step("Key points & actions")
    pipeline_bar.set_description("Pipeline: extracting key points")
    kp_path = dirs["results"] / "key_points.json"
    ai_path = dirs["results"] / "action_items.json"
    if kp_path.exists() and ai_path.exists():
        logger.info("Resuming: found key points and action items on disk")
        key_points = [KeyPoint(**item) for item in json.loads(kp_path.read_text())]
        action_items = [ActionItem(**item) for item in json.loads(ai_path.read_text())]
    else:
        key_points = _extract_key_points(pm, transcript_text)
        action_items = _extract_action_items(pm, transcript_text)

        kp_path.write_text(json.dumps([kp.model_dump() for kp in key_points], indent=2))
        ai_path.write_text(json.dumps([ai.model_dump() for ai in action_items], indent=2))
    pipeline_bar.update(1)
    _notify(progress_callback, "on_step_complete", steps[5], 6, total_steps)

    # --- Step 7: Generate markdown report ---
    _notify(progress_callback, "on_step_start", steps[6], 7, total_steps)
    pm.usage.start_step("Report generation")
    pipeline_bar.set_description("Pipeline: generating report")
    md_path = dirs["results"] / "analysis.md"
    if md_path.exists():
        logger.info("Resuming: found analysis report on disk, skipping generation")
    else:
        logger.info("Generating report...")
        plan_gen = PlanGenerator(provider_manager=pm, knowledge_graph=kg)
        plan_gen.generate_markdown(
            transcript=transcript_data,
            key_points=[kp.model_dump() for kp in key_points],
            diagrams=[d.model_dump() for d in diagrams],
            knowledge_graph=kg.to_dict(),
            video_title=title,
            output_path=md_path,
        )
    pipeline_bar.update(1)
    _notify(progress_callback, "on_step_complete", steps[6], 7, total_steps)

    # --- Build manifest ---
    elapsed = time.time() - start_time
    manifest = VideoManifest(
        video=VideoMetadata(
            title=title,
            source_path=str(input_path),
            duration_seconds=audio_props.get("duration"),
        ),
        stats=ProcessingStats(
            # NOTE(review): this records the *completion* wall-clock time
            # under start_time — confirm intended semantics.
            start_time=datetime.now().isoformat(),
            duration_seconds=elapsed,
            frames_extracted=len(frame_paths),
            people_frames_filtered=people_removed,
            diagrams_detected=len(diagrams),
            screen_captures=len(screen_captures),
            transcript_duration_seconds=audio_props.get("duration"),
            models_used=pm.get_models_used(),
        ),
        # Artifact paths are stored relative to the video's output root.
        transcript_json="transcript/transcript.json",
        transcript_txt="transcript/transcript.txt",
        transcript_srt="transcript/transcript.srt",
        analysis_md="results/analysis.md",
        knowledge_graph_json="results/knowledge_graph.json",
        knowledge_graph_db="results/knowledge_graph.db",
        key_points_json="results/key_points.json",
        action_items_json="results/action_items.json",
        key_points=key_points,
        action_items=action_items,
        diagrams=diagrams,
        screen_captures=screen_captures,
        frame_paths=[f"frames/{Path(p).name}" for p in frame_paths],
    )

    # --- Step 8: Export all formats ---
    _notify(progress_callback, "on_step_start", steps[7], 8, total_steps)
    pm.usage.start_step("Export formats")
    pipeline_bar.set_description("Pipeline: exporting formats")
    manifest = export_all_formats(output_dir, manifest)

    # NOTE(review): end_step() is only called here, after the final step;
    # presumably start_step() closes the previous step implicitly — verify
    # against the usage tracker.
    pm.usage.end_step()
    pipeline_bar.update(1)
    _notify(progress_callback, "on_step_complete", steps[7], 8, total_steps)
    pipeline_bar.set_description("Pipeline: complete")
    pipeline_bar.close()

    # Write manifest
    write_video_manifest(manifest, output_dir)

    logger.info(
        f"Processing complete in {elapsed:.1f}s: {len(diagrams)} diagrams, "
        f"{len(screen_captures)} captures, {len(key_points)} key points, "
        f"{len(action_items)} action items"
    )

    return manifest
|
351
|
|
|
352
|
|
|
353
|
def _extract_key_points(pm: ProviderManager, text: str) -> list[KeyPoint]:
    """Distill key points from a transcript via the LLM.

    Best-effort: any provider or parsing failure is logged and an empty
    list is returned. The transcript is capped at 8000 characters.
    """
    from video_processor.utils.json_parsing import parse_json_from_response

    prompt = (
        "Extract the key points from this transcript.\n\n"
        f"TRANSCRIPT:\n{text[:8000]}\n\n"
        'Return a JSON array: [{"point": "...", "topic": "...", "details": "..."}]\n'
        "Return ONLY the JSON array."
    )
    try:
        response = pm.chat([{"role": "user", "content": prompt}], temperature=0.3)
        data = parse_json_from_response(response)
        if isinstance(data, list):
            # Keep only well-formed entries that carry a non-empty point.
            results = []
            for entry in data:
                if not (isinstance(entry, dict) and entry.get("point")):
                    continue
                results.append(
                    KeyPoint(
                        point=entry.get("point", ""),
                        topic=entry.get("topic"),
                        details=entry.get("details"),
                    )
                )
            return results
    except Exception as e:
        logger.warning(f"Key point extraction failed: {e}")
    return []
|
379
|
|
|
380
|
|
|
381
|
def _extract_action_items(pm: ProviderManager, text: str) -> list[ActionItem]:
    """Pull action items out of a transcript via the LLM.

    Best-effort: any provider or parsing failure is logged and an empty
    list is returned. The transcript is capped at 8000 characters.
    """
    from video_processor.utils.json_parsing import parse_json_from_response

    prompt = (
        "Extract all action items from this transcript.\n\n"
        f"TRANSCRIPT:\n{text[:8000]}\n\n"
        'Return a JSON array: [{"action": "...", "assignee": "...", "deadline": "...", '
        '"priority": "...", "context": "..."}]\n'
        "Return ONLY the JSON array."
    )
    try:
        response = pm.chat([{"role": "user", "content": prompt}], temperature=0.3)
        data = parse_json_from_response(response)
        if isinstance(data, list):
            # Keep only well-formed entries that carry a non-empty action.
            results = []
            for entry in data:
                if not (isinstance(entry, dict) and entry.get("action")):
                    continue
                results.append(
                    ActionItem(
                        action=entry.get("action", ""),
                        assignee=entry.get("assignee"),
                        deadline=entry.get("deadline"),
                        priority=entry.get("priority"),
                        context=entry.get("context"),
                    )
                )
            return results
    except Exception as e:
        logger.warning(f"Action item extraction failed: {e}")
    return []
|
410
|
|
|
411
|
|
|
412
|
def _format_srt_time(seconds: float) -> str: |
|
413
|
"""Format seconds as SRT timestamp (HH:MM:SS,mmm).""" |
|
414
|
h = int(seconds // 3600) |
|
415
|
m = int((seconds % 3600) // 60) |
|
416
|
s = int(seconds % 60) |
|
417
|
ms = int((seconds % 1) * 1000) |
|
418
|
return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" |
|
419
|
|