|
1
|
"""Frame extraction module for video processing.""" |
|
2
|
|
|
3
|
import functools |
|
4
|
import logging |
|
5
|
import sys |
|
6
|
import tempfile |
|
7
|
from pathlib import Path |
|
8
|
from typing import List, Optional, Tuple, Union |
|
9
|
|
|
10
|
import cv2 |
|
11
|
import numpy as np |
|
12
|
from tqdm import tqdm |
|
13
|
|
|
14
|
# Module-level logger; emits under this module's dotted name.
logger = logging.getLogger(__name__)

# Haar cascade for face detection — ships with OpenCV
_FACE_CASCADE_PATH = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
# Cached classifier instance; lazily created by _get_face_cascade().
_FACE_CASCADE = None
|
19
|
|
|
20
|
|
|
21
|
def _get_face_cascade() -> cv2.CascadeClassifier:
    """Lazy-load and cache the Haar frontal-face cascade classifier.

    Returns
    -------
    cv2.CascadeClassifier
        The cached, loaded cascade.

    Raises
    ------
    RuntimeError
        If the cascade file could not be loaded. OpenCV does not raise on a
        bad path — it silently returns an empty classifier that never detects
        anything — so we check explicitly.
    """
    global _FACE_CASCADE
    if _FACE_CASCADE is None:
        cascade = cv2.CascadeClassifier(_FACE_CASCADE_PATH)
        if cascade.empty():
            raise RuntimeError(f"Failed to load face cascade from {_FACE_CASCADE_PATH}")
        _FACE_CASCADE = cascade
    return _FACE_CASCADE
|
27
|
|
|
28
|
|
|
29
|
def detect_faces(frame: np.ndarray) -> List[Tuple[int, int, int, int]]:
    """Run Haar-cascade face detection on a frame.

    Accepts BGR or already-grayscale input; returns a list of
    (x, y, w, h) bounding boxes, empty when nothing is found.
    """
    if len(frame.shape) == 3:
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    else:
        gray = frame

    detections = _get_face_cascade().detectMultiScale(
        gray, scaleFactor=1.1, minNeighbors=5, minSize=(40, 40)
    )
    # detectMultiScale yields an empty tuple when no faces are found.
    if len(detections) == 0:
        return []
    return list(detections)
|
35
|
|
|
36
|
|
|
37
|
def is_people_frame(
    frame: np.ndarray,
    face_area_threshold: float = 0.03,
    min_face_size: int = 90,
) -> bool:
    """Classify a frame as primarily people/webcam content.

    A frame counts as a "people frame" when either:

    * two or more webcam-sized faces are present, or the detected faces
      together cover at least ``face_area_threshold`` of the frame; or
    * the frame has heavy near-black borders (>25% of pixels) together with
      at least one detected face of any size — the typical video-conference
      layout (e.g. a profile picture on a black background).

    Faces narrower than ``min_face_size`` pixels are ignored for the first
    rule, since those are usually sidebar thumbnails during screen shares.

    Parameters
    ----------
    frame : np.ndarray
        BGR image frame
    face_area_threshold : float
        Minimum ratio of total face area to frame area to classify as people frame
    min_face_size : int
        Minimum face width/height in pixels to count as a significant face

    Returns
    -------
    bool
        True if frame is primarily people/webcam content
    """
    height, width = frame.shape[:2]
    frame_area = height * width

    all_faces = detect_faces(frame)

    # Keep only webcam-sized detections; tiny ones are sidebar thumbnails.
    significant_faces = [face for face in all_faces if face[2] >= min_face_size]

    if significant_faces:
        total_face_area = sum(fw * fh for (_, _, fw, fh) in significant_faces)
        face_ratio = total_face_area / frame_area

        # Multiple webcam-sized faces, or a large combined face area,
        # means the frame is dominated by people.
        if len(significant_faces) >= 2 or face_ratio >= face_area_threshold:
            logger.debug(
                f"People frame: {len(significant_faces)} significant faces, "
                f"face_ratio={face_ratio:.3f}"
            )
            return True

    # Video-conference layout check: a large fraction of near-black pixels.
    if len(frame.shape) == 3:
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    else:
        gray = frame
    black_ratio = np.sum(gray < 15) / frame_area

    if black_ratio > 0.25 and all_faces:
        # Thick black bars plus any face at all = conference UI.
        logger.debug(f"People frame: black_ratio={black_ratio:.2f} with {len(all_faces)} faces")
        return True

    return False
|
98
|
|
|
99
|
|
|
100
|
def filter_people_frames(
    frames: List[np.ndarray],
    face_area_threshold: float = 0.03,
) -> Tuple[List[np.ndarray], int]:
    """Drop frames that primarily show people/webcam views.

    Returns (filtered_frames, num_removed).
    """
    kept: List[np.ndarray] = []
    for frame in tqdm(frames, desc="Filtering people frames", unit="frame"):
        if not is_people_frame(frame, face_area_threshold):
            kept.append(frame)

    # Everything not kept was classified as a people frame.
    removed = len(frames) - len(kept)
    if removed:
        logger.info(f"Filtered out {removed}/{len(frames)} people/webcam frames")
    return kept, removed
|
120
|
|
|
121
|
|
|
122
|
def is_gpu_available() -> bool:
    """Return True when OpenCV reports at least one CUDA-capable device."""
    try:
        return cv2.cuda.getCudaEnabledDeviceCount() > 0
    except Exception:
        # Builds without CUDA support may not expose cv2.cuda at all.
        return False
|
130
|
|
|
131
|
|
|
132
|
def gpu_accelerated(func):
    """Decorator that dispatches to the GPU implementation when available.

    The wrapped function gains an optional ``disable_gpu`` keyword argument,
    consumed by the wrapper and never forwarded: when truthy, the CPU
    implementation is used even if a CUDA device is present.

    NOTE(review): the GPU path dispatches to the module-level ``func_gpu``
    regardless of which function is decorated — fine while only
    ``extract_frames`` uses this decorator, but worth confirming before
    decorating anything else.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Pop the control kwarg exactly once so neither implementation
        # receives an unexpected keyword (the original duplicated this
        # pop in both branches).
        disable_gpu = kwargs.pop("disable_gpu", None)
        if not disable_gpu and is_gpu_available():
            return func_gpu(*args, **kwargs)
        return func(*args, **kwargs)

    return wrapper
|
146
|
|
|
147
|
|
|
148
|
def calculate_frame_difference(prev_frame: np.ndarray, curr_frame: np.ndarray) -> float:
    """Compute a normalized difference score between two frames.

    Each frame is converted to grayscale if needed, the per-pixel absolute
    difference is taken, and its mean is scaled from [0, 255] down to [0, 1].

    Parameters
    ----------
    prev_frame : np.ndarray
        Previous frame
    curr_frame : np.ndarray
        Current frame

    Returns
    -------
    float
        Difference score between 0 and 1
    """
    prev_gray = (
        cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY) if len(prev_frame.shape) == 3 else prev_frame
    )
    curr_gray = (
        cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY) if len(curr_frame.shape) == 3 else curr_frame
    )

    # Mean absolute pixel difference, normalized by the 8-bit max value.
    return np.mean(cv2.absdiff(prev_gray, curr_gray)) / 255.0
|
180
|
|
|
181
|
|
|
182
|
@gpu_accelerated
def extract_frames(
    video_path: Union[str, Path],
    sampling_rate: float = 1.0,
    change_threshold: float = 0.15,
    periodic_capture_seconds: float = 30.0,
    max_frames: Optional[int] = None,
    resize_to: Optional[Tuple[int, int]] = None,
    max_memory_mb: int = 1024,
) -> List[np.ndarray]:
    """
    Extract frames from video based on visual change detection + periodic capture.

    Two capture strategies work together:
    1. Change detection: capture when visual difference exceeds threshold
       (catches transitions like webcam ↔ screen share)
    2. Periodic capture: capture every N seconds regardless of change
       (catches slow-evolving content like document scrolling)

    The downstream people filter removes any webcam frames captured periodically.

    Parameters
    ----------
    video_path : str or Path
        Path to video file
    sampling_rate : float
        Frame sampling rate (1.0 = every frame)
    change_threshold : float
        Threshold for detecting significant visual changes
    periodic_capture_seconds : float
        Capture a frame every N seconds regardless of change (0 to disable)
    max_frames : int, optional
        Maximum number of frames to extract (flushed frames count toward it)
    resize_to : tuple of (width, height), optional
        Resize frames to this dimension
    max_memory_mb : int
        Approximate memory limit in MB for held frames. When approaching this
        limit, frames are flushed to disk early (as JPEG — note the lossy
        re-encode) and only paths are retained internally. The returned list
        still contains numpy arrays (reloaded from the temp files at the
        end). Default 1024 MB.

    Returns
    -------
    list
        List of extracted frames as numpy arrays

    Raises
    ------
    FileNotFoundError
        If ``video_path`` does not exist.
    ValueError
        If the video cannot be opened or ``sampling_rate`` is not positive.
    """
    video_path = Path(video_path)
    if not video_path.exists():
        raise FileNotFoundError(f"Video file not found: {video_path}")

    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        raise ValueError(f"Could not open video file: {video_path}")

    # Get video properties
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    if sampling_rate <= 0:
        raise ValueError("Sampling rate must be positive")

    # Process every Nth frame: sampling_rate 0.5 -> every 2nd frame, etc.
    frame_interval = max(1, int(1 / sampling_rate))

    # Periodic capture interval in frames (0 = disabled)
    periodic_interval = int(periodic_capture_seconds * fps) if periodic_capture_seconds > 0 else 0

    logger.info(
        f"Video: {video_path.name}, FPS: {fps:.0f}, Frames: {frame_count}, "
        f"Sample interval: {frame_interval}, "
        f"Periodic capture: every {periodic_capture_seconds:.0f}s"
    )

    extracted_frames = []
    prev_frame = None
    frame_idx = 0
    last_capture_frame = -periodic_interval  # allow first periodic capture immediately

    # Memory safety valve
    max_memory_bytes = max_memory_mb * 1024 * 1024
    approx_memory_used = 0
    _flush_dir = None  # lazily created temp dir for flushed frames
    _flushed_paths: List[Path] = []  # paths of flushed frames, in capture order

    pbar = tqdm(
        total=frame_count,
        desc="Extracting frames",
        unit="frame",
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
    )

    while cap.isOpened():
        # Process frame only if it's a sampling point
        if frame_idx % frame_interval == 0:
            success, frame = cap.read()
            if not success:
                break

            # Resize if specified
            if resize_to is not None:
                frame = cv2.resize(frame, resize_to)

            should_capture = False
            reason = ""

            # First frame always gets extracted
            if prev_frame is None:
                should_capture = True
                reason = "first"
            else:
                # Change detection
                diff = calculate_frame_difference(prev_frame, frame)
                if diff > change_threshold:
                    should_capture = True
                    reason = f"change={diff:.3f}"

                # Periodic capture — even if change is small
                elif (
                    periodic_interval > 0 and (frame_idx - last_capture_frame) >= periodic_interval
                ):
                    should_capture = True
                    reason = "periodic"

            if should_capture:
                extracted_frames.append(frame)
                approx_memory_used += sys.getsizeof(frame) + (
                    frame.nbytes if hasattr(frame, "nbytes") else 0
                )
                prev_frame = frame
                last_capture_frame = frame_idx
                logger.debug(f"Frame {frame_idx} extracted ({reason})")

            # Memory safety valve: flush frames to disk when approaching limit
            if approx_memory_used >= max_memory_bytes * 0.9:
                if _flush_dir is None:
                    _flush_dir = tempfile.mkdtemp(prefix="planopticon_frames_")
                    logger.info(
                        f"Memory limit ~{max_memory_mb}MB approaching, "
                        f"flushing frames to {_flush_dir}"
                    )
                # Fix: freeze the starting index before the loop. The old code
                # recomputed len(_flushed_paths) + fi while also appending
                # inside the loop, producing colliding filenames across flush
                # batches and silently overwriting earlier flushed frames.
                base = len(_flushed_paths)
                for fi, f in enumerate(extracted_frames):
                    flush_path = Path(_flush_dir) / f"flush_{base + fi:06d}.jpg"
                    cv2.imwrite(str(flush_path), f)
                    _flushed_paths.append(flush_path)
                extracted_frames.clear()
                approx_memory_used = 0

            # Fix: count flushed frames too, so max_frames and the progress
            # stat remain correct after a flush empties extracted_frames.
            total_captured = len(extracted_frames) + len(_flushed_paths)
            pbar.set_postfix(extracted=total_captured)

            # Check if we've reached the maximum
            if max_frames is not None and total_captured >= max_frames:
                break
        else:
            # Skip frame but advance counter
            cap.grab()

        frame_idx += 1
        # Fix: each loop iteration consumes exactly one video frame (read or
        # grab), so advance the bar by 1 — update(frame_interval) overshot.
        pbar.update(1)

    pbar.close()
    cap.release()

    # If frames were flushed to disk, reload them (in capture order, ahead of
    # any frames still held in memory)
    if _flushed_paths:
        reloaded = []
        for fp in _flushed_paths:
            img = cv2.imread(str(fp))
            if img is not None:
                reloaded.append(img)
        reloaded.extend(extracted_frames)
        extracted_frames = reloaded
        logger.info(f"Reloaded {len(_flushed_paths)} flushed frames from disk")
        # Clean up temp files
        import shutil

        if _flush_dir:
            shutil.rmtree(_flush_dir, ignore_errors=True)

    logger.info(f"Extracted {len(extracted_frames)} frames from {frame_count} total frames")
    return extracted_frames
|
362
|
|
|
363
|
|
|
364
|
def func_gpu(*args, **kwargs):
    """GPU-accelerated version of extract_frames."""
    # CUDA acceleration is not implemented yet; delegate to the undecorated
    # CPU implementation so we bypass the gpu_accelerated dispatch entirely
    # (calling the decorated function here would recurse).
    logger.info("GPU acceleration not yet implemented, falling back to CPU")
    cpu_impl = extract_frames.__wrapped__
    return cpu_impl(*args, **kwargs)
|
370
|
|
|
371
|
|
|
372
|
def save_frames(
    frames: List[np.ndarray], output_dir: Union[str, Path], base_filename: str = "frame"
) -> List[Path]:
    """
    Write extracted frames to disk as sequentially numbered JPEG files.

    Parameters
    ----------
    frames : list
        List of frames to save
    output_dir : str or Path
        Directory to save frames in
    base_filename : str
        Base name for frame files

    Returns
    -------
    list
        List of paths to saved frame files
    """
    target = Path(output_dir)
    target.mkdir(parents=True, exist_ok=True)

    saved_paths: List[Path] = []
    for index, frame in enumerate(frames):
        destination = target / f"{base_filename}_{index:04d}.jpg"
        cv2.imwrite(str(destination), frame)
        saved_paths.append(destination)

    return saved_paths
|
402
|
|