PlanOpticon

planopticon / video_processor / extractors / audio_extractor.py
Source Blame History 232 lines
287a3bb… leo 1 """Audio extraction and processing module for video analysis."""
829e24a… leo 2
287a3bb… leo 3 import logging
287a3bb… leo 4 import subprocess
287a3bb… leo 5 from pathlib import Path
287a3bb… leo 6 from typing import Dict, Optional, Tuple, Union
287a3bb… leo 7
287a3bb… leo 8 import librosa
287a3bb… leo 9 import numpy as np
287a3bb… leo 10 import soundfile as sf
287a3bb… leo 11
logger = logging.getLogger(__name__)


class AudioExtractor:
    """Extract and process audio from video files.

    Extraction shells out to the ``ffmpeg`` executable; loading uses
    ``librosa`` and metadata/saving use ``soundfile``.
    """

    # Output extensions whose containers cannot carry raw 16-bit PCM.  For
    # these, ffmpeg must pick the container's own default encoder instead of
    # being forced to ``pcm_s16le`` (which makes the muxer reject the stream).
    _COMPRESSED_FORMATS = frozenset(
        {"mp3", "flac", "ogg", "oga", "opus", "m4a", "aac"}
    )

    def __init__(self, sample_rate: int = 16000, mono: bool = True):
        """
        Initialize the audio extractor.

        Parameters
        ----------
        sample_rate : int
            Target sample rate for extracted audio
        mono : bool
            Whether to convert audio to mono
        """
        self.sample_rate = sample_rate
        self.mono = mono

    def extract_audio(
        self,
        video_path: Union[str, Path],
        output_path: Optional[Union[str, Path]] = None,
        format: str = "wav",
    ) -> Path:
        """
        Extract audio from video file.

        Parameters
        ----------
        video_path : str or Path
            Path to video file
        output_path : str or Path, optional
            Path to save extracted audio (if None, saves alongside video
            using ``format`` as the file extension)
        format : str
            Audio format to save (wav, mp3, etc.). A leading dot is ignored.

        Returns
        -------
        Path
            Path to extracted audio file

        Raises
        ------
        FileNotFoundError
            If ``video_path`` does not exist (also propagated unchanged if
            the ``ffmpeg`` executable itself cannot be found).
        RuntimeError
            If ffmpeg exits with a non-zero status; the message carries
            ffmpeg's stderr output.
        """
        video_path = Path(video_path)
        if not video_path.exists():
            raise FileNotFoundError(f"Video file not found: {video_path}")

        # Generate output path if not provided
        extension = format.lstrip(".")
        if output_path is None:
            output_path = video_path.with_suffix(f".{extension}")
        else:
            output_path = Path(output_path)

        # Ensure output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # ffmpeg chooses the container from the output file's extension, so
        # that (not ``format``) decides whether raw PCM can be written.
        container = (output_path.suffix or extension).lstrip(".").lower()

        cmd = ["ffmpeg", "-i", str(video_path), "-vn"]  # -vn: no video
        if container not in self._COMPRESSED_FORMATS:
            # PCM 16-bit little-endian for WAV and other PCM-capable outputs
            cmd += ["-acodec", "pcm_s16le"]
        cmd += [
            "-ar",
            str(self.sample_rate),  # Sample rate
            "-ac",
            "1" if self.mono else "2",  # Channels (mono or stereo)
            "-y",  # Overwrite output
            str(output_path),
        ]

        try:
            # stdin is detached so ffmpeg cannot consume or block on the
            # parent process's standard input.
            subprocess.run(
                cmd,
                stdin=subprocess.DEVNULL,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=True,
            )
        except subprocess.CalledProcessError as e:
            # ffmpeg's stderr is not guaranteed to be valid UTF-8; never let
            # a decode error mask the real failure.
            stderr_text = (e.stderr or b"").decode(errors="replace")
            logger.error("Failed to extract audio: %s", stderr_text)
            raise RuntimeError(f"Failed to extract audio: {stderr_text}") from e
        except Exception as e:
            logger.error("Error extracting audio: %s", e)
            raise

        logger.info("Extracted audio from %s to %s", video_path, output_path)
        return output_path

    def load_audio(self, audio_path: Union[str, Path]) -> Tuple[np.ndarray, int]:
        """
        Load audio file into memory.

        The file is loaded with ``librosa.load`` using this extractor's
        ``sample_rate`` (or the file's native rate when ``sample_rate`` is
        falsy) and ``mono`` setting.

        Parameters
        ----------
        audio_path : str or Path
            Path to audio file

        Returns
        -------
        tuple
            (audio_data, sample_rate)

        Raises
        ------
        FileNotFoundError
            If ``audio_path`` does not exist.
        """
        audio_path = Path(audio_path)
        if not audio_path.exists():
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        # A falsy sample rate (0 / None) means "keep the file's native rate"
        target_sr = self.sample_rate if self.sample_rate else None
        audio_data, sr = librosa.load(audio_path, sr=target_sr, mono=self.mono)

        logger.info(
            "Loaded audio from %s: shape=%s, sr=%s", audio_path, audio_data.shape, sr
        )
        return audio_data, sr

    def get_audio_properties(self, audio_path: Union[str, Path]) -> Dict:
        """
        Get properties of audio file.

        Parameters
        ----------
        audio_path : str or Path
            Path to audio file

        Returns
        -------
        dict
            Audio properties with keys ``duration``, ``sample_rate``,
            ``channels``, ``format``, ``subtype`` and ``path``, as reported
            by ``soundfile.info``.

        Raises
        ------
        FileNotFoundError
            If ``audio_path`` does not exist.
        """
        audio_path = Path(audio_path)
        if not audio_path.exists():
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        # Get audio info
        info = sf.info(audio_path)

        return {
            "duration": info.duration,
            "sample_rate": info.samplerate,
            "channels": info.channels,
            "format": info.format,
            "subtype": info.subtype,
            "path": str(audio_path),
        }

    def segment_audio(
        self,
        audio_data: np.ndarray,
        sample_rate: int,
        segment_length_ms: int = 30000,
        overlap_ms: int = 0,
        min_segment_ratio: float = 0.5,
    ) -> list:
        """
        Segment audio into chunks.

        Slicing is done along the first axis of ``audio_data`` (its
        ``len()``). NOTE(review): multi-channel arrays laid out as
        (channels, samples) would be split across channels, not time --
        confirm the layout callers pass in.

        Parameters
        ----------
        audio_data : np.ndarray
            Audio data
        sample_rate : int
            Sample rate of audio
        segment_length_ms : int
            Length of segments in milliseconds
        overlap_ms : int
            Overlap between segments in milliseconds; must be smaller than
            ``segment_length_ms``
        min_segment_ratio : float
            Minimum length of a segment, as a fraction of the target segment
            length, for it to be kept. With the default of 0.5 a trailing
            segment shorter than half the target length is dropped (so audio
            shorter than half a segment yields an empty list).

        Returns
        -------
        list
            List of audio segments as numpy arrays

        Raises
        ------
        ValueError
            If the segment length is less than one sample, or the overlap is
            not smaller than the segment length (the window would never
            advance).
        """
        # Convert ms to samples
        segment_length_samples = int(segment_length_ms * sample_rate / 1000)
        overlap_samples = int(overlap_ms * sample_rate / 1000)

        if segment_length_samples <= 0:
            raise ValueError(
                "segment_length_ms must correspond to at least one sample "
                f"(got {segment_length_ms} ms at {sample_rate} Hz)"
            )

        # Step between consecutive segment starts
        hop_length = segment_length_samples - overlap_samples
        if hop_length <= 0:
            raise ValueError(
                "overlap_ms must be smaller than segment_length_ms "
                f"(got overlap_ms={overlap_ms}, segment_length_ms={segment_length_ms})"
            )

        total_samples = len(audio_data)
        min_samples = segment_length_samples * min_segment_ratio
        segments = []

        for start in range(0, total_samples, hop_length):
            end = min(start + segment_length_samples, total_samples)
            chunk = audio_data[start:end]

            # Only keep chunks that are long enough to be useful
            if len(chunk) >= min_samples:
                segments.append(chunk)

            # Stop once a chunk touches the end; later starts would only
            # yield shorter, fully-overlapping tails.
            if end == total_samples:
                break

        logger.info("Segmented audio into %d chunks", len(segments))
        return segments

    def save_segment(
        self, segment: np.ndarray, output_path: Union[str, Path], sample_rate: int
    ) -> Path:
        """
        Save audio segment to file.

        Parent directories of ``output_path`` are created if missing; the
        data is written with ``soundfile.write``.

        Parameters
        ----------
        segment : np.ndarray
            Audio segment data
        output_path : str or Path
            Path to save segment
        sample_rate : int
            Sample rate of segment

        Returns
        -------
        Path
            Path to saved segment
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        sf.write(output_path, segment, sample_rate)
        return output_path

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button