PlanOpticon

planopticon / video_processor / providers / whisper_local.py
Source Blame History 133 lines
287a3bb… leo 1 """Local Whisper transcription provider — runs on-device with GPU acceleration."""
287a3bb… leo 2
287a3bb… leo 3 import logging
287a3bb… leo 4 from pathlib import Path
287a3bb… leo 5 from typing import Optional
287a3bb… leo 6
287a3bb… leo 7 logger = logging.getLogger(__name__)
287a3bb… leo 8
# Approximate memory footprint (VRAM or RAM) for each Whisper model size.
# The values are human-readable labels that WhisperLocal reports in its log
# output; they are not used for any computation.
_MODEL_SIZES: dict[str, str] = {
    "tiny": "~1GB",
    "base": "~1GB",
    "small": "~2GB",
    "medium": "~5GB",
    "large": "~10GB",
    "turbo": "~6GB",
}
287a3bb… leo 18
287a3bb… leo 19
287a3bb… leo 20 class WhisperLocal:
287a3bb… leo 21 """
287a3bb… leo 22 Local Whisper transcription using openai-whisper.
287a3bb… leo 23
287a3bb… leo 24 Uses MPS (Apple Silicon) or CUDA when available, falls back to CPU.
287a3bb… leo 25 No file size limits — processes audio directly on device.
287a3bb… leo 26 """
287a3bb… leo 27
287a3bb… leo 28 def __init__(self, model_size: str = "large", device: Optional[str] = None):
287a3bb… leo 29 """
287a3bb… leo 30 Initialize local Whisper.
287a3bb… leo 31
287a3bb… leo 32 Parameters
287a3bb… leo 33 ----------
287a3bb… leo 34 model_size : str
287a3bb… leo 35 Whisper model size: tiny, base, small, medium, large, turbo
287a3bb… leo 36 device : str, optional
287a3bb… leo 37 Force device: 'mps', 'cuda', 'cpu'. Auto-detects if None.
287a3bb… leo 38 """
287a3bb… leo 39 self.model_size = model_size
287a3bb… leo 40 self._model = None
287a3bb… leo 41
287a3bb… leo 42 if device:
287a3bb… leo 43 self.device = device
287a3bb… leo 44 else:
287a3bb… leo 45 self.device = self._detect_device()
287a3bb… leo 46
287a3bb… leo 47 logger.info(
287a3bb… leo 48 f"WhisperLocal: model={model_size} ({_MODEL_SIZES.get(model_size, '?')}), "
287a3bb… leo 49 f"device={self.device}"
287a3bb… leo 50 )
287a3bb… leo 51
287a3bb… leo 52 @staticmethod
287a3bb… leo 53 def _detect_device() -> str:
287a3bb… leo 54 """Auto-detect the best available device."""
287a3bb… leo 55 try:
287a3bb… leo 56 import torch
287a3bb… leo 57
287a3bb… leo 58 if torch.cuda.is_available():
287a3bb… leo 59 return "cuda"
287a3bb… leo 60 if torch.backends.mps.is_available():
287a3bb… leo 61 return "mps"
287a3bb… leo 62 except ImportError:
287a3bb… leo 63 pass
287a3bb… leo 64 return "cpu"
287a3bb… leo 65
287a3bb… leo 66 def _load_model(self):
287a3bb… leo 67 """Lazy-load the Whisper model."""
287a3bb… leo 68 if self._model is not None:
287a3bb… leo 69 return
287a3bb… leo 70
287a3bb… leo 71 try:
287a3bb… leo 72 import whisper
287a3bb… leo 73 except ImportError:
829e24a… leo 74 raise ImportError("openai-whisper not installed. Run: pip install openai-whisper torch")
287a3bb… leo 75
287a3bb… leo 76 logger.info(f"Loading Whisper {self.model_size} model on {self.device}...")
287a3bb… leo 77 self._model = whisper.load_model(self.model_size, device=self.device)
287a3bb… leo 78 logger.info("Whisper model loaded")
287a3bb… leo 79
287a3bb… leo 80 def transcribe(
287a3bb… leo 81 self,
287a3bb… leo 82 audio_path: str | Path,
287a3bb… leo 83 language: Optional[str] = None,
287a3bb… leo 84 ) -> dict:
287a3bb… leo 85 """
287a3bb… leo 86 Transcribe audio using local Whisper.
287a3bb… leo 87
287a3bb… leo 88 No file size limits. Runs entirely on device.
287a3bb… leo 89
287a3bb… leo 90 Returns dict compatible with ProviderManager transcription format.
287a3bb… leo 91 """
287a3bb… leo 92 self._load_model()
287a3bb… leo 93 audio_path = Path(audio_path)
287a3bb… leo 94
287a3bb… leo 95 logger.info(f"Transcribing {audio_path.name} with Whisper {self.model_size}...")
287a3bb… leo 96
287a3bb… leo 97 # fp16 only works reliably on CUDA; MPS produces NaN with large models
287a3bb… leo 98 kwargs = {"fp16": self.device == "cuda"}
287a3bb… leo 99 if language:
287a3bb… leo 100 kwargs["language"] = language
287a3bb… leo 101
287a3bb… leo 102 result = self._model.transcribe(str(audio_path), **kwargs)
287a3bb… leo 103
287a3bb… leo 104 segments = [
287a3bb… leo 105 {
287a3bb… leo 106 "start": seg["start"],
287a3bb… leo 107 "end": seg["end"],
287a3bb… leo 108 "text": seg["text"].strip(),
287a3bb… leo 109 }
287a3bb… leo 110 for seg in result.get("segments", [])
287a3bb… leo 111 ]
287a3bb… leo 112
287a3bb… leo 113 duration = segments[-1]["end"] if segments else None
287a3bb… leo 114
287a3bb… leo 115 return {
287a3bb… leo 116 "text": result.get("text", "").strip(),
287a3bb… leo 117 "segments": segments,
287a3bb… leo 118 "language": result.get("language", language),
287a3bb… leo 119 "duration": duration,
287a3bb… leo 120 "provider": "whisper-local",
287a3bb… leo 121 "model": f"whisper-{self.model_size}",
287a3bb… leo 122 }
287a3bb… leo 123
287a3bb… leo 124 @staticmethod
287a3bb… leo 125 def is_available() -> bool:
287a3bb… leo 126 """Check if local Whisper is installed and usable."""
287a3bb… leo 127 try:
829e24a… leo 128 import torch # noqa: F401
829e24a… leo 129 import whisper # noqa: F401
829e24a… leo 130
287a3bb… leo 131 return True
287a3bb… leo 132 except ImportError:
287a3bb… leo 133 return False

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button