PlanOpticon

planopticon / video_processor / providers / whisper_local.py
Blame History Raw 134 lines
1
"""Local Whisper transcription provider — runs on-device with GPU acceleration."""
2
3
import logging
4
from pathlib import Path
5
from typing import Optional
6
7
logger = logging.getLogger(__name__)

# Model size → approximate VRAM/RAM usage.
# Informational only: the values are looked up when logging the chosen
# configuration, and unknown sizes are reported as '?'.
_MODEL_SIZES = {
    "tiny": "~1GB",
    "base": "~1GB",
    "small": "~2GB",
    "medium": "~5GB",
    "large": "~10GB",
    "turbo": "~6GB",
}
18
19
20
class WhisperLocal:
21
"""
22
Local Whisper transcription using openai-whisper.
23
24
Uses MPS (Apple Silicon) or CUDA when available, falls back to CPU.
25
No file size limits — processes audio directly on device.
26
"""
27
28
def __init__(self, model_size: str = "large", device: Optional[str] = None):
29
"""
30
Initialize local Whisper.
31
32
Parameters
33
----------
34
model_size : str
35
Whisper model size: tiny, base, small, medium, large, turbo
36
device : str, optional
37
Force device: 'mps', 'cuda', 'cpu'. Auto-detects if None.
38
"""
39
self.model_size = model_size
40
self._model = None
41
42
if device:
43
self.device = device
44
else:
45
self.device = self._detect_device()
46
47
logger.info(
48
f"WhisperLocal: model={model_size} ({_MODEL_SIZES.get(model_size, '?')}), "
49
f"device={self.device}"
50
)
51
52
@staticmethod
53
def _detect_device() -> str:
54
"""Auto-detect the best available device."""
55
try:
56
import torch
57
58
if torch.cuda.is_available():
59
return "cuda"
60
if torch.backends.mps.is_available():
61
return "mps"
62
except ImportError:
63
pass
64
return "cpu"
65
66
def _load_model(self):
67
"""Lazy-load the Whisper model."""
68
if self._model is not None:
69
return
70
71
try:
72
import whisper
73
except ImportError:
74
raise ImportError("openai-whisper not installed. Run: pip install openai-whisper torch")
75
76
logger.info(f"Loading Whisper {self.model_size} model on {self.device}...")
77
self._model = whisper.load_model(self.model_size, device=self.device)
78
logger.info("Whisper model loaded")
79
80
def transcribe(
81
self,
82
audio_path: str | Path,
83
language: Optional[str] = None,
84
) -> dict:
85
"""
86
Transcribe audio using local Whisper.
87
88
No file size limits. Runs entirely on device.
89
90
Returns dict compatible with ProviderManager transcription format.
91
"""
92
self._load_model()
93
audio_path = Path(audio_path)
94
95
logger.info(f"Transcribing {audio_path.name} with Whisper {self.model_size}...")
96
97
# fp16 only works reliably on CUDA; MPS produces NaN with large models
98
kwargs = {"fp16": self.device == "cuda"}
99
if language:
100
kwargs["language"] = language
101
102
result = self._model.transcribe(str(audio_path), **kwargs)
103
104
segments = [
105
{
106
"start": seg["start"],
107
"end": seg["end"],
108
"text": seg["text"].strip(),
109
}
110
for seg in result.get("segments", [])
111
]
112
113
duration = segments[-1]["end"] if segments else None
114
115
return {
116
"text": result.get("text", "").strip(),
117
"segments": segments,
118
"language": result.get("language", language),
119
"duration": duration,
120
"provider": "whisper-local",
121
"model": f"whisper-{self.model_size}",
122
}
123
124
@staticmethod
125
def is_available() -> bool:
126
"""Check if local Whisper is installed and usable."""
127
try:
128
import torch # noqa: F401
129
import whisper # noqa: F401
130
131
return True
132
except ImportError:
133
return False
134

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button