PlanOpticon

planopticon / video_processor / providers / whisper_local.py

Source Blame History 133 lines

287a3bb…	leo	1	"""Local Whisper transcription provider — runs on-device with GPU acceleration."""
287a3bb…	leo	2
287a3bb…	leo	3	import logging
287a3bb…	leo	4	from pathlib import Path
287a3bb…	leo	5	from typing import Optional
287a3bb…	leo	6
logger = logging.getLogger(__name__)

# Rough memory footprint (VRAM or RAM) for each known Whisper model size.
# The values are human-readable hints, looked up by model-size name.
_MODEL_SIZES: dict[str, str] = {
    "tiny": "~1GB",
    "base": "~1GB",
    "small": "~2GB",
    "medium": "~5GB",
    "large": "~10GB",
    "turbo": "~6GB",
}
287a3bb…	leo	18
287a3bb…	leo	19
287a3bb…	leo	20	class WhisperLocal:
287a3bb…	leo	21	"""
287a3bb…	leo	22	Local Whisper transcription using openai-whisper.
287a3bb…	leo	23
287a3bb…	leo	24	Uses MPS (Apple Silicon) or CUDA when available, falls back to CPU.
287a3bb…	leo	25	No file size limits — processes audio directly on device.
287a3bb…	leo	26	"""
287a3bb…	leo	27
287a3bb…	leo	28	def __init__(self, model_size: str = "large", device: Optional[str] = None):
287a3bb…	leo	29	"""
287a3bb…	leo	30	Initialize local Whisper.
287a3bb…	leo	31
287a3bb…	leo	32	Parameters
287a3bb…	leo	33	----------
287a3bb…	leo	34	model_size : str
287a3bb…	leo	35	Whisper model size: tiny, base, small, medium, large, turbo
287a3bb…	leo	36	device : str, optional
287a3bb…	leo	37	Force device: 'mps', 'cuda', 'cpu'. Auto-detects if None.
287a3bb…	leo	38	"""
287a3bb…	leo	39	self.model_size = model_size
287a3bb…	leo	40	self._model = None
287a3bb…	leo	41
287a3bb…	leo	42	if device:
287a3bb…	leo	43	self.device = device
287a3bb…	leo	44	else:
287a3bb…	leo	45	self.device = self._detect_device()
287a3bb…	leo	46
287a3bb…	leo	47	logger.info(
287a3bb…	leo	48	f"WhisperLocal: model={model_size} ({_MODEL_SIZES.get(model_size, '?')}), "
287a3bb…	leo	49	f"device={self.device}"
287a3bb…	leo	50	)
287a3bb…	leo	51
287a3bb…	leo	52	@staticmethod
287a3bb…	leo	53	def _detect_device() -> str:
287a3bb…	leo	54	"""Auto-detect the best available device."""
287a3bb…	leo	55	try:
287a3bb…	leo	56	import torch
287a3bb…	leo	57
287a3bb…	leo	58	if torch.cuda.is_available():
287a3bb…	leo	59	return "cuda"
287a3bb…	leo	60	if torch.backends.mps.is_available():
287a3bb…	leo	61	return "mps"
287a3bb…	leo	62	except ImportError:
287a3bb…	leo	63	pass
287a3bb…	leo	64	return "cpu"
287a3bb…	leo	65
287a3bb…	leo	66	def _load_model(self):
287a3bb…	leo	67	"""Lazy-load the Whisper model."""
287a3bb…	leo	68	if self._model is not None:
287a3bb…	leo	69	return
287a3bb…	leo	70
287a3bb…	leo	71	try:
287a3bb…	leo	72	import whisper
287a3bb…	leo	73	except ImportError:
829e24a…	leo	74	raise ImportError("openai-whisper not installed. Run: pip install openai-whisper torch")
287a3bb…	leo	75
287a3bb…	leo	76	logger.info(f"Loading Whisper {self.model_size} model on {self.device}...")
287a3bb…	leo	77	self._model = whisper.load_model(self.model_size, device=self.device)
287a3bb…	leo	78	logger.info("Whisper model loaded")
287a3bb…	leo	79
287a3bb…	leo	80	def transcribe(
287a3bb…	leo	81	self,
287a3bb…	leo	82	audio_path: str \| Path,
287a3bb…	leo	83	language: Optional[str] = None,
287a3bb…	leo	84	) -> dict:
287a3bb…	leo	85	"""
287a3bb…	leo	86	Transcribe audio using local Whisper.
287a3bb…	leo	87
287a3bb…	leo	88	No file size limits. Runs entirely on device.
287a3bb…	leo	89
287a3bb…	leo	90	Returns dict compatible with ProviderManager transcription format.
287a3bb…	leo	91	"""
287a3bb…	leo	92	self._load_model()
287a3bb…	leo	93	audio_path = Path(audio_path)
287a3bb…	leo	94
287a3bb…	leo	95	logger.info(f"Transcribing {audio_path.name} with Whisper {self.model_size}...")
287a3bb…	leo	96
287a3bb…	leo	97	# fp16 only works reliably on CUDA; MPS produces NaN with large models
287a3bb…	leo	98	kwargs = {"fp16": self.device == "cuda"}
287a3bb…	leo	99	if language:
287a3bb…	leo	100	kwargs["language"] = language
287a3bb…	leo	101
287a3bb…	leo	102	result = self._model.transcribe(str(audio_path), **kwargs)
287a3bb…	leo	103
287a3bb…	leo	104	segments = [
287a3bb…	leo	105	{
287a3bb…	leo	106	"start": seg["start"],
287a3bb…	leo	107	"end": seg["end"],
287a3bb…	leo	108	"text": seg["text"].strip(),
287a3bb…	leo	109	}
287a3bb…	leo	110	for seg in result.get("segments", [])
287a3bb…	leo	111	]
287a3bb…	leo	112
287a3bb…	leo	113	duration = segments[-1]["end"] if segments else None
287a3bb…	leo	114
287a3bb…	leo	115	return {
287a3bb…	leo	116	"text": result.get("text", "").strip(),
287a3bb…	leo	117	"segments": segments,
287a3bb…	leo	118	"language": result.get("language", language),
287a3bb…	leo	119	"duration": duration,
287a3bb…	leo	120	"provider": "whisper-local",
287a3bb…	leo	121	"model": f"whisper-{self.model_size}",
287a3bb…	leo	122	}
287a3bb…	leo	123
287a3bb…	leo	124	@staticmethod
287a3bb…	leo	125	def is_available() -> bool:
287a3bb…	leo	126	"""Check if local Whisper is installed and usable."""
287a3bb…	leo	127	try:
829e24a…	leo	128	import torch # noqa: F401
829e24a…	leo	129	import whisper # noqa: F401
829e24a…	leo	130
287a3bb…	leo	131	return True
287a3bb…	leo	132	except ImportError:
287a3bb…	leo	133	return False

PlanOpticon

Keyboard Shortcuts