| | @@ -220,10 +220,11 @@ |
| 220 | 220 | |
| 221 | 221 | def transcribe_audio( |
| 222 | 222 | self, |
| 223 | 223 | audio_path: str | Path, |
| 224 | 224 | language: Optional[str] = None, |
| 225 | + speaker_hints: Optional[list[str]] = None, |
| 225 | 226 | ) -> dict: |
| 226 | 227 | """Transcribe audio using local Whisper if available, otherwise API.""" |
| 227 | 228 | # Prefer local Whisper — no file size limits, no API costs |
| 228 | 229 | if not self.transcription_model or self.transcription_model.startswith("whisper-local"): |
| 229 | 230 | try: |
| | @@ -235,11 +236,17 @@ |
| 235 | 236 | if self.transcription_model and ":" in self.transcription_model: |
| 236 | 237 | size = self.transcription_model.split(":", 1)[1] |
| 237 | 238 | if not hasattr(self, "_whisper_local"): |
| 238 | 239 | self._whisper_local = WhisperLocal(model_size=size) |
| 239 | 240 | logger.info(f"Transcription: using local whisper-{size}") |
| 240 | | - result = self._whisper_local.transcribe(audio_path, language=language) |
| 241 | + # Pass speaker names as initial prompt hint for Whisper |
| 242 | + whisper_kwargs = {"language": language} |
| 243 | + if speaker_hints: |
| 244 | + whisper_kwargs["initial_prompt"] = ( |
| 245 | + "Speakers: " + ", ".join(speaker_hints) + "." |
| 246 | + ) |
| 247 | + result = self._whisper_local.transcribe(audio_path, **whisper_kwargs) |
| 241 | 248 | duration = result.get("duration") or 0 |
| 242 | 249 | self.usage.record( |
| 243 | 250 | provider="local", |
| 244 | 251 | model=f"whisper-{size}", |
| 245 | 252 | audio_minutes=duration / 60 if duration else 0, |
| | @@ -252,11 +259,19 @@ |
| 252 | 259 | prov_name, model = self._resolve_model( |
| 253 | 260 | self.transcription_model, "audio", _TRANSCRIPTION_PREFERENCES |
| 254 | 261 | ) |
| 255 | 262 | logger.info(f"Transcription: using {prov_name}/{model}") |
| 256 | 263 | provider = self._get_provider(prov_name) |
| 257 | | - result = provider.transcribe_audio(audio_path, language=language, model=model) |
| 264 | + # Build transcription kwargs, passing speaker hints where supported |
| 265 | + transcribe_kwargs: dict = {"language": language, "model": model} |
| 266 | + if speaker_hints: |
| 267 | + if prov_name == "openai": |
| 268 | + # OpenAI Whisper supports a 'prompt' parameter for hints |
| 269 | + transcribe_kwargs["prompt"] = "Speakers: " + ", ".join(speaker_hints) + "." |
| 270 | + else: |
| 271 | + transcribe_kwargs["speaker_hints"] = speaker_hints |
| 272 | + result = provider.transcribe_audio(audio_path, **transcribe_kwargs) |
| 258 | 273 | duration = result.get("duration") or 0 |
| 259 | 274 | self.usage.record( |
| 260 | 275 | provider=prov_name, |
| 261 | 276 | model=model, |
| 262 | 277 | audio_minutes=duration / 60 if duration else 0, |
| 263 | 278 | |