PlanOpticon

feat(transcription): add speaker diarization hints via --speakers flag

lmata 2026-03-07 22:16 trunk
Commit 8abc5d431ec08f96a64a2104c2978465cf89c2cbe93076549303e673c1c34403
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -104,10 +104,16 @@
104104
"--templates-dir",
105105
type=click.Path(exists=True),
106106
default=None,
107107
help="Directory with custom prompt template .txt files",
108108
)
109
+@click.option(
110
+ "--speakers",
111
+ type=str,
112
+ default=None,
113
+ help='Comma-separated speaker names for diarization hints (e.g., "Alice,Bob,Carol")',
114
+)
109115
@click.pass_context
110116
def analyze(
111117
ctx,
112118
input,
113119
output,
@@ -121,16 +127,18 @@
121127
provider,
122128
vision_model,
123129
chat_model,
124130
output_format,
125131
templates_dir,
132
+ speakers,
126133
):
127134
"""Analyze a single video and extract structured knowledge."""
128135
from video_processor.pipeline import process_single_video
129136
from video_processor.providers.manager import ProviderManager
130137
131138
focus_areas = [a.strip().lower() for a in focus.split(",")] if focus else []
139
+ speaker_hints = [s.strip() for s in speakers.split(",")] if speakers else None
132140
prov = None if provider == "auto" else provider
133141
134142
pm = ProviderManager(
135143
vision_model=vision_model,
136144
chat_model=chat_model,
@@ -152,10 +160,11 @@
152160
sampling_rate=sampling_rate,
153161
change_threshold=change_threshold,
154162
periodic_capture_seconds=periodic_capture,
155163
use_gpu=use_gpu,
156164
title=title,
165
+ speaker_hints=speaker_hints,
157166
)
158167
if output_format == "json":
159168
click.echo(json.dumps(manifest.model_dump(), indent=2, default=str))
160169
else:
161170
click.echo(pm.usage.format_summary())
162171
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -104,10 +104,16 @@
104 "--templates-dir",
105 type=click.Path(exists=True),
106 default=None,
107 help="Directory with custom prompt template .txt files",
108 )
 
 
 
 
 
 
109 @click.pass_context
110 def analyze(
111 ctx,
112 input,
113 output,
@@ -121,16 +127,18 @@
121 provider,
122 vision_model,
123 chat_model,
124 output_format,
125 templates_dir,
 
126 ):
127 """Analyze a single video and extract structured knowledge."""
128 from video_processor.pipeline import process_single_video
129 from video_processor.providers.manager import ProviderManager
130
131 focus_areas = [a.strip().lower() for a in focus.split(",")] if focus else []
 
132 prov = None if provider == "auto" else provider
133
134 pm = ProviderManager(
135 vision_model=vision_model,
136 chat_model=chat_model,
@@ -152,10 +160,11 @@
152 sampling_rate=sampling_rate,
153 change_threshold=change_threshold,
154 periodic_capture_seconds=periodic_capture,
155 use_gpu=use_gpu,
156 title=title,
 
157 )
158 if output_format == "json":
159 click.echo(json.dumps(manifest.model_dump(), indent=2, default=str))
160 else:
161 click.echo(pm.usage.format_summary())
162
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -104,10 +104,16 @@
104 "--templates-dir",
105 type=click.Path(exists=True),
106 default=None,
107 help="Directory with custom prompt template .txt files",
108 )
109 @click.option(
110 "--speakers",
111 type=str,
112 default=None,
113 help='Comma-separated speaker names for diarization hints (e.g., "Alice,Bob,Carol")',
114 )
115 @click.pass_context
116 def analyze(
117 ctx,
118 input,
119 output,
@@ -121,16 +127,18 @@
127 provider,
128 vision_model,
129 chat_model,
130 output_format,
131 templates_dir,
132 speakers,
133 ):
134 """Analyze a single video and extract structured knowledge."""
135 from video_processor.pipeline import process_single_video
136 from video_processor.providers.manager import ProviderManager
137
138 focus_areas = [a.strip().lower() for a in focus.split(",")] if focus else []
139 speaker_hints = [s.strip() for s in speakers.split(",")] if speakers else None
140 prov = None if provider == "auto" else provider
141
142 pm = ProviderManager(
143 vision_model=vision_model,
144 chat_model=chat_model,
@@ -152,10 +160,11 @@
160 sampling_rate=sampling_rate,
161 change_threshold=change_threshold,
162 periodic_capture_seconds=periodic_capture,
163 use_gpu=use_gpu,
164 title=title,
165 speaker_hints=speaker_hints,
166 )
167 if output_format == "json":
168 click.echo(json.dumps(manifest.model_dump(), indent=2, default=str))
169 else:
170 click.echo(pm.usage.format_summary())
171
--- video_processor/providers/manager.py
+++ video_processor/providers/manager.py
@@ -220,10 +220,11 @@
220220
221221
def transcribe_audio(
222222
self,
223223
audio_path: str | Path,
224224
language: Optional[str] = None,
225
+ speaker_hints: Optional[list[str]] = None,
225226
) -> dict:
226227
"""Transcribe audio using local Whisper if available, otherwise API."""
227228
# Prefer local Whisper — no file size limits, no API costs
228229
if not self.transcription_model or self.transcription_model.startswith("whisper-local"):
229230
try:
@@ -235,11 +236,17 @@
235236
if self.transcription_model and ":" in self.transcription_model:
236237
size = self.transcription_model.split(":", 1)[1]
237238
if not hasattr(self, "_whisper_local"):
238239
self._whisper_local = WhisperLocal(model_size=size)
239240
logger.info(f"Transcription: using local whisper-{size}")
240
- result = self._whisper_local.transcribe(audio_path, language=language)
241
+ # Pass speaker names as initial prompt hint for Whisper
242
+ whisper_kwargs = {"language": language}
243
+ if speaker_hints:
244
+ whisper_kwargs["initial_prompt"] = (
245
+ "Speakers: " + ", ".join(speaker_hints) + "."
246
+ )
247
+ result = self._whisper_local.transcribe(audio_path, **whisper_kwargs)
241248
duration = result.get("duration") or 0
242249
self.usage.record(
243250
provider="local",
244251
model=f"whisper-{size}",
245252
audio_minutes=duration / 60 if duration else 0,
@@ -252,11 +259,19 @@
252259
prov_name, model = self._resolve_model(
253260
self.transcription_model, "audio", _TRANSCRIPTION_PREFERENCES
254261
)
255262
logger.info(f"Transcription: using {prov_name}/{model}")
256263
provider = self._get_provider(prov_name)
257
- result = provider.transcribe_audio(audio_path, language=language, model=model)
264
+ # Build transcription kwargs, passing speaker hints where supported
265
+ transcribe_kwargs: dict = {"language": language, "model": model}
266
+ if speaker_hints:
267
+ if prov_name == "openai":
268
+ # OpenAI Whisper supports a 'prompt' parameter for hints
269
+ transcribe_kwargs["prompt"] = "Speakers: " + ", ".join(speaker_hints) + "."
270
+ else:
271
+ transcribe_kwargs["speaker_hints"] = speaker_hints
272
+ result = provider.transcribe_audio(audio_path, **transcribe_kwargs)
258273
duration = result.get("duration") or 0
259274
self.usage.record(
260275
provider=prov_name,
261276
model=model,
262277
audio_minutes=duration / 60 if duration else 0,
263278
--- video_processor/providers/manager.py
+++ video_processor/providers/manager.py
@@ -220,10 +220,11 @@
220
221 def transcribe_audio(
222 self,
223 audio_path: str | Path,
224 language: Optional[str] = None,
 
225 ) -> dict:
226 """Transcribe audio using local Whisper if available, otherwise API."""
227 # Prefer local Whisper — no file size limits, no API costs
228 if not self.transcription_model or self.transcription_model.startswith("whisper-local"):
229 try:
@@ -235,11 +236,17 @@
235 if self.transcription_model and ":" in self.transcription_model:
236 size = self.transcription_model.split(":", 1)[1]
237 if not hasattr(self, "_whisper_local"):
238 self._whisper_local = WhisperLocal(model_size=size)
239 logger.info(f"Transcription: using local whisper-{size}")
240 result = self._whisper_local.transcribe(audio_path, language=language)
 
 
 
 
 
 
241 duration = result.get("duration") or 0
242 self.usage.record(
243 provider="local",
244 model=f"whisper-{size}",
245 audio_minutes=duration / 60 if duration else 0,
@@ -252,11 +259,19 @@
252 prov_name, model = self._resolve_model(
253 self.transcription_model, "audio", _TRANSCRIPTION_PREFERENCES
254 )
255 logger.info(f"Transcription: using {prov_name}/{model}")
256 provider = self._get_provider(prov_name)
257 result = provider.transcribe_audio(audio_path, language=language, model=model)
 
 
 
 
 
 
 
 
258 duration = result.get("duration") or 0
259 self.usage.record(
260 provider=prov_name,
261 model=model,
262 audio_minutes=duration / 60 if duration else 0,
263
--- video_processor/providers/manager.py
+++ video_processor/providers/manager.py
@@ -220,10 +220,11 @@
220
221 def transcribe_audio(
222 self,
223 audio_path: str | Path,
224 language: Optional[str] = None,
225 speaker_hints: Optional[list[str]] = None,
226 ) -> dict:
227 """Transcribe audio using local Whisper if available, otherwise API."""
228 # Prefer local Whisper — no file size limits, no API costs
229 if not self.transcription_model or self.transcription_model.startswith("whisper-local"):
230 try:
@@ -235,11 +236,17 @@
236 if self.transcription_model and ":" in self.transcription_model:
237 size = self.transcription_model.split(":", 1)[1]
238 if not hasattr(self, "_whisper_local"):
239 self._whisper_local = WhisperLocal(model_size=size)
240 logger.info(f"Transcription: using local whisper-{size}")
241 # Pass speaker names as initial prompt hint for Whisper
242 whisper_kwargs = {"language": language}
243 if speaker_hints:
244 whisper_kwargs["initial_prompt"] = (
245 "Speakers: " + ", ".join(speaker_hints) + "."
246 )
247 result = self._whisper_local.transcribe(audio_path, **whisper_kwargs)
248 duration = result.get("duration") or 0
249 self.usage.record(
250 provider="local",
251 model=f"whisper-{size}",
252 audio_minutes=duration / 60 if duration else 0,
@@ -252,11 +259,19 @@
259 prov_name, model = self._resolve_model(
260 self.transcription_model, "audio", _TRANSCRIPTION_PREFERENCES
261 )
262 logger.info(f"Transcription: using {prov_name}/{model}")
263 provider = self._get_provider(prov_name)
264 # Build transcription kwargs, passing speaker hints where supported
265 transcribe_kwargs: dict = {"language": language, "model": model}
266 if speaker_hints:
267 if prov_name == "openai":
268 # OpenAI Whisper supports a 'prompt' parameter for hints
269 transcribe_kwargs["prompt"] = "Speakers: " + ", ".join(speaker_hints) + "."
270 else:
271 transcribe_kwargs["speaker_hints"] = speaker_hints
272 result = provider.transcribe_audio(audio_path, **transcribe_kwargs)
273 duration = result.get("duration") or 0
274 self.usage.record(
275 provider=prov_name,
276 model=model,
277 audio_minutes=duration / 60 if duration else 0,
278

Keyboard Shortcuts

Open search: /
Next entry (timeline): j
Previous entry (timeline): k
Open focused entry: Enter
Show this help: ?
Toggle theme: top-navigation button