PlanOpticon

feat(sources): add Zoom, Teams, and Meet recording connectors
- ZoomSource: OAuth2 with PKCE, Server-to-Server, and saved token auth
- TeamsRecordingSource: m365 CLI wrapper with Graph API fallbacks
- MeetRecordingSource: gws CLI wrapper searching Drive for recordings
- CLI: planopticon recordings {zoom-list,teams-list,meet-list}
- 29 new tests (757 total passing)

lmata 2026-03-07 23:22 trunk

Commit 4bdd77c40949daad3589e28cfd5f2aa8bf1250d2754974b2ec36b13c9dd1a9fc

Parent 82dc7b969aca623…

7 files changed +29 +228 +102 +6 +280 +375 +399

~ tests/test_cli.py ~ tests/test_sources.py ~ video_processor/cli/commands.py ~ video_processor/sources/__init__.py + video_processor/sources/meet_recording_source.py + video_processor/sources/teams_recording_source.py + video_processor/sources/zoom_source.py

M tests/test_cli.py

+29

		--- tests/test_cli.py
		+++ tests/test_cli.py
		@@ -22,10 +22,11 @@
22	22	assert "query" in result.output
23	23	assert "agent" in result.output
24	24	assert "kg" in result.output
25	25	assert "gws" in result.output
26	26	assert "m365" in result.output
	27	+ assert "recordings" in result.output
27	28	assert "ingest" in result.output
28	29	assert "batch" in result.output
29	30
30	31
31	32	class TestAnalyzeHelp:
		@@ -193,13 +194,41 @@
193	194	assert result.exit_code == 0
194	195	assert "--web-url" in result.output
195	196	assert "--file-id" in result.output
196	197	assert "--db-path" in result.output
197	198
	199	+
	200	+class TestRecordingsHelp:
	201	+ def test_group_help(self):
	202	+ runner = CliRunner()
	203	+ result = runner.invoke(cli, ["recordings", "--help"])
	204	+ assert result.exit_code == 0
	205	+ assert "zoom-list" in result.output
	206	+ assert "teams-list" in result.output
	207	+ assert "meet-list" in result.output
	208	+
	209	+ def test_zoom_list_help(self):
	210	+ runner = CliRunner()
	211	+ result = runner.invoke(cli, ["recordings", "zoom-list", "--help"])
	212	+ assert result.exit_code == 0
	213	+ assert "ZOOM_CLIENT_ID" in result.output
	214	+
	215	+ def test_teams_list_help(self):
	216	+ runner = CliRunner()
	217	+ result = runner.invoke(cli, ["recordings", "teams-list", "--help"])
	218	+ assert result.exit_code == 0
	219	+ assert "--user-id" in result.output
	220	+
	221	+ def test_meet_list_help(self):
	222	+ runner = CliRunner()
	223	+ result = runner.invoke(cli, ["recordings", "meet-list", "--help"])
	224	+ assert result.exit_code == 0
	225	+ assert "--folder-id" in result.output
	226	+
198	227
199	228	class TestAuthHelp:
200	229	def test_help(self):
201	230	runner = CliRunner()
202	231	result = runner.invoke(cli, ["auth", "--help"])
203	232	assert result.exit_code == 0
204	233	assert "google" in result.output
205	234	assert "dropbox" in result.output
206	235

	--- tests/test_cli.py
	+++ tests/test_cli.py
	@@ -22,10 +22,11 @@
22	assert "query" in result.output
23	assert "agent" in result.output
24	assert "kg" in result.output
25	assert "gws" in result.output
26	assert "m365" in result.output

27	assert "ingest" in result.output
28	assert "batch" in result.output
29
30
31	class TestAnalyzeHelp:
	@@ -193,13 +194,41 @@
193	assert result.exit_code == 0
194	assert "--web-url" in result.output
195	assert "--file-id" in result.output
196	assert "--db-path" in result.output
197




























198
199	class TestAuthHelp:
200	def test_help(self):
201	runner = CliRunner()
202	result = runner.invoke(cli, ["auth", "--help"])
203	assert result.exit_code == 0
204	assert "google" in result.output
205	assert "dropbox" in result.output
206

	--- tests/test_cli.py
	+++ tests/test_cli.py
	@@ -22,10 +22,11 @@
22	assert "query" in result.output
23	assert "agent" in result.output
24	assert "kg" in result.output
25	assert "gws" in result.output
26	assert "m365" in result.output
27	assert "recordings" in result.output
28	assert "ingest" in result.output
29	assert "batch" in result.output
30
31
32	class TestAnalyzeHelp:
	@@ -193,13 +194,41 @@
194	assert result.exit_code == 0
195	assert "--web-url" in result.output
196	assert "--file-id" in result.output
197	assert "--db-path" in result.output
198
199
200	class TestRecordingsHelp:
201	def test_group_help(self):
202	runner = CliRunner()
203	result = runner.invoke(cli, ["recordings", "--help"])
204	assert result.exit_code == 0
205	assert "zoom-list" in result.output
206	assert "teams-list" in result.output
207	assert "meet-list" in result.output
208
209	def test_zoom_list_help(self):
210	runner = CliRunner()
211	result = runner.invoke(cli, ["recordings", "zoom-list", "--help"])
212	assert result.exit_code == 0
213	assert "ZOOM_CLIENT_ID" in result.output
214
215	def test_teams_list_help(self):
216	runner = CliRunner()
217	result = runner.invoke(cli, ["recordings", "teams-list", "--help"])
218	assert result.exit_code == 0
219	assert "--user-id" in result.output
220
221	def test_meet_list_help(self):
222	runner = CliRunner()
223	result = runner.invoke(cli, ["recordings", "meet-list", "--help"])
224	assert result.exit_code == 0
225	assert "--folder-id" in result.output
226
227
228	class TestAuthHelp:
229	def test_help(self):
230	runner = CliRunner()
231	result = runner.invoke(cli, ["auth", "--help"])
232	assert result.exit_code == 0
233	assert "google" in result.output
234	assert "dropbox" in result.output
235

M tests/test_sources.py

+228

		--- tests/test_sources.py
		+++ tests/test_sources.py
		@@ -1340,5 +1340,233 @@
1340	1340	html = "&lt;tag&gt; &quot;quoted&quot; &#39;apos&#39; &nbsp;space"
1341	1341	result = _html_to_text(html)
1342	1342	assert "<tag>" in result
1343	1343	assert '"quoted"' in result
1344	1344	assert "'apos'" in result
	1345	+
	1346	+
	1347	+# ---------------------------------------------------------------------------
	1348	+# ZoomSource
	1349	+# ---------------------------------------------------------------------------
	1350	+
	1351	+
	1352	+class TestZoomSource:
	1353	+ def test_import(self):
	1354	+ from video_processor.sources.zoom_source import ZoomSource
	1355	+
	1356	+ assert ZoomSource is not None
	1357	+
	1358	+ def test_constructor_defaults(self):
	1359	+ from video_processor.sources.zoom_source import ZoomSource
	1360	+
	1361	+ src = ZoomSource()
	1362	+ assert src.client_id is None or isinstance(src.client_id, str)
	1363	+ assert src._access_token is None
	1364	+
	1365	+ def test_constructor_explicit(self):
	1366	+ from video_processor.sources.zoom_source import ZoomSource
	1367	+
	1368	+ src = ZoomSource(
	1369	+ client_id="cid",
	1370	+ client_secret="csec",
	1371	+ account_id="aid",
	1372	+ )
	1373	+ assert src.client_id == "cid"
	1374	+ assert src.client_secret == "csec"
	1375	+ assert src.account_id == "aid"
	1376	+
	1377	+ def test_authenticate_no_credentials(self):
	1378	+ from video_processor.sources.zoom_source import ZoomSource
	1379	+
	1380	+ src = ZoomSource(client_id=None, client_secret=None, account_id=None)
	1381	+ # No saved token, no account_id, no client_id → should fail
	1382	+ assert src.authenticate() is False
	1383	+
	1384	+ def test_list_videos_not_authenticated(self):
	1385	+ from video_processor.sources.zoom_source import ZoomSource
	1386	+
	1387	+ src = ZoomSource()
	1388	+ with pytest.raises(RuntimeError, match="Not authenticated"):
	1389	+ src.list_videos()
	1390	+
	1391	+ def test_download_not_authenticated(self):
	1392	+ from video_processor.sources.zoom_source import ZoomSource
	1393	+
	1394	+ src = ZoomSource()
	1395	+ sf = SourceFile(name="test.mp4", id="123")
	1396	+ with pytest.raises(RuntimeError, match="Not authenticated"):
	1397	+ src.download(sf, "/tmp/test.mp4")
	1398	+
	1399	+ def test_fetch_transcript_not_authenticated(self):
	1400	+ from video_processor.sources.zoom_source import ZoomSource
	1401	+
	1402	+ src = ZoomSource()
	1403	+ with pytest.raises(RuntimeError, match="Not authenticated"):
	1404	+ src.fetch_transcript("meeting123")
	1405	+
	1406	+ def test_mime_types_mapping(self):
	1407	+ from video_processor.sources.zoom_source import _MIME_TYPES
	1408	+
	1409	+ assert _MIME_TYPES["MP4"] == "video/mp4"
	1410	+ assert _MIME_TYPES["TRANSCRIPT"] == "text/vtt"
	1411	+ assert _MIME_TYPES["M4A"] == "audio/mp4"
	1412	+
	1413	+
	1414	+# ---------------------------------------------------------------------------
	1415	+# TeamsRecordingSource
	1416	+# ---------------------------------------------------------------------------
	1417	+
	1418	+
	1419	+class TestTeamsRecordingSource:
	1420	+ def test_import(self):
	1421	+ from video_processor.sources.teams_recording_source import (
	1422	+ TeamsRecordingSource,
	1423	+ )
	1424	+
	1425	+ assert TeamsRecordingSource is not None
	1426	+
	1427	+ def test_constructor_default(self):
	1428	+ from video_processor.sources.teams_recording_source import (
	1429	+ TeamsRecordingSource,
	1430	+ )
	1431	+
	1432	+ src = TeamsRecordingSource()
	1433	+ assert src.user_id == "me"
	1434	+
	1435	+ def test_constructor_custom_user(self):
	1436	+ from video_processor.sources.teams_recording_source import (
	1437	+ TeamsRecordingSource,
	1438	+ )
	1439	+
	1440	+ src = TeamsRecordingSource(user_id="[email protected]")
	1441	+ assert src.user_id == "[email protected]"
	1442	+
	1443	+ @patch("shutil.which", return_value=None)
	1444	+ def test_authenticate_no_m365(self, _mock_which):
	1445	+ from video_processor.sources.teams_recording_source import (
	1446	+ TeamsRecordingSource,
	1447	+ )
	1448	+
	1449	+ src = TeamsRecordingSource()
	1450	+ assert src.authenticate() is False
	1451	+
	1452	+ def test_vtt_to_text(self):
	1453	+ from video_processor.sources.teams_recording_source import (
	1454	+ _vtt_to_text,
	1455	+ )
	1456	+
	1457	+ vtt = (
	1458	+ "WEBVTT\n\n"
	1459	+ "1\n"
	1460	+ "00:00:01.000 --> 00:00:05.000\n"
	1461	+ "<v Speaker1>Hello everyone\n\n"
	1462	+ "2\n"
	1463	+ "00:00:05.000 --> 00:00:10.000\n"
	1464	+ "<v Speaker2>Welcome to the meeting\n"
	1465	+ )
	1466	+ result = _vtt_to_text(vtt)
	1467	+ assert "Hello everyone" in result
	1468	+ assert "Welcome to the meeting" in result
	1469	+ assert "WEBVTT" not in result
	1470	+ assert "-->" not in result
	1471	+
	1472	+ def test_vtt_to_text_empty(self):
	1473	+ from video_processor.sources.teams_recording_source import (
	1474	+ _vtt_to_text,
	1475	+ )
	1476	+
	1477	+ assert _vtt_to_text("") == ""
	1478	+
	1479	+ def test_vtt_to_text_deduplicates(self):
	1480	+ from video_processor.sources.teams_recording_source import (
	1481	+ _vtt_to_text,
	1482	+ )
	1483	+
	1484	+ vtt = (
	1485	+ "WEBVTT\n\n"
	1486	+ "00:00:01.000 --> 00:00:03.000\n"
	1487	+ "Same line\n\n"
	1488	+ "00:00:03.000 --> 00:00:05.000\n"
	1489	+ "Same line\n"
	1490	+ )
	1491	+ result = _vtt_to_text(vtt)
	1492	+ assert result.count("Same line") == 1
	1493	+
	1494	+ def test_extract_meetings_list_dict(self):
	1495	+ from video_processor.sources.teams_recording_source import (
	1496	+ TeamsRecordingSource,
	1497	+ )
	1498	+
	1499	+ src = TeamsRecordingSource()
	1500	+ result = src._extract_meetings_list({"value": [{"id": "m1"}]})
	1501	+ assert len(result) == 1
	1502	+
	1503	+ def test_extract_meetings_list_list(self):
	1504	+ from video_processor.sources.teams_recording_source import (
	1505	+ TeamsRecordingSource,
	1506	+ )
	1507	+
	1508	+ src = TeamsRecordingSource()
	1509	+ result = src._extract_meetings_list([{"id": "m1"}])
	1510	+ assert len(result) == 1
	1511	+
	1512	+
	1513	+# ---------------------------------------------------------------------------
	1514	+# MeetRecordingSource
	1515	+# ---------------------------------------------------------------------------
	1516	+
	1517	+
	1518	+class TestMeetRecordingSource:
	1519	+ def test_import(self):
	1520	+ from video_processor.sources.meet_recording_source import (
	1521	+ MeetRecordingSource,
	1522	+ )
	1523	+
	1524	+ assert MeetRecordingSource is not None
	1525	+
	1526	+ def test_constructor_default(self):
	1527	+ from video_processor.sources.meet_recording_source import (
	1528	+ MeetRecordingSource,
	1529	+ )
	1530	+
	1531	+ src = MeetRecordingSource()
	1532	+ assert src.drive_folder_id is None
	1533	+
	1534	+ def test_constructor_with_folder(self):
	1535	+ from video_processor.sources.meet_recording_source import (
	1536	+ MeetRecordingSource,
	1537	+ )
	1538	+
	1539	+ src = MeetRecordingSource(drive_folder_id="folder123")
	1540	+ assert src.drive_folder_id == "folder123"
	1541	+
	1542	+ @patch("shutil.which", return_value=None)
	1543	+ def test_authenticate_no_gws(self, _mock_which):
	1544	+ from video_processor.sources.meet_recording_source import (
	1545	+ MeetRecordingSource,
	1546	+ )
	1547	+
	1548	+ src = MeetRecordingSource()
	1549	+ assert src.authenticate() is False
	1550	+
	1551	+ def test_find_matching_transcript_date_extraction(self):
	1552	+ import re
	1553	+
	1554	+ name = "Meet Recording 2026-03-07T14:30:00"
	1555	+ match = re.search(r"\d{4}-\d{2}-\d{2}", name)
	1556	+ assert match is not None
	1557	+ assert match.group(0) == "2026-03-07"
	1558	+
	1559	+ def test_lazy_import(self):
	1560	+ from video_processor.sources import MeetRecordingSource
	1561	+
	1562	+ assert MeetRecordingSource is not None
	1563	+
	1564	+ def test_teams_lazy_import(self):
	1565	+ from video_processor.sources import TeamsRecordingSource
	1566	+
	1567	+ assert TeamsRecordingSource is not None
	1568	+
	1569	+ def test_zoom_lazy_import(self):
	1570	+ from video_processor.sources import ZoomSource
	1571	+
	1572	+ assert ZoomSource is not None
1345	1573

	--- tests/test_sources.py
	+++ tests/test_sources.py
	@@ -1340,5 +1340,233 @@
1340	html = "&lt;tag&gt; &quot;quoted&quot; &#39;apos&#39; &nbsp;space"
1341	result = _html_to_text(html)
1342	assert "<tag>" in result
1343	assert '"quoted"' in result
1344	assert "'apos'" in result




































































































































































































































1345

	--- tests/test_sources.py
	+++ tests/test_sources.py
	@@ -1340,5 +1340,233 @@
1340	html = "&lt;tag&gt; &quot;quoted&quot; &#39;apos&#39; &nbsp;space"
1341	result = _html_to_text(html)
1342	assert "<tag>" in result
1343	assert '"quoted"' in result
1344	assert "'apos'" in result
1345
1346
1347	# ---------------------------------------------------------------------------
1348	# ZoomSource
1349	# ---------------------------------------------------------------------------
1350
1351
1352	class TestZoomSource:
1353	def test_import(self):
1354	from video_processor.sources.zoom_source import ZoomSource
1355
1356	assert ZoomSource is not None
1357
1358	def test_constructor_defaults(self):
1359	from video_processor.sources.zoom_source import ZoomSource
1360
1361	src = ZoomSource()
1362	assert src.client_id is None or isinstance(src.client_id, str)
1363	assert src._access_token is None
1364
1365	def test_constructor_explicit(self):
1366	from video_processor.sources.zoom_source import ZoomSource
1367
1368	src = ZoomSource(
1369	client_id="cid",
1370	client_secret="csec",
1371	account_id="aid",
1372	)
1373	assert src.client_id == "cid"
1374	assert src.client_secret == "csec"
1375	assert src.account_id == "aid"
1376
1377	def test_authenticate_no_credentials(self):
1378	from video_processor.sources.zoom_source import ZoomSource
1379
1380	src = ZoomSource(client_id=None, client_secret=None, account_id=None)
1381	# No saved token, no account_id, no client_id → should fail
1382	assert src.authenticate() is False
1383
1384	def test_list_videos_not_authenticated(self):
1385	from video_processor.sources.zoom_source import ZoomSource
1386
1387	src = ZoomSource()
1388	with pytest.raises(RuntimeError, match="Not authenticated"):
1389	src.list_videos()
1390
1391	def test_download_not_authenticated(self):
1392	from video_processor.sources.zoom_source import ZoomSource
1393
1394	src = ZoomSource()
1395	sf = SourceFile(name="test.mp4", id="123")
1396	with pytest.raises(RuntimeError, match="Not authenticated"):
1397	src.download(sf, "/tmp/test.mp4")
1398
1399	def test_fetch_transcript_not_authenticated(self):
1400	from video_processor.sources.zoom_source import ZoomSource
1401
1402	src = ZoomSource()
1403	with pytest.raises(RuntimeError, match="Not authenticated"):
1404	src.fetch_transcript("meeting123")
1405
1406	def test_mime_types_mapping(self):
1407	from video_processor.sources.zoom_source import _MIME_TYPES
1408
1409	assert _MIME_TYPES["MP4"] == "video/mp4"
1410	assert _MIME_TYPES["TRANSCRIPT"] == "text/vtt"
1411	assert _MIME_TYPES["M4A"] == "audio/mp4"
1412
1413
1414	# ---------------------------------------------------------------------------
1415	# TeamsRecordingSource
1416	# ---------------------------------------------------------------------------
1417
1418
1419	class TestTeamsRecordingSource:
1420	def test_import(self):
1421	from video_processor.sources.teams_recording_source import (
1422	TeamsRecordingSource,
1423	)
1424
1425	assert TeamsRecordingSource is not None
1426
1427	def test_constructor_default(self):
1428	from video_processor.sources.teams_recording_source import (
1429	TeamsRecordingSource,
1430	)
1431
1432	src = TeamsRecordingSource()
1433	assert src.user_id == "me"
1434
1435	def test_constructor_custom_user(self):
1436	from video_processor.sources.teams_recording_source import (
1437	TeamsRecordingSource,
1438	)
1439
1440	src = TeamsRecordingSource(user_id="[email protected]")
1441	assert src.user_id == "[email protected]"
1442
1443	@patch("shutil.which", return_value=None)
1444	def test_authenticate_no_m365(self, _mock_which):
1445	from video_processor.sources.teams_recording_source import (
1446	TeamsRecordingSource,
1447	)
1448
1449	src = TeamsRecordingSource()
1450	assert src.authenticate() is False
1451
1452	def test_vtt_to_text(self):
1453	from video_processor.sources.teams_recording_source import (
1454	_vtt_to_text,
1455	)
1456
1457	vtt = (
1458	"WEBVTT\n\n"
1459	"1\n"
1460	"00:00:01.000 --> 00:00:05.000\n"
1461	"<v Speaker1>Hello everyone\n\n"
1462	"2\n"
1463	"00:00:05.000 --> 00:00:10.000\n"
1464	"<v Speaker2>Welcome to the meeting\n"
1465	)
1466	result = _vtt_to_text(vtt)
1467	assert "Hello everyone" in result
1468	assert "Welcome to the meeting" in result
1469	assert "WEBVTT" not in result
1470	assert "-->" not in result
1471
1472	def test_vtt_to_text_empty(self):
1473	from video_processor.sources.teams_recording_source import (
1474	_vtt_to_text,
1475	)
1476
1477	assert _vtt_to_text("") == ""
1478
1479	def test_vtt_to_text_deduplicates(self):
1480	from video_processor.sources.teams_recording_source import (
1481	_vtt_to_text,
1482	)
1483
1484	vtt = (
1485	"WEBVTT\n\n"
1486	"00:00:01.000 --> 00:00:03.000\n"
1487	"Same line\n\n"
1488	"00:00:03.000 --> 00:00:05.000\n"
1489	"Same line\n"
1490	)
1491	result = _vtt_to_text(vtt)
1492	assert result.count("Same line") == 1
1493
1494	def test_extract_meetings_list_dict(self):
1495	from video_processor.sources.teams_recording_source import (
1496	TeamsRecordingSource,
1497	)
1498
1499	src = TeamsRecordingSource()
1500	result = src._extract_meetings_list({"value": [{"id": "m1"}]})
1501	assert len(result) == 1
1502
1503	def test_extract_meetings_list_list(self):
1504	from video_processor.sources.teams_recording_source import (
1505	TeamsRecordingSource,
1506	)
1507
1508	src = TeamsRecordingSource()
1509	result = src._extract_meetings_list([{"id": "m1"}])
1510	assert len(result) == 1
1511
1512
1513	# ---------------------------------------------------------------------------
1514	# MeetRecordingSource
1515	# ---------------------------------------------------------------------------
1516
1517
1518	class TestMeetRecordingSource:
1519	def test_import(self):
1520	from video_processor.sources.meet_recording_source import (
1521	MeetRecordingSource,
1522	)
1523
1524	assert MeetRecordingSource is not None
1525
1526	def test_constructor_default(self):
1527	from video_processor.sources.meet_recording_source import (
1528	MeetRecordingSource,
1529	)
1530
1531	src = MeetRecordingSource()
1532	assert src.drive_folder_id is None
1533
1534	def test_constructor_with_folder(self):
1535	from video_processor.sources.meet_recording_source import (
1536	MeetRecordingSource,
1537	)
1538
1539	src = MeetRecordingSource(drive_folder_id="folder123")
1540	assert src.drive_folder_id == "folder123"
1541
1542	@patch("shutil.which", return_value=None)
1543	def test_authenticate_no_gws(self, _mock_which):
1544	from video_processor.sources.meet_recording_source import (
1545	MeetRecordingSource,
1546	)
1547
1548	src = MeetRecordingSource()
1549	assert src.authenticate() is False
1550
1551	def test_find_matching_transcript_date_extraction(self):
1552	import re
1553
1554	name = "Meet Recording 2026-03-07T14:30:00"
1555	match = re.search(r"\d{4}-\d{2}-\d{2}", name)
1556	assert match is not None
1557	assert match.group(0) == "2026-03-07"
1558
1559	def test_lazy_import(self):
1560	from video_processor.sources import MeetRecordingSource
1561
1562	assert MeetRecordingSource is not None
1563
1564	def test_teams_lazy_import(self):
1565	from video_processor.sources import TeamsRecordingSource
1566
1567	assert TeamsRecordingSource is not None
1568
1569	def test_zoom_lazy_import(self):
1570	from video_processor.sources import ZoomSource
1571
1572	assert ZoomSource is not None
1573

M video_processor/cli/commands.py

+102

		--- video_processor/cli/commands.py
		+++ video_processor/cli/commands.py
		@@ -1582,10 +1582,112 @@
1582	1582	click.echo(f"Wiki pushed to https://github.com/{repo}/wiki")
1583	1583	else:
1584	1584	click.echo("Wiki push failed. Check auth and repo permissions.", err=True)
1585	1585	sys.exit(1)
1586	1586
	1587	+
	1588	+@cli.group()
	1589	+def recordings():
	1590	+ """Fetch meeting recordings from Zoom, Teams, and Google Meet."""
	1591	+ pass
	1592	+
	1593	+
	1594	+@recordings.command("zoom-list")
	1595	+@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
	1596	+def recordings_zoom_list(as_json):
	1597	+ """List Zoom cloud recordings.
	1598	+
	1599	+ Requires ZOOM_CLIENT_ID (and optionally ZOOM_CLIENT_SECRET,
	1600	+ ZOOM_ACCOUNT_ID) environment variables.
	1601	+
	1602	+ Examples:
	1603	+
	1604	+ planopticon recordings zoom-list
	1605	+
	1606	+ planopticon recordings zoom-list --json
	1607	+ """
	1608	+ from video_processor.sources.zoom_source import ZoomSource
	1609	+
	1610	+ source = ZoomSource()
	1611	+ if not source.authenticate():
	1612	+ click.echo("Zoom authentication failed.", err=True)
	1613	+ sys.exit(1)
	1614	+
	1615	+ files = source.list_videos()
	1616	+ if as_json:
	1617	+ click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str))
	1618	+ else:
	1619	+ click.echo(f"Found {len(files)} recording(s):")
	1620	+ for f in files:
	1621	+ size = f"{f.size_bytes // 1_000_000} MB" if f.size_bytes else "unknown"
	1622	+ click.echo(f" {f.name} ({size}) {f.modified_at or ''}")
	1623	+
	1624	+
	1625	+@recordings.command("teams-list")
	1626	+@click.option("--user-id", default="me", help="Microsoft user ID")
	1627	+@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
	1628	+def recordings_teams_list(user_id, as_json):
	1629	+ """List Teams meeting recordings via the m365 CLI.
	1630	+
	1631	+ Requires: npm install -g @pnp/cli-microsoft365 && m365 login
	1632	+
	1633	+ Examples:
	1634	+
	1635	+ planopticon recordings teams-list
	1636	+
	1637	+ planopticon recordings teams-list --json
	1638	+ """
	1639	+ from video_processor.sources.teams_recording_source import (
	1640	+ TeamsRecordingSource,
	1641	+ )
	1642	+
	1643	+ source = TeamsRecordingSource(user_id=user_id)
	1644	+ if not source.authenticate():
	1645	+ click.echo("Teams authentication failed.", err=True)
	1646	+ sys.exit(1)
	1647	+
	1648	+ files = source.list_videos()
	1649	+ if as_json:
	1650	+ click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str))
	1651	+ else:
	1652	+ click.echo(f"Found {len(files)} recording(s):")
	1653	+ for f in files:
	1654	+ click.echo(f" {f.name} {f.modified_at or ''}")
	1655	+
	1656	+
	1657	+@recordings.command("meet-list")
	1658	+@click.option("--folder-id", default=None, help="Drive folder ID")
	1659	+@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
	1660	+def recordings_meet_list(folder_id, as_json):
	1661	+ """List Google Meet recordings in Drive via the gws CLI.
	1662	+
	1663	+ Requires: npm install -g @googleworkspace/cli && gws auth login
	1664	+
	1665	+ Examples:
	1666	+
	1667	+ planopticon recordings meet-list
	1668	+
	1669	+ planopticon recordings meet-list --folder-id abc123
	1670	+ """
	1671	+ from video_processor.sources.meet_recording_source import (
	1672	+ MeetRecordingSource,
	1673	+ )
	1674	+
	1675	+ source = MeetRecordingSource(drive_folder_id=folder_id)
	1676	+ if not source.authenticate():
	1677	+ click.echo("Google Meet authentication failed.", err=True)
	1678	+ sys.exit(1)
	1679	+
	1680	+ files = source.list_videos()
	1681	+ if as_json:
	1682	+ click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str))
	1683	+ else:
	1684	+ click.echo(f"Found {len(files)} recording(s):")
	1685	+ for f in files:
	1686	+ size = f"{f.size_bytes // 1_000_000} MB" if f.size_bytes else "unknown"
	1687	+ click.echo(f" {f.name} ({size}) {f.modified_at or ''}")
	1688	+
1587	1689
1588	1690	@cli.group()
1589	1691	def kg():
1590	1692	"""Knowledge graph utilities: convert, sync, and inspect."""
1591	1693	pass
1592	1694

	--- video_processor/cli/commands.py
	+++ video_processor/cli/commands.py
	@@ -1582,10 +1582,112 @@
1582	click.echo(f"Wiki pushed to https://github.com/{repo}/wiki")
1583	else:
1584	click.echo("Wiki push failed. Check auth and repo permissions.", err=True)
1585	sys.exit(1)
1586






































































































1587
1588	@cli.group()
1589	def kg():
1590	"""Knowledge graph utilities: convert, sync, and inspect."""
1591	pass
1592

	--- video_processor/cli/commands.py
	+++ video_processor/cli/commands.py
	@@ -1582,10 +1582,112 @@
1582	click.echo(f"Wiki pushed to https://github.com/{repo}/wiki")
1583	else:
1584	click.echo("Wiki push failed. Check auth and repo permissions.", err=True)
1585	sys.exit(1)
1586
1587
1588	@cli.group()
1589	def recordings():
1590	"""Fetch meeting recordings from Zoom, Teams, and Google Meet."""
1591	pass
1592
1593
1594	@recordings.command("zoom-list")
1595	@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
1596	def recordings_zoom_list(as_json):
1597	"""List Zoom cloud recordings.
1598
1599	Requires ZOOM_CLIENT_ID (and optionally ZOOM_CLIENT_SECRET,
1600	ZOOM_ACCOUNT_ID) environment variables.
1601
1602	Examples:
1603
1604	planopticon recordings zoom-list
1605
1606	planopticon recordings zoom-list --json
1607	"""
1608	from video_processor.sources.zoom_source import ZoomSource
1609
1610	source = ZoomSource()
1611	if not source.authenticate():
1612	click.echo("Zoom authentication failed.", err=True)
1613	sys.exit(1)
1614
1615	files = source.list_videos()
1616	if as_json:
1617	click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str))
1618	else:
1619	click.echo(f"Found {len(files)} recording(s):")
1620	for f in files:
1621	size = f"{f.size_bytes // 1_000_000} MB" if f.size_bytes else "unknown"
1622	click.echo(f" {f.name} ({size}) {f.modified_at or ''}")
1623
1624
1625	@recordings.command("teams-list")
1626	@click.option("--user-id", default="me", help="Microsoft user ID")
1627	@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
1628	def recordings_teams_list(user_id, as_json):
1629	"""List Teams meeting recordings via the m365 CLI.
1630
1631	Requires: npm install -g @pnp/cli-microsoft365 && m365 login
1632
1633	Examples:
1634
1635	planopticon recordings teams-list
1636
1637	planopticon recordings teams-list --json
1638	"""
1639	from video_processor.sources.teams_recording_source import (
1640	TeamsRecordingSource,
1641	)
1642
1643	source = TeamsRecordingSource(user_id=user_id)
1644	if not source.authenticate():
1645	click.echo("Teams authentication failed.", err=True)
1646	sys.exit(1)
1647
1648	files = source.list_videos()
1649	if as_json:
1650	click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str))
1651	else:
1652	click.echo(f"Found {len(files)} recording(s):")
1653	for f in files:
1654	click.echo(f" {f.name} {f.modified_at or ''}")
1655
1656
1657	@recordings.command("meet-list")
1658	@click.option("--folder-id", default=None, help="Drive folder ID")
1659	@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
1660	def recordings_meet_list(folder_id, as_json):
1661	"""List Google Meet recordings in Drive via the gws CLI.
1662
1663	Requires: npm install -g @googleworkspace/cli && gws auth login
1664
1665	Examples:
1666
1667	planopticon recordings meet-list
1668
1669	planopticon recordings meet-list --folder-id abc123
1670	"""
1671	from video_processor.sources.meet_recording_source import (
1672	MeetRecordingSource,
1673	)
1674
1675	source = MeetRecordingSource(drive_folder_id=folder_id)
1676	if not source.authenticate():
1677	click.echo("Google Meet authentication failed.", err=True)
1678	sys.exit(1)
1679
1680	files = source.list_videos()
1681	if as_json:
1682	click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str))
1683	else:
1684	click.echo(f"Found {len(files)} recording(s):")
1685	for f in files:
1686	size = f"{f.size_bytes // 1_000_000} MB" if f.size_bytes else "unknown"
1687	click.echo(f" {f.name} ({size}) {f.modified_at or ''}")
1688
1689
1690	@cli.group()
1691	def kg():
1692	"""Knowledge graph utilities: convert, sync, and inspect."""
1693	pass
1694

M video_processor/sources/__init__.py

		--- video_processor/sources/__init__.py
		+++ video_processor/sources/__init__.py
		@@ -12,19 +12,22 @@
12	12	"GoogleKeepSource",
13	13	"GWSSource",
14	14	"HackerNewsSource",
15	15	"LogseqSource",
16	16	"M365Source",
	17	+ "MeetRecordingSource",
17	18	"NotionSource",
18	19	"ObsidianSource",
19	20	"OneNoteSource",
20	21	"PodcastSource",
	22	+ "TeamsRecordingSource",
21	23	"RedditSource",
22	24	"RSSSource",
23	25	"TwitterSource",
24	26	"WebSource",
25	27	"YouTubeSource",
	28	+ "ZoomSource",
26	29	]
27	30
28	31
29	32	def __getattr__(name: str):
30	33	"""Lazy imports to avoid pulling in optional dependencies at import time."""
		@@ -36,21 +39,24 @@
36	39	"GoogleKeepSource": "video_processor.sources.google_keep_source",
37	40	"GWSSource": "video_processor.sources.gws_source",
38	41	"HackerNewsSource": "video_processor.sources.hackernews_source",
39	42	"LogseqSource": "video_processor.sources.logseq_source",
40	43	"M365Source": "video_processor.sources.m365_source",
	44	+ "MeetRecordingSource": "video_processor.sources.meet_recording_source",
41	45	"NotionSource": "video_processor.sources.notion_source",
42	46	"ObsidianSource": "video_processor.sources.obsidian_source",
43	47	"OneNoteSource": "video_processor.sources.onenote_source",
44	48	"PodcastSource": "video_processor.sources.podcast_source",
	49	+ "TeamsRecordingSource": "video_processor.sources.teams_recording_source",
45	50	"RedditSource": "video_processor.sources.reddit_source",
46	51	"RSSSource": "video_processor.sources.rss_source",
47	52	"TwitterSource": "video_processor.sources.twitter_source",
48	53	"WebSource": "video_processor.sources.web_source",
49	54	"YouTubeSource": "video_processor.sources.youtube_source",
	55	+ "ZoomSource": "video_processor.sources.zoom_source",
50	56	}
51	57	if name in _lazy_map:
52	58	import importlib
53	59
54	60	module = importlib.import_module(_lazy_map[name])
55	61	return getattr(module, name)
56	62	raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
57	63
58	64	ADDED video_processor/sources/meet_recording_source.py
59	65	ADDED video_processor/sources/teams_recording_source.py
60	66	ADDED video_processor/sources/zoom_source.py

	--- video_processor/sources/__init__.py
	+++ video_processor/sources/__init__.py
	@@ -12,19 +12,22 @@
12	"GoogleKeepSource",
13	"GWSSource",
14	"HackerNewsSource",
15	"LogseqSource",
16	"M365Source",

17	"NotionSource",
18	"ObsidianSource",
19	"OneNoteSource",
20	"PodcastSource",

21	"RedditSource",
22	"RSSSource",
23	"TwitterSource",
24	"WebSource",
25	"YouTubeSource",

26	]
27
28
29	def __getattr__(name: str):
30	"""Lazy imports to avoid pulling in optional dependencies at import time."""
	@@ -36,21 +39,24 @@
36	"GoogleKeepSource": "video_processor.sources.google_keep_source",
37	"GWSSource": "video_processor.sources.gws_source",
38	"HackerNewsSource": "video_processor.sources.hackernews_source",
39	"LogseqSource": "video_processor.sources.logseq_source",
40	"M365Source": "video_processor.sources.m365_source",

41	"NotionSource": "video_processor.sources.notion_source",
42	"ObsidianSource": "video_processor.sources.obsidian_source",
43	"OneNoteSource": "video_processor.sources.onenote_source",
44	"PodcastSource": "video_processor.sources.podcast_source",

45	"RedditSource": "video_processor.sources.reddit_source",
46	"RSSSource": "video_processor.sources.rss_source",
47	"TwitterSource": "video_processor.sources.twitter_source",
48	"WebSource": "video_processor.sources.web_source",
49	"YouTubeSource": "video_processor.sources.youtube_source",

50	}
51	if name in _lazy_map:
52	import importlib
53
54	module = importlib.import_module(_lazy_map[name])
55	return getattr(module, name)
56	raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
57
58	ADDED video_processor/sources/meet_recording_source.py
59	ADDED video_processor/sources/teams_recording_source.py
60	ADDED video_processor/sources/zoom_source.py

	--- video_processor/sources/__init__.py
	+++ video_processor/sources/__init__.py
	@@ -12,19 +12,22 @@
12	"GoogleKeepSource",
13	"GWSSource",
14	"HackerNewsSource",
15	"LogseqSource",
16	"M365Source",
17	"MeetRecordingSource",
18	"NotionSource",
19	"ObsidianSource",
20	"OneNoteSource",
21	"PodcastSource",
22	"TeamsRecordingSource",
23	"RedditSource",
24	"RSSSource",
25	"TwitterSource",
26	"WebSource",
27	"YouTubeSource",
28	"ZoomSource",
29	]
30
31
32	def __getattr__(name: str):
33	"""Lazy imports to avoid pulling in optional dependencies at import time."""
	@@ -36,21 +39,24 @@
39	"GoogleKeepSource": "video_processor.sources.google_keep_source",
40	"GWSSource": "video_processor.sources.gws_source",
41	"HackerNewsSource": "video_processor.sources.hackernews_source",
42	"LogseqSource": "video_processor.sources.logseq_source",
43	"M365Source": "video_processor.sources.m365_source",
44	"MeetRecordingSource": "video_processor.sources.meet_recording_source",
45	"NotionSource": "video_processor.sources.notion_source",
46	"ObsidianSource": "video_processor.sources.obsidian_source",
47	"OneNoteSource": "video_processor.sources.onenote_source",
48	"PodcastSource": "video_processor.sources.podcast_source",
49	"TeamsRecordingSource": "video_processor.sources.teams_recording_source",
50	"RedditSource": "video_processor.sources.reddit_source",
51	"RSSSource": "video_processor.sources.rss_source",
52	"TwitterSource": "video_processor.sources.twitter_source",
53	"WebSource": "video_processor.sources.web_source",
54	"YouTubeSource": "video_processor.sources.youtube_source",
55	"ZoomSource": "video_processor.sources.zoom_source",
56	}
57	if name in _lazy_map:
58	import importlib
59
60	module = importlib.import_module(_lazy_map[name])
61	return getattr(module, name)
62	raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
63
64	ADDED video_processor/sources/meet_recording_source.py
65	ADDED video_processor/sources/teams_recording_source.py
66	ADDED video_processor/sources/zoom_source.py

A video_processor/sources/meet_recording_source.py

+280

		--- a/video_processor/sources/meet_recording_source.py
		+++ b/video_processor/sources/meet_recording_source.py
		@@ -0,0 +1,280 @@
	1	+"""Google Meet recording source using the gws CLI (googleworkspace/cli).
	2	+
	3	+Fetches Meet recordings and companion transcripts from Google Drive
	4	+via the `gws` CLI tool.
	5	+
	6	+Requires: npm install -g @googleworkspace/cli
	7	+Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless)
	8	+"""
	9	+
	10	+import json
	11	+import logging
	12	+import re
	13	+import shutil
	14	+import subprocess
	15	+from pathlib import Path
	16	+from typing import List, Optional
	17	+
	18	+from video_processor.sources.base import BaseSource, SourceFile
	19	+from video_processor.sources.gws_source import _run_gws
	20	+
	21	+logger = logging.getLogger(__name__)
	22	+
	23	+
	24	+class MeetRecordingSource(BaseSource):
	25	+ """
	26	+ Fetch Google Meet recordings and transcripts from Google Drive via the gws CLI.
	27	+
	28	+ Meet stores recordings as MP4 files in Drive (typically in a "Meet Recordings"
	29	+ folder) and auto-generated transcripts as Google Docs.
	30	+
	31	+ Usage:
	32	+ source = MeetRecordingSource()
	33	+ source.authenticate()
	34	+ recordings = source.list_videos()
	35	+ source.download_all(recordings, Path("./recordings"))
	36	+
	37	+ # Fetch transcript for a specific recording
	38	+ transcript = source.fetch_transcript("Meet Recording 2026-03-07")
	39	+ """
	40	+
	41	+ def __init__(self, drive_folder_id: Optional[str] = None):
	42	+ self.drive_folder_id = drive_folder_id
	43	+
	44	+ def authenticate(self) -> bool:
	45	+ """Check if gws CLI is installed and authenticated."""
	46	+ if not shutil.which("gws"):
	47	+ logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
	48	+ return False
	49	+ try:
	50	+ _run_gws(["auth", "status"], timeout=10)
	51	+ return True
	52	+ except (RuntimeError, subprocess.TimeoutExpired):
	53	+ logger.error("gws not authenticated. Run: gws auth login")
	54	+ return False
	55	+
	56	+ def list_videos(
	57	+ self,
	58	+ folder_id: Optional[str] = None,
	59	+ folder_path: Optional[str] = None,
	60	+ patterns: Optional[List[str]] = None,
	61	+ ) -> List[SourceFile]:
	62	+ """List Google Meet recordings in Drive.
	63	+
	64	+ Searches for MP4 files with 'Meet Recording' in the name. If a
	65	+ drive_folder_id is set, restricts search to that folder.
	66	+ Also discovers companion transcript docs for each recording.
	67	+ """
	68	+ target_folder = folder_id or self.drive_folder_id
	69	+ files: List[SourceFile] = []
	70	+
	71	+ # Build the Drive search query for Meet recordings
	72	+ q_parts = [
	73	+ "mimeType='video/mp4'",
	74	+ "name contains 'Meet Recording'",
	75	+ "trashed=false",
	76	+ ]
	77	+ if target_folder:
	78	+ q_parts.append(f"'{target_folder}' in parents")
	79	+
	80	+ params = {
	81	+ "q": " and ".join(q_parts),
	82	+ "fields": "files(id,name,mimeType,size,modifiedTime)",
	83	+ "pageSize": 50,
	84	+ "orderBy": "modifiedTime desc",
	85	+ }
	86	+
	87	+ try:
	88	+ result = _run_gws(
	89	+ [
	90	+ "drive",
	91	+ "files",
	92	+ "list",
	93	+ "--params",
	94	+ json.dumps(params),
	95	+ ],
	96	+ timeout=60,
	97	+ )
	98	+ except RuntimeError as e:
	99	+ logger.error(f"Failed to list Meet recordings: {e}")
	100	+ return []
	101	+
	102	+ recordings = result.get("files", [])
	103	+ for item in recordings:
	104	+ size = item.get("size")
	105	+ files.append(
	106	+ SourceFile(
	107	+ name=item.get("name", "Meet Recording"),
	108	+ id=item.get("id", ""),
	109	+ size_bytes=int(size) if size else None,
	110	+ mime_type=item.get("mimeType", "video/mp4"),
	111	+ modified_at=item.get("modifiedTime"),
	112	+ )
	113	+ )
	114	+
	115	+ # Also search for auto-generated transcript docs
	116	+ transcript_params = {
	117	+ "q": " and ".join(
	118	+ [
	119	+ "mimeType='application/vnd.google-apps.document'",
	120	+ "(name contains 'Transcript' or name contains 'Meeting notes')",
	121	+ "trashed=false",
	122	+ ]
	123	+ + ([f"'{target_folder}' in parents"] if target_folder else [])
	124	+ ),
	125	+ "fields": "files(id,name,mimeType,modifiedTime)",
	126	+ "pageSize": 50,
	127	+ "orderBy": "modifiedTime desc",
	128	+ }
	129	+
	130	+ try:
	131	+ transcript_result = _run_gws(
	132	+ [
	133	+ "drive",
	134	+ "files",
	135	+ "list",
	136	+ "--params",
	137	+ json.dumps(transcript_params),
	138	+ ],
	139	+ timeout=60,
	140	+ )
	141	+ transcript_files = transcript_result.get("files", [])
	142	+ logger.info(
	143	+ f"Found {len(recordings)} recording(s) and "
	144	+ f"{len(transcript_files)} transcript doc(s) in Drive"
	145	+ )
	146	+ except RuntimeError as e:
	147	+ logger.debug(f"Transcript search failed: {e}")
	148	+
	149	+ if not files:
	150	+ logger.warning("No Google Meet recordings found in Drive")
	151	+
	152	+ logger.info(f"Found {len(files)} Meet recording(s)")
	153	+ return files
	154	+
	155	+ def download(self, file: SourceFile, destination: Path) -> Path:
	156	+ """Download a Meet recording from Drive."""
	157	+ destination = Path(destination)
	158	+ destination.parent.mkdir(parents=True, exist_ok=True)
	159	+
	160	+ # For video files, download binary content via alt=media
	161	+ result = _run_gws(
	162	+ [
	163	+ "drive",
	164	+ "files",
	165	+ "get",
	166	+ "--params",
	167	+ json.dumps({"fileId": file.id, "alt": "media"}),
	168	+ ],
	169	+ timeout=300,
	170	+ )
	171	+
	172	+ # Write the content — result may be raw binary or a dict wrapper
	173	+ raw = result.get("raw", "") if isinstance(result, dict) else str(result)
	174	+ destination.write_text(raw, encoding="utf-8")
	175	+ logger.info(f"Downloaded {file.name} to {destination}")
	176	+ return destination
	177	+
	178	+ def fetch_transcript(self, recording_name: str) -> Optional[str]:
	179	+ """Fetch the companion transcript for a Meet recording.
	180	+
	181	+ Google Meet creates transcript docs with names that typically match
	182	+ the recording date/time. This method searches for the matching
	183	+ Google Doc and extracts its text content.
	184	+ """
	185	+ transcript_id = self._find_matching_transcript(recording_name)
	186	+ if not transcript_id:
	187	+ logger.info(f"No matching transcript found for: {recording_name}")
	188	+ return None
	189	+
	190	+ # Fetch the Google Doc content via the Docs API
	191	+ try:
	192	+ result = _run_gws(
	193	+ [
	194	+ "docs",
	195	+ "documents",
	196	+ "get",
	197	+ "--params",
	198	+ json.dumps({"documentId": transcript_id}),
	199	+ ],
	200	+ timeout=60,
	201	+ )
	202	+ except RuntimeError as e:
	203	+ logger.warning(f"Failed to fetch transcript doc {transcript_id}: {e}")
	204	+ return None
	205	+
	206	+ # Extract text from the Docs API structural response
	207	+ body = result.get("body", {})
	208	+ text_parts: list[str] = []
	209	+ for element in body.get("content", []):
	210	+ paragraph = element.get("paragraph", {})
	211	+ for pe in paragraph.get("elements", []):
	212	+ text_run = pe.get("textRun", {})
	213	+ text = text_run.get("content", "")
	214	+ if text.strip():
	215	+ text_parts.append(text)
	216	+
	217	+ if not text_parts:
	218	+ logger.warning(f"Transcript doc {transcript_id} had no extractable text")
	219	+ return None
	220	+
	221	+ return "".join(text_parts)
	222	+
	223	+ def _find_matching_transcript(self, recording_name: str) -> Optional[str]:
	224	+ """Search Drive for a transcript doc that matches a recording name.
	225	+
	226	+ Meet recordings are typically named like:
	227	+ "Meet Recording 2026-03-07T14:30:00"
	228	+ And transcripts are named like:
	229	+ "Meeting Transcript 2026-03-07" or "2026-03-07 - Transcript"
	230	+
	231	+ This extracts the date portion and searches for matching transcript docs.
	232	+ """
	233	+ # Extract a date string from the recording name (YYYY-MM-DD pattern)
	234	+ date_match = re.search(r"\d{4}-\d{2}-\d{2}", recording_name)
	235	+ date_str = date_match.group(0) if date_match else recording_name
	236	+
	237	+ # Search for transcript docs matching the date
	238	+ search_query = " and ".join(
	239	+ [
	240	+ "mimeType='application/vnd.google-apps.document'",
	241	+ f"name contains '{date_str}'",
	242	+ "(name contains 'Transcript' or name contains 'transcript' "
	243	+ "or name contains 'Meeting notes')",
	244	+ "trashed=false",
	245	+ ]
	246	+ )
	247	+ if self.drive_folder_id:
	248	+ search_query += f" and '{self.drive_folder_id}' in parents"
	249	+
	250	+ try:
	251	+ result = _run_gws(
	252	+ [
	253	+ "drive",
	254	+ "files",
	255	+ "list",
	256	+ "--params",
	257	+ json.dumps(
	258	+ {
	259	+ "q": search_query,
	260	+ "fields": "files(id,name,modifiedTime)",
	261	+ "pageSize": 5,
	262	+ "orderBy": "modifiedTime desc",
	263	+ }
	264	+ ),
	265	+ ],
	266	+ timeout=60,
	267	+ )
	268	+ except RuntimeError as e:
	269	+ logger.debug(f"Transcript search failed for '{date_str}': {e}")
	270	+ return None
	271	+
	272	+ files = result.get("files", [])
	273	+ if not files:
	274	+ logger.debug(f"No transcript docs found matching '{date_str}'")
	275	+ return None
	276	+
	277	+ # Return the most recently modified match
	278	+ best = files[0]
	279	+ logger.info(f"Matched transcript: {best.get('name')} for recording: {recording_name}")
	280	+ return best.get("id")

	--- a/video_processor/sources/meet_recording_source.py
	+++ b/video_processor/sources/meet_recording_source.py
	@@ -0,0 +1,280 @@

	--- a/video_processor/sources/meet_recording_source.py
	+++ b/video_processor/sources/meet_recording_source.py
	@@ -0,0 +1,280 @@
1	"""Google Meet recording source using the gws CLI (googleworkspace/cli).
2
3	Fetches Meet recordings and companion transcripts from Google Drive
4	via the `gws` CLI tool.
5
6	Requires: npm install -g @googleworkspace/cli
7	Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless)
8	"""
9
10	import json
11	import logging
12	import re
13	import shutil
14	import subprocess
15	from pathlib import Path
16	from typing import List, Optional
17
18	from video_processor.sources.base import BaseSource, SourceFile
19	from video_processor.sources.gws_source import _run_gws
20
21	logger = logging.getLogger(__name__)
22
23
class MeetRecordingSource(BaseSource):
    """Fetch Google Meet recordings and transcripts from Google Drive via the gws CLI.

    Meet stores recordings as MP4 files in Drive (typically in a "Meet Recordings"
    folder) and auto-generated transcripts as Google Docs.

    Usage:
        source = MeetRecordingSource()
        source.authenticate()
        recordings = source.list_videos()
        source.download_all(recordings, Path("./recordings"))

        # Fetch transcript for a specific recording
        transcript = source.fetch_transcript("Meet Recording 2026-03-07")
    """

    def __init__(self, drive_folder_id: Optional[str] = None):
        """Store the optional Drive folder ID that scopes searches."""
        self.drive_folder_id = drive_folder_id

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _quote(value: str) -> str:
        """Return *value* as a single-quoted Drive query string literal.

        Backslashes and single quotes are backslash-escaped so that a folder
        ID or recording name containing an apostrophe cannot terminate the
        literal early and corrupt the rest of the ``q`` expression.
        """
        escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    @staticmethod
    def _drive_list(clauses: List[str], fields: str, page_size: int):
        """Run ``gws drive files list`` with the AND-combined query *clauses*.

        Results are always ordered by ``modifiedTime desc``. Whatever
        ``_run_gws`` returns is passed through unchanged; its exceptions
        propagate to the caller.
        """
        params = {
            "q": " and ".join(clauses),
            "fields": fields,
            "pageSize": page_size,
            "orderBy": "modifiedTime desc",
        }
        return _run_gws(
            ["drive", "files", "list", "--params", json.dumps(params)],
            timeout=60,
        )

    @staticmethod
    def _files_of(result) -> list:
        """Return the ``files`` list of a Drive response, or ``[]`` if absent."""
        return result.get("files", []) if isinstance(result, dict) else []

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def authenticate(self) -> bool:
        """Check if gws CLI is installed and authenticated.

        Returns:
            True when ``gws`` is on PATH and ``gws auth status`` succeeds,
            False otherwise (the reason is logged).
        """
        if not shutil.which("gws"):
            logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
            return False
        try:
            _run_gws(["auth", "status"], timeout=10)
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("gws not authenticated. Run: gws auth login")
            return False
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Google Meet recordings in Drive.

        Searches for MP4 files with 'Meet Recording' in the name. If
        *folder_id* (or the instance's ``drive_folder_id``) is set, the search
        is restricted to that folder. A second, best-effort query counts
        transcript docs for logging only; they are not attached to the result.

        Args:
            folder_id: Drive folder to search; overrides ``drive_folder_id``.
            folder_path: Accepted for interface compatibility; unused here.
            patterns: Accepted for interface compatibility; unused here.

        Returns:
            One SourceFile per recording, newest first; ``[]`` when the
            listing command fails or times out.
        """
        target_folder = folder_id or self.drive_folder_id
        folder_clause = (
            [f"{self._quote(target_folder)} in parents"] if target_folder else []
        )

        try:
            result = self._drive_list(
                [
                    "mimeType='video/mp4'",
                    "name contains 'Meet Recording'",
                    "trashed=false",
                ]
                + folder_clause,
                fields="files(id,name,mimeType,size,modifiedTime)",
                page_size=50,
            )
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            logger.error(f"Failed to list Meet recordings: {e}")
            return []

        recordings = self._files_of(result)
        files: List[SourceFile] = []
        for item in recordings:
            size = item.get("size")
            files.append(
                SourceFile(
                    name=item.get("name", "Meet Recording"),
                    id=item.get("id", ""),
                    # Drive reports size as a string; absent for some files.
                    size_bytes=int(size) if size else None,
                    mime_type=item.get("mimeType", "video/mp4"),
                    modified_at=item.get("modifiedTime"),
                )
            )

        # Also search for auto-generated transcript docs (informational only,
        # so a failure here must not discard the recordings found above).
        try:
            transcript_result = self._drive_list(
                [
                    "mimeType='application/vnd.google-apps.document'",
                    "(name contains 'Transcript' or name contains 'Meeting notes')",
                    "trashed=false",
                ]
                + folder_clause,
                fields="files(id,name,mimeType,modifiedTime)",
                page_size=50,
            )
            transcript_files = self._files_of(transcript_result)
            logger.info(
                f"Found {len(recordings)} recording(s) and "
                f"{len(transcript_files)} transcript doc(s) in Drive"
            )
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            logger.debug(f"Transcript search failed: {e}")

        if not files:
            logger.warning("No Google Meet recordings found in Drive")

        logger.info(f"Found {len(files)} Meet recording(s)")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a Meet recording from Drive.

        Args:
            file: Recording to fetch; only ``file.id`` and ``file.name`` are used.
            destination: Target path; parent directories are created.

        Returns:
            The destination path.

        Raises:
            Whatever ``_run_gws`` raises when the command fails.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        # For video files, download binary content via alt=media
        result = _run_gws(
            [
                "drive",
                "files",
                "get",
                "--params",
                json.dumps({"fileId": file.id, "alt": "media"}),
            ],
            timeout=300,
        )

        # The result may be a dict wrapper with a "raw" payload or the payload itself.
        raw = result.get("raw", "") if isinstance(result, dict) else result
        if isinstance(raw, (bytes, bytearray)):
            # Never stringify bytes: str(b"...") would write the repr, not the data.
            destination.write_bytes(bytes(raw))
        else:
            # NOTE(review): presumably _run_gws hands back decoded text; an MP4
            # body is unlikely to survive that round trip intact. Confirm, and
            # prefer having the CLI write straight to a file if it supports it.
            destination.write_text(str(raw), encoding="utf-8")
        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def fetch_transcript(self, recording_name: str) -> Optional[str]:
        """Fetch the companion transcript for a Meet recording.

        Looks up a matching Google Doc via ``_find_matching_transcript`` and
        concatenates the text runs of its paragraphs.

        Returns:
            The transcript text, or None when no doc matches, the fetch
            fails or times out, or the doc has no extractable text.
        """
        transcript_id = self._find_matching_transcript(recording_name)
        if not transcript_id:
            logger.info(f"No matching transcript found for: {recording_name}")
            return None

        # Fetch the Google Doc content via the Docs API
        try:
            result = _run_gws(
                [
                    "docs",
                    "documents",
                    "get",
                    "--params",
                    json.dumps({"documentId": transcript_id}),
                ],
                timeout=60,
            )
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            logger.warning(f"Failed to fetch transcript doc {transcript_id}: {e}")
            return None

        # Extract text from the Docs API structural response:
        # body.content[].paragraph.elements[].textRun.content
        body = result.get("body", {}) if isinstance(result, dict) else {}
        text_parts: List[str] = []
        for element in body.get("content", []):
            for run in element.get("paragraph", {}).get("elements", []):
                text = run.get("textRun", {}).get("content", "")
                if text.strip():  # skip whitespace-only runs
                    text_parts.append(text)

        if not text_parts:
            logger.warning(f"Transcript doc {transcript_id} had no extractable text")
            return None

        return "".join(text_parts)

    def _find_matching_transcript(self, recording_name: str) -> Optional[str]:
        """Search Drive for a transcript doc that matches a recording name.

        Meet recordings are typically named like:
            "Meet Recording 2026-03-07T14:30:00"
        And transcripts are named like:
            "Meeting Transcript 2026-03-07" or "2026-03-07 - Transcript"

        The first YYYY-MM-DD date in *recording_name* is used as the search
        term; when there is none, the whole name is used instead.

        Returns:
            The Drive file ID of the most recently modified match, or None
            when nothing matches or the search fails or times out.
        """
        date_match = re.search(r"\d{4}-\d{2}-\d{2}", recording_name)
        date_str = date_match.group(0) if date_match else recording_name

        clauses = [
            "mimeType='application/vnd.google-apps.document'",
            # The fallback term is an arbitrary file name, so it must be escaped.
            f"name contains {self._quote(date_str)}",
            "(name contains 'Transcript' or name contains 'transcript' "
            "or name contains 'Meeting notes')",
            "trashed=false",
        ]
        if self.drive_folder_id:
            clauses.append(f"{self._quote(self.drive_folder_id)} in parents")

        try:
            result = self._drive_list(
                clauses,
                fields="files(id,name,modifiedTime)",
                page_size=5,
            )
        except (RuntimeError, subprocess.TimeoutExpired) as e:
            logger.debug(f"Transcript search failed for '{date_str}': {e}")
            return None

        files = self._files_of(result)
        if not files:
            logger.debug(f"No transcript docs found matching '{date_str}'")
            return None

        # Results are ordered newest first, so the head is the best match.
        best = files[0]
        logger.info(f"Matched transcript: {best.get('name')} for recording: {recording_name}")
        return best.get("id")

A video_processor/sources/teams_recording_source.py

+375

		--- a/video_processor/sources/teams_recording_source.py
		+++ b/video_processor/sources/teams_recording_source.py
		@@ -0,0 +1,375 @@
	1	+"""Microsoft Teams meeting recording source using the m365 CLI.
	2	+
	3	+Fetches Teams meeting recordings and transcripts via the Microsoft Graph API
	4	+through the `m365` CLI tool.
	5	+
	6	+Requires: npm install -g @pnp/cli-microsoft365
	7	+Auth: m365 login (interactive)
	8	+Docs: https://pnp.github.io/cli-microsoft365/
	9	+"""
	10	+
	11	+import logging
	12	+import re
	13	+import shutil
	14	+import subprocess
	15	+from pathlib import Path
	16	+from typing import List, Optional
	17	+
	18	+from video_processor.sources.base import BaseSource, SourceFile
	19	+from video_processor.sources.m365_source import _run_m365
	20	+
	21	+logger = logging.getLogger(__name__)
	22	+
	23	+
	24	+def _vtt_to_text(vtt: str) -> str:
	25	+ """Strip VTT timing metadata and return plain text.
	26	+
	27	+ Removes WEBVTT headers, timestamps (00:00:00.000 --> 00:00:05.000),
	28	+ cue identifiers, and deduplicates consecutive identical lines.
	29	+ """
	30	+ lines = vtt.splitlines()
	31	+ text_lines: list[str] = []
	32	+ prev_line = ""
	33	+
	34	+ for line in lines:
	35	+ stripped = line.strip()
	36	+ # Skip WEBVTT header and NOTE blocks
	37	+ if stripped.startswith("WEBVTT") or stripped.startswith("NOTE"):
	38	+ continue
	39	+ # Skip timestamp lines (e.g. 00:00:01.000 --> 00:00:05.000)
	40	+ if re.match(r"\d{2}:\d{2}[:\.][\d.]+ --> \d{2}:\d{2}[:\.][\d.]+", stripped):
	41	+ continue
	42	+ # Skip numeric cue identifiers
	43	+ if re.match(r"^\d+$", stripped):
	44	+ continue
	45	+ # Skip blank lines
	46	+ if not stripped:
	47	+ continue
	48	+ # Strip inline VTT tags like <v Speaker>
	49	+ cleaned = re.sub(r"<[^>]+>", "", stripped).strip()
	50	+ if cleaned and cleaned != prev_line:
	51	+ text_lines.append(cleaned)
	52	+ prev_line = cleaned
	53	+
	54	+ return "\n".join(text_lines)
	55	+
	56	+
	57	+class TeamsRecordingSource(BaseSource):
	58	+ """
	59	+ Fetch Teams meeting recordings and transcripts via the m365 CLI / Graph API.
	60	+
	61	+ Usage:
	62	+ source = TeamsRecordingSource(user_id="me")
	63	+ source.authenticate()
	64	+ recordings = source.list_videos()
	65	+ source.download_all(recordings, Path("./recordings"))
	66	+
	67	+ # Fetch transcript for a specific meeting
	68	+ transcript = source.fetch_transcript(meeting_id)
	69	+ """
	70	+
	71	+ def __init__(self, user_id: str = "me"):
	72	+ self.user_id = user_id
	73	+
	74	+ def authenticate(self) -> bool:
	75	+ """Check if m365 CLI is installed and logged in."""
	76	+ if not shutil.which("m365"):
	77	+ logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
	78	+ return False
	79	+ try:
	80	+ result = _run_m365(["status"], timeout=10)
	81	+ if isinstance(result, dict) and result.get("connectedAs"):
	82	+ return True
	83	+ if isinstance(result, str) and "Logged in" in result:
	84	+ return True
	85	+ logger.error("m365 not logged in. Run: m365 login")
	86	+ return False
	87	+ except (RuntimeError, subprocess.TimeoutExpired):
	88	+ logger.error("m365 not logged in. Run: m365 login")
	89	+ return False
	90	+
	91	+ def list_videos(
	92	+ self,
	93	+ folder_id: Optional[str] = None,
	94	+ folder_path: Optional[str] = None,
	95	+ patterns: Optional[List[str]] = None,
	96	+ ) -> List[SourceFile]:
	97	+ """List Teams meeting recordings available for the user.
	98	+
	99	+ Tries multiple approaches:
	100	+ 1. Graph API onlineMeetings endpoint
	101	+ 2. m365 teams meeting list command
	102	+ 3. Fallback: search chat messages for recording links
	103	+ """
	104	+ files: List[SourceFile] = []
	105	+
	106	+ # Approach 1: Graph API — list online meetings
	107	+ try:
	108	+ result = _run_m365(
	109	+ [
	110	+ "request",
	111	+ "--url",
	112	+ f"https://graph.microsoft.com/v1.0/{self.user_id}/onlineMeetings",
	113	+ "--method",
	114	+ "get",
	115	+ ],
	116	+ timeout=60,
	117	+ )
	118	+ meetings = self._extract_meetings_list(result)
	119	+ for meeting in meetings:
	120	+ recording_files = self._get_meeting_recordings(meeting)
	121	+ files.extend(recording_files)
	122	+
	123	+ if files:
	124	+ logger.info(f"Found {len(files)} recording(s) via Graph API onlineMeetings")
	125	+ return files
	126	+ except RuntimeError as e:
	127	+ logger.debug(f"onlineMeetings endpoint failed: {e}")
	128	+
	129	+ # Approach 2: m365 teams meeting list
	130	+ try:
	131	+ result = _run_m365(["teams", "meeting", "list"], timeout=60)
	132	+ meetings = result if isinstance(result, list) else []
	133	+ for meeting in meetings:
	134	+ recording_files = self._get_meeting_recordings(meeting)
	135	+ files.extend(recording_files)
	136	+
	137	+ if files:
	138	+ logger.info(f"Found {len(files)} recording(s) via m365 teams meeting list")
	139	+ return files
	140	+ except RuntimeError as e:
	141	+ logger.debug(f"teams meeting list failed: {e}")
	142	+
	143	+ # Approach 3: Fallback — search chat messages for recording links
	144	+ try:
	145	+ result = _run_m365(
	146	+ [
	147	+ "request",
	148	+ "--url",
	149	+ (
	150	+ f"https://graph.microsoft.com/v1.0/{self.user_id}/chats"
	151	+ "?$expand=messages($top=50)"
	152	+ "&$filter=chatType eq 'meeting'"
	153	+ "&$top=25"
	154	+ ),
	155	+ "--method",
	156	+ "get",
	157	+ ],
	158	+ timeout=60,
	159	+ )
	160	+ chats = self._extract_value_list(result)
	161	+ for chat in chats:
	162	+ messages = chat.get("messages", [])
	163	+ for msg in messages:
	164	+ body = msg.get("body", {}).get("content", "")
	165	+ if "recording" in body.lower() or ".mp4" in body.lower():
	166	+ topic = chat.get("topic", "Meeting Recording")
	167	+ chat_id = chat.get("id", "")
	168	+ msg_id = msg.get("id", "")
	169	+ files.append(
	170	+ SourceFile(
	171	+ name=f"{topic}.mp4",
	172	+ id=f"{chat_id}:{msg_id}",
	173	+ mime_type="video/mp4",
	174	+ modified_at=msg.get("createdDateTime"),
	175	+ )
	176	+ )
	177	+ if files:
	178	+ logger.info(f"Found {len(files)} recording link(s) in meeting chats")
	179	+ except RuntimeError as e:
	180	+ logger.debug(f"Chat message fallback failed: {e}")
	181	+
	182	+ if not files:
	183	+ logger.warning("No Teams meeting recordings found")
	184	+
	185	+ return files
	186	+
	187	+ def download(self, file: SourceFile, destination: Path) -> Path:
	188	+ """Download a recording via its Graph API download URL."""
	189	+ destination = Path(destination)
	190	+ destination.parent.mkdir(parents=True, exist_ok=True)
	191	+
	192	+ download_url = file.path
	193	+ if download_url:
	194	+ # Use the direct download URL from contentUrl / @microsoft.graph.downloadUrl
	195	+ _run_m365(
	196	+ [
	197	+ "request",
	198	+ "--url",
	199	+ download_url,
	200	+ "--method",
	201	+ "get",
	202	+ "--filePath",
	203	+ str(destination),
	204	+ ],
	205	+ timeout=300,
	206	+ )
	207	+ else:
	208	+ # Try to get download URL from the recording ID
	209	+ meeting_id, _, recording_id = file.id.partition(":")
	210	+ if recording_id:
	211	+ url = (
	212	+ f"https://graph.microsoft.com/v1.0/{self.user_id}"
	213	+ f"/onlineMeetings/{meeting_id}"
	214	+ f"/recordings/{recording_id}/content"
	215	+ )
	216	+ else:
	217	+ url = (
	218	+ f"https://graph.microsoft.com/v1.0/{self.user_id}"
	219	+ f"/onlineMeetings/{meeting_id}/recordings"
	220	+ )
	221	+ # Fetch recording list to find the content URL
	222	+ result = _run_m365(
	223	+ ["request", "--url", url, "--method", "get"],
	224	+ timeout=60,
	225	+ )
	226	+ recordings = self._extract_value_list(result)
	227	+ if recordings:
	228	+ url = (
	229	+ f"https://graph.microsoft.com/v1.0/{self.user_id}"
	230	+ f"/onlineMeetings/{meeting_id}"
	231	+ f"/recordings/{recordings[0].get('id', '')}/content"
	232	+ )
	233	+
	234	+ _run_m365(
	235	+ [
	236	+ "request",
	237	+ "--url",
	238	+ url,
	239	+ "--method",
	240	+ "get",
	241	+ "--filePath",
	242	+ str(destination),
	243	+ ],
	244	+ timeout=300,
	245	+ )
	246	+
	247	+ logger.info(f"Downloaded {file.name} to {destination}")
	248	+ return destination
	249	+
	250	+ def fetch_transcript(self, meeting_id: str) -> Optional[str]:
	251	+ """Fetch the transcript for a Teams meeting.
	252	+
	253	+ Queries the Graph API transcripts endpoint, downloads the transcript
	254	+ content, and converts VTT format to plain text.
	255	+ """
	256	+ try:
	257	+ result = _run_m365(
	258	+ [
	259	+ "request",
	260	+ "--url",
	261	+ (
	262	+ f"https://graph.microsoft.com/v1.0/{self.user_id}"
	263	+ f"/onlineMeetings/{meeting_id}/transcripts"
	264	+ ),
	265	+ "--method",
	266	+ "get",
	267	+ ],
	268	+ timeout=60,
	269	+ )
	270	+ except RuntimeError as e:
	271	+ logger.warning(f"Failed to list transcripts for meeting {meeting_id}: {e}")
	272	+ return None
	273	+
	274	+ transcripts = self._extract_value_list(result)
	275	+ if not transcripts:
	276	+ logger.info(f"No transcripts found for meeting {meeting_id}")
	277	+ return None
	278	+
	279	+ # Download the first available transcript
	280	+ transcript_id = transcripts[0].get("id", "")
	281	+ try:
	282	+ content_result = _run_m365(
	283	+ [
	284	+ "request",
	285	+ "--url",
	286	+ (
	287	+ f"https://graph.microsoft.com/v1.0/{self.user_id}"
	288	+ f"/onlineMeetings/{meeting_id}"
	289	+ f"/transcripts/{transcript_id}/content"
	290	+ ),
	291	+ "--method",
	292	+ "get",
	293	+ "--accept",
	294	+ "text/vtt",
	295	+ ],
	296	+ timeout=60,
	297	+ )
	298	+ except RuntimeError as e:
	299	+ logger.warning(f"Failed to download transcript {transcript_id}: {e}")
	300	+ return None
	301	+
	302	+ # content_result may be raw VTT text or a dict with raw key
	303	+ if isinstance(content_result, dict):
	304	+ raw = content_result.get("raw", "")
	305	+ else:
	306	+ raw = str(content_result)
	307	+
	308	+ if not raw:
	309	+ logger.warning(f"Empty transcript content for meeting {meeting_id}")
	310	+ return None
	311	+
	312	+ return _vtt_to_text(raw)
	313	+
	314	+ # ------------------------------------------------------------------
	315	+ # Internal helpers
	316	+ # ------------------------------------------------------------------
	317	+
	318	+ def _extract_meetings_list(self, result) -> list:
	319	+ """Extract meetings list from Graph API response."""
	320	+ if isinstance(result, list):
	321	+ return result
	322	+ if isinstance(result, dict):
	323	+ return result.get("value", [])
	324	+ return []
	325	+
	326	+ def _extract_value_list(self, result) -> list:
	327	+ """Extract value list from a Graph API response."""
	328	+ if isinstance(result, list):
	329	+ return result
	330	+ if isinstance(result, dict):
	331	+ return result.get("value", [])
	332	+ return []
	333	+
	334	+ def _get_meeting_recordings(self, meeting: dict) -> List[SourceFile]:
	335	+ """Fetch recordings for a single meeting and return SourceFile entries."""
	336	+ meeting_id = meeting.get("id", "")
	337	+ subject = meeting.get("subject", meeting.get("topic", "Teams Meeting"))
	338	+ start_time = meeting.get("startDateTime", meeting.get("createdDateTime"))
	339	+
	340	+ if not meeting_id:
	341	+ return []
	342	+
	343	+ try:
	344	+ result = _run_m365(
	345	+ [
	346	+ "request",
	347	+ "--url",
	348	+ (
	349	+ f"https://graph.microsoft.com/v1.0/{self.user_id}"
	350	+ f"/onlineMeetings/{meeting_id}/recordings"
	351	+ ),
	352	+ "--method",
	353	+ "get",
	354	+ ],
	355	+ timeout=60,
	356	+ )
	357	+ except RuntimeError:
	358	+ return []
	359	+
	360	+ recordings = self._extract_value_list(result)
	361	+ files: List[SourceFile] = []
	362	+ for rec in recordings:
	363	+ rec_id = rec.get("id", "")
	364	+ download_url = rec.get("content.downloadUrl", rec.get("contentUrl"))
	365	+ files.append(
	366	+ SourceFile(
	367	+ name=f"{subject}.mp4",
	368	+ id=f"{meeting_id}:{rec_id}",
	369	+ mime_type="video/mp4",
	370	+ modified_at=start_time,
	371	+ path=download_url,
	372	+ )
	373	+ )
	374	+
	375	+ return files

	--- a/video_processor/sources/teams_recording_source.py
	+++ b/video_processor/sources/teams_recording_source.py
	@@ -0,0 +1,375 @@

	--- a/video_processor/sources/teams_recording_source.py
	+++ b/video_processor/sources/teams_recording_source.py
	@@ -0,0 +1,375 @@
1	"""Microsoft Teams meeting recording source using the m365 CLI.
2
3	Fetches Teams meeting recordings and transcripts via the Microsoft Graph API
4	through the `m365` CLI tool.
5
6	Requires: npm install -g @pnp/cli-microsoft365
7	Auth: m365 login (interactive)
8	Docs: https://pnp.github.io/cli-microsoft365/
9	"""
10
11	import logging
12	import re
13	import shutil
14	import subprocess
15	from pathlib import Path
16	from typing import List, Optional
17
18	from video_processor.sources.base import BaseSource, SourceFile
19	from video_processor.sources.m365_source import _run_m365
20
21	logger = logging.getLogger(__name__)
22
23
24	def _vtt_to_text(vtt: str) -> str:
25	"""Strip VTT timing metadata and return plain text.
26
27	Removes WEBVTT headers, timestamps (00:00:00.000 --> 00:00:05.000),
28	cue identifiers, and deduplicates consecutive identical lines.
29	"""
30	lines = vtt.splitlines()
31	text_lines: list[str] = []
32	prev_line = ""
33
34	for line in lines:
35	stripped = line.strip()
36	# Skip WEBVTT header and NOTE blocks
37	if stripped.startswith("WEBVTT") or stripped.startswith("NOTE"):
38	continue
39	# Skip timestamp lines (e.g. 00:00:01.000 --> 00:00:05.000)
40	if re.match(r"\d{2}:\d{2}[:\.][\d.]+ --> \d{2}:\d{2}[:\.][\d.]+", stripped):
41	continue
42	# Skip numeric cue identifiers
43	if re.match(r"^\d+$", stripped):
44	continue
45	# Skip blank lines
46	if not stripped:
47	continue
48	# Strip inline VTT tags like <v Speaker>
49	cleaned = re.sub(r"<[^>]+>", "", stripped).strip()
50	if cleaned and cleaned != prev_line:
51	text_lines.append(cleaned)
52	prev_line = cleaned
53
54	return "\n".join(text_lines)
55
56
class TeamsRecordingSource(BaseSource):
    """
    Fetch Teams meeting recordings and transcripts via the m365 CLI / Graph API.

    Usage:
        source = TeamsRecordingSource(user_id="me")
        source.authenticate()
        recordings = source.list_videos()
        source.download_all(recordings, Path("./recordings"))

        # Fetch transcript for a specific meeting
        transcript = source.fetch_transcript(meeting_id)
    """

    def __init__(self, user_id: str = "me"):
        """Remember which Graph user the requests are issued for.

        ``user_id`` may be ``"me"`` (default), an explicit ``users/<id>`` path
        segment, or a bare user id / UPN (prefixed with ``users/`` when URLs
        are built).
        """
        self.user_id = user_id

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in."""
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
            if isinstance(result, dict) and result.get("connectedAs"):
                return True
            if isinstance(result, str) and "Logged in" in result:
                return True
            logger.error("m365 not logged in. Run: m365 login")
            return False
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Teams meeting recordings available for the user.

        Tries multiple approaches, returning as soon as one yields results:
        1. Graph API onlineMeetings endpoint
        2. m365 teams meeting list command
        3. Fallback: search chat messages for recording links

        ``folder_id``, ``folder_path`` and ``patterns`` are accepted for
        signature compatibility but are not used by this source.
        """
        files: List[SourceFile] = []

        # Approach 1: Graph API — list online meetings
        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    self._graph_user_url("/onlineMeetings"),
                    "--method",
                    "get",
                ],
                timeout=60,
            )
            for meeting in self._extract_meetings_list(result):
                files.extend(self._get_meeting_recordings(meeting))

            if files:
                logger.info(f"Found {len(files)} recording(s) via Graph API onlineMeetings")
                return files
        except RuntimeError as e:
            logger.debug(f"onlineMeetings endpoint failed: {e}")

        # Approach 2: m365 teams meeting list
        try:
            result = _run_m365(["teams", "meeting", "list"], timeout=60)
            meetings = result if isinstance(result, list) else []
            for meeting in meetings:
                files.extend(self._get_meeting_recordings(meeting))

            if files:
                logger.info(f"Found {len(files)} recording(s) via m365 teams meeting list")
                return files
        except RuntimeError as e:
            logger.debug(f"teams meeting list failed: {e}")

        # Approach 3: Fallback — search chat messages for recording links
        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    (
                        self._graph_user_url("/chats")
                        + "?$expand=messages($top=50)"
                        "&$filter=chatType eq 'meeting'"
                        "&$top=25"
                    ),
                    "--method",
                    "get",
                ],
                timeout=60,
            )
            for chat in self._extract_value_list(result):
                # ``or`` guards against keys that are present but null.
                topic = chat.get("topic") or "Meeting Recording"
                chat_id = chat.get("id", "")
                for msg in chat.get("messages") or []:
                    body = ((msg.get("body") or {}).get("content") or "").lower()
                    if "recording" in body or ".mp4" in body:
                        files.append(
                            SourceFile(
                                name=f"{topic}.mp4",
                                id=f"{chat_id}:{msg.get('id', '')}",
                                mime_type="video/mp4",
                                modified_at=msg.get("createdDateTime"),
                            )
                        )
            if files:
                logger.info(f"Found {len(files)} recording link(s) in meeting chats")
        except RuntimeError as e:
            logger.debug(f"Chat message fallback failed: {e}")

        if not files:
            logger.warning("No Teams meeting recordings found")

        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a recording to ``destination`` and return that path.

        Uses ``file.path`` as the download URL when set; otherwise the content
        URL is derived from ``file.id`` (``"<meeting_id>:<recording_id>"``).

        Raises
        ------
        RuntimeError
            If the m365 request fails or no recording can be resolved.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        url = file.path or self._resolve_content_url(file)
        _run_m365(
            [
                "request",
                "--url",
                url,
                "--method",
                "get",
                "--filePath",
                str(destination),
            ],
            timeout=300,
        )

        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def fetch_transcript(self, meeting_id: str) -> Optional[str]:
        """Fetch the transcript for a Teams meeting.

        Queries the Graph API transcripts endpoint, downloads the first
        transcript's content, and converts VTT format to plain text.
        Returns None when no transcript is available or a request fails.
        """
        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    self._graph_user_url(f"/onlineMeetings/{meeting_id}/transcripts"),
                    "--method",
                    "get",
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.warning(f"Failed to list transcripts for meeting {meeting_id}: {e}")
            return None

        transcripts = self._extract_value_list(result)
        if not transcripts:
            logger.info(f"No transcripts found for meeting {meeting_id}")
            return None

        # Download the first available transcript
        transcript_id = transcripts[0].get("id", "")
        if not transcript_id:
            # Without an id the content URL below would be malformed.
            logger.warning(f"Transcript entry for meeting {meeting_id} has no id")
            return None

        try:
            content_result = _run_m365(
                [
                    "request",
                    "--url",
                    self._graph_user_url(
                        f"/onlineMeetings/{meeting_id}/transcripts/{transcript_id}/content"
                    ),
                    "--method",
                    "get",
                    "--accept",
                    "text/vtt",
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.warning(f"Failed to download transcript {transcript_id}: {e}")
            return None

        # content_result may be raw VTT text or a dict with raw key
        if isinstance(content_result, dict):
            raw = content_result.get("raw", "")
        elif content_result is None:
            # str(None) would otherwise be returned as the transcript "None".
            raw = ""
        else:
            raw = str(content_result)

        if not raw:
            logger.warning(f"Empty transcript content for meeting {meeting_id}")
            return None

        return _vtt_to_text(raw)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _graph_user_url(self, suffix: str = "") -> str:
        """Build a Graph v1.0 URL rooted at the configured user.

        ``"me"`` and values already starting with ``users/`` are used as-is;
        any other value is treated as a user id / UPN and prefixed with
        ``users/``.
        """
        segment = self.user_id
        if segment != "me" and not segment.startswith("users/"):
            segment = f"users/{segment}"
        return f"https://graph.microsoft.com/v1.0/{segment}{suffix}"

    def _resolve_content_url(self, file: SourceFile) -> str:
        """Derive the recording content URL from ``file.id``.

        When the id carries no recording part, the meeting's recordings are
        listed and the first one is used. Raises RuntimeError when no
        recording id can be determined, so a JSON listing is never saved in
        place of the video.
        """
        meeting_id, _, recording_id = file.id.partition(":")
        if not recording_id:
            listing = _run_m365(
                [
                    "request",
                    "--url",
                    self._graph_user_url(f"/onlineMeetings/{meeting_id}/recordings"),
                    "--method",
                    "get",
                ],
                timeout=60,
            )
            recordings = self._extract_value_list(listing)
            recording_id = recordings[0].get("id", "") if recordings else ""
        if not recording_id:
            raise RuntimeError(f"No recording found for Teams meeting {meeting_id}")
        return self._graph_user_url(
            f"/onlineMeetings/{meeting_id}/recordings/{recording_id}/content"
        )

    def _extract_meetings_list(self, result) -> list:
        """Extract meetings list from Graph API response."""
        return self._extract_value_list(result)

    def _extract_value_list(self, result) -> list:
        """Extract value list from a Graph API response.

        Lists are returned as-is, dicts yield their ``"value"`` entry
        (``[]`` if missing), anything else yields ``[]``.
        """
        if isinstance(result, list):
            return result
        if isinstance(result, dict):
            return result.get("value", [])
        return []

    def _get_meeting_recordings(self, meeting: dict) -> List[SourceFile]:
        """Fetch recordings for a single meeting and return SourceFile entries.

        Returns an empty list when the meeting has no id or the recordings
        request fails.
        """
        meeting_id = meeting.get("id", "")
        if not meeting_id:
            return []

        # ``or`` also covers keys present with a null/empty value.
        subject = meeting.get("subject") or meeting.get("topic") or "Teams Meeting"
        start_time = meeting.get("startDateTime") or meeting.get("createdDateTime")

        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    self._graph_user_url(f"/onlineMeetings/{meeting_id}/recordings"),
                    "--method",
                    "get",
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.debug(f"Failed to list recordings for meeting {meeting_id}: {e}")
            return []

        files: List[SourceFile] = []
        for rec in self._extract_value_list(result):
            rec_id = rec.get("id", "")
            # Graph's callRecording resource exposes ``recordingContentUrl``;
            # the other keys are kept for compatibility.
            download_url = (
                rec.get("content.downloadUrl")
                or rec.get("recordingContentUrl")
                or rec.get("contentUrl")
            )
            files.append(
                SourceFile(
                    name=f"{subject}.mp4",
                    id=f"{meeting_id}:{rec_id}",
                    mime_type="video/mp4",
                    modified_at=start_time,
                    path=download_url,
                )
            )

        return files

A video_processor/sources/zoom_source.py

+399

		--- a/video_processor/sources/zoom_source.py
		+++ b/video_processor/sources/zoom_source.py
		@@ -0,0 +1,399 @@
	1	+"""Zoom cloud recordings source integration with OAuth support."""
	2	+
	3	+import base64
	4	+import hashlib
	5	+import json
	6	+import logging
	7	+import os
	8	+import secrets
	9	+import time
	10	+import webbrowser
	11	+from pathlib import Path
	12	+from typing import Dict, List, Optional
	13	+
	14	+import requests
	15	+
	16	+from video_processor.sources.base import BaseSource, SourceFile
	17	+
	18	+logger = logging.getLogger(__name__)
	19	+
	20	+_TOKEN_PATH = Path.home() / ".planopticon" / "zoom_token.json"
	21	+_BASE_URL = "https://api.zoom.us/v2"
	22	+_OAUTH_BASE = "https://zoom.us/oauth"
	23	+
	24	+# Map Zoom file_type values to MIME types
	25	+_MIME_TYPES = {
	26	+ "MP4": "video/mp4",
	27	+ "M4A": "audio/mp4",
	28	+ "CHAT": "text/plain",
	29	+ "TRANSCRIPT": "text/vtt",
	30	+ "CSV": "text/csv",
	31	+ "TIMELINE": "application/json",
	32	+}
	33	+
	34	+
	35	+class ZoomSource(BaseSource):
	36	+ """
	37	+ Zoom cloud recordings source with OAuth2 support.
	38	+
	39	+ Auth methods (tried in order):
	40	+ 1. Saved token: Load from token_path, refresh if expired
	41	+ 2. Server-to-Server OAuth: Uses account_id with client credentials
	42	+ 3. OAuth2 Authorization Code with PKCE: Interactive browser flow
	43	+ """
	44	+
	45	+ def __init__(
	46	+ self,
	47	+ client_id: Optional[str] = None,
	48	+ client_secret: Optional[str] = None,
	49	+ account_id: Optional[str] = None,
	50	+ token_path: Optional[Path] = None,
	51	+ ):
	52	+ """
	53	+ Initialize Zoom source.
	54	+
	55	+ Parameters
	56	+ ----------
	57	+ client_id : str, optional
	58	+ Zoom OAuth app client ID. Falls back to ZOOM_CLIENT_ID env var.
	59	+ client_secret : str, optional
	60	+ Zoom OAuth app client secret. Falls back to ZOOM_CLIENT_SECRET env var.
	61	+ account_id : str, optional
	62	+ Zoom account ID for Server-to-Server OAuth. Falls back to ZOOM_ACCOUNT_ID env var.
	63	+ token_path : Path, optional
	64	+ Where to store/load OAuth tokens.
	65	+ """
	66	+ self.client_id = client_id or os.environ.get("ZOOM_CLIENT_ID")
	67	+ self.client_secret = client_secret or os.environ.get("ZOOM_CLIENT_SECRET")
	68	+ self.account_id = account_id or os.environ.get("ZOOM_ACCOUNT_ID")
	69	+ self.token_path = token_path or _TOKEN_PATH
	70	+ self._access_token: Optional[str] = None
	71	+ self._token_data: Optional[Dict] = None
	72	+
	73	+ def authenticate(self) -> bool:
	74	+ """Authenticate with Zoom API."""
	75	+ # Try 1: Load saved token
	76	+ if self.token_path.exists():
	77	+ if self._auth_saved_token():
	78	+ return True
	79	+
	80	+ # Try 2: Server-to-Server OAuth (if account_id is set)
	81	+ if self.account_id:
	82	+ return self._auth_server_to_server()
	83	+
	84	+ # Try 3: OAuth2 Authorization Code flow with PKCE
	85	+ return self._auth_oauth_pkce()
	86	+
	87	+ def _auth_saved_token(self) -> bool:
	88	+ """Authenticate using a saved OAuth token, refreshing if expired."""
	89	+ try:
	90	+ data = json.loads(self.token_path.read_text())
	91	+ expires_at = data.get("expires_at", 0)
	92	+
	93	+ if time.time() < expires_at:
	94	+ # Token still valid
	95	+ self._access_token = data["access_token"]
	96	+ self._token_data = data
	97	+ logger.info("Authenticated with Zoom via saved token")
	98	+ return True
	99	+
	100	+ # Token expired, try to refresh
	101	+ if data.get("refresh_token"):
	102	+ return self._refresh_token()
	103	+
	104	+ # Server-to-Server tokens don't have refresh tokens;
	105	+ # fall through to re-authenticate
	106	+ return False
	107	+ except Exception:
	108	+ return False
	109	+
	110	+ def _auth_server_to_server(self) -> bool:
	111	+ """Authenticate using Server-to-Server OAuth (account credentials)."""
	112	+ if not self.client_id or not self.client_secret:
	113	+ logger.error(
	114	+ "Zoom client_id and client_secret required for Server-to-Server OAuth. "
	115	+ "Set ZOOM_CLIENT_ID and ZOOM_CLIENT_SECRET env vars."
	116	+ )
	117	+ return False
	118	+
	119	+ try:
	120	+ resp = requests.post(
	121	+ f"{_OAUTH_BASE}/token",
	122	+ params={
	123	+ "grant_type": "account_credentials",
	124	+ "account_id": self.account_id,
	125	+ },
	126	+ auth=(self.client_id, self.client_secret),
	127	+ timeout=30,
	128	+ )
	129	+ resp.raise_for_status()
	130	+ token_data = resp.json()
	131	+
	132	+ self._access_token = token_data["access_token"]
	133	+ self._token_data = {
	134	+ "access_token": token_data["access_token"],
	135	+ "expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
	136	+ "token_type": token_data.get("token_type", "bearer"),
	137	+ }
	138	+
	139	+ self._save_token(self._token_data)
	140	+ logger.info("Authenticated with Zoom via Server-to-Server OAuth")
	141	+ return True
	142	+ except Exception as e:
	143	+ logger.error(f"Zoom Server-to-Server OAuth failed: {e}")
	144	+ return False
	145	+
	146	+ def _auth_oauth_pkce(self) -> bool:
	147	+ """Run OAuth2 Authorization Code flow with PKCE."""
	148	+ if not self.client_id:
	149	+ logger.error("Zoom client_id required for OAuth. Set ZOOM_CLIENT_ID env var.")
	150	+ return False
	151	+
	152	+ try:
	153	+ # Generate PKCE code verifier and challenge
	154	+ code_verifier = secrets.token_urlsafe(64)
	155	+ code_challenge = (
	156	+ base64.urlsafe_b64encode(hashlib.sha256(code_verifier.encode("ascii")).digest())
	157	+ .rstrip(b"=")
	158	+ .decode("ascii")
	159	+ )
	160	+
	161	+ authorize_url = (
	162	+ f"{_OAUTH_BASE}/authorize"
	163	+ f"?response_type=code"
	164	+ f"&client_id={self.client_id}"
	165	+ f"&redirect_uri=urn:ietf:wg:oauth:2.0:oob"
	166	+ f"&code_challenge={code_challenge}"
	167	+ f"&code_challenge_method=S256"
	168	+ )
	169	+
	170	+ print(f"\nOpen this URL to authorize PlanOpticon:\n{authorize_url}\n")
	171	+
	172	+ try:
	173	+ webbrowser.open(authorize_url)
	174	+ except Exception:
	175	+ pass
	176	+
	177	+ auth_code = input("Enter the authorization code: ").strip()
	178	+
	179	+ # Exchange authorization code for tokens
	180	+ payload = {
	181	+ "grant_type": "authorization_code",
	182	+ "code": auth_code,
	183	+ "redirect_uri": "urn:ietf:wg:oauth:2.0:oob",
	184	+ "code_verifier": code_verifier,
	185	+ }
	186	+
	187	+ resp = requests.post(
	188	+ f"{_OAUTH_BASE}/token",
	189	+ data=payload,
	190	+ auth=(self.client_id, self.client_secret or ""),
	191	+ timeout=30,
	192	+ )
	193	+ resp.raise_for_status()
	194	+ token_data = resp.json()
	195	+
	196	+ self._access_token = token_data["access_token"]
	197	+ self._token_data = {
	198	+ "access_token": token_data["access_token"],
	199	+ "refresh_token": token_data.get("refresh_token"),
	200	+ "expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
	201	+ "token_type": token_data.get("token_type", "bearer"),
	202	+ "client_id": self.client_id,
	203	+ "client_secret": self.client_secret or "",
	204	+ }
	205	+
	206	+ self._save_token(self._token_data)
	207	+ logger.info("Authenticated with Zoom via OAuth PKCE")
	208	+ return True
	209	+ except Exception as e:
	210	+ logger.error(f"Zoom OAuth PKCE failed: {e}")
	211	+ return False
	212	+
	213	+ def _refresh_token(self) -> bool:
	214	+ """Refresh an expired OAuth token."""
	215	+ try:
	216	+ data = json.loads(self.token_path.read_text())
	217	+ refresh_token = data.get("refresh_token")
	218	+ client_id = data.get("client_id") or self.client_id
	219	+ client_secret = data.get("client_secret") or self.client_secret
	220	+
	221	+ if not refresh_token or not client_id:
	222	+ return False
	223	+
	224	+ resp = requests.post(
	225	+ f"{_OAUTH_BASE}/token",
	226	+ data={
	227	+ "grant_type": "refresh_token",
	228	+ "refresh_token": refresh_token,
	229	+ },
	230	+ auth=(client_id, client_secret or ""),
	231	+ timeout=30,
	232	+ )
	233	+ resp.raise_for_status()
	234	+ token_data = resp.json()
	235	+
	236	+ self._access_token = token_data["access_token"]
	237	+ self._token_data = {
	238	+ "access_token": token_data["access_token"],
	239	+ "refresh_token": token_data.get("refresh_token", refresh_token),
	240	+ "expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
	241	+ "token_type": token_data.get("token_type", "bearer"),
	242	+ "client_id": client_id,
	243	+ "client_secret": client_secret or "",
	244	+ }
	245	+
	246	+ self._save_token(self._token_data)
	247	+ logger.info("Refreshed Zoom OAuth token")
	248	+ return True
	249	+ except Exception as e:
	250	+ logger.error(f"Zoom token refresh failed: {e}")
	251	+ return False
	252	+
	253	+ def _save_token(self, data: Dict) -> None:
	254	+ """Save token data to disk."""
	255	+ self.token_path.parent.mkdir(parents=True, exist_ok=True)
	256	+ self.token_path.write_text(json.dumps(data))
	257	+ logger.info(f"OAuth token saved to {self.token_path}")
	258	+
	259	+ def _api_get(self, endpoint: str, params: Optional[Dict] = None) -> requests.Response:
	260	+ """Make an authenticated GET request to the Zoom API."""
	261	+ if not self._access_token:
	262	+ raise RuntimeError("Not authenticated. Call authenticate() first.")
	263	+
	264	+ url = f"{_BASE_URL}/{endpoint.lstrip('/')}"
	265	+ resp = requests.get(
	266	+ url,
	267	+ headers={"Authorization": f"Bearer {self._access_token}"},
	268	+ params=params,
	269	+ timeout=30,
	270	+ )
	271	+ resp.raise_for_status()
	272	+ return resp
	273	+
	274	+ def list_videos(
	275	+ self,
	276	+ folder_id: Optional[str] = None,
	277	+ folder_path: Optional[str] = None,
	278	+ patterns: Optional[List[str]] = None,
	279	+ ) -> List[SourceFile]:
	280	+ """List video files from Zoom cloud recordings."""
	281	+ if not self._access_token:
	282	+ raise RuntimeError("Not authenticated. Call authenticate() first.")
	283	+
	284	+ files: List[SourceFile] = []
	285	+ next_page_token = ""
	286	+
	287	+ while True:
	288	+ params: Dict = {}
	289	+ if next_page_token:
	290	+ params["next_page_token"] = next_page_token
	291	+
	292	+ resp = self._api_get("users/me/recordings", params=params)
	293	+ data = resp.json()
	294	+
	295	+ for meeting in data.get("meetings", []):
	296	+ meeting_id = str(meeting.get("id", ""))
	297	+ topic = meeting.get("topic", "Untitled Meeting")
	298	+ start_time = meeting.get("start_time")
	299	+
	300	+ for rec_file in meeting.get("recording_files", []):
	301	+ file_type = rec_file.get("file_type", "")
	302	+ mime_type = _MIME_TYPES.get(file_type)
	303	+
	304	+ # Build a descriptive name
	305	+ file_ext = rec_file.get("file_extension", file_type).lower()
	306	+ file_name = f"{topic}.{file_ext}"
	307	+
	308	+ if patterns:
	309	+ if not any(file_name.endswith(p.replace("*", "")) for p in patterns):
	310	+ continue
	311	+
	312	+ files.append(
	313	+ SourceFile(
	314	+ name=file_name,
	315	+ id=meeting_id,
	316	+ size_bytes=rec_file.get("file_size"),
	317	+ mime_type=mime_type,
	318	+ modified_at=start_time,
	319	+ path=rec_file.get("download_url"),
	320	+ )
	321	+ )
	322	+
	323	+ next_page_token = data.get("next_page_token", "")
	324	+ if not next_page_token:
	325	+ break
	326	+
	327	+ logger.info(f"Found {len(files)} recordings in Zoom")
	328	+ return files
	329	+
	330	+ def download(self, file: SourceFile, destination: Path) -> Path:
	331	+ """Download a recording file from Zoom."""
	332	+ if not self._access_token:
	333	+ raise RuntimeError("Not authenticated. Call authenticate() first.")
	334	+
	335	+ destination = Path(destination)
	336	+ destination.parent.mkdir(parents=True, exist_ok=True)
	337	+
	338	+ download_url = file.path
	339	+ if not download_url:
	340	+ raise ValueError(f"No download URL for file: {file.name}")
	341	+
	342	+ resp = requests.get(
	343	+ download_url,
	344	+ headers={"Authorization": f"Bearer {self._access_token}"},
	345	+ stream=True,
	346	+ timeout=60,
	347	+ )
	348	+ resp.raise_for_status()
	349	+
	350	+ with open(destination, "wb") as f:
	351	+ for chunk in resp.iter_content(chunk_size=8192):
	352	+ f.write(chunk)
	353	+
	354	+ logger.info(f"Downloaded {file.name} to {destination}")
	355	+ return destination
	356	+
	357	+ def fetch_transcript(self, meeting_id: str) -> Optional[str]:
	358	+ """
	359	+ Fetch the transcript (VTT) for a Zoom meeting recording.
	360	+
	361	+ Looks for transcript files in the recording's file list and downloads
	362	+ the content as text.
	363	+
	364	+ Parameters
	365	+ ----------
	366	+ meeting_id : str
	367	+ The Zoom meeting ID.
	368	+
	369	+ Returns
	370	+ -------
	371	+ str or None
	372	+ Transcript text if available, None otherwise.
	373	+ """
	374	+ if not self._access_token:
	375	+ raise RuntimeError("Not authenticated. Call authenticate() first.")
	376	+
	377	+ try:
	378	+ resp = self._api_get(f"meetings/{meeting_id}/recordings")
	379	+ data = resp.json()
	380	+
	381	+ for rec_file in data.get("recording_files", []):
	382	+ file_type = rec_file.get("file_type", "")
	383	+ if file_type == "TRANSCRIPT":
	384	+ download_url = rec_file.get("download_url")
	385	+ if download_url:
	386	+ dl_resp = requests.get(
	387	+ download_url,
	388	+ headers={"Authorization": f"Bearer {self._access_token}"},
	389	+ timeout=30,
	390	+ )
	391	+ dl_resp.raise_for_status()
	392	+ logger.info(f"Fetched transcript for meeting {meeting_id}")
	393	+ return dl_resp.text
	394	+
	395	+ logger.info(f"No transcript found for meeting {meeting_id}")
	396	+ return None
	397	+ except Exception as e:
	398	+ logger.error(f"Failed to fetch transcript for meeting {meeting_id}: {e}")
	399	+ return None

	--- a/video_processor/sources/zoom_source.py
	+++ b/video_processor/sources/zoom_source.py
	@@ -0,0 +1,399 @@

	--- a/video_processor/sources/zoom_source.py
	+++ b/video_processor/sources/zoom_source.py
	@@ -0,0 +1,399 @@
1	"""Zoom cloud recordings source integration with OAuth support."""
2
3	import base64
4	import hashlib
5	import json
6	import logging
7	import os
8	import secrets
9	import time
10	import webbrowser
11	from pathlib import Path
12	from typing import Dict, List, Optional
13
14	import requests
15
16	from video_processor.sources.base import BaseSource, SourceFile
17
18	logger = logging.getLogger(__name__)
19
20	_TOKEN_PATH = Path.home() / ".planopticon" / "zoom_token.json"
21	_BASE_URL = "https://api.zoom.us/v2"
22	_OAUTH_BASE = "https://zoom.us/oauth"
23
24	# Map Zoom file_type values to MIME types
25	_MIME_TYPES = {
26	"MP4": "video/mp4",
27	"M4A": "audio/mp4",
28	"CHAT": "text/plain",
29	"TRANSCRIPT": "text/vtt",
30	"CSV": "text/csv",
31	"TIMELINE": "application/json",
32	}
33
34
35	class ZoomSource(BaseSource):
36	"""
37	Zoom cloud recordings source with OAuth2 support.
38
39	Auth methods (tried in order):
40	1. Saved token: Load from token_path, refresh if expired
41	2. Server-to-Server OAuth: Uses account_id with client credentials
42	3. OAuth2 Authorization Code with PKCE: Interactive browser flow
43	"""
44
45	def __init__(
46	self,
47	client_id: Optional[str] = None,
48	client_secret: Optional[str] = None,
49	account_id: Optional[str] = None,
50	token_path: Optional[Path] = None,
51	):
52	"""
53	Initialize Zoom source.
54
55	Parameters
56	----------
57	client_id : str, optional
58	Zoom OAuth app client ID. Falls back to ZOOM_CLIENT_ID env var.
59	client_secret : str, optional
60	Zoom OAuth app client secret. Falls back to ZOOM_CLIENT_SECRET env var.
61	account_id : str, optional
62	Zoom account ID for Server-to-Server OAuth. Falls back to ZOOM_ACCOUNT_ID env var.
63	token_path : Path, optional
64	Where to store/load OAuth tokens.
65	"""
66	self.client_id = client_id or os.environ.get("ZOOM_CLIENT_ID")
67	self.client_secret = client_secret or os.environ.get("ZOOM_CLIENT_SECRET")
68	self.account_id = account_id or os.environ.get("ZOOM_ACCOUNT_ID")
69	self.token_path = token_path or _TOKEN_PATH
70	self._access_token: Optional[str] = None
71	self._token_data: Optional[Dict] = None
72
73	def authenticate(self) -> bool:
74	"""Authenticate with Zoom API."""
75	# Try 1: Load saved token
76	if self.token_path.exists():
77	if self._auth_saved_token():
78	return True
79
80	# Try 2: Server-to-Server OAuth (if account_id is set)
81	if self.account_id:
82	return self._auth_server_to_server()
83
84	# Try 3: OAuth2 Authorization Code flow with PKCE
85	return self._auth_oauth_pkce()
86
87	def _auth_saved_token(self) -> bool:
88	"""Authenticate using a saved OAuth token, refreshing if expired."""
89	try:
90	data = json.loads(self.token_path.read_text())
91	expires_at = data.get("expires_at", 0)
92
93	if time.time() < expires_at:
94	# Token still valid
95	self._access_token = data["access_token"]
96	self._token_data = data
97	logger.info("Authenticated with Zoom via saved token")
98	return True
99
100	# Token expired, try to refresh
101	if data.get("refresh_token"):
102	return self._refresh_token()
103
104	# Server-to-Server tokens don't have refresh tokens;
105	# fall through to re-authenticate
106	return False
107	except Exception:
108	return False
109
110	def _auth_server_to_server(self) -> bool:
111	"""Authenticate using Server-to-Server OAuth (account credentials)."""
112	if not self.client_id or not self.client_secret:
113	logger.error(
114	"Zoom client_id and client_secret required for Server-to-Server OAuth. "
115	"Set ZOOM_CLIENT_ID and ZOOM_CLIENT_SECRET env vars."
116	)
117	return False
118
119	try:
120	resp = requests.post(
121	f"{_OAUTH_BASE}/token",
122	params={
123	"grant_type": "account_credentials",
124	"account_id": self.account_id,
125	},
126	auth=(self.client_id, self.client_secret),
127	timeout=30,
128	)
129	resp.raise_for_status()
130	token_data = resp.json()
131
132	self._access_token = token_data["access_token"]
133	self._token_data = {
134	"access_token": token_data["access_token"],
135	"expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
136	"token_type": token_data.get("token_type", "bearer"),
137	}
138
139	self._save_token(self._token_data)
140	logger.info("Authenticated with Zoom via Server-to-Server OAuth")
141	return True
142	except Exception as e:
143	logger.error(f"Zoom Server-to-Server OAuth failed: {e}")
144	return False
145
146	def _auth_oauth_pkce(self) -> bool:
147	"""Run OAuth2 Authorization Code flow with PKCE."""
148	if not self.client_id:
149	logger.error("Zoom client_id required for OAuth. Set ZOOM_CLIENT_ID env var.")
150	return False
151
152	try:
153	# Generate PKCE code verifier and challenge
154	code_verifier = secrets.token_urlsafe(64)
155	code_challenge = (
156	base64.urlsafe_b64encode(hashlib.sha256(code_verifier.encode("ascii")).digest())
157	.rstrip(b"=")
158	.decode("ascii")
159	)
160
161	authorize_url = (
162	f"{_OAUTH_BASE}/authorize"
163	f"?response_type=code"
164	f"&client_id={self.client_id}"
165	f"&redirect_uri=urn:ietf:wg:oauth:2.0:oob"
166	f"&code_challenge={code_challenge}"
167	f"&code_challenge_method=S256"
168	)
169
170	print(f"\nOpen this URL to authorize PlanOpticon:\n{authorize_url}\n")
171
172	try:
173	webbrowser.open(authorize_url)
174	except Exception:
175	pass
176
177	auth_code = input("Enter the authorization code: ").strip()
178
179	# Exchange authorization code for tokens
180	payload = {
181	"grant_type": "authorization_code",
182	"code": auth_code,
183	"redirect_uri": "urn:ietf:wg:oauth:2.0:oob",
184	"code_verifier": code_verifier,
185	}
186
187	resp = requests.post(
188	f"{_OAUTH_BASE}/token",
189	data=payload,
190	auth=(self.client_id, self.client_secret or ""),
191	timeout=30,
192	)
193	resp.raise_for_status()
194	token_data = resp.json()
195
196	self._access_token = token_data["access_token"]
197	self._token_data = {
198	"access_token": token_data["access_token"],
199	"refresh_token": token_data.get("refresh_token"),
200	"expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
201	"token_type": token_data.get("token_type", "bearer"),
202	"client_id": self.client_id,
203	"client_secret": self.client_secret or "",
204	}
205
206	self._save_token(self._token_data)
207	logger.info("Authenticated with Zoom via OAuth PKCE")
208	return True
209	except Exception as e:
210	logger.error(f"Zoom OAuth PKCE failed: {e}")
211	return False
212
def _refresh_token(self) -> bool:
    """Trade the saved refresh token for a new access token.

    Reads the token file at ``self.token_path``, posts a ``refresh_token``
    grant to the OAuth token endpoint, then updates the in-memory token
    state and writes the new token data back to disk via ``_save_token``.
    Client credentials stored in the token file take precedence over the
    ones held on the instance.

    Returns
    -------
    bool
        True when a new access token was obtained and saved. False when the
        saved data has no refresh token or client id, or when any step
        (reading the file, the HTTP call, parsing, saving) raises.
    """
    try:
        saved = json.loads(self.token_path.read_text())
        previous_refresh = saved.get("refresh_token")
        app_id = saved.get("client_id") or self.client_id
        app_secret = saved.get("client_secret") or self.client_secret

        # Both pieces are required to build a refresh request.
        if not (previous_refresh and app_id):
            return False

        form = {
            "grant_type": "refresh_token",
            "refresh_token": previous_refresh,
        }
        response = requests.post(
            f"{_OAUTH_BASE}/token",
            data=form,
            auth=(app_id, app_secret or ""),
            timeout=30,
        )
        response.raise_for_status()
        fresh = response.json()

        new_access = fresh["access_token"]
        self._access_token = new_access

        # Keep the old refresh token when the server does not send a new one.
        # The 60 second margin makes the token count as expired slightly early.
        self._token_data = {
            "access_token": new_access,
            "refresh_token": fresh.get("refresh_token", previous_refresh),
            "expires_at": time.time() + fresh.get("expires_in", 3600) - 60,
            "token_type": fresh.get("token_type", "bearer"),
            "client_id": app_id,
            "client_secret": app_secret or "",
        }

        self._save_token(self._token_data)
        logger.info("Refreshed Zoom OAuth token")
        return True
    except Exception as e:
        logger.error(f"Zoom token refresh failed: {e}")
        return False
252
def _save_token(self, data: Dict) -> None:
    """Persist token data to ``self.token_path`` as JSON, readable by owner only.

    The payload written by the callers in this class contains the access
    token, the refresh token and the client secret, so the file is created
    with mode 0o600 instead of the process default. A file that already
    exists keeps its old mode on open, so it is tightened explicitly.

    Parameters
    ----------
    data : Dict
        JSON-serializable token data.

    Raises
    ------
    OSError
        If the directory or file cannot be created or written.
    TypeError
        If ``data`` is not JSON-serializable.
    """
    import os

    # Serialize first so a serialization error cannot truncate an existing file.
    serialized = json.dumps(data)

    self.token_path.parent.mkdir(parents=True, exist_ok=True)

    # The mode argument only applies when the file is newly created.
    fd = os.open(self.token_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w", encoding="utf-8") as fh:
        try:
            # Restrict a pre-existing file before any secret is written to it.
            os.chmod(self.token_path, 0o600)
        except OSError as e:
            # Best effort: some filesystems/platforms do not support chmod.
            logger.warning(f"Could not restrict permissions on {self.token_path}: {e}")
        fh.write(serialized)

    logger.info(f"OAuth token saved to {self.token_path}")
258
def _api_get(self, endpoint: str, params: Optional[Dict] = None) -> requests.Response:
    """Send a bearer-authenticated GET request to a Zoom API endpoint.

    Parameters
    ----------
    endpoint : str
        Path relative to ``_BASE_URL``; leading slashes are stripped.
    params : Dict, optional
        Query-string parameters passed through to ``requests.get``.

    Returns
    -------
    requests.Response
        The response, after ``raise_for_status()`` has succeeded.

    Raises
    ------
    RuntimeError
        If no access token is set on the instance.
    """
    if not self._access_token:
        raise RuntimeError("Not authenticated. Call authenticate() first.")

    # Strip leading slashes so the joined URL never contains "//".
    relative_path = endpoint.lstrip("/")
    bearer_header = {"Authorization": f"Bearer {self._access_token}"}

    response = requests.get(
        f"{_BASE_URL}/{relative_path}",
        headers=bearer_header,
        params=params,
        timeout=30,
    )
    response.raise_for_status()
    return response
273
def list_videos(
    self,
    folder_id: Optional[str] = None,
    folder_path: Optional[str] = None,
    patterns: Optional[List[str]] = None,
) -> List[SourceFile]:
    """Return one ``SourceFile`` per recording file in the user's cloud recordings.

    Pages through ``users/me/recordings`` until the API stops returning a
    ``next_page_token``. ``folder_id`` and ``folder_path`` are accepted but
    not used by this implementation.

    Parameters
    ----------
    folder_id : str, optional
        Unused.
    folder_path : str, optional
        Unused.
    patterns : list of str, optional
        When given, a file is kept only if its generated name ends with at
        least one pattern after every ``*`` has been removed from it
        (e.g. ``"*.mp4"`` is treated as the suffix ``".mp4"``).

    Returns
    -------
    list of SourceFile
        Each entry carries the meeting id as ``id``, the meeting start time
        as ``modified_at`` and the recording's download URL as ``path``.

    Raises
    ------
    RuntimeError
        If no access token is set on the instance.
    """
    if not self._access_token:
        raise RuntimeError("Not authenticated. Call authenticate() first.")

    collected: List[SourceFile] = []
    page_token = ""

    while True:
        # Only send the token once the API has handed one out.
        query: Dict = {"next_page_token": page_token} if page_token else {}
        page = self._api_get("users/me/recordings", params=query).json()

        for meeting in page.get("meetings", []):
            meeting_id = str(meeting.get("id", ""))
            topic = meeting.get("topic", "Untitled Meeting")
            started_at = meeting.get("start_time")

            for entry in meeting.get("recording_files", []):
                kind = entry.get("file_type", "")
                mime = _MIME_TYPES.get(kind)

                # Name the file after the meeting topic; fall back to the
                # file type when no explicit extension is reported.
                extension = entry.get("file_extension", kind).lower()
                display_name = f"{topic}.{extension}"

                if patterns and not any(
                    display_name.endswith(p.replace("*", "")) for p in patterns
                ):
                    continue

                collected.append(
                    SourceFile(
                        name=display_name,
                        id=meeting_id,
                        size_bytes=entry.get("file_size"),
                        mime_type=mime,
                        modified_at=started_at,
                        path=entry.get("download_url"),
                    )
                )

        page_token = page.get("next_page_token", "")
        if not page_token:
            break

    logger.info(f"Found {len(collected)} recordings in Zoom")
    return collected
329
def download(self, file: SourceFile, destination: Path) -> Path:
    """Download a recording file from Zoom to ``destination``.

    The body is streamed to disk in 8 KiB chunks. The HTTP response is
    always closed, including when the status check or the file write
    fails, so the underlying connection is released.

    Parameters
    ----------
    file : SourceFile
        Recording to fetch; ``file.path`` must hold its download URL.
    destination : Path
        Target file path. Missing parent directories are created.

    Returns
    -------
    Path
        The destination path that was written.

    Raises
    ------
    RuntimeError
        If no access token is set on the instance.
    ValueError
        If ``file`` has no download URL.
    requests.HTTPError
        If the server answers with an error status.
    """
    if not self._access_token:
        raise RuntimeError("Not authenticated. Call authenticate() first.")

    # Validate input before touching the filesystem.
    download_url = file.path
    if not download_url:
        raise ValueError(f"No download URL for file: {file.name}")

    destination = Path(destination)
    destination.parent.mkdir(parents=True, exist_ok=True)

    resp = requests.get(
        download_url,
        headers={"Authorization": f"Bearer {self._access_token}"},
        stream=True,
        timeout=60,
    )
    try:
        resp.raise_for_status()

        with open(destination, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)
    finally:
        # With stream=True the connection stays checked out until the
        # response is fully consumed or explicitly closed.
        resp.close()

    logger.info(f"Downloaded {file.name} to {destination}")
    return destination
356
def fetch_transcript(self, meeting_id: str) -> Optional[str]:
    """
    Return the transcript text of a Zoom meeting recording, if one exists.

    Queries the meeting's recording files, picks the first file whose type
    is ``TRANSCRIPT`` and that has a download URL, and downloads its body
    as text.

    Parameters
    ----------
    meeting_id : str
        The Zoom meeting ID.

    Returns
    -------
    str or None
        Transcript text if available. None when the recording has no
        downloadable transcript or when any request or parsing step fails
        (the failure is logged).

    Raises
    ------
    RuntimeError
        If no access token is set on the instance.
    """
    if not self._access_token:
        raise RuntimeError("Not authenticated. Call authenticate() first.")

    try:
        recording = self._api_get(f"meetings/{meeting_id}/recordings").json()

        # Lazily walk the download URLs of transcript-typed files only.
        candidate_urls = (
            entry.get("download_url")
            for entry in recording.get("recording_files", [])
            if entry.get("file_type", "") == "TRANSCRIPT"
        )

        for transcript_url in candidate_urls:
            if not transcript_url:
                continue

            transcript_resp = requests.get(
                transcript_url,
                headers={"Authorization": f"Bearer {self._access_token}"},
                timeout=30,
            )
            transcript_resp.raise_for_status()
            logger.info(f"Fetched transcript for meeting {meeting_id}")
            return transcript_resp.text

        logger.info(f"No transcript found for meeting {meeting_id}")
        return None
    except Exception as e:
        logger.error(f"Failed to fetch transcript for meeting {meeting_id}: {e}")
        return None

PlanOpticon

Keyboard Shortcuts