PlanOpticon

feat(sources): add Zoom, Teams, and Meet recording connectors

- ZoomSource: OAuth2 with PKCE, Server-to-Server, and saved token auth
- TeamsRecordingSource: m365 CLI wrapper with Graph API fallbacks
- MeetRecordingSource: gws CLI wrapper searching Drive for recordings
- CLI: planopticon recordings {zoom-list,teams-list,meet-list}
- 29 new tests (757 total passing)

lmata 2026-03-07 23:22 trunk
Commit 4bdd77c40949daad3589e28cfd5f2aa8bf1250d2754974b2ec36b13c9dd1a9fc
--- tests/test_cli.py
+++ tests/test_cli.py
@@ -22,10 +22,11 @@
2222
assert "query" in result.output
2323
assert "agent" in result.output
2424
assert "kg" in result.output
2525
assert "gws" in result.output
2626
assert "m365" in result.output
27
+ assert "recordings" in result.output
2728
assert "ingest" in result.output
2829
assert "batch" in result.output
2930
3031
3132
class TestAnalyzeHelp:
@@ -193,13 +194,41 @@
193194
assert result.exit_code == 0
194195
assert "--web-url" in result.output
195196
assert "--file-id" in result.output
196197
assert "--db-path" in result.output
197198
199
+
200
+class TestRecordingsHelp:
201
+ def test_group_help(self):
202
+ runner = CliRunner()
203
+ result = runner.invoke(cli, ["recordings", "--help"])
204
+ assert result.exit_code == 0
205
+ assert "zoom-list" in result.output
206
+ assert "teams-list" in result.output
207
+ assert "meet-list" in result.output
208
+
209
+ def test_zoom_list_help(self):
210
+ runner = CliRunner()
211
+ result = runner.invoke(cli, ["recordings", "zoom-list", "--help"])
212
+ assert result.exit_code == 0
213
+ assert "ZOOM_CLIENT_ID" in result.output
214
+
215
+ def test_teams_list_help(self):
216
+ runner = CliRunner()
217
+ result = runner.invoke(cli, ["recordings", "teams-list", "--help"])
218
+ assert result.exit_code == 0
219
+ assert "--user-id" in result.output
220
+
221
+ def test_meet_list_help(self):
222
+ runner = CliRunner()
223
+ result = runner.invoke(cli, ["recordings", "meet-list", "--help"])
224
+ assert result.exit_code == 0
225
+ assert "--folder-id" in result.output
226
+
198227
199228
class TestAuthHelp:
200229
def test_help(self):
201230
runner = CliRunner()
202231
result = runner.invoke(cli, ["auth", "--help"])
203232
assert result.exit_code == 0
204233
assert "google" in result.output
205234
assert "dropbox" in result.output
206235
--- tests/test_cli.py
+++ tests/test_cli.py
@@ -22,10 +22,11 @@
22 assert "query" in result.output
23 assert "agent" in result.output
24 assert "kg" in result.output
25 assert "gws" in result.output
26 assert "m365" in result.output
 
27 assert "ingest" in result.output
28 assert "batch" in result.output
29
30
31 class TestAnalyzeHelp:
@@ -193,13 +194,41 @@
193 assert result.exit_code == 0
194 assert "--web-url" in result.output
195 assert "--file-id" in result.output
196 assert "--db-path" in result.output
197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
199 class TestAuthHelp:
200 def test_help(self):
201 runner = CliRunner()
202 result = runner.invoke(cli, ["auth", "--help"])
203 assert result.exit_code == 0
204 assert "google" in result.output
205 assert "dropbox" in result.output
206
--- tests/test_cli.py
+++ tests/test_cli.py
@@ -22,10 +22,11 @@
22 assert "query" in result.output
23 assert "agent" in result.output
24 assert "kg" in result.output
25 assert "gws" in result.output
26 assert "m365" in result.output
27 assert "recordings" in result.output
28 assert "ingest" in result.output
29 assert "batch" in result.output
30
31
32 class TestAnalyzeHelp:
@@ -193,13 +194,41 @@
194 assert result.exit_code == 0
195 assert "--web-url" in result.output
196 assert "--file-id" in result.output
197 assert "--db-path" in result.output
198
199
200 class TestRecordingsHelp:
201 def test_group_help(self):
202 runner = CliRunner()
203 result = runner.invoke(cli, ["recordings", "--help"])
204 assert result.exit_code == 0
205 assert "zoom-list" in result.output
206 assert "teams-list" in result.output
207 assert "meet-list" in result.output
208
209 def test_zoom_list_help(self):
210 runner = CliRunner()
211 result = runner.invoke(cli, ["recordings", "zoom-list", "--help"])
212 assert result.exit_code == 0
213 assert "ZOOM_CLIENT_ID" in result.output
214
215 def test_teams_list_help(self):
216 runner = CliRunner()
217 result = runner.invoke(cli, ["recordings", "teams-list", "--help"])
218 assert result.exit_code == 0
219 assert "--user-id" in result.output
220
221 def test_meet_list_help(self):
222 runner = CliRunner()
223 result = runner.invoke(cli, ["recordings", "meet-list", "--help"])
224 assert result.exit_code == 0
225 assert "--folder-id" in result.output
226
227
228 class TestAuthHelp:
229 def test_help(self):
230 runner = CliRunner()
231 result = runner.invoke(cli, ["auth", "--help"])
232 assert result.exit_code == 0
233 assert "google" in result.output
234 assert "dropbox" in result.output
235
--- tests/test_sources.py
+++ tests/test_sources.py
@@ -1340,5 +1340,233 @@
13401340
html = "&lt;tag&gt; &quot;quoted&quot; &apos;apos&apos; &nbsp;space"
13411341
result = _html_to_text(html)
13421342
assert "<tag>" in result
13431343
assert '"quoted"' in result
13441344
assert "'apos'" in result
1345
+
1346
+
1347
+# ---------------------------------------------------------------------------
1348
+# ZoomSource
1349
+# ---------------------------------------------------------------------------
1350
+
1351
+
1352
+class TestZoomSource:
1353
+ def test_import(self):
1354
+ from video_processor.sources.zoom_source import ZoomSource
1355
+
1356
+ assert ZoomSource is not None
1357
+
1358
+ def test_constructor_defaults(self):
1359
+ from video_processor.sources.zoom_source import ZoomSource
1360
+
1361
+ src = ZoomSource()
1362
+ assert src.client_id is None or isinstance(src.client_id, str)
1363
+ assert src._access_token is None
1364
+
1365
+ def test_constructor_explicit(self):
1366
+ from video_processor.sources.zoom_source import ZoomSource
1367
+
1368
+ src = ZoomSource(
1369
+ client_id="cid",
1370
+ client_secret="csec",
1371
+ account_id="aid",
1372
+ )
1373
+ assert src.client_id == "cid"
1374
+ assert src.client_secret == "csec"
1375
+ assert src.account_id == "aid"
1376
+
1377
+ def test_authenticate_no_credentials(self):
1378
+ from video_processor.sources.zoom_source import ZoomSource
1379
+
1380
+ src = ZoomSource(client_id=None, client_secret=None, account_id=None)
1381
+ # No saved token, no account_id, no client_id → should fail
1382
+ assert src.authenticate() is False
1383
+
1384
+ def test_list_videos_not_authenticated(self):
1385
+ from video_processor.sources.zoom_source import ZoomSource
1386
+
1387
+ src = ZoomSource()
1388
+ with pytest.raises(RuntimeError, match="Not authenticated"):
1389
+ src.list_videos()
1390
+
1391
+ def test_download_not_authenticated(self):
1392
+ from video_processor.sources.zoom_source import ZoomSource
1393
+
1394
+ src = ZoomSource()
1395
+ sf = SourceFile(name="test.mp4", id="123")
1396
+ with pytest.raises(RuntimeError, match="Not authenticated"):
1397
+ src.download(sf, "/tmp/test.mp4")
1398
+
1399
+ def test_fetch_transcript_not_authenticated(self):
1400
+ from video_processor.sources.zoom_source import ZoomSource
1401
+
1402
+ src = ZoomSource()
1403
+ with pytest.raises(RuntimeError, match="Not authenticated"):
1404
+ src.fetch_transcript("meeting123")
1405
+
1406
+ def test_mime_types_mapping(self):
1407
+ from video_processor.sources.zoom_source import _MIME_TYPES
1408
+
1409
+ assert _MIME_TYPES["MP4"] == "video/mp4"
1410
+ assert _MIME_TYPES["TRANSCRIPT"] == "text/vtt"
1411
+ assert _MIME_TYPES["M4A"] == "audio/mp4"
1412
+
1413
+
1414
+# ---------------------------------------------------------------------------
1415
+# TeamsRecordingSource
1416
+# ---------------------------------------------------------------------------
1417
+
1418
+
1419
+class TestTeamsRecordingSource:
1420
+ def test_import(self):
1421
+ from video_processor.sources.teams_recording_source import (
1422
+ TeamsRecordingSource,
1423
+ )
1424
+
1425
+ assert TeamsRecordingSource is not None
1426
+
1427
+ def test_constructor_default(self):
1428
+ from video_processor.sources.teams_recording_source import (
1429
+ TeamsRecordingSource,
1430
+ )
1431
+
1432
+ src = TeamsRecordingSource()
1433
+ assert src.user_id == "me"
1434
+
1435
+ def test_constructor_custom_user(self):
1436
+ from video_processor.sources.teams_recording_source import (
1437
+ TeamsRecordingSource,
1438
+ )
1439
+
1440
+ src = TeamsRecordingSource(user_id="[email protected]")
1441
+ assert src.user_id == "[email protected]"
1442
+
1443
+ @patch("shutil.which", return_value=None)
1444
+ def test_authenticate_no_m365(self, _mock_which):
1445
+ from video_processor.sources.teams_recording_source import (
1446
+ TeamsRecordingSource,
1447
+ )
1448
+
1449
+ src = TeamsRecordingSource()
1450
+ assert src.authenticate() is False
1451
+
1452
+ def test_vtt_to_text(self):
1453
+ from video_processor.sources.teams_recording_source import (
1454
+ _vtt_to_text,
1455
+ )
1456
+
1457
+ vtt = (
1458
+ "WEBVTT\n\n"
1459
+ "1\n"
1460
+ "00:00:01.000 --> 00:00:05.000\n"
1461
+ "<v Speaker1>Hello everyone\n\n"
1462
+ "2\n"
1463
+ "00:00:05.000 --> 00:00:10.000\n"
1464
+ "<v Speaker2>Welcome to the meeting\n"
1465
+ )
1466
+ result = _vtt_to_text(vtt)
1467
+ assert "Hello everyone" in result
1468
+ assert "Welcome to the meeting" in result
1469
+ assert "WEBVTT" not in result
1470
+ assert "-->" not in result
1471
+
1472
+ def test_vtt_to_text_empty(self):
1473
+ from video_processor.sources.teams_recording_source import (
1474
+ _vtt_to_text,
1475
+ )
1476
+
1477
+ assert _vtt_to_text("") == ""
1478
+
1479
+ def test_vtt_to_text_deduplicates(self):
1480
+ from video_processor.sources.teams_recording_source import (
1481
+ _vtt_to_text,
1482
+ )
1483
+
1484
+ vtt = (
1485
+ "WEBVTT\n\n"
1486
+ "00:00:01.000 --> 00:00:03.000\n"
1487
+ "Same line\n\n"
1488
+ "00:00:03.000 --> 00:00:05.000\n"
1489
+ "Same line\n"
1490
+ )
1491
+ result = _vtt_to_text(vtt)
1492
+ assert result.count("Same line") == 1
1493
+
1494
+ def test_extract_meetings_list_dict(self):
1495
+ from video_processor.sources.teams_recording_source import (
1496
+ TeamsRecordingSource,
1497
+ )
1498
+
1499
+ src = TeamsRecordingSource()
1500
+ result = src._extract_meetings_list({"value": [{"id": "m1"}]})
1501
+ assert len(result) == 1
1502
+
1503
+ def test_extract_meetings_list_list(self):
1504
+ from video_processor.sources.teams_recording_source import (
1505
+ TeamsRecordingSource,
1506
+ )
1507
+
1508
+ src = TeamsRecordingSource()
1509
+ result = src._extract_meetings_list([{"id": "m1"}])
1510
+ assert len(result) == 1
1511
+
1512
+
1513
+# ---------------------------------------------------------------------------
1514
+# MeetRecordingSource
1515
+# ---------------------------------------------------------------------------
1516
+
1517
+
1518
+class TestMeetRecordingSource:
1519
+ def test_import(self):
1520
+ from video_processor.sources.meet_recording_source import (
1521
+ MeetRecordingSource,
1522
+ )
1523
+
1524
+ assert MeetRecordingSource is not None
1525
+
1526
+ def test_constructor_default(self):
1527
+ from video_processor.sources.meet_recording_source import (
1528
+ MeetRecordingSource,
1529
+ )
1530
+
1531
+ src = MeetRecordingSource()
1532
+ assert src.drive_folder_id is None
1533
+
1534
+ def test_constructor_with_folder(self):
1535
+ from video_processor.sources.meet_recording_source import (
1536
+ MeetRecordingSource,
1537
+ )
1538
+
1539
+ src = MeetRecordingSource(drive_folder_id="folder123")
1540
+ assert src.drive_folder_id == "folder123"
1541
+
1542
+ @patch("shutil.which", return_value=None)
1543
+ def test_authenticate_no_gws(self, _mock_which):
1544
+ from video_processor.sources.meet_recording_source import (
1545
+ MeetRecordingSource,
1546
+ )
1547
+
1548
+ src = MeetRecordingSource()
1549
+ assert src.authenticate() is False
1550
+
1551
+ def test_find_matching_transcript_date_extraction(self):
1552
+ import re
1553
+
1554
+ name = "Meet Recording 2026-03-07T14:30:00"
1555
+ match = re.search(r"\d{4}-\d{2}-\d{2}", name)
1556
+ assert match is not None
1557
+ assert match.group(0) == "2026-03-07"
1558
+
1559
+ def test_lazy_import(self):
1560
+ from video_processor.sources import MeetRecordingSource
1561
+
1562
+ assert MeetRecordingSource is not None
1563
+
1564
+ def test_teams_lazy_import(self):
1565
+ from video_processor.sources import TeamsRecordingSource
1566
+
1567
+ assert TeamsRecordingSource is not None
1568
+
1569
+ def test_zoom_lazy_import(self):
1570
+ from video_processor.sources import ZoomSource
1571
+
1572
+ assert ZoomSource is not None
13451573
--- tests/test_sources.py
+++ tests/test_sources.py
@@ -1340,5 +1340,233 @@
1340 html = "&lt;tag&gt; &quot;quoted&quot; &apos;apos&apos; &nbsp;space"
1341 result = _html_to_text(html)
1342 assert "<tag>" in result
1343 assert '"quoted"' in result
1344 assert "'apos'" in result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1345
--- tests/test_sources.py
+++ tests/test_sources.py
@@ -1340,5 +1340,233 @@
1340 html = "&lt;tag&gt; &quot;quoted&quot; &apos;apos&apos; &nbsp;space"
1341 result = _html_to_text(html)
1342 assert "<tag>" in result
1343 assert '"quoted"' in result
1344 assert "'apos'" in result
1345
1346
1347 # ---------------------------------------------------------------------------
1348 # ZoomSource
1349 # ---------------------------------------------------------------------------
1350
1351
1352 class TestZoomSource:
1353 def test_import(self):
1354 from video_processor.sources.zoom_source import ZoomSource
1355
1356 assert ZoomSource is not None
1357
1358 def test_constructor_defaults(self):
1359 from video_processor.sources.zoom_source import ZoomSource
1360
1361 src = ZoomSource()
1362 assert src.client_id is None or isinstance(src.client_id, str)
1363 assert src._access_token is None
1364
1365 def test_constructor_explicit(self):
1366 from video_processor.sources.zoom_source import ZoomSource
1367
1368 src = ZoomSource(
1369 client_id="cid",
1370 client_secret="csec",
1371 account_id="aid",
1372 )
1373 assert src.client_id == "cid"
1374 assert src.client_secret == "csec"
1375 assert src.account_id == "aid"
1376
1377 def test_authenticate_no_credentials(self):
1378 from video_processor.sources.zoom_source import ZoomSource
1379
1380 src = ZoomSource(client_id=None, client_secret=None, account_id=None)
1381 # No saved token, no account_id, no client_id → should fail
1382 assert src.authenticate() is False
1383
1384 def test_list_videos_not_authenticated(self):
1385 from video_processor.sources.zoom_source import ZoomSource
1386
1387 src = ZoomSource()
1388 with pytest.raises(RuntimeError, match="Not authenticated"):
1389 src.list_videos()
1390
1391 def test_download_not_authenticated(self):
1392 from video_processor.sources.zoom_source import ZoomSource
1393
1394 src = ZoomSource()
1395 sf = SourceFile(name="test.mp4", id="123")
1396 with pytest.raises(RuntimeError, match="Not authenticated"):
1397 src.download(sf, "/tmp/test.mp4")
1398
1399 def test_fetch_transcript_not_authenticated(self):
1400 from video_processor.sources.zoom_source import ZoomSource
1401
1402 src = ZoomSource()
1403 with pytest.raises(RuntimeError, match="Not authenticated"):
1404 src.fetch_transcript("meeting123")
1405
1406 def test_mime_types_mapping(self):
1407 from video_processor.sources.zoom_source import _MIME_TYPES
1408
1409 assert _MIME_TYPES["MP4"] == "video/mp4"
1410 assert _MIME_TYPES["TRANSCRIPT"] == "text/vtt"
1411 assert _MIME_TYPES["M4A"] == "audio/mp4"
1412
1413
1414 # ---------------------------------------------------------------------------
1415 # TeamsRecordingSource
1416 # ---------------------------------------------------------------------------
1417
1418
1419 class TestTeamsRecordingSource:
1420 def test_import(self):
1421 from video_processor.sources.teams_recording_source import (
1422 TeamsRecordingSource,
1423 )
1424
1425 assert TeamsRecordingSource is not None
1426
1427 def test_constructor_default(self):
1428 from video_processor.sources.teams_recording_source import (
1429 TeamsRecordingSource,
1430 )
1431
1432 src = TeamsRecordingSource()
1433 assert src.user_id == "me"
1434
1435 def test_constructor_custom_user(self):
1436 from video_processor.sources.teams_recording_source import (
1437 TeamsRecordingSource,
1438 )
1439
1440 src = TeamsRecordingSource(user_id="[email protected]")
1441 assert src.user_id == "[email protected]"
1442
1443 @patch("shutil.which", return_value=None)
1444 def test_authenticate_no_m365(self, _mock_which):
1445 from video_processor.sources.teams_recording_source import (
1446 TeamsRecordingSource,
1447 )
1448
1449 src = TeamsRecordingSource()
1450 assert src.authenticate() is False
1451
1452 def test_vtt_to_text(self):
1453 from video_processor.sources.teams_recording_source import (
1454 _vtt_to_text,
1455 )
1456
1457 vtt = (
1458 "WEBVTT\n\n"
1459 "1\n"
1460 "00:00:01.000 --> 00:00:05.000\n"
1461 "<v Speaker1>Hello everyone\n\n"
1462 "2\n"
1463 "00:00:05.000 --> 00:00:10.000\n"
1464 "<v Speaker2>Welcome to the meeting\n"
1465 )
1466 result = _vtt_to_text(vtt)
1467 assert "Hello everyone" in result
1468 assert "Welcome to the meeting" in result
1469 assert "WEBVTT" not in result
1470 assert "-->" not in result
1471
1472 def test_vtt_to_text_empty(self):
1473 from video_processor.sources.teams_recording_source import (
1474 _vtt_to_text,
1475 )
1476
1477 assert _vtt_to_text("") == ""
1478
1479 def test_vtt_to_text_deduplicates(self):
1480 from video_processor.sources.teams_recording_source import (
1481 _vtt_to_text,
1482 )
1483
1484 vtt = (
1485 "WEBVTT\n\n"
1486 "00:00:01.000 --> 00:00:03.000\n"
1487 "Same line\n\n"
1488 "00:00:03.000 --> 00:00:05.000\n"
1489 "Same line\n"
1490 )
1491 result = _vtt_to_text(vtt)
1492 assert result.count("Same line") == 1
1493
1494 def test_extract_meetings_list_dict(self):
1495 from video_processor.sources.teams_recording_source import (
1496 TeamsRecordingSource,
1497 )
1498
1499 src = TeamsRecordingSource()
1500 result = src._extract_meetings_list({"value": [{"id": "m1"}]})
1501 assert len(result) == 1
1502
1503 def test_extract_meetings_list_list(self):
1504 from video_processor.sources.teams_recording_source import (
1505 TeamsRecordingSource,
1506 )
1507
1508 src = TeamsRecordingSource()
1509 result = src._extract_meetings_list([{"id": "m1"}])
1510 assert len(result) == 1
1511
1512
1513 # ---------------------------------------------------------------------------
1514 # MeetRecordingSource
1515 # ---------------------------------------------------------------------------
1516
1517
1518 class TestMeetRecordingSource:
1519 def test_import(self):
1520 from video_processor.sources.meet_recording_source import (
1521 MeetRecordingSource,
1522 )
1523
1524 assert MeetRecordingSource is not None
1525
1526 def test_constructor_default(self):
1527 from video_processor.sources.meet_recording_source import (
1528 MeetRecordingSource,
1529 )
1530
1531 src = MeetRecordingSource()
1532 assert src.drive_folder_id is None
1533
1534 def test_constructor_with_folder(self):
1535 from video_processor.sources.meet_recording_source import (
1536 MeetRecordingSource,
1537 )
1538
1539 src = MeetRecordingSource(drive_folder_id="folder123")
1540 assert src.drive_folder_id == "folder123"
1541
1542 @patch("shutil.which", return_value=None)
1543 def test_authenticate_no_gws(self, _mock_which):
1544 from video_processor.sources.meet_recording_source import (
1545 MeetRecordingSource,
1546 )
1547
1548 src = MeetRecordingSource()
1549 assert src.authenticate() is False
1550
1551 def test_find_matching_transcript_date_extraction(self):
1552 import re
1553
1554 name = "Meet Recording 2026-03-07T14:30:00"
1555 match = re.search(r"\d{4}-\d{2}-\d{2}", name)
1556 assert match is not None
1557 assert match.group(0) == "2026-03-07"
1558
1559 def test_lazy_import(self):
1560 from video_processor.sources import MeetRecordingSource
1561
1562 assert MeetRecordingSource is not None
1563
1564 def test_teams_lazy_import(self):
1565 from video_processor.sources import TeamsRecordingSource
1566
1567 assert TeamsRecordingSource is not None
1568
1569 def test_zoom_lazy_import(self):
1570 from video_processor.sources import ZoomSource
1571
1572 assert ZoomSource is not None
1573
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -1582,10 +1582,112 @@
15821582
click.echo(f"Wiki pushed to https://github.com/{repo}/wiki")
15831583
else:
15841584
click.echo("Wiki push failed. Check auth and repo permissions.", err=True)
15851585
sys.exit(1)
15861586
1587
+
1588
+@cli.group()
1589
+def recordings():
1590
+ """Fetch meeting recordings from Zoom, Teams, and Google Meet."""
1591
+ pass
1592
+
1593
+
1594
+@recordings.command("zoom-list")
1595
+@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
1596
+def recordings_zoom_list(as_json):
1597
+ """List Zoom cloud recordings.
1598
+
1599
+ Requires ZOOM_CLIENT_ID (and optionally ZOOM_CLIENT_SECRET,
1600
+ ZOOM_ACCOUNT_ID) environment variables.
1601
+
1602
+ Examples:
1603
+
1604
+ planopticon recordings zoom-list
1605
+
1606
+ planopticon recordings zoom-list --json
1607
+ """
1608
+ from video_processor.sources.zoom_source import ZoomSource
1609
+
1610
+ source = ZoomSource()
1611
+ if not source.authenticate():
1612
+ click.echo("Zoom authentication failed.", err=True)
1613
+ sys.exit(1)
1614
+
1615
+ files = source.list_videos()
1616
+ if as_json:
1617
+ click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str))
1618
+ else:
1619
+ click.echo(f"Found {len(files)} recording(s):")
1620
+ for f in files:
1621
+ size = f"{f.size_bytes // 1_000_000} MB" if f.size_bytes else "unknown"
1622
+ click.echo(f" {f.name} ({size}) {f.modified_at or ''}")
1623
+
1624
+
1625
+@recordings.command("teams-list")
1626
+@click.option("--user-id", default="me", help="Microsoft user ID")
1627
+@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
1628
+def recordings_teams_list(user_id, as_json):
1629
+ """List Teams meeting recordings via the m365 CLI.
1630
+
1631
+ Requires: npm install -g @pnp/cli-microsoft365 && m365 login
1632
+
1633
+ Examples:
1634
+
1635
+ planopticon recordings teams-list
1636
+
1637
+ planopticon recordings teams-list --json
1638
+ """
1639
+ from video_processor.sources.teams_recording_source import (
1640
+ TeamsRecordingSource,
1641
+ )
1642
+
1643
+ source = TeamsRecordingSource(user_id=user_id)
1644
+ if not source.authenticate():
1645
+ click.echo("Teams authentication failed.", err=True)
1646
+ sys.exit(1)
1647
+
1648
+ files = source.list_videos()
1649
+ if as_json:
1650
+ click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str))
1651
+ else:
1652
+ click.echo(f"Found {len(files)} recording(s):")
1653
+ for f in files:
1654
+ click.echo(f" {f.name} {f.modified_at or ''}")
1655
+
1656
+
1657
+@recordings.command("meet-list")
1658
+@click.option("--folder-id", default=None, help="Drive folder ID")
1659
+@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
1660
+def recordings_meet_list(folder_id, as_json):
1661
+ """List Google Meet recordings in Drive via the gws CLI.
1662
+
1663
+ Requires: npm install -g @googleworkspace/cli && gws auth login
1664
+
1665
+ Examples:
1666
+
1667
+ planopticon recordings meet-list
1668
+
1669
+ planopticon recordings meet-list --folder-id abc123
1670
+ """
1671
+ from video_processor.sources.meet_recording_source import (
1672
+ MeetRecordingSource,
1673
+ )
1674
+
1675
+ source = MeetRecordingSource(drive_folder_id=folder_id)
1676
+ if not source.authenticate():
1677
+ click.echo("Google Meet authentication failed.", err=True)
1678
+ sys.exit(1)
1679
+
1680
+ files = source.list_videos()
1681
+ if as_json:
1682
+ click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str))
1683
+ else:
1684
+ click.echo(f"Found {len(files)} recording(s):")
1685
+ for f in files:
1686
+ size = f"{f.size_bytes // 1_000_000} MB" if f.size_bytes else "unknown"
1687
+ click.echo(f" {f.name} ({size}) {f.modified_at or ''}")
1688
+
15871689
15881690
@cli.group()
15891691
def kg():
15901692
"""Knowledge graph utilities: convert, sync, and inspect."""
15911693
pass
15921694
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -1582,10 +1582,112 @@
1582 click.echo(f"Wiki pushed to https://github.com/{repo}/wiki")
1583 else:
1584 click.echo("Wiki push failed. Check auth and repo permissions.", err=True)
1585 sys.exit(1)
1586
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1587
1588 @cli.group()
1589 def kg():
1590 """Knowledge graph utilities: convert, sync, and inspect."""
1591 pass
1592
--- video_processor/cli/commands.py
+++ video_processor/cli/commands.py
@@ -1582,10 +1582,112 @@
1582 click.echo(f"Wiki pushed to https://github.com/{repo}/wiki")
1583 else:
1584 click.echo("Wiki push failed. Check auth and repo permissions.", err=True)
1585 sys.exit(1)
1586
1587
1588 @cli.group()
1589 def recordings():
1590 """Fetch meeting recordings from Zoom, Teams, and Google Meet."""
1591 pass
1592
1593
1594 @recordings.command("zoom-list")
1595 @click.option("--json", "as_json", is_flag=True, help="Output as JSON")
1596 def recordings_zoom_list(as_json):
1597 """List Zoom cloud recordings.
1598
1599 Requires ZOOM_CLIENT_ID (and optionally ZOOM_CLIENT_SECRET,
1600 ZOOM_ACCOUNT_ID) environment variables.
1601
1602 Examples:
1603
1604 planopticon recordings zoom-list
1605
1606 planopticon recordings zoom-list --json
1607 """
1608 from video_processor.sources.zoom_source import ZoomSource
1609
1610 source = ZoomSource()
1611 if not source.authenticate():
1612 click.echo("Zoom authentication failed.", err=True)
1613 sys.exit(1)
1614
1615 files = source.list_videos()
1616 if as_json:
1617 click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str))
1618 else:
1619 click.echo(f"Found {len(files)} recording(s):")
1620 for f in files:
1621 size = f"{f.size_bytes // 1_000_000} MB" if f.size_bytes else "unknown"
1622 click.echo(f" {f.name} ({size}) {f.modified_at or ''}")
1623
1624
1625 @recordings.command("teams-list")
1626 @click.option("--user-id", default="me", help="Microsoft user ID")
1627 @click.option("--json", "as_json", is_flag=True, help="Output as JSON")
1628 def recordings_teams_list(user_id, as_json):
1629 """List Teams meeting recordings via the m365 CLI.
1630
1631 Requires: npm install -g @pnp/cli-microsoft365 && m365 login
1632
1633 Examples:
1634
1635 planopticon recordings teams-list
1636
1637 planopticon recordings teams-list --json
1638 """
1639 from video_processor.sources.teams_recording_source import (
1640 TeamsRecordingSource,
1641 )
1642
1643 source = TeamsRecordingSource(user_id=user_id)
1644 if not source.authenticate():
1645 click.echo("Teams authentication failed.", err=True)
1646 sys.exit(1)
1647
1648 files = source.list_videos()
1649 if as_json:
1650 click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str))
1651 else:
1652 click.echo(f"Found {len(files)} recording(s):")
1653 for f in files:
1654 click.echo(f" {f.name} {f.modified_at or ''}")
1655
1656
1657 @recordings.command("meet-list")
1658 @click.option("--folder-id", default=None, help="Drive folder ID")
1659 @click.option("--json", "as_json", is_flag=True, help="Output as JSON")
1660 def recordings_meet_list(folder_id, as_json):
1661 """List Google Meet recordings in Drive via the gws CLI.
1662
1663 Requires: npm install -g @googleworkspace/cli && gws auth login
1664
1665 Examples:
1666
1667 planopticon recordings meet-list
1668
1669 planopticon recordings meet-list --folder-id abc123
1670 """
1671 from video_processor.sources.meet_recording_source import (
1672 MeetRecordingSource,
1673 )
1674
1675 source = MeetRecordingSource(drive_folder_id=folder_id)
1676 if not source.authenticate():
1677 click.echo("Google Meet authentication failed.", err=True)
1678 sys.exit(1)
1679
1680 files = source.list_videos()
1681 if as_json:
1682 click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str))
1683 else:
1684 click.echo(f"Found {len(files)} recording(s):")
1685 for f in files:
1686 size = f"{f.size_bytes // 1_000_000} MB" if f.size_bytes else "unknown"
1687 click.echo(f" {f.name} ({size}) {f.modified_at or ''}")
1688
1689
1690 @cli.group()
1691 def kg():
1692 """Knowledge graph utilities: convert, sync, and inspect."""
1693 pass
1694
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -12,19 +12,22 @@
1212
"GoogleKeepSource",
1313
"GWSSource",
1414
"HackerNewsSource",
1515
"LogseqSource",
1616
"M365Source",
17
+ "MeetRecordingSource",
1718
"NotionSource",
1819
"ObsidianSource",
1920
"OneNoteSource",
2021
"PodcastSource",
22
+ "TeamsRecordingSource",
2123
"RedditSource",
2224
"RSSSource",
2325
"TwitterSource",
2426
"WebSource",
2527
"YouTubeSource",
28
+ "ZoomSource",
2629
]
2730
2831
2932
def __getattr__(name: str):
3033
"""Lazy imports to avoid pulling in optional dependencies at import time."""
@@ -36,21 +39,24 @@
3639
"GoogleKeepSource": "video_processor.sources.google_keep_source",
3740
"GWSSource": "video_processor.sources.gws_source",
3841
"HackerNewsSource": "video_processor.sources.hackernews_source",
3942
"LogseqSource": "video_processor.sources.logseq_source",
4043
"M365Source": "video_processor.sources.m365_source",
44
+ "MeetRecordingSource": "video_processor.sources.meet_recording_source",
4145
"NotionSource": "video_processor.sources.notion_source",
4246
"ObsidianSource": "video_processor.sources.obsidian_source",
4347
"OneNoteSource": "video_processor.sources.onenote_source",
4448
"PodcastSource": "video_processor.sources.podcast_source",
49
+ "TeamsRecordingSource": "video_processor.sources.teams_recording_source",
4550
"RedditSource": "video_processor.sources.reddit_source",
4651
"RSSSource": "video_processor.sources.rss_source",
4752
"TwitterSource": "video_processor.sources.twitter_source",
4853
"WebSource": "video_processor.sources.web_source",
4954
"YouTubeSource": "video_processor.sources.youtube_source",
55
+ "ZoomSource": "video_processor.sources.zoom_source",
5056
}
5157
if name in _lazy_map:
5258
import importlib
5359
5460
module = importlib.import_module(_lazy_map[name])
5561
return getattr(module, name)
5662
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
5763
5864
ADDED video_processor/sources/meet_recording_source.py
5965
ADDED video_processor/sources/teams_recording_source.py
6066
ADDED video_processor/sources/zoom_source.py
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -12,19 +12,22 @@
12 "GoogleKeepSource",
13 "GWSSource",
14 "HackerNewsSource",
15 "LogseqSource",
16 "M365Source",
 
17 "NotionSource",
18 "ObsidianSource",
19 "OneNoteSource",
20 "PodcastSource",
 
21 "RedditSource",
22 "RSSSource",
23 "TwitterSource",
24 "WebSource",
25 "YouTubeSource",
 
26 ]
27
28
29 def __getattr__(name: str):
30 """Lazy imports to avoid pulling in optional dependencies at import time."""
@@ -36,21 +39,24 @@
36 "GoogleKeepSource": "video_processor.sources.google_keep_source",
37 "GWSSource": "video_processor.sources.gws_source",
38 "HackerNewsSource": "video_processor.sources.hackernews_source",
39 "LogseqSource": "video_processor.sources.logseq_source",
40 "M365Source": "video_processor.sources.m365_source",
 
41 "NotionSource": "video_processor.sources.notion_source",
42 "ObsidianSource": "video_processor.sources.obsidian_source",
43 "OneNoteSource": "video_processor.sources.onenote_source",
44 "PodcastSource": "video_processor.sources.podcast_source",
 
45 "RedditSource": "video_processor.sources.reddit_source",
46 "RSSSource": "video_processor.sources.rss_source",
47 "TwitterSource": "video_processor.sources.twitter_source",
48 "WebSource": "video_processor.sources.web_source",
49 "YouTubeSource": "video_processor.sources.youtube_source",
 
50 }
51 if name in _lazy_map:
52 import importlib
53
54 module = importlib.import_module(_lazy_map[name])
55 return getattr(module, name)
56 raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
57
58 ADDED video_processor/sources/meet_recording_source.py
59 ADDED video_processor/sources/teams_recording_source.py
60 ADDED video_processor/sources/zoom_source.py
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -12,19 +12,22 @@
12 "GoogleKeepSource",
13 "GWSSource",
14 "HackerNewsSource",
15 "LogseqSource",
16 "M365Source",
17 "MeetRecordingSource",
18 "NotionSource",
19 "ObsidianSource",
20 "OneNoteSource",
21 "PodcastSource",
22 "TeamsRecordingSource",
23 "RedditSource",
24 "RSSSource",
25 "TwitterSource",
26 "WebSource",
27 "YouTubeSource",
28 "ZoomSource",
29 ]
30
31
32 def __getattr__(name: str):
33 """Lazy imports to avoid pulling in optional dependencies at import time."""
@@ -36,21 +39,24 @@
39 "GoogleKeepSource": "video_processor.sources.google_keep_source",
40 "GWSSource": "video_processor.sources.gws_source",
41 "HackerNewsSource": "video_processor.sources.hackernews_source",
42 "LogseqSource": "video_processor.sources.logseq_source",
43 "M365Source": "video_processor.sources.m365_source",
44 "MeetRecordingSource": "video_processor.sources.meet_recording_source",
45 "NotionSource": "video_processor.sources.notion_source",
46 "ObsidianSource": "video_processor.sources.obsidian_source",
47 "OneNoteSource": "video_processor.sources.onenote_source",
48 "PodcastSource": "video_processor.sources.podcast_source",
49 "TeamsRecordingSource": "video_processor.sources.teams_recording_source",
50 "RedditSource": "video_processor.sources.reddit_source",
51 "RSSSource": "video_processor.sources.rss_source",
52 "TwitterSource": "video_processor.sources.twitter_source",
53 "WebSource": "video_processor.sources.web_source",
54 "YouTubeSource": "video_processor.sources.youtube_source",
55 "ZoomSource": "video_processor.sources.zoom_source",
56 }
57 if name in _lazy_map:
58 import importlib
59
60 module = importlib.import_module(_lazy_map[name])
61 return getattr(module, name)
62 raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
63
64 ADDED video_processor/sources/meet_recording_source.py
65 ADDED video_processor/sources/teams_recording_source.py
66 ADDED video_processor/sources/zoom_source.py
--- a/video_processor/sources/meet_recording_source.py
+++ b/video_processor/sources/meet_recording_source.py
@@ -0,0 +1,280 @@
1
+"""Google Meet recording source using the gws CLI (googleworkspace/cli).
2
+
3
+Fetches Meet recordings and companion transcripts from Google Drive
4
+via the `gws` CLI tool.
5
+
6
+Requires: npm install -g @googleworkspace/cli
7
+Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless)
8
+"""
9
+
10
+import json
11
+import logging
12
+import re
13
+import shutil
14
+import subprocess
15
+from pathlib import Path
16
+from typing import List, Optional
17
+
18
+from video_processor.sources.base import BaseSource, SourceFile
19
+from video_processor.sources.gws_source import _run_gws
20
+
21
+logger = logging.getLogger(__name__)
22
+
23
+
24
class MeetRecordingSource(BaseSource):
    """
    Fetch Google Meet recordings and transcripts from Google Drive via the gws CLI.

    Meet stores recordings as MP4 files in Drive (typically in a "Meet Recordings"
    folder) and auto-generated transcripts as Google Docs.

    Usage:
        source = MeetRecordingSource()
        source.authenticate()
        recordings = source.list_videos()
        source.download_all(recordings, Path("./recordings"))

        # Fetch transcript for a specific recording
        transcript = source.fetch_transcript("Meet Recording 2026-03-07")
    """

    def __init__(self, drive_folder_id: Optional[str] = None):
        """Store the optional Drive folder ID used to narrow searches."""
        self.drive_folder_id = drive_folder_id

    @staticmethod
    def _quote_literal(value: str) -> str:
        """Return *value* as a single-quoted Drive query string literal.

        Backslashes and single quotes are backslash-escaped so that a folder ID
        or recording name containing them (e.g. "Bob's sync") cannot end the
        literal early and yield a malformed query.
        """
        escaped = value.replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    def authenticate(self) -> bool:
        """Check if gws CLI is installed and authenticated.

        Returns:
            True when ``gws`` is on PATH and ``gws auth status`` runs without
            raising; False otherwise (the reason is logged).
        """
        if not shutil.which("gws"):
            logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
            return False
        try:
            _run_gws(["auth", "status"], timeout=10)
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("gws not authenticated. Run: gws auth login")
            return False
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Google Meet recordings in Drive.

        Searches for MP4 files with 'Meet Recording' in the name, newest first.
        A single page of at most 50 results is requested. A second query counts
        transcript-like Google Docs, but that count is only logged.

        Args:
            folder_id: Drive folder to search; falls back to ``drive_folder_id``.
            folder_path: Accepted for interface compatibility; not used here.
            patterns: Accepted for interface compatibility; not used here.

        Returns:
            One SourceFile per recording, or an empty list when the listing
            call raises RuntimeError.
        """
        target_folder = folder_id or self.drive_folder_id
        # Optional trailing clause shared by both queries below.
        folder_clause = (
            [f"{self._quote_literal(target_folder)} in parents"] if target_folder else []
        )

        # Build the Drive search query for Meet recordings
        q_parts = [
            "mimeType='video/mp4'",
            "name contains 'Meet Recording'",
            "trashed=false",
        ] + folder_clause
        params = {
            "q": " and ".join(q_parts),
            "fields": "files(id,name,mimeType,size,modifiedTime)",
            "pageSize": 50,
            "orderBy": "modifiedTime desc",
        }

        try:
            result = _run_gws(
                ["drive", "files", "list", "--params", json.dumps(params)],
                timeout=60,
            )
        except RuntimeError as e:
            logger.error(f"Failed to list Meet recordings: {e}")
            return []

        recordings = result.get("files", [])
        files: List[SourceFile] = []
        for item in recordings:
            size = item.get("size")
            files.append(
                SourceFile(
                    name=item.get("name", "Meet Recording"),
                    id=item.get("id", ""),
                    # "size" is converted with int(); absent/empty means unknown.
                    size_bytes=int(size) if size else None,
                    mime_type=item.get("mimeType", "video/mp4"),
                    modified_at=item.get("modifiedTime"),
                )
            )

        # Also search for auto-generated transcript docs (informational only)
        transcript_q = [
            "mimeType='application/vnd.google-apps.document'",
            "(name contains 'Transcript' or name contains 'Meeting notes')",
            "trashed=false",
        ] + folder_clause
        transcript_params = {
            "q": " and ".join(transcript_q),
            "fields": "files(id,name,mimeType,modifiedTime)",
            "pageSize": 50,
            "orderBy": "modifiedTime desc",
        }

        try:
            transcript_result = _run_gws(
                ["drive", "files", "list", "--params", json.dumps(transcript_params)],
                timeout=60,
            )
            transcript_files = transcript_result.get("files", [])
            logger.info(
                f"Found {len(recordings)} recording(s) and "
                f"{len(transcript_files)} transcript doc(s) in Drive"
            )
        except RuntimeError as e:
            # Best effort: a failed transcript count must not hide the recordings.
            logger.debug(f"Transcript search failed: {e}")

        if not files:
            logger.warning("No Google Meet recordings found in Drive")

        logger.info(f"Found {len(files)} Meet recording(s)")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a Meet recording from Drive.

        Args:
            file: Recording to fetch; only ``file.id`` and ``file.name`` are read.
            destination: Target file path; parent directories are created.

        Returns:
            ``destination`` as a Path.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        # For video files, download binary content via alt=media
        result = _run_gws(
            [
                "drive",
                "files",
                "get",
                "--params",
                json.dumps({"fileId": file.id, "alt": "media"}),
            ],
            timeout=300,
        )

        # NOTE(review): the payload is written as UTF-8 *text*. If _run_gws hands
        # back decoded stdout, MP4 bytes will not survive this round trip —
        # confirm how _run_gws returns binary bodies and switch to a byte-safe
        # path (e.g. a CLI output-file option) if one exists.
        raw = result.get("raw", "") if isinstance(result, dict) else str(result)
        destination.write_text(raw, encoding="utf-8")
        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def fetch_transcript(self, recording_name: str) -> Optional[str]:
        """Fetch the companion transcript for a Meet recording.

        Looks up a matching Google Doc via ``_find_matching_transcript`` and
        concatenates the text runs of its top-level paragraphs.

        Returns:
            The transcript text, or None when no doc matches, the fetch raises
            RuntimeError, or the doc yields no non-blank text.
        """
        transcript_id = self._find_matching_transcript(recording_name)
        if not transcript_id:
            logger.info(f"No matching transcript found for: {recording_name}")
            return None

        # Fetch the Google Doc content via the Docs API
        try:
            result = _run_gws(
                [
                    "docs",
                    "documents",
                    "get",
                    "--params",
                    json.dumps({"documentId": transcript_id}),
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.warning(f"Failed to fetch transcript doc {transcript_id}: {e}")
            return None

        # Walk body.content[*].paragraph.elements[*].textRun.content;
        # whitespace-only runs are dropped.
        body = result.get("body", {})
        text_parts: List[str] = []
        for element in body.get("content", []):
            paragraph = element.get("paragraph", {})
            for pe in paragraph.get("elements", []):
                text = pe.get("textRun", {}).get("content", "")
                if text.strip():
                    text_parts.append(text)

        if not text_parts:
            logger.warning(f"Transcript doc {transcript_id} had no extractable text")
            return None

        return "".join(text_parts)

    def _find_matching_transcript(self, recording_name: str) -> Optional[str]:
        """Search Drive for a transcript doc that matches a recording name.

        Meet recordings are typically named like:
            "Meet Recording 2026-03-07T14:30:00"
        And transcripts are named like:
            "Meeting Transcript 2026-03-07" or "2026-03-07 - Transcript"

        The first YYYY-MM-DD substring of the name is used as the search term;
        when there is none, the whole recording name is used instead.

        Returns:
            The ID of the first (most recently modified) match, or None.
        """
        # Extract a date string from the recording name (YYYY-MM-DD pattern)
        date_match = re.search(r"\d{4}-\d{2}-\d{2}", recording_name)
        date_str = date_match.group(0) if date_match else recording_name

        # Search for transcript docs matching the date. The term is escaped
        # because the fallback (a full recording name) may contain quotes.
        clauses = [
            "mimeType='application/vnd.google-apps.document'",
            f"name contains {self._quote_literal(date_str)}",
            "(name contains 'Transcript' or name contains 'transcript' "
            "or name contains 'Meeting notes')",
            "trashed=false",
        ]
        if self.drive_folder_id:
            clauses.append(f"{self._quote_literal(self.drive_folder_id)} in parents")
        search_query = " and ".join(clauses)

        try:
            result = _run_gws(
                [
                    "drive",
                    "files",
                    "list",
                    "--params",
                    json.dumps(
                        {
                            "q": search_query,
                            "fields": "files(id,name,modifiedTime)",
                            "pageSize": 5,
                            "orderBy": "modifiedTime desc",
                        }
                    ),
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.debug(f"Transcript search failed for '{date_str}': {e}")
            return None

        files = result.get("files", [])
        if not files:
            logger.debug(f"No transcript docs found matching '{date_str}'")
            return None

        # Results are ordered "modifiedTime desc", so index 0 is the newest.
        best = files[0]
        logger.info(f"Matched transcript: {best.get('name')} for recording: {recording_name}")
        return best.get("id")
--- a/video_processor/sources/meet_recording_source.py
+++ b/video_processor/sources/meet_recording_source.py
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/meet_recording_source.py
+++ b/video_processor/sources/meet_recording_source.py
@@ -0,0 +1,280 @@
1 """Google Meet recording source using the gws CLI (googleworkspace/cli).
2
3 Fetches Meet recordings and companion transcripts from Google Drive
4 via the `gws` CLI tool.
5
6 Requires: npm install -g @googleworkspace/cli
7 Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless)
8 """
9
10 import json
11 import logging
12 import re
13 import shutil
14 import subprocess
15 from pathlib import Path
16 from typing import List, Optional
17
18 from video_processor.sources.base import BaseSource, SourceFile
19 from video_processor.sources.gws_source import _run_gws
20
21 logger = logging.getLogger(__name__)
22
23
class MeetRecordingSource(BaseSource):
    """
    Fetch Google Meet recordings and transcripts from Google Drive via the gws CLI.

    Meet stores recordings as MP4 files in Drive (typically in a "Meet Recordings"
    folder) and auto-generated transcripts as Google Docs.

    Usage:
        source = MeetRecordingSource()
        source.authenticate()
        recordings = source.list_videos()
        source.download_all(recordings, Path("./recordings"))

        # Fetch transcript for a specific recording
        transcript = source.fetch_transcript("Meet Recording 2026-03-07")
    """

    def __init__(self, drive_folder_id: Optional[str] = None):
        """Store the optional Drive folder ID used to narrow searches."""
        self.drive_folder_id = drive_folder_id

    @staticmethod
    def _quote_literal(value: str) -> str:
        """Return *value* as a single-quoted Drive query string literal.

        Backslashes and single quotes are backslash-escaped so that a folder ID
        or recording name containing them (e.g. "Bob's sync") cannot end the
        literal early and yield a malformed query.
        """
        escaped = value.replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    def authenticate(self) -> bool:
        """Check if gws CLI is installed and authenticated.

        Returns:
            True when ``gws`` is on PATH and ``gws auth status`` runs without
            raising; False otherwise (the reason is logged).
        """
        if not shutil.which("gws"):
            logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
            return False
        try:
            _run_gws(["auth", "status"], timeout=10)
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("gws not authenticated. Run: gws auth login")
            return False
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Google Meet recordings in Drive.

        Searches for MP4 files with 'Meet Recording' in the name, newest first.
        A single page of at most 50 results is requested. A second query counts
        transcript-like Google Docs, but that count is only logged.

        Args:
            folder_id: Drive folder to search; falls back to ``drive_folder_id``.
            folder_path: Accepted for interface compatibility; not used here.
            patterns: Accepted for interface compatibility; not used here.

        Returns:
            One SourceFile per recording, or an empty list when the listing
            call raises RuntimeError.
        """
        target_folder = folder_id or self.drive_folder_id
        # Optional trailing clause shared by both queries below.
        folder_clause = (
            [f"{self._quote_literal(target_folder)} in parents"] if target_folder else []
        )

        # Build the Drive search query for Meet recordings
        q_parts = [
            "mimeType='video/mp4'",
            "name contains 'Meet Recording'",
            "trashed=false",
        ] + folder_clause
        params = {
            "q": " and ".join(q_parts),
            "fields": "files(id,name,mimeType,size,modifiedTime)",
            "pageSize": 50,
            "orderBy": "modifiedTime desc",
        }

        try:
            result = _run_gws(
                ["drive", "files", "list", "--params", json.dumps(params)],
                timeout=60,
            )
        except RuntimeError as e:
            logger.error(f"Failed to list Meet recordings: {e}")
            return []

        recordings = result.get("files", [])
        files: List[SourceFile] = []
        for item in recordings:
            size = item.get("size")
            files.append(
                SourceFile(
                    name=item.get("name", "Meet Recording"),
                    id=item.get("id", ""),
                    # "size" is converted with int(); absent/empty means unknown.
                    size_bytes=int(size) if size else None,
                    mime_type=item.get("mimeType", "video/mp4"),
                    modified_at=item.get("modifiedTime"),
                )
            )

        # Also search for auto-generated transcript docs (informational only)
        transcript_q = [
            "mimeType='application/vnd.google-apps.document'",
            "(name contains 'Transcript' or name contains 'Meeting notes')",
            "trashed=false",
        ] + folder_clause
        transcript_params = {
            "q": " and ".join(transcript_q),
            "fields": "files(id,name,mimeType,modifiedTime)",
            "pageSize": 50,
            "orderBy": "modifiedTime desc",
        }

        try:
            transcript_result = _run_gws(
                ["drive", "files", "list", "--params", json.dumps(transcript_params)],
                timeout=60,
            )
            transcript_files = transcript_result.get("files", [])
            logger.info(
                f"Found {len(recordings)} recording(s) and "
                f"{len(transcript_files)} transcript doc(s) in Drive"
            )
        except RuntimeError as e:
            # Best effort: a failed transcript count must not hide the recordings.
            logger.debug(f"Transcript search failed: {e}")

        if not files:
            logger.warning("No Google Meet recordings found in Drive")

        logger.info(f"Found {len(files)} Meet recording(s)")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a Meet recording from Drive.

        Args:
            file: Recording to fetch; only ``file.id`` and ``file.name`` are read.
            destination: Target file path; parent directories are created.

        Returns:
            ``destination`` as a Path.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        # For video files, download binary content via alt=media
        result = _run_gws(
            [
                "drive",
                "files",
                "get",
                "--params",
                json.dumps({"fileId": file.id, "alt": "media"}),
            ],
            timeout=300,
        )

        # NOTE(review): the payload is written as UTF-8 *text*. If _run_gws hands
        # back decoded stdout, MP4 bytes will not survive this round trip —
        # confirm how _run_gws returns binary bodies and switch to a byte-safe
        # path (e.g. a CLI output-file option) if one exists.
        raw = result.get("raw", "") if isinstance(result, dict) else str(result)
        destination.write_text(raw, encoding="utf-8")
        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def fetch_transcript(self, recording_name: str) -> Optional[str]:
        """Fetch the companion transcript for a Meet recording.

        Looks up a matching Google Doc via ``_find_matching_transcript`` and
        concatenates the text runs of its top-level paragraphs.

        Returns:
            The transcript text, or None when no doc matches, the fetch raises
            RuntimeError, or the doc yields no non-blank text.
        """
        transcript_id = self._find_matching_transcript(recording_name)
        if not transcript_id:
            logger.info(f"No matching transcript found for: {recording_name}")
            return None

        # Fetch the Google Doc content via the Docs API
        try:
            result = _run_gws(
                [
                    "docs",
                    "documents",
                    "get",
                    "--params",
                    json.dumps({"documentId": transcript_id}),
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.warning(f"Failed to fetch transcript doc {transcript_id}: {e}")
            return None

        # Walk body.content[*].paragraph.elements[*].textRun.content;
        # whitespace-only runs are dropped.
        body = result.get("body", {})
        text_parts: List[str] = []
        for element in body.get("content", []):
            paragraph = element.get("paragraph", {})
            for pe in paragraph.get("elements", []):
                text = pe.get("textRun", {}).get("content", "")
                if text.strip():
                    text_parts.append(text)

        if not text_parts:
            logger.warning(f"Transcript doc {transcript_id} had no extractable text")
            return None

        return "".join(text_parts)

    def _find_matching_transcript(self, recording_name: str) -> Optional[str]:
        """Search Drive for a transcript doc that matches a recording name.

        Meet recordings are typically named like:
            "Meet Recording 2026-03-07T14:30:00"
        And transcripts are named like:
            "Meeting Transcript 2026-03-07" or "2026-03-07 - Transcript"

        The first YYYY-MM-DD substring of the name is used as the search term;
        when there is none, the whole recording name is used instead.

        Returns:
            The ID of the first (most recently modified) match, or None.
        """
        # Extract a date string from the recording name (YYYY-MM-DD pattern)
        date_match = re.search(r"\d{4}-\d{2}-\d{2}", recording_name)
        date_str = date_match.group(0) if date_match else recording_name

        # Search for transcript docs matching the date. The term is escaped
        # because the fallback (a full recording name) may contain quotes.
        clauses = [
            "mimeType='application/vnd.google-apps.document'",
            f"name contains {self._quote_literal(date_str)}",
            "(name contains 'Transcript' or name contains 'transcript' "
            "or name contains 'Meeting notes')",
            "trashed=false",
        ]
        if self.drive_folder_id:
            clauses.append(f"{self._quote_literal(self.drive_folder_id)} in parents")
        search_query = " and ".join(clauses)

        try:
            result = _run_gws(
                [
                    "drive",
                    "files",
                    "list",
                    "--params",
                    json.dumps(
                        {
                            "q": search_query,
                            "fields": "files(id,name,modifiedTime)",
                            "pageSize": 5,
                            "orderBy": "modifiedTime desc",
                        }
                    ),
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.debug(f"Transcript search failed for '{date_str}': {e}")
            return None

        files = result.get("files", [])
        if not files:
            logger.debug(f"No transcript docs found matching '{date_str}'")
            return None

        # Results are ordered "modifiedTime desc", so index 0 is the newest.
        best = files[0]
        logger.info(f"Matched transcript: {best.get('name')} for recording: {recording_name}")
        return best.get("id")
--- a/video_processor/sources/teams_recording_source.py
+++ b/video_processor/sources/teams_recording_source.py
@@ -0,0 +1,375 @@
1
+"""Microsoft Teams meeting recording source using the m365 CLI.
2
+
3
+Fetches Teams meeting recordings and transcripts via the Microsoft Graph API
4
+through the `m365` CLI tool.
5
+
6
+Requires: npm install -g @pnp/cli-microsoft365
7
+Auth: m365 login (interactive)
8
+Docs: https://pnp.github.io/cli-microsoft365/
9
+"""
10
+
11
+import logging
12
+import re
13
+import shutil
14
+import subprocess
15
+from pathlib import Path
16
+from typing import List, Optional
17
+
18
+from video_processor.sources.base import BaseSource, SourceFile
19
+from video_processor.sources.m365_source import _run_m365
20
+
21
+logger = logging.getLogger(__name__)
22
+
23
+
24
+def _vtt_to_text(vtt: str) -> str:
25
+ """Strip VTT timing metadata and return plain text.
26
+
27
+ Removes WEBVTT headers, timestamps (00:00:00.000 --> 00:00:05.000),
28
+ cue identifiers, and deduplicates consecutive identical lines.
29
+ """
30
+ lines = vtt.splitlines()
31
+ text_lines: list[str] = []
32
+ prev_line = ""
33
+
34
+ for line in lines:
35
+ stripped = line.strip()
36
+ # Skip WEBVTT header and NOTE blocks
37
+ if stripped.startswith("WEBVTT") or stripped.startswith("NOTE"):
38
+ continue
39
+ # Skip timestamp lines (e.g. 00:00:01.000 --> 00:00:05.000)
40
+ if re.match(r"\d{2}:\d{2}[:\.][\d.]+ --> \d{2}:\d{2}[:\.][\d.]+", stripped):
41
+ continue
42
+ # Skip numeric cue identifiers
43
+ if re.match(r"^\d+$", stripped):
44
+ continue
45
+ # Skip blank lines
46
+ if not stripped:
47
+ continue
48
+ # Strip inline VTT tags like <v Speaker>
49
+ cleaned = re.sub(r"<[^>]+>", "", stripped).strip()
50
+ if cleaned and cleaned != prev_line:
51
+ text_lines.append(cleaned)
52
+ prev_line = cleaned
53
+
54
+ return "\n".join(text_lines)
55
+
56
+
57
class TeamsRecordingSource(BaseSource):
    """
    Fetch Teams meeting recordings and transcripts via the m365 CLI / Graph API.

    Usage:
        source = TeamsRecordingSource(user_id="me")
        source.authenticate()
        recordings = source.list_videos()
        source.download_all(recordings, Path("./recordings"))

        # Fetch transcript for a specific meeting
        transcript = source.fetch_transcript(meeting_id)
    """

    # Base of every Graph request issued through `m365 request`.
    _GRAPH_ROOT = "https://graph.microsoft.com/v1.0"

    def __init__(self, user_id: str = "me"):
        """Remember whose meetings to query ("me" or a user ID / UPN)."""
        self.user_id = user_id

    def _user_url(self, suffix: str = "") -> str:
        """Return the Graph URL for *suffix* under the configured user.

        "me" (and values already starting with "users/") are used verbatim;
        any other value is treated as a user ID and addressed as
        ``users/{user_id}``, since ``v1.0/{user_id}/...`` is not a Graph path.
        """
        uid = self.user_id
        segment = uid if uid == "me" or uid.startswith("users/") else f"users/{uid}"
        return f"{self._GRAPH_ROOT}/{segment}{suffix}"

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in.

        Returns:
            True when ``m365 status`` yields a dict with a truthy
            "connectedAs" or a string containing "Logged in"; False otherwise.
        """
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

        if isinstance(result, dict) and result.get("connectedAs"):
            return True
        if isinstance(result, str) and "Logged in" in result:
            return True
        logger.error("m365 not logged in. Run: m365 login")
        return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Teams meeting recordings available for the user.

        Tries multiple approaches and returns as soon as one yields results:
        1. Graph API onlineMeetings endpoint
        2. m365 teams meeting list command
        3. Fallback: search chat messages for recording links

        The ``folder_id``, ``folder_path`` and ``patterns`` arguments are
        accepted for interface compatibility and are not used here.

        Returns:
            SourceFile entries for the recordings found (possibly empty).
        """
        files: List[SourceFile] = []

        # Approach 1: Graph API — list online meetings
        try:
            result = _run_m365(
                ["request", "--url", self._user_url("/onlineMeetings"), "--method", "get"],
                timeout=60,
            )
            for meeting in self._extract_meetings_list(result):
                files.extend(self._get_meeting_recordings(meeting))

            if files:
                logger.info(f"Found {len(files)} recording(s) via Graph API onlineMeetings")
                return files
        except RuntimeError as e:
            logger.debug(f"onlineMeetings endpoint failed: {e}")

        # Approach 2: m365 teams meeting list
        try:
            result = _run_m365(["teams", "meeting", "list"], timeout=60)
            meetings = result if isinstance(result, list) else []
            for meeting in meetings:
                files.extend(self._get_meeting_recordings(meeting))

            if files:
                logger.info(f"Found {len(files)} recording(s) via m365 teams meeting list")
                return files
        except RuntimeError as e:
            logger.debug(f"teams meeting list failed: {e}")

        # Approach 3: Fallback — search chat messages for recording links
        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    self._user_url(
                        "/chats"
                        "?$expand=messages($top=50)"
                        "&$filter=chatType eq 'meeting'"
                        "&$top=25"
                    ),
                    "--method",
                    "get",
                ],
                timeout=60,
            )
            for chat in self._extract_value_list(result):
                # Fields may be present but null in the response, so fall back
                # with `or` rather than relying on dict.get defaults.
                topic = chat.get("topic") or "Meeting Recording"
                chat_id = chat.get("id", "")
                for msg in chat.get("messages") or []:
                    body = (msg.get("body") or {}).get("content") or ""
                    lowered = body.lower()
                    if "recording" in lowered or ".mp4" in lowered:
                        files.append(
                            SourceFile(
                                name=f"{topic}.mp4",
                                id=f"{chat_id}:{msg.get('id', '')}",
                                mime_type="video/mp4",
                                modified_at=msg.get("createdDateTime"),
                            )
                        )
            if files:
                logger.info(f"Found {len(files)} recording link(s) in meeting chats")
        except RuntimeError as e:
            logger.debug(f"Chat message fallback failed: {e}")

        if not files:
            logger.warning("No Teams meeting recordings found")

        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a recording via its Graph API download URL.

        Uses ``file.path`` when set; otherwise derives the content URL from
        ``file.id``, which is expected to look like "meeting_id:recording_id"
        (or just "meeting_id", in which case the meeting's first recording is
        used).

        Raises:
            RuntimeError: if the meeting has no recording to download, or if
                the underlying m365 call fails.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        url = file.path
        if not url:
            # NOTE(review): IDs produced by the chat-message fallback are
            # "chat_id:message_id", not meeting/recording IDs — presumably those
            # entries cannot be resolved through onlineMeetings; confirm.
            meeting_id, _, recording_id = file.id.partition(":")
            if not recording_id:
                # Fetch recording list to find the content URL
                listing = _run_m365(
                    [
                        "request",
                        "--url",
                        self._user_url(f"/onlineMeetings/{meeting_id}/recordings"),
                        "--method",
                        "get",
                    ],
                    timeout=60,
                )
                recordings = self._extract_value_list(listing)
                if not recordings:
                    # Previously the JSON listing itself was saved as the video.
                    raise RuntimeError(f"No recordings available for meeting {meeting_id}")
                recording_id = recordings[0].get("id", "")
            url = self._user_url(
                f"/onlineMeetings/{meeting_id}/recordings/{recording_id}/content"
            )

        _run_m365(
            ["request", "--url", url, "--method", "get", "--filePath", str(destination)],
            timeout=300,
        )

        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def fetch_transcript(self, meeting_id: str) -> Optional[str]:
        """Fetch the transcript for a Teams meeting.

        Queries the Graph API transcripts endpoint, downloads the content of
        the first transcript listed, and converts VTT format to plain text.

        Returns:
            The plain-text transcript, or None when listing or download fails,
            no transcript exists, or the content is empty.
        """
        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    self._user_url(f"/onlineMeetings/{meeting_id}/transcripts"),
                    "--method",
                    "get",
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.warning(f"Failed to list transcripts for meeting {meeting_id}: {e}")
            return None

        transcripts = self._extract_value_list(result)
        if not transcripts:
            logger.info(f"No transcripts found for meeting {meeting_id}")
            return None

        # Download the first available transcript
        transcript_id = transcripts[0].get("id", "")
        try:
            content_result = _run_m365(
                [
                    "request",
                    "--url",
                    self._user_url(
                        f"/onlineMeetings/{meeting_id}/transcripts/{transcript_id}/content"
                    ),
                    "--method",
                    "get",
                    "--accept",
                    "text/vtt",
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.warning(f"Failed to download transcript {transcript_id}: {e}")
            return None

        # content_result may be raw VTT text or a dict with raw key
        if isinstance(content_result, dict):
            raw = content_result.get("raw", "")
        else:
            raw = str(content_result)

        if not raw:
            logger.warning(f"Empty transcript content for meeting {meeting_id}")
            return None

        return _vtt_to_text(raw)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _extract_meetings_list(self, result) -> list:
        """Extract meetings list from Graph API response."""
        return self._extract_value_list(result)

    def _extract_value_list(self, result) -> list:
        """Extract value list from a Graph API response.

        A list is returned unchanged, a dict yields its "value" entry (an
        absent or null entry becomes []), and anything else yields [].
        """
        if isinstance(result, list):
            return result
        if isinstance(result, dict):
            return result.get("value") or []
        return []

    def _get_meeting_recordings(self, meeting: dict) -> List[SourceFile]:
        """Fetch recordings for a single meeting and return SourceFile entries.

        Returns an empty list when the meeting has no "id" or the recordings
        request raises RuntimeError.
        """
        meeting_id = meeting.get("id", "")
        if not meeting_id:
            return []

        # `or` chains cope with keys that are present but null, which would
        # otherwise produce names like "None.mp4".
        subject = meeting.get("subject") or meeting.get("topic") or "Teams Meeting"
        start_time = meeting.get("startDateTime") or meeting.get("createdDateTime")

        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    self._user_url(f"/onlineMeetings/{meeting_id}/recordings"),
                    "--method",
                    "get",
                ],
                timeout=60,
            )
        except RuntimeError:
            return []

        files: List[SourceFile] = []
        for rec in self._extract_value_list(result):
            # Original keys first, then the other URL properties a recording
            # entry may carry. NOTE(review): confirm the exact property names
            # against the Graph callRecording schema.
            download_url = (
                rec.get("content.downloadUrl")
                or rec.get("contentUrl")
                or rec.get("recordingContentUrl")
                or rec.get("@microsoft.graph.downloadUrl")
            )
            files.append(
                SourceFile(
                    name=f"{subject}.mp4",
                    id=f"{meeting_id}:{rec.get('id', '')}",
                    mime_type="video/mp4",
                    modified_at=start_time,
                    path=download_url,
                )
            )

        return files
--- a/video_processor/sources/teams_recording_source.py
+++ b/video_processor/sources/teams_recording_source.py
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/teams_recording_source.py
+++ b/video_processor/sources/teams_recording_source.py
@@ -0,0 +1,375 @@
1 """Microsoft Teams meeting recording source using the m365 CLI.
2
3 Fetches Teams meeting recordings and transcripts via the Microsoft Graph API
4 through the `m365` CLI tool.
5
6 Requires: npm install -g @pnp/cli-microsoft365
7 Auth: m365 login (interactive)
8 Docs: https://pnp.github.io/cli-microsoft365/
9 """
10
11 import logging
12 import re
13 import shutil
14 import subprocess
15 from pathlib import Path
16 from typing import List, Optional
17
18 from video_processor.sources.base import BaseSource, SourceFile
19 from video_processor.sources.m365_source import _run_m365
20
21 logger = logging.getLogger(__name__)
22
23
def _vtt_to_text(vtt: str) -> str:
    """Strip WebVTT structure and return the spoken text, one line per cue line.

    Removed: the ``WEBVTT`` header block (including metadata lines that follow
    it), whole ``NOTE`` / ``STYLE`` / ``REGION`` blocks, timestamp lines
    (``00:00:00.000 --> 00:00:05.000``), cue identifiers (numeric or not),
    inline tags such as ``<v Speaker>``, blank lines, and consecutive
    duplicate lines.

    Args:
        vtt: Raw WebVTT document; a leading byte-order mark is tolerated.

    Returns:
        The remaining text lines joined with ``"\\n"`` (empty string when
        nothing is left).
    """
    timestamp_re = re.compile(r"\d{2}:\d{2}[:\.][\d.]+ --> \d{2}:\d{2}[:\.][\d.]+")
    tag_re = re.compile(r"<[^>]+>")
    block_keywords = ("WEBVTT", "NOTE", "STYLE", "REGION")

    def starts_metadata_block(line: str) -> bool:
        # The keyword must stand alone or be followed by whitespace, so a
        # caption such as "NOTEBOOKS are useful" is not mistaken for a block.
        return any(
            line == kw or (line.startswith(kw) and line[len(kw)].isspace())
            for kw in block_keywords
        )

    lines = [line.strip() for line in vtt.lstrip("\ufeff").splitlines()]
    text_lines: List[str] = []
    prev_line = ""
    in_metadata = False  # inside a header/NOTE/STYLE/REGION block
    at_block_start = True  # first non-blank line after a blank line

    for index, stripped in enumerate(lines):
        if not stripped:
            # A blank line terminates whatever block we were in.
            in_metadata = False
            at_block_start = True
            continue

        block_start, at_block_start = at_block_start, False

        if timestamp_re.match(stripped):
            # Timing line; also ends a header that had no blank line after it.
            in_metadata = False
            continue
        if in_metadata:
            continue
        if starts_metadata_block(stripped):
            in_metadata = True
            continue

        # A cue identifier is the line directly above a timing line. Numeric
        # ones are dropped wherever they appear; free-form ones only at the
        # start of a block, so text in documents that omit the blank line
        # between cues is not lost.
        next_line = lines[index + 1] if index + 1 < len(lines) else ""
        if timestamp_re.match(next_line) and (
            block_start or re.match(r"^\d+$", stripped)
        ):
            continue

        cleaned = tag_re.sub("", stripped).strip()
        if cleaned and cleaned != prev_line:
            text_lines.append(cleaned)
            prev_line = cleaned

    return "\n".join(text_lines)
55
56
class TeamsRecordingSource(BaseSource):
    """
    Fetch Teams meeting recordings and transcripts via the m365 CLI / Graph API.

    Every request is issued through ``_run_m365``, so the ``m365`` CLI must be
    installed and logged in; ``authenticate()`` checks both.

    Usage:
        source = TeamsRecordingSource(user_id="me")
        source.authenticate()
        recordings = source.list_videos()
        source.download_all(recordings, Path("./recordings"))

        # Fetch transcript for a specific meeting
        transcript = source.fetch_transcript(meeting_id)
    """

    # Root of the Microsoft Graph v1.0 REST API that every URL is built on.
    _GRAPH_BASE = "https://graph.microsoft.com/v1.0"

    def __init__(self, user_id: str = "me"):
        """Create a source for one user's meetings.

        Args:
            user_id: ``"me"`` for the signed-in account, a Graph user id or
                user principal name, or an already qualified ``"users/<id>"``
                path segment.
        """
        self.user_id = user_id

    def authenticate(self) -> bool:
        """Check if m365 CLI is installed and logged in.

        Returns:
            True when ``m365 status`` reports a connected account, False
            otherwise (the reason is logged at error level).
        """
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False

        try:
            result = _run_m365(["status"], timeout=10)
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

        if isinstance(result, dict) and result.get("connectedAs"):
            return True
        if isinstance(result, str) and "Logged in" in result:
            return True
        logger.error("m365 not logged in. Run: m365 login")
        return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Teams meeting recordings available for the user.

        Tries multiple approaches and returns the results of the first one
        that finds anything:
        1. Graph API onlineMeetings endpoint
        2. m365 teams meeting list command
        3. Fallback: search chat messages for recording links

        ``folder_id``, ``folder_path`` and ``patterns`` are accepted for
        interface compatibility but are not used by this source.

        Returns:
            The recordings found, or an empty list (with a warning logged)
            when every approach came up empty.
        """
        files: List[SourceFile] = []

        # Approach 1: Graph API — list online meetings
        try:
            result = self._graph_get(self._graph_url("onlineMeetings"))
            for meeting in self._extract_meetings_list(result):
                files.extend(self._get_meeting_recordings(meeting))

            if files:
                logger.info(f"Found {len(files)} recording(s) via Graph API onlineMeetings")
                return files
        except RuntimeError as e:
            logger.debug(f"onlineMeetings endpoint failed: {e}")

        # Approach 2: m365 teams meeting list
        try:
            result = _run_m365(["teams", "meeting", "list"], timeout=60)
            meetings = result if isinstance(result, list) else []
            for meeting in meetings:
                files.extend(self._get_meeting_recordings(meeting))

            if files:
                logger.info(f"Found {len(files)} recording(s) via m365 teams meeting list")
                return files
        except RuntimeError as e:
            logger.debug(f"teams meeting list failed: {e}")

        # Approach 3: Fallback — search chat messages for recording links
        try:
            files.extend(self._list_chat_recordings())
            if files:
                logger.info(f"Found {len(files)} recording link(s) in meeting chats")
        except RuntimeError as e:
            logger.debug(f"Chat message fallback failed: {e}")

        if not files:
            logger.warning("No Teams meeting recordings found")

        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a recording to ``destination``.

        Uses ``file.path`` as the download URL when set; otherwise derives the
        Graph ``.../recordings/<id>/content`` URL from ``file.id``
        (``"<meeting_id>:<recording_id>"``), looking up the meeting's first
        recording when the recording id is missing.

        Returns:
            ``destination`` as a ``Path``.

        Raises:
            RuntimeError: When no recording can be resolved for the meeting,
                or when the m365 CLI call fails.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        # Prefer the direct URL captured at listing time.
        url = file.path or self._resolve_content_url(file)
        _run_m365(
            [
                "request",
                "--url",
                url,
                "--method",
                "get",
                "--filePath",
                str(destination),
            ],
            timeout=300,
        )

        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def fetch_transcript(self, meeting_id: str) -> Optional[str]:
        """Fetch the transcript for a Teams meeting.

        Queries the Graph API transcripts endpoint, downloads the content of
        the first transcript listed, and converts VTT format to plain text.

        Returns:
            The transcript text, or None when the meeting has no transcript,
            the content is empty, or a request fails (logged, not raised).
        """
        try:
            result = self._graph_get(self._graph_url(f"onlineMeetings/{meeting_id}/transcripts"))
        except RuntimeError as e:
            logger.warning(f"Failed to list transcripts for meeting {meeting_id}: {e}")
            return None

        transcripts = self._extract_value_list(result)
        if not transcripts:
            logger.info(f"No transcripts found for meeting {meeting_id}")
            return None

        # Download the first available transcript
        transcript_id = transcripts[0].get("id", "")
        try:
            content_result = _run_m365(
                [
                    "request",
                    "--url",
                    self._graph_url(
                        f"onlineMeetings/{meeting_id}/transcripts/{transcript_id}/content"
                    ),
                    "--method",
                    "get",
                    "--accept",
                    "text/vtt",
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.warning(f"Failed to download transcript {transcript_id}: {e}")
            return None

        # content_result may be raw VTT text or a dict with raw key
        if isinstance(content_result, dict):
            raw = content_result.get("raw") or ""
        elif content_result is None:
            # str(None) would otherwise yield the literal transcript "None".
            raw = ""
        else:
            raw = str(content_result)

        if not raw:
            logger.warning(f"Empty transcript content for meeting {meeting_id}")
            return None

        return _vtt_to_text(raw)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _graph_url(self, suffix: str) -> str:
        """Build a Graph URL for ``suffix`` under this source's user segment.

        Graph addresses the signed-in user as ``/me`` but any other user as
        ``/users/{id}``, so a bare id is qualified with ``users/`` here.
        """
        user_id = self.user_id
        if user_id == "me" or user_id.startswith("users/"):
            user_path = user_id
        else:
            user_path = f"users/{user_id}"
        return f"{self._GRAPH_BASE}/{user_path}/{suffix.lstrip('/')}"

    def _graph_get(self, url: str):
        """Issue a GET for ``url`` through ``m365 request`` and return its result."""
        return _run_m365(["request", "--url", url, "--method", "get"], timeout=60)

    def _resolve_content_url(self, file: SourceFile) -> str:
        """Derive the recording content URL from ``file.id``.

        Raises:
            RuntimeError: When the id carries no recording id and the meeting
                has no recording to fall back to.
        """
        meeting_id, _, recording_id = file.id.partition(":")
        if not recording_id:
            # Fetch recording list to find the content URL
            result = self._graph_get(self._graph_url(f"onlineMeetings/{meeting_id}/recordings"))
            recordings = self._extract_value_list(result)
            recording_id = recordings[0].get("id", "") if recordings else ""
            if not recording_id:
                # Downloading the listing URL instead would save JSON under a
                # video file name, so fail loudly.
                raise RuntimeError(f"No recording found for meeting {meeting_id}")
        return self._graph_url(f"onlineMeetings/{meeting_id}/recordings/{recording_id}/content")

    def _list_chat_recordings(self) -> List[SourceFile]:
        """Scan meeting chats for messages that mention a recording.

        NOTE(review): the entries returned here carry a ``"<chat_id>:<msg_id>"``
        id and no download URL, so ``download()`` cannot resolve them as
        meeting recordings — confirm how callers are expected to use them.
        """
        result = self._graph_get(
            self._graph_url(
                "chats"
                "?$expand=messages($top=50)"
                "&$filter=chatType eq 'meeting'"
                "&$top=25"
            )
        )

        found: List[SourceFile] = []
        for chat in self._extract_value_list(result):
            if not isinstance(chat, dict):
                continue
            # ``or`` fallbacks cover keys that are present but null.
            topic = chat.get("topic") or "Meeting Recording"
            chat_id = chat.get("id", "")
            for msg in chat.get("messages") or []:
                body = ((msg.get("body") or {}).get("content") or "").lower()
                if "recording" not in body and ".mp4" not in body:
                    continue
                found.append(
                    SourceFile(
                        name=f"{topic}.mp4",
                        id=f"{chat_id}:{msg.get('id', '')}",
                        mime_type="video/mp4",
                        modified_at=msg.get("createdDateTime"),
                    )
                )
        return found

    def _extract_meetings_list(self, result) -> list:
        """Extract meetings list from Graph API response."""
        return self._extract_value_list(result)

    def _extract_value_list(self, result) -> list:
        """Extract the item list from a Graph API response.

        A list is returned unchanged; a dict yields its ``"value"`` entry when
        that is a list; every other shape yields an empty list.
        """
        if isinstance(result, list):
            return result
        if isinstance(result, dict):
            value = result.get("value")
            return value if isinstance(value, list) else []
        return []

    def _get_meeting_recordings(self, meeting: dict) -> List[SourceFile]:
        """Fetch recordings for a single meeting and return SourceFile entries.

        Returns an empty list when the meeting is not a dict, has no id, or
        the recordings request raises ``RuntimeError``.
        """
        if not isinstance(meeting, dict):
            return []

        meeting_id = meeting.get("id", "")
        if not meeting_id:
            return []

        # ``or`` chains avoid a file literally named "None.mp4" when the key
        # exists but is null.
        subject = meeting.get("subject") or meeting.get("topic") or "Teams Meeting"
        start_time = meeting.get("startDateTime") or meeting.get("createdDateTime")

        try:
            result = self._graph_get(self._graph_url(f"onlineMeetings/{meeting_id}/recordings"))
        except RuntimeError:
            return []

        files: List[SourceFile] = []
        for rec in self._extract_value_list(result):
            if not isinstance(rec, dict):
                continue
            # callRecording exposes its download location as
            # recordingContentUrl; the legacy keys remain as fallbacks.
            download_url = (
                rec.get("recordingContentUrl")
                or rec.get("content.downloadUrl")
                or rec.get("contentUrl")
            )
            files.append(
                SourceFile(
                    name=f"{subject}.mp4",
                    id=f"{meeting_id}:{rec.get('id', '')}",
                    mime_type="video/mp4",
                    modified_at=start_time,
                    path=download_url,
                )
            )

        return files
--- a/video_processor/sources/zoom_source.py
+++ b/video_processor/sources/zoom_source.py
@@ -0,0 +1,399 @@
1
+"""Zoom cloud recordings source integration with OAuth support."""
2
+
3
+import base64
4
+import hashlib
5
+import json
6
+import logging
7
+import os
8
+import secrets
9
+import time
10
+import webbrowser
11
+from pathlib import Path
12
+from typing import Dict, List, Optional
13
+
14
+import requests
15
+
16
+from video_processor.sources.base import BaseSource, SourceFile
17
+
18
+logger = logging.getLogger(__name__)
19
+
20
# Default on-disk location of the cached OAuth token.
_TOKEN_PATH = Path.home().joinpath(".planopticon", "zoom_token.json")

# Root URLs of the Zoom REST API and of its OAuth endpoints.
_BASE_URL = "https://api.zoom.us/v2"
_OAUTH_BASE = "https://zoom.us/oauth"

# Zoom ``file_type`` value -> MIME type reported for that recording file.
_MIME_TYPES = dict(
    MP4="video/mp4",
    M4A="audio/mp4",
    CHAT="text/plain",
    TRANSCRIPT="text/vtt",
    CSV="text/csv",
    TIMELINE="application/json",
)
33
+
34
+
35
class ZoomSource(BaseSource):
    """
    Zoom cloud recordings source with OAuth2 support.

    Auth methods (tried in order):
    1. Saved token: Load from token_path, refresh if expired
    2. Server-to-Server OAuth: Uses account_id with client credentials
    3. OAuth2 Authorization Code with PKCE: Interactive browser flow
    """

    def __init__(
        self,
        client_id: Optional[str] = None,
        client_secret: Optional[str] = None,
        account_id: Optional[str] = None,
        token_path: Optional[Path] = None,
    ):
        """
        Initialize Zoom source.

        Parameters
        ----------
        client_id : str, optional
            Zoom OAuth app client ID. Falls back to ZOOM_CLIENT_ID env var.
        client_secret : str, optional
            Zoom OAuth app client secret. Falls back to ZOOM_CLIENT_SECRET env var.
        account_id : str, optional
            Zoom account ID for Server-to-Server OAuth. Falls back to ZOOM_ACCOUNT_ID env var.
        token_path : Path, optional
            Where to store/load OAuth tokens.
        """
        self.client_id = client_id or os.environ.get("ZOOM_CLIENT_ID")
        self.client_secret = client_secret or os.environ.get("ZOOM_CLIENT_SECRET")
        self.account_id = account_id or os.environ.get("ZOOM_ACCOUNT_ID")
        self.token_path = token_path or _TOKEN_PATH
        self._access_token: Optional[str] = None
        self._token_data: Optional[Dict] = None

    def authenticate(self) -> bool:
        """Authenticate with Zoom API.

        Returns
        -------
        bool
            True as soon as one of the auth methods succeeds, False otherwise.
        """
        # Try 1: Load saved token
        if self.token_path.exists():
            if self._auth_saved_token():
                return True

        # Try 2: Server-to-Server OAuth (if account_id is set)
        if self.account_id:
            return self._auth_server_to_server()

        # Try 3: OAuth2 Authorization Code flow with PKCE
        return self._auth_oauth_pkce()

    def _auth_saved_token(self) -> bool:
        """Authenticate using a saved OAuth token, refreshing if expired.

        Any failure to read or interpret the token file is treated as "no
        usable token" so that ``authenticate()`` can fall through to the next
        method; the cause is logged at debug level.
        """
        try:
            data = json.loads(self.token_path.read_text())
            expires_at = data.get("expires_at", 0)

            if time.time() < expires_at:
                # Token still valid
                self._access_token = data["access_token"]
                self._token_data = data
                logger.info("Authenticated with Zoom via saved token")
                return True

            # Token expired, try to refresh
            if data.get("refresh_token"):
                return self._refresh_token()

            # Server-to-Server tokens don't have refresh tokens;
            # fall through to re-authenticate
            return False
        except Exception as e:
            # Deliberately best-effort, but leave a trace for troubleshooting.
            logger.debug(f"Could not use saved Zoom token at {self.token_path}: {e}")
            return False

    def _auth_server_to_server(self) -> bool:
        """Authenticate using Server-to-Server OAuth (account credentials)."""
        if not self.client_id or not self.client_secret:
            logger.error(
                "Zoom client_id and client_secret required for Server-to-Server OAuth. "
                "Set ZOOM_CLIENT_ID and ZOOM_CLIENT_SECRET env vars."
            )
            return False

        try:
            resp = requests.post(
                f"{_OAUTH_BASE}/token",
                params={
                    "grant_type": "account_credentials",
                    "account_id": self.account_id,
                },
                auth=(self.client_id, self.client_secret),
                timeout=30,
            )
            resp.raise_for_status()
            token_data = resp.json()

            self._access_token = token_data["access_token"]
            self._token_data = {
                "access_token": token_data["access_token"],
                "expires_at": self._expiry_timestamp(token_data),
                "token_type": token_data.get("token_type", "bearer"),
            }

            self._save_token(self._token_data)
            logger.info("Authenticated with Zoom via Server-to-Server OAuth")
            return True
        except Exception as e:
            logger.error(f"Zoom Server-to-Server OAuth failed: {e}")
            return False

    def _auth_oauth_pkce(self) -> bool:
        """Run OAuth2 Authorization Code flow with PKCE.

        Prints the authorization URL, tries to open it in a browser, then
        reads the authorization code from standard input and exchanges it for
        tokens.
        """
        if not self.client_id:
            logger.error("Zoom client_id required for OAuth. Set ZOOM_CLIENT_ID env var.")
            return False

        try:
            # Generate PKCE code verifier and challenge
            code_verifier = secrets.token_urlsafe(64)
            code_challenge = (
                base64.urlsafe_b64encode(hashlib.sha256(code_verifier.encode("ascii")).digest())
                .rstrip(b"=")
                .decode("ascii")
            )

            authorize_url = (
                f"{_OAUTH_BASE}/authorize"
                f"?response_type=code"
                f"&client_id={self.client_id}"
                f"&redirect_uri=urn:ietf:wg:oauth:2.0:oob"
                f"&code_challenge={code_challenge}"
                f"&code_challenge_method=S256"
            )

            print(f"\nOpen this URL to authorize PlanOpticon:\n{authorize_url}\n")

            try:
                webbrowser.open(authorize_url)
            except Exception:
                # The URL was already printed, so a missing browser is not fatal.
                pass

            auth_code = input("Enter the authorization code: ").strip()

            # Exchange authorization code for tokens
            payload = {
                "grant_type": "authorization_code",
                "code": auth_code,
                "redirect_uri": "urn:ietf:wg:oauth:2.0:oob",
                "code_verifier": code_verifier,
            }

            resp = requests.post(
                f"{_OAUTH_BASE}/token",
                data=payload,
                auth=(self.client_id, self.client_secret or ""),
                timeout=30,
            )
            resp.raise_for_status()
            token_data = resp.json()

            self._access_token = token_data["access_token"]
            self._token_data = {
                "access_token": token_data["access_token"],
                "refresh_token": token_data.get("refresh_token"),
                "expires_at": self._expiry_timestamp(token_data),
                "token_type": token_data.get("token_type", "bearer"),
                "client_id": self.client_id,
                "client_secret": self.client_secret or "",
            }

            self._save_token(self._token_data)
            logger.info("Authenticated with Zoom via OAuth PKCE")
            return True
        except Exception as e:
            logger.error(f"Zoom OAuth PKCE failed: {e}")
            return False

    def _refresh_token(self) -> bool:
        """Refresh an expired OAuth token.

        Reads the refresh token (and, when stored, the client credentials)
        from ``token_path``, falling back to this instance's credentials.
        """
        try:
            data = json.loads(self.token_path.read_text())
            refresh_token = data.get("refresh_token")
            client_id = data.get("client_id") or self.client_id
            client_secret = data.get("client_secret") or self.client_secret

            if not refresh_token or not client_id:
                return False

            resp = requests.post(
                f"{_OAUTH_BASE}/token",
                data={
                    "grant_type": "refresh_token",
                    "refresh_token": refresh_token,
                },
                auth=(client_id, client_secret or ""),
                timeout=30,
            )
            resp.raise_for_status()
            token_data = resp.json()

            self._access_token = token_data["access_token"]
            self._token_data = {
                "access_token": token_data["access_token"],
                # Keep the old refresh token when the response omits a new one.
                "refresh_token": token_data.get("refresh_token", refresh_token),
                "expires_at": self._expiry_timestamp(token_data),
                "token_type": token_data.get("token_type", "bearer"),
                "client_id": client_id,
                "client_secret": client_secret or "",
            }

            self._save_token(self._token_data)
            logger.info("Refreshed Zoom OAuth token")
            return True
        except Exception as e:
            logger.error(f"Zoom token refresh failed: {e}")
            return False

    @staticmethod
    def _expiry_timestamp(token_data: Dict) -> float:
        """Return the epoch time at which a freshly issued token is treated as expired.

        Uses the response's ``expires_in`` (default 3600 s) minus a 60 s
        safety margin, so the token is renewed shortly before it lapses.
        """
        return time.time() + token_data.get("expires_in", 3600) - 60

    def _save_token(self, data: Dict) -> None:
        """Save token data to disk, readable by the owner only.

        The file holds the access token and may hold the refresh token and
        client secret, so it is created with mode 0o600 rather than the
        process default.
        """
        self.token_path.parent.mkdir(parents=True, exist_ok=True)
        fd = os.open(self.token_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as handle:
            handle.write(json.dumps(data))
        try:
            # The mode passed to os.open only applies when the file is
            # created; tighten files left behind by earlier versions too.
            os.chmod(self.token_path, 0o600)
        except OSError as e:
            logger.warning(f"Could not restrict permissions on {self.token_path}: {e}")
        logger.info(f"OAuth token saved to {self.token_path}")

    def _api_get(self, endpoint: str, params: Optional[Dict] = None) -> requests.Response:
        """Make an authenticated GET request to the Zoom API.

        Raises
        ------
        RuntimeError
            If ``authenticate()`` has not produced an access token yet.
        requests.HTTPError
            If the response status indicates an error.
        """
        if not self._access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        url = f"{_BASE_URL}/{endpoint.lstrip('/')}"
        resp = requests.get(
            url,
            headers={"Authorization": f"Bearer {self._access_token}"},
            params=params,
            timeout=30,
        )
        resp.raise_for_status()
        return resp

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List video files from Zoom cloud recordings.

        Follows ``next_page_token`` until the listing is exhausted and emits
        one entry per recording file; all files of a meeting share that
        meeting's id. ``folder_id`` and ``folder_path`` are not used.

        Parameters
        ----------
        patterns : list of str, optional
            Suffix filters such as ``"*.mp4"``; ``*`` is removed and the rest
            must match the end of the generated file name.

        Raises
        ------
        RuntimeError
            If not authenticated.
        """
        if not self._access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        # Computed once: the suffixes do not change between files.
        suffixes = tuple(p.replace("*", "") for p in patterns) if patterns else ()

        files: List[SourceFile] = []
        next_page_token = ""

        while True:
            params: Dict = {}
            if next_page_token:
                params["next_page_token"] = next_page_token

            data = self._api_get("users/me/recordings", params=params).json()

            for meeting in data.get("meetings", []):
                meeting_id = str(meeting.get("id", ""))
                topic = meeting.get("topic") or "Untitled Meeting"
                start_time = meeting.get("start_time")

                for rec_file in meeting.get("recording_files", []):
                    file_type = rec_file.get("file_type") or ""

                    # Build a descriptive name; a null file_extension falls
                    # back to the file type instead of raising.
                    file_ext = (rec_file.get("file_extension") or file_type).lower()
                    file_name = f"{topic}.{file_ext}"

                    if patterns and not file_name.endswith(suffixes):
                        continue

                    files.append(
                        SourceFile(
                            name=file_name,
                            id=meeting_id,
                            size_bytes=rec_file.get("file_size"),
                            mime_type=_MIME_TYPES.get(file_type),
                            modified_at=start_time,
                            path=rec_file.get("download_url"),
                        )
                    )

            next_page_token = data.get("next_page_token", "")
            if not next_page_token:
                break

        logger.info(f"Found {len(files)} recordings in Zoom")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a recording file from Zoom.

        Streams ``file.path`` to ``destination`` in 8 KiB chunks.

        Raises
        ------
        RuntimeError
            If not authenticated.
        ValueError
            If the file has no download URL.
        """
        if not self._access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        download_url = file.path
        if not download_url:
            raise ValueError(f"No download URL for file: {file.name}")

        resp = requests.get(
            download_url,
            headers={"Authorization": f"Bearer {self._access_token}"},
            stream=True,
            timeout=60,
        )
        try:
            resp.raise_for_status()
            with open(destination, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        finally:
            # A streamed response holds its connection until closed.
            resp.close()

        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def fetch_transcript(self, meeting_id: str) -> Optional[str]:
        """
        Fetch the transcript (VTT) for a Zoom meeting recording.

        Looks for transcript files in the recording's file list and downloads
        the content of the first one that has a download URL.

        Parameters
        ----------
        meeting_id : str
            The Zoom meeting ID.

        Returns
        -------
        str or None
            Transcript text as served by Zoom if available, None otherwise
            (including when a request fails; the error is logged).

        Raises
        ------
        RuntimeError
            If not authenticated.
        """
        if not self._access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        try:
            data = self._api_get(f"meetings/{meeting_id}/recordings").json()

            for rec_file in data.get("recording_files", []):
                if rec_file.get("file_type", "") != "TRANSCRIPT":
                    continue
                download_url = rec_file.get("download_url")
                if not download_url:
                    continue
                dl_resp = requests.get(
                    download_url,
                    headers={"Authorization": f"Bearer {self._access_token}"},
                    timeout=30,
                )
                dl_resp.raise_for_status()
                logger.info(f"Fetched transcript for meeting {meeting_id}")
                return dl_resp.text

            logger.info(f"No transcript found for meeting {meeting_id}")
            return None
        except Exception as e:
            logger.error(f"Failed to fetch transcript for meeting {meeting_id}: {e}")
            return None
--- a/video_processor/sources/zoom_source.py
+++ b/video_processor/sources/zoom_source.py
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/zoom_source.py
+++ b/video_processor/sources/zoom_source.py
@@ -0,0 +1,399 @@
1 """Zoom cloud recordings source integration with OAuth support."""
2
3 import base64
4 import hashlib
5 import json
6 import logging
7 import os
8 import secrets
9 import time
10 import webbrowser
11 from pathlib import Path
12 from typing import Dict, List, Optional
13
14 import requests
15
16 from video_processor.sources.base import BaseSource, SourceFile
17
logger = logging.getLogger(__name__)

# Default on-disk location of the persisted OAuth token (JSON).
_TOKEN_PATH = Path.home() / ".planopticon" / "zoom_token.json"
# Root of the Zoom REST API; endpoints are appended to this.
_BASE_URL = "https://api.zoom.us/v2"
# Root of the Zoom OAuth endpoints ("/authorize", "/token").
_OAUTH_BASE = "https://zoom.us/oauth"

# Map Zoom file_type values to MIME types.
# File types missing from this map resolve to None via ``_MIME_TYPES.get``.
_MIME_TYPES = {
    "MP4": "video/mp4",
    "M4A": "audio/mp4",
    "CHAT": "text/plain",
    "TRANSCRIPT": "text/vtt",
    "CC": "text/vtt",  # closed captions are delivered as VTT, like transcripts
    "CSV": "text/csv",
    "TIMELINE": "application/json",
    "SUMMARY": "application/json",
}
33
34
class ZoomSource(BaseSource):
    """
    Zoom cloud recordings source with OAuth2 support.

    Auth methods (tried in order):
    1. Saved token: Load from token_path, refresh if expired
    2. Server-to-Server OAuth: Uses account_id with client credentials
    3. OAuth2 Authorization Code with PKCE: Interactive browser flow

    Tokens obtained by any of these methods are persisted to ``token_path``
    as JSON so later runs can reuse them.
    """

    # Out-of-band redirect URI: the user copies the authorization code back
    # into the terminal by hand instead of being redirected to a local server.
    _REDIRECT_URI = "urn:ietf:wg:oauth:2.0:oob"

    def __init__(
        self,
        client_id: Optional[str] = None,
        client_secret: Optional[str] = None,
        account_id: Optional[str] = None,
        token_path: Optional[Path] = None,
    ):
        """
        Initialize Zoom source.

        Parameters
        ----------
        client_id : str, optional
            Zoom OAuth app client ID. Falls back to ZOOM_CLIENT_ID env var.
        client_secret : str, optional
            Zoom OAuth app client secret. Falls back to ZOOM_CLIENT_SECRET env var.
        account_id : str, optional
            Zoom account ID for Server-to-Server OAuth. Falls back to ZOOM_ACCOUNT_ID env var.
        token_path : Path, optional
            Where to store/load OAuth tokens.
        """
        self.client_id = client_id or os.environ.get("ZOOM_CLIENT_ID")
        self.client_secret = client_secret or os.environ.get("ZOOM_CLIENT_SECRET")
        self.account_id = account_id or os.environ.get("ZOOM_ACCOUNT_ID")
        self.token_path = token_path or _TOKEN_PATH
        self._access_token: Optional[str] = None
        self._token_data: Optional[Dict] = None

    def authenticate(self) -> bool:
        """
        Authenticate with Zoom API.

        Returns
        -------
        bool
            True once an access token has been obtained, False otherwise.
            When ``account_id`` is set, a failed Server-to-Server attempt is
            final: the interactive PKCE flow is not tried afterwards.
        """
        # Try 1: Load saved token
        if self.token_path.exists() and self._auth_saved_token():
            return True

        # Try 2: Server-to-Server OAuth (if account_id is set)
        if self.account_id:
            return self._auth_server_to_server()

        # Try 3: OAuth2 Authorization Code flow with PKCE
        return self._auth_oauth_pkce()

    def _auth_headers(self) -> Dict[str, str]:
        """Return the bearer Authorization header for the current access token."""
        return {"Authorization": f"Bearer {self._access_token}"}

    def _auth_saved_token(self) -> bool:
        """
        Authenticate using a saved OAuth token, refreshing if expired.

        Returns
        -------
        bool
            True if the saved token is still valid or was refreshed, False if
            it is expired without a refresh token or could not be read.
        """
        try:
            data = json.loads(self.token_path.read_text())

            if time.time() < data.get("expires_at", 0):
                # Token still valid
                self._access_token = data["access_token"]
                self._token_data = data
                logger.info("Authenticated with Zoom via saved token")
                return True

            # Token expired, try to refresh
            if data.get("refresh_token"):
                return self._refresh_token()

            # Server-to-Server tokens don't have refresh tokens;
            # fall through to re-authenticate
            return False
        except Exception as e:
            # An unreadable or malformed token file is not fatal: the caller
            # moves on to the next auth method. Record why it was skipped.
            logger.warning(f"Ignoring saved Zoom token at {self.token_path}: {e}")
            return False

    def _auth_server_to_server(self) -> bool:
        """
        Authenticate using Server-to-Server OAuth (account credentials).

        Returns
        -------
        bool
            True if a token was obtained and saved, False if credentials are
            missing or the token request failed.
        """
        if not self.client_id or not self.client_secret:
            logger.error(
                "Zoom client_id and client_secret required for Server-to-Server OAuth. "
                "Set ZOOM_CLIENT_ID and ZOOM_CLIENT_SECRET env vars."
            )
            return False

        try:
            resp = requests.post(
                f"{_OAUTH_BASE}/token",
                params={
                    "grant_type": "account_credentials",
                    "account_id": self.account_id,
                },
                auth=(self.client_id, self.client_secret),
                timeout=30,
            )
            resp.raise_for_status()
            token_data = resp.json()

            self._access_token = token_data["access_token"]
            self._token_data = {
                "access_token": token_data["access_token"],
                # Expire 60 seconds early so a token is never used right at its deadline
                "expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
                "token_type": token_data.get("token_type", "bearer"),
            }

            self._save_token(self._token_data)
            logger.info("Authenticated with Zoom via Server-to-Server OAuth")
            return True
        except Exception as e:
            logger.error(f"Zoom Server-to-Server OAuth failed: {e}")
            return False

    def _auth_oauth_pkce(self) -> bool:
        """
        Run OAuth2 Authorization Code flow with PKCE.

        Prints the authorization URL, tries to open it in a browser, then
        reads the authorization code from stdin and exchanges it for tokens.

        Returns
        -------
        bool
            True if tokens were obtained and saved, False otherwise.
        """
        if not self.client_id:
            logger.error("Zoom client_id required for OAuth. Set ZOOM_CLIENT_ID env var.")
            return False

        # Local import: only this interactive flow needs query-string encoding.
        from urllib.parse import urlencode

        try:
            # Generate PKCE code verifier and challenge
            code_verifier = secrets.token_urlsafe(64)
            code_challenge = (
                base64.urlsafe_b64encode(hashlib.sha256(code_verifier.encode("ascii")).digest())
                .rstrip(b"=")
                .decode("ascii")
            )

            # Encode every parameter so reserved characters in the client ID
            # or redirect URI cannot corrupt the query string.
            query = urlencode(
                {
                    "response_type": "code",
                    "client_id": self.client_id,
                    "redirect_uri": self._REDIRECT_URI,
                    "code_challenge": code_challenge,
                    "code_challenge_method": "S256",
                }
            )
            authorize_url = f"{_OAUTH_BASE}/authorize?{query}"

            print(f"\nOpen this URL to authorize PlanOpticon:\n{authorize_url}\n")

            try:
                webbrowser.open(authorize_url)
            except Exception as e:
                # Best effort only: the URL was already printed for manual use.
                logger.debug(f"Could not open a browser automatically: {e}")

            auth_code = input("Enter the authorization code: ").strip()
            if not auth_code:
                # Avoid a pointless token request that can only be rejected.
                logger.error("Zoom OAuth PKCE failed: no authorization code entered")
                return False

            # Exchange authorization code for tokens
            payload = {
                "grant_type": "authorization_code",
                "code": auth_code,
                "redirect_uri": self._REDIRECT_URI,
                "code_verifier": code_verifier,
            }

            resp = requests.post(
                f"{_OAUTH_BASE}/token",
                data=payload,
                auth=(self.client_id, self.client_secret or ""),
                timeout=30,
            )
            resp.raise_for_status()
            token_data = resp.json()

            self._access_token = token_data["access_token"]
            self._token_data = {
                "access_token": token_data["access_token"],
                "refresh_token": token_data.get("refresh_token"),
                # Expire 60 seconds early so a token is never used right at its deadline
                "expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
                "token_type": token_data.get("token_type", "bearer"),
                # Client credentials are stored so a later refresh can run
                # without them being supplied again.
                "client_id": self.client_id,
                "client_secret": self.client_secret or "",
            }

            self._save_token(self._token_data)
            logger.info("Authenticated with Zoom via OAuth PKCE")
            return True
        except Exception as e:
            logger.error(f"Zoom OAuth PKCE failed: {e}")
            return False

    def _refresh_token(self) -> bool:
        """
        Refresh an expired OAuth token.

        Reads the refresh token (and, when present, the client credentials)
        from the saved token file, falling back to this instance's
        credentials.

        Returns
        -------
        bool
            True if a new token was obtained and saved, False otherwise.
        """
        try:
            data = json.loads(self.token_path.read_text())
            refresh_token = data.get("refresh_token")
            client_id = data.get("client_id") or self.client_id
            client_secret = data.get("client_secret") or self.client_secret

            if not refresh_token or not client_id:
                return False

            resp = requests.post(
                f"{_OAUTH_BASE}/token",
                data={
                    "grant_type": "refresh_token",
                    "refresh_token": refresh_token,
                },
                auth=(client_id, client_secret or ""),
                timeout=30,
            )
            resp.raise_for_status()
            token_data = resp.json()

            self._access_token = token_data["access_token"]
            self._token_data = {
                "access_token": token_data["access_token"],
                # Keep the old refresh token if the response does not carry a new one
                "refresh_token": token_data.get("refresh_token", refresh_token),
                # Expire 60 seconds early so a token is never used right at its deadline
                "expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
                "token_type": token_data.get("token_type", "bearer"),
                "client_id": client_id,
                "client_secret": client_secret or "",
            }

            self._save_token(self._token_data)
            logger.info("Refreshed Zoom OAuth token")
            return True
        except Exception as e:
            logger.error(f"Zoom token refresh failed: {e}")
            return False

    def _save_token(self, data: Dict) -> None:
        """
        Save token data to disk.

        The file holds bearer/refresh tokens and possibly the client secret,
        so it is created with (and tightened to) owner-only permissions
        before the content is written.

        Parameters
        ----------
        data : dict
            JSON-serializable token data to persist.
        """
        self.token_path.parent.mkdir(parents=True, exist_ok=True)

        # Create the file with restrictive permissions first so the secrets
        # are never written into a file with default permissions.
        self.token_path.touch(mode=0o600, exist_ok=True)
        try:
            # touch() does not change the mode of a pre-existing file.
            self.token_path.chmod(0o600)
        except OSError as e:
            logger.debug(f"Could not restrict permissions on {self.token_path}: {e}")

        self.token_path.write_text(json.dumps(data))
        logger.info(f"OAuth token saved to {self.token_path}")

    def _api_get(self, endpoint: str, params: Optional[Dict] = None) -> requests.Response:
        """
        Make an authenticated GET request to the Zoom API.

        Parameters
        ----------
        endpoint : str
            Path relative to the API base URL; a leading slash is ignored.
        params : dict, optional
            Query parameters for the request.

        Returns
        -------
        requests.Response
            The response, after ``raise_for_status()`` has passed.

        Raises
        ------
        RuntimeError
            If no access token is set.
        requests.HTTPError
            If the API answers with an error status.
        """
        if not self._access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        url = f"{_BASE_URL}/{endpoint.lstrip('/')}"
        resp = requests.get(
            url,
            headers=self._auth_headers(),
            params=params,
            timeout=30,
        )
        resp.raise_for_status()
        return resp

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """
        List video files from Zoom cloud recordings.

        Parameters
        ----------
        folder_id : str, optional
            Unused by this source.
        folder_path : str, optional
            Unused by this source.
        patterns : list of str, optional
            Suffix filters such as ``"*.mp4"``. Every ``*`` is removed and the
            remainder is compared, case-insensitively, against the end of the
            generated file name.

        Returns
        -------
        list of SourceFile
            One entry per recording file; ``id`` is the meeting ID and
            ``path`` is the file's download URL.

        Raises
        ------
        RuntimeError
            If no access token is set.
        """
        if not self._access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        # Pattern suffixes do not change between files: compute them once.
        suffixes = tuple(p.replace("*", "").lower() for p in patterns) if patterns else ()

        files: List[SourceFile] = []
        next_page_token = ""

        # NOTE(review): no ``from``/``to`` date window is sent, so the API's
        # default range applies — confirm that default covers the recordings
        # callers expect to see.
        while True:
            params: Dict = {}
            if next_page_token:
                params["next_page_token"] = next_page_token

            data = self._api_get("users/me/recordings", params=params).json()

            for meeting in data.get("meetings", []):
                meeting_id = str(meeting.get("id", ""))
                topic = meeting.get("topic") or "Untitled Meeting"
                start_time = meeting.get("start_time")

                for rec_file in meeting.get("recording_files", []):
                    file_type = rec_file.get("file_type", "")

                    # Build a descriptive name. ``or`` (rather than a .get
                    # default) also covers a present-but-null extension.
                    file_ext = (rec_file.get("file_extension") or file_type or "").lower()
                    file_name = f"{topic}.{file_ext}"

                    if suffixes and not file_name.lower().endswith(suffixes):
                        continue

                    files.append(
                        SourceFile(
                            name=file_name,
                            id=meeting_id,
                            size_bytes=rec_file.get("file_size"),
                            mime_type=_MIME_TYPES.get(file_type),
                            modified_at=start_time,
                            path=rec_file.get("download_url"),
                        )
                    )

            next_page_token = data.get("next_page_token", "")
            if not next_page_token:
                break

        logger.info(f"Found {len(files)} recordings in Zoom")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """
        Download a recording file from Zoom.

        Parameters
        ----------
        file : SourceFile
            Entry whose ``path`` holds the download URL.
        destination : Path
            Target file path; parent directories are created as needed.

        Returns
        -------
        Path
            The destination path.

        Raises
        ------
        RuntimeError
            If no access token is set.
        ValueError
            If ``file`` has no download URL.
        requests.HTTPError
            If the download request returns an error status.
        """
        if not self._access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        download_url = file.path
        if not download_url:
            raise ValueError(f"No download URL for file: {file.name}")

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        resp = requests.get(
            download_url,
            headers=self._auth_headers(),
            stream=True,
            timeout=60,
        )
        try:
            resp.raise_for_status()

            with open(destination, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        finally:
            # A streamed response holds its connection until closed; release
            # it on both the success and the error path.
            resp.close()

        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def fetch_transcript(self, meeting_id: str) -> Optional[str]:
        """
        Fetch the transcript (VTT) for a Zoom meeting recording.

        Looks for transcript files in the recording's file list and downloads
        the content as text.

        Parameters
        ----------
        meeting_id : str
            The Zoom meeting ID.

        Returns
        -------
        str or None
            Transcript text if available, None otherwise. Request and parsing
            failures are logged and also yield None.

        Raises
        ------
        RuntimeError
            If no access token is set.
        """
        if not self._access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        try:
            data = self._api_get(f"meetings/{meeting_id}/recordings").json()

            for rec_file in data.get("recording_files", []):
                if rec_file.get("file_type", "") != "TRANSCRIPT":
                    continue

                download_url = rec_file.get("download_url")
                if not download_url:
                    # A transcript entry without a URL is unusable; keep looking.
                    continue

                dl_resp = requests.get(
                    download_url,
                    headers=self._auth_headers(),
                    timeout=30,
                )
                dl_resp.raise_for_status()
                logger.info(f"Fetched transcript for meeting {meeting_id}")
                return dl_resp.text

            logger.info(f"No transcript found for meeting {meeting_id}")
            return None
        except Exception as e:
            logger.error(f"Failed to fetch transcript for meeting {meeting_id}: {e}")
            return None

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button