PlanOpticon
feat(sources): add Zoom, Teams, and Meet recording connectors

- ZoomSource: OAuth2 with PKCE, Server-to-Server, and saved token auth
- TeamsRecordingSource: m365 CLI wrapper with Graph API fallbacks
- MeetRecordingSource: gws CLI wrapper searching Drive for recordings
- CLI: planopticon recordings {zoom-list,teams-list,meet-list}
- 29 new tests (757 total passing)
Commit
4bdd77c40949daad3589e28cfd5f2aa8bf1250d2754974b2ec36b13c9dd1a9fc
Parent
82dc7b969aca623…
7 files changed
+29
+228
+102
+6
+280
+375
+399
+29
| --- tests/test_cli.py | ||
| +++ tests/test_cli.py | ||
| @@ -22,10 +22,11 @@ | ||
| 22 | 22 | assert "query" in result.output |
| 23 | 23 | assert "agent" in result.output |
| 24 | 24 | assert "kg" in result.output |
| 25 | 25 | assert "gws" in result.output |
| 26 | 26 | assert "m365" in result.output |
| 27 | + assert "recordings" in result.output | |
| 27 | 28 | assert "ingest" in result.output |
| 28 | 29 | assert "batch" in result.output |
| 29 | 30 | |
| 30 | 31 | |
| 31 | 32 | class TestAnalyzeHelp: |
| @@ -193,13 +194,41 @@ | ||
| 193 | 194 | assert result.exit_code == 0 |
| 194 | 195 | assert "--web-url" in result.output |
| 195 | 196 | assert "--file-id" in result.output |
| 196 | 197 | assert "--db-path" in result.output |
| 197 | 198 | |
| 199 | + | |
| 200 | +class TestRecordingsHelp: | |
| 201 | + def test_group_help(self): | |
| 202 | + runner = CliRunner() | |
| 203 | + result = runner.invoke(cli, ["recordings", "--help"]) | |
| 204 | + assert result.exit_code == 0 | |
| 205 | + assert "zoom-list" in result.output | |
| 206 | + assert "teams-list" in result.output | |
| 207 | + assert "meet-list" in result.output | |
| 208 | + | |
| 209 | + def test_zoom_list_help(self): | |
| 210 | + runner = CliRunner() | |
| 211 | + result = runner.invoke(cli, ["recordings", "zoom-list", "--help"]) | |
| 212 | + assert result.exit_code == 0 | |
| 213 | + assert "ZOOM_CLIENT_ID" in result.output | |
| 214 | + | |
| 215 | + def test_teams_list_help(self): | |
| 216 | + runner = CliRunner() | |
| 217 | + result = runner.invoke(cli, ["recordings", "teams-list", "--help"]) | |
| 218 | + assert result.exit_code == 0 | |
| 219 | + assert "--user-id" in result.output | |
| 220 | + | |
| 221 | + def test_meet_list_help(self): | |
| 222 | + runner = CliRunner() | |
| 223 | + result = runner.invoke(cli, ["recordings", "meet-list", "--help"]) | |
| 224 | + assert result.exit_code == 0 | |
| 225 | + assert "--folder-id" in result.output | |
| 226 | + | |
| 198 | 227 | |
| 199 | 228 | class TestAuthHelp: |
| 200 | 229 | def test_help(self): |
| 201 | 230 | runner = CliRunner() |
| 202 | 231 | result = runner.invoke(cli, ["auth", "--help"]) |
| 203 | 232 | assert result.exit_code == 0 |
| 204 | 233 | assert "google" in result.output |
| 205 | 234 | assert "dropbox" in result.output |
| 206 | 235 |
| --- tests/test_cli.py | |
| +++ tests/test_cli.py | |
| @@ -22,10 +22,11 @@ | |
| 22 | assert "query" in result.output |
| 23 | assert "agent" in result.output |
| 24 | assert "kg" in result.output |
| 25 | assert "gws" in result.output |
| 26 | assert "m365" in result.output |
| 27 | assert "ingest" in result.output |
| 28 | assert "batch" in result.output |
| 29 | |
| 30 | |
| 31 | class TestAnalyzeHelp: |
| @@ -193,13 +194,41 @@ | |
| 193 | assert result.exit_code == 0 |
| 194 | assert "--web-url" in result.output |
| 195 | assert "--file-id" in result.output |
| 196 | assert "--db-path" in result.output |
| 197 | |
| 198 | |
| 199 | class TestAuthHelp: |
| 200 | def test_help(self): |
| 201 | runner = CliRunner() |
| 202 | result = runner.invoke(cli, ["auth", "--help"]) |
| 203 | assert result.exit_code == 0 |
| 204 | assert "google" in result.output |
| 205 | assert "dropbox" in result.output |
| 206 |
| --- tests/test_cli.py | |
| +++ tests/test_cli.py | |
| @@ -22,10 +22,11 @@ | |
| 22 | assert "query" in result.output |
| 23 | assert "agent" in result.output |
| 24 | assert "kg" in result.output |
| 25 | assert "gws" in result.output |
| 26 | assert "m365" in result.output |
| 27 | assert "recordings" in result.output |
| 28 | assert "ingest" in result.output |
| 29 | assert "batch" in result.output |
| 30 | |
| 31 | |
| 32 | class TestAnalyzeHelp: |
| @@ -193,13 +194,41 @@ | |
| 194 | assert result.exit_code == 0 |
| 195 | assert "--web-url" in result.output |
| 196 | assert "--file-id" in result.output |
| 197 | assert "--db-path" in result.output |
| 198 | |
| 199 | |
| 200 | class TestRecordingsHelp: |
| 201 | def test_group_help(self): |
| 202 | runner = CliRunner() |
| 203 | result = runner.invoke(cli, ["recordings", "--help"]) |
| 204 | assert result.exit_code == 0 |
| 205 | assert "zoom-list" in result.output |
| 206 | assert "teams-list" in result.output |
| 207 | assert "meet-list" in result.output |
| 208 | |
| 209 | def test_zoom_list_help(self): |
| 210 | runner = CliRunner() |
| 211 | result = runner.invoke(cli, ["recordings", "zoom-list", "--help"]) |
| 212 | assert result.exit_code == 0 |
| 213 | assert "ZOOM_CLIENT_ID" in result.output |
| 214 | |
| 215 | def test_teams_list_help(self): |
| 216 | runner = CliRunner() |
| 217 | result = runner.invoke(cli, ["recordings", "teams-list", "--help"]) |
| 218 | assert result.exit_code == 0 |
| 219 | assert "--user-id" in result.output |
| 220 | |
| 221 | def test_meet_list_help(self): |
| 222 | runner = CliRunner() |
| 223 | result = runner.invoke(cli, ["recordings", "meet-list", "--help"]) |
| 224 | assert result.exit_code == 0 |
| 225 | assert "--folder-id" in result.output |
| 226 | |
| 227 | |
| 228 | class TestAuthHelp: |
| 229 | def test_help(self): |
| 230 | runner = CliRunner() |
| 231 | result = runner.invoke(cli, ["auth", "--help"]) |
| 232 | assert result.exit_code == 0 |
| 233 | assert "google" in result.output |
| 234 | assert "dropbox" in result.output |
| 235 |
+228
| --- tests/test_sources.py | ||
| +++ tests/test_sources.py | ||
| @@ -1340,5 +1340,233 @@ | ||
| 1340 | 1340 | html = "&lt;tag&gt; &quot;quoted&quot; &#39;apos&#39;&nbsp;space" |
| 1341 | 1341 | result = _html_to_text(html) |
| 1342 | 1342 | assert "<tag>" in result |
| 1343 | 1343 | assert '"quoted"' in result |
| 1344 | 1344 | assert "'apos'" in result |
| 1345 | + | |
| 1346 | + | |
| 1347 | +# --------------------------------------------------------------------------- | |
| 1348 | +# ZoomSource | |
| 1349 | +# --------------------------------------------------------------------------- | |
| 1350 | + | |
| 1351 | + | |
| 1352 | +class TestZoomSource: | |
| 1353 | + def test_import(self): | |
| 1354 | + from video_processor.sources.zoom_source import ZoomSource | |
| 1355 | + | |
| 1356 | + assert ZoomSource is not None | |
| 1357 | + | |
| 1358 | + def test_constructor_defaults(self): | |
| 1359 | + from video_processor.sources.zoom_source import ZoomSource | |
| 1360 | + | |
| 1361 | + src = ZoomSource() | |
| 1362 | + assert src.client_id is None or isinstance(src.client_id, str) | |
| 1363 | + assert src._access_token is None | |
| 1364 | + | |
| 1365 | + def test_constructor_explicit(self): | |
| 1366 | + from video_processor.sources.zoom_source import ZoomSource | |
| 1367 | + | |
| 1368 | + src = ZoomSource( | |
| 1369 | + client_id="cid", | |
| 1370 | + client_secret="csec", | |
| 1371 | + account_id="aid", | |
| 1372 | + ) | |
| 1373 | + assert src.client_id == "cid" | |
| 1374 | + assert src.client_secret == "csec" | |
| 1375 | + assert src.account_id == "aid" | |
| 1376 | + | |
| 1377 | + def test_authenticate_no_credentials(self): | |
| 1378 | + from video_processor.sources.zoom_source import ZoomSource | |
| 1379 | + | |
| 1380 | + src = ZoomSource(client_id=None, client_secret=None, account_id=None) | |
| 1381 | + # No saved token, no account_id, no client_id → should fail | |
| 1382 | + assert src.authenticate() is False | |
| 1383 | + | |
| 1384 | + def test_list_videos_not_authenticated(self): | |
| 1385 | + from video_processor.sources.zoom_source import ZoomSource | |
| 1386 | + | |
| 1387 | + src = ZoomSource() | |
| 1388 | + with pytest.raises(RuntimeError, match="Not authenticated"): | |
| 1389 | + src.list_videos() | |
| 1390 | + | |
| 1391 | + def test_download_not_authenticated(self): | |
| 1392 | + from video_processor.sources.zoom_source import ZoomSource | |
| 1393 | + | |
| 1394 | + src = ZoomSource() | |
| 1395 | + sf = SourceFile(name="test.mp4", id="123") | |
| 1396 | + with pytest.raises(RuntimeError, match="Not authenticated"): | |
| 1397 | + src.download(sf, "/tmp/test.mp4") | |
| 1398 | + | |
| 1399 | + def test_fetch_transcript_not_authenticated(self): | |
| 1400 | + from video_processor.sources.zoom_source import ZoomSource | |
| 1401 | + | |
| 1402 | + src = ZoomSource() | |
| 1403 | + with pytest.raises(RuntimeError, match="Not authenticated"): | |
| 1404 | + src.fetch_transcript("meeting123") | |
| 1405 | + | |
| 1406 | + def test_mime_types_mapping(self): | |
| 1407 | + from video_processor.sources.zoom_source import _MIME_TYPES | |
| 1408 | + | |
| 1409 | + assert _MIME_TYPES["MP4"] == "video/mp4" | |
| 1410 | + assert _MIME_TYPES["TRANSCRIPT"] == "text/vtt" | |
| 1411 | + assert _MIME_TYPES["M4A"] == "audio/mp4" | |
| 1412 | + | |
| 1413 | + | |
| 1414 | +# --------------------------------------------------------------------------- | |
| 1415 | +# TeamsRecordingSource | |
| 1416 | +# --------------------------------------------------------------------------- | |
| 1417 | + | |
| 1418 | + | |
| 1419 | +class TestTeamsRecordingSource: | |
| 1420 | + def test_import(self): | |
| 1421 | + from video_processor.sources.teams_recording_source import ( | |
| 1422 | + TeamsRecordingSource, | |
| 1423 | + ) | |
| 1424 | + | |
| 1425 | + assert TeamsRecordingSource is not None | |
| 1426 | + | |
| 1427 | + def test_constructor_default(self): | |
| 1428 | + from video_processor.sources.teams_recording_source import ( | |
| 1429 | + TeamsRecordingSource, | |
| 1430 | + ) | |
| 1431 | + | |
| 1432 | + src = TeamsRecordingSource() | |
| 1433 | + assert src.user_id == "me" | |
| 1434 | + | |
| 1435 | + def test_constructor_custom_user(self): | |
| 1436 | + from video_processor.sources.teams_recording_source import ( | |
| 1437 | + TeamsRecordingSource, | |
| 1438 | + ) | |
| 1439 | + | |
| 1440 | + src = TeamsRecordingSource(user_id="[email protected]") | |
| 1441 | + assert src.user_id == "[email protected]" | |
| 1442 | + | |
| 1443 | + @patch("shutil.which", return_value=None) | |
| 1444 | + def test_authenticate_no_m365(self, _mock_which): | |
| 1445 | + from video_processor.sources.teams_recording_source import ( | |
| 1446 | + TeamsRecordingSource, | |
| 1447 | + ) | |
| 1448 | + | |
| 1449 | + src = TeamsRecordingSource() | |
| 1450 | + assert src.authenticate() is False | |
| 1451 | + | |
| 1452 | + def test_vtt_to_text(self): | |
| 1453 | + from video_processor.sources.teams_recording_source import ( | |
| 1454 | + _vtt_to_text, | |
| 1455 | + ) | |
| 1456 | + | |
| 1457 | + vtt = ( | |
| 1458 | + "WEBVTT\n\n" | |
| 1459 | + "1\n" | |
| 1460 | + "00:00:01.000 --> 00:00:05.000\n" | |
| 1461 | + "<v Speaker1>Hello everyone\n\n" | |
| 1462 | + "2\n" | |
| 1463 | + "00:00:05.000 --> 00:00:10.000\n" | |
| 1464 | + "<v Speaker2>Welcome to the meeting\n" | |
| 1465 | + ) | |
| 1466 | + result = _vtt_to_text(vtt) | |
| 1467 | + assert "Hello everyone" in result | |
| 1468 | + assert "Welcome to the meeting" in result | |
| 1469 | + assert "WEBVTT" not in result | |
| 1470 | + assert "-->" not in result | |
| 1471 | + | |
| 1472 | + def test_vtt_to_text_empty(self): | |
| 1473 | + from video_processor.sources.teams_recording_source import ( | |
| 1474 | + _vtt_to_text, | |
| 1475 | + ) | |
| 1476 | + | |
| 1477 | + assert _vtt_to_text("") == "" | |
| 1478 | + | |
| 1479 | + def test_vtt_to_text_deduplicates(self): | |
| 1480 | + from video_processor.sources.teams_recording_source import ( | |
| 1481 | + _vtt_to_text, | |
| 1482 | + ) | |
| 1483 | + | |
| 1484 | + vtt = ( | |
| 1485 | + "WEBVTT\n\n" | |
| 1486 | + "00:00:01.000 --> 00:00:03.000\n" | |
| 1487 | + "Same line\n\n" | |
| 1488 | + "00:00:03.000 --> 00:00:05.000\n" | |
| 1489 | + "Same line\n" | |
| 1490 | + ) | |
| 1491 | + result = _vtt_to_text(vtt) | |
| 1492 | + assert result.count("Same line") == 1 | |
| 1493 | + | |
| 1494 | + def test_extract_meetings_list_dict(self): | |
| 1495 | + from video_processor.sources.teams_recording_source import ( | |
| 1496 | + TeamsRecordingSource, | |
| 1497 | + ) | |
| 1498 | + | |
| 1499 | + src = TeamsRecordingSource() | |
| 1500 | + result = src._extract_meetings_list({"value": [{"id": "m1"}]}) | |
| 1501 | + assert len(result) == 1 | |
| 1502 | + | |
| 1503 | + def test_extract_meetings_list_list(self): | |
| 1504 | + from video_processor.sources.teams_recording_source import ( | |
| 1505 | + TeamsRecordingSource, | |
| 1506 | + ) | |
| 1507 | + | |
| 1508 | + src = TeamsRecordingSource() | |
| 1509 | + result = src._extract_meetings_list([{"id": "m1"}]) | |
| 1510 | + assert len(result) == 1 | |
| 1511 | + | |
| 1512 | + | |
| 1513 | +# --------------------------------------------------------------------------- | |
| 1514 | +# MeetRecordingSource | |
| 1515 | +# --------------------------------------------------------------------------- | |
| 1516 | + | |
| 1517 | + | |
| 1518 | +class TestMeetRecordingSource: | |
| 1519 | + def test_import(self): | |
| 1520 | + from video_processor.sources.meet_recording_source import ( | |
| 1521 | + MeetRecordingSource, | |
| 1522 | + ) | |
| 1523 | + | |
| 1524 | + assert MeetRecordingSource is not None | |
| 1525 | + | |
| 1526 | + def test_constructor_default(self): | |
| 1527 | + from video_processor.sources.meet_recording_source import ( | |
| 1528 | + MeetRecordingSource, | |
| 1529 | + ) | |
| 1530 | + | |
| 1531 | + src = MeetRecordingSource() | |
| 1532 | + assert src.drive_folder_id is None | |
| 1533 | + | |
| 1534 | + def test_constructor_with_folder(self): | |
| 1535 | + from video_processor.sources.meet_recording_source import ( | |
| 1536 | + MeetRecordingSource, | |
| 1537 | + ) | |
| 1538 | + | |
| 1539 | + src = MeetRecordingSource(drive_folder_id="folder123") | |
| 1540 | + assert src.drive_folder_id == "folder123" | |
| 1541 | + | |
| 1542 | + @patch("shutil.which", return_value=None) | |
| 1543 | + def test_authenticate_no_gws(self, _mock_which): | |
| 1544 | + from video_processor.sources.meet_recording_source import ( | |
| 1545 | + MeetRecordingSource, | |
| 1546 | + ) | |
| 1547 | + | |
| 1548 | + src = MeetRecordingSource() | |
| 1549 | + assert src.authenticate() is False | |
| 1550 | + | |
| 1551 | + def test_find_matching_transcript_date_extraction(self): | |
| 1552 | + import re | |
| 1553 | + | |
| 1554 | + name = "Meet Recording 2026-03-07T14:30:00" | |
| 1555 | + match = re.search(r"\d{4}-\d{2}-\d{2}", name) | |
| 1556 | + assert match is not None | |
| 1557 | + assert match.group(0) == "2026-03-07" | |
| 1558 | + | |
| 1559 | + def test_lazy_import(self): | |
| 1560 | + from video_processor.sources import MeetRecordingSource | |
| 1561 | + | |
| 1562 | + assert MeetRecordingSource is not None | |
| 1563 | + | |
| 1564 | + def test_teams_lazy_import(self): | |
| 1565 | + from video_processor.sources import TeamsRecordingSource | |
| 1566 | + | |
| 1567 | + assert TeamsRecordingSource is not None | |
| 1568 | + | |
| 1569 | + def test_zoom_lazy_import(self): | |
| 1570 | + from video_processor.sources import ZoomSource | |
| 1571 | + | |
| 1572 | + assert ZoomSource is not None | |
| 1345 | 1573 |
| --- tests/test_sources.py | |
| +++ tests/test_sources.py | |
| @@ -1340,5 +1340,233 @@ | |
| 1340 | html = "&lt;tag&gt; &quot;quoted&quot; &#39;apos&#39;&nbsp;space" |
| 1341 | result = _html_to_text(html) |
| 1342 | assert "<tag>" in result |
| 1343 | assert '"quoted"' in result |
| 1344 | assert "'apos'" in result |
| 1345 |
| --- tests/test_sources.py | |
| +++ tests/test_sources.py | |
| @@ -1340,5 +1340,233 @@ | |
| 1340 | html = "&lt;tag&gt; &quot;quoted&quot; &#39;apos&#39;&nbsp;space" |
| 1341 | result = _html_to_text(html) |
| 1342 | assert "<tag>" in result |
| 1343 | assert '"quoted"' in result |
| 1344 | assert "'apos'" in result |
| 1345 | |
| 1346 | |
| 1347 | # --------------------------------------------------------------------------- |
| 1348 | # ZoomSource |
| 1349 | # --------------------------------------------------------------------------- |
| 1350 | |
| 1351 | |
| 1352 | class TestZoomSource: |
| 1353 | def test_import(self): |
| 1354 | from video_processor.sources.zoom_source import ZoomSource |
| 1355 | |
| 1356 | assert ZoomSource is not None |
| 1357 | |
| 1358 | def test_constructor_defaults(self): |
| 1359 | from video_processor.sources.zoom_source import ZoomSource |
| 1360 | |
| 1361 | src = ZoomSource() |
| 1362 | assert src.client_id is None or isinstance(src.client_id, str) |
| 1363 | assert src._access_token is None |
| 1364 | |
| 1365 | def test_constructor_explicit(self): |
| 1366 | from video_processor.sources.zoom_source import ZoomSource |
| 1367 | |
| 1368 | src = ZoomSource( |
| 1369 | client_id="cid", |
| 1370 | client_secret="csec", |
| 1371 | account_id="aid", |
| 1372 | ) |
| 1373 | assert src.client_id == "cid" |
| 1374 | assert src.client_secret == "csec" |
| 1375 | assert src.account_id == "aid" |
| 1376 | |
| 1377 | def test_authenticate_no_credentials(self): |
| 1378 | from video_processor.sources.zoom_source import ZoomSource |
| 1379 | |
| 1380 | src = ZoomSource(client_id=None, client_secret=None, account_id=None) |
| 1381 | # No saved token, no account_id, no client_id → should fail |
| 1382 | assert src.authenticate() is False |
| 1383 | |
| 1384 | def test_list_videos_not_authenticated(self): |
| 1385 | from video_processor.sources.zoom_source import ZoomSource |
| 1386 | |
| 1387 | src = ZoomSource() |
| 1388 | with pytest.raises(RuntimeError, match="Not authenticated"): |
| 1389 | src.list_videos() |
| 1390 | |
| 1391 | def test_download_not_authenticated(self): |
| 1392 | from video_processor.sources.zoom_source import ZoomSource |
| 1393 | |
| 1394 | src = ZoomSource() |
| 1395 | sf = SourceFile(name="test.mp4", id="123") |
| 1396 | with pytest.raises(RuntimeError, match="Not authenticated"): |
| 1397 | src.download(sf, "/tmp/test.mp4") |
| 1398 | |
| 1399 | def test_fetch_transcript_not_authenticated(self): |
| 1400 | from video_processor.sources.zoom_source import ZoomSource |
| 1401 | |
| 1402 | src = ZoomSource() |
| 1403 | with pytest.raises(RuntimeError, match="Not authenticated"): |
| 1404 | src.fetch_transcript("meeting123") |
| 1405 | |
| 1406 | def test_mime_types_mapping(self): |
| 1407 | from video_processor.sources.zoom_source import _MIME_TYPES |
| 1408 | |
| 1409 | assert _MIME_TYPES["MP4"] == "video/mp4" |
| 1410 | assert _MIME_TYPES["TRANSCRIPT"] == "text/vtt" |
| 1411 | assert _MIME_TYPES["M4A"] == "audio/mp4" |
| 1412 | |
| 1413 | |
| 1414 | # --------------------------------------------------------------------------- |
| 1415 | # TeamsRecordingSource |
| 1416 | # --------------------------------------------------------------------------- |
| 1417 | |
| 1418 | |
| 1419 | class TestTeamsRecordingSource: |
| 1420 | def test_import(self): |
| 1421 | from video_processor.sources.teams_recording_source import ( |
| 1422 | TeamsRecordingSource, |
| 1423 | ) |
| 1424 | |
| 1425 | assert TeamsRecordingSource is not None |
| 1426 | |
| 1427 | def test_constructor_default(self): |
| 1428 | from video_processor.sources.teams_recording_source import ( |
| 1429 | TeamsRecordingSource, |
| 1430 | ) |
| 1431 | |
| 1432 | src = TeamsRecordingSource() |
| 1433 | assert src.user_id == "me" |
| 1434 | |
| 1435 | def test_constructor_custom_user(self): |
| 1436 | from video_processor.sources.teams_recording_source import ( |
| 1437 | TeamsRecordingSource, |
| 1438 | ) |
| 1439 | |
| 1440 | src = TeamsRecordingSource(user_id="[email protected]") |
| 1441 | assert src.user_id == "[email protected]" |
| 1442 | |
| 1443 | @patch("shutil.which", return_value=None) |
| 1444 | def test_authenticate_no_m365(self, _mock_which): |
| 1445 | from video_processor.sources.teams_recording_source import ( |
| 1446 | TeamsRecordingSource, |
| 1447 | ) |
| 1448 | |
| 1449 | src = TeamsRecordingSource() |
| 1450 | assert src.authenticate() is False |
| 1451 | |
| 1452 | def test_vtt_to_text(self): |
| 1453 | from video_processor.sources.teams_recording_source import ( |
| 1454 | _vtt_to_text, |
| 1455 | ) |
| 1456 | |
| 1457 | vtt = ( |
| 1458 | "WEBVTT\n\n" |
| 1459 | "1\n" |
| 1460 | "00:00:01.000 --> 00:00:05.000\n" |
| 1461 | "<v Speaker1>Hello everyone\n\n" |
| 1462 | "2\n" |
| 1463 | "00:00:05.000 --> 00:00:10.000\n" |
| 1464 | "<v Speaker2>Welcome to the meeting\n" |
| 1465 | ) |
| 1466 | result = _vtt_to_text(vtt) |
| 1467 | assert "Hello everyone" in result |
| 1468 | assert "Welcome to the meeting" in result |
| 1469 | assert "WEBVTT" not in result |
| 1470 | assert "-->" not in result |
| 1471 | |
| 1472 | def test_vtt_to_text_empty(self): |
| 1473 | from video_processor.sources.teams_recording_source import ( |
| 1474 | _vtt_to_text, |
| 1475 | ) |
| 1476 | |
| 1477 | assert _vtt_to_text("") == "" |
| 1478 | |
| 1479 | def test_vtt_to_text_deduplicates(self): |
| 1480 | from video_processor.sources.teams_recording_source import ( |
| 1481 | _vtt_to_text, |
| 1482 | ) |
| 1483 | |
| 1484 | vtt = ( |
| 1485 | "WEBVTT\n\n" |
| 1486 | "00:00:01.000 --> 00:00:03.000\n" |
| 1487 | "Same line\n\n" |
| 1488 | "00:00:03.000 --> 00:00:05.000\n" |
| 1489 | "Same line\n" |
| 1490 | ) |
| 1491 | result = _vtt_to_text(vtt) |
| 1492 | assert result.count("Same line") == 1 |
| 1493 | |
| 1494 | def test_extract_meetings_list_dict(self): |
| 1495 | from video_processor.sources.teams_recording_source import ( |
| 1496 | TeamsRecordingSource, |
| 1497 | ) |
| 1498 | |
| 1499 | src = TeamsRecordingSource() |
| 1500 | result = src._extract_meetings_list({"value": [{"id": "m1"}]}) |
| 1501 | assert len(result) == 1 |
| 1502 | |
| 1503 | def test_extract_meetings_list_list(self): |
| 1504 | from video_processor.sources.teams_recording_source import ( |
| 1505 | TeamsRecordingSource, |
| 1506 | ) |
| 1507 | |
| 1508 | src = TeamsRecordingSource() |
| 1509 | result = src._extract_meetings_list([{"id": "m1"}]) |
| 1510 | assert len(result) == 1 |
| 1511 | |
| 1512 | |
| 1513 | # --------------------------------------------------------------------------- |
| 1514 | # MeetRecordingSource |
| 1515 | # --------------------------------------------------------------------------- |
| 1516 | |
| 1517 | |
| 1518 | class TestMeetRecordingSource: |
| 1519 | def test_import(self): |
| 1520 | from video_processor.sources.meet_recording_source import ( |
| 1521 | MeetRecordingSource, |
| 1522 | ) |
| 1523 | |
| 1524 | assert MeetRecordingSource is not None |
| 1525 | |
| 1526 | def test_constructor_default(self): |
| 1527 | from video_processor.sources.meet_recording_source import ( |
| 1528 | MeetRecordingSource, |
| 1529 | ) |
| 1530 | |
| 1531 | src = MeetRecordingSource() |
| 1532 | assert src.drive_folder_id is None |
| 1533 | |
| 1534 | def test_constructor_with_folder(self): |
| 1535 | from video_processor.sources.meet_recording_source import ( |
| 1536 | MeetRecordingSource, |
| 1537 | ) |
| 1538 | |
| 1539 | src = MeetRecordingSource(drive_folder_id="folder123") |
| 1540 | assert src.drive_folder_id == "folder123" |
| 1541 | |
| 1542 | @patch("shutil.which", return_value=None) |
| 1543 | def test_authenticate_no_gws(self, _mock_which): |
| 1544 | from video_processor.sources.meet_recording_source import ( |
| 1545 | MeetRecordingSource, |
| 1546 | ) |
| 1547 | |
| 1548 | src = MeetRecordingSource() |
| 1549 | assert src.authenticate() is False |
| 1550 | |
| 1551 | def test_find_matching_transcript_date_extraction(self): |
| 1552 | import re |
| 1553 | |
| 1554 | name = "Meet Recording 2026-03-07T14:30:00" |
| 1555 | match = re.search(r"\d{4}-\d{2}-\d{2}", name) |
| 1556 | assert match is not None |
| 1557 | assert match.group(0) == "2026-03-07" |
| 1558 | |
| 1559 | def test_lazy_import(self): |
| 1560 | from video_processor.sources import MeetRecordingSource |
| 1561 | |
| 1562 | assert MeetRecordingSource is not None |
| 1563 | |
| 1564 | def test_teams_lazy_import(self): |
| 1565 | from video_processor.sources import TeamsRecordingSource |
| 1566 | |
| 1567 | assert TeamsRecordingSource is not None |
| 1568 | |
| 1569 | def test_zoom_lazy_import(self): |
| 1570 | from video_processor.sources import ZoomSource |
| 1571 | |
| 1572 | assert ZoomSource is not None |
| 1573 |
| --- video_processor/cli/commands.py | ||
| +++ video_processor/cli/commands.py | ||
| @@ -1582,10 +1582,112 @@ | ||
| 1582 | 1582 | click.echo(f"Wiki pushed to https://github.com/{repo}/wiki") |
| 1583 | 1583 | else: |
| 1584 | 1584 | click.echo("Wiki push failed. Check auth and repo permissions.", err=True) |
| 1585 | 1585 | sys.exit(1) |
| 1586 | 1586 | |
| 1587 | + | |
| 1588 | +@cli.group() | |
| 1589 | +def recordings(): | |
| 1590 | + """Fetch meeting recordings from Zoom, Teams, and Google Meet.""" | |
| 1591 | + pass | |
| 1592 | + | |
| 1593 | + | |
| 1594 | +@recordings.command("zoom-list") | |
| 1595 | +@click.option("--json", "as_json", is_flag=True, help="Output as JSON") | |
| 1596 | +def recordings_zoom_list(as_json): | |
| 1597 | + """List Zoom cloud recordings. | |
| 1598 | + | |
| 1599 | + Requires ZOOM_CLIENT_ID (and optionally ZOOM_CLIENT_SECRET, | |
| 1600 | + ZOOM_ACCOUNT_ID) environment variables. | |
| 1601 | + | |
| 1602 | + Examples: | |
| 1603 | + | |
| 1604 | + planopticon recordings zoom-list | |
| 1605 | + | |
| 1606 | + planopticon recordings zoom-list --json | |
| 1607 | + """ | |
| 1608 | + from video_processor.sources.zoom_source import ZoomSource | |
| 1609 | + | |
| 1610 | + source = ZoomSource() | |
| 1611 | + if not source.authenticate(): | |
| 1612 | + click.echo("Zoom authentication failed.", err=True) | |
| 1613 | + sys.exit(1) | |
| 1614 | + | |
| 1615 | + files = source.list_videos() | |
| 1616 | + if as_json: | |
| 1617 | + click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str)) | |
| 1618 | + else: | |
| 1619 | + click.echo(f"Found {len(files)} recording(s):") | |
| 1620 | + for f in files: | |
| 1621 | + size = f"{f.size_bytes // 1_000_000} MB" if f.size_bytes else "unknown" | |
| 1622 | + click.echo(f" {f.name} ({size}) {f.modified_at or ''}") | |
| 1623 | + | |
| 1624 | + | |
| 1625 | +@recordings.command("teams-list") | |
| 1626 | +@click.option("--user-id", default="me", help="Microsoft user ID") | |
| 1627 | +@click.option("--json", "as_json", is_flag=True, help="Output as JSON") | |
| 1628 | +def recordings_teams_list(user_id, as_json): | |
| 1629 | + """List Teams meeting recordings via the m365 CLI. | |
| 1630 | + | |
| 1631 | + Requires: npm install -g @pnp/cli-microsoft365 && m365 login | |
| 1632 | + | |
| 1633 | + Examples: | |
| 1634 | + | |
| 1635 | + planopticon recordings teams-list | |
| 1636 | + | |
| 1637 | + planopticon recordings teams-list --json | |
| 1638 | + """ | |
| 1639 | + from video_processor.sources.teams_recording_source import ( | |
| 1640 | + TeamsRecordingSource, | |
| 1641 | + ) | |
| 1642 | + | |
| 1643 | + source = TeamsRecordingSource(user_id=user_id) | |
| 1644 | + if not source.authenticate(): | |
| 1645 | + click.echo("Teams authentication failed.", err=True) | |
| 1646 | + sys.exit(1) | |
| 1647 | + | |
| 1648 | + files = source.list_videos() | |
| 1649 | + if as_json: | |
| 1650 | + click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str)) | |
| 1651 | + else: | |
| 1652 | + click.echo(f"Found {len(files)} recording(s):") | |
| 1653 | + for f in files: | |
| 1654 | + click.echo(f" {f.name} {f.modified_at or ''}") | |
| 1655 | + | |
| 1656 | + | |
| 1657 | +@recordings.command("meet-list") | |
| 1658 | +@click.option("--folder-id", default=None, help="Drive folder ID") | |
| 1659 | +@click.option("--json", "as_json", is_flag=True, help="Output as JSON") | |
| 1660 | +def recordings_meet_list(folder_id, as_json): | |
| 1661 | + """List Google Meet recordings in Drive via the gws CLI. | |
| 1662 | + | |
| 1663 | + Requires: npm install -g @googleworkspace/cli && gws auth login | |
| 1664 | + | |
| 1665 | + Examples: | |
| 1666 | + | |
| 1667 | + planopticon recordings meet-list | |
| 1668 | + | |
| 1669 | + planopticon recordings meet-list --folder-id abc123 | |
| 1670 | + """ | |
| 1671 | + from video_processor.sources.meet_recording_source import ( | |
| 1672 | + MeetRecordingSource, | |
| 1673 | + ) | |
| 1674 | + | |
| 1675 | + source = MeetRecordingSource(drive_folder_id=folder_id) | |
| 1676 | + if not source.authenticate(): | |
| 1677 | + click.echo("Google Meet authentication failed.", err=True) | |
| 1678 | + sys.exit(1) | |
| 1679 | + | |
| 1680 | + files = source.list_videos() | |
| 1681 | + if as_json: | |
| 1682 | + click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str)) | |
| 1683 | + else: | |
| 1684 | + click.echo(f"Found {len(files)} recording(s):") | |
| 1685 | + for f in files: | |
| 1686 | + size = f"{f.size_bytes // 1_000_000} MB" if f.size_bytes else "unknown" | |
| 1687 | + click.echo(f" {f.name} ({size}) {f.modified_at or ''}") | |
| 1688 | + | |
| 1587 | 1689 | |
| 1588 | 1690 | @cli.group() |
| 1589 | 1691 | def kg(): |
| 1590 | 1692 | """Knowledge graph utilities: convert, sync, and inspect.""" |
| 1591 | 1693 | pass |
| 1592 | 1694 |
| --- video_processor/cli/commands.py | |
| +++ video_processor/cli/commands.py | |
| @@ -1582,10 +1582,112 @@ | |
| 1582 | click.echo(f"Wiki pushed to https://github.com/{repo}/wiki") |
| 1583 | else: |
| 1584 | click.echo("Wiki push failed. Check auth and repo permissions.", err=True) |
| 1585 | sys.exit(1) |
| 1586 | |
| 1587 | |
| 1588 | @cli.group() |
| 1589 | def kg(): |
| 1590 | """Knowledge graph utilities: convert, sync, and inspect.""" |
| 1591 | pass |
| 1592 |
| --- video_processor/cli/commands.py | |
| +++ video_processor/cli/commands.py | |
| @@ -1582,10 +1582,112 @@ | |
| 1582 | click.echo(f"Wiki pushed to https://github.com/{repo}/wiki") |
| 1583 | else: |
| 1584 | click.echo("Wiki push failed. Check auth and repo permissions.", err=True) |
| 1585 | sys.exit(1) |
| 1586 | |
| 1587 | |
| 1588 | @cli.group() |
| 1589 | def recordings(): |
| 1590 | """Fetch meeting recordings from Zoom, Teams, and Google Meet.""" |
| 1591 | pass |
| 1592 | |
| 1593 | |
| 1594 | @recordings.command("zoom-list") |
| 1595 | @click.option("--json", "as_json", is_flag=True, help="Output as JSON") |
| 1596 | def recordings_zoom_list(as_json): |
| 1597 | """List Zoom cloud recordings. |
| 1598 | |
| 1599 | Requires ZOOM_CLIENT_ID (and optionally ZOOM_CLIENT_SECRET, |
| 1600 | ZOOM_ACCOUNT_ID) environment variables. |
| 1601 | |
| 1602 | Examples: |
| 1603 | |
| 1604 | planopticon recordings zoom-list |
| 1605 | |
| 1606 | planopticon recordings zoom-list --json |
| 1607 | """ |
| 1608 | from video_processor.sources.zoom_source import ZoomSource |
| 1609 | |
| 1610 | source = ZoomSource() |
| 1611 | if not source.authenticate(): |
| 1612 | click.echo("Zoom authentication failed.", err=True) |
| 1613 | sys.exit(1) |
| 1614 | |
| 1615 | files = source.list_videos() |
| 1616 | if as_json: |
| 1617 | click.echo(json.dumps([f.__dict__ for f in files], indent=2, default=str)) |
| 1618 | else: |
| 1619 | click.echo(f"Found {len(files)} recording(s):") |
| 1620 | for f in files: |
| 1621 | size = f"{f.size_bytes // 1_000_000} MB" if f.size_bytes else "unknown" |
| 1622 | click.echo(f" {f.name} ({size}) {f.modified_at or ''}") |
| 1623 | |
| 1624 | |
@recordings.command("teams-list")
@click.option("--user-id", default="me", help="Microsoft user ID")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
def recordings_teams_list(user_id, as_json):
    """List Teams meeting recordings via the m365 CLI.

    Requires: npm install -g @pnp/cli-microsoft365 && m365 login

    Examples:

        planopticon recordings teams-list

        planopticon recordings teams-list --json
    """
    # Lazy import keeps optional m365 machinery out of CLI startup.
    from video_processor.sources.teams_recording_source import (
        TeamsRecordingSource,
    )

    source = TeamsRecordingSource(user_id=user_id)
    if not source.authenticate():
        click.echo("Teams authentication failed.", err=True)
        sys.exit(1)

    recordings_found = source.list_videos()
    if as_json:
        payload = [vars(rec) for rec in recordings_found]
        click.echo(json.dumps(payload, indent=2, default=str))
        return
    click.echo(f"Found {len(recordings_found)} recording(s):")
    for rec in recordings_found:
        click.echo(f"  {rec.name} {rec.modified_at or ''}")
| 1655 | |
| 1656 | |
@recordings.command("meet-list")
@click.option("--folder-id", default=None, help="Drive folder ID")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
def recordings_meet_list(folder_id, as_json):
    """List Google Meet recordings in Drive via the gws CLI.

    Requires: npm install -g @googleworkspace/cli && gws auth login

    Examples:

        planopticon recordings meet-list

        planopticon recordings meet-list --folder-id abc123
    """
    # Lazy import keeps optional gws machinery out of CLI startup.
    from video_processor.sources.meet_recording_source import (
        MeetRecordingSource,
    )

    source = MeetRecordingSource(drive_folder_id=folder_id)
    if not source.authenticate():
        click.echo("Google Meet authentication failed.", err=True)
        sys.exit(1)

    recordings_found = source.list_videos()
    if as_json:
        payload = [vars(rec) for rec in recordings_found]
        click.echo(json.dumps(payload, indent=2, default=str))
        return
    click.echo(f"Found {len(recordings_found)} recording(s):")
    for rec in recordings_found:
        if rec.size_bytes:
            size_label = f"{rec.size_bytes // 1_000_000} MB"
        else:
            size_label = "unknown"
        click.echo(f"  {rec.name} ({size_label}) {rec.modified_at or ''}")
| 1688 | |
| 1689 | |
@cli.group()
def kg():
    """Knowledge graph utilities: convert, sync, and inspect."""
    # Group container only; subcommands attach elsewhere in the file.
| 1694 |
| --- video_processor/sources/__init__.py | ||
| +++ video_processor/sources/__init__.py | ||
| @@ -12,19 +12,22 @@ | ||
| 12 | 12 | "GoogleKeepSource", |
| 13 | 13 | "GWSSource", |
| 14 | 14 | "HackerNewsSource", |
| 15 | 15 | "LogseqSource", |
| 16 | 16 | "M365Source", |
| 17 | + "MeetRecordingSource", | |
| 17 | 18 | "NotionSource", |
| 18 | 19 | "ObsidianSource", |
| 19 | 20 | "OneNoteSource", |
| 20 | 21 | "PodcastSource", |
| 22 | + "TeamsRecordingSource", | |
| 21 | 23 | "RedditSource", |
| 22 | 24 | "RSSSource", |
| 23 | 25 | "TwitterSource", |
| 24 | 26 | "WebSource", |
| 25 | 27 | "YouTubeSource", |
| 28 | + "ZoomSource", | |
| 26 | 29 | ] |
| 27 | 30 | |
| 28 | 31 | |
| 29 | 32 | def __getattr__(name: str): |
| 30 | 33 | """Lazy imports to avoid pulling in optional dependencies at import time.""" |
| @@ -36,21 +39,24 @@ | ||
| 36 | 39 | "GoogleKeepSource": "video_processor.sources.google_keep_source", |
| 37 | 40 | "GWSSource": "video_processor.sources.gws_source", |
| 38 | 41 | "HackerNewsSource": "video_processor.sources.hackernews_source", |
| 39 | 42 | "LogseqSource": "video_processor.sources.logseq_source", |
| 40 | 43 | "M365Source": "video_processor.sources.m365_source", |
| 44 | + "MeetRecordingSource": "video_processor.sources.meet_recording_source", | |
| 41 | 45 | "NotionSource": "video_processor.sources.notion_source", |
| 42 | 46 | "ObsidianSource": "video_processor.sources.obsidian_source", |
| 43 | 47 | "OneNoteSource": "video_processor.sources.onenote_source", |
| 44 | 48 | "PodcastSource": "video_processor.sources.podcast_source", |
| 49 | + "TeamsRecordingSource": "video_processor.sources.teams_recording_source", | |
| 45 | 50 | "RedditSource": "video_processor.sources.reddit_source", |
| 46 | 51 | "RSSSource": "video_processor.sources.rss_source", |
| 47 | 52 | "TwitterSource": "video_processor.sources.twitter_source", |
| 48 | 53 | "WebSource": "video_processor.sources.web_source", |
| 49 | 54 | "YouTubeSource": "video_processor.sources.youtube_source", |
| 55 | + "ZoomSource": "video_processor.sources.zoom_source", | |
| 50 | 56 | } |
| 51 | 57 | if name in _lazy_map: |
| 52 | 58 | import importlib |
| 53 | 59 | |
| 54 | 60 | module = importlib.import_module(_lazy_map[name]) |
| 55 | 61 | return getattr(module, name) |
| 56 | 62 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") |
| 57 | 63 | |
| 58 | 64 | ADDED video_processor/sources/meet_recording_source.py |
| 59 | 65 | ADDED video_processor/sources/teams_recording_source.py |
| 60 | 66 | ADDED video_processor/sources/zoom_source.py |
| --- video_processor/sources/__init__.py | |
| +++ video_processor/sources/__init__.py | |
| @@ -12,19 +12,22 @@ | |
| 12 | "GoogleKeepSource", |
| 13 | "GWSSource", |
| 14 | "HackerNewsSource", |
| 15 | "LogseqSource", |
| 16 | "M365Source", |
| 17 | "NotionSource", |
| 18 | "ObsidianSource", |
| 19 | "OneNoteSource", |
| 20 | "PodcastSource", |
| 21 | "RedditSource", |
| 22 | "RSSSource", |
| 23 | "TwitterSource", |
| 24 | "WebSource", |
| 25 | "YouTubeSource", |
| 26 | ] |
| 27 | |
| 28 | |
| 29 | def __getattr__(name: str): |
| 30 | """Lazy imports to avoid pulling in optional dependencies at import time.""" |
| @@ -36,21 +39,24 @@ | |
| 36 | "GoogleKeepSource": "video_processor.sources.google_keep_source", |
| 37 | "GWSSource": "video_processor.sources.gws_source", |
| 38 | "HackerNewsSource": "video_processor.sources.hackernews_source", |
| 39 | "LogseqSource": "video_processor.sources.logseq_source", |
| 40 | "M365Source": "video_processor.sources.m365_source", |
| 41 | "NotionSource": "video_processor.sources.notion_source", |
| 42 | "ObsidianSource": "video_processor.sources.obsidian_source", |
| 43 | "OneNoteSource": "video_processor.sources.onenote_source", |
| 44 | "PodcastSource": "video_processor.sources.podcast_source", |
| 45 | "RedditSource": "video_processor.sources.reddit_source", |
| 46 | "RSSSource": "video_processor.sources.rss_source", |
| 47 | "TwitterSource": "video_processor.sources.twitter_source", |
| 48 | "WebSource": "video_processor.sources.web_source", |
| 49 | "YouTubeSource": "video_processor.sources.youtube_source", |
| 50 | } |
| 51 | if name in _lazy_map: |
| 52 | import importlib |
| 53 | |
| 54 | module = importlib.import_module(_lazy_map[name]) |
| 55 | return getattr(module, name) |
| 56 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") |
| 57 | |
| 58 | ADDED video_processor/sources/meet_recording_source.py |
| 59 | ADDED video_processor/sources/teams_recording_source.py |
| 60 | ADDED video_processor/sources/zoom_source.py |
| --- video_processor/sources/__init__.py | |
| +++ video_processor/sources/__init__.py | |
| @@ -12,19 +12,22 @@ | |
| 12 | "GoogleKeepSource", |
| 13 | "GWSSource", |
| 14 | "HackerNewsSource", |
| 15 | "LogseqSource", |
| 16 | "M365Source", |
| 17 | "MeetRecordingSource", |
| 18 | "NotionSource", |
| 19 | "ObsidianSource", |
| 20 | "OneNoteSource", |
| 21 | "PodcastSource", |
| 22 | "TeamsRecordingSource", |
| 23 | "RedditSource", |
| 24 | "RSSSource", |
| 25 | "TwitterSource", |
| 26 | "WebSource", |
| 27 | "YouTubeSource", |
| 28 | "ZoomSource", |
| 29 | ] |
| 30 | |
| 31 | |
| 32 | def __getattr__(name: str): |
| 33 | """Lazy imports to avoid pulling in optional dependencies at import time.""" |
| @@ -36,21 +39,24 @@ | |
| 39 | "GoogleKeepSource": "video_processor.sources.google_keep_source", |
| 40 | "GWSSource": "video_processor.sources.gws_source", |
| 41 | "HackerNewsSource": "video_processor.sources.hackernews_source", |
| 42 | "LogseqSource": "video_processor.sources.logseq_source", |
| 43 | "M365Source": "video_processor.sources.m365_source", |
| 44 | "MeetRecordingSource": "video_processor.sources.meet_recording_source", |
| 45 | "NotionSource": "video_processor.sources.notion_source", |
| 46 | "ObsidianSource": "video_processor.sources.obsidian_source", |
| 47 | "OneNoteSource": "video_processor.sources.onenote_source", |
| 48 | "PodcastSource": "video_processor.sources.podcast_source", |
| 49 | "TeamsRecordingSource": "video_processor.sources.teams_recording_source", |
| 50 | "RedditSource": "video_processor.sources.reddit_source", |
| 51 | "RSSSource": "video_processor.sources.rss_source", |
| 52 | "TwitterSource": "video_processor.sources.twitter_source", |
| 53 | "WebSource": "video_processor.sources.web_source", |
| 54 | "YouTubeSource": "video_processor.sources.youtube_source", |
| 55 | "ZoomSource": "video_processor.sources.zoom_source", |
| 56 | } |
| 57 | if name in _lazy_map: |
| 58 | import importlib |
| 59 | |
| 60 | module = importlib.import_module(_lazy_map[name]) |
| 61 | return getattr(module, name) |
| 62 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") |
| 63 | |
| 64 | ADDED video_processor/sources/meet_recording_source.py |
| 65 | ADDED video_processor/sources/teams_recording_source.py |
| 66 | ADDED video_processor/sources/zoom_source.py |
| --- a/video_processor/sources/meet_recording_source.py | ||
| +++ b/video_processor/sources/meet_recording_source.py | ||
| @@ -0,0 +1,280 @@ | ||
| 1 | +"""Google Meet recording source using the gws CLI (googleworkspace/cli). | |
| 2 | + | |
| 3 | +Fetches Meet recordings and companion transcripts from Google Drive | |
| 4 | +via the `gws` CLI tool. | |
| 5 | + | |
| 6 | +Requires: npm install -g @googleworkspace/cli | |
| 7 | +Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless) | |
| 8 | +""" | |
| 9 | + | |
| 10 | +import json | |
| 11 | +import logging | |
| 12 | +import re | |
| 13 | +import shutil | |
| 14 | +import subprocess | |
| 15 | +from pathlib import Path | |
| 16 | +from typing import List, Optional | |
| 17 | + | |
| 18 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 19 | +from video_processor.sources.gws_source import _run_gws | |
| 20 | + | |
| 21 | +logger = logging.getLogger(__name__) | |
| 22 | + | |
| 23 | + | |
| 24 | +class MeetRecordingSource(BaseSource): | |
| 25 | + """ | |
| 26 | + Fetch Google Meet recordings and transcripts from Google Drive via the gws CLI. | |
| 27 | + | |
| 28 | + Meet stores recordings as MP4 files in Drive (typically in a "Meet Recordings" | |
| 29 | + folder) and auto-generated transcripts as Google Docs. | |
| 30 | + | |
| 31 | + Usage: | |
| 32 | + source = MeetRecordingSource() | |
| 33 | + source.authenticate() | |
| 34 | + recordings = source.list_videos() | |
| 35 | + source.download_all(recordings, Path("./recordings")) | |
| 36 | + | |
| 37 | + # Fetch transcript for a specific recording | |
| 38 | + transcript = source.fetch_transcript("Meet Recording 2026-03-07") | |
| 39 | + """ | |
| 40 | + | |
| 41 | + def __init__(self, drive_folder_id: Optional[str] = None): | |
| 42 | + self.drive_folder_id = drive_folder_id | |
| 43 | + | |
| 44 | + def authenticate(self) -> bool: | |
| 45 | + """Check if gws CLI is installed and authenticated.""" | |
| 46 | + if not shutil.which("gws"): | |
| 47 | + logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli") | |
| 48 | + return False | |
| 49 | + try: | |
| 50 | + _run_gws(["auth", "status"], timeout=10) | |
| 51 | + return True | |
| 52 | + except (RuntimeError, subprocess.TimeoutExpired): | |
| 53 | + logger.error("gws not authenticated. Run: gws auth login") | |
| 54 | + return False | |
| 55 | + | |
| 56 | + def list_videos( | |
| 57 | + self, | |
| 58 | + folder_id: Optional[str] = None, | |
| 59 | + folder_path: Optional[str] = None, | |
| 60 | + patterns: Optional[List[str]] = None, | |
| 61 | + ) -> List[SourceFile]: | |
| 62 | + """List Google Meet recordings in Drive. | |
| 63 | + | |
| 64 | + Searches for MP4 files with 'Meet Recording' in the name. If a | |
| 65 | + drive_folder_id is set, restricts search to that folder. | |
| 66 | + Also discovers companion transcript docs for each recording. | |
| 67 | + """ | |
| 68 | + target_folder = folder_id or self.drive_folder_id | |
| 69 | + files: List[SourceFile] = [] | |
| 70 | + | |
| 71 | + # Build the Drive search query for Meet recordings | |
| 72 | + q_parts = [ | |
| 73 | + "mimeType='video/mp4'", | |
| 74 | + "name contains 'Meet Recording'", | |
| 75 | + "trashed=false", | |
| 76 | + ] | |
| 77 | + if target_folder: | |
| 78 | + q_parts.append(f"'{target_folder}' in parents") | |
| 79 | + | |
| 80 | + params = { | |
| 81 | + "q": " and ".join(q_parts), | |
| 82 | + "fields": "files(id,name,mimeType,size,modifiedTime)", | |
| 83 | + "pageSize": 50, | |
| 84 | + "orderBy": "modifiedTime desc", | |
| 85 | + } | |
| 86 | + | |
| 87 | + try: | |
| 88 | + result = _run_gws( | |
| 89 | + [ | |
| 90 | + "drive", | |
| 91 | + "files", | |
| 92 | + "list", | |
| 93 | + "--params", | |
| 94 | + json.dumps(params), | |
| 95 | + ], | |
| 96 | + timeout=60, | |
| 97 | + ) | |
| 98 | + except RuntimeError as e: | |
| 99 | + logger.error(f"Failed to list Meet recordings: {e}") | |
| 100 | + return [] | |
| 101 | + | |
| 102 | + recordings = result.get("files", []) | |
| 103 | + for item in recordings: | |
| 104 | + size = item.get("size") | |
| 105 | + files.append( | |
| 106 | + SourceFile( | |
| 107 | + name=item.get("name", "Meet Recording"), | |
| 108 | + id=item.get("id", ""), | |
| 109 | + size_bytes=int(size) if size else None, | |
| 110 | + mime_type=item.get("mimeType", "video/mp4"), | |
| 111 | + modified_at=item.get("modifiedTime"), | |
| 112 | + ) | |
| 113 | + ) | |
| 114 | + | |
| 115 | + # Also search for auto-generated transcript docs | |
| 116 | + transcript_params = { | |
| 117 | + "q": " and ".join( | |
| 118 | + [ | |
| 119 | + "mimeType='application/vnd.google-apps.document'", | |
| 120 | + "(name contains 'Transcript' or name contains 'Meeting notes')", | |
| 121 | + "trashed=false", | |
| 122 | + ] | |
| 123 | + + ([f"'{target_folder}' in parents"] if target_folder else []) | |
| 124 | + ), | |
| 125 | + "fields": "files(id,name,mimeType,modifiedTime)", | |
| 126 | + "pageSize": 50, | |
| 127 | + "orderBy": "modifiedTime desc", | |
| 128 | + } | |
| 129 | + | |
| 130 | + try: | |
| 131 | + transcript_result = _run_gws( | |
| 132 | + [ | |
| 133 | + "drive", | |
| 134 | + "files", | |
| 135 | + "list", | |
| 136 | + "--params", | |
| 137 | + json.dumps(transcript_params), | |
| 138 | + ], | |
| 139 | + timeout=60, | |
| 140 | + ) | |
| 141 | + transcript_files = transcript_result.get("files", []) | |
| 142 | + logger.info( | |
| 143 | + f"Found {len(recordings)} recording(s) and " | |
| 144 | + f"{len(transcript_files)} transcript doc(s) in Drive" | |
| 145 | + ) | |
| 146 | + except RuntimeError as e: | |
| 147 | + logger.debug(f"Transcript search failed: {e}") | |
| 148 | + | |
| 149 | + if not files: | |
| 150 | + logger.warning("No Google Meet recordings found in Drive") | |
| 151 | + | |
| 152 | + logger.info(f"Found {len(files)} Meet recording(s)") | |
| 153 | + return files | |
| 154 | + | |
| 155 | + def download(self, file: SourceFile, destination: Path) -> Path: | |
| 156 | + """Download a Meet recording from Drive.""" | |
| 157 | + destination = Path(destination) | |
| 158 | + destination.parent.mkdir(parents=True, exist_ok=True) | |
| 159 | + | |
| 160 | + # For video files, download binary content via alt=media | |
| 161 | + result = _run_gws( | |
| 162 | + [ | |
| 163 | + "drive", | |
| 164 | + "files", | |
| 165 | + "get", | |
| 166 | + "--params", | |
| 167 | + json.dumps({"fileId": file.id, "alt": "media"}), | |
| 168 | + ], | |
| 169 | + timeout=300, | |
| 170 | + ) | |
| 171 | + | |
| 172 | + # Write the content — result may be raw binary or a dict wrapper | |
| 173 | + raw = result.get("raw", "") if isinstance(result, dict) else str(result) | |
| 174 | + destination.write_text(raw, encoding="utf-8") | |
| 175 | + logger.info(f"Downloaded {file.name} to {destination}") | |
| 176 | + return destination | |
| 177 | + | |
| 178 | + def fetch_transcript(self, recording_name: str) -> Optional[str]: | |
| 179 | + """Fetch the companion transcript for a Meet recording. | |
| 180 | + | |
| 181 | + Google Meet creates transcript docs with names that typically match | |
| 182 | + the recording date/time. This method searches for the matching | |
| 183 | + Google Doc and extracts its text content. | |
| 184 | + """ | |
| 185 | + transcript_id = self._find_matching_transcript(recording_name) | |
| 186 | + if not transcript_id: | |
| 187 | + logger.info(f"No matching transcript found for: {recording_name}") | |
| 188 | + return None | |
| 189 | + | |
| 190 | + # Fetch the Google Doc content via the Docs API | |
| 191 | + try: | |
| 192 | + result = _run_gws( | |
| 193 | + [ | |
| 194 | + "docs", | |
| 195 | + "documents", | |
| 196 | + "get", | |
| 197 | + "--params", | |
| 198 | + json.dumps({"documentId": transcript_id}), | |
| 199 | + ], | |
| 200 | + timeout=60, | |
| 201 | + ) | |
| 202 | + except RuntimeError as e: | |
| 203 | + logger.warning(f"Failed to fetch transcript doc {transcript_id}: {e}") | |
| 204 | + return None | |
| 205 | + | |
| 206 | + # Extract text from the Docs API structural response | |
| 207 | + body = result.get("body", {}) | |
| 208 | + text_parts: list[str] = [] | |
| 209 | + for element in body.get("content", []): | |
| 210 | + paragraph = element.get("paragraph", {}) | |
| 211 | + for pe in paragraph.get("elements", []): | |
| 212 | + text_run = pe.get("textRun", {}) | |
| 213 | + text = text_run.get("content", "") | |
| 214 | + if text.strip(): | |
| 215 | + text_parts.append(text) | |
| 216 | + | |
| 217 | + if not text_parts: | |
| 218 | + logger.warning(f"Transcript doc {transcript_id} had no extractable text") | |
| 219 | + return None | |
| 220 | + | |
| 221 | + return "".join(text_parts) | |
| 222 | + | |
| 223 | + def _find_matching_transcript(self, recording_name: str) -> Optional[str]: | |
| 224 | + """Search Drive for a transcript doc that matches a recording name. | |
| 225 | + | |
| 226 | + Meet recordings are typically named like: | |
| 227 | + "Meet Recording 2026-03-07T14:30:00" | |
| 228 | + And transcripts are named like: | |
| 229 | + "Meeting Transcript 2026-03-07" or "2026-03-07 - Transcript" | |
| 230 | + | |
| 231 | + This extracts the date portion and searches for matching transcript docs. | |
| 232 | + """ | |
| 233 | + # Extract a date string from the recording name (YYYY-MM-DD pattern) | |
| 234 | + date_match = re.search(r"\d{4}-\d{2}-\d{2}", recording_name) | |
| 235 | + date_str = date_match.group(0) if date_match else recording_name | |
| 236 | + | |
| 237 | + # Search for transcript docs matching the date | |
| 238 | + search_query = " and ".join( | |
| 239 | + [ | |
| 240 | + "mimeType='application/vnd.google-apps.document'", | |
| 241 | + f"name contains '{date_str}'", | |
| 242 | + "(name contains 'Transcript' or name contains 'transcript' " | |
| 243 | + "or name contains 'Meeting notes')", | |
| 244 | + "trashed=false", | |
| 245 | + ] | |
| 246 | + ) | |
| 247 | + if self.drive_folder_id: | |
| 248 | + search_query += f" and '{self.drive_folder_id}' in parents" | |
| 249 | + | |
| 250 | + try: | |
| 251 | + result = _run_gws( | |
| 252 | + [ | |
| 253 | + "drive", | |
| 254 | + "files", | |
| 255 | + "list", | |
| 256 | + "--params", | |
| 257 | + json.dumps( | |
| 258 | + { | |
| 259 | + "q": search_query, | |
| 260 | + "fields": "files(id,name,modifiedTime)", | |
| 261 | + "pageSize": 5, | |
| 262 | + "orderBy": "modifiedTime desc", | |
| 263 | + } | |
| 264 | + ), | |
| 265 | + ], | |
| 266 | + timeout=60, | |
| 267 | + ) | |
| 268 | + except RuntimeError as e: | |
| 269 | + logger.debug(f"Transcript search failed for '{date_str}': {e}") | |
| 270 | + return None | |
| 271 | + | |
| 272 | + files = result.get("files", []) | |
| 273 | + if not files: | |
| 274 | + logger.debug(f"No transcript docs found matching '{date_str}'") | |
| 275 | + return None | |
| 276 | + | |
| 277 | + # Return the most recently modified match | |
| 278 | + best = files[0] | |
| 279 | + logger.info(f"Matched transcript: {best.get('name')} for recording: {recording_name}") | |
| 280 | + return best.get("id") |
| --- a/video_processor/sources/meet_recording_source.py | |
| +++ b/video_processor/sources/meet_recording_source.py | |
| @@ -0,0 +1,280 @@ | |
| --- a/video_processor/sources/meet_recording_source.py | |
| +++ b/video_processor/sources/meet_recording_source.py | |
| @@ -0,0 +1,280 @@ | |
| 1 | """Google Meet recording source using the gws CLI (googleworkspace/cli). |
| 2 | |
| 3 | Fetches Meet recordings and companion transcripts from Google Drive |
| 4 | via the `gws` CLI tool. |
| 5 | |
| 6 | Requires: npm install -g @googleworkspace/cli |
| 7 | Auth: gws auth login (interactive) or GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE (headless) |
| 8 | """ |
| 9 | |
| 10 | import json |
| 11 | import logging |
| 12 | import re |
| 13 | import shutil |
| 14 | import subprocess |
| 15 | from pathlib import Path |
| 16 | from typing import List, Optional |
| 17 | |
| 18 | from video_processor.sources.base import BaseSource, SourceFile |
| 19 | from video_processor.sources.gws_source import _run_gws |
| 20 | |
| 21 | logger = logging.getLogger(__name__) |
| 22 | |
| 23 | |
class MeetRecordingSource(BaseSource):
    """
    Fetch Google Meet recordings and transcripts from Google Drive via the gws CLI.

    Meet stores recordings as MP4 files in Drive (typically in a "Meet Recordings"
    folder) and auto-generated transcripts as Google Docs.

    Usage:
        source = MeetRecordingSource()
        source.authenticate()
        recordings = source.list_videos()
        source.download_all(recordings, Path("./recordings"))

        # Fetch transcript for a specific recording
        transcript = source.fetch_transcript("Meet Recording 2026-03-07")
    """

    def __init__(self, drive_folder_id: Optional[str] = None):
        # When set, every Drive search below is restricted to this folder.
        self.drive_folder_id = drive_folder_id

    def authenticate(self) -> bool:
        """Check that the gws CLI is installed and has valid credentials.

        Returns:
            True when ``gws auth status`` succeeds; False (with a logged
            remediation hint) when the binary is missing or not logged in.
        """
        if not shutil.which("gws"):
            logger.error("gws CLI not found. Install with: npm install -g @googleworkspace/cli")
            return False
        try:
            _run_gws(["auth", "status"], timeout=10)
            return True
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("gws not authenticated. Run: gws auth login")
            return False

    def _drive_list(self, params: dict) -> dict:
        """Run ``gws drive files list --params <json>`` and return the parsed result.

        Shared by list_videos() and _find_matching_transcript(); propagates
        RuntimeError from _run_gws so each caller decides how to degrade.
        """
        return _run_gws(
            ["drive", "files", "list", "--params", json.dumps(params)],
            timeout=60,
        )

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Google Meet recordings in Drive.

        Searches for MP4 files with 'Meet Recording' in the name. If a
        drive_folder_id was set at construction (or folder_id is passed),
        restricts the search to that folder. Companion transcript docs are
        also searched, but only for an informational log line — they are NOT
        included in the returned list.

        Args:
            folder_id: Overrides the instance's drive_folder_id for this call.
            folder_path: Accepted for interface compatibility; unused here.
            patterns: Accepted for interface compatibility; unused here.

        Returns:
            SourceFile entries for matching recordings, newest first;
            an empty list if the Drive query fails.
        """
        target_folder = folder_id or self.drive_folder_id

        # Drive search query for Meet recordings (MP4s named "Meet Recording ...").
        q_parts = [
            "mimeType='video/mp4'",
            "name contains 'Meet Recording'",
            "trashed=false",
        ]
        if target_folder:
            q_parts.append(f"'{target_folder}' in parents")

        try:
            result = self._drive_list(
                {
                    "q": " and ".join(q_parts),
                    "fields": "files(id,name,mimeType,size,modifiedTime)",
                    "pageSize": 50,
                    "orderBy": "modifiedTime desc",
                }
            )
        except RuntimeError as e:
            logger.error(f"Failed to list Meet recordings: {e}")
            return []

        recordings = result.get("files", [])
        files: List[SourceFile] = []
        for item in recordings:
            # Drive reports "size" as a string; normalize to int (None if absent).
            size = item.get("size")
            files.append(
                SourceFile(
                    name=item.get("name", "Meet Recording"),
                    id=item.get("id", ""),
                    size_bytes=int(size) if size else None,
                    mime_type=item.get("mimeType", "video/mp4"),
                    modified_at=item.get("modifiedTime"),
                )
            )

        # Also search for auto-generated transcript docs (logged only; best-effort).
        transcript_q = [
            "mimeType='application/vnd.google-apps.document'",
            "(name contains 'Transcript' or name contains 'Meeting notes')",
            "trashed=false",
        ]
        if target_folder:
            transcript_q.append(f"'{target_folder}' in parents")
        try:
            transcript_result = self._drive_list(
                {
                    "q": " and ".join(transcript_q),
                    "fields": "files(id,name,mimeType,modifiedTime)",
                    "pageSize": 50,
                    "orderBy": "modifiedTime desc",
                }
            )
            transcript_files = transcript_result.get("files", [])
            logger.info(
                f"Found {len(recordings)} recording(s) and "
                f"{len(transcript_files)} transcript doc(s) in Drive"
            )
        except RuntimeError as e:
            # A failed transcript search must not hide the recordings list.
            logger.debug(f"Transcript search failed: {e}")

        if not files:
            logger.warning("No Google Meet recordings found in Drive")

        logger.info(f"Found {len(files)} Meet recording(s)")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a Meet recording from Drive to *destination*.

        Args:
            file: The recording to fetch (its .id is the Drive fileId).
            destination: Target path; parent directories are created.

        Returns:
            The destination path.

        Raises:
            RuntimeError: Propagated from _run_gws on CLI failure.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        # Download the file's binary content via alt=media.
        result = _run_gws(
            [
                "drive",
                "files",
                "get",
                "--params",
                json.dumps({"fileId": file.id, "alt": "media"}),
            ],
            timeout=300,
        )

        # _run_gws may hand back bytes, a plain string, or a dict wrapper with
        # a "raw" key — TODO confirm against _run_gws. Binary payloads must go
        # through write_bytes: utf-8-encoding an MP4 would corrupt it.
        if isinstance(result, (bytes, bytearray)):
            destination.write_bytes(bytes(result))
        else:
            raw = result.get("raw", "") if isinstance(result, dict) else str(result)
            destination.write_text(raw, encoding="utf-8")
        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def fetch_transcript(self, recording_name: str) -> Optional[str]:
        """Fetch the companion transcript for a Meet recording.

        Google Meet creates transcript docs with names that typically match
        the recording date/time. This method searches for the matching
        Google Doc and extracts its text content.

        Returns:
            The transcript's plain text, or None if no matching doc is
            found, the fetch fails, or the doc has no extractable text.
        """
        transcript_id = self._find_matching_transcript(recording_name)
        if not transcript_id:
            logger.info(f"No matching transcript found for: {recording_name}")
            return None

        # Fetch the Google Doc content via the Docs API.
        try:
            result = _run_gws(
                [
                    "docs",
                    "documents",
                    "get",
                    "--params",
                    json.dumps({"documentId": transcript_id}),
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.warning(f"Failed to fetch transcript doc {transcript_id}: {e}")
            return None

        # Walk the Docs API structural response: body.content[].paragraph.elements[].textRun.
        body = result.get("body", {})
        text_parts: list[str] = []
        for element in body.get("content", []):
            paragraph = element.get("paragraph", {})
            for pe in paragraph.get("elements", []):
                text_run = pe.get("textRun", {})
                text = text_run.get("content", "")
                if text.strip():
                    text_parts.append(text)

        if not text_parts:
            logger.warning(f"Transcript doc {transcript_id} had no extractable text")
            return None

        return "".join(text_parts)

    def _find_matching_transcript(self, recording_name: str) -> Optional[str]:
        """Search Drive for a transcript doc that matches a recording name.

        Meet recordings are typically named like:
            "Meet Recording 2026-03-07T14:30:00"
        And transcripts are named like:
            "Meeting Transcript 2026-03-07" or "2026-03-07 - Transcript"

        Extracts the YYYY-MM-DD date portion of the name (falling back to the
        full name) and returns the id of the most recently modified matching
        transcript doc, or None if nothing matches.
        """
        date_match = re.search(r"\d{4}-\d{2}-\d{2}", recording_name)
        date_str = date_match.group(0) if date_match else recording_name

        # Search for transcript docs matching the date.
        search_query = " and ".join(
            [
                "mimeType='application/vnd.google-apps.document'",
                f"name contains '{date_str}'",
                "(name contains 'Transcript' or name contains 'transcript' "
                "or name contains 'Meeting notes')",
                "trashed=false",
            ]
        )
        if self.drive_folder_id:
            search_query += f" and '{self.drive_folder_id}' in parents"

        try:
            result = self._drive_list(
                {
                    "q": search_query,
                    "fields": "files(id,name,modifiedTime)",
                    "pageSize": 5,
                    "orderBy": "modifiedTime desc",
                }
            )
        except RuntimeError as e:
            logger.debug(f"Transcript search failed for '{date_str}': {e}")
            return None

        files = result.get("files", [])
        if not files:
            logger.debug(f"No transcript docs found matching '{date_str}'")
            return None

        # orderBy=modifiedTime desc means files[0] is the newest match.
        best = files[0]
        logger.info(f"Matched transcript: {best.get('name')} for recording: {recording_name}")
        return best.get("id")
| --- a/video_processor/sources/teams_recording_source.py | ||
| +++ b/video_processor/sources/teams_recording_source.py | ||
| @@ -0,0 +1,375 @@ | ||
| 1 | +"""Microsoft Teams meeting recording source using the m365 CLI. | |
| 2 | + | |
| 3 | +Fetches Teams meeting recordings and transcripts via the Microsoft Graph API | |
| 4 | +through the `m365` CLI tool. | |
| 5 | + | |
| 6 | +Requires: npm install -g @pnp/cli-microsoft365 | |
| 7 | +Auth: m365 login (interactive) | |
| 8 | +Docs: https://pnp.github.io/cli-microsoft365/ | |
| 9 | +""" | |
| 10 | + | |
| 11 | +import logging | |
| 12 | +import re | |
| 13 | +import shutil | |
| 14 | +import subprocess | |
| 15 | +from pathlib import Path | |
| 16 | +from typing import List, Optional | |
| 17 | + | |
| 18 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 19 | +from video_processor.sources.m365_source import _run_m365 | |
| 20 | + | |
| 21 | +logger = logging.getLogger(__name__) | |
| 22 | + | |
| 23 | + | |
| 24 | +def _vtt_to_text(vtt: str) -> str: | |
| 25 | + """Strip VTT timing metadata and return plain text. | |
| 26 | + | |
| 27 | + Removes WEBVTT headers, timestamps (00:00:00.000 --> 00:00:05.000), | |
| 28 | + cue identifiers, and deduplicates consecutive identical lines. | |
| 29 | + """ | |
| 30 | + lines = vtt.splitlines() | |
| 31 | + text_lines: list[str] = [] | |
| 32 | + prev_line = "" | |
| 33 | + | |
| 34 | + for line in lines: | |
| 35 | + stripped = line.strip() | |
| 36 | + # Skip WEBVTT header and NOTE blocks | |
| 37 | + if stripped.startswith("WEBVTT") or stripped.startswith("NOTE"): | |
| 38 | + continue | |
| 39 | + # Skip timestamp lines (e.g. 00:00:01.000 --> 00:00:05.000) | |
| 40 | + if re.match(r"\d{2}:\d{2}[:\.][\d.]+ --> \d{2}:\d{2}[:\.][\d.]+", stripped): | |
| 41 | + continue | |
| 42 | + # Skip numeric cue identifiers | |
| 43 | + if re.match(r"^\d+$", stripped): | |
| 44 | + continue | |
| 45 | + # Skip blank lines | |
| 46 | + if not stripped: | |
| 47 | + continue | |
| 48 | + # Strip inline VTT tags like <v Speaker> | |
| 49 | + cleaned = re.sub(r"<[^>]+>", "", stripped).strip() | |
| 50 | + if cleaned and cleaned != prev_line: | |
| 51 | + text_lines.append(cleaned) | |
| 52 | + prev_line = cleaned | |
| 53 | + | |
| 54 | + return "\n".join(text_lines) | |
| 55 | + | |
| 56 | + | |
class TeamsRecordingSource(BaseSource):
    """
    Fetch Teams meeting recordings and transcripts via the m365 CLI / Graph API.

    All Graph calls go through ``m365 request`` so the CLI's stored login is
    reused; this class never handles tokens directly.

    Usage:
        source = TeamsRecordingSource(user_id="me")
        source.authenticate()
        recordings = source.list_videos()
        source.download_all(recordings, Path("./recordings"))

        # Fetch transcript for a specific meeting
        transcript = source.fetch_transcript(meeting_id)
    """

    def __init__(self, user_id: str = "me"):
        # "me" addresses the signed-in m365 user; any other value is treated
        # as a user id / UPN and addressed via the /users/{id} Graph path.
        self.user_id = user_id

    def _graph_user_base(self) -> str:
        """Return the Graph v1.0 base URL for the configured user.

        Graph addresses the signed-in user as ``/me``, but an explicit user
        id or UPN must live under ``/users/{id}`` — interpolating a raw id
        as a top-level path segment is not a valid Graph URL.
        """
        if self.user_id == "me":
            return "https://graph.microsoft.com/v1.0/me"
        return f"https://graph.microsoft.com/v1.0/users/{self.user_id}"

    def authenticate(self) -> bool:
        """Check that the m365 CLI is installed and has an active login."""
        if not shutil.which("m365"):
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            result = _run_m365(["status"], timeout=10)
            # JSON output reports the signed-in identity under "connectedAs";
            # plain-text output contains "Logged in".
            if isinstance(result, dict) and result.get("connectedAs"):
                return True
            if isinstance(result, str) and "Logged in" in result:
                return True
            logger.error("m365 not logged in. Run: m365 login")
            return False
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Teams meeting recordings available for the user.

        The folder/pattern parameters exist for interface compatibility with
        other sources and are not used by Teams.

        Tries multiple approaches, returning at the first one that yields files:
        1. Graph API onlineMeetings endpoint
        2. m365 teams meeting list command
        3. Fallback: search chat messages for recording links
        """
        base = self._graph_user_base()
        files: List[SourceFile] = []

        # Approach 1: Graph API — list online meetings
        try:
            result = _run_m365(
                ["request", "--url", f"{base}/onlineMeetings", "--method", "get"],
                timeout=60,
            )
            for meeting in self._extract_meetings_list(result):
                files.extend(self._get_meeting_recordings(meeting))
            if files:
                logger.info(f"Found {len(files)} recording(s) via Graph API onlineMeetings")
                return files
        except RuntimeError as e:
            logger.debug(f"onlineMeetings endpoint failed: {e}")

        # Approach 2: m365 teams meeting list
        try:
            result = _run_m365(["teams", "meeting", "list"], timeout=60)
            for meeting in result if isinstance(result, list) else []:
                files.extend(self._get_meeting_recordings(meeting))
            if files:
                logger.info(f"Found {len(files)} recording(s) via m365 teams meeting list")
                return files
        except RuntimeError as e:
            logger.debug(f"teams meeting list failed: {e}")

        # Approach 3: Fallback — scan meeting-chat messages for recording links
        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    (
                        f"{base}/chats"
                        "?$expand=messages($top=50)"
                        "&$filter=chatType eq 'meeting'"
                        "&$top=25"
                    ),
                    "--method",
                    "get",
                ],
                timeout=60,
            )
            for chat in self._extract_value_list(result):
                for msg in chat.get("messages", []):
                    body = msg.get("body", {}).get("content", "")
                    if "recording" in body.lower() or ".mp4" in body.lower():
                        topic = chat.get("topic", "Meeting Recording")
                        files.append(
                            SourceFile(
                                name=f"{topic}.mp4",
                                id=f"{chat.get('id', '')}:{msg.get('id', '')}",
                                mime_type="video/mp4",
                                modified_at=msg.get("createdDateTime"),
                            )
                        )
            if files:
                logger.info(f"Found {len(files)} recording link(s) in meeting chats")
        except RuntimeError as e:
            logger.debug(f"Chat message fallback failed: {e}")

        if not files:
            logger.warning("No Teams meeting recordings found")

        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a recording via its Graph API download URL.

        Prefers the direct URL stashed in ``file.path``; otherwise derives a
        content URL from the ``meeting_id:recording_id`` composite in
        ``file.id``, listing the meeting's recordings when the recording id
        is absent.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        url = file.path
        if not url:
            base = self._graph_user_base()
            meeting_id, _, recording_id = file.id.partition(":")
            if recording_id:
                url = f"{base}/onlineMeetings/{meeting_id}/recordings/{recording_id}/content"
            else:
                # No recording id — list the meeting's recordings and take the first.
                url = f"{base}/onlineMeetings/{meeting_id}/recordings"
                result = _run_m365(
                    ["request", "--url", url, "--method", "get"],
                    timeout=60,
                )
                recordings = self._extract_value_list(result)
                if recordings:
                    url = (
                        f"{base}/onlineMeetings/{meeting_id}"
                        f"/recordings/{recordings[0].get('id', '')}/content"
                    )

        _run_m365(
            ["request", "--url", url, "--method", "get", "--filePath", str(destination)],
            timeout=300,
        )

        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def fetch_transcript(self, meeting_id: str) -> Optional[str]:
        """Fetch the transcript for a Teams meeting as plain text.

        Lists the meeting's transcripts via Graph, downloads the first one
        in VTT format, and strips the VTT markup. Returns None when the
        meeting has no transcript or a Graph call fails.
        """
        base = self._graph_user_base()
        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    f"{base}/onlineMeetings/{meeting_id}/transcripts",
                    "--method",
                    "get",
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.warning(f"Failed to list transcripts for meeting {meeting_id}: {e}")
            return None

        transcripts = self._extract_value_list(result)
        if not transcripts:
            logger.info(f"No transcripts found for meeting {meeting_id}")
            return None

        # Download the first available transcript as VTT.
        transcript_id = transcripts[0].get("id", "")
        try:
            content_result = _run_m365(
                [
                    "request",
                    "--url",
                    (
                        f"{base}/onlineMeetings/{meeting_id}"
                        f"/transcripts/{transcript_id}/content"
                    ),
                    "--method",
                    "get",
                    "--accept",
                    "text/vtt",
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.warning(f"Failed to download transcript {transcript_id}: {e}")
            return None

        # _run_m365 may hand back raw VTT text or a dict wrapping it under "raw".
        if isinstance(content_result, dict):
            raw = content_result.get("raw", "")
        else:
            raw = str(content_result)

        if not raw:
            logger.warning(f"Empty transcript content for meeting {meeting_id}")
            return None

        return _vtt_to_text(raw)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _extract_meetings_list(self, result) -> list:
        """Extract a meetings list from a Graph API response."""
        # Same unwrapping rules as any Graph collection response.
        return self._extract_value_list(result)

    def _extract_value_list(self, result) -> list:
        """Extract the "value" collection from a Graph API response.

        ``m365 request`` may return the collection directly (a list) or the
        raw OData envelope (a dict carrying a "value" key).
        """
        if isinstance(result, list):
            return result
        if isinstance(result, dict):
            return result.get("value", [])
        return []

    def _get_meeting_recordings(self, meeting: dict) -> List[SourceFile]:
        """Fetch recordings for a single meeting and return SourceFile entries."""
        meeting_id = meeting.get("id", "")
        subject = meeting.get("subject", meeting.get("topic", "Teams Meeting"))
        start_time = meeting.get("startDateTime", meeting.get("createdDateTime"))

        if not meeting_id:
            return []

        try:
            result = _run_m365(
                [
                    "request",
                    "--url",
                    f"{self._graph_user_base()}/onlineMeetings/{meeting_id}/recordings",
                    "--method",
                    "get",
                ],
                timeout=60,
            )
        except RuntimeError:
            return []

        files: List[SourceFile] = []
        for rec in self._extract_value_list(result):
            # Graph exposes the playable content URL as "recordingContentUrl";
            # "contentUrl" covers older response shapes. The previous flat
            # "content.downloadUrl" key is kept only as a last-resort fallback.
            download_url = (
                rec.get("recordingContentUrl")
                or rec.get("contentUrl")
                or rec.get("content.downloadUrl")
            )
            files.append(
                SourceFile(
                    name=f"{subject}.mp4",
                    id=f"{meeting_id}:{rec.get('id', '')}",
                    mime_type="video/mp4",
                    modified_at=start_time,
                    path=download_url,
                )
            )

        return files
| --- a/video_processor/sources/teams_recording_source.py | |
| +++ b/video_processor/sources/teams_recording_source.py | |
| @@ -0,0 +1,375 @@ | |
| --- a/video_processor/sources/teams_recording_source.py | |
| +++ b/video_processor/sources/teams_recording_source.py | |
| @@ -0,0 +1,375 @@ | |
| 1 | """Microsoft Teams meeting recording source using the m365 CLI. |
| 2 | |
| 3 | Fetches Teams meeting recordings and transcripts via the Microsoft Graph API |
| 4 | through the `m365` CLI tool. |
| 5 | |
| 6 | Requires: npm install -g @pnp/cli-microsoft365 |
| 7 | Auth: m365 login (interactive) |
| 8 | Docs: https://pnp.github.io/cli-microsoft365/ |
| 9 | """ |
| 10 | |
| 11 | import logging |
| 12 | import re |
| 13 | import shutil |
| 14 | import subprocess |
| 15 | from pathlib import Path |
| 16 | from typing import List, Optional |
| 17 | |
| 18 | from video_processor.sources.base import BaseSource, SourceFile |
| 19 | from video_processor.sources.m365_source import _run_m365 |
| 20 | |
| 21 | logger = logging.getLogger(__name__) |
| 22 | |
| 23 | |
| 24 | def _vtt_to_text(vtt: str) -> str: |
| 25 | """Strip VTT timing metadata and return plain text. |
| 26 | |
| 27 | Removes WEBVTT headers, timestamps (00:00:00.000 --> 00:00:05.000), |
| 28 | cue identifiers, and deduplicates consecutive identical lines. |
| 29 | """ |
| 30 | lines = vtt.splitlines() |
| 31 | text_lines: list[str] = [] |
| 32 | prev_line = "" |
| 33 | |
| 34 | for line in lines: |
| 35 | stripped = line.strip() |
| 36 | # Skip WEBVTT header and NOTE blocks |
| 37 | if stripped.startswith("WEBVTT") or stripped.startswith("NOTE"): |
| 38 | continue |
| 39 | # Skip timestamp lines (e.g. 00:00:01.000 --> 00:00:05.000) |
| 40 | if re.match(r"\d{2}:\d{2}[:\.][\d.]+ --> \d{2}:\d{2}[:\.][\d.]+", stripped): |
| 41 | continue |
| 42 | # Skip numeric cue identifiers |
| 43 | if re.match(r"^\d+$", stripped): |
| 44 | continue |
| 45 | # Skip blank lines |
| 46 | if not stripped: |
| 47 | continue |
| 48 | # Strip inline VTT tags like <v Speaker> |
| 49 | cleaned = re.sub(r"<[^>]+>", "", stripped).strip() |
| 50 | if cleaned and cleaned != prev_line: |
| 51 | text_lines.append(cleaned) |
| 52 | prev_line = cleaned |
| 53 | |
| 54 | return "\n".join(text_lines) |
| 55 | |
| 56 | |
class TeamsRecordingSource(BaseSource):
    """Teams meeting recording/transcript source backed by the m365 CLI.

    Every Graph API call is issued through ``m365 request`` so the CLI's
    stored login is reused; no tokens are handled in this class.

    Usage:
        source = TeamsRecordingSource(user_id="me")
        source.authenticate()
        recordings = source.list_videos()
        source.download_all(recordings, Path("./recordings"))

        # Fetch transcript for a specific meeting
        transcript = source.fetch_transcript(meeting_id)
    """

    def __init__(self, user_id: str = "me"):
        self.user_id = user_id

    def _graph(self, suffix: str) -> str:
        """Build a Graph v1.0 URL for the configured user."""
        return f"https://graph.microsoft.com/v1.0/{self.user_id}{suffix}"

    def authenticate(self) -> bool:
        """Verify that the m365 CLI is installed and has an active login."""
        if shutil.which("m365") is None:
            logger.error("m365 CLI not found. Install with: npm install -g @pnp/cli-microsoft365")
            return False
        try:
            status = _run_m365(["status"], timeout=10)
        except (RuntimeError, subprocess.TimeoutExpired):
            logger.error("m365 not logged in. Run: m365 login")
            return False
        logged_in = (isinstance(status, dict) and bool(status.get("connectedAs"))) or (
            isinstance(status, str) and "Logged in" in status
        )
        if not logged_in:
            logger.error("m365 not logged in. Run: m365 login")
        return logged_in

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List Teams meeting recordings available for the user.

        Three strategies are attempted in order, stopping at the first
        that yields results:
        1. Graph API onlineMeetings endpoint
        2. ``m365 teams meeting list``
        3. Scanning meeting-chat messages for recording links
        """
        found: List[SourceFile] = []

        # Strategy 1: Graph API — enumerate online meetings directly.
        try:
            response = _run_m365(
                ["request", "--url", self._graph("/onlineMeetings"), "--method", "get"],
                timeout=60,
            )
            for meeting in self._extract_meetings_list(response):
                found.extend(self._get_meeting_recordings(meeting))
            if found:
                logger.info(f"Found {len(found)} recording(s) via Graph API onlineMeetings")
                return found
        except RuntimeError as e:
            logger.debug(f"onlineMeetings endpoint failed: {e}")

        # Strategy 2: the CLI's own meeting listing.
        try:
            response = _run_m365(["teams", "meeting", "list"], timeout=60)
            for meeting in (response if isinstance(response, list) else []):
                found.extend(self._get_meeting_recordings(meeting))
            if found:
                logger.info(f"Found {len(found)} recording(s) via m365 teams meeting list")
                return found
        except RuntimeError as e:
            logger.debug(f"teams meeting list failed: {e}")

        # Strategy 3: look for recording links posted into meeting chats.
        try:
            chat_url = (
                self._graph("/chats")
                + "?$expand=messages($top=50)"
                + "&$filter=chatType eq 'meeting'"
                + "&$top=25"
            )
            response = _run_m365(
                ["request", "--url", chat_url, "--method", "get"],
                timeout=60,
            )
            for chat in self._extract_value_list(response):
                for msg in chat.get("messages", []):
                    content = msg.get("body", {}).get("content", "").lower()
                    if "recording" not in content and ".mp4" not in content:
                        continue
                    topic = chat.get("topic", "Meeting Recording")
                    found.append(
                        SourceFile(
                            name=f"{topic}.mp4",
                            id=f"{chat.get('id', '')}:{msg.get('id', '')}",
                            mime_type="video/mp4",
                            modified_at=msg.get("createdDateTime"),
                        )
                    )
            if found:
                logger.info(f"Found {len(found)} recording link(s) in meeting chats")
        except RuntimeError as e:
            logger.debug(f"Chat message fallback failed: {e}")

        if not found:
            logger.warning("No Teams meeting recordings found")

        return found

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download one recording to *destination* via the m365 CLI.

        Uses the direct download URL from ``file.path`` when present;
        otherwise derives a Graph content URL from the composite id.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        target_url = file.path
        if not target_url:
            meeting_id, _, recording_id = file.id.partition(":")
            if recording_id:
                target_url = self._graph(
                    f"/onlineMeetings/{meeting_id}/recordings/{recording_id}/content"
                )
            else:
                # No recording id in the composite — look one up.
                target_url = self._graph(f"/onlineMeetings/{meeting_id}/recordings")
                listing = _run_m365(
                    ["request", "--url", target_url, "--method", "get"],
                    timeout=60,
                )
                entries = self._extract_value_list(listing)
                if entries:
                    target_url = self._graph(
                        f"/onlineMeetings/{meeting_id}"
                        f"/recordings/{entries[0].get('id', '')}/content"
                    )

        _run_m365(
            ["request", "--url", target_url, "--method", "get", "--filePath", str(destination)],
            timeout=300,
        )

        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def fetch_transcript(self, meeting_id: str) -> Optional[str]:
        """Return the plain-text transcript of a meeting, if one exists.

        Lists the meeting's transcripts via Graph, downloads the first one
        as VTT, and strips the VTT markup. Returns None on any failure.
        """
        try:
            listing = _run_m365(
                [
                    "request",
                    "--url",
                    self._graph(f"/onlineMeetings/{meeting_id}/transcripts"),
                    "--method",
                    "get",
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.warning(f"Failed to list transcripts for meeting {meeting_id}: {e}")
            return None

        available = self._extract_value_list(listing)
        if not available:
            logger.info(f"No transcripts found for meeting {meeting_id}")
            return None

        transcript_id = available[0].get("id", "")
        try:
            payload = _run_m365(
                [
                    "request",
                    "--url",
                    self._graph(
                        f"/onlineMeetings/{meeting_id}"
                        f"/transcripts/{transcript_id}/content"
                    ),
                    "--method",
                    "get",
                    "--accept",
                    "text/vtt",
                ],
                timeout=60,
            )
        except RuntimeError as e:
            logger.warning(f"Failed to download transcript {transcript_id}: {e}")
            return None

        # The CLI may hand back raw VTT text or a dict wrapping it under "raw".
        vtt_text = payload.get("raw", "") if isinstance(payload, dict) else str(payload)
        if not vtt_text:
            logger.warning(f"Empty transcript content for meeting {meeting_id}")
            return None
        return _vtt_to_text(vtt_text)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _extract_meetings_list(self, result) -> list:
        """Normalize a Graph meetings response into a plain list."""
        if isinstance(result, dict):
            return result.get("value", [])
        return result if isinstance(result, list) else []

    def _extract_value_list(self, result) -> list:
        """Normalize a Graph collection response into a plain list."""
        if isinstance(result, dict):
            return result.get("value", [])
        return result if isinstance(result, list) else []

    def _get_meeting_recordings(self, meeting: dict) -> List[SourceFile]:
        """Resolve one meeting dict into SourceFile entries for its recordings."""
        meeting_id = meeting.get("id", "")
        subject = meeting.get("subject", meeting.get("topic", "Teams Meeting"))
        start_time = meeting.get("startDateTime", meeting.get("createdDateTime"))

        if not meeting_id:
            return []

        try:
            response = _run_m365(
                [
                    "request",
                    "--url",
                    self._graph(f"/onlineMeetings/{meeting_id}/recordings"),
                    "--method",
                    "get",
                ],
                timeout=60,
            )
        except RuntimeError:
            return []

        return [
            SourceFile(
                name=f"{subject}.mp4",
                id=f"{meeting_id}:{rec.get('id', '')}",
                mime_type="video/mp4",
                modified_at=start_time,
                path=rec.get("content.downloadUrl", rec.get("contentUrl")),
            )
            for rec in self._extract_value_list(response)
        ]
| --- a/video_processor/sources/zoom_source.py | ||
| +++ b/video_processor/sources/zoom_source.py | ||
| @@ -0,0 +1,399 @@ | ||
| 1 | +"""Zoom cloud recordings source integration with OAuth support.""" | |
| 2 | + | |
| 3 | +import base64 | |
| 4 | +import hashlib | |
| 5 | +import json | |
| 6 | +import logging | |
| 7 | +import os | |
| 8 | +import secrets | |
| 9 | +import time | |
| 10 | +import webbrowser | |
| 11 | +from pathlib import Path | |
| 12 | +from typing import Dict, List, Optional | |
| 13 | + | |
| 14 | +import requests | |
| 15 | + | |
| 16 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 17 | + | |
| 18 | +logger = logging.getLogger(__name__) | |
| 19 | + | |
| 20 | +_TOKEN_PATH = Path.home() / ".planopticon" / "zoom_token.json" | |
| 21 | +_BASE_URL = "https://api.zoom.us/v2" | |
| 22 | +_OAUTH_BASE = "https://zoom.us/oauth" | |
| 23 | + | |
| 24 | +# Map Zoom file_type values to MIME types | |
| 25 | +_MIME_TYPES = { | |
| 26 | + "MP4": "video/mp4", | |
| 27 | + "M4A": "audio/mp4", | |
| 28 | + "CHAT": "text/plain", | |
| 29 | + "TRANSCRIPT": "text/vtt", | |
| 30 | + "CSV": "text/csv", | |
| 31 | + "TIMELINE": "application/json", | |
| 32 | +} | |
| 33 | + | |
| 34 | + | |
| 35 | +class ZoomSource(BaseSource): | |
| 36 | + """ | |
| 37 | + Zoom cloud recordings source with OAuth2 support. | |
| 38 | + | |
| 39 | + Auth methods (tried in order): | |
| 40 | + 1. Saved token: Load from token_path, refresh if expired | |
| 41 | + 2. Server-to-Server OAuth: Uses account_id with client credentials | |
| 42 | + 3. OAuth2 Authorization Code with PKCE: Interactive browser flow | |
| 43 | + """ | |
| 44 | + | |
| 45 | + def __init__( | |
| 46 | + self, | |
| 47 | + client_id: Optional[str] = None, | |
| 48 | + client_secret: Optional[str] = None, | |
| 49 | + account_id: Optional[str] = None, | |
| 50 | + token_path: Optional[Path] = None, | |
| 51 | + ): | |
| 52 | + """ | |
| 53 | + Initialize Zoom source. | |
| 54 | + | |
| 55 | + Parameters | |
| 56 | + ---------- | |
| 57 | + client_id : str, optional | |
| 58 | + Zoom OAuth app client ID. Falls back to ZOOM_CLIENT_ID env var. | |
| 59 | + client_secret : str, optional | |
| 60 | + Zoom OAuth app client secret. Falls back to ZOOM_CLIENT_SECRET env var. | |
| 61 | + account_id : str, optional | |
| 62 | + Zoom account ID for Server-to-Server OAuth. Falls back to ZOOM_ACCOUNT_ID env var. | |
| 63 | + token_path : Path, optional | |
| 64 | + Where to store/load OAuth tokens. | |
| 65 | + """ | |
| 66 | + self.client_id = client_id or os.environ.get("ZOOM_CLIENT_ID") | |
| 67 | + self.client_secret = client_secret or os.environ.get("ZOOM_CLIENT_SECRET") | |
| 68 | + self.account_id = account_id or os.environ.get("ZOOM_ACCOUNT_ID") | |
| 69 | + self.token_path = token_path or _TOKEN_PATH | |
| 70 | + self._access_token: Optional[str] = None | |
| 71 | + self._token_data: Optional[Dict] = None | |
| 72 | + | |
| 73 | + def authenticate(self) -> bool: | |
| 74 | + """Authenticate with Zoom API.""" | |
| 75 | + # Try 1: Load saved token | |
| 76 | + if self.token_path.exists(): | |
| 77 | + if self._auth_saved_token(): | |
| 78 | + return True | |
| 79 | + | |
| 80 | + # Try 2: Server-to-Server OAuth (if account_id is set) | |
| 81 | + if self.account_id: | |
| 82 | + return self._auth_server_to_server() | |
| 83 | + | |
| 84 | + # Try 3: OAuth2 Authorization Code flow with PKCE | |
| 85 | + return self._auth_oauth_pkce() | |
| 86 | + | |
| 87 | + def _auth_saved_token(self) -> bool: | |
| 88 | + """Authenticate using a saved OAuth token, refreshing if expired.""" | |
| 89 | + try: | |
| 90 | + data = json.loads(self.token_path.read_text()) | |
| 91 | + expires_at = data.get("expires_at", 0) | |
| 92 | + | |
| 93 | + if time.time() < expires_at: | |
| 94 | + # Token still valid | |
| 95 | + self._access_token = data["access_token"] | |
| 96 | + self._token_data = data | |
| 97 | + logger.info("Authenticated with Zoom via saved token") | |
| 98 | + return True | |
| 99 | + | |
| 100 | + # Token expired, try to refresh | |
| 101 | + if data.get("refresh_token"): | |
| 102 | + return self._refresh_token() | |
| 103 | + | |
| 104 | + # Server-to-Server tokens don't have refresh tokens; | |
| 105 | + # fall through to re-authenticate | |
| 106 | + return False | |
| 107 | + except Exception: | |
| 108 | + return False | |
| 109 | + | |
| 110 | + def _auth_server_to_server(self) -> bool: | |
| 111 | + """Authenticate using Server-to-Server OAuth (account credentials).""" | |
| 112 | + if not self.client_id or not self.client_secret: | |
| 113 | + logger.error( | |
| 114 | + "Zoom client_id and client_secret required for Server-to-Server OAuth. " | |
| 115 | + "Set ZOOM_CLIENT_ID and ZOOM_CLIENT_SECRET env vars." | |
| 116 | + ) | |
| 117 | + return False | |
| 118 | + | |
| 119 | + try: | |
| 120 | + resp = requests.post( | |
| 121 | + f"{_OAUTH_BASE}/token", | |
| 122 | + params={ | |
| 123 | + "grant_type": "account_credentials", | |
| 124 | + "account_id": self.account_id, | |
| 125 | + }, | |
| 126 | + auth=(self.client_id, self.client_secret), | |
| 127 | + timeout=30, | |
| 128 | + ) | |
| 129 | + resp.raise_for_status() | |
| 130 | + token_data = resp.json() | |
| 131 | + | |
| 132 | + self._access_token = token_data["access_token"] | |
| 133 | + self._token_data = { | |
| 134 | + "access_token": token_data["access_token"], | |
| 135 | + "expires_at": time.time() + token_data.get("expires_in", 3600) - 60, | |
| 136 | + "token_type": token_data.get("token_type", "bearer"), | |
| 137 | + } | |
| 138 | + | |
| 139 | + self._save_token(self._token_data) | |
| 140 | + logger.info("Authenticated with Zoom via Server-to-Server OAuth") | |
| 141 | + return True | |
| 142 | + except Exception as e: | |
| 143 | + logger.error(f"Zoom Server-to-Server OAuth failed: {e}") | |
| 144 | + return False | |
| 145 | + | |
| 146 | + def _auth_oauth_pkce(self) -> bool: | |
| 147 | + """Run OAuth2 Authorization Code flow with PKCE.""" | |
| 148 | + if not self.client_id: | |
| 149 | + logger.error("Zoom client_id required for OAuth. Set ZOOM_CLIENT_ID env var.") | |
| 150 | + return False | |
| 151 | + | |
| 152 | + try: | |
| 153 | + # Generate PKCE code verifier and challenge | |
| 154 | + code_verifier = secrets.token_urlsafe(64) | |
| 155 | + code_challenge = ( | |
| 156 | + base64.urlsafe_b64encode(hashlib.sha256(code_verifier.encode("ascii")).digest()) | |
| 157 | + .rstrip(b"=") | |
| 158 | + .decode("ascii") | |
| 159 | + ) | |
| 160 | + | |
| 161 | + authorize_url = ( | |
| 162 | + f"{_OAUTH_BASE}/authorize" | |
| 163 | + f"?response_type=code" | |
| 164 | + f"&client_id={self.client_id}" | |
| 165 | + f"&redirect_uri=urn:ietf:wg:oauth:2.0:oob" | |
| 166 | + f"&code_challenge={code_challenge}" | |
| 167 | + f"&code_challenge_method=S256" | |
| 168 | + ) | |
| 169 | + | |
| 170 | + print(f"\nOpen this URL to authorize PlanOpticon:\n{authorize_url}\n") | |
| 171 | + | |
| 172 | + try: | |
| 173 | + webbrowser.open(authorize_url) | |
| 174 | + except Exception: | |
| 175 | + pass | |
| 176 | + | |
| 177 | + auth_code = input("Enter the authorization code: ").strip() | |
| 178 | + | |
| 179 | + # Exchange authorization code for tokens | |
| 180 | + payload = { | |
| 181 | + "grant_type": "authorization_code", | |
| 182 | + "code": auth_code, | |
| 183 | + "redirect_uri": "urn:ietf:wg:oauth:2.0:oob", | |
| 184 | + "code_verifier": code_verifier, | |
| 185 | + } | |
| 186 | + | |
| 187 | + resp = requests.post( | |
| 188 | + f"{_OAUTH_BASE}/token", | |
| 189 | + data=payload, | |
| 190 | + auth=(self.client_id, self.client_secret or ""), | |
| 191 | + timeout=30, | |
| 192 | + ) | |
| 193 | + resp.raise_for_status() | |
| 194 | + token_data = resp.json() | |
| 195 | + | |
| 196 | + self._access_token = token_data["access_token"] | |
| 197 | + self._token_data = { | |
| 198 | + "access_token": token_data["access_token"], | |
| 199 | + "refresh_token": token_data.get("refresh_token"), | |
| 200 | + "expires_at": time.time() + token_data.get("expires_in", 3600) - 60, | |
| 201 | + "token_type": token_data.get("token_type", "bearer"), | |
| 202 | + "client_id": self.client_id, | |
| 203 | + "client_secret": self.client_secret or "", | |
| 204 | + } | |
| 205 | + | |
| 206 | + self._save_token(self._token_data) | |
| 207 | + logger.info("Authenticated with Zoom via OAuth PKCE") | |
| 208 | + return True | |
| 209 | + except Exception as e: | |
| 210 | + logger.error(f"Zoom OAuth PKCE failed: {e}") | |
| 211 | + return False | |
| 212 | + | |
| 213 | + def _refresh_token(self) -> bool: | |
| 214 | + """Refresh an expired OAuth token.""" | |
| 215 | + try: | |
| 216 | + data = json.loads(self.token_path.read_text()) | |
| 217 | + refresh_token = data.get("refresh_token") | |
| 218 | + client_id = data.get("client_id") or self.client_id | |
| 219 | + client_secret = data.get("client_secret") or self.client_secret | |
| 220 | + | |
| 221 | + if not refresh_token or not client_id: | |
| 222 | + return False | |
| 223 | + | |
| 224 | + resp = requests.post( | |
| 225 | + f"{_OAUTH_BASE}/token", | |
| 226 | + data={ | |
| 227 | + "grant_type": "refresh_token", | |
| 228 | + "refresh_token": refresh_token, | |
| 229 | + }, | |
| 230 | + auth=(client_id, client_secret or ""), | |
| 231 | + timeout=30, | |
| 232 | + ) | |
| 233 | + resp.raise_for_status() | |
| 234 | + token_data = resp.json() | |
| 235 | + | |
| 236 | + self._access_token = token_data["access_token"] | |
| 237 | + self._token_data = { | |
| 238 | + "access_token": token_data["access_token"], | |
| 239 | + "refresh_token": token_data.get("refresh_token", refresh_token), | |
| 240 | + "expires_at": time.time() + token_data.get("expires_in", 3600) - 60, | |
| 241 | + "token_type": token_data.get("token_type", "bearer"), | |
| 242 | + "client_id": client_id, | |
| 243 | + "client_secret": client_secret or "", | |
| 244 | + } | |
| 245 | + | |
| 246 | + self._save_token(self._token_data) | |
| 247 | + logger.info("Refreshed Zoom OAuth token") | |
| 248 | + return True | |
| 249 | + except Exception as e: | |
| 250 | + logger.error(f"Zoom token refresh failed: {e}") | |
| 251 | + return False | |
| 252 | + | |
| 253 | + def _save_token(self, data: Dict) -> None: | |
| 254 | + """Save token data to disk.""" | |
| 255 | + self.token_path.parent.mkdir(parents=True, exist_ok=True) | |
| 256 | + self.token_path.write_text(json.dumps(data)) | |
| 257 | + logger.info(f"OAuth token saved to {self.token_path}") | |
| 258 | + | |
| 259 | + def _api_get(self, endpoint: str, params: Optional[Dict] = None) -> requests.Response: | |
| 260 | + """Make an authenticated GET request to the Zoom API.""" | |
| 261 | + if not self._access_token: | |
| 262 | + raise RuntimeError("Not authenticated. Call authenticate() first.") | |
| 263 | + | |
| 264 | + url = f"{_BASE_URL}/{endpoint.lstrip('/')}" | |
| 265 | + resp = requests.get( | |
| 266 | + url, | |
| 267 | + headers={"Authorization": f"Bearer {self._access_token}"}, | |
| 268 | + params=params, | |
| 269 | + timeout=30, | |
| 270 | + ) | |
| 271 | + resp.raise_for_status() | |
| 272 | + return resp | |
| 273 | + | |
| 274 | + def list_videos( | |
| 275 | + self, | |
| 276 | + folder_id: Optional[str] = None, | |
| 277 | + folder_path: Optional[str] = None, | |
| 278 | + patterns: Optional[List[str]] = None, | |
| 279 | + ) -> List[SourceFile]: | |
| 280 | + """List video files from Zoom cloud recordings.""" | |
| 281 | + if not self._access_token: | |
| 282 | + raise RuntimeError("Not authenticated. Call authenticate() first.") | |
| 283 | + | |
| 284 | + files: List[SourceFile] = [] | |
| 285 | + next_page_token = "" | |
| 286 | + | |
| 287 | + while True: | |
| 288 | + params: Dict = {} | |
| 289 | + if next_page_token: | |
| 290 | + params["next_page_token"] = next_page_token | |
| 291 | + | |
| 292 | + resp = self._api_get("users/me/recordings", params=params) | |
| 293 | + data = resp.json() | |
| 294 | + | |
| 295 | + for meeting in data.get("meetings", []): | |
| 296 | + meeting_id = str(meeting.get("id", "")) | |
| 297 | + topic = meeting.get("topic", "Untitled Meeting") | |
| 298 | + start_time = meeting.get("start_time") | |
| 299 | + | |
| 300 | + for rec_file in meeting.get("recording_files", []): | |
| 301 | + file_type = rec_file.get("file_type", "") | |
| 302 | + mime_type = _MIME_TYPES.get(file_type) | |
| 303 | + | |
| 304 | + # Build a descriptive name | |
| 305 | + file_ext = rec_file.get("file_extension", file_type).lower() | |
| 306 | + file_name = f"{topic}.{file_ext}" | |
| 307 | + | |
| 308 | + if patterns: | |
| 309 | + if not any(file_name.endswith(p.replace("*", "")) for p in patterns): | |
| 310 | + continue | |
| 311 | + | |
| 312 | + files.append( | |
| 313 | + SourceFile( | |
| 314 | + name=file_name, | |
| 315 | + id=meeting_id, | |
| 316 | + size_bytes=rec_file.get("file_size"), | |
| 317 | + mime_type=mime_type, | |
| 318 | + modified_at=start_time, | |
| 319 | + path=rec_file.get("download_url"), | |
| 320 | + ) | |
| 321 | + ) | |
| 322 | + | |
| 323 | + next_page_token = data.get("next_page_token", "") | |
| 324 | + if not next_page_token: | |
| 325 | + break | |
| 326 | + | |
| 327 | + logger.info(f"Found {len(files)} recordings in Zoom") | |
| 328 | + return files | |
| 329 | + | |
| 330 | + def download(self, file: SourceFile, destination: Path) -> Path: | |
| 331 | + """Download a recording file from Zoom.""" | |
| 332 | + if not self._access_token: | |
| 333 | + raise RuntimeError("Not authenticated. Call authenticate() first.") | |
| 334 | + | |
| 335 | + destination = Path(destination) | |
| 336 | + destination.parent.mkdir(parents=True, exist_ok=True) | |
| 337 | + | |
| 338 | + download_url = file.path | |
| 339 | + if not download_url: | |
| 340 | + raise ValueError(f"No download URL for file: {file.name}") | |
| 341 | + | |
| 342 | + resp = requests.get( | |
| 343 | + download_url, | |
| 344 | + headers={"Authorization": f"Bearer {self._access_token}"}, | |
| 345 | + stream=True, | |
| 346 | + timeout=60, | |
| 347 | + ) | |
| 348 | + resp.raise_for_status() | |
| 349 | + | |
| 350 | + with open(destination, "wb") as f: | |
| 351 | + for chunk in resp.iter_content(chunk_size=8192): | |
| 352 | + f.write(chunk) | |
| 353 | + | |
| 354 | + logger.info(f"Downloaded {file.name} to {destination}") | |
| 355 | + return destination | |
| 356 | + | |
| 357 | + def fetch_transcript(self, meeting_id: str) -> Optional[str]: | |
| 358 | + """ | |
| 359 | + Fetch the transcript (VTT) for a Zoom meeting recording. | |
| 360 | + | |
| 361 | + Looks for transcript files in the recording's file list and downloads | |
| 362 | + the content as text. | |
| 363 | + | |
| 364 | + Parameters | |
| 365 | + ---------- | |
| 366 | + meeting_id : str | |
| 367 | + The Zoom meeting ID. | |
| 368 | + | |
| 369 | + Returns | |
| 370 | + ------- | |
| 371 | + str or None | |
| 372 | + Transcript text if available, None otherwise. | |
| 373 | + """ | |
| 374 | + if not self._access_token: | |
| 375 | + raise RuntimeError("Not authenticated. Call authenticate() first.") | |
| 376 | + | |
| 377 | + try: | |
| 378 | + resp = self._api_get(f"meetings/{meeting_id}/recordings") | |
| 379 | + data = resp.json() | |
| 380 | + | |
| 381 | + for rec_file in data.get("recording_files", []): | |
| 382 | + file_type = rec_file.get("file_type", "") | |
| 383 | + if file_type == "TRANSCRIPT": | |
| 384 | + download_url = rec_file.get("download_url") | |
| 385 | + if download_url: | |
| 386 | + dl_resp = requests.get( | |
| 387 | + download_url, | |
| 388 | + headers={"Authorization": f"Bearer {self._access_token}"}, | |
| 389 | + timeout=30, | |
| 390 | + ) | |
| 391 | + dl_resp.raise_for_status() | |
| 392 | + logger.info(f"Fetched transcript for meeting {meeting_id}") | |
| 393 | + return dl_resp.text | |
| 394 | + | |
| 395 | + logger.info(f"No transcript found for meeting {meeting_id}") | |
| 396 | + return None | |
| 397 | + except Exception as e: | |
| 398 | + logger.error(f"Failed to fetch transcript for meeting {meeting_id}: {e}") | |
| 399 | + return None |
| --- a/video_processor/sources/zoom_source.py | |
| +++ b/video_processor/sources/zoom_source.py | |
| @@ -0,0 +1,399 @@ | |
| 1 | """Zoom cloud recordings source integration with OAuth support.""" |
| 2 | |
import base64
import fnmatch
import hashlib
import json
import logging
import os
import secrets
import time
import webbrowser
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urlencode

import requests

from video_processor.sources.base import BaseSource, SourceFile
| 17 | |
logger = logging.getLogger(__name__)

# Default on-disk location for the cached OAuth token.
_TOKEN_PATH = Path.home() / ".planopticon" / "zoom_token.json"
# Zoom REST API root and OAuth endpoint base.
_BASE_URL = "https://api.zoom.us/v2"
_OAUTH_BASE = "https://zoom.us/oauth"

# Map Zoom file_type values to MIME types
_MIME_TYPES = {
    "MP4": "video/mp4",
    "M4A": "audio/mp4",
    "CHAT": "text/plain",
    "TRANSCRIPT": "text/vtt",
    "CSV": "text/csv",
    "TIMELINE": "application/json",
}
| 33 | |
| 34 | |
class ZoomSource(BaseSource):
    """
    Zoom cloud recordings source with OAuth2 support.

    Auth methods (tried in order):
    1. Saved token: Load from token_path, refresh if expired
    2. Server-to-Server OAuth: Uses account_id with client credentials
    3. OAuth2 Authorization Code with PKCE: Interactive browser flow
    """

    def __init__(
        self,
        client_id: Optional[str] = None,
        client_secret: Optional[str] = None,
        account_id: Optional[str] = None,
        token_path: Optional[Path] = None,
    ):
        """
        Initialize Zoom source.

        Parameters
        ----------
        client_id : str, optional
            Zoom OAuth app client ID. Falls back to ZOOM_CLIENT_ID env var.
        client_secret : str, optional
            Zoom OAuth app client secret. Falls back to ZOOM_CLIENT_SECRET env var.
        account_id : str, optional
            Zoom account ID for Server-to-Server OAuth. Falls back to ZOOM_ACCOUNT_ID env var.
        token_path : Path, optional
            Where to store/load OAuth tokens.
        """
        self.client_id = client_id or os.environ.get("ZOOM_CLIENT_ID")
        self.client_secret = client_secret or os.environ.get("ZOOM_CLIENT_SECRET")
        self.account_id = account_id or os.environ.get("ZOOM_ACCOUNT_ID")
        self.token_path = token_path or _TOKEN_PATH
        self._access_token: Optional[str] = None  # current bearer token
        self._token_data: Optional[Dict] = None   # full persisted token record

    def authenticate(self) -> bool:
        """Authenticate with Zoom API, trying saved token, S2S, then PKCE."""
        # Try 1: Load saved token
        if self.token_path.exists():
            if self._auth_saved_token():
                return True

        # Try 2: Server-to-Server OAuth (if account_id is set)
        if self.account_id:
            return self._auth_server_to_server()

        # Try 3: OAuth2 Authorization Code flow with PKCE
        return self._auth_oauth_pkce()

    def _auth_saved_token(self) -> bool:
        """Authenticate using a saved OAuth token, refreshing if expired.

        Any read/parse error (missing, corrupt, or unreadable token file) is
        treated as "no saved token" and returns False.
        """
        try:
            data = json.loads(self.token_path.read_text())
            expires_at = data.get("expires_at", 0)

            if time.time() < expires_at:
                # Token still valid
                self._access_token = data["access_token"]
                self._token_data = data
                logger.info("Authenticated with Zoom via saved token")
                return True

            # Token expired, try to refresh
            if data.get("refresh_token"):
                return self._refresh_token()

            # Server-to-Server tokens don't have refresh tokens;
            # fall through to re-authenticate
            return False
        except Exception:
            return False

    def _auth_server_to_server(self) -> bool:
        """Authenticate via Server-to-Server OAuth (``account_credentials`` grant)."""
        if not self.client_id or not self.client_secret:
            logger.error(
                "Zoom client_id and client_secret required for Server-to-Server OAuth. "
                "Set ZOOM_CLIENT_ID and ZOOM_CLIENT_SECRET env vars."
            )
            return False

        try:
            resp = requests.post(
                f"{_OAUTH_BASE}/token",
                params={
                    "grant_type": "account_credentials",
                    "account_id": self.account_id,
                },
                auth=(self.client_id, self.client_secret),
                timeout=30,
            )
            resp.raise_for_status()
            token_data = resp.json()

            self._access_token = token_data["access_token"]
            # Expire 60s early so a token is never presented at its deadline.
            self._token_data = {
                "access_token": token_data["access_token"],
                "expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
                "token_type": token_data.get("token_type", "bearer"),
            }

            self._save_token(self._token_data)
            logger.info("Authenticated with Zoom via Server-to-Server OAuth")
            return True
        except Exception as e:
            logger.error(f"Zoom Server-to-Server OAuth failed: {e}")
            return False

    def _auth_oauth_pkce(self) -> bool:
        """Run the interactive OAuth2 Authorization Code flow with PKCE (RFC 7636)."""
        if not self.client_id:
            logger.error("Zoom client_id required for OAuth. Set ZOOM_CLIENT_ID env var.")
            return False

        try:
            # PKCE: S256 challenge derived from a high-entropy verifier
            code_verifier = secrets.token_urlsafe(64)
            code_challenge = (
                base64.urlsafe_b64encode(hashlib.sha256(code_verifier.encode("ascii")).digest())
                .rstrip(b"=")
                .decode("ascii")
            )

            # urlencode percent-encodes every parameter (notably the "urn:"
            # redirect URI), which raw f-string concatenation did not.
            query = urlencode(
                {
                    "response_type": "code",
                    "client_id": self.client_id,
                    "redirect_uri": "urn:ietf:wg:oauth:2.0:oob",
                    "code_challenge": code_challenge,
                    "code_challenge_method": "S256",
                }
            )
            authorize_url = f"{_OAUTH_BASE}/authorize?{query}"

            print(f"\nOpen this URL to authorize PlanOpticon:\n{authorize_url}\n")

            try:
                webbrowser.open(authorize_url)
            except Exception:
                pass  # Headless environments: the printed URL is enough.

            auth_code = input("Enter the authorization code: ").strip()

            # Exchange authorization code for tokens
            payload = {
                "grant_type": "authorization_code",
                "code": auth_code,
                "redirect_uri": "urn:ietf:wg:oauth:2.0:oob",
                "code_verifier": code_verifier,
            }

            resp = requests.post(
                f"{_OAUTH_BASE}/token",
                data=payload,
                auth=(self.client_id, self.client_secret or ""),
                timeout=30,
            )
            resp.raise_for_status()
            token_data = resp.json()

            self._access_token = token_data["access_token"]
            # client_id/secret are stored so refresh works without env vars.
            self._token_data = {
                "access_token": token_data["access_token"],
                "refresh_token": token_data.get("refresh_token"),
                "expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
                "token_type": token_data.get("token_type", "bearer"),
                "client_id": self.client_id,
                "client_secret": self.client_secret or "",
            }

            self._save_token(self._token_data)
            logger.info("Authenticated with Zoom via OAuth PKCE")
            return True
        except Exception as e:
            logger.error(f"Zoom OAuth PKCE failed: {e}")
            return False

    def _refresh_token(self) -> bool:
        """Exchange the stored refresh token for a new access token."""
        try:
            data = json.loads(self.token_path.read_text())
            refresh_token = data.get("refresh_token")
            client_id = data.get("client_id") or self.client_id
            client_secret = data.get("client_secret") or self.client_secret

            if not refresh_token or not client_id:
                return False

            resp = requests.post(
                f"{_OAUTH_BASE}/token",
                data={
                    "grant_type": "refresh_token",
                    "refresh_token": refresh_token,
                },
                auth=(client_id, client_secret or ""),
                timeout=30,
            )
            resp.raise_for_status()
            token_data = resp.json()

            self._access_token = token_data["access_token"]
            self._token_data = {
                "access_token": token_data["access_token"],
                # Zoom may rotate the refresh token; keep the old one otherwise.
                "refresh_token": token_data.get("refresh_token", refresh_token),
                "expires_at": time.time() + token_data.get("expires_in", 3600) - 60,
                "token_type": token_data.get("token_type", "bearer"),
                "client_id": client_id,
                "client_secret": client_secret or "",
            }

            self._save_token(self._token_data)
            logger.info("Refreshed Zoom OAuth token")
            return True
        except Exception as e:
            logger.error(f"Zoom token refresh failed: {e}")
            return False

    def _save_token(self, data: Dict) -> None:
        """Persist token data as JSON, restricted to owner read/write.

        The file may contain a refresh token and the client secret, so it is
        chmod'ed to 0o600 after writing (best effort).
        """
        self.token_path.parent.mkdir(parents=True, exist_ok=True)
        self.token_path.write_text(json.dumps(data))
        try:
            self.token_path.chmod(0o600)  # may be a no-op on Windows
        except OSError:
            pass
        logger.info(f"OAuth token saved to {self.token_path}")

    def _api_get(self, endpoint: str, params: Optional[Dict] = None) -> requests.Response:
        """Make an authenticated GET request to the Zoom API.

        Raises ``RuntimeError`` when not authenticated; non-2xx responses
        surface as ``requests.HTTPError``.
        """
        if not self._access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        url = f"{_BASE_URL}/{endpoint.lstrip('/')}"
        resp = requests.get(
            url,
            headers={"Authorization": f"Bearer {self._access_token}"},
            params=params,
            timeout=30,
        )
        resp.raise_for_status()
        return resp

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List video files from Zoom cloud recordings.

        Pages through ``users/me/recordings`` and flattens each meeting's
        recording files into ``SourceFile`` entries.

        Parameters
        ----------
        folder_id, folder_path : str, optional
            Unused; Zoom has no folder hierarchy (kept for interface parity).
        patterns : list of str, optional
            Glob-style name filters (e.g. ``"*.mp4"``).

        Returns
        -------
        list of SourceFile
            One entry per recording file; ``path`` carries the download URL.
        """
        if not self._access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        files: List[SourceFile] = []
        next_page_token = ""

        while True:
            params: Dict = {}
            if next_page_token:
                params["next_page_token"] = next_page_token

            resp = self._api_get("users/me/recordings", params=params)
            data = resp.json()

            for meeting in data.get("meetings", []):
                meeting_id = str(meeting.get("id", ""))
                topic = meeting.get("topic", "Untitled Meeting")
                start_time = meeting.get("start_time")

                for rec_file in meeting.get("recording_files", []):
                    file_type = rec_file.get("file_type", "")
                    mime_type = _MIME_TYPES.get(file_type)

                    # Build a descriptive name
                    file_ext = rec_file.get("file_extension", file_type).lower()
                    file_name = f"{topic}.{file_ext}"

                    # Proper glob matching; endswith kept for backward
                    # compatibility with bare-suffix patterns.
                    if patterns and not any(
                        fnmatch.fnmatch(file_name, p)
                        or file_name.endswith(p.replace("*", ""))
                        for p in patterns
                    ):
                        continue

                    files.append(
                        SourceFile(
                            name=file_name,
                            id=meeting_id,
                            size_bytes=rec_file.get("file_size"),
                            mime_type=mime_type,
                            modified_at=start_time,
                            path=rec_file.get("download_url"),
                        )
                    )

            next_page_token = data.get("next_page_token", "")
            if not next_page_token:
                break

        logger.info(f"Found {len(files)} recordings in Zoom")
        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a recording file from Zoom to ``destination``.

        Raises
        ------
        RuntimeError
            If not authenticated.
        ValueError
            If the file has no download URL.
        """
        if not self._access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        download_url = file.path
        if not download_url:
            raise ValueError(f"No download URL for file: {file.name}")

        resp = requests.get(
            download_url,
            headers={"Authorization": f"Bearer {self._access_token}"},
            stream=True,
            timeout=60,
        )
        resp.raise_for_status()

        with open(destination, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)

        logger.info(f"Downloaded {file.name} to {destination}")
        return destination

    def fetch_transcript(self, meeting_id: str) -> Optional[str]:
        """
        Fetch the transcript (VTT) for a Zoom meeting recording.

        Looks for transcript files in the recording's file list and downloads
        the content as text.

        Parameters
        ----------
        meeting_id : str
            The Zoom meeting ID.

        Returns
        -------
        str or None
            Transcript text if available, None otherwise.
        """
        if not self._access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        try:
            resp = self._api_get(f"meetings/{meeting_id}/recordings")
            data = resp.json()

            for rec_file in data.get("recording_files", []):
                file_type = rec_file.get("file_type", "")
                if file_type == "TRANSCRIPT":
                    download_url = rec_file.get("download_url")
                    if download_url:
                        dl_resp = requests.get(
                            download_url,
                            headers={"Authorization": f"Bearer {self._access_token}"},
                            timeout=30,
                        )
                        dl_resp.raise_for_status()
                        logger.info(f"Fetched transcript for meeting {meeting_id}")
                        return dl_resp.text

            logger.info(f"No transcript found for meeting {meeting_id}")
            return None
        except Exception as e:
            logger.error(f"Failed to fetch transcript for meeting {meeting_id}: {e}")
            return None