PlanOpticon

planopticon / video_processor / sources / youtube_source.py
Blame History Raw 119 lines
1
"""YouTube source connector using yt-dlp for video/audio download and caption extraction."""
2
3
import logging
4
import re
5
from pathlib import Path
6
from typing import List, Optional
7
8
from video_processor.sources.base import BaseSource, SourceFile
9
10
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
11
12
# Recognised URL shapes (the capture group is always the 11-character ID):
#   * youtube.com/watch?...v=<id>   -- ``v`` may appear anywhere in the query
#   * youtu.be/<id>
#   * youtube.com/shorts/<id>, /embed/<id>, /live/<id>, /v/<id>
#   * youtube-nocookie.com/embed/<id>
# The lazy quantifiers make the *first* ``v=`` parameter win, so URLs that
# start with ``watch?v=`` resolve exactly as they did before.
_YT_URL_PATTERN = re.compile(
    r"(?:"
    r"youtube\.com/watch\?(?:[^#\s]*?&)??v="
    r"|youtu\.be/"
    r"|youtube(?:-nocookie)?\.com/(?:shorts|embed|live|v)/"
    r")([\w-]{11})"
)


def _extract_video_id(url: str) -> str:
    """Extract the 11-character video ID from a YouTube URL.

    Args:
        url: A YouTube watch, short-link, shorts, embed, live or /v/ URL.

    Returns:
        The 11-character video ID found in ``url``.

    Raises:
        ValueError: If ``url`` does not contain a recognisable video ID.
    """
    match = _YT_URL_PATTERN.search(url)
    if match is None:
        raise ValueError(f"Could not extract YouTube video ID from: {url}")
    return match.group(1)
23
24
25
class YouTubeSource(BaseSource):
    """
    Download YouTube videos/audio and extract captions via yt-dlp.

    Requires: pip install yt-dlp
    """

    # Subtitle formats fetch_captions() accepts, most preferred first.
    _CAPTION_EXT_PREFERENCE = ("vtt", "srv3", "json3")

    def __init__(self, url: str, audio_only: bool = False):
        """Bind the source to a single YouTube URL.

        Args:
            url: YouTube URL containing an 11-character video ID.
            audio_only: When True, download() extracts an MP3 audio track
                instead of fetching a video file.

        Raises:
            ValueError: If no video ID can be extracted from ``url``.
        """
        self.url = url
        self.video_id = _extract_video_id(url)
        self.audio_only = audio_only

    def authenticate(self) -> bool:
        """No auth needed for public videos. Returns True if yt-dlp is available."""
        try:
            import yt_dlp  # noqa: F401
        except ImportError:
            logger.error("yt-dlp not installed. Run: pip install yt-dlp")
            return False
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the YouTube video.

        ``folder_id``, ``folder_path`` and ``patterns`` are accepted but not
        used by this method.

        Returns:
            A one-element list. ``name`` is the video title (falling back to
            the video ID), and ``mime_type`` reflects what download() will
            produce: MP3 audio when ``audio_only`` is set, MP4 video otherwise.
        """
        import yt_dlp

        with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
            info = ydl.extract_info(self.url, download=False) or {}

        return [
            SourceFile(
                # ``or`` (not a .get default) so a present-but-null title
                # still falls back to the video ID.
                name=info.get("title") or self.video_id,
                id=self.video_id,
                size_bytes=info.get("filesize"),
                # download() transcodes audio to MP3, so advertise audio/mpeg.
                mime_type="audio/mpeg" if self.audio_only else "video/mp4",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download the video or audio to destination path.

        Args:
            file: Entry returned by list_videos(); not read by this method,
                which always downloads ``self.url``.
            destination: Target file path. Parent directories are created.

        Returns:
            The path of the downloaded file. In audio-only mode this is the
            ``.mp3`` sibling of ``destination`` when the converted file was
            written there instead of at ``destination`` itself.
        """
        import yt_dlp

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        opts = {
            # yt-dlp treats "%" in outtmpl as template syntax; a literal
            # percent sign in the path has to be written as "%%".
            "outtmpl": str(destination).replace("%", "%%"),
            "quiet": True,
        }
        if self.audio_only:
            opts["format"] = "bestaudio/best"
            opts["postprocessors"] = [{"key": "FFmpegExtractAudio", "preferredcodec": "mp3"}]
        else:
            opts["format"] = "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"

        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([self.url])

        if self.audio_only and not destination.exists():
            # The audio-extraction post-processor writes its result with an
            # .mp3 extension, so the file may not be at ``destination``.
            # Only switch when the converted file is actually on disk.
            converted = destination.with_suffix(".mp3")
            if converted.exists():
                destination = converted

        logger.info("Downloaded YouTube video %s to %s", self.video_id, destination)
        return destination

    def fetch_captions(self, lang: str = "en") -> Optional[str]:
        """Fetch manual or auto-generated captions for ``lang``.

        Manual subtitles are preferred over automatic captions. Among the
        available tracks, the format is chosen in the order vtt, srv3, json3.

        Args:
            lang: Caption language code to look up.

        Returns:
            The raw caption file content as served (subtitle markup, not
            stripped text), or None when no usable caption track exists.

        Raises:
            requests.HTTPError: If the caption URL answers with an HTTP
                error status.
        """
        import yt_dlp

        opts = {
            "quiet": True,
            "writeautomaticsub": True,
            "writesubtitles": True,
            "subtitleslangs": [lang],
            "skip_download": True,
        }
        with yt_dlp.YoutubeDL(opts) as ydl:
            info = ydl.extract_info(self.url, download=False) or {}

        # ``or {}`` guards against the keys being present but set to None.
        manual = (info.get("subtitles") or {}).get(lang)
        automatic = (info.get("automatic_captions") or {}).get(lang)
        subs = manual or automatic
        if not subs:
            logger.warning("No captions found for language '%s'", lang)
            return None

        # Index the first usable URL per format so the choice follows the
        # preference order rather than the order the tracks are listed in.
        url_by_ext = {}
        for track in subs:
            track_url = track.get("url")
            if track_url:
                url_by_ext.setdefault(track.get("ext"), track_url)

        for ext in self._CAPTION_EXT_PREFERENCE:
            caption_url = url_by_ext.get(ext)
            if caption_url is None:
                continue

            import requests

            resp = requests.get(caption_url, timeout=30)
            # Do not hand an HTTP error page back to the caller as captions.
            resp.raise_for_status()
            return resp.text

        return None
119

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button