PlanOpticon

planopticon / video_processor / sources / youtube_source.py

Blame History Raw 119 lines

1	`"""YouTube source connector using yt-dlp for video/audio download and caption extraction."""`
2
3	`import logging`
4	`import re`
5	`from pathlib import Path`
6	`from typing import List, Optional`
7
8	`from video_processor.sources.base import BaseSource, SourceFile`
9
logger = logging.getLogger(__name__)

# Recognizes the URL shapes that carry an 11-character video ID; capture
# group 1 is the ID itself. The pattern is used with ``search`` so scheme and
# subdomain (www., m., music.) do not need to be spelled out.
_YT_URL_PATTERN = re.compile(
    r"(?:"
    # Watch pages: ``v=`` may be the first query parameter or follow others
    # (e.g. ``watch?feature=share&v=<id>``), but must stay inside the query.
    r"youtube\.com/watch\?(?:[^#\s]*&)?v="
    # Short share links.
    r"|youtu\.be/"
    # Path-style URLs where the ID is the path segment after the prefix.
    r"|youtube\.com/(?:shorts|embed|live)/"
    r")"
    r"([\w-]{11})"
)
15
16
def _extract_video_id(url: str) -> str:
    """Return the 11-character video ID found in a YouTube URL.

    Args:
        url: Any string containing a URL shape that ``_YT_URL_PATTERN``
            recognizes.

    Returns:
        The first capture group of the pattern match, i.e. the video ID.

    Raises:
        ValueError: If the pattern does not match anywhere in ``url``.
    """
    found = _YT_URL_PATTERN.search(url)
    if found is None:
        raise ValueError(f"Could not extract YouTube video ID from: {url}")
    return found.group(1)
23
24
class YouTubeSource(BaseSource):
    """
    Download YouTube videos/audio and extract captions via yt-dlp.

    Requires: pip install yt-dlp

    Instance attributes (all set in ``__init__``):
        url: The URL supplied by the caller; handed to yt-dlp unchanged.
        video_id: The 11-character ID parsed out of ``url``.
        audio_only: When True, ``download`` extracts an MP3 audio track
            instead of fetching a video file.
    """

    # Caption formats fetch_captions will return, most preferred first.
    _CAPTION_EXTS = ("vtt", "srv3", "json3")

    def __init__(self, url: str, audio_only: bool = False):
        """Remember the URL and parse its video ID.

        Args:
            url: YouTube URL of the video.
            audio_only: If True, download audio only (converted to MP3).

        Raises:
            ValueError: If no video ID can be extracted from ``url``.
        """
        self.url = url
        self.video_id = _extract_video_id(url)
        self.audio_only = audio_only

    def authenticate(self) -> bool:
        """No auth needed for public videos. Returns True if yt-dlp is available."""
        try:
            import yt_dlp  # noqa: F401
        except ImportError:
            logger.error("yt-dlp not installed. Run: pip install yt-dlp")
            return False
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the YouTube video.

        ``folder_id``, ``folder_path`` and ``patterns`` are not used by this
        source (presumably they exist to match the BaseSource signature).

        Returns:
            A one-element list. The name falls back to the video ID when
            yt-dlp reports no title; the size is yt-dlp's exact or
            approximate file size, or None when neither is reported.
        """
        import yt_dlp

        with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
            info = ydl.extract_info(self.url, download=False) or {}

        # ``title`` can be present but null, so ``or`` is used instead of a
        # ``.get`` default, which would only cover a missing key.
        title = info.get("title") or self.video_id
        raw_size = info.get("filesize") or info.get("filesize_approx")
        size_bytes = int(raw_size) if raw_size else None

        # download() converts audio-only output to MP3, so advertise the
        # matching MIME type rather than the source container's.
        mime_type = "audio/mpeg" if self.audio_only else "video/mp4"

        return [
            SourceFile(
                name=title,
                id=self.video_id,
                size_bytes=size_bytes,
                mime_type=mime_type,
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download the video or audio to destination path.

        Args:
            file: Not used; the video is identified by ``self.url``.
            destination: Output path handed to yt-dlp as its output template.
                Parent directories are created if missing.

        Returns:
            The path of the file yt-dlp reports having written. This can
            differ from ``destination`` when a postprocessor renames the
            output (audio-only mode re-encodes to MP3). Falls back to
            ``destination`` when yt-dlp reports no final path.
        """
        import yt_dlp

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        opts = {
            "outtmpl": str(destination),
            "quiet": True,
        }
        if self.audio_only:
            opts["format"] = "bestaudio/best"
            opts["postprocessors"] = [{"key": "FFmpegExtractAudio", "preferredcodec": "mp3"}]
        else:
            opts["format"] = "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"

        with yt_dlp.YoutubeDL(opts) as ydl:
            # extract_info(download=True) performs the download and also
            # returns the metadata needed to learn the final file name.
            info = ydl.extract_info(self.url, download=True)

        final_path = self._final_download_path(info, destination)
        logger.info("Downloaded YouTube video %s to %s", self.video_id, final_path)
        return final_path

    @staticmethod
    def _final_download_path(info: Optional[dict], fallback: Path) -> Path:
        """Return the post-processed file path reported by yt-dlp, else ``fallback``."""
        downloads = (info or {}).get("requested_downloads") or []
        for entry in downloads:
            reported = entry.get("filepath")
            if reported:
                return Path(reported)
        return fallback

    def fetch_captions(self, lang: str = "en") -> Optional[str]:
        """Fetch the caption track for ``lang`` and return its raw body.

        Manually provided subtitles are preferred over automatic captions.
        Among the available formats the first of vtt, srv3, json3 that can
        be fetched is returned as-is (no markup is stripped).

        Args:
            lang: Language code used to look up the caption track.

        Returns:
            The caption file content as text, or None when no track exists
            for ``lang``, none is in a supported format, or every fetch fails.
        """
        import yt_dlp

        opts = {
            "quiet": True,
            "writeautomaticsub": True,
            "writesubtitles": True,
            "subtitleslangs": [lang],
            "skip_download": True,
        }
        with yt_dlp.YoutubeDL(opts) as ydl:
            info = ydl.extract_info(self.url, download=False) or {}

        # Either mapping may be missing or null in the metadata.
        manual = (info.get("subtitles") or {}).get(lang)
        automatic = (info.get("automatic_captions") or {}).get(lang)
        tracks = manual or automatic
        if not tracks:
            logger.warning("No captions found for language '%s'", lang)
            return None

        # Keep the first listed track for each format.
        by_ext = {}
        for track in tracks:
            by_ext.setdefault(track.get("ext"), track)

        # Imported only once a download is actually needed, so a missing
        # ``requests`` install does not break the "no captions" path.
        import requests

        for ext in self._CAPTION_EXTS:
            track = by_ext.get(ext)
            if track is None or not track.get("url"):
                continue
            try:
                resp = requests.get(track["url"], timeout=30)
                # Without this an HTTP error page would be returned as captions.
                resp.raise_for_status()
            except requests.RequestException as exc:
                logger.warning(
                    "Failed to fetch %s captions for %s: %s", ext, self.video_id, exc
                )
                continue
            return resp.text

        return None
119

PlanOpticon

Keyboard Shortcuts