PlanOpticon

planopticon / video_processor / sources / youtube_source.py
Source Blame History 118 lines
0981a08… noreply 1 """YouTube source connector using yt-dlp for video/audio download and caption extraction."""
0981a08… noreply 2
0981a08… noreply 3 import logging
0981a08… noreply 4 import re
0981a08… noreply 5 from pathlib import Path
0981a08… noreply 6 from typing import List, Optional
0981a08… noreply 7
0981a08… noreply 8 from video_processor.sources.base import BaseSource, SourceFile
0981a08… noreply 9
0981a08… noreply 10 logger = logging.getLogger(__name__)
0981a08… noreply 11
0981a08… noreply 12 _YT_URL_PATTERN = re.compile(
0981a08… noreply 13 r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/shorts/)([\w-]{11})"
0981a08… noreply 14 )
0981a08… noreply 15
0981a08… noreply 16
0981a08… noreply 17 def _extract_video_id(url: str) -> str:
0981a08… noreply 18 """Extract the 11-character video ID from a YouTube URL."""
0981a08… noreply 19 match = _YT_URL_PATTERN.search(url)
0981a08… noreply 20 if not match:
0981a08… noreply 21 raise ValueError(f"Could not extract YouTube video ID from: {url}")
0981a08… noreply 22 return match.group(1)
0981a08… noreply 23
0981a08… noreply 24
class YouTubeSource(BaseSource):
    """
    Download a single YouTube video (or only its audio) and fetch captions via yt-dlp.

    Requires: pip install yt-dlp

    yt-dlp is imported lazily inside each method, so this module can be imported
    without it; call ``authenticate()`` to check availability. ``fetch_captions``
    additionally imports ``requests`` to download the caption file.
    """

    def __init__(self, url: str, audio_only: bool = False):
        """
        Args:
            url: YouTube URL; it must contain a video ID that ``_extract_video_id``
                can parse.
            audio_only: When True, ``download`` fetches the best audio stream and
                converts it to mp3 instead of downloading the video.

        Raises:
            ValueError: If no video ID can be extracted from ``url``.
        """
        self.url = url
        self.video_id = _extract_video_id(url)
        self.audio_only = audio_only

    def authenticate(self) -> bool:
        """No auth needed for public videos. Returns True if yt-dlp is available."""
        try:
            import yt_dlp  # noqa: F401
        except ImportError:
            logger.error("yt-dlp not installed. Run: pip install yt-dlp")
            return False
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the YouTube video.

        ``folder_id``, ``folder_path`` and ``patterns`` are accepted but not used
        here (presumably they exist to match the ``BaseSource`` signature).

        Returns:
            A one-element list. ``name`` falls back to the video ID when yt-dlp
            reports no title; ``size_bytes`` may be None when yt-dlp reports
            neither an exact nor an approximate size.
        """
        import yt_dlp

        with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
            # Metadata only; nothing is written to disk.
            info = ydl.extract_info(self.url, download=False) or {}

        return [
            SourceFile(
                # ``or`` also covers a title key that is present but None/empty.
                name=info.get("title") or self.video_id,
                id=self.video_id,
                size_bytes=info.get("filesize") or info.get("filesize_approx"),
                # NOTE(review): audio-only downloads are converted to mp3 by
                # ``download``; "audio/webm" is kept for caller compatibility --
                # confirm whether it should be "audio/mpeg".
                mime_type="audio/webm" if self.audio_only else "video/mp4",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download the video or audio to destination path.

        Args:
            file: Unused; the video is always the one given by ``self.url``.
            destination: Target file path. Missing parent directories are created.

        Returns:
            Path of the downloaded file. In audio-only mode the mp3 post-processor
            writes its result with an ``.mp3`` extension, so that path is returned
            when it exists; otherwise ``destination`` is returned unchanged.
        """
        import yt_dlp

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        opts = {
            "outtmpl": str(destination),
            "quiet": True,
        }
        if self.audio_only:
            opts["format"] = "bestaudio/best"
            opts["postprocessors"] = [{"key": "FFmpegExtractAudio", "preferredcodec": "mp3"}]
        else:
            # Prefer separate mp4 video + m4a audio, then progressively looser fallbacks.
            opts["format"] = "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"

        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([self.url])

        if self.audio_only:
            # The extract-audio step replaces the extension with ".mp3"; report the
            # file that actually exists rather than the requested name.
            extracted = destination.with_suffix(".mp3")
            if extracted.exists():
                destination = extracted

        logger.info("Downloaded YouTube video %s to %s", self.video_id, destination)
        return destination

    def fetch_captions(self, lang: str = "en") -> Optional[str]:
        """Fetch manual or auto-generated captions for ``lang``.

        Manually authored subtitles take precedence over automatic captions.
        Among the available tracks, the first of vtt, srv3, json3 (in that order)
        that downloads successfully is used.

        Args:
            lang: Caption language code as used by yt-dlp (default ``"en"``).

        Returns:
            The raw text of the caption file (not stripped of timing/markup), or
            None when no captions exist for ``lang``, none is in a supported
            format, or every download attempt returned an HTTP error.
        """
        import yt_dlp

        opts = {
            "quiet": True,
            "writeautomaticsub": True,
            "writesubtitles": True,
            "subtitleslangs": [lang],
            "skip_download": True,
        }
        with yt_dlp.YoutubeDL(opts) as ydl:
            info = ydl.extract_info(self.url, download=False) or {}

        # Either mapping may be missing or None, hence ``or {}``.
        manual = (info.get("subtitles") or {}).get(lang)
        automatic = (info.get("automatic_captions") or {}).get(lang)
        tracks = manual or automatic
        if not tracks:
            logger.warning(f"No captions found for language '{lang}'")
            return None

        # Index the first track of each format so the preference order below,
        # not the order of yt-dlp's list, decides which one is used.
        first_by_ext = {}
        for track in tracks:
            first_by_ext.setdefault(track.get("ext"), track)

        for ext in ("vtt", "srv3", "json3"):
            track = first_by_ext.get(ext)
            if not track or not track.get("url"):
                continue

            # Imported lazily: only needed once a downloadable track exists.
            import requests

            resp = requests.get(track["url"], timeout=30)
            if resp.ok:
                return resp.text
            # Do not hand an HTTP error page back as captions; try the next format.
            logger.warning(
                "Caption download failed for format '%s' (HTTP %s)", ext, resp.status_code
            )

        return None

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button