PlanOpticon
feat(sources): add YouTube, web, GitHub, Reddit, HN, RSS, podcast, Twitter, arXiv connectors. Each connector implements the SourceConnector pattern with optional external dependencies (yt-dlp, feedparser, beautifulsoup4).
Commit
b1d426992ff933d5c74d4f32a0257ce7eadb03ee76026a0edbaf442c0c60f746
Parent
1e76ab0a2b6c1c6…
10 files changed
+37
-2
+117
+156
+112
+119
+103
+114
+129
+90
+118
~
video_processor/sources/__init__.py
+
video_processor/sources/arxiv_source.py
+
video_processor/sources/github_source.py
+
video_processor/sources/hackernews_source.py
+
video_processor/sources/podcast_source.py
+
video_processor/sources/reddit_source.py
+
video_processor/sources/rss_source.py
+
video_processor/sources/twitter_source.py
+
video_processor/sources/web_source.py
+
video_processor/sources/youtube_source.py
+37
-2
| --- video_processor/sources/__init__.py | ||
| +++ video_processor/sources/__init__.py | ||
| @@ -1,5 +1,40 @@ | ||
| 1 | -"""Cloud source integrations for fetching videos from remote storage.""" | |
| 1 | +"""Cloud and web source integrations for fetching content from remote sources.""" | |
| 2 | 2 | |
| 3 | 3 | from video_processor.sources.base import BaseSource, SourceFile |
| 4 | 4 | |
| 5 | -__all__ = ["BaseSource", "SourceFile"] | |
| 5 | +__all__ = [ | |
| 6 | + "BaseSource", | |
| 7 | + "SourceFile", | |
| 8 | + "ArxivSource", | |
| 9 | + "GitHubSource", | |
| 10 | + "GoogleDriveSource", | |
| 11 | + "HackerNewsSource", | |
| 12 | + "PodcastSource", | |
| 13 | + "RedditSource", | |
| 14 | + "RSSSource", | |
| 15 | + "TwitterSource", | |
| 16 | + "WebSource", | |
| 17 | + "YouTubeSource", | |
| 18 | +] | |
| 19 | + | |
| 20 | + | |
| 21 | +def __getattr__(name: str): | |
| 22 | + """Lazy imports to avoid pulling in optional dependencies at import time.""" | |
| 23 | + _lazy_map = { | |
| 24 | + "ArxivSource": "video_processor.sources.arxiv_source", | |
| 25 | + "GitHubSource": "video_processor.sources.github_source", | |
| 26 | + "GoogleDriveSource": "video_processor.sources.google_drive", | |
| 27 | + "HackerNewsSource": "video_processor.sources.hackernews_source", | |
| 28 | + "PodcastSource": "video_processor.sources.podcast_source", | |
| 29 | + "RedditSource": "video_processor.sources.reddit_source", | |
| 30 | + "RSSSource": "video_processor.sources.rss_source", | |
| 31 | + "TwitterSource": "video_processor.sources.twitter_source", | |
| 32 | + "WebSource": "video_processor.sources.web_source", | |
| 33 | + "YouTubeSource": "video_processor.sources.youtube_source", | |
| 34 | + } | |
| 35 | + if name in _lazy_map: | |
| 36 | + import importlib | |
| 37 | + | |
| 38 | + module = importlib.import_module(_lazy_map[name]) | |
| 39 | + return getattr(module, name) | |
| 40 | + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") | |
| 6 | 41 | |
| 7 | 42 | ADDED video_processor/sources/arxiv_source.py |
| 8 | 43 | ADDED video_processor/sources/github_source.py |
| 9 | 44 | ADDED video_processor/sources/hackernews_source.py |
| 10 | 45 | ADDED video_processor/sources/podcast_source.py |
| 11 | 46 | ADDED video_processor/sources/reddit_source.py |
| 12 | 47 | ADDED video_processor/sources/rss_source.py |
| 13 | 48 | ADDED video_processor/sources/twitter_source.py |
| 14 | 49 | ADDED video_processor/sources/web_source.py |
| 15 | 50 | ADDED video_processor/sources/youtube_source.py |
| --- video_processor/sources/__init__.py | |
| +++ video_processor/sources/__init__.py | |
| @@ -1,5 +1,40 @@ | |
| 1 | """Cloud source integrations for fetching videos from remote storage.""" |
| 2 | |
| 3 | from video_processor.sources.base import BaseSource, SourceFile |
| 4 | |
| 5 | __all__ = ["BaseSource", "SourceFile"] |
| 6 | |
| 7 | DDED video_processor/sources/arxiv_source.py |
| 8 | DDED video_processor/sources/github_source.py |
| 9 | DDED video_processor/sources/hackernews_source.py |
| 10 | DDED video_processor/sources/podcast_source.py |
| 11 | DDED video_processor/sources/reddit_source.py |
| 12 | DDED video_processor/sources/rss_source.py |
| 13 | DDED video_processor/sources/twitter_source.py |
| 14 | DDED video_processor/sources/web_source.py |
| 15 | DDED video_processor/sources/youtube_source.py |
| --- video_processor/sources/__init__.py | |
| +++ video_processor/sources/__init__.py | |
| @@ -1,5 +1,40 @@ | |
"""Cloud and web source integrations for fetching content from remote sources."""

import importlib

from video_processor.sources.base import BaseSource, SourceFile

__all__ = [
    "BaseSource",
    "SourceFile",
    "ArxivSource",
    "GitHubSource",
    "GoogleDriveSource",
    "HackerNewsSource",
    "PodcastSource",
    "RedditSource",
    "RSSSource",
    "TwitterSource",
    "WebSource",
    "YouTubeSource",
]

# Lazily-imported source classes mapped to their defining modules.  Built once
# at import time instead of on every attribute access, as the original
# __getattr__ did.
_LAZY_IMPORTS = {
    "ArxivSource": "video_processor.sources.arxiv_source",
    "GitHubSource": "video_processor.sources.github_source",
    "GoogleDriveSource": "video_processor.sources.google_drive",
    "HackerNewsSource": "video_processor.sources.hackernews_source",
    "PodcastSource": "video_processor.sources.podcast_source",
    "RedditSource": "video_processor.sources.reddit_source",
    "RSSSource": "video_processor.sources.rss_source",
    "TwitterSource": "video_processor.sources.twitter_source",
    "WebSource": "video_processor.sources.web_source",
    "YouTubeSource": "video_processor.sources.youtube_source",
}


def __getattr__(name: str):
    """Lazily import source classes (PEP 562) to avoid pulling in optional
    dependencies (yt-dlp, feedparser, beautifulsoup4, ...) at import time."""
    if name in _LAZY_IMPORTS:
        module = importlib.import_module(_LAZY_IMPORTS[name])
        return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def __dir__():
    """Make the lazily-provided names visible to dir() and tab completion."""
    return sorted(set(__all__) | set(globals()))
| 41 | |
| 42 | DDED video_processor/sources/arxiv_source.py |
| 43 | DDED video_processor/sources/github_source.py |
| 44 | DDED video_processor/sources/hackernews_source.py |
| 45 | DDED video_processor/sources/podcast_source.py |
| 46 | DDED video_processor/sources/reddit_source.py |
| 47 | DDED video_processor/sources/rss_source.py |
| 48 | DDED video_processor/sources/twitter_source.py |
| 49 | DDED video_processor/sources/web_source.py |
| 50 | DDED video_processor/sources/youtube_source.py |
| --- a/video_processor/sources/arxiv_source.py | ||
| +++ b/video_processor/sources/arxiv_source.py | ||
| @@ -0,0 +1,117 @@ | ||
| 1 | +"""arXiv source connector for fetching paper metadata and PDFs.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import re | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import List, Optional | |
| 7 | + | |
| 8 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 9 | + | |
| 10 | +logger = logging.getLogger(__name__) | |
| 11 | + | |
| 12 | +_ARXIV_ID_PATTERN = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?") | |
| 13 | +ARXIV_API = "http://export.arxiv.org/api/query" | |
| 14 | + | |
| 15 | + | |
| 16 | +def _extract_arxiv_id(url_or_id: str) -> str: | |
| 17 | + """Extract arXiv paper ID from a URL or bare ID string.""" | |
| 18 | + match = _ARXIV_ID_PATTERN.search(url_or_id) | |
| 19 | + if not match: | |
| 20 | + raise ValueError(f"Could not extract arXiv ID from: {url_or_id}") | |
| 21 | + return match.group(0) | |
| 22 | + | |
| 23 | + | |
| 24 | +class ArxivSource(BaseSource): | |
| 25 | + """ | |
| 26 | + Fetch arXiv paper metadata and PDF. | |
| 27 | + | |
| 28 | + Uses the arXiv API (Atom feed) for metadata and direct PDF download. | |
| 29 | + Requires: pip install requests | |
| 30 | + """ | |
| 31 | + | |
| 32 | + def __init__(self, url_or_id: str): | |
| 33 | + self.arxiv_id = _extract_arxiv_id(url_or_id) | |
| 34 | + self._metadata: Optional[dict] = None | |
| 35 | + | |
| 36 | + def authenticate(self) -> bool: | |
| 37 | + """No auth needed for arXiv.""" | |
| 38 | + return True | |
| 39 | + | |
| 40 | + def _fetch_metadata(self) -> dict: | |
| 41 | + """Fetch paper metadata from the arXiv API.""" | |
| 42 | + if self._metadata: | |
| 43 | + return self._metadata | |
| 44 | + | |
| 45 | + import xml.etree.ElementTree as ET | |
| 46 | + | |
| 47 | + import requests | |
| 48 | + | |
| 49 | + resp = requests.get(ARXIV_API, params={"id_list": self.arxiv_id}, timeout=15) | |
| 50 | + resp.raise_for_status() | |
| 51 | + | |
| 52 | + ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"} | |
| 53 | + root = ET.fromstring(resp.text) | |
| 54 | + entry = root.find("atom:entry", ns) | |
| 55 | + if entry is None: | |
| 56 | + raise ValueError(f"Paper not found: {self.arxiv_id}") | |
| 57 | + | |
| 58 | + self._metadata = { | |
| 59 | + "title": (entry.findtext("atom:title", namespaces=ns) or "").strip(), | |
| 60 | + "summary": (entry.findtext("atom:summary", namespaces=ns) or "").strip(), | |
| 61 | + "authors": [ | |
| 62 | + a.findtext("atom:name", namespaces=ns) or "" | |
| 63 | + for a in entry.findall("atom:author", ns) | |
| 64 | + ], | |
| 65 | + "published": entry.findtext("atom:published", namespaces=ns) or "", | |
| 66 | + "pdf_url": f"https://arxiv.org/pdf/{self.arxiv_id}.pdf", | |
| 67 | + } | |
| 68 | + return self._metadata | |
| 69 | + | |
| 70 | + def list_videos( | |
| 71 | + self, | |
| 72 | + folder_id: Optional[str] = None, | |
| 73 | + folder_path: Optional[str] = None, | |
| 74 | + patterns: Optional[List[str]] = None, | |
| 75 | + ) -> List[SourceFile]: | |
| 76 | + """Return SourceFiles for the paper metadata and PDF.""" | |
| 77 | + meta = self._fetch_metadata() | |
| 78 | + return [ | |
| 79 | + SourceFile( | |
| 80 | + name=f"{meta['title']} (metadata)", | |
| 81 | + id=f"meta:{self.arxiv_id}", | |
| 82 | + mime_type="text/plain", | |
| 83 | + ), | |
| 84 | + SourceFile( | |
| 85 | + name=f"{meta['title']}.pdf", | |
| 86 | + id=f"pdf:{self.arxiv_id}", | |
| 87 | + mime_type="application/pdf", | |
| 88 | + ), | |
| 89 | + ] | |
| 90 | + | |
| 91 | + def download(self, file: SourceFile, destination: Path) -> Path: | |
| 92 | + """Download paper metadata as text or the PDF file.""" | |
| 93 | + import requests | |
| 94 | + | |
| 95 | + destination = Path(destination) | |
| 96 | + destination.parent.mkdir(parents=True, exist_ok=True) | |
| 97 | + meta = self._fetch_metadata() | |
| 98 | + | |
| 99 | + if file.id.startswith("meta:"): | |
| 100 | + authors = ", ".join(meta["authors"]) | |
| 101 | + text = ( | |
| 102 | + f"# {meta['title']}\n\n" | |
| 103 | + f"Authors: {authors}\n" | |
| 104 | + f"Published: {meta['published']}\n" | |
| 105 | + f"arXiv: {self.arxiv_id}\n\n" | |
| 106 | + f"## Abstract\n\n{meta['summary']}" | |
| 107 | + ) | |
| 108 | + destination.write_text(text, encoding="utf-8") | |
| 109 | + elif file.id.startswith("pdf:"): | |
| 110 | + resp = requests.get(meta["pdf_url"], timeout=60, stream=True) | |
| 111 | + resp.raise_for_status() | |
| 112 | + with open(destination, "wb") as f: | |
| 113 | + for chunk in resp.iter_content(chunk_size=8192): | |
| 114 | + f.write(chunk) | |
| 115 | + | |
| 116 | + logger.info(f"Downloaded arXiv {self.arxiv_id} to {destination}") | |
| 117 | + return destination |
| --- a/video_processor/sources/arxiv_source.py | |
| +++ b/video_processor/sources/arxiv_source.py | |
| @@ -0,0 +1,117 @@ | |
| --- a/video_processor/sources/arxiv_source.py | |
| +++ b/video_processor/sources/arxiv_source.py | |
| @@ -0,0 +1,117 @@ | |
| 1 | """arXiv source connector for fetching paper metadata and PDFs.""" |
| 2 | |
| 3 | import logging |
| 4 | import re |
| 5 | from pathlib import Path |
| 6 | from typing import List, Optional |
| 7 | |
| 8 | from video_processor.sources.base import BaseSource, SourceFile |
| 9 | |
| 10 | logger = logging.getLogger(__name__) |
| 11 | |
| 12 | _ARXIV_ID_PATTERN = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?") |
| 13 | ARXIV_API = "http://export.arxiv.org/api/query" |
| 14 | |
| 15 | |
| 16 | def _extract_arxiv_id(url_or_id: str) -> str: |
| 17 | """Extract arXiv paper ID from a URL or bare ID string.""" |
| 18 | match = _ARXIV_ID_PATTERN.search(url_or_id) |
| 19 | if not match: |
| 20 | raise ValueError(f"Could not extract arXiv ID from: {url_or_id}") |
| 21 | return match.group(0) |
| 22 | |
| 23 | |
class ArxivSource(BaseSource):
    """
    Fetch arXiv paper metadata and PDF.

    Uses the arXiv API (Atom feed) for metadata and direct PDF download.
    Requires: pip install requests
    """

    def __init__(self, url_or_id: str):
        """
        Parameters
        ----------
        url_or_id : str
            An arXiv URL (abs/pdf page) or a bare paper ID such as "2103.00020".
        """
        self.arxiv_id = _extract_arxiv_id(url_or_id)
        # Cached API response; populated by the first _fetch_metadata() call.
        self._metadata: Optional[dict] = None

    def authenticate(self) -> bool:
        """No auth needed for arXiv."""
        return True

    def _fetch_metadata(self) -> dict:
        """Fetch (and cache) paper metadata from the arXiv API.

        Raises
        ------
        ValueError
            If the API returns no entry for the stored arXiv ID.
        """
        # `is not None` rather than truthiness: an empty dict is a valid cache hit.
        if self._metadata is not None:
            return self._metadata

        import xml.etree.ElementTree as ET

        import requests

        resp = requests.get(ARXIV_API, params={"id_list": self.arxiv_id}, timeout=15)
        resp.raise_for_status()

        ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
        root = ET.fromstring(resp.text)
        entry = root.find("atom:entry", ns)
        if entry is None:
            raise ValueError(f"Paper not found: {self.arxiv_id}")

        # Atom titles may wrap across lines; collapse internal whitespace so
        # the title is usable as a single-line name.
        raw_title = entry.findtext("atom:title", namespaces=ns) or ""
        self._metadata = {
            "title": " ".join(raw_title.split()),
            "summary": (entry.findtext("atom:summary", namespaces=ns) or "").strip(),
            "authors": [
                a.findtext("atom:name", namespaces=ns) or ""
                for a in entry.findall("atom:author", ns)
            ],
            "published": entry.findtext("atom:published", namespaces=ns) or "",
            "pdf_url": f"https://arxiv.org/pdf/{self.arxiv_id}.pdf",
        }
        return self._metadata

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return SourceFiles for the paper metadata and PDF.

        The folder/pattern arguments are accepted for interface compatibility
        and ignored: an arXiv source always exposes exactly two files.
        """
        meta = self._fetch_metadata()
        return [
            SourceFile(
                name=f"{meta['title']} (metadata)",
                id=f"meta:{self.arxiv_id}",
                mime_type="text/plain",
            ),
            SourceFile(
                name=f"{meta['title']}.pdf",
                id=f"pdf:{self.arxiv_id}",
                mime_type="application/pdf",
            ),
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download paper metadata as text or the PDF file.

        Raises
        ------
        ValueError
            If the SourceFile id is not one produced by list_videos().
        """
        import requests

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        meta = self._fetch_metadata()

        if file.id.startswith("meta:"):
            authors = ", ".join(meta["authors"])
            text = (
                f"# {meta['title']}\n\n"
                f"Authors: {authors}\n"
                f"Published: {meta['published']}\n"
                f"arXiv: {self.arxiv_id}\n\n"
                f"## Abstract\n\n{meta['summary']}"
            )
            destination.write_text(text, encoding="utf-8")
        elif file.id.startswith("pdf:"):
            resp = requests.get(meta["pdf_url"], timeout=60, stream=True)
            resp.raise_for_status()
            with open(destination, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        else:
            # Previously this fell through and returned a path that was
            # never written; fail loudly instead.
            raise ValueError(f"Unknown file id: {file.id}")

        logger.info("Downloaded arXiv %s to %s", self.arxiv_id, destination)
        return destination
| --- a/video_processor/sources/github_source.py | ||
| +++ b/video_processor/sources/github_source.py | ||
| @@ -0,0 +1,156 @@ | ||
| 1 | +"""GitHub source connector for fetching repo content, issues, and PRs.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import os | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import List, Optional | |
| 7 | + | |
| 8 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 9 | + | |
| 10 | +logger = logging.getLogger(__name__) | |
| 11 | + | |
| 12 | +API_BASE = "https://api.github.com" | |
| 13 | + | |
| 14 | + | |
| 15 | +class GitHubSource(BaseSource): | |
| 16 | + """ | |
| 17 | + Fetch GitHub repository README, issues, and pull requests as text documents. | |
| 18 | + | |
| 19 | + Auth: Set GITHUB_TOKEN env var, or use `gh auth token` output. | |
| 20 | + Requires: pip install requests | |
| 21 | + """ | |
| 22 | + | |
| 23 | + def __init__(self, repo: str, include_issues: bool = True, include_prs: bool = True): | |
| 24 | + """ | |
| 25 | + Parameters | |
| 26 | + ---------- | |
| 27 | + repo : str | |
| 28 | + GitHub repo in "owner/repo" format. | |
| 29 | + """ | |
| 30 | + self.repo = repo | |
| 31 | + self.include_issues = include_issues | |
| 32 | + self.include_prs = include_prs | |
| 33 | + self._token: Optional[str] = None | |
| 34 | + | |
| 35 | + def authenticate(self) -> bool: | |
| 36 | + """Authenticate via GITHUB_TOKEN env var or gh CLI.""" | |
| 37 | + self._token = os.environ.get("GITHUB_TOKEN") | |
| 38 | + if not self._token: | |
| 39 | + try: | |
| 40 | + import subprocess | |
| 41 | + | |
| 42 | + result = subprocess.run(["gh", "auth", "token"], capture_output=True, text=True) | |
| 43 | + if result.returncode == 0: | |
| 44 | + self._token = result.stdout.strip() | |
| 45 | + except FileNotFoundError: | |
| 46 | + pass | |
| 47 | + if not self._token: | |
| 48 | + logger.warning( | |
| 49 | + "No GitHub token found. Public repos only. Set GITHUB_TOKEN for private repos." | |
| 50 | + ) | |
| 51 | + return True | |
| 52 | + | |
| 53 | + def _headers(self) -> dict: | |
| 54 | + h = {"Accept": "application/vnd.github.v3+json"} | |
| 55 | + if self._token: | |
| 56 | + h["Authorization"] = f"Bearer {self._token}" | |
| 57 | + return h | |
| 58 | + | |
| 59 | + def list_videos( | |
| 60 | + self, | |
| 61 | + folder_id: Optional[str] = None, | |
| 62 | + folder_path: Optional[str] = None, | |
| 63 | + patterns: Optional[List[str]] = None, | |
| 64 | + ) -> List[SourceFile]: | |
| 65 | + """List available documents (README, issues, PRs) as SourceFiles.""" | |
| 66 | + import requests | |
| 67 | + | |
| 68 | + files = [] | |
| 69 | + # README | |
| 70 | + resp = requests.get( | |
| 71 | + f"{API_BASE}/repos/{self.repo}/readme", headers=self._headers(), timeout=15 | |
| 72 | + ) | |
| 73 | + if resp.ok: | |
| 74 | + files.append(SourceFile(name="README", id="readme", mime_type="text/markdown")) | |
| 75 | + | |
| 76 | + # Issues | |
| 77 | + if self.include_issues: | |
| 78 | + resp = requests.get( | |
| 79 | + f"{API_BASE}/repos/{self.repo}/issues", | |
| 80 | + headers=self._headers(), | |
| 81 | + params={"state": "all", "per_page": 100}, | |
| 82 | + timeout=15, | |
| 83 | + ) | |
| 84 | + if resp.ok: | |
| 85 | + for issue in resp.json(): | |
| 86 | + if "pull_request" not in issue: | |
| 87 | + files.append( | |
| 88 | + SourceFile( | |
| 89 | + name=f"Issue #{issue['number']}: {issue['title']}", | |
| 90 | + id=f"issue:{issue['number']}", | |
| 91 | + mime_type="text/plain", | |
| 92 | + ) | |
| 93 | + ) | |
| 94 | + | |
| 95 | + # PRs | |
| 96 | + if self.include_prs: | |
| 97 | + resp = requests.get( | |
| 98 | + f"{API_BASE}/repos/{self.repo}/pulls", | |
| 99 | + headers=self._headers(), | |
| 100 | + params={"state": "all", "per_page": 100}, | |
| 101 | + timeout=15, | |
| 102 | + ) | |
| 103 | + if resp.ok: | |
| 104 | + for pr in resp.json(): | |
| 105 | + files.append( | |
| 106 | + SourceFile( | |
| 107 | + name=f"PR #{pr['number']}: {pr['title']}", | |
| 108 | + id=f"pr:{pr['number']}", | |
| 109 | + mime_type="text/plain", | |
| 110 | + ) | |
| 111 | + ) | |
| 112 | + | |
| 113 | + return files | |
| 114 | + | |
| 115 | + def download(self, file: SourceFile, destination: Path) -> Path: | |
| 116 | + """Download a single document (README, issue, or PR) as text.""" | |
| 117 | + import requests | |
| 118 | + | |
| 119 | + destination = Path(destination) | |
| 120 | + destination.parent.mkdir(parents=True, exist_ok=True) | |
| 121 | + | |
| 122 | + if file.id == "readme": | |
| 123 | + resp = requests.get( | |
| 124 | + f"{API_BASE}/repos/{self.repo}/readme", | |
| 125 | + headers={**self._headers(), "Accept": "application/vnd.github.v3.raw"}, | |
| 126 | + timeout=15, | |
| 127 | + ) | |
| 128 | + destination.write_text(resp.text, encoding="utf-8") | |
| 129 | + elif file.id.startswith("issue:"): | |
| 130 | + num = file.id.split(":")[1] | |
| 131 | + resp = requests.get( | |
| 132 | + f"{API_BASE}/repos/{self.repo}/issues/{num}", | |
| 133 | + headers=self._headers(), | |
| 134 | + timeout=15, | |
| 135 | + ) | |
| 136 | + data = resp.json() | |
| 137 | + text = f"# {data['title']}\n\n{data.get('body', '') or ''}" | |
| 138 | + # Append comments | |
| 139 | + comments_resp = requests.get(data["comments_url"], headers=self._headers(), timeout=15) | |
| 140 | + if comments_resp.ok: | |
| 141 | + for c in comments_resp.json(): | |
| 142 | + text += f"\n\n---\n**{c['user']['login']}**: {c.get('body', '')}" | |
| 143 | + destination.write_text(text, encoding="utf-8") | |
| 144 | + elif file.id.startswith("pr:"): | |
| 145 | + num = file.id.split(":")[1] | |
| 146 | + resp = requests.get( | |
| 147 | + f"{API_BASE}/repos/{self.repo}/pulls/{num}", | |
| 148 | + headers=self._headers(), | |
| 149 | + timeout=15, | |
| 150 | + ) | |
| 151 | + data = resp.json() | |
| 152 | + text = f"# PR: {data['title']}\n\n{data.get('body', '') or ''}" | |
| 153 | + destination.write_text(text, encoding="utf-8") | |
| 154 | + | |
| 155 | + logger.info(f"Downloaded {file.name} to {destination}") | |
| 156 | + return destination |
| --- a/video_processor/sources/github_source.py | |
| +++ b/video_processor/sources/github_source.py | |
| @@ -0,0 +1,156 @@ | |
| --- a/video_processor/sources/github_source.py | |
| +++ b/video_processor/sources/github_source.py | |
| @@ -0,0 +1,156 @@ | |
| 1 | """GitHub source connector for fetching repo content, issues, and PRs.""" |
| 2 | |
| 3 | import logging |
| 4 | import os |
| 5 | from pathlib import Path |
| 6 | from typing import List, Optional |
| 7 | |
| 8 | from video_processor.sources.base import BaseSource, SourceFile |
| 9 | |
| 10 | logger = logging.getLogger(__name__) |
| 11 | |
| 12 | API_BASE = "https://api.github.com" |
| 13 | |
| 14 | |
class GitHubSource(BaseSource):
    """
    Fetch GitHub repository README, issues, and pull requests as text documents.

    Auth: Set GITHUB_TOKEN env var, or use `gh auth token` output.
    Requires: pip install requests
    """

    def __init__(self, repo: str, include_issues: bool = True, include_prs: bool = True):
        """
        Parameters
        ----------
        repo : str
            GitHub repo in "owner/repo" format.
        include_issues : bool
            Include repository issues in list_videos() (default True).
        include_prs : bool
            Include pull requests in list_videos() (default True).
        """
        self.repo = repo
        self.include_issues = include_issues
        self.include_prs = include_prs
        self._token: Optional[str] = None

    def authenticate(self) -> bool:
        """Authenticate via GITHUB_TOKEN env var or the gh CLI.

        Always returns True: an unauthenticated session still works for
        public repositories (subject to lower rate limits).
        """
        self._token = os.environ.get("GITHUB_TOKEN")
        if not self._token:
            try:
                import subprocess

                # Timeout guards against a hung `gh` process blocking startup.
                result = subprocess.run(
                    ["gh", "auth", "token"], capture_output=True, text=True, timeout=10
                )
                if result.returncode == 0:
                    self._token = result.stdout.strip()
            except (FileNotFoundError, subprocess.TimeoutExpired):
                pass
        if not self._token:
            logger.warning(
                "No GitHub token found. Public repos only. Set GITHUB_TOKEN for private repos."
            )
        return True

    def _headers(self) -> dict:
        """Build request headers, attaching the bearer token when available."""
        h = {"Accept": "application/vnd.github.v3+json"}
        if self._token:
            h["Authorization"] = f"Bearer {self._token}"
        return h

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List available documents (README, issues, PRs) as SourceFiles.

        Listing is best-effort: failed API calls simply omit that category
        rather than raising. The folder/pattern arguments are ignored.
        """
        import requests

        files = []
        # README
        resp = requests.get(
            f"{API_BASE}/repos/{self.repo}/readme", headers=self._headers(), timeout=15
        )
        if resp.ok:
            files.append(SourceFile(name="README", id="readme", mime_type="text/markdown"))

        # Issues (the issues endpoint also returns PRs; filter those out)
        if self.include_issues:
            resp = requests.get(
                f"{API_BASE}/repos/{self.repo}/issues",
                headers=self._headers(),
                params={"state": "all", "per_page": 100},
                timeout=15,
            )
            if resp.ok:
                for issue in resp.json():
                    if "pull_request" not in issue:
                        files.append(
                            SourceFile(
                                name=f"Issue #{issue['number']}: {issue['title']}",
                                id=f"issue:{issue['number']}",
                                mime_type="text/plain",
                            )
                        )

        # PRs
        if self.include_prs:
            resp = requests.get(
                f"{API_BASE}/repos/{self.repo}/pulls",
                headers=self._headers(),
                params={"state": "all", "per_page": 100},
                timeout=15,
            )
            if resp.ok:
                for pr in resp.json():
                    files.append(
                        SourceFile(
                            name=f"PR #{pr['number']}: {pr['title']}",
                            id=f"pr:{pr['number']}",
                            mime_type="text/plain",
                        )
                    )

        return files

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download a single document (README, issue, or PR) as text.

        Raises
        ------
        ValueError
            If the SourceFile id is not one produced by list_videos().
        requests.HTTPError
            If the GitHub API request fails.
        """
        import requests

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        if file.id == "readme":
            resp = requests.get(
                f"{API_BASE}/repos/{self.repo}/readme",
                headers={**self._headers(), "Accept": "application/vnd.github.v3.raw"},
                timeout=15,
            )
            # Previously the HTTP error body would be written to disk on failure.
            resp.raise_for_status()
            destination.write_text(resp.text, encoding="utf-8")
        elif file.id.startswith("issue:"):
            num = file.id.split(":", 1)[1]
            resp = requests.get(
                f"{API_BASE}/repos/{self.repo}/issues/{num}",
                headers=self._headers(),
                timeout=15,
            )
            resp.raise_for_status()
            data = resp.json()
            text = f"# {data['title']}\n\n{data.get('body', '') or ''}"
            # Append comments (best-effort; a failed comment fetch is not fatal)
            comments_resp = requests.get(data["comments_url"], headers=self._headers(), timeout=15)
            if comments_resp.ok:
                for c in comments_resp.json():
                    text += f"\n\n---\n**{c['user']['login']}**: {c.get('body', '')}"
            destination.write_text(text, encoding="utf-8")
        elif file.id.startswith("pr:"):
            num = file.id.split(":", 1)[1]
            resp = requests.get(
                f"{API_BASE}/repos/{self.repo}/pulls/{num}",
                headers=self._headers(),
                timeout=15,
            )
            resp.raise_for_status()
            data = resp.json()
            text = f"# PR: {data['title']}\n\n{data.get('body', '') or ''}"
            destination.write_text(text, encoding="utf-8")
        else:
            # Previously this fell through and returned a path that was
            # never written; fail loudly instead.
            raise ValueError(f"Unknown file id: {file.id}")

        logger.info("Downloaded %s to %s", file.name, destination)
        return destination
| --- a/video_processor/sources/hackernews_source.py | ||
| +++ b/video_processor/sources/hackernews_source.py | ||
| @@ -0,0 +1,112 @@ | ||
| 1 | +"""Hacker News source connector using the official Firebase API.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +from pathlib import Path | |
| 5 | +from typing import List, Optional | |
| 6 | + | |
| 7 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 8 | + | |
| 9 | +logger = logging.getLogger(__name__) | |
| 10 | + | |
| 11 | +HN_API = "https://hacker-news.firebaseio.com/v0" | |
| 12 | + | |
| 13 | + | |
class HackerNewsSource(BaseSource):
    """
    Fetch Hacker News stories and comments via the public API.

    API docs: https://github.com/HackerNews/API
    Requires: pip install requests
    """

    def __init__(self, item_id: int, max_comments: int = 200):
        """
        Parameters
        ----------
        item_id : int
            HN story/item ID (e.g., 12345678).
        max_comments : int
            Maximum number of comments to fetch (default 200).
        """
        self.item_id = item_id
        self.max_comments = max_comments

    def authenticate(self) -> bool:
        """No auth needed for the HN API."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile for the HN story (folder args unused)."""
        return [
            SourceFile(
                name=f"hn_{self.item_id}",
                id=str(self.item_id),
                mime_type="text/plain",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download the story and comments as plain text."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(self.fetch_text(), encoding="utf-8")
        logger.info("Saved HN story %s to %s", self.item_id, destination)
        return destination

    @staticmethod
    def _clean_html(raw: str) -> str:
        """Convert an HN `text` field to readable plain text.

        Per the HN API docs, `text` is HTML: entities are escaped and
        paragraphs/links are marked up with <p>/<a> tags.
        """
        import html
        import re

        text = html.unescape(raw)
        text = text.replace("<p>", "\n").replace("</p>", "")
        return re.sub(r"<[^>]+>", "", text)

    def _get_item(self, item_id: int) -> dict:
        """Fetch a single item record; a null API response becomes {}."""
        import requests

        resp = requests.get(f"{HN_API}/item/{item_id}.json", timeout=10)
        resp.raise_for_status()
        return resp.json() or {}

    def fetch_text(self) -> str:
        """Fetch story and comments as structured text."""
        story = self._get_item(self.item_id)
        lines = [
            f"# {story.get('title', 'Untitled')}",
            f"by {story.get('by', 'unknown')} | {story.get('score', 0)} points",
        ]
        if story.get("url"):
            lines.append(f"URL: {story['url']}")
        if story.get("text"):
            lines.append(f"\n{self._clean_html(story['text'])}")
        lines.append("")

        kid_ids = story.get("kids", [])
        if kid_ids:
            lines.append("## Comments\n")
            self._fetch_comments(kid_ids, lines, depth=0, remaining=self.max_comments)

        return "\n".join(lines)

    def _fetch_comments(self, kid_ids: list, lines: list, depth: int, remaining: int) -> int:
        """Recursively append formatted comments to *lines*.

        Returns the remaining comment budget after this subtree, replacing
        the previous single-element-list counter hack.
        """
        indent = "  " * depth
        for kid_id in kid_ids:
            if remaining <= 0:
                break
            try:
                item = self._get_item(kid_id)
            except Exception:
                # Best-effort: skip comments that fail to fetch.
                continue

            if item.get("deleted") or item.get("dead"):
                continue

            remaining -= 1
            author = item.get("by", "[deleted]")
            text = self._clean_html(item.get("text", ""))
            lines.append(f"{indent}**{author}**:")
            lines.append(f"{indent}{text}")
            lines.append("")

            if item.get("kids"):
                remaining = self._fetch_comments(item["kids"], lines, depth + 1, remaining)
        return remaining
| --- a/video_processor/sources/hackernews_source.py | |
| +++ b/video_processor/sources/hackernews_source.py | |
| @@ -0,0 +1,112 @@ | |
| --- a/video_processor/sources/hackernews_source.py | |
| +++ b/video_processor/sources/hackernews_source.py | |
| @@ -0,0 +1,112 @@ | |
| 1 | """Hacker News source connector using the official Firebase API.""" |
| 2 | |
| 3 | import logging |
| 4 | from pathlib import Path |
| 5 | from typing import List, Optional |
| 6 | |
| 7 | from video_processor.sources.base import BaseSource, SourceFile |
| 8 | |
| 9 | logger = logging.getLogger(__name__) |
| 10 | |
| 11 | HN_API = "https://hacker-news.firebaseio.com/v0" |
| 12 | |
| 13 | |
class HackerNewsSource(BaseSource):
    """
    Fetch Hacker News stories and comments via the public API.

    API docs: https://github.com/HackerNews/API
    Requires: pip install requests
    """

    def __init__(self, item_id: int, max_comments: int = 200):
        """
        Parameters
        ----------
        item_id : int
            HN story/item ID (e.g., 12345678).
        max_comments : int
            Maximum number of comments to fetch (default 200).
        """
        self.item_id = item_id
        self.max_comments = max_comments

    def authenticate(self) -> bool:
        """No auth needed for the HN API."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile for the HN story.

        The folder/pattern arguments are part of the BaseSource interface and
        are ignored here.
        """
        return [
            SourceFile(
                name=f"hn_{self.item_id}",
                id=str(self.item_id),
                mime_type="text/plain",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download the story and comments as plain text.

        Returns the destination path; parent directories are created as needed.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        text = self.fetch_text()
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved HN story {self.item_id} to {destination}")
        return destination

    def _get_item(self, item_id: int) -> dict:
        """Fetch a single HN item (story or comment) as a dict.

        Returns an empty dict when the API responds with JSON null
        (unknown items).
        """
        import requests

        resp = requests.get(f"{HN_API}/item/{item_id}.json", timeout=10)
        resp.raise_for_status()
        return resp.json() or {}

    def fetch_text(self) -> str:
        """Fetch story and comments as structured text."""
        story = self._get_item(self.item_id)
        lines = []
        lines.append(f"# {story.get('title', 'Untitled')}")
        lines.append(f"by {story.get('by', 'unknown')} | {story.get('score', 0)} points")
        if story.get("url"):
            lines.append(f"URL: {story['url']}")
        if story.get("text"):
            lines.append(f"\n{story['text']}")
        lines.append("")

        # Fetch comments, if the story has any.
        kid_ids = story.get("kids", [])
        if kid_ids:
            lines.append("## Comments\n")
            count = [0]  # mutable counter shared across recursion levels
            self._fetch_comments(kid_ids, lines, depth=0, count=count)

        return "\n".join(lines)

    def _fetch_comments(self, kid_ids: list, lines: list, depth: int, count: list) -> None:
        """Recursively fetch and format comments up to self.max_comments."""
        import html

        indent = "  " * depth
        for kid_id in kid_ids:
            if count[0] >= self.max_comments:
                return
            try:
                item = self._get_item(kid_id)
            except Exception:
                # Best-effort: skip comments that fail to fetch.
                continue

            if item.get("deleted") or item.get("dead"):
                continue

            count[0] += 1
            author = item.get("by", "[deleted]")
            # The HN API returns comment text as HTML; decode entities such
            # as &#x27; and &quot; so the saved text reads naturally.
            text = html.unescape(item.get("text", ""))
            lines.append(f"{indent}**{author}**:")
            lines.append(f"{indent}{text}")
            lines.append("")

            if item.get("kids"):
                self._fetch_comments(item["kids"], lines, depth + 1, count)
| --- a/video_processor/sources/podcast_source.py | ||
| +++ b/video_processor/sources/podcast_source.py | ||
| @@ -0,0 +1,119 @@ | ||
| 1 | +"""Podcast feed source connector -- extends RSS for audio enclosures.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +from pathlib import Path | |
| 5 | +from typing import List, Optional | |
| 6 | + | |
| 7 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 8 | + | |
| 9 | +logger = logging.getLogger(__name__) | |
| 10 | + | |
| 11 | + | |
class PodcastSource(BaseSource):
    """
    Parse podcast RSS feeds and download audio episodes for pipeline processing.

    Extends the RSS pattern to extract <enclosure> audio URLs.
    Requires: pip install requests
    Optional: pip install feedparser
    """

    def __init__(self, feed_url: str, max_episodes: int = 10):
        """
        Parameters
        ----------
        feed_url : str
            URL of the podcast RSS feed.
        max_episodes : int
            Maximum number of episodes to list (default 10).
        """
        self.feed_url = feed_url
        self.max_episodes = max_episodes
        self._episodes: List[dict] = []
        # Tracks whether the feed has been fetched, so a feed with no audio
        # enclosures is not re-downloaded on every list/download call.
        self._parsed = False

    def authenticate(self) -> bool:
        """No auth needed for public podcast feeds."""
        return True

    def _parse_feed(self) -> None:
        """Fetch and parse the podcast feed for audio enclosures (at most once)."""
        if self._parsed:
            return

        import requests

        resp = requests.get(self.feed_url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Keep the try body minimal: only the import may raise ImportError.
        # Failures inside feedparser itself must propagate rather than
        # silently triggering the XML fallback.
        try:
            import feedparser
        except ImportError:
            logger.debug("feedparser not available, using xml.etree fallback")
            self._parse_xml(resp.text)
            self._parsed = True
            return

        feed = feedparser.parse(resp.text)
        for entry in feed.entries[: self.max_episodes]:
            audio_url = None
            # Prefer an explicit audio/* link; fall back to the first enclosure.
            for link in entry.get("links", []):
                if link.get("type", "").startswith("audio/"):
                    audio_url = link.get("href")
                    break
            if not audio_url and entry.get("enclosures"):
                audio_url = entry["enclosures"][0].get("href")
            if audio_url:
                self._episodes.append(
                    {
                        "title": entry.get("title", "Untitled"),
                        "url": audio_url,
                        "published": entry.get("published", ""),
                        "duration": entry.get("itunes_duration", ""),
                    }
                )
        self._parsed = True

    def _parse_xml(self, text: str) -> None:
        """Fallback parser for podcast XML using stdlib."""
        import xml.etree.ElementTree as ET

        root = ET.fromstring(text)
        items = root.findall(".//item")
        for item in items[: self.max_episodes]:
            enclosure = item.find("enclosure")
            if enclosure is None:
                continue
            audio_url = enclosure.get("url", "")
            if not audio_url:
                continue
            title = item.findtext("title") or "Untitled"
            pub = item.findtext("pubDate") or ""
            self._episodes.append(
                {"title": title, "url": audio_url, "published": pub, "duration": ""}
            )

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List podcast episodes as SourceFiles.

        The folder/pattern arguments are part of the BaseSource interface and
        are ignored here.
        """
        self._parse_feed()
        return [
            SourceFile(
                name=ep["title"],
                id=ep["url"],
                mime_type="audio/mpeg",
                modified_at=ep["published"],
            )
            for ep in self._episodes
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download the podcast audio file (streamed in 8 KiB chunks)."""
        import requests

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        resp = requests.get(
            file.id, timeout=60, stream=True, headers={"User-Agent": "PlanOpticon/0.3"}
        )
        resp.raise_for_status()

        with open(destination, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)

        logger.info(f"Downloaded podcast episode to {destination}")
        return destination
| --- a/video_processor/sources/podcast_source.py | |
| +++ b/video_processor/sources/podcast_source.py | |
| @@ -0,0 +1,119 @@ | |
| --- a/video_processor/sources/podcast_source.py | |
| +++ b/video_processor/sources/podcast_source.py | |
| @@ -0,0 +1,119 @@ | |
| 1 | """Podcast feed source connector -- extends RSS for audio enclosures.""" |
| 2 | |
| 3 | import logging |
| 4 | from pathlib import Path |
| 5 | from typing import List, Optional |
| 6 | |
| 7 | from video_processor.sources.base import BaseSource, SourceFile |
| 8 | |
| 9 | logger = logging.getLogger(__name__) |
| 10 | |
| 11 | |
class PodcastSource(BaseSource):
    """
    Parse podcast RSS feeds and download audio episodes for pipeline processing.

    Extends the RSS pattern to extract <enclosure> audio URLs.
    Requires: pip install requests
    Optional: pip install feedparser
    """

    def __init__(self, feed_url: str, max_episodes: int = 10):
        """
        Parameters
        ----------
        feed_url : str
            URL of the podcast RSS feed.
        max_episodes : int
            Maximum number of episodes to list (default 10).
        """
        self.feed_url = feed_url
        self.max_episodes = max_episodes
        self._episodes: List[dict] = []
        # Tracks whether the feed has been fetched, so a feed with no audio
        # enclosures is not re-downloaded on every list/download call.
        self._parsed = False

    def authenticate(self) -> bool:
        """No auth needed for public podcast feeds."""
        return True

    def _parse_feed(self) -> None:
        """Fetch and parse the podcast feed for audio enclosures (at most once)."""
        if self._parsed:
            return

        import requests

        resp = requests.get(self.feed_url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Keep the try body minimal: only the import may raise ImportError.
        # Failures inside feedparser itself must propagate rather than
        # silently triggering the XML fallback.
        try:
            import feedparser
        except ImportError:
            logger.debug("feedparser not available, using xml.etree fallback")
            self._parse_xml(resp.text)
            self._parsed = True
            return

        feed = feedparser.parse(resp.text)
        for entry in feed.entries[: self.max_episodes]:
            audio_url = None
            # Prefer an explicit audio/* link; fall back to the first enclosure.
            for link in entry.get("links", []):
                if link.get("type", "").startswith("audio/"):
                    audio_url = link.get("href")
                    break
            if not audio_url and entry.get("enclosures"):
                audio_url = entry["enclosures"][0].get("href")
            if audio_url:
                self._episodes.append(
                    {
                        "title": entry.get("title", "Untitled"),
                        "url": audio_url,
                        "published": entry.get("published", ""),
                        "duration": entry.get("itunes_duration", ""),
                    }
                )
        self._parsed = True

    def _parse_xml(self, text: str) -> None:
        """Fallback parser for podcast XML using stdlib."""
        import xml.etree.ElementTree as ET

        root = ET.fromstring(text)
        items = root.findall(".//item")
        for item in items[: self.max_episodes]:
            enclosure = item.find("enclosure")
            if enclosure is None:
                continue
            audio_url = enclosure.get("url", "")
            if not audio_url:
                continue
            title = item.findtext("title") or "Untitled"
            pub = item.findtext("pubDate") or ""
            self._episodes.append(
                {"title": title, "url": audio_url, "published": pub, "duration": ""}
            )

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List podcast episodes as SourceFiles.

        The folder/pattern arguments are part of the BaseSource interface and
        are ignored here.
        """
        self._parse_feed()
        return [
            SourceFile(
                name=ep["title"],
                id=ep["url"],
                mime_type="audio/mpeg",
                modified_at=ep["published"],
            )
            for ep in self._episodes
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download the podcast audio file (streamed in 8 KiB chunks)."""
        import requests

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        resp = requests.get(
            file.id, timeout=60, stream=True, headers={"User-Agent": "PlanOpticon/0.3"}
        )
        resp.raise_for_status()

        with open(destination, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)

        logger.info(f"Downloaded podcast episode to {destination}")
        return destination
| --- a/video_processor/sources/reddit_source.py | ||
| +++ b/video_processor/sources/reddit_source.py | ||
| @@ -0,0 +1,103 @@ | ||
| 1 | +"""Reddit source connector using the public JSON API.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +from pathlib import Path | |
| 5 | +from typing import List, Optional | |
| 6 | + | |
| 7 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 8 | + | |
| 9 | +logger = logging.getLogger(__name__) | |
| 10 | + | |
| 11 | + | |
class RedditSource(BaseSource):
    """
    Fetch Reddit posts and comments via the public JSON API.

    No auth required for public posts. Append .json to any Reddit URL.
    Requires: pip install requests
    """

    def __init__(self, url: str):
        """
        Parameters
        ----------
        url : str
            Reddit post or subreddit URL.
        """
        self.url = url.rstrip("/")

    def authenticate(self) -> bool:
        """No auth needed for public Reddit content."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile for the Reddit post."""
        slug = self.url.split("/")[-1] or "reddit_post"
        return [SourceFile(name=slug, id=self.url, mime_type="text/plain")]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download post and comments as plain text."""
        target = Path(destination)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(self.fetch_text(), encoding="utf-8")
        logger.info(f"Saved Reddit content to {target}")
        return target

    def fetch_text(self) -> str:
        """Fetch the Reddit post and comments as structured text."""
        import requests

        resp = requests.get(
            self.url.rstrip("/") + ".json",
            timeout=15,
            headers={"User-Agent": "PlanOpticon/0.3 (source connector)"},
        )
        resp.raise_for_status()
        payload = resp.json()

        out: List[str] = []
        if isinstance(payload, list) and payload:
            # The first listing holds the submission itself.
            post = payload[0]["data"]["children"][0]["data"]
            out.append(f"# {post.get('title', 'Untitled')}")
            out.append(f"by u/{post.get('author', '[deleted]')} | {post.get('score', 0)} points")
            out.append("")
            if post.get("selftext"):
                out.append(post["selftext"])
                out.append("")

            # The second listing (if present) holds the comment tree.
            if len(payload) > 1:
                out.append("## Comments\n")
                self._extract_comments(payload[1]["data"]["children"], out, depth=0)

        return "\n".join(out)

    def _extract_comments(self, children: list, lines: list, depth: int) -> None:
        """Recursively extract comment text."""
        pad = "  " * depth
        for node in children:
            # Only "t1" nodes are comments; "more" stubs etc. are skipped.
            if node.get("kind") != "t1":
                continue
            comment = node["data"]
            lines.append(
                f"{pad}**{comment.get('author', '[deleted]')}** ({comment.get('score', 0)} pts):"
            )
            lines.append(f"{pad}{comment.get('body', '')}")
            lines.append("")
            # Recurse into nested replies; an empty reply set is "" (str).
            nested = comment.get("replies")
            if isinstance(nested, dict):
                self._extract_comments(nested["data"]["children"], lines, depth + 1)
| --- a/video_processor/sources/reddit_source.py | |
| +++ b/video_processor/sources/reddit_source.py | |
| @@ -0,0 +1,103 @@ | |
| --- a/video_processor/sources/reddit_source.py | |
| +++ b/video_processor/sources/reddit_source.py | |
| @@ -0,0 +1,103 @@ | |
| 1 | """Reddit source connector using the public JSON API.""" |
| 2 | |
| 3 | import logging |
| 4 | from pathlib import Path |
| 5 | from typing import List, Optional |
| 6 | |
| 7 | from video_processor.sources.base import BaseSource, SourceFile |
| 8 | |
| 9 | logger = logging.getLogger(__name__) |
| 10 | |
| 11 | |
class RedditSource(BaseSource):
    """
    Fetch Reddit posts and comments via the public JSON API.

    No auth required for public posts. Append .json to any Reddit URL.
    Requires: pip install requests
    """

    def __init__(self, url: str):
        """
        Parameters
        ----------
        url : str
            Reddit post or subreddit URL.
        """
        self.url = url.rstrip("/")

    def authenticate(self) -> bool:
        """No auth needed for public Reddit content."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile for the Reddit post."""
        slug = self.url.split("/")[-1] or "reddit_post"
        return [SourceFile(name=slug, id=self.url, mime_type="text/plain")]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download post and comments as plain text."""
        target = Path(destination)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(self.fetch_text(), encoding="utf-8")
        logger.info(f"Saved Reddit content to {target}")
        return target

    def fetch_text(self) -> str:
        """Fetch the Reddit post and comments as structured text."""
        import requests

        resp = requests.get(
            self.url.rstrip("/") + ".json",
            timeout=15,
            headers={"User-Agent": "PlanOpticon/0.3 (source connector)"},
        )
        resp.raise_for_status()
        payload = resp.json()

        out: List[str] = []
        if isinstance(payload, list) and payload:
            # The first listing holds the submission itself.
            post = payload[0]["data"]["children"][0]["data"]
            out.append(f"# {post.get('title', 'Untitled')}")
            out.append(f"by u/{post.get('author', '[deleted]')} | {post.get('score', 0)} points")
            out.append("")
            if post.get("selftext"):
                out.append(post["selftext"])
                out.append("")

            # The second listing (if present) holds the comment tree.
            if len(payload) > 1:
                out.append("## Comments\n")
                self._extract_comments(payload[1]["data"]["children"], out, depth=0)

        return "\n".join(out)

    def _extract_comments(self, children: list, lines: list, depth: int) -> None:
        """Recursively extract comment text."""
        pad = "  " * depth
        for node in children:
            # Only "t1" nodes are comments; "more" stubs etc. are skipped.
            if node.get("kind") != "t1":
                continue
            comment = node["data"]
            lines.append(
                f"{pad}**{comment.get('author', '[deleted]')}** ({comment.get('score', 0)} pts):"
            )
            lines.append(f"{pad}{comment.get('body', '')}")
            lines.append("")
            # Recurse into nested replies; an empty reply set is "" (str).
            nested = comment.get("replies")
            if isinstance(nested, dict):
                self._extract_comments(nested["data"]["children"], lines, depth + 1)
| --- a/video_processor/sources/rss_source.py | ||
| +++ b/video_processor/sources/rss_source.py | ||
| @@ -0,0 +1,114 @@ | ||
| 1 | +"""RSS/Atom feed source connector.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +from pathlib import Path | |
| 5 | +from typing import List, Optional | |
| 6 | + | |
| 7 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 8 | + | |
| 9 | +logger = logging.getLogger(__name__) | |
| 10 | + | |
| 11 | + | |
class RSSSource(BaseSource):
    """
    Parse RSS/Atom feeds and extract entries as text documents.

    Optional: pip install feedparser (falls back to xml.etree.ElementTree)
    Requires: pip install requests
    """

    def __init__(self, url: str, max_entries: int = 50):
        """
        Parameters
        ----------
        url : str
            Feed URL.
        max_entries : int
            Maximum number of entries to keep (default 50).
        """
        self.url = url
        self.max_entries = max_entries
        self._entries: List[dict] = []
        # Tracks whether the feed has been fetched, so an empty feed is not
        # re-downloaded on every list/download call.
        self._parsed = False

    def authenticate(self) -> bool:
        """No auth needed for public feeds."""
        return True

    def _parse_feed(self) -> None:
        """Fetch and parse the feed (at most once per instance)."""
        if self._parsed:
            return

        import requests

        resp = requests.get(self.url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Keep the try body minimal: only the import may raise ImportError.
        # Failures inside feedparser itself must propagate rather than
        # silently triggering the XML fallback.
        try:
            import feedparser
        except ImportError:
            logger.debug("feedparser not available, using xml.etree fallback")
            self._parse_xml(resp.text)
            self._parsed = True
            return

        feed = feedparser.parse(resp.text)
        for entry in feed.entries[: self.max_entries]:
            self._entries.append(
                {
                    "title": entry.get("title", "Untitled"),
                    "link": entry.get("link", ""),
                    "summary": entry.get("summary", ""),
                    "published": entry.get("published", ""),
                    "id": entry.get("id", entry.get("link", "")),
                }
            )
        self._parsed = True

    def _parse_xml(self, text: str) -> None:
        """Fallback parser using stdlib xml.etree (RSS 2.0 and Atom)."""
        import xml.etree.ElementTree as ET

        root = ET.fromstring(text)
        # RSS 2.0 uses plain <item>; Atom uses namespaced <entry>.
        ns = {"atom": "http://www.w3.org/2005/Atom"}
        items = root.findall(".//item") or root.findall(".//atom:entry", ns)
        for item in items[: self.max_entries]:
            title = (
                item.findtext("title") or item.findtext("atom:title", namespaces=ns) or "Untitled"
            )
            link = item.findtext("link") or ""
            if not link:
                # Atom puts the URL in the href attribute of <link>.
                link_el = item.find("atom:link", ns)
                link = link_el.get("href", "") if link_el is not None else ""
            desc = (
                item.findtext("description") or item.findtext("atom:summary", namespaces=ns) or ""
            )
            pub = item.findtext("pubDate") or item.findtext("atom:published", namespaces=ns) or ""
            self._entries.append(
                {"title": title, "link": link, "summary": desc, "published": pub, "id": link}
            )

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List feed entries as SourceFiles.

        The folder/pattern arguments are part of the BaseSource interface and
        are ignored here.
        """
        self._parse_feed()
        return [
            SourceFile(
                name=e["title"], id=e["id"], mime_type="text/plain", modified_at=e["published"]
            )
            for e in self._entries
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Write an entry's content as a text file.

        Raises
        ------
        ValueError
            If file.id does not match any parsed entry.
        """
        self._parse_feed()
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        entry = next((e for e in self._entries if e["id"] == file.id), None)
        if not entry:
            raise ValueError(f"Entry not found: {file.id}")

        text = (
            f"# {entry['title']}\n\n"
            f"Published: {entry['published']}\n"
            f"Link: {entry['link']}\n\n"
            f"{entry['summary']}"
        )
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved RSS entry to {destination}")
        return destination
| --- a/video_processor/sources/rss_source.py | |
| +++ b/video_processor/sources/rss_source.py | |
| @@ -0,0 +1,114 @@ | |
| --- a/video_processor/sources/rss_source.py | |
| +++ b/video_processor/sources/rss_source.py | |
| @@ -0,0 +1,114 @@ | |
| 1 | """RSS/Atom feed source connector.""" |
| 2 | |
| 3 | import logging |
| 4 | from pathlib import Path |
| 5 | from typing import List, Optional |
| 6 | |
| 7 | from video_processor.sources.base import BaseSource, SourceFile |
| 8 | |
| 9 | logger = logging.getLogger(__name__) |
| 10 | |
| 11 | |
class RSSSource(BaseSource):
    """
    Parse RSS/Atom feeds and extract entries as text documents.

    Optional: pip install feedparser (falls back to xml.etree.ElementTree)
    Requires: pip install requests
    """

    def __init__(self, url: str, max_entries: int = 50):
        """
        Parameters
        ----------
        url : str
            Feed URL.
        max_entries : int
            Maximum number of entries to keep (default 50).
        """
        self.url = url
        self.max_entries = max_entries
        self._entries: List[dict] = []
        # Tracks whether the feed has been fetched, so an empty feed is not
        # re-downloaded on every list/download call.
        self._parsed = False

    def authenticate(self) -> bool:
        """No auth needed for public feeds."""
        return True

    def _parse_feed(self) -> None:
        """Fetch and parse the feed (at most once per instance)."""
        if self._parsed:
            return

        import requests

        resp = requests.get(self.url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Keep the try body minimal: only the import may raise ImportError.
        # Failures inside feedparser itself must propagate rather than
        # silently triggering the XML fallback.
        try:
            import feedparser
        except ImportError:
            logger.debug("feedparser not available, using xml.etree fallback")
            self._parse_xml(resp.text)
            self._parsed = True
            return

        feed = feedparser.parse(resp.text)
        for entry in feed.entries[: self.max_entries]:
            self._entries.append(
                {
                    "title": entry.get("title", "Untitled"),
                    "link": entry.get("link", ""),
                    "summary": entry.get("summary", ""),
                    "published": entry.get("published", ""),
                    "id": entry.get("id", entry.get("link", "")),
                }
            )
        self._parsed = True

    def _parse_xml(self, text: str) -> None:
        """Fallback parser using stdlib xml.etree (RSS 2.0 and Atom)."""
        import xml.etree.ElementTree as ET

        root = ET.fromstring(text)
        # RSS 2.0 uses plain <item>; Atom uses namespaced <entry>.
        ns = {"atom": "http://www.w3.org/2005/Atom"}
        items = root.findall(".//item") or root.findall(".//atom:entry", ns)
        for item in items[: self.max_entries]:
            title = (
                item.findtext("title") or item.findtext("atom:title", namespaces=ns) or "Untitled"
            )
            link = item.findtext("link") or ""
            if not link:
                # Atom puts the URL in the href attribute of <link>.
                link_el = item.find("atom:link", ns)
                link = link_el.get("href", "") if link_el is not None else ""
            desc = (
                item.findtext("description") or item.findtext("atom:summary", namespaces=ns) or ""
            )
            pub = item.findtext("pubDate") or item.findtext("atom:published", namespaces=ns) or ""
            self._entries.append(
                {"title": title, "link": link, "summary": desc, "published": pub, "id": link}
            )

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List feed entries as SourceFiles.

        The folder/pattern arguments are part of the BaseSource interface and
        are ignored here.
        """
        self._parse_feed()
        return [
            SourceFile(
                name=e["title"], id=e["id"], mime_type="text/plain", modified_at=e["published"]
            )
            for e in self._entries
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Write an entry's content as a text file.

        Raises
        ------
        ValueError
            If file.id does not match any parsed entry.
        """
        self._parse_feed()
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        entry = next((e for e in self._entries if e["id"] == file.id), None)
        if not entry:
            raise ValueError(f"Entry not found: {file.id}")

        text = (
            f"# {entry['title']}\n\n"
            f"Published: {entry['published']}\n"
            f"Link: {entry['link']}\n\n"
            f"{entry['summary']}"
        )
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved RSS entry to {destination}")
        return destination
| --- a/video_processor/sources/twitter_source.py | ||
| +++ b/video_processor/sources/twitter_source.py | ||
| @@ -0,0 +1,129 @@ | ||
| 1 | +"""Twitter/X source connector -- stub requiring auth or gallery-dl.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +from pathlib import Path | |
| 5 | +from typing import List, Optional | |
| 6 | + | |
| 7 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 8 | + | |
| 9 | +logger = logging.getLogger(__name__) | |
| 10 | + | |
| 11 | + | |
| 12 | +class TwitterSource(BaseSource): | |
| 13 | + """ | |
| 14 | + Fetch Twitter/X posts and threads. | |
| 15 | + | |
| 16 | + Twitter API v2 requires authentication. This connector attempts to use | |
| 17 | + gallery-dl as a fallback for public tweets. | |
| 18 | + | |
| 19 | + Auth options: | |
| 20 | + - Set TWITTER_BEARER_TOKEN env var for API v2 access | |
| 21 | + - Install gallery-dl for scraping public tweets: pip install gallery-dl | |
| 22 | + """ | |
| 23 | + | |
| 24 | + def __init__(self, url: str): | |
| 25 | + self.url = url | |
| 26 | + self._bearer_token: Optional[str] = None | |
| 27 | + | |
| 28 | + def authenticate(self) -> bool: | |
| 29 | + """Check for Twitter API token or gallery-dl availability.""" | |
| 30 | + import os | |
| 31 | + | |
| 32 | + self._bearer_token = os.environ.get("TWITTER_BEARER_TOKEN") | |
| 33 | + if self._bearer_token: | |
| 34 | + return True | |
| 35 | + | |
| 36 | + # Check for gallery-dl fallback | |
| 37 | + try: | |
| 38 | + import gallery_dl # noqa: F401 | |
| 39 | + | |
| 40 | + logger.info("Using gallery-dl for Twitter content extraction") | |
| 41 | + return True | |
| 42 | + except ImportError: | |
| 43 | + pass | |
| 44 | + | |
| 45 | + logger.error( | |
| 46 | + "Twitter source requires either:\n" | |
| 47 | + " 1. TWITTER_BEARER_TOKEN env var (Twitter API v2)\n" | |
| 48 | + " 2. gallery-dl installed: pip install gallery-dl\n" | |
| 49 | + "Twitter API access: https://developer.twitter.com/en/portal/dashboard" | |
| 50 | + ) | |
| 51 | + return False | |
| 52 | + | |
| 53 | + def list_videos( | |
| 54 | + self, | |
| 55 | + folder_id: Optional[str] = None, | |
| 56 | + folder_path: Optional[str] = None, | |
| 57 | + patterns: Optional[List[str]] = None, | |
| 58 | + ) -> List[SourceFile]: | |
| 59 | + """Return a single SourceFile for the tweet/thread.""" | |
| 60 | + return [ | |
| 61 | + SourceFile( | |
| 62 | + name=self.url.split("/")[-1] or "tweet", | |
| 63 | + id=self.url, | |
| 64 | + mime_type="text/plain", | |
| 65 | + ) | |
| 66 | + ] | |
| 67 | + | |
| 68 | + def download(self, file: SourceFile, destination: Path) -> Path: | |
| 69 | + """Download tweet content as text.""" | |
| 70 | + destination = Path(destination) | |
| 71 | + destination.parent.mkdir(parents=True, exist_ok=True) | |
| 72 | + text = self.fetch_text() | |
| 73 | + destination.write_text(text, encoding="utf-8") | |
| 74 | + logger.info(f"Saved Twitter content to {destination}") | |
| 75 | + return destination | |
| 76 | + | |
| 77 | + def fetch_text(self) -> str: | |
| 78 | + """Extract tweet text via API or gallery-dl.""" | |
| 79 | + if self._bearer_token: | |
| 80 | + return self._fetch_via_api() | |
| 81 | + | |
| 82 | + try: | |
| 83 | + return self._fetch_via_gallery_dl() | |
| 84 | + except ImportError: | |
| 85 | + raise RuntimeError( | |
| 86 | + "No Twitter extraction method available. See authenticate() for setup." | |
| 87 | + ) | |
| 88 | + | |
| 89 | + def _fetch_via_api(self) -> str: | |
| 90 | + """Fetch tweet via Twitter API v2.""" | |
| 91 | + import re | |
| 92 | + | |
| 93 | + import requests | |
| 94 | + | |
| 95 | + match = re.search(r"/status/(\d+)", self.url) | |
| 96 | + if not match: | |
| 97 | + raise ValueError(f"Could not extract tweet ID from: {self.url}") | |
| 98 | + | |
| 99 | + tweet_id = match.group(1) | |
| 100 | + resp = requests.get( | |
| 101 | + f"https://api.twitter.com/2/tweets/{tweet_id}", | |
| 102 | + headers={"Authorization": f"Bearer {self._bearer_token}"}, | |
| 103 | + params={"tweet.fields": "author_id,created_at,text"}, | |
| 104 | + timeout=15, | |
| 105 | + ) | |
| 106 | + resp.raise_for_status() | |
| 107 | + data = resp.json().get("data", {}) | |
| 108 | + return f"{data.get('text', '')}\n\nCreated: {data.get('created_at', 'unknown')}" | |
| 109 | + | |
| 110 | + def _fetch_via_gallery_dl(self) -> str: | |
| 111 | + """Use gallery-dl to extract tweet metadata.""" | |
| 112 | + import json | |
| 113 | + import subprocess | |
| 114 | + | |
| 115 | + result = subprocess.run( | |
| 116 | + ["gallery-dl", "--dump-json", self.url], | |
| 117 | + capture_output=True, | |
| 118 | + text=True, | |
| 119 | + timeout=30, | |
| 120 | + ) | |
| 121 | + if result.returncode != 0: | |
| 122 | + raise RuntimeError(f"gallery-dl failed: {result.stderr}") | |
| 123 | + | |
| 124 | + items = json.loads(result.stdout) | |
| 125 | + texts = [] | |
| 126 | + for item in items if isinstance(items, list) else [items]: | |
| 127 | + if isinstance(item, dict): | |
| 128 | + texts.append(item.get("content", item.get("text", str(item)))) | |
| 129 | + return "\n\n".join(texts) if texts else "No text content extracted." |
| --- a/video_processor/sources/twitter_source.py | |
| +++ b/video_processor/sources/twitter_source.py | |
| @@ -0,0 +1,129 @@ | |
| --- a/video_processor/sources/twitter_source.py | |
| +++ b/video_processor/sources/twitter_source.py | |
| @@ -0,0 +1,129 @@ | |
| 1 | """Twitter/X source connector -- stub requiring auth or gallery-dl.""" |
| 2 | |
| 3 | import logging |
| 4 | from pathlib import Path |
| 5 | from typing import List, Optional |
| 6 | |
| 7 | from video_processor.sources.base import BaseSource, SourceFile |
| 8 | |
| 9 | logger = logging.getLogger(__name__) |
| 10 | |
| 11 | |
class TwitterSource(BaseSource):
    """
    Fetch Twitter/X posts and threads.

    Twitter API v2 requires authentication. This connector attempts to use
    gallery-dl as a fallback for public tweets.

    Auth options:
    - Set TWITTER_BEARER_TOKEN env var for API v2 access
    - Install gallery-dl for scraping public tweets: pip install gallery-dl
    """

    def __init__(self, url: str):
        # URL of the tweet/thread; doubles as the SourceFile id.
        self.url = url
        # Populated by authenticate() from the TWITTER_BEARER_TOKEN env var.
        self._bearer_token: Optional[str] = None

    def authenticate(self) -> bool:
        """Check for Twitter API token or gallery-dl availability.

        Returns:
            True if a bearer token is set or gallery-dl is importable.
        """
        import os

        self._bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
        if self._bearer_token:
            return True

        # Check for gallery-dl fallback
        try:
            import gallery_dl  # noqa: F401

            logger.info("Using gallery-dl for Twitter content extraction")
            return True
        except ImportError:
            pass

        logger.error(
            "Twitter source requires either:\n"
            " 1. TWITTER_BEARER_TOKEN env var (Twitter API v2)\n"
            " 2. gallery-dl installed: pip install gallery-dl\n"
            "Twitter API access: https://developer.twitter.com/en/portal/dashboard"
        )
        return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile for the tweet/thread.

        The folder/pattern arguments exist for interface compatibility and
        are ignored — a tweet URL identifies exactly one item.
        """
        return [
            SourceFile(
                name=self.url.split("/")[-1] or "tweet",
                id=self.url,
                mime_type="text/plain",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download tweet content as text.

        Args:
            file: The SourceFile from list_videos() (unused beyond interface).
            destination: Target path; parent directories are created.

        Returns:
            The destination path that was written.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        text = self.fetch_text()
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved Twitter content to {destination}")
        return destination

    def fetch_text(self) -> str:
        """Extract tweet text via API or gallery-dl.

        Raises:
            RuntimeError: if neither extraction method is available.
        """
        if self._bearer_token:
            return self._fetch_via_api()

        try:
            return self._fetch_via_gallery_dl()
        except (ImportError, FileNotFoundError) as exc:
            # FileNotFoundError covers the gallery-dl *executable* missing
            # from PATH — subprocess raises that, not ImportError.
            raise RuntimeError(
                "No Twitter extraction method available. See authenticate() for setup."
            ) from exc

    def _fetch_via_api(self) -> str:
        """Fetch tweet via Twitter API v2.

        Raises:
            ValueError: if the URL does not contain a /status/<id> segment.
            requests.HTTPError: on non-2xx API responses.
        """
        import re

        import requests

        match = re.search(r"/status/(\d+)", self.url)
        if not match:
            raise ValueError(f"Could not extract tweet ID from: {self.url}")

        tweet_id = match.group(1)
        resp = requests.get(
            f"https://api.twitter.com/2/tweets/{tweet_id}",
            headers={"Authorization": f"Bearer {self._bearer_token}"},
            params={"tweet.fields": "author_id,created_at,text"},
            timeout=15,
        )
        resp.raise_for_status()
        data = resp.json().get("data", {})
        return f"{data.get('text', '')}\n\nCreated: {data.get('created_at', 'unknown')}"

    def _fetch_via_gallery_dl(self) -> str:
        """Use gallery-dl to extract tweet metadata.

        Raises:
            FileNotFoundError: if the gallery-dl executable is not installed.
            RuntimeError: if gallery-dl fails or emits unparseable output.
        """
        import json
        import subprocess

        result = subprocess.run(
            ["gallery-dl", "--dump-json", self.url],
            capture_output=True,
            text=True,
            timeout=30,
        )
        if result.returncode != 0:
            raise RuntimeError(f"gallery-dl failed: {result.stderr}")

        try:
            items = json.loads(result.stdout)
        except json.JSONDecodeError as exc:
            # Surface a uniform error type instead of leaking the parser's.
            raise RuntimeError(f"gallery-dl returned unparseable JSON: {exc}") from exc

        texts = []
        for item in items if isinstance(items, list) else [items]:
            if isinstance(item, dict):
                texts.append(item.get("content", item.get("text", str(item))))
        return "\n\n".join(texts) if texts else "No text content extracted."
| --- a/video_processor/sources/web_source.py | ||
| +++ b/video_processor/sources/web_source.py | ||
| @@ -0,0 +1,90 @@ | ||
| 1 | +"""Web page source connector for fetching and extracting text from URLs.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import re | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import List, Optional | |
| 7 | + | |
| 8 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 9 | + | |
| 10 | +logger = logging.getLogger(__name__) | |
| 11 | + | |
| 12 | + | |
| 13 | +def _strip_html_tags(html: str) -> str: | |
| 14 | + """Minimal HTML tag stripper using stdlib only.""" | |
| 15 | + text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE) | |
| 16 | + text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE) | |
| 17 | + text = re.sub(r"<(nav|footer|header)[^>]*>.*?</\1>", "", text, flags=re.DOTALL | re.IGNORECASE) | |
| 18 | + text = re.sub(r"<[^>]+>", " ", text) | |
| 19 | + text = re.sub(r"\s+", " ", text).strip() | |
| 20 | + return text | |
| 21 | + | |
| 22 | + | |
| 23 | +class WebSource(BaseSource): | |
| 24 | + """ | |
| 25 | + Fetch web pages and extract main text content. | |
| 26 | + | |
| 27 | + Uses requests + BeautifulSoup (optional) for content extraction. | |
| 28 | + Falls back to regex-based tag stripping if bs4 is unavailable. | |
| 29 | + | |
| 30 | + Requires: pip install requests (included in most environments) | |
| 31 | + Optional: pip install beautifulsoup4 lxml | |
| 32 | + """ | |
| 33 | + | |
| 34 | + def __init__(self, url: str): | |
| 35 | + self.url = url | |
| 36 | + self._content: Optional[str] = None | |
| 37 | + | |
| 38 | + def authenticate(self) -> bool: | |
| 39 | + """No auth needed for public web pages.""" | |
| 40 | + return True | |
| 41 | + | |
| 42 | + def list_videos( | |
| 43 | + self, | |
| 44 | + folder_id: Optional[str] = None, | |
| 45 | + folder_path: Optional[str] = None, | |
| 46 | + patterns: Optional[List[str]] = None, | |
| 47 | + ) -> List[SourceFile]: | |
| 48 | + """Return a single SourceFile representing the web page.""" | |
| 49 | + return [ | |
| 50 | + SourceFile( | |
| 51 | + name=self.url.split("/")[-1] or "page", | |
| 52 | + id=self.url, | |
| 53 | + mime_type="text/html", | |
| 54 | + ) | |
| 55 | + ] | |
| 56 | + | |
| 57 | + def download(self, file: SourceFile, destination: Path) -> Path: | |
| 58 | + """Download and save the extracted text content.""" | |
| 59 | + destination = Path(destination) | |
| 60 | + destination.parent.mkdir(parents=True, exist_ok=True) | |
| 61 | + text = self.fetch_text() | |
| 62 | + destination.write_text(text, encoding="utf-8") | |
| 63 | + logger.info(f"Saved web content to {destination}") | |
| 64 | + return destination | |
| 65 | + | |
| 66 | + def fetch_text(self) -> str: | |
| 67 | + """Fetch the URL and extract main text content.""" | |
| 68 | + if self._content is not None: | |
| 69 | + return self._content | |
| 70 | + | |
| 71 | + import requests | |
| 72 | + | |
| 73 | + resp = requests.get(self.url, timeout=30, headers={"User-Agent": "PlanOpticon/0.3"}) | |
| 74 | + resp.raise_for_status() | |
| 75 | + | |
| 76 | + try: | |
| 77 | + from bs4 import BeautifulSoup | |
| 78 | + | |
| 79 | + soup = BeautifulSoup(resp.text, "html.parser") | |
| 80 | + # Remove non-content elements | |
| 81 | + for tag in soup(["script", "style", "nav", "footer", "header", "aside"]): | |
| 82 | + tag.decompose() | |
| 83 | + # Prefer <article> or <main> if present | |
| 84 | + main = soup.find("article") or soup.find("main") or soup.find("body") | |
| 85 | + self._content = main.get_text(separator="\n", strip=True) if main else soup.get_text() | |
| 86 | + except ImportError: | |
| 87 | + logger.debug("beautifulsoup4 not available, using regex fallback") | |
| 88 | + self._content = _strip_html_tags(resp.text) | |
| 89 | + | |
| 90 | + return self._content |
| --- a/video_processor/sources/web_source.py | |
| +++ b/video_processor/sources/web_source.py | |
| @@ -0,0 +1,90 @@ | |
| --- a/video_processor/sources/web_source.py | |
| +++ b/video_processor/sources/web_source.py | |
| @@ -0,0 +1,90 @@ | |
| 1 | """Web page source connector for fetching and extracting text from URLs.""" |
| 2 | |
| 3 | import logging |
| 4 | import re |
| 5 | from pathlib import Path |
| 6 | from typing import List, Optional |
| 7 | |
| 8 | from video_processor.sources.base import BaseSource, SourceFile |
| 9 | |
| 10 | logger = logging.getLogger(__name__) |
| 11 | |
| 12 | |
| 13 | def _strip_html_tags(html: str) -> str: |
| 14 | """Minimal HTML tag stripper using stdlib only.""" |
| 15 | text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE) |
| 16 | text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE) |
| 17 | text = re.sub(r"<(nav|footer|header)[^>]*>.*?</\1>", "", text, flags=re.DOTALL | re.IGNORECASE) |
| 18 | text = re.sub(r"<[^>]+>", " ", text) |
| 19 | text = re.sub(r"\s+", " ", text).strip() |
| 20 | return text |
| 21 | |
| 22 | |
class WebSource(BaseSource):
    """
    Fetch web pages and extract main text content.

    Uses requests + BeautifulSoup (optional) for content extraction.
    Falls back to regex-based tag stripping if bs4 is unavailable.

    Requires: pip install requests (included in most environments)
    Optional: pip install beautifulsoup4 lxml
    """

    def __init__(self, url: str):
        self.url = url
        # Lazily-populated cache of the extracted page text.
        self._content: Optional[str] = None

    def authenticate(self) -> bool:
        """No auth needed for public web pages."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the web page."""
        page_name = self.url.split("/")[-1] or "page"
        entry = SourceFile(name=page_name, id=self.url, mime_type="text/html")
        return [entry]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download and save the extracted text content."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(self.fetch_text(), encoding="utf-8")
        logger.info(f"Saved web content to {destination}")
        return destination

    def fetch_text(self) -> str:
        """Fetch the URL and extract main text content."""
        if self._content is not None:
            return self._content

        import requests

        resp = requests.get(self.url, timeout=30, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        try:
            from bs4 import BeautifulSoup
        except ImportError:
            logger.debug("beautifulsoup4 not available, using regex fallback")
            self._content = _strip_html_tags(resp.text)
            return self._content

        soup = BeautifulSoup(resp.text, "html.parser")
        # Drop elements that never carry article text.
        for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
            element.decompose()
        # Prefer semantic containers when the page provides them.
        root = soup.find("article") or soup.find("main") or soup.find("body")
        self._content = root.get_text(separator="\n", strip=True) if root else soup.get_text()
        return self._content
| --- a/video_processor/sources/youtube_source.py | ||
| +++ b/video_processor/sources/youtube_source.py | ||
| @@ -0,0 +1,118 @@ | ||
| 1 | +"""YouTube source connector using yt-dlp for video/audio download and caption extraction.""" | |
| 2 | + | |
| 3 | +import logging | |
| 4 | +import re | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import List, Optional | |
| 7 | + | |
| 8 | +from video_processor.sources.base import BaseSource, SourceFile | |
| 9 | + | |
| 10 | +logger = logging.getLogger(__name__) | |
| 11 | + | |
| 12 | +_YT_URL_PATTERN = re.compile( | |
| 13 | + r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/shorts/)([\w-]{11})" | |
| 14 | +) | |
| 15 | + | |
| 16 | + | |
| 17 | +def _extract_video_id(url: str) -> str: | |
| 18 | + """Extract the 11-character video ID from a YouTube URL.""" | |
| 19 | + match = _YT_URL_PATTERN.search(url) | |
| 20 | + if not match: | |
| 21 | + raise ValueError(f"Could not extract YouTube video ID from: {url}") | |
| 22 | + return match.group(1) | |
| 23 | + | |
| 24 | + | |
| 25 | +class YouTubeSource(BaseSource): | |
| 26 | + """ | |
| 27 | + Download YouTube videos/audio and extract captions via yt-dlp. | |
| 28 | + | |
| 29 | + Requires: pip install yt-dlp | |
| 30 | + """ | |
| 31 | + | |
| 32 | + def __init__(self, url: str, audio_only: bool = False): | |
| 33 | + self.url = url | |
| 34 | + self.video_id = _extract_video_id(url) | |
| 35 | + self.audio_only = audio_only | |
| 36 | + | |
| 37 | + def authenticate(self) -> bool: | |
| 38 | + """No auth needed for public videos. Returns True if yt-dlp is available.""" | |
| 39 | + try: | |
| 40 | + import yt_dlp # noqa: F401 | |
| 41 | + | |
| 42 | + return True | |
| 43 | + except ImportError: | |
| 44 | + logger.error("yt-dlp not installed. Run: pip install yt-dlp") | |
| 45 | + return False | |
| 46 | + | |
| 47 | + def list_videos( | |
| 48 | + self, | |
| 49 | + folder_id: Optional[str] = None, | |
| 50 | + folder_path: Optional[str] = None, | |
| 51 | + patterns: Optional[List[str]] = None, | |
| 52 | + ) -> List[SourceFile]: | |
| 53 | + """Return a single SourceFile representing the YouTube video.""" | |
| 54 | + import yt_dlp | |
| 55 | + | |
| 56 | + with yt_dlp.YoutubeDL({"quiet": True}) as ydl: | |
| 57 | + info = ydl.extract_info(self.url, download=False) | |
| 58 | + | |
| 59 | + return [ | |
| 60 | + SourceFile( | |
| 61 | + name=info.get("title", self.video_id), | |
| 62 | + id=self.video_id, | |
| 63 | + size_bytes=info.get("filesize"), | |
| 64 | + mime_type="audio/webm" if self.audio_only else "video/mp4", | |
| 65 | + ) | |
| 66 | + ] | |
| 67 | + | |
| 68 | + def download(self, file: SourceFile, destination: Path) -> Path: | |
| 69 | + """Download the video or audio to destination path.""" | |
| 70 | + import yt_dlp | |
| 71 | + | |
| 72 | + destination = Path(destination) | |
| 73 | + destination.parent.mkdir(parents=True, exist_ok=True) | |
| 74 | + | |
| 75 | + opts = { | |
| 76 | + "outtmpl": str(destination), | |
| 77 | + "quiet": True, | |
| 78 | + } | |
| 79 | + if self.audio_only: | |
| 80 | + opts["format"] = "bestaudio/best" | |
| 81 | + opts["postprocessors"] = [{"key": "FFmpegExtractAudio", "preferredcodec": "mp3"}] | |
| 82 | + else: | |
| 83 | + opts["format"] = "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best" | |
| 84 | + | |
| 85 | + with yt_dlp.YoutubeDL(opts) as ydl: | |
| 86 | + ydl.download([self.url]) | |
| 87 | + | |
| 88 | + logger.info(f"Downloaded YouTube video {self.video_id} to {destination}") | |
| 89 | + return destination | |
| 90 | + | |
| 91 | + def fetch_captions(self, lang: str = "en") -> Optional[str]: | |
| 92 | + """Extract auto-generated or manual captions as plain text.""" | |
| 93 | + import yt_dlp | |
| 94 | + | |
| 95 | + opts = { | |
| 96 | + "quiet": True, | |
| 97 | + "writeautomaticsub": True, | |
| 98 | + "writesubtitles": True, | |
| 99 | + "subtitleslangs": [lang], | |
| 100 | + "skip_download": True, | |
| 101 | + } | |
| 102 | + with yt_dlp.YoutubeDL(opts) as ydl: | |
| 103 | + info = ydl.extract_info(self.url, download=False) | |
| 104 | + | |
| 105 | + subs = info.get("subtitles", {}).get(lang) or info.get("automatic_captions", {}).get(lang) | |
| 106 | + if not subs: | |
| 107 | + logger.warning(f"No captions found for language '{lang}'") | |
| 108 | + return None | |
| 109 | + | |
| 110 | + # Prefer vtt/srv format for text extraction | |
| 111 | + for fmt in subs: | |
| 112 | + if fmt.get("ext") in ("vtt", "srv3", "json3"): | |
| 113 | + import requests | |
| 114 | + | |
| 115 | + resp = requests.get(fmt["url"], timeout=30) | |
| 116 | + return resp.text | |
| 117 | + | |
| 118 | + return None |
| --- a/video_processor/sources/youtube_source.py | |
| +++ b/video_processor/sources/youtube_source.py | |
| @@ -0,0 +1,118 @@ | |
| --- a/video_processor/sources/youtube_source.py | |
| +++ b/video_processor/sources/youtube_source.py | |
| @@ -0,0 +1,118 @@ | |
| 1 | """YouTube source connector using yt-dlp for video/audio download and caption extraction.""" |
| 2 | |
| 3 | import logging |
| 4 | import re |
| 5 | from pathlib import Path |
| 6 | from typing import List, Optional |
| 7 | |
| 8 | from video_processor.sources.base import BaseSource, SourceFile |
| 9 | |
| 10 | logger = logging.getLogger(__name__) |
| 11 | |
| 12 | _YT_URL_PATTERN = re.compile( |
| 13 | r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/shorts/)([\w-]{11})" |
| 14 | ) |
| 15 | |
| 16 | |
| 17 | def _extract_video_id(url: str) -> str: |
| 18 | """Extract the 11-character video ID from a YouTube URL.""" |
| 19 | match = _YT_URL_PATTERN.search(url) |
| 20 | if not match: |
| 21 | raise ValueError(f"Could not extract YouTube video ID from: {url}") |
| 22 | return match.group(1) |
| 23 | |
| 24 | |
class YouTubeSource(BaseSource):
    """
    Download YouTube videos/audio and extract captions via yt-dlp.

    Requires: pip install yt-dlp
    """

    def __init__(self, url: str, audio_only: bool = False):
        self.url = url
        # Raises ValueError immediately on an unrecognizable URL.
        self.video_id = _extract_video_id(url)
        # When True, download() extracts an mp3 instead of the full video.
        self.audio_only = audio_only

    def authenticate(self) -> bool:
        """No auth needed for public videos. Returns True if yt-dlp is available."""
        try:
            import yt_dlp  # noqa: F401

            return True
        except ImportError:
            logger.error("yt-dlp not installed. Run: pip install yt-dlp")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the YouTube video.

        The folder/pattern arguments exist for interface compatibility and
        are ignored — the URL identifies exactly one video.
        """
        import yt_dlp

        with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
            info = ydl.extract_info(self.url, download=False)

        return [
            SourceFile(
                name=info.get("title", self.video_id),
                id=self.video_id,
                # filesize may be absent for some formats; SourceFile gets None.
                size_bytes=info.get("filesize"),
                mime_type="audio/webm" if self.audio_only else "video/mp4",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download the video or audio to destination path.

        Returns:
            The path of the file actually written. In audio_only mode the
            FFmpeg post-processor replaces the suffix with .mp3, so this may
            differ from the requested *destination*.
        """
        import yt_dlp

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        opts = {
            "outtmpl": str(destination),
            "quiet": True,
        }
        if self.audio_only:
            opts["format"] = "bestaudio/best"
            opts["postprocessors"] = [{"key": "FFmpegExtractAudio", "preferredcodec": "mp3"}]
        else:
            opts["format"] = "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"

        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([self.url])

        # FFmpegExtractAudio writes <stem>.mp3; report the file that exists
        # rather than a possibly-nonexistent path with the original suffix.
        if self.audio_only:
            mp3_path = destination.with_suffix(".mp3")
            if mp3_path.exists():
                destination = mp3_path

        logger.info(f"Downloaded YouTube video {self.video_id} to {destination}")
        return destination

    def fetch_captions(self, lang: str = "en") -> Optional[str]:
        """Extract auto-generated or manual captions as plain text.

        Args:
            lang: Subtitle language code (default "en").

        Returns:
            Raw caption payload (e.g. WebVTT text) or None when no captions
            in a supported format are available.
        """
        import yt_dlp

        opts = {
            "quiet": True,
            "writeautomaticsub": True,
            "writesubtitles": True,
            "subtitleslangs": [lang],
            "skip_download": True,
        }
        with yt_dlp.YoutubeDL(opts) as ydl:
            info = ydl.extract_info(self.url, download=False)

        # Manual subtitles take priority over automatic captions.
        subs = info.get("subtitles", {}).get(lang) or info.get("automatic_captions", {}).get(lang)
        if not subs:
            logger.warning(f"No captions found for language '{lang}'")
            return None

        # Prefer vtt/srv format for text extraction
        for fmt in subs:
            if fmt.get("ext") in ("vtt", "srv3", "json3"):
                import requests

                resp = requests.get(fmt["url"], timeout=30)
                return resp.text

        return None