PlanOpticon

feat(sources): add YouTube, web, GitHub, Reddit, HN, RSS, podcast, Twitter, arXiv connectors Each connector implements the SourceConnector pattern with optional external dependencies (yt-dlp, feedparser, beautifulsoup4).

lmata 2026-03-07 22:10 trunk
Commit b1d426992ff933d5c74d4f32a0257ce7eadb03ee76026a0edbaf442c0c60f746
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -1,5 +1,40 @@
1
-"""Cloud source integrations for fetching videos from remote storage."""
1
+"""Cloud and web source integrations for fetching content from remote sources."""
22
33
from video_processor.sources.base import BaseSource, SourceFile
44
5
-__all__ = ["BaseSource", "SourceFile"]
5
+__all__ = [
6
+ "BaseSource",
7
+ "SourceFile",
8
+ "ArxivSource",
9
+ "GitHubSource",
10
+ "GoogleDriveSource",
11
+ "HackerNewsSource",
12
+ "PodcastSource",
13
+ "RedditSource",
14
+ "RSSSource",
15
+ "TwitterSource",
16
+ "WebSource",
17
+ "YouTubeSource",
18
+]
19
+
20
+
21
+def __getattr__(name: str):
22
+ """Lazy imports to avoid pulling in optional dependencies at import time."""
23
+ _lazy_map = {
24
+ "ArxivSource": "video_processor.sources.arxiv_source",
25
+ "GitHubSource": "video_processor.sources.github_source",
26
+ "GoogleDriveSource": "video_processor.sources.google_drive",
27
+ "HackerNewsSource": "video_processor.sources.hackernews_source",
28
+ "PodcastSource": "video_processor.sources.podcast_source",
29
+ "RedditSource": "video_processor.sources.reddit_source",
30
+ "RSSSource": "video_processor.sources.rss_source",
31
+ "TwitterSource": "video_processor.sources.twitter_source",
32
+ "WebSource": "video_processor.sources.web_source",
33
+ "YouTubeSource": "video_processor.sources.youtube_source",
34
+ }
35
+ if name in _lazy_map:
36
+ import importlib
37
+
38
+ module = importlib.import_module(_lazy_map[name])
39
+ return getattr(module, name)
40
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
641
742
ADDED video_processor/sources/arxiv_source.py
843
ADDED video_processor/sources/github_source.py
944
ADDED video_processor/sources/hackernews_source.py
1045
ADDED video_processor/sources/podcast_source.py
1146
ADDED video_processor/sources/reddit_source.py
1247
ADDED video_processor/sources/rss_source.py
1348
ADDED video_processor/sources/twitter_source.py
1449
ADDED video_processor/sources/web_source.py
1550
ADDED video_processor/sources/youtube_source.py
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -1,5 +1,40 @@
1 """Cloud source integrations for fetching videos from remote storage."""
2
3 from video_processor.sources.base import BaseSource, SourceFile
4
5 __all__ = ["BaseSource", "SourceFile"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
7 ADDED video_processor/sources/arxiv_source.py
8 ADDED video_processor/sources/github_source.py
9 ADDED video_processor/sources/hackernews_source.py
10 ADDED video_processor/sources/podcast_source.py
11 ADDED video_processor/sources/reddit_source.py
12 ADDED video_processor/sources/rss_source.py
13 ADDED video_processor/sources/twitter_source.py
14 ADDED video_processor/sources/web_source.py
15 ADDED video_processor/sources/youtube_source.py
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -1,5 +1,40 @@
1 """Cloud and web source integrations for fetching content from remote sources."""
2
3 from video_processor.sources.base import BaseSource, SourceFile
4
5 __all__ = [
6 "BaseSource",
7 "SourceFile",
8 "ArxivSource",
9 "GitHubSource",
10 "GoogleDriveSource",
11 "HackerNewsSource",
12 "PodcastSource",
13 "RedditSource",
14 "RSSSource",
15 "TwitterSource",
16 "WebSource",
17 "YouTubeSource",
18 ]
19
20
21 def __getattr__(name: str):
22 """Lazy imports to avoid pulling in optional dependencies at import time."""
23 _lazy_map = {
24 "ArxivSource": "video_processor.sources.arxiv_source",
25 "GitHubSource": "video_processor.sources.github_source",
26 "GoogleDriveSource": "video_processor.sources.google_drive",
27 "HackerNewsSource": "video_processor.sources.hackernews_source",
28 "PodcastSource": "video_processor.sources.podcast_source",
29 "RedditSource": "video_processor.sources.reddit_source",
30 "RSSSource": "video_processor.sources.rss_source",
31 "TwitterSource": "video_processor.sources.twitter_source",
32 "WebSource": "video_processor.sources.web_source",
33 "YouTubeSource": "video_processor.sources.youtube_source",
34 }
35 if name in _lazy_map:
36 import importlib
37
38 module = importlib.import_module(_lazy_map[name])
39 return getattr(module, name)
40 raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
41
42 ADDED video_processor/sources/arxiv_source.py
43 ADDED video_processor/sources/github_source.py
44 ADDED video_processor/sources/hackernews_source.py
45 ADDED video_processor/sources/podcast_source.py
46 ADDED video_processor/sources/reddit_source.py
47 ADDED video_processor/sources/rss_source.py
48 ADDED video_processor/sources/twitter_source.py
49 ADDED video_processor/sources/web_source.py
50 ADDED video_processor/sources/youtube_source.py
--- a/video_processor/sources/arxiv_source.py
+++ b/video_processor/sources/arxiv_source.py
@@ -0,0 +1,117 @@
1
+"""arXiv source connector for fetching paper metadata and PDFs."""
2
+
3
+import logging
4
+import re
5
+from pathlib import Path
6
+from typing import List, Optional
7
+
8
+from video_processor.sources.base import BaseSource, SourceFile
9
+
10
+logger = logging.getLogger(__name__)
11
+
12
+_ARXIV_ID_PATTERN = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?")
13
+ARXIV_API = "http://export.arxiv.org/api/query"
14
+
15
+
16
+def _extract_arxiv_id(url_or_id: str) -> str:
17
+ """Extract arXiv paper ID from a URL or bare ID string."""
18
+ match = _ARXIV_ID_PATTERN.search(url_or_id)
19
+ if not match:
20
+ raise ValueError(f"Could not extract arXiv ID from: {url_or_id}")
21
+ return match.group(0)
22
+
23
+
24
+class ArxivSource(BaseSource):
25
+ """
26
+ Fetch arXiv paper metadata and PDF.
27
+
28
+ Uses the arXiv API (Atom feed) for metadata and direct PDF download.
29
+ Requires: pip install requests
30
+ """
31
+
32
+ def __init__(self, url_or_id: str):
33
+ self.arxiv_id = _extract_arxiv_id(url_or_id)
34
+ self._metadata: Optional[dict] = None
35
+
36
+ def authenticate(self) -> bool:
37
+ """No auth needed for arXiv."""
38
+ return True
39
+
40
+ def _fetch_metadata(self) -> dict:
41
+ """Fetch paper metadata from the arXiv API."""
42
+ if self._metadata:
43
+ return self._metadata
44
+
45
+ import xml.etree.ElementTree as ET
46
+
47
+ import requests
48
+
49
+ resp = requests.get(ARXIV_API, params={"id_list": self.arxiv_id}, timeout=15)
50
+ resp.raise_for_status()
51
+
52
+ ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
53
+ root = ET.fromstring(resp.text)
54
+ entry = root.find("atom:entry", ns)
55
+ if entry is None:
56
+ raise ValueError(f"Paper not found: {self.arxiv_id}")
57
+
58
+ self._metadata = {
59
+ "title": (entry.findtext("atom:title", namespaces=ns) or "").strip(),
60
+ "summary": (entry.findtext("atom:summary", namespaces=ns) or "").strip(),
61
+ "authors": [
62
+ a.findtext("atom:name", namespaces=ns) or ""
63
+ for a in entry.findall("atom:author", ns)
64
+ ],
65
+ "published": entry.findtext("atom:published", namespaces=ns) or "",
66
+ "pdf_url": f"https://arxiv.org/pdf/{self.arxiv_id}.pdf",
67
+ }
68
+ return self._metadata
69
+
70
+ def list_videos(
71
+ self,
72
+ folder_id: Optional[str] = None,
73
+ folder_path: Optional[str] = None,
74
+ patterns: Optional[List[str]] = None,
75
+ ) -> List[SourceFile]:
76
+ """Return SourceFiles for the paper metadata and PDF."""
77
+ meta = self._fetch_metadata()
78
+ return [
79
+ SourceFile(
80
+ name=f"{meta['title']} (metadata)",
81
+ id=f"meta:{self.arxiv_id}",
82
+ mime_type="text/plain",
83
+ ),
84
+ SourceFile(
85
+ name=f"{meta['title']}.pdf",
86
+ id=f"pdf:{self.arxiv_id}",
87
+ mime_type="application/pdf",
88
+ ),
89
+ ]
90
+
91
+ def download(self, file: SourceFile, destination: Path) -> Path:
92
+ """Download paper metadata as text or the PDF file."""
93
+ import requests
94
+
95
+ destination = Path(destination)
96
+ destination.parent.mkdir(parents=True, exist_ok=True)
97
+ meta = self._fetch_metadata()
98
+
99
+ if file.id.startswith("meta:"):
100
+ authors = ", ".join(meta["authors"])
101
+ text = (
102
+ f"# {meta['title']}\n\n"
103
+ f"Authors: {authors}\n"
104
+ f"Published: {meta['published']}\n"
105
+ f"arXiv: {self.arxiv_id}\n\n"
106
+ f"## Abstract\n\n{meta['summary']}"
107
+ )
108
+ destination.write_text(text, encoding="utf-8")
109
+ elif file.id.startswith("pdf:"):
110
+ resp = requests.get(meta["pdf_url"], timeout=60, stream=True)
111
+ resp.raise_for_status()
112
+ with open(destination, "wb") as f:
113
+ for chunk in resp.iter_content(chunk_size=8192):
114
+ f.write(chunk)
115
+
116
+ logger.info(f"Downloaded arXiv {self.arxiv_id} to {destination}")
117
+ return destination
--- a/video_processor/sources/arxiv_source.py
+++ b/video_processor/sources/arxiv_source.py
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/arxiv_source.py
+++ b/video_processor/sources/arxiv_source.py
@@ -0,0 +1,117 @@
1 """arXiv source connector for fetching paper metadata and PDFs."""
2
3 import logging
4 import re
5 from pathlib import Path
6 from typing import List, Optional
7
8 from video_processor.sources.base import BaseSource, SourceFile
9
10 logger = logging.getLogger(__name__)
11
12 _ARXIV_ID_PATTERN = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?")
13 ARXIV_API = "http://export.arxiv.org/api/query"
14
15
16 def _extract_arxiv_id(url_or_id: str) -> str:
17 """Extract arXiv paper ID from a URL or bare ID string."""
18 match = _ARXIV_ID_PATTERN.search(url_or_id)
19 if not match:
20 raise ValueError(f"Could not extract arXiv ID from: {url_or_id}")
21 return match.group(0)
22
23
24 class ArxivSource(BaseSource):
25 """
26 Fetch arXiv paper metadata and PDF.
27
28 Uses the arXiv API (Atom feed) for metadata and direct PDF download.
29 Requires: pip install requests
30 """
31
32 def __init__(self, url_or_id: str):
33 self.arxiv_id = _extract_arxiv_id(url_or_id)
34 self._metadata: Optional[dict] = None
35
36 def authenticate(self) -> bool:
37 """No auth needed for arXiv."""
38 return True
39
40 def _fetch_metadata(self) -> dict:
41 """Fetch paper metadata from the arXiv API."""
42 if self._metadata:
43 return self._metadata
44
45 import xml.etree.ElementTree as ET
46
47 import requests
48
49 resp = requests.get(ARXIV_API, params={"id_list": self.arxiv_id}, timeout=15)
50 resp.raise_for_status()
51
52 ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
53 root = ET.fromstring(resp.text)
54 entry = root.find("atom:entry", ns)
55 if entry is None:
56 raise ValueError(f"Paper not found: {self.arxiv_id}")
57
58 self._metadata = {
59 "title": (entry.findtext("atom:title", namespaces=ns) or "").strip(),
60 "summary": (entry.findtext("atom:summary", namespaces=ns) or "").strip(),
61 "authors": [
62 a.findtext("atom:name", namespaces=ns) or ""
63 for a in entry.findall("atom:author", ns)
64 ],
65 "published": entry.findtext("atom:published", namespaces=ns) or "",
66 "pdf_url": f"https://arxiv.org/pdf/{self.arxiv_id}.pdf",
67 }
68 return self._metadata
69
70 def list_videos(
71 self,
72 folder_id: Optional[str] = None,
73 folder_path: Optional[str] = None,
74 patterns: Optional[List[str]] = None,
75 ) -> List[SourceFile]:
76 """Return SourceFiles for the paper metadata and PDF."""
77 meta = self._fetch_metadata()
78 return [
79 SourceFile(
80 name=f"{meta['title']} (metadata)",
81 id=f"meta:{self.arxiv_id}",
82 mime_type="text/plain",
83 ),
84 SourceFile(
85 name=f"{meta['title']}.pdf",
86 id=f"pdf:{self.arxiv_id}",
87 mime_type="application/pdf",
88 ),
89 ]
90
91 def download(self, file: SourceFile, destination: Path) -> Path:
92 """Download paper metadata as text or the PDF file."""
93 import requests
94
95 destination = Path(destination)
96 destination.parent.mkdir(parents=True, exist_ok=True)
97 meta = self._fetch_metadata()
98
99 if file.id.startswith("meta:"):
100 authors = ", ".join(meta["authors"])
101 text = (
102 f"# {meta['title']}\n\n"
103 f"Authors: {authors}\n"
104 f"Published: {meta['published']}\n"
105 f"arXiv: {self.arxiv_id}\n\n"
106 f"## Abstract\n\n{meta['summary']}"
107 )
108 destination.write_text(text, encoding="utf-8")
109 elif file.id.startswith("pdf:"):
110 resp = requests.get(meta["pdf_url"], timeout=60, stream=True)
111 resp.raise_for_status()
112 with open(destination, "wb") as f:
113 for chunk in resp.iter_content(chunk_size=8192):
114 f.write(chunk)
115
116 logger.info(f"Downloaded arXiv {self.arxiv_id} to {destination}")
117 return destination
--- a/video_processor/sources/github_source.py
+++ b/video_processor/sources/github_source.py
@@ -0,0 +1,156 @@
1
+"""GitHub source connector for fetching repo content, issues, and PRs."""
2
+
3
+import logging
4
+import os
5
+from pathlib import Path
6
+from typing import List, Optional
7
+
8
+from video_processor.sources.base import BaseSource, SourceFile
9
+
10
+logger = logging.getLogger(__name__)
11
+
12
+API_BASE = "https://api.github.com"
13
+
14
+
15
+class GitHubSource(BaseSource):
16
+ """
17
+ Fetch GitHub repository README, issues, and pull requests as text documents.
18
+
19
+ Auth: Set GITHUB_TOKEN env var, or use `gh auth token` output.
20
+ Requires: pip install requests
21
+ """
22
+
23
+ def __init__(self, repo: str, include_issues: bool = True, include_prs: bool = True):
24
+ """
25
+ Parameters
26
+ ----------
27
+ repo : str
28
+ GitHub repo in "owner/repo" format.
29
+ """
30
+ self.repo = repo
31
+ self.include_issues = include_issues
32
+ self.include_prs = include_prs
33
+ self._token: Optional[str] = None
34
+
35
+ def authenticate(self) -> bool:
36
+ """Authenticate via GITHUB_TOKEN env var or gh CLI."""
37
+ self._token = os.environ.get("GITHUB_TOKEN")
38
+ if not self._token:
39
+ try:
40
+ import subprocess
41
+
42
+ result = subprocess.run(["gh", "auth", "token"], capture_output=True, text=True)
43
+ if result.returncode == 0:
44
+ self._token = result.stdout.strip()
45
+ except FileNotFoundError:
46
+ pass
47
+ if not self._token:
48
+ logger.warning(
49
+ "No GitHub token found. Public repos only. Set GITHUB_TOKEN for private repos."
50
+ )
51
+ return True
52
+
53
+ def _headers(self) -> dict:
54
+ h = {"Accept": "application/vnd.github.v3+json"}
55
+ if self._token:
56
+ h["Authorization"] = f"Bearer {self._token}"
57
+ return h
58
+
59
+ def list_videos(
60
+ self,
61
+ folder_id: Optional[str] = None,
62
+ folder_path: Optional[str] = None,
63
+ patterns: Optional[List[str]] = None,
64
+ ) -> List[SourceFile]:
65
+ """List available documents (README, issues, PRs) as SourceFiles."""
66
+ import requests
67
+
68
+ files = []
69
+ # README
70
+ resp = requests.get(
71
+ f"{API_BASE}/repos/{self.repo}/readme", headers=self._headers(), timeout=15
72
+ )
73
+ if resp.ok:
74
+ files.append(SourceFile(name="README", id="readme", mime_type="text/markdown"))
75
+
76
+ # Issues
77
+ if self.include_issues:
78
+ resp = requests.get(
79
+ f"{API_BASE}/repos/{self.repo}/issues",
80
+ headers=self._headers(),
81
+ params={"state": "all", "per_page": 100},
82
+ timeout=15,
83
+ )
84
+ if resp.ok:
85
+ for issue in resp.json():
86
+ if "pull_request" not in issue:
87
+ files.append(
88
+ SourceFile(
89
+ name=f"Issue #{issue['number']}: {issue['title']}",
90
+ id=f"issue:{issue['number']}",
91
+ mime_type="text/plain",
92
+ )
93
+ )
94
+
95
+ # PRs
96
+ if self.include_prs:
97
+ resp = requests.get(
98
+ f"{API_BASE}/repos/{self.repo}/pulls",
99
+ headers=self._headers(),
100
+ params={"state": "all", "per_page": 100},
101
+ timeout=15,
102
+ )
103
+ if resp.ok:
104
+ for pr in resp.json():
105
+ files.append(
106
+ SourceFile(
107
+ name=f"PR #{pr['number']}: {pr['title']}",
108
+ id=f"pr:{pr['number']}",
109
+ mime_type="text/plain",
110
+ )
111
+ )
112
+
113
+ return files
114
+
115
+ def download(self, file: SourceFile, destination: Path) -> Path:
116
+ """Download a single document (README, issue, or PR) as text."""
117
+ import requests
118
+
119
+ destination = Path(destination)
120
+ destination.parent.mkdir(parents=True, exist_ok=True)
121
+
122
+ if file.id == "readme":
123
+ resp = requests.get(
124
+ f"{API_BASE}/repos/{self.repo}/readme",
125
+ headers={**self._headers(), "Accept": "application/vnd.github.v3.raw"},
126
+ timeout=15,
127
+ )
128
+ destination.write_text(resp.text, encoding="utf-8")
129
+ elif file.id.startswith("issue:"):
130
+ num = file.id.split(":")[1]
131
+ resp = requests.get(
132
+ f"{API_BASE}/repos/{self.repo}/issues/{num}",
133
+ headers=self._headers(),
134
+ timeout=15,
135
+ )
136
+ data = resp.json()
137
+ text = f"# {data['title']}\n\n{data.get('body', '') or ''}"
138
+ # Append comments
139
+ comments_resp = requests.get(data["comments_url"], headers=self._headers(), timeout=15)
140
+ if comments_resp.ok:
141
+ for c in comments_resp.json():
142
+ text += f"\n\n---\n**{c['user']['login']}**: {c.get('body', '')}"
143
+ destination.write_text(text, encoding="utf-8")
144
+ elif file.id.startswith("pr:"):
145
+ num = file.id.split(":")[1]
146
+ resp = requests.get(
147
+ f"{API_BASE}/repos/{self.repo}/pulls/{num}",
148
+ headers=self._headers(),
149
+ timeout=15,
150
+ )
151
+ data = resp.json()
152
+ text = f"# PR: {data['title']}\n\n{data.get('body', '') or ''}"
153
+ destination.write_text(text, encoding="utf-8")
154
+
155
+ logger.info(f"Downloaded {file.name} to {destination}")
156
+ return destination
--- a/video_processor/sources/github_source.py
+++ b/video_processor/sources/github_source.py
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/github_source.py
+++ b/video_processor/sources/github_source.py
@@ -0,0 +1,156 @@
1 """GitHub source connector for fetching repo content, issues, and PRs."""
2
3 import logging
4 import os
5 from pathlib import Path
6 from typing import List, Optional
7
8 from video_processor.sources.base import BaseSource, SourceFile
9
10 logger = logging.getLogger(__name__)
11
12 API_BASE = "https://api.github.com"
13
14
15 class GitHubSource(BaseSource):
16 """
17 Fetch GitHub repository README, issues, and pull requests as text documents.
18
19 Auth: Set GITHUB_TOKEN env var, or use `gh auth token` output.
20 Requires: pip install requests
21 """
22
23 def __init__(self, repo: str, include_issues: bool = True, include_prs: bool = True):
24 """
25 Parameters
26 ----------
27 repo : str
28 GitHub repo in "owner/repo" format.
29 """
30 self.repo = repo
31 self.include_issues = include_issues
32 self.include_prs = include_prs
33 self._token: Optional[str] = None
34
35 def authenticate(self) -> bool:
36 """Authenticate via GITHUB_TOKEN env var or gh CLI."""
37 self._token = os.environ.get("GITHUB_TOKEN")
38 if not self._token:
39 try:
40 import subprocess
41
42 result = subprocess.run(["gh", "auth", "token"], capture_output=True, text=True)
43 if result.returncode == 0:
44 self._token = result.stdout.strip()
45 except FileNotFoundError:
46 pass
47 if not self._token:
48 logger.warning(
49 "No GitHub token found. Public repos only. Set GITHUB_TOKEN for private repos."
50 )
51 return True
52
53 def _headers(self) -> dict:
54 h = {"Accept": "application/vnd.github.v3+json"}
55 if self._token:
56 h["Authorization"] = f"Bearer {self._token}"
57 return h
58
59 def list_videos(
60 self,
61 folder_id: Optional[str] = None,
62 folder_path: Optional[str] = None,
63 patterns: Optional[List[str]] = None,
64 ) -> List[SourceFile]:
65 """List available documents (README, issues, PRs) as SourceFiles."""
66 import requests
67
68 files = []
69 # README
70 resp = requests.get(
71 f"{API_BASE}/repos/{self.repo}/readme", headers=self._headers(), timeout=15
72 )
73 if resp.ok:
74 files.append(SourceFile(name="README", id="readme", mime_type="text/markdown"))
75
76 # Issues
77 if self.include_issues:
78 resp = requests.get(
79 f"{API_BASE}/repos/{self.repo}/issues",
80 headers=self._headers(),
81 params={"state": "all", "per_page": 100},
82 timeout=15,
83 )
84 if resp.ok:
85 for issue in resp.json():
86 if "pull_request" not in issue:
87 files.append(
88 SourceFile(
89 name=f"Issue #{issue['number']}: {issue['title']}",
90 id=f"issue:{issue['number']}",
91 mime_type="text/plain",
92 )
93 )
94
95 # PRs
96 if self.include_prs:
97 resp = requests.get(
98 f"{API_BASE}/repos/{self.repo}/pulls",
99 headers=self._headers(),
100 params={"state": "all", "per_page": 100},
101 timeout=15,
102 )
103 if resp.ok:
104 for pr in resp.json():
105 files.append(
106 SourceFile(
107 name=f"PR #{pr['number']}: {pr['title']}",
108 id=f"pr:{pr['number']}",
109 mime_type="text/plain",
110 )
111 )
112
113 return files
114
115 def download(self, file: SourceFile, destination: Path) -> Path:
116 """Download a single document (README, issue, or PR) as text."""
117 import requests
118
119 destination = Path(destination)
120 destination.parent.mkdir(parents=True, exist_ok=True)
121
122 if file.id == "readme":
123 resp = requests.get(
124 f"{API_BASE}/repos/{self.repo}/readme",
125 headers={**self._headers(), "Accept": "application/vnd.github.v3.raw"},
126 timeout=15,
127 )
128 destination.write_text(resp.text, encoding="utf-8")
129 elif file.id.startswith("issue:"):
130 num = file.id.split(":")[1]
131 resp = requests.get(
132 f"{API_BASE}/repos/{self.repo}/issues/{num}",
133 headers=self._headers(),
134 timeout=15,
135 )
136 data = resp.json()
137 text = f"# {data['title']}\n\n{data.get('body', '') or ''}"
138 # Append comments
139 comments_resp = requests.get(data["comments_url"], headers=self._headers(), timeout=15)
140 if comments_resp.ok:
141 for c in comments_resp.json():
142 text += f"\n\n---\n**{c['user']['login']}**: {c.get('body', '')}"
143 destination.write_text(text, encoding="utf-8")
144 elif file.id.startswith("pr:"):
145 num = file.id.split(":")[1]
146 resp = requests.get(
147 f"{API_BASE}/repos/{self.repo}/pulls/{num}",
148 headers=self._headers(),
149 timeout=15,
150 )
151 data = resp.json()
152 text = f"# PR: {data['title']}\n\n{data.get('body', '') or ''}"
153 destination.write_text(text, encoding="utf-8")
154
155 logger.info(f"Downloaded {file.name} to {destination}")
156 return destination
--- a/video_processor/sources/hackernews_source.py
+++ b/video_processor/sources/hackernews_source.py
@@ -0,0 +1,112 @@
1
+"""Hacker News source connector using the official Firebase API."""
2
+
3
+import logging
4
+from pathlib import Path
5
+from typing import List, Optional
6
+
7
+from video_processor.sources.base import BaseSource, SourceFile
8
+
9
+logger = logging.getLogger(__name__)
10
+
11
+HN_API = "https://hacker-news.firebaseio.com/v0"
12
+
13
+
14
+class HackerNewsSource(BaseSource):
15
+ """
16
+ Fetch Hacker News stories and comments via the public API.
17
+
18
+ API docs: https://github.com/HackerNews/API
19
+ Requires: pip install requests
20
+ """
21
+
22
+ def __init__(self, item_id: int, max_comments: int = 200):
23
+ """
24
+ Parameters
25
+ ----------
26
+ item_id : int
27
+ HN story/item ID (e.g., 12345678).
28
+ max_comments : int
29
+ Maximum number of comments to fetch (default 200).
30
+ """
31
+ self.item_id = item_id
32
+ self.max_comments = max_comments
33
+
34
+ def authenticate(self) -> bool:
35
+ """No auth needed for the HN API."""
36
+ return True
37
+
38
+ def list_videos(
39
+ self,
40
+ folder_id: Optional[str] = None,
41
+ folder_path: Optional[str] = None,
42
+ patterns: Optional[List[str]] = None,
43
+ ) -> List[SourceFile]:
44
+ """Return a single SourceFile for the HN story."""
45
+ return [
46
+ SourceFile(
47
+ name=f"hn_{self.item_id}",
48
+ id=str(self.item_id),
49
+ mime_type="text/plain",
50
+ )
51
+ ]
52
+
53
+ def download(self, file: SourceFile, destination: Path) -> Path:
54
+ """Download the story and comments as plain text."""
55
+ destination = Path(destination)
56
+ destination.parent.mkdir(parents=True, exist_ok=True)
57
+ text = self.fetch_text()
58
+ destination.write_text(text, encoding="utf-8")
59
+ logger.info(f"Saved HN story {self.item_id} to {destination}")
60
+ return destination
61
+
62
+ def _get_item(self, item_id: int) -> dict:
63
+ import requests
64
+
65
+ resp = requests.get(f"{HN_API}/item/{item_id}.json", timeout=10)
66
+ resp.raise_for_status()
67
+ return resp.json() or {}
68
+
69
+ def fetch_text(self) -> str:
70
+ """Fetch story and comments as structured text."""
71
+ story = self._get_item(self.item_id)
72
+ lines = []
73
+ lines.append(f"# {story.get('title', 'Untitled')}")
74
+ lines.append(f"by {story.get('by', 'unknown')} | {story.get('score', 0)} points")
75
+ if story.get("url"):
76
+ lines.append(f"URL: {story['url']}")
77
+ if story.get("text"):
78
+ lines.append(f"\n{story['text']}")
79
+ lines.append("")
80
+
81
+ # Fetch comments
82
+ kid_ids = story.get("kids", [])
83
+ if kid_ids:
84
+ lines.append("## Comments\n")
85
+ count = [0]
86
+ self._fetch_comments(kid_ids, lines, depth=0, count=count)
87
+
88
+ return "\n".join(lines)
89
+
90
+ def _fetch_comments(self, kid_ids: list, lines: list, depth: int, count: list) -> None:
91
+ """Recursively fetch and format comments."""
92
+ indent = " " * depth
93
+ for kid_id in kid_ids:
94
+ if count[0] >= self.max_comments:
95
+ return
96
+ try:
97
+ item = self._get_item(kid_id)
98
+ except Exception:
99
+ continue
100
+
101
+ if item.get("deleted") or item.get("dead"):
102
+ continue
103
+
104
+ count[0] += 1
105
+ author = item.get("by", "[deleted]")
106
+ text = item.get("text", "")
107
+ lines.append(f"{indent}**{author}**:")
108
+ lines.append(f"{indent}{text}")
109
+ lines.append("")
110
+
111
+ if item.get("kids"):
112
+ self._fetch_comments(item["kids"], lines, depth + 1, count)
--- a/video_processor/sources/hackernews_source.py
+++ b/video_processor/sources/hackernews_source.py
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/hackernews_source.py
+++ b/video_processor/sources/hackernews_source.py
@@ -0,0 +1,112 @@
1 """Hacker News source connector using the official Firebase API."""
2
3 import logging
4 from pathlib import Path
5 from typing import List, Optional
6
7 from video_processor.sources.base import BaseSource, SourceFile
8
9 logger = logging.getLogger(__name__)
10
11 HN_API = "https://hacker-news.firebaseio.com/v0"
12
13
14 class HackerNewsSource(BaseSource):
15 """
16 Fetch Hacker News stories and comments via the public API.
17
18 API docs: https://github.com/HackerNews/API
19 Requires: pip install requests
20 """
21
22 def __init__(self, item_id: int, max_comments: int = 200):
23 """
24 Parameters
25 ----------
26 item_id : int
27 HN story/item ID (e.g., 12345678).
28 max_comments : int
29 Maximum number of comments to fetch (default 200).
30 """
31 self.item_id = item_id
32 self.max_comments = max_comments
33
34 def authenticate(self) -> bool:
35 """No auth needed for the HN API."""
36 return True
37
38 def list_videos(
39 self,
40 folder_id: Optional[str] = None,
41 folder_path: Optional[str] = None,
42 patterns: Optional[List[str]] = None,
43 ) -> List[SourceFile]:
44 """Return a single SourceFile for the HN story."""
45 return [
46 SourceFile(
47 name=f"hn_{self.item_id}",
48 id=str(self.item_id),
49 mime_type="text/plain",
50 )
51 ]
52
53 def download(self, file: SourceFile, destination: Path) -> Path:
54 """Download the story and comments as plain text."""
55 destination = Path(destination)
56 destination.parent.mkdir(parents=True, exist_ok=True)
57 text = self.fetch_text()
58 destination.write_text(text, encoding="utf-8")
59 logger.info(f"Saved HN story {self.item_id} to {destination}")
60 return destination
61
62 def _get_item(self, item_id: int) -> dict:
63 import requests
64
65 resp = requests.get(f"{HN_API}/item/{item_id}.json", timeout=10)
66 resp.raise_for_status()
67 return resp.json() or {}
68
69 def fetch_text(self) -> str:
70 """Fetch story and comments as structured text."""
71 story = self._get_item(self.item_id)
72 lines = []
73 lines.append(f"# {story.get('title', 'Untitled')}")
74 lines.append(f"by {story.get('by', 'unknown')} | {story.get('score', 0)} points")
75 if story.get("url"):
76 lines.append(f"URL: {story['url']}")
77 if story.get("text"):
78 lines.append(f"\n{story['text']}")
79 lines.append("")
80
81 # Fetch comments
82 kid_ids = story.get("kids", [])
83 if kid_ids:
84 lines.append("## Comments\n")
85 count = [0]
86 self._fetch_comments(kid_ids, lines, depth=0, count=count)
87
88 return "\n".join(lines)
89
90 def _fetch_comments(self, kid_ids: list, lines: list, depth: int, count: list) -> None:
91 """Recursively fetch and format comments."""
92 indent = " " * depth
93 for kid_id in kid_ids:
94 if count[0] >= self.max_comments:
95 return
96 try:
97 item = self._get_item(kid_id)
98 except Exception:
99 continue
100
101 if item.get("deleted") or item.get("dead"):
102 continue
103
104 count[0] += 1
105 author = item.get("by", "[deleted]")
106 text = item.get("text", "")
107 lines.append(f"{indent}**{author}**:")
108 lines.append(f"{indent}{text}")
109 lines.append("")
110
111 if item.get("kids"):
112 self._fetch_comments(item["kids"], lines, depth + 1, count)
--- a/video_processor/sources/podcast_source.py
+++ b/video_processor/sources/podcast_source.py
@@ -0,0 +1,119 @@
1
+"""Podcast feed source connector -- extends RSS for audio enclosures."""
2
+
3
+import logging
4
+from pathlib import Path
5
+from typing import List, Optional
6
+
7
+from video_processor.sources.base import BaseSource, SourceFile
8
+
9
+logger = logging.getLogger(__name__)
10
+
11
+
12
+class PodcastSource(BaseSource):
13
+ """
14
+ Parse podcast RSS feeds and download audio episodes for pipeline processing.
15
+
16
+ Extends the RSS pattern to extract <enclosure> audio URLs.
17
+ Requires: pip install requests
18
+ Optional: pip install feedparser
19
+ """
20
+
21
+ def __init__(self, feed_url: str, max_episodes: int = 10):
22
+ self.feed_url = feed_url
23
+ self.max_episodes = max_episodes
24
+ self._episodes: List[dict] = []
25
+
26
+ def authenticate(self) -> bool:
27
+ """No auth needed for public podcast feeds."""
28
+ return True
29
+
30
+ def _parse_feed(self) -> None:
31
+ """Fetch and parse the podcast feed for audio enclosures."""
32
+ if self._episodes:
33
+ return
34
+
35
+ import requests
36
+
37
+ resp = requests.get(self.feed_url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
38
+ resp.raise_for_status()
39
+
40
+ try:
41
+ import feedparser
42
+
43
+ feed = feedparser.parse(resp.text)
44
+ for entry in feed.entries[: self.max_episodes]:
45
+ audio_url = None
46
+ for link in entry.get("links", []):
47
+ if link.get("type", "").startswith("audio/"):
48
+ audio_url = link.get("href")
49
+ break
50
+ if not audio_url and entry.get("enclosures"):
51
+ audio_url = entry["enclosures"][0].get("href")
52
+ if audio_url:
53
+ self._episodes.append(
54
+ {
55
+ "title": entry.get("title", "Untitled"),
56
+ "url": audio_url,
57
+ "published": entry.get("published", ""),
58
+ "duration": entry.get("itunes_duration", ""),
59
+ }
60
+ )
61
+ except ImportError:
62
+ logger.debug("feedparser not available, using xml.etree fallback")
63
+ self._parse_xml(resp.text)
64
+
65
+ def _parse_xml(self, text: str) -> None:
66
+ """Fallback parser for podcast XML using stdlib."""
67
+ import xml.etree.ElementTree as ET
68
+
69
+ root = ET.fromstring(text)
70
+ items = root.findall(".//item")
71
+ for item in items[: self.max_episodes]:
72
+ enclosure = item.find("enclosure")
73
+ if enclosure is None:
74
+ continue
75
+ audio_url = enclosure.get("url", "")
76
+ if not audio_url:
77
+ continue
78
+ title = item.findtext("title") or "Untitled"
79
+ pub = item.findtext("pubDate") or ""
80
+ self._episodes.append(
81
+ {"title": title, "url": audio_url, "published": pub, "duration": ""}
82
+ )
83
+
84
+ def list_videos(
85
+ self,
86
+ folder_id: Optional[str] = None,
87
+ folder_path: Optional[str] = None,
88
+ patterns: Optional[List[str]] = None,
89
+ ) -> List[SourceFile]:
90
+ """List podcast episodes as SourceFiles."""
91
+ self._parse_feed()
92
+ return [
93
+ SourceFile(
94
+ name=ep["title"],
95
+ id=ep["url"],
96
+ mime_type="audio/mpeg",
97
+ modified_at=ep["published"],
98
+ )
99
+ for ep in self._episodes
100
+ ]
101
+
102
+ def download(self, file: SourceFile, destination: Path) -> Path:
103
+ """Download the podcast audio file."""
104
+ import requests
105
+
106
+ destination = Path(destination)
107
+ destination.parent.mkdir(parents=True, exist_ok=True)
108
+
109
+ resp = requests.get(
110
+ file.id, timeout=60, stream=True, headers={"User-Agent": "PlanOpticon/0.3"}
111
+ )
112
+ resp.raise_for_status()
113
+
114
+ with open(destination, "wb") as f:
115
+ for chunk in resp.iter_content(chunk_size=8192):
116
+ f.write(chunk)
117
+
118
+ logger.info(f"Downloaded podcast episode to {destination}")
119
+ return destination
--- a/video_processor/sources/podcast_source.py
+++ b/video_processor/sources/podcast_source.py
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/podcast_source.py
+++ b/video_processor/sources/podcast_source.py
@@ -0,0 +1,119 @@
1 """Podcast feed source connector -- extends RSS for audio enclosures."""
2
3 import logging
4 from pathlib import Path
5 from typing import List, Optional
6
7 from video_processor.sources.base import BaseSource, SourceFile
8
9 logger = logging.getLogger(__name__)
10
11
class PodcastSource(BaseSource):
    """
    Parse podcast RSS feeds and download audio episodes for pipeline processing.

    Extends the RSS pattern to extract <enclosure> audio URLs.
    Requires: pip install requests
    Optional: pip install feedparser
    """

    def __init__(self, feed_url: str, max_episodes: int = 10):
        """
        Parameters
        ----------
        feed_url : str
            URL of the podcast RSS feed.
        max_episodes : int
            Maximum number of episodes to parse from the feed.
        """
        self.feed_url = feed_url
        self.max_episodes = max_episodes
        # Parsed episode dicts: {"title", "url", "published", "duration"}.
        self._episodes: List[dict] = []

    def authenticate(self) -> bool:
        """No auth needed for public podcast feeds."""
        return True

    def _parse_feed(self) -> None:
        """Fetch and parse the podcast feed for audio enclosures (cached after first success)."""
        if self._episodes:
            return

        import requests

        resp = requests.get(self.feed_url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Guard ONLY the optional import. Previously the whole parsing loop sat
        # inside the try, so any ImportError raised during parsing was silently
        # rerouted to the XML fallback.
        try:
            import feedparser
        except ImportError:
            logger.debug("feedparser not available, using xml.etree fallback")
            self._parse_xml(resp.text)
            return

        feed = feedparser.parse(resp.text)
        for entry in feed.entries[: self.max_episodes]:
            # Prefer an explicit audio/* link; fall back to the first enclosure.
            audio_url = None
            for link in entry.get("links", []):
                if link.get("type", "").startswith("audio/"):
                    audio_url = link.get("href")
                    break
            if not audio_url and entry.get("enclosures"):
                audio_url = entry["enclosures"][0].get("href")
            if audio_url:
                self._episodes.append(
                    {
                        "title": entry.get("title", "Untitled"),
                        "url": audio_url,
                        "published": entry.get("published", ""),
                        "duration": entry.get("itunes_duration", ""),
                    }
                )

    def _parse_xml(self, text: str) -> None:
        """Fallback parser for podcast XML using stdlib."""
        import xml.etree.ElementTree as ET

        root = ET.fromstring(text)
        items = root.findall(".//item")
        for item in items[: self.max_episodes]:
            enclosure = item.find("enclosure")
            if enclosure is None:
                continue
            audio_url = enclosure.get("url", "")
            if not audio_url:
                continue
            title = item.findtext("title") or "Untitled"
            pub = item.findtext("pubDate") or ""
            self._episodes.append(
                {"title": title, "url": audio_url, "published": pub, "duration": ""}
            )

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List podcast episodes as SourceFiles (folder args are unused for feeds)."""
        self._parse_feed()
        return [
            SourceFile(
                name=ep["title"],
                id=ep["url"],
                mime_type="audio/mpeg",
                modified_at=ep["published"],
            )
            for ep in self._episodes
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download the podcast audio file (``file.id`` is the enclosure URL)."""
        import requests

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        # Use the response as a context manager so the streamed connection is
        # released even if writing to disk fails (the old code never closed it).
        with requests.get(
            file.id, timeout=60, stream=True, headers={"User-Agent": "PlanOpticon/0.3"}
        ) as resp:
            resp.raise_for_status()
            with open(destination, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)

        logger.info(f"Downloaded podcast episode to {destination}")
        return destination
--- a/video_processor/sources/reddit_source.py
+++ b/video_processor/sources/reddit_source.py
@@ -0,0 +1,103 @@
1
+"""Reddit source connector using the public JSON API."""
2
+
3
+import logging
4
+from pathlib import Path
5
+from typing import List, Optional
6
+
7
+from video_processor.sources.base import BaseSource, SourceFile
8
+
9
+logger = logging.getLogger(__name__)
10
+
11
+
12
+class RedditSource(BaseSource):
13
+ """
14
+ Fetch Reddit posts and comments via the public JSON API.
15
+
16
+ No auth required for public posts. Append .json to any Reddit URL.
17
+ Requires: pip install requests
18
+ """
19
+
20
+ def __init__(self, url: str):
21
+ """
22
+ Parameters
23
+ ----------
24
+ url : str
25
+ Reddit post or subreddit URL.
26
+ """
27
+ self.url = url.rstrip("/")
28
+
29
+ def authenticate(self) -> bool:
30
+ """No auth needed for public Reddit content."""
31
+ return True
32
+
33
+ def list_videos(
34
+ self,
35
+ folder_id: Optional[str] = None,
36
+ folder_path: Optional[str] = None,
37
+ patterns: Optional[List[str]] = None,
38
+ ) -> List[SourceFile]:
39
+ """Return a single SourceFile for the Reddit post."""
40
+ return [
41
+ SourceFile(
42
+ name=self.url.split("/")[-1] or "reddit_post",
43
+ id=self.url,
44
+ mime_type="text/plain",
45
+ )
46
+ ]
47
+
48
+ def download(self, file: SourceFile, destination: Path) -> Path:
49
+ """Download post and comments as plain text."""
50
+ destination = Path(destination)
51
+ destination.parent.mkdir(parents=True, exist_ok=True)
52
+ text = self.fetch_text()
53
+ destination.write_text(text, encoding="utf-8")
54
+ logger.info(f"Saved Reddit content to {destination}")
55
+ return destination
56
+
57
+ def fetch_text(self) -> str:
58
+ """Fetch the Reddit post and comments as structured text."""
59
+ import requests
60
+
61
+ json_url = self.url.rstrip("/") + ".json"
62
+ resp = requests.get(
63
+ json_url,
64
+ timeout=15,
65
+ headers={"User-Agent": "PlanOpticon/0.3 (source connector)"},
66
+ )
67
+ resp.raise_for_status()
68
+ data = resp.json()
69
+
70
+ lines = []
71
+ # Post data is in first listing
72
+ if isinstance(data, list) and len(data) > 0:
73
+ post = data[0]["data"]["children"][0]["data"]
74
+ lines.append(f"# {post.get('title', 'Untitled')}")
75
+ lines.append(f"by u/{post.get('author', '[deleted]')} | {post.get('score', 0)} points")
76
+ lines.append("")
77
+ if post.get("selftext"):
78
+ lines.append(post["selftext"])
79
+ lines.append("")
80
+
81
+ # Comments in second listing
82
+ if len(data) > 1:
83
+ lines.append("## Comments\n")
84
+ self._extract_comments(data[1]["data"]["children"], lines, depth=0)
85
+
86
+ return "\n".join(lines)
87
+
88
+ def _extract_comments(self, children: list, lines: list, depth: int) -> None:
89
+ """Recursively extract comment text."""
90
+ indent = " " * depth
91
+ for child in children:
92
+ if child.get("kind") != "t1":
93
+ continue
94
+ c = child["data"]
95
+ author = c.get("author", "[deleted]")
96
+ body = c.get("body", "")
97
+ lines.append(f"{indent}**{author}** ({c.get('score', 0)} pts):")
98
+ lines.append(f"{indent}{body}")
99
+ lines.append("")
100
+ # Recurse into replies
101
+ replies = c.get("replies")
102
+ if isinstance(replies, dict):
103
+ self._extract_comments(replies["data"]["children"], lines, depth + 1)
--- a/video_processor/sources/reddit_source.py
+++ b/video_processor/sources/reddit_source.py
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/reddit_source.py
+++ b/video_processor/sources/reddit_source.py
@@ -0,0 +1,103 @@
1 """Reddit source connector using the public JSON API."""
2
3 import logging
4 from pathlib import Path
5 from typing import List, Optional
6
7 from video_processor.sources.base import BaseSource, SourceFile
8
9 logger = logging.getLogger(__name__)
10
11
class RedditSource(BaseSource):
    """
    Fetch Reddit posts and comments via the public JSON API.

    No auth required for public posts. Append .json to any Reddit URL.
    Requires: pip install requests
    """

    def __init__(self, url: str):
        """
        Parameters
        ----------
        url : str
            Reddit post or subreddit URL.
        """
        self.url = url.rstrip("/")

    def authenticate(self) -> bool:
        """No auth needed for public Reddit content."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile for the Reddit post."""
        return [
            SourceFile(
                name=self.url.split("/")[-1] or "reddit_post",
                id=self.url,
                mime_type="text/plain",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download post and comments as plain text."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        text = self.fetch_text()
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved Reddit content to {destination}")
        return destination

    def fetch_text(self) -> str:
        """Fetch the Reddit post and comments as structured text."""
        import requests

        json_url = self.url.rstrip("/") + ".json"
        resp = requests.get(
            json_url,
            timeout=15,
            headers={"User-Agent": "PlanOpticon/0.3 (source connector)"},
        )
        resp.raise_for_status()
        data = resp.json()

        lines: List[str] = []
        # Post endpoints return a two-element list [post listing, comment
        # listing]; subreddit endpoints return a single dict. The old code
        # indexed data[1] outside the list check, so a subreddit URL raised
        # KeyError. Keep ALL indexing behind the isinstance guard.
        if isinstance(data, list) and data:
            children = data[0].get("data", {}).get("children", [])
            if children:
                post = children[0]["data"]
                lines.append(f"# {post.get('title', 'Untitled')}")
                lines.append(
                    f"by u/{post.get('author', '[deleted]')} | {post.get('score', 0)} points"
                )
                lines.append("")
                if post.get("selftext"):
                    lines.append(post["selftext"])
                    lines.append("")

            # Comments in second listing
            if len(data) > 1:
                lines.append("## Comments\n")
                self._extract_comments(data[1]["data"]["children"], lines, depth=0)

        return "\n".join(lines)

    def _extract_comments(self, children: list, lines: list, depth: int) -> None:
        """Recursively extract comment text, indenting two spaces per reply level."""
        indent = "  " * depth
        for child in children:
            if child.get("kind") != "t1":  # t1 = comment; skip "more" stubs etc.
                continue
            c = child["data"]
            author = c.get("author", "[deleted]")
            body = c.get("body", "")
            lines.append(f"{indent}**{author}** ({c.get('score', 0)} pts):")
            lines.append(f"{indent}{body}")
            lines.append("")
            # Recurse into replies ("replies" is "" when empty, dict otherwise).
            replies = c.get("replies")
            if isinstance(replies, dict):
                self._extract_comments(replies["data"]["children"], lines, depth + 1)
--- a/video_processor/sources/rss_source.py
+++ b/video_processor/sources/rss_source.py
@@ -0,0 +1,114 @@
1
+"""RSS/Atom feed source connector."""
2
+
3
+import logging
4
+from pathlib import Path
5
+from typing import List, Optional
6
+
7
+from video_processor.sources.base import BaseSource, SourceFile
8
+
9
+logger = logging.getLogger(__name__)
10
+
11
+
12
+class RSSSource(BaseSource):
13
+ """
14
+ Parse RSS/Atom feeds and extract entries as text documents.
15
+
16
+ Optional: pip install feedparser (falls back to xml.etree.ElementTree)
17
+ Requires: pip install requests
18
+ """
19
+
20
+ def __init__(self, url: str, max_entries: int = 50):
21
+ self.url = url
22
+ self.max_entries = max_entries
23
+ self._entries: List[dict] = []
24
+
25
+ def authenticate(self) -> bool:
26
+ """No auth needed for public feeds."""
27
+ return True
28
+
29
+ def _parse_feed(self) -> None:
30
+ """Fetch and parse the feed."""
31
+ if self._entries:
32
+ return
33
+
34
+ import requests
35
+
36
+ resp = requests.get(self.url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
37
+ resp.raise_for_status()
38
+
39
+ try:
40
+ import feedparser
41
+
42
+ feed = feedparser.parse(resp.text)
43
+ for entry in feed.entries[: self.max_entries]:
44
+ self._entries.append(
45
+ {
46
+ "title": entry.get("title", "Untitled"),
47
+ "link": entry.get("link", ""),
48
+ "summary": entry.get("summary", ""),
49
+ "published": entry.get("published", ""),
50
+ "id": entry.get("id", entry.get("link", "")),
51
+ }
52
+ )
53
+ except ImportError:
54
+ logger.debug("feedparser not available, using xml.etree fallback")
55
+ self._parse_xml(resp.text)
56
+
57
+ def _parse_xml(self, text: str) -> None:
58
+ """Fallback parser using stdlib xml.etree."""
59
+ import xml.etree.ElementTree as ET
60
+
61
+ root = ET.fromstring(text)
62
+ # Handle RSS 2.0
63
+ ns = {"atom": "http://www.w3.org/2005/Atom"}
64
+ items = root.findall(".//item") or root.findall(".//atom:entry", ns)
65
+ for item in items[: self.max_entries]:
66
+ title = (
67
+ item.findtext("title") or item.findtext("atom:title", namespaces=ns) or "Untitled"
68
+ )
69
+ link = item.findtext("link") or ""
70
+ if not link:
71
+ link_el = item.find("atom:link", ns)
72
+ link = link_el.get("href", "") if link_el is not None else ""
73
+ desc = (
74
+ item.findtext("description") or item.findtext("atom:summary", namespaces=ns) or ""
75
+ )
76
+ pub = item.findtext("pubDate") or item.findtext("atom:published", namespaces=ns) or ""
77
+ self._entries.append(
78
+ {"title": title, "link": link, "summary": desc, "published": pub, "id": link}
79
+ )
80
+
81
+ def list_videos(
82
+ self,
83
+ folder_id: Optional[str] = None,
84
+ folder_path: Optional[str] = None,
85
+ patterns: Optional[List[str]] = None,
86
+ ) -> List[SourceFile]:
87
+ """List feed entries as SourceFiles."""
88
+ self._parse_feed()
89
+ return [
90
+ SourceFile(
91
+ name=e["title"], id=e["id"], mime_type="text/plain", modified_at=e["published"]
92
+ )
93
+ for e in self._entries
94
+ ]
95
+
96
+ def download(self, file: SourceFile, destination: Path) -> Path:
97
+ """Write an entry's content as a text file."""
98
+ self._parse_feed()
99
+ destination = Path(destination)
100
+ destination.parent.mkdir(parents=True, exist_ok=True)
101
+
102
+ entry = next((e for e in self._entries if e["id"] == file.id), None)
103
+ if not entry:
104
+ raise ValueError(f"Entry not found: {file.id}")
105
+
106
+ text = (
107
+ f"# {entry['title']}\n\n"
108
+ f"Published: {entry['published']}\n"
109
+ f"Link: {entry['link']}\n\n"
110
+ f"{entry['summary']}"
111
+ )
112
+ destination.write_text(text, encoding="utf-8")
113
+ logger.info(f"Saved RSS entry to {destination}")
114
+ return destination
--- a/video_processor/sources/rss_source.py
+++ b/video_processor/sources/rss_source.py
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/rss_source.py
+++ b/video_processor/sources/rss_source.py
@@ -0,0 +1,114 @@
1 """RSS/Atom feed source connector."""
2
3 import logging
4 from pathlib import Path
5 from typing import List, Optional
6
7 from video_processor.sources.base import BaseSource, SourceFile
8
9 logger = logging.getLogger(__name__)
10
11
class RSSSource(BaseSource):
    """
    Parse RSS/Atom feeds and extract entries as text documents.

    Optional: pip install feedparser (falls back to xml.etree.ElementTree)
    Requires: pip install requests
    """

    def __init__(self, url: str, max_entries: int = 50):
        """
        Parameters
        ----------
        url : str
            Feed URL to fetch.
        max_entries : int
            Maximum number of entries to parse.
        """
        self.url = url
        self.max_entries = max_entries
        # Parsed entry dicts: {"title", "link", "summary", "published", "id"}.
        self._entries: List[dict] = []

    def authenticate(self) -> bool:
        """No auth needed for public feeds."""
        return True

    def _parse_feed(self) -> None:
        """Fetch and parse the feed (cached after first success)."""
        if self._entries:
            return

        import requests

        resp = requests.get(self.url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Guard ONLY the optional import. The old code wrapped the entire
        # parsing loop, so any ImportError raised during parsing was misread
        # as "feedparser missing" and rerouted to the XML fallback.
        try:
            import feedparser
        except ImportError:
            logger.debug("feedparser not available, using xml.etree fallback")
            self._parse_xml(resp.text)
            return

        feed = feedparser.parse(resp.text)
        for entry in feed.entries[: self.max_entries]:
            self._entries.append(
                {
                    "title": entry.get("title", "Untitled"),
                    "link": entry.get("link", ""),
                    "summary": entry.get("summary", ""),
                    "published": entry.get("published", ""),
                    "id": entry.get("id", entry.get("link", "")),
                }
            )

    def _parse_xml(self, text: str) -> None:
        """Fallback parser using stdlib xml.etree (handles RSS 2.0 and Atom)."""
        import xml.etree.ElementTree as ET

        root = ET.fromstring(text)
        ns = {"atom": "http://www.w3.org/2005/Atom"}
        # RSS 2.0 uses <item>; Atom uses namespaced <entry>.
        items = root.findall(".//item") or root.findall(".//atom:entry", ns)
        for item in items[: self.max_entries]:
            title = (
                item.findtext("title") or item.findtext("atom:title", namespaces=ns) or "Untitled"
            )
            link = item.findtext("link") or ""
            if not link:
                # Atom carries the URL in the <link href="..."> attribute.
                link_el = item.find("atom:link", ns)
                link = link_el.get("href", "") if link_el is not None else ""
            desc = (
                item.findtext("description") or item.findtext("atom:summary", namespaces=ns) or ""
            )
            pub = item.findtext("pubDate") or item.findtext("atom:published", namespaces=ns) or ""
            self._entries.append(
                {"title": title, "link": link, "summary": desc, "published": pub, "id": link}
            )

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List feed entries as SourceFiles (folder args are unused for feeds)."""
        self._parse_feed()
        return [
            SourceFile(
                name=e["title"], id=e["id"], mime_type="text/plain", modified_at=e["published"]
            )
            for e in self._entries
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Write an entry's content as a text file.

        Raises
        ------
        ValueError
            If ``file.id`` does not match any parsed entry.
        """
        self._parse_feed()
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        entry = next((e for e in self._entries if e["id"] == file.id), None)
        if not entry:
            raise ValueError(f"Entry not found: {file.id}")

        text = (
            f"# {entry['title']}\n\n"
            f"Published: {entry['published']}\n"
            f"Link: {entry['link']}\n\n"
            f"{entry['summary']}"
        )
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved RSS entry to {destination}")
        return destination
--- a/video_processor/sources/twitter_source.py
+++ b/video_processor/sources/twitter_source.py
@@ -0,0 +1,129 @@
1
+"""Twitter/X source connector -- stub requiring auth or gallery-dl."""
2
+
3
+import logging
4
+from pathlib import Path
5
+from typing import List, Optional
6
+
7
+from video_processor.sources.base import BaseSource, SourceFile
8
+
9
+logger = logging.getLogger(__name__)
10
+
11
+
12
+class TwitterSource(BaseSource):
13
+ """
14
+ Fetch Twitter/X posts and threads.
15
+
16
+ Twitter API v2 requires authentication. This connector attempts to use
17
+ gallery-dl as a fallback for public tweets.
18
+
19
+ Auth options:
20
+ - Set TWITTER_BEARER_TOKEN env var for API v2 access
21
+ - Install gallery-dl for scraping public tweets: pip install gallery-dl
22
+ """
23
+
24
+ def __init__(self, url: str):
25
+ self.url = url
26
+ self._bearer_token: Optional[str] = None
27
+
28
+ def authenticate(self) -> bool:
29
+ """Check for Twitter API token or gallery-dl availability."""
30
+ import os
31
+
32
+ self._bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
33
+ if self._bearer_token:
34
+ return True
35
+
36
+ # Check for gallery-dl fallback
37
+ try:
38
+ import gallery_dl # noqa: F401
39
+
40
+ logger.info("Using gallery-dl for Twitter content extraction")
41
+ return True
42
+ except ImportError:
43
+ pass
44
+
45
+ logger.error(
46
+ "Twitter source requires either:\n"
47
+ " 1. TWITTER_BEARER_TOKEN env var (Twitter API v2)\n"
48
+ " 2. gallery-dl installed: pip install gallery-dl\n"
49
+ "Twitter API access: https://developer.twitter.com/en/portal/dashboard"
50
+ )
51
+ return False
52
+
53
+ def list_videos(
54
+ self,
55
+ folder_id: Optional[str] = None,
56
+ folder_path: Optional[str] = None,
57
+ patterns: Optional[List[str]] = None,
58
+ ) -> List[SourceFile]:
59
+ """Return a single SourceFile for the tweet/thread."""
60
+ return [
61
+ SourceFile(
62
+ name=self.url.split("/")[-1] or "tweet",
63
+ id=self.url,
64
+ mime_type="text/plain",
65
+ )
66
+ ]
67
+
68
+ def download(self, file: SourceFile, destination: Path) -> Path:
69
+ """Download tweet content as text."""
70
+ destination = Path(destination)
71
+ destination.parent.mkdir(parents=True, exist_ok=True)
72
+ text = self.fetch_text()
73
+ destination.write_text(text, encoding="utf-8")
74
+ logger.info(f"Saved Twitter content to {destination}")
75
+ return destination
76
+
77
+ def fetch_text(self) -> str:
78
+ """Extract tweet text via API or gallery-dl."""
79
+ if self._bearer_token:
80
+ return self._fetch_via_api()
81
+
82
+ try:
83
+ return self._fetch_via_gallery_dl()
84
+ except ImportError:
85
+ raise RuntimeError(
86
+ "No Twitter extraction method available. See authenticate() for setup."
87
+ )
88
+
89
+ def _fetch_via_api(self) -> str:
90
+ """Fetch tweet via Twitter API v2."""
91
+ import re
92
+
93
+ import requests
94
+
95
+ match = re.search(r"/status/(\d+)", self.url)
96
+ if not match:
97
+ raise ValueError(f"Could not extract tweet ID from: {self.url}")
98
+
99
+ tweet_id = match.group(1)
100
+ resp = requests.get(
101
+ f"https://api.twitter.com/2/tweets/{tweet_id}",
102
+ headers={"Authorization": f"Bearer {self._bearer_token}"},
103
+ params={"tweet.fields": "author_id,created_at,text"},
104
+ timeout=15,
105
+ )
106
+ resp.raise_for_status()
107
+ data = resp.json().get("data", {})
108
+ return f"{data.get('text', '')}\n\nCreated: {data.get('created_at', 'unknown')}"
109
+
110
+ def _fetch_via_gallery_dl(self) -> str:
111
+ """Use gallery-dl to extract tweet metadata."""
112
+ import json
113
+ import subprocess
114
+
115
+ result = subprocess.run(
116
+ ["gallery-dl", "--dump-json", self.url],
117
+ capture_output=True,
118
+ text=True,
119
+ timeout=30,
120
+ )
121
+ if result.returncode != 0:
122
+ raise RuntimeError(f"gallery-dl failed: {result.stderr}")
123
+
124
+ items = json.loads(result.stdout)
125
+ texts = []
126
+ for item in items if isinstance(items, list) else [items]:
127
+ if isinstance(item, dict):
128
+ texts.append(item.get("content", item.get("text", str(item))))
129
+ return "\n\n".join(texts) if texts else "No text content extracted."
--- a/video_processor/sources/twitter_source.py
+++ b/video_processor/sources/twitter_source.py
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/twitter_source.py
+++ b/video_processor/sources/twitter_source.py
@@ -0,0 +1,129 @@
1 """Twitter/X source connector -- stub requiring auth or gallery-dl."""
2
3 import logging
4 from pathlib import Path
5 from typing import List, Optional
6
7 from video_processor.sources.base import BaseSource, SourceFile
8
9 logger = logging.getLogger(__name__)
10
11
class TwitterSource(BaseSource):
    """
    Fetch Twitter/X posts and threads.

    Twitter API v2 requires authentication. This connector attempts to use
    gallery-dl as a fallback for public tweets.

    Auth options:
    - Set TWITTER_BEARER_TOKEN env var for API v2 access
    - Install gallery-dl for scraping public tweets: pip install gallery-dl
    """

    def __init__(self, url: str):
        self.url = url
        # Populated from the environment by authenticate() / fetch_text().
        self._bearer_token: Optional[str] = None

    def authenticate(self) -> bool:
        """Check for Twitter API token or gallery-dl availability."""
        import os

        self._bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
        if self._bearer_token:
            return True

        # Check for gallery-dl fallback
        try:
            import gallery_dl  # noqa: F401

            logger.info("Using gallery-dl for Twitter content extraction")
            return True
        except ImportError:
            pass

        logger.error(
            "Twitter source requires either:\n"
            "  1. TWITTER_BEARER_TOKEN env var (Twitter API v2)\n"
            "  2. gallery-dl installed: pip install gallery-dl\n"
            "Twitter API access: https://developer.twitter.com/en/portal/dashboard"
        )
        return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile for the tweet/thread."""
        return [
            SourceFile(
                name=self.url.split("/")[-1] or "tweet",
                id=self.url,
                mime_type="text/plain",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download tweet content as text."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        text = self.fetch_text()
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved Twitter content to {destination}")
        return destination

    def fetch_text(self) -> str:
        """Extract tweet text via API or gallery-dl.

        Raises
        ------
        RuntimeError
            If neither the API token nor the gallery-dl executable is available.
        """
        if self._bearer_token is None:
            # Read the token lazily so fetch_text() works even when the caller
            # skipped authenticate().
            import os

            self._bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
        if self._bearer_token:
            return self._fetch_via_api()

        try:
            return self._fetch_via_gallery_dl()
        except FileNotFoundError as err:
            # BUG FIX: a missing gallery-dl executable surfaces as
            # FileNotFoundError from subprocess.run, not ImportError -- the
            # previous `except ImportError` handler could never fire.
            raise RuntimeError(
                "No Twitter extraction method available. See authenticate() for setup."
            ) from err

    def _fetch_via_api(self) -> str:
        """Fetch tweet via Twitter API v2.

        Raises
        ------
        ValueError
            If no /status/<id> segment is found in the URL.
        """
        import re

        import requests

        match = re.search(r"/status/(\d+)", self.url)
        if not match:
            raise ValueError(f"Could not extract tweet ID from: {self.url}")

        tweet_id = match.group(1)
        resp = requests.get(
            f"https://api.twitter.com/2/tweets/{tweet_id}",
            headers={"Authorization": f"Bearer {self._bearer_token}"},
            params={"tweet.fields": "author_id,created_at,text"},
            timeout=15,
        )
        resp.raise_for_status()
        data = resp.json().get("data", {})
        return f"{data.get('text', '')}\n\nCreated: {data.get('created_at', 'unknown')}"

    def _fetch_via_gallery_dl(self) -> str:
        """Use the gallery-dl CLI to extract tweet metadata.

        Raises
        ------
        RuntimeError
            If gallery-dl exits non-zero.
        FileNotFoundError
            If the gallery-dl executable is not installed (propagates from
            subprocess; handled by fetch_text()).
        """
        import json
        import subprocess

        result = subprocess.run(
            ["gallery-dl", "--dump-json", self.url],
            capture_output=True,
            text=True,
            timeout=30,
        )
        if result.returncode != 0:
            raise RuntimeError(f"gallery-dl failed: {result.stderr}")

        items = json.loads(result.stdout)
        texts = []
        for item in items if isinstance(items, list) else [items]:
            if isinstance(item, dict):
                texts.append(item.get("content", item.get("text", str(item))))
        return "\n\n".join(texts) if texts else "No text content extracted."
--- a/video_processor/sources/web_source.py
+++ b/video_processor/sources/web_source.py
@@ -0,0 +1,90 @@
1
+"""Web page source connector for fetching and extracting text from URLs."""
2
+
3
+import logging
4
+import re
5
+from pathlib import Path
6
+from typing import List, Optional
7
+
8
+from video_processor.sources.base import BaseSource, SourceFile
9
+
10
+logger = logging.getLogger(__name__)
11
+
12
+
13
+def _strip_html_tags(html: str) -> str:
14
+ """Minimal HTML tag stripper using stdlib only."""
15
+ text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
16
+ text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
17
+ text = re.sub(r"<(nav|footer|header)[^>]*>.*?</\1>", "", text, flags=re.DOTALL | re.IGNORECASE)
18
+ text = re.sub(r"<[^>]+>", " ", text)
19
+ text = re.sub(r"\s+", " ", text).strip()
20
+ return text
21
+
22
+
23
+class WebSource(BaseSource):
24
+ """
25
+ Fetch web pages and extract main text content.
26
+
27
+ Uses requests + BeautifulSoup (optional) for content extraction.
28
+ Falls back to regex-based tag stripping if bs4 is unavailable.
29
+
30
+ Requires: pip install requests (included in most environments)
31
+ Optional: pip install beautifulsoup4 lxml
32
+ """
33
+
34
+ def __init__(self, url: str):
35
+ self.url = url
36
+ self._content: Optional[str] = None
37
+
38
+ def authenticate(self) -> bool:
39
+ """No auth needed for public web pages."""
40
+ return True
41
+
42
+ def list_videos(
43
+ self,
44
+ folder_id: Optional[str] = None,
45
+ folder_path: Optional[str] = None,
46
+ patterns: Optional[List[str]] = None,
47
+ ) -> List[SourceFile]:
48
+ """Return a single SourceFile representing the web page."""
49
+ return [
50
+ SourceFile(
51
+ name=self.url.split("/")[-1] or "page",
52
+ id=self.url,
53
+ mime_type="text/html",
54
+ )
55
+ ]
56
+
57
+ def download(self, file: SourceFile, destination: Path) -> Path:
58
+ """Download and save the extracted text content."""
59
+ destination = Path(destination)
60
+ destination.parent.mkdir(parents=True, exist_ok=True)
61
+ text = self.fetch_text()
62
+ destination.write_text(text, encoding="utf-8")
63
+ logger.info(f"Saved web content to {destination}")
64
+ return destination
65
+
66
+ def fetch_text(self) -> str:
67
+ """Fetch the URL and extract main text content."""
68
+ if self._content is not None:
69
+ return self._content
70
+
71
+ import requests
72
+
73
+ resp = requests.get(self.url, timeout=30, headers={"User-Agent": "PlanOpticon/0.3"})
74
+ resp.raise_for_status()
75
+
76
+ try:
77
+ from bs4 import BeautifulSoup
78
+
79
+ soup = BeautifulSoup(resp.text, "html.parser")
80
+ # Remove non-content elements
81
+ for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
82
+ tag.decompose()
83
+ # Prefer <article> or <main> if present
84
+ main = soup.find("article") or soup.find("main") or soup.find("body")
85
+ self._content = main.get_text(separator="\n", strip=True) if main else soup.get_text()
86
+ except ImportError:
87
+ logger.debug("beautifulsoup4 not available, using regex fallback")
88
+ self._content = _strip_html_tags(resp.text)
89
+
90
+ return self._content
--- a/video_processor/sources/web_source.py
+++ b/video_processor/sources/web_source.py
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/web_source.py
+++ b/video_processor/sources/web_source.py
@@ -0,0 +1,90 @@
1 """Web page source connector for fetching and extracting text from URLs."""
2
3 import logging
4 import re
5 from pathlib import Path
6 from typing import List, Optional
7
8 from video_processor.sources.base import BaseSource, SourceFile
9
10 logger = logging.getLogger(__name__)
11
12
13 def _strip_html_tags(html: str) -> str:
14 """Minimal HTML tag stripper using stdlib only."""
15 text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
16 text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
17 text = re.sub(r"<(nav|footer|header)[^>]*>.*?</\1>", "", text, flags=re.DOTALL | re.IGNORECASE)
18 text = re.sub(r"<[^>]+>", " ", text)
19 text = re.sub(r"\s+", " ", text).strip()
20 return text
21
22
class WebSource(BaseSource):
    """
    Fetch web pages and extract main text content.

    Uses requests + BeautifulSoup (optional) for content extraction.
    Falls back to regex-based tag stripping if bs4 is unavailable.

    Requires: pip install requests (included in most environments)
    Optional: pip install beautifulsoup4 lxml
    """

    def __init__(self, url: str):
        self.url = url
        # Memoized extracted text; populated on first fetch_text() call.
        self._content: Optional[str] = None

    def authenticate(self) -> bool:
        """No auth needed for public web pages."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the web page."""
        return [
            SourceFile(
                name=self.url.split("/")[-1] or "page",
                id=self.url,
                mime_type="text/html",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download and save the extracted text content."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        text = self.fetch_text()
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved web content to {destination}")
        return destination

    def fetch_text(self) -> str:
        """Fetch the URL and extract main text content (memoized)."""
        if self._content is not None:
            return self._content

        import requests

        resp = requests.get(self.url, timeout=30, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Guard ONLY the optional import. Previously all bs4 parsing sat in
        # the try, so any ImportError during parsing was silently downgraded
        # to the much cruder regex fallback.
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            logger.debug("beautifulsoup4 not available, using regex fallback")
            self._content = _strip_html_tags(resp.text)
            return self._content

        soup = BeautifulSoup(resp.text, "html.parser")
        # Remove non-content elements
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()
        # Prefer <article> or <main> if present
        main = soup.find("article") or soup.find("main") or soup.find("body")
        self._content = main.get_text(separator="\n", strip=True) if main else soup.get_text()
        return self._content
--- a/video_processor/sources/youtube_source.py
+++ b/video_processor/sources/youtube_source.py
@@ -0,0 +1,118 @@
1
+"""YouTube source connector using yt-dlp for video/audio download and caption extraction."""
2
+
3
+import logging
4
+import re
5
+from pathlib import Path
6
+from typing import List, Optional
7
+
8
+from video_processor.sources.base import BaseSource, SourceFile
9
+
10
+logger = logging.getLogger(__name__)
11
+
12
+_YT_URL_PATTERN = re.compile(
13
+ r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/shorts/)([\w-]{11})"
14
+)
15
+
16
+
17
+def _extract_video_id(url: str) -> str:
18
+ """Extract the 11-character video ID from a YouTube URL."""
19
+ match = _YT_URL_PATTERN.search(url)
20
+ if not match:
21
+ raise ValueError(f"Could not extract YouTube video ID from: {url}")
22
+ return match.group(1)
23
+
24
+
25
+class YouTubeSource(BaseSource):
26
+ """
27
+ Download YouTube videos/audio and extract captions via yt-dlp.
28
+
29
+ Requires: pip install yt-dlp
30
+ """
31
+
32
+ def __init__(self, url: str, audio_only: bool = False):
33
+ self.url = url
34
+ self.video_id = _extract_video_id(url)
35
+ self.audio_only = audio_only
36
+
37
+ def authenticate(self) -> bool:
38
+ """No auth needed for public videos. Returns True if yt-dlp is available."""
39
+ try:
40
+ import yt_dlp # noqa: F401
41
+
42
+ return True
43
+ except ImportError:
44
+ logger.error("yt-dlp not installed. Run: pip install yt-dlp")
45
+ return False
46
+
47
+ def list_videos(
48
+ self,
49
+ folder_id: Optional[str] = None,
50
+ folder_path: Optional[str] = None,
51
+ patterns: Optional[List[str]] = None,
52
+ ) -> List[SourceFile]:
53
+ """Return a single SourceFile representing the YouTube video."""
54
+ import yt_dlp
55
+
56
+ with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
57
+ info = ydl.extract_info(self.url, download=False)
58
+
59
+ return [
60
+ SourceFile(
61
+ name=info.get("title", self.video_id),
62
+ id=self.video_id,
63
+ size_bytes=info.get("filesize"),
64
+ mime_type="audio/webm" if self.audio_only else "video/mp4",
65
+ )
66
+ ]
67
+
68
+ def download(self, file: SourceFile, destination: Path) -> Path:
69
+ """Download the video or audio to destination path."""
70
+ import yt_dlp
71
+
72
+ destination = Path(destination)
73
+ destination.parent.mkdir(parents=True, exist_ok=True)
74
+
75
+ opts = {
76
+ "outtmpl": str(destination),
77
+ "quiet": True,
78
+ }
79
+ if self.audio_only:
80
+ opts["format"] = "bestaudio/best"
81
+ opts["postprocessors"] = [{"key": "FFmpegExtractAudio", "preferredcodec": "mp3"}]
82
+ else:
83
+ opts["format"] = "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"
84
+
85
+ with yt_dlp.YoutubeDL(opts) as ydl:
86
+ ydl.download([self.url])
87
+
88
+ logger.info(f"Downloaded YouTube video {self.video_id} to {destination}")
89
+ return destination
90
+
91
+ def fetch_captions(self, lang: str = "en") -> Optional[str]:
92
+ """Extract auto-generated or manual captions as plain text."""
93
+ import yt_dlp
94
+
95
+ opts = {
96
+ "quiet": True,
97
+ "writeautomaticsub": True,
98
+ "writesubtitles": True,
99
+ "subtitleslangs": [lang],
100
+ "skip_download": True,
101
+ }
102
+ with yt_dlp.YoutubeDL(opts) as ydl:
103
+ info = ydl.extract_info(self.url, download=False)
104
+
105
+ subs = info.get("subtitles", {}).get(lang) or info.get("automatic_captions", {}).get(lang)
106
+ if not subs:
107
+ logger.warning(f"No captions found for language '{lang}'")
108
+ return None
109
+
110
+ # Prefer vtt/srv format for text extraction
111
+ for fmt in subs:
112
+ if fmt.get("ext") in ("vtt", "srv3", "json3"):
113
+ import requests
114
+
115
+ resp = requests.get(fmt["url"], timeout=30)
116
+ return resp.text
117
+
118
+ return None
--- a/video_processor/sources/youtube_source.py
+++ b/video_processor/sources/youtube_source.py
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/youtube_source.py
+++ b/video_processor/sources/youtube_source.py
@@ -0,0 +1,118 @@
1 """YouTube source connector using yt-dlp for video/audio download and caption extraction."""
2
3 import logging
4 import re
5 from pathlib import Path
6 from typing import List, Optional
7
8 from video_processor.sources.base import BaseSource, SourceFile
9
10 logger = logging.getLogger(__name__)
11
# Matches watch (v= in any query position), shorts, embed, live, and youtu.be
# URL forms; capture group 1 is the 11-character video ID.
_YT_URL_PATTERN = re.compile(
    r"(?:youtube\.com/(?:watch\?(?:[^#]*&)?v=|shorts/|embed/|live/)|youtu\.be/)"
    r"([\w-]{11})"
)


def _extract_video_id(url: str) -> str:
    """Extract the 11-character video ID from a YouTube URL.

    Generalized to accept ``v=`` anywhere in the query string (not only as the
    first parameter) and ``embed/`` / ``live/`` paths, in addition to the
    ``watch?v=``, ``youtu.be/`` and ``shorts/`` forms.

    Raises:
        ValueError: If no video ID can be found in *url*.
    """
    match = _YT_URL_PATTERN.search(url)
    if not match:
        raise ValueError(f"Could not extract YouTube video ID from: {url}")
    return match.group(1)
23
24
class YouTubeSource(BaseSource):
    """
    Download YouTube videos/audio and extract captions via yt-dlp.

    yt-dlp is imported lazily inside each method so importing this module does
    not require the optional dependency.

    Requires: pip install yt-dlp
    """

    def __init__(self, url: str, audio_only: bool = False):
        """
        Args:
            url: A YouTube watch/youtu.be/shorts URL; the video ID is parsed
                eagerly so an unrecognized URL fails fast with ValueError.
            audio_only: When True, download() extracts an mp3 audio track
                instead of the full video.
        """
        self.url = url
        self.video_id = _extract_video_id(url)
        self.audio_only = audio_only

    def authenticate(self) -> bool:
        """No auth needed for public videos. Returns True if yt-dlp is available."""
        try:
            import yt_dlp  # noqa: F401

            return True
        except ImportError:
            logger.error("yt-dlp not installed. Run: pip install yt-dlp")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the YouTube video.

        The folder/pattern parameters exist only to satisfy the BaseSource
        interface and are ignored. Performs a metadata-only probe (no download).
        """
        import yt_dlp

        with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
            info = ydl.extract_info(self.url, download=False)

        return [
            SourceFile(
                name=info.get("title", self.video_id),
                id=self.video_id,
                size_bytes=info.get("filesize"),  # may be None when yt-dlp can't size it
                mime_type="audio/webm" if self.audio_only else "video/mp4",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download the video or audio to destination path.

        Args:
            file: Unused beyond the interface contract; the video is identified
                by ``self.url``.
            destination: Target path; parent directories are created.

        Returns:
            The destination path passed in.
        """
        import yt_dlp

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        opts = {
            "outtmpl": str(destination),
            "quiet": True,
        }
        if self.audio_only:
            opts["format"] = "bestaudio/best"
            # NOTE(review): FFmpegExtractAudio writes its output with an .mp3
            # extension, so the file on disk may not match *destination* when
            # its suffix differs — confirm callers pass an .mp3 destination.
            opts["postprocessors"] = [{"key": "FFmpegExtractAudio", "preferredcodec": "mp3"}]
        else:
            opts["format"] = "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"

        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([self.url])

        # Lazy %-args: the message is only formatted when INFO is enabled.
        logger.info("Downloaded YouTube video %s to %s", self.video_id, destination)
        return destination

    def fetch_captions(self, lang: str = "en") -> Optional[str]:
        """Fetch manual or auto-generated captions for *lang*.

        Manual subtitles are preferred over automatic captions when both exist.

        Returns:
            The raw subtitle document (vtt/srv3/json3 markup — NOT stripped to
            plain text), or None when no captions exist for *lang* or no
            supported format is offered.
        """
        import yt_dlp

        opts = {
            "quiet": True,
            "writeautomaticsub": True,
            "writesubtitles": True,
            "subtitleslangs": [lang],
            "skip_download": True,
        }
        with yt_dlp.YoutubeDL(opts) as ydl:
            info = ydl.extract_info(self.url, download=False)

        subs = info.get("subtitles", {}).get(lang) or info.get("automatic_captions", {}).get(lang)
        if not subs:
            logger.warning("No captions found for language '%s'", lang)
            return None

        # Prefer vtt/srv format for text extraction
        for fmt in subs:
            if fmt.get("ext") in ("vtt", "srv3", "json3"):
                # Deferred import: requests is only required when a caption
                # document is actually fetched.
                import requests

                resp = requests.get(fmt["url"], timeout=30)
                return resp.text

        return None

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button