PlanOpticon

feat(sources): add YouTube, web, GitHub, Reddit, HN, RSS, podcast, Twitter, arXiv connectors Each connector implements the SourceConnector pattern with optional external dependencies (yt-dlp, feedparser, beautifulsoup4).

lmata 2026-03-07 22:10 trunk
Commit b1d426992ff933d5c74d4f32a0257ce7eadb03ee76026a0edbaf442c0c60f746
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -1,5 +1,40 @@
1
-"""Cloud source integrations for fetching videos from remote storage."""
1
+"""Cloud and web source integrations for fetching content from remote sources."""
22
33
from video_processor.sources.base import BaseSource, SourceFile
44
5
-__all__ = ["BaseSource", "SourceFile"]
5
+__all__ = [
6
+ "BaseSource",
7
+ "SourceFile",
8
+ "ArxivSource",
9
+ "GitHubSource",
10
+ "GoogleDriveSource",
11
+ "HackerNewsSource",
12
+ "PodcastSource",
13
+ "RedditSource",
14
+ "RSSSource",
15
+ "TwitterSource",
16
+ "WebSource",
17
+ "YouTubeSource",
18
+]
19
+
20
+
21
+def __getattr__(name: str):
22
+ """Lazy imports to avoid pulling in optional dependencies at import time."""
23
+ _lazy_map = {
24
+ "ArxivSource": "video_processor.sources.arxiv_source",
25
+ "GitHubSource": "video_processor.sources.github_source",
26
+ "GoogleDriveSource": "video_processor.sources.google_drive",
27
+ "HackerNewsSource": "video_processor.sources.hackernews_source",
28
+ "PodcastSource": "video_processor.sources.podcast_source",
29
+ "RedditSource": "video_processor.sources.reddit_source",
30
+ "RSSSource": "video_processor.sources.rss_source",
31
+ "TwitterSource": "video_processor.sources.twitter_source",
32
+ "WebSource": "video_processor.sources.web_source",
33
+ "YouTubeSource": "video_processor.sources.youtube_source",
34
+ }
35
+ if name in _lazy_map:
36
+ import importlib
37
+
38
+ module = importlib.import_module(_lazy_map[name])
39
+ return getattr(module, name)
40
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
641
742
ADDED video_processor/sources/arxiv_source.py
843
ADDED video_processor/sources/github_source.py
944
ADDED video_processor/sources/hackernews_source.py
1045
ADDED video_processor/sources/podcast_source.py
1146
ADDED video_processor/sources/reddit_source.py
1247
ADDED video_processor/sources/rss_source.py
1348
ADDED video_processor/sources/twitter_source.py
1449
ADDED video_processor/sources/web_source.py
1550
ADDED video_processor/sources/youtube_source.py
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -1,5 +1,40 @@
1 """Cloud source integrations for fetching videos from remote storage."""
2
3 from video_processor.sources.base import BaseSource, SourceFile
4
5 __all__ = ["BaseSource", "SourceFile"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
7 ADDED video_processor/sources/arxiv_source.py
8 ADDED video_processor/sources/github_source.py
9 ADDED video_processor/sources/hackernews_source.py
10 ADDED video_processor/sources/podcast_source.py
11 ADDED video_processor/sources/reddit_source.py
12 ADDED video_processor/sources/rss_source.py
13 ADDED video_processor/sources/twitter_source.py
14 ADDED video_processor/sources/web_source.py
15 ADDED video_processor/sources/youtube_source.py
--- video_processor/sources/__init__.py
+++ video_processor/sources/__init__.py
@@ -1,5 +1,40 @@
1 """Cloud and web source integrations for fetching content from remote sources."""
2
3 from video_processor.sources.base import BaseSource, SourceFile
4
5 __all__ = [
6 "BaseSource",
7 "SourceFile",
8 "ArxivSource",
9 "GitHubSource",
10 "GoogleDriveSource",
11 "HackerNewsSource",
12 "PodcastSource",
13 "RedditSource",
14 "RSSSource",
15 "TwitterSource",
16 "WebSource",
17 "YouTubeSource",
18 ]
19
20
21 def __getattr__(name: str):
22 """Lazy imports to avoid pulling in optional dependencies at import time."""
23 _lazy_map = {
24 "ArxivSource": "video_processor.sources.arxiv_source",
25 "GitHubSource": "video_processor.sources.github_source",
26 "GoogleDriveSource": "video_processor.sources.google_drive",
27 "HackerNewsSource": "video_processor.sources.hackernews_source",
28 "PodcastSource": "video_processor.sources.podcast_source",
29 "RedditSource": "video_processor.sources.reddit_source",
30 "RSSSource": "video_processor.sources.rss_source",
31 "TwitterSource": "video_processor.sources.twitter_source",
32 "WebSource": "video_processor.sources.web_source",
33 "YouTubeSource": "video_processor.sources.youtube_source",
34 }
35 if name in _lazy_map:
36 import importlib
37
38 module = importlib.import_module(_lazy_map[name])
39 return getattr(module, name)
40 raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
41
42 ADDED video_processor/sources/arxiv_source.py
43 ADDED video_processor/sources/github_source.py
44 ADDED video_processor/sources/hackernews_source.py
45 ADDED video_processor/sources/podcast_source.py
46 ADDED video_processor/sources/reddit_source.py
47 ADDED video_processor/sources/rss_source.py
48 ADDED video_processor/sources/twitter_source.py
49 ADDED video_processor/sources/web_source.py
50 ADDED video_processor/sources/youtube_source.py
--- a/video_processor/sources/arxiv_source.py
+++ b/video_processor/sources/arxiv_source.py
@@ -0,0 +1,117 @@
1
+"""arXiv source connector for fetching paper metadata and PDFs."""
2
+
3
+import logging
4
+import re
5
+from pathlib import Path
6
+from typing import List, Optional
7
+
8
+from video_processor.sources.base import BaseSource, SourceFile
9
+
10
+logger = logging.getLogger(__name__)
11
+
12
+_ARXIV_ID_PATTERN = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?")
13
+ARXIV_API = "http://export.arxiv.org/api/query"
14
+
15
+
16
+def _extract_arxiv_id(url_or_id: str) -> str:
17
+ """Extract arXiv paper ID from a URL or bare ID string."""
18
+ match = _ARXIV_ID_PATTERN.search(url_or_id)
19
+ if not match:
20
+ raise ValueError(f"Could not extract arXiv ID from: {url_or_id}")
21
+ return match.group(0)
22
+
23
+
24
+class ArxivSource(BaseSource):
25
+ """
26
+ Fetch arXiv paper metadata and PDF.
27
+
28
+ Uses the arXiv API (Atom feed) for metadata and direct PDF download.
29
+ Requires: pip install requests
30
+ """
31
+
32
+ def __init__(self, url_or_id: str):
33
+ self.arxiv_id = _extract_arxiv_id(url_or_id)
34
+ self._metadata: Optional[dict] = None
35
+
36
+ def authenticate(self) -> bool:
37
+ """No auth needed for arXiv."""
38
+ return True
39
+
40
+ def _fetch_metadata(self) -> dict:
41
+ """Fetch paper metadata from the arXiv API."""
42
+ if self._metadata:
43
+ return self._metadata
44
+
45
+ import xml.etree.ElementTree as ET
46
+
47
+ import requests
48
+
49
+ resp = requests.get(ARXIV_API, params={"id_list": self.arxiv_id}, timeout=15)
50
+ resp.raise_for_status()
51
+
52
+ ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
53
+ root = ET.fromstring(resp.text)
54
+ entry = root.find("atom:entry", ns)
55
+ if entry is None:
56
+ raise ValueError(f"Paper not found: {self.arxiv_id}")
57
+
58
+ self._metadata = {
59
+ "title": (entry.findtext("atom:title", namespaces=ns) or "").strip(),
60
+ "summary": (entry.findtext("atom:summary", namespaces=ns) or "").strip(),
61
+ "authors": [
62
+ a.findtext("atom:name", namespaces=ns) or ""
63
+ for a in entry.findall("atom:author", ns)
64
+ ],
65
+ "published": entry.findtext("atom:published", namespaces=ns) or "",
66
+ "pdf_url": f"https://arxiv.org/pdf/{self.arxiv_id}.pdf",
67
+ }
68
+ return self._metadata
69
+
70
+ def list_videos(
71
+ self,
72
+ folder_id: Optional[str] = None,
73
+ folder_path: Optional[str] = None,
74
+ patterns: Optional[List[str]] = None,
75
+ ) -> List[SourceFile]:
76
+ """Return SourceFiles for the paper metadata and PDF."""
77
+ meta = self._fetch_metadata()
78
+ return [
79
+ SourceFile(
80
+ name=f"{meta['title']} (metadata)",
81
+ id=f"meta:{self.arxiv_id}",
82
+ mime_type="text/plain",
83
+ ),
84
+ SourceFile(
85
+ name=f"{meta['title']}.pdf",
86
+ id=f"pdf:{self.arxiv_id}",
87
+ mime_type="application/pdf",
88
+ ),
89
+ ]
90
+
91
+ def download(self, file: SourceFile, destination: Path) -> Path:
92
+ """Download paper metadata as text or the PDF file."""
93
+ import requests
94
+
95
+ destination = Path(destination)
96
+ destination.parent.mkdir(parents=True, exist_ok=True)
97
+ meta = self._fetch_metadata()
98
+
99
+ if file.id.startswith("meta:"):
100
+ authors = ", ".join(meta["authors"])
101
+ text = (
102
+ f"# {meta['title']}\n\n"
103
+ f"Authors: {authors}\n"
104
+ f"Published: {meta['published']}\n"
105
+ f"arXiv: {self.arxiv_id}\n\n"
106
+ f"## Abstract\n\n{meta['summary']}"
107
+ )
108
+ destination.write_text(text, encoding="utf-8")
109
+ elif file.id.startswith("pdf:"):
110
+ resp = requests.get(meta["pdf_url"], timeout=60, stream=True)
111
+ resp.raise_for_status()
112
+ with open(destination, "wb") as f:
113
+ for chunk in resp.iter_content(chunk_size=8192):
114
+ f.write(chunk)
115
+
116
+ logger.info(f"Downloaded arXiv {self.arxiv_id} to {destination}")
117
+ return destination
--- a/video_processor/sources/arxiv_source.py
+++ b/video_processor/sources/arxiv_source.py
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/arxiv_source.py
+++ b/video_processor/sources/arxiv_source.py
@@ -0,0 +1,117 @@
1 """arXiv source connector for fetching paper metadata and PDFs."""
2
3 import logging
4 import re
5 from pathlib import Path
6 from typing import List, Optional
7
8 from video_processor.sources.base import BaseSource, SourceFile
9
10 logger = logging.getLogger(__name__)
11
12 _ARXIV_ID_PATTERN = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?")
13 ARXIV_API = "http://export.arxiv.org/api/query"
14
15
16 def _extract_arxiv_id(url_or_id: str) -> str:
17 """Extract arXiv paper ID from a URL or bare ID string."""
18 match = _ARXIV_ID_PATTERN.search(url_or_id)
19 if not match:
20 raise ValueError(f"Could not extract arXiv ID from: {url_or_id}")
21 return match.group(0)
22
23
24 class ArxivSource(BaseSource):
25 """
26 Fetch arXiv paper metadata and PDF.
27
28 Uses the arXiv API (Atom feed) for metadata and direct PDF download.
29 Requires: pip install requests
30 """
31
32 def __init__(self, url_or_id: str):
33 self.arxiv_id = _extract_arxiv_id(url_or_id)
34 self._metadata: Optional[dict] = None
35
36 def authenticate(self) -> bool:
37 """No auth needed for arXiv."""
38 return True
39
40 def _fetch_metadata(self) -> dict:
41 """Fetch paper metadata from the arXiv API."""
42 if self._metadata:
43 return self._metadata
44
45 import xml.etree.ElementTree as ET
46
47 import requests
48
49 resp = requests.get(ARXIV_API, params={"id_list": self.arxiv_id}, timeout=15)
50 resp.raise_for_status()
51
52 ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
53 root = ET.fromstring(resp.text)
54 entry = root.find("atom:entry", ns)
55 if entry is None:
56 raise ValueError(f"Paper not found: {self.arxiv_id}")
57
58 self._metadata = {
59 "title": (entry.findtext("atom:title", namespaces=ns) or "").strip(),
60 "summary": (entry.findtext("atom:summary", namespaces=ns) or "").strip(),
61 "authors": [
62 a.findtext("atom:name", namespaces=ns) or ""
63 for a in entry.findall("atom:author", ns)
64 ],
65 "published": entry.findtext("atom:published", namespaces=ns) or "",
66 "pdf_url": f"https://arxiv.org/pdf/{self.arxiv_id}.pdf",
67 }
68 return self._metadata
69
70 def list_videos(
71 self,
72 folder_id: Optional[str] = None,
73 folder_path: Optional[str] = None,
74 patterns: Optional[List[str]] = None,
75 ) -> List[SourceFile]:
76 """Return SourceFiles for the paper metadata and PDF."""
77 meta = self._fetch_metadata()
78 return [
79 SourceFile(
80 name=f"{meta['title']} (metadata)",
81 id=f"meta:{self.arxiv_id}",
82 mime_type="text/plain",
83 ),
84 SourceFile(
85 name=f"{meta['title']}.pdf",
86 id=f"pdf:{self.arxiv_id}",
87 mime_type="application/pdf",
88 ),
89 ]
90
91 def download(self, file: SourceFile, destination: Path) -> Path:
92 """Download paper metadata as text or the PDF file."""
93 import requests
94
95 destination = Path(destination)
96 destination.parent.mkdir(parents=True, exist_ok=True)
97 meta = self._fetch_metadata()
98
99 if file.id.startswith("meta:"):
100 authors = ", ".join(meta["authors"])
101 text = (
102 f"# {meta['title']}\n\n"
103 f"Authors: {authors}\n"
104 f"Published: {meta['published']}\n"
105 f"arXiv: {self.arxiv_id}\n\n"
106 f"## Abstract\n\n{meta['summary']}"
107 )
108 destination.write_text(text, encoding="utf-8")
109 elif file.id.startswith("pdf:"):
110 resp = requests.get(meta["pdf_url"], timeout=60, stream=True)
111 resp.raise_for_status()
112 with open(destination, "wb") as f:
113 for chunk in resp.iter_content(chunk_size=8192):
114 f.write(chunk)
115
116 logger.info(f"Downloaded arXiv {self.arxiv_id} to {destination}")
117 return destination
--- a/video_processor/sources/github_source.py
+++ b/video_processor/sources/github_source.py
@@ -0,0 +1,156 @@
1
+"""GitHub source connector for fetching repo content, issues, and PRs."""
2
+
3
+import logging
4
+import os
5
+from pathlib import Path
6
+from typing import List, Optional
7
+
8
+from video_processor.sources.base import BaseSource, SourceFile
9
+
10
+logger = logging.getLogger(__name__)
11
+
12
+API_BASE = "https://api.github.com"
13
+
14
+
15
+class GitHubSource(BaseSource):
16
+ """
17
+ Fetch GitHub repository README, issues, and pull requests as text documents.
18
+
19
+ Auth: Set GITHUB_TOKEN env var, or use `gh auth token` output.
20
+ Requires: pip install requests
21
+ """
22
+
23
+ def __init__(self, repo: str, include_issues: bool = True, include_prs: bool = True):
24
+ """
25
+ Parameters
26
+ ----------
27
+ repo : str
28
+ GitHub repo in "owner/repo" format.
29
+ """
30
+ self.repo = repo
31
+ self.include_issues = include_issues
32
+ self.include_prs = include_prs
33
+ self._token: Optional[str] = None
34
+
35
+ def authenticate(self) -> bool:
36
+ """Authenticate via GITHUB_TOKEN env var or gh CLI."""
37
+ self._token = os.environ.get("GITHUB_TOKEN")
38
+ if not self._token:
39
+ try:
40
+ import subprocess
41
+
42
+ result = subprocess.run(["gh", "auth", "token"], capture_output=True, text=True)
43
+ if result.returncode == 0:
44
+ self._token = result.stdout.strip()
45
+ except FileNotFoundError:
46
+ pass
47
+ if not self._token:
48
+ logger.warning(
49
+ "No GitHub token found. Public repos only. Set GITHUB_TOKEN for private repos."
50
+ )
51
+ return True
52
+
53
+ def _headers(self) -> dict:
54
+ h = {"Accept": "application/vnd.github.v3+json"}
55
+ if self._token:
56
+ h["Authorization"] = f"Bearer {self._token}"
57
+ return h
58
+
59
+ def list_videos(
60
+ self,
61
+ folder_id: Optional[str] = None,
62
+ folder_path: Optional[str] = None,
63
+ patterns: Optional[List[str]] = None,
64
+ ) -> List[SourceFile]:
65
+ """List available documents (README, issues, PRs) as SourceFiles."""
66
+ import requests
67
+
68
+ files = []
69
+ # README
70
+ resp = requests.get(
71
+ f"{API_BASE}/repos/{self.repo}/readme", headers=self._headers(), timeout=15
72
+ )
73
+ if resp.ok:
74
+ files.append(SourceFile(name="README", id="readme", mime_type="text/markdown"))
75
+
76
+ # Issues
77
+ if self.include_issues:
78
+ resp = requests.get(
79
+ f"{API_BASE}/repos/{self.repo}/issues",
80
+ headers=self._headers(),
81
+ params={"state": "all", "per_page": 100},
82
+ timeout=15,
83
+ )
84
+ if resp.ok:
85
+ for issue in resp.json():
86
+ if "pull_request" not in issue:
87
+ files.append(
88
+ SourceFile(
89
+ name=f"Issue #{issue['number']}: {issue['title']}",
90
+ id=f"issue:{issue['number']}",
91
+ mime_type="text/plain",
92
+ )
93
+ )
94
+
95
+ # PRs
96
+ if self.include_prs:
97
+ resp = requests.get(
98
+ f"{API_BASE}/repos/{self.repo}/pulls",
99
+ headers=self._headers(),
100
+ params={"state": "all", "per_page": 100},
101
+ timeout=15,
102
+ )
103
+ if resp.ok:
104
+ for pr in resp.json():
105
+ files.append(
106
+ SourceFile(
107
+ name=f"PR #{pr['number']}: {pr['title']}",
108
+ id=f"pr:{pr['number']}",
109
+ mime_type="text/plain",
110
+ )
111
+ )
112
+
113
+ return files
114
+
115
+ def download(self, file: SourceFile, destination: Path) -> Path:
116
+ """Download a single document (README, issue, or PR) as text."""
117
+ import requests
118
+
119
+ destination = Path(destination)
120
+ destination.parent.mkdir(parents=True, exist_ok=True)
121
+
122
+ if file.id == "readme":
123
+ resp = requests.get(
124
+ f"{API_BASE}/repos/{self.repo}/readme",
125
+ headers={**self._headers(), "Accept": "application/vnd.github.v3.raw"},
126
+ timeout=15,
127
+ )
128
+ destination.write_text(resp.text, encoding="utf-8")
129
+ elif file.id.startswith("issue:"):
130
+ num = file.id.split(":")[1]
131
+ resp = requests.get(
132
+ f"{API_BASE}/repos/{self.repo}/issues/{num}",
133
+ headers=self._headers(),
134
+ timeout=15,
135
+ )
136
+ data = resp.json()
137
+ text = f"# {data['title']}\n\n{data.get('body', '') or ''}"
138
+ # Append comments
139
+ comments_resp = requests.get(data["comments_url"], headers=self._headers(), timeout=15)
140
+ if comments_resp.ok:
141
+ for c in comments_resp.json():
142
+ text += f"\n\n---\n**{c['user']['login']}**: {c.get('body', '')}"
143
+ destination.write_text(text, encoding="utf-8")
144
+ elif file.id.startswith("pr:"):
145
+ num = file.id.split(":")[1]
146
+ resp = requests.get(
147
+ f"{API_BASE}/repos/{self.repo}/pulls/{num}",
148
+ headers=self._headers(),
149
+ timeout=15,
150
+ )
151
+ data = resp.json()
152
+ text = f"# PR: {data['title']}\n\n{data.get('body', '') or ''}"
153
+ destination.write_text(text, encoding="utf-8")
154
+
155
+ logger.info(f"Downloaded {file.name} to {destination}")
156
+ return destination
--- a/video_processor/sources/github_source.py
+++ b/video_processor/sources/github_source.py
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/github_source.py
+++ b/video_processor/sources/github_source.py
@@ -0,0 +1,156 @@
1 """GitHub source connector for fetching repo content, issues, and PRs."""
2
3 import logging
4 import os
5 from pathlib import Path
6 from typing import List, Optional
7
8 from video_processor.sources.base import BaseSource, SourceFile
9
10 logger = logging.getLogger(__name__)
11
12 API_BASE = "https://api.github.com"
13
14
15 class GitHubSource(BaseSource):
16 """
17 Fetch GitHub repository README, issues, and pull requests as text documents.
18
19 Auth: Set GITHUB_TOKEN env var, or use `gh auth token` output.
20 Requires: pip install requests
21 """
22
23 def __init__(self, repo: str, include_issues: bool = True, include_prs: bool = True):
24 """
25 Parameters
26 ----------
27 repo : str
28 GitHub repo in "owner/repo" format.
29 """
30 self.repo = repo
31 self.include_issues = include_issues
32 self.include_prs = include_prs
33 self._token: Optional[str] = None
34
35 def authenticate(self) -> bool:
36 """Authenticate via GITHUB_TOKEN env var or gh CLI."""
37 self._token = os.environ.get("GITHUB_TOKEN")
38 if not self._token:
39 try:
40 import subprocess
41
42 result = subprocess.run(["gh", "auth", "token"], capture_output=True, text=True)
43 if result.returncode == 0:
44 self._token = result.stdout.strip()
45 except FileNotFoundError:
46 pass
47 if not self._token:
48 logger.warning(
49 "No GitHub token found. Public repos only. Set GITHUB_TOKEN for private repos."
50 )
51 return True
52
53 def _headers(self) -> dict:
54 h = {"Accept": "application/vnd.github.v3+json"}
55 if self._token:
56 h["Authorization"] = f"Bearer {self._token}"
57 return h
58
59 def list_videos(
60 self,
61 folder_id: Optional[str] = None,
62 folder_path: Optional[str] = None,
63 patterns: Optional[List[str]] = None,
64 ) -> List[SourceFile]:
65 """List available documents (README, issues, PRs) as SourceFiles."""
66 import requests
67
68 files = []
69 # README
70 resp = requests.get(
71 f"{API_BASE}/repos/{self.repo}/readme", headers=self._headers(), timeout=15
72 )
73 if resp.ok:
74 files.append(SourceFile(name="README", id="readme", mime_type="text/markdown"))
75
76 # Issues
77 if self.include_issues:
78 resp = requests.get(
79 f"{API_BASE}/repos/{self.repo}/issues",
80 headers=self._headers(),
81 params={"state": "all", "per_page": 100},
82 timeout=15,
83 )
84 if resp.ok:
85 for issue in resp.json():
86 if "pull_request" not in issue:
87 files.append(
88 SourceFile(
89 name=f"Issue #{issue['number']}: {issue['title']}",
90 id=f"issue:{issue['number']}",
91 mime_type="text/plain",
92 )
93 )
94
95 # PRs
96 if self.include_prs:
97 resp = requests.get(
98 f"{API_BASE}/repos/{self.repo}/pulls",
99 headers=self._headers(),
100 params={"state": "all", "per_page": 100},
101 timeout=15,
102 )
103 if resp.ok:
104 for pr in resp.json():
105 files.append(
106 SourceFile(
107 name=f"PR #{pr['number']}: {pr['title']}",
108 id=f"pr:{pr['number']}",
109 mime_type="text/plain",
110 )
111 )
112
113 return files
114
115 def download(self, file: SourceFile, destination: Path) -> Path:
116 """Download a single document (README, issue, or PR) as text."""
117 import requests
118
119 destination = Path(destination)
120 destination.parent.mkdir(parents=True, exist_ok=True)
121
122 if file.id == "readme":
123 resp = requests.get(
124 f"{API_BASE}/repos/{self.repo}/readme",
125 headers={**self._headers(), "Accept": "application/vnd.github.v3.raw"},
126 timeout=15,
127 )
128 destination.write_text(resp.text, encoding="utf-8")
129 elif file.id.startswith("issue:"):
130 num = file.id.split(":")[1]
131 resp = requests.get(
132 f"{API_BASE}/repos/{self.repo}/issues/{num}",
133 headers=self._headers(),
134 timeout=15,
135 )
136 data = resp.json()
137 text = f"# {data['title']}\n\n{data.get('body', '') or ''}"
138 # Append comments
139 comments_resp = requests.get(data["comments_url"], headers=self._headers(), timeout=15)
140 if comments_resp.ok:
141 for c in comments_resp.json():
142 text += f"\n\n---\n**{c['user']['login']}**: {c.get('body', '')}"
143 destination.write_text(text, encoding="utf-8")
144 elif file.id.startswith("pr:"):
145 num = file.id.split(":")[1]
146 resp = requests.get(
147 f"{API_BASE}/repos/{self.repo}/pulls/{num}",
148 headers=self._headers(),
149 timeout=15,
150 )
151 data = resp.json()
152 text = f"# PR: {data['title']}\n\n{data.get('body', '') or ''}"
153 destination.write_text(text, encoding="utf-8")
154
155 logger.info(f"Downloaded {file.name} to {destination}")
156 return destination
--- a/video_processor/sources/hackernews_source.py
+++ b/video_processor/sources/hackernews_source.py
@@ -0,0 +1,112 @@
1
+"""Hacker News source connector using the official Firebase API."""
2
+
3
+import logging
4
+from pathlib import Path
5
+from typing import List, Optional
6
+
7
+from video_processor.sources.base import BaseSource, SourceFile
8
+
9
+logger = logging.getLogger(__name__)
10
+
11
+HN_API = "https://hacker-news.firebaseio.com/v0"
12
+
13
+
14
+class HackerNewsSource(BaseSource):
15
+ """
16
+ Fetch Hacker News stories and comments via the public API.
17
+
18
+ API docs: https://github.com/HackerNews/API
19
+ Requires: pip install requests
20
+ """
21
+
22
+ def __init__(self, item_id: int, max_comments: int = 200):
23
+ """
24
+ Parameters
25
+ ----------
26
+ item_id : int
27
+ HN story/item ID (e.g., 12345678).
28
+ max_comments : int
29
+ Maximum number of comments to fetch (default 200).
30
+ """
31
+ self.item_id = item_id
32
+ self.max_comments = max_comments
33
+
34
+ def authenticate(self) -> bool:
35
+ """No auth needed for the HN API."""
36
+ return True
37
+
38
+ def list_videos(
39
+ self,
40
+ folder_id: Optional[str] = None,
41
+ folder_path: Optional[str] = None,
42
+ patterns: Optional[List[str]] = None,
43
+ ) -> List[SourceFile]:
44
+ """Return a single SourceFile for the HN story."""
45
+ return [
46
+ SourceFile(
47
+ name=f"hn_{self.item_id}",
48
+ id=str(self.item_id),
49
+ mime_type="text/plain",
50
+ )
51
+ ]
52
+
53
+ def download(self, file: SourceFile, destination: Path) -> Path:
54
+ """Download the story and comments as plain text."""
55
+ destination = Path(destination)
56
+ destination.parent.mkdir(parents=True, exist_ok=True)
57
+ text = self.fetch_text()
58
+ destination.write_text(text, encoding="utf-8")
59
+ logger.info(f"Saved HN story {self.item_id} to {destination}")
60
+ return destination
61
+
62
+ def _get_item(self, item_id: int) -> dict:
63
+ import requests
64
+
65
+ resp = requests.get(f"{HN_API}/item/{item_id}.json", timeout=10)
66
+ resp.raise_for_status()
67
+ return resp.json() or {}
68
+
69
+ def fetch_text(self) -> str:
70
+ """Fetch story and comments as structured text."""
71
+ story = self._get_item(self.item_id)
72
+ lines = []
73
+ lines.append(f"# {story.get('title', 'Untitled')}")
74
+ lines.append(f"by {story.get('by', 'unknown')} | {story.get('score', 0)} points")
75
+ if story.get("url"):
76
+ lines.append(f"URL: {story['url']}")
77
+ if story.get("text"):
78
+ lines.append(f"\n{story['text']}")
79
+ lines.append("")
80
+
81
+ # Fetch comments
82
+ kid_ids = story.get("kids", [])
83
+ if kid_ids:
84
+ lines.append("## Comments\n")
85
+ count = [0]
86
+ self._fetch_comments(kid_ids, lines, depth=0, count=count)
87
+
88
+ return "\n".join(lines)
89
+
90
+ def _fetch_comments(self, kid_ids: list, lines: list, depth: int, count: list) -> None:
91
+ """Recursively fetch and format comments."""
92
+ indent = " " * depth
93
+ for kid_id in kid_ids:
94
+ if count[0] >= self.max_comments:
95
+ return
96
+ try:
97
+ item = self._get_item(kid_id)
98
+ except Exception:
99
+ continue
100
+
101
+ if item.get("deleted") or item.get("dead"):
102
+ continue
103
+
104
+ count[0] += 1
105
+ author = item.get("by", "[deleted]")
106
+ text = item.get("text", "")
107
+ lines.append(f"{indent}**{author}**:")
108
+ lines.append(f"{indent}{text}")
109
+ lines.append("")
110
+
111
+ if item.get("kids"):
112
+ self._fetch_comments(item["kids"], lines, depth + 1, count)
--- a/video_processor/sources/hackernews_source.py
+++ b/video_processor/sources/hackernews_source.py
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/hackernews_source.py
+++ b/video_processor/sources/hackernews_source.py
@@ -0,0 +1,112 @@
1 """Hacker News source connector using the official Firebase API."""
2
3 import logging
4 from pathlib import Path
5 from typing import List, Optional
6
7 from video_processor.sources.base import BaseSource, SourceFile
8
9 logger = logging.getLogger(__name__)
10
11 HN_API = "https://hacker-news.firebaseio.com/v0"
12
13
14 class HackerNewsSource(BaseSource):
15 """
16 Fetch Hacker News stories and comments via the public API.
17
18 API docs: https://github.com/HackerNews/API
19 Requires: pip install requests
20 """
21
22 def __init__(self, item_id: int, max_comments: int = 200):
23 """
24 Parameters
25 ----------
26 item_id : int
27 HN story/item ID (e.g., 12345678).
28 max_comments : int
29 Maximum number of comments to fetch (default 200).
30 """
31 self.item_id = item_id
32 self.max_comments = max_comments
33
34 def authenticate(self) -> bool:
35 """No auth needed for the HN API."""
36 return True
37
38 def list_videos(
39 self,
40 folder_id: Optional[str] = None,
41 folder_path: Optional[str] = None,
42 patterns: Optional[List[str]] = None,
43 ) -> List[SourceFile]:
44 """Return a single SourceFile for the HN story."""
45 return [
46 SourceFile(
47 name=f"hn_{self.item_id}",
48 id=str(self.item_id),
49 mime_type="text/plain",
50 )
51 ]
52
53 def download(self, file: SourceFile, destination: Path) -> Path:
54 """Download the story and comments as plain text."""
55 destination = Path(destination)
56 destination.parent.mkdir(parents=True, exist_ok=True)
57 text = self.fetch_text()
58 destination.write_text(text, encoding="utf-8")
59 logger.info(f"Saved HN story {self.item_id} to {destination}")
60 return destination
61
62 def _get_item(self, item_id: int) -> dict:
63 import requests
64
65 resp = requests.get(f"{HN_API}/item/{item_id}.json", timeout=10)
66 resp.raise_for_status()
67 return resp.json() or {}
68
69 def fetch_text(self) -> str:
70 """Fetch story and comments as structured text."""
71 story = self._get_item(self.item_id)
72 lines = []
73 lines.append(f"# {story.get('title', 'Untitled')}")
74 lines.append(f"by {story.get('by', 'unknown')} | {story.get('score', 0)} points")
75 if story.get("url"):
76 lines.append(f"URL: {story['url']}")
77 if story.get("text"):
78 lines.append(f"\n{story['text']}")
79 lines.append("")
80
81 # Fetch comments
82 kid_ids = story.get("kids", [])
83 if kid_ids:
84 lines.append("## Comments\n")
85 count = [0]
86 self._fetch_comments(kid_ids, lines, depth=0, count=count)
87
88 return "\n".join(lines)
89
90 def _fetch_comments(self, kid_ids: list, lines: list, depth: int, count: list) -> None:
91 """Recursively fetch and format comments."""
92 indent = " " * depth
93 for kid_id in kid_ids:
94 if count[0] >= self.max_comments:
95 return
96 try:
97 item = self._get_item(kid_id)
98 except Exception:
99 continue
100
101 if item.get("deleted") or item.get("dead"):
102 continue
103
104 count[0] += 1
105 author = item.get("by", "[deleted]")
106 text = item.get("text", "")
107 lines.append(f"{indent}**{author}**:")
108 lines.append(f"{indent}{text}")
109 lines.append("")
110
111 if item.get("kids"):
112 self._fetch_comments(item["kids"], lines, depth + 1, count)
--- a/video_processor/sources/podcast_source.py
+++ b/video_processor/sources/podcast_source.py
@@ -0,0 +1,119 @@
1
+"""Podcast feed source connector -- extends RSS for audio enclosures."""
2
+
3
+import logging
4
+from pathlib import Path
5
+from typing import List, Optional
6
+
7
+from video_processor.sources.base import BaseSource, SourceFile
8
+
9
+logger = logging.getLogger(__name__)
10
+
11
+
12
+class PodcastSource(BaseSource):
13
+ """
14
+ Parse podcast RSS feeds and download audio episodes for pipeline processing.
15
+
16
+ Extends the RSS pattern to extract <enclosure> audio URLs.
17
+ Requires: pip install requests
18
+ Optional: pip install feedparser
19
+ """
20
+
21
+ def __init__(self, feed_url: str, max_episodes: int = 10):
22
+ self.feed_url = feed_url
23
+ self.max_episodes = max_episodes
24
+ self._episodes: List[dict] = []
25
+
26
+ def authenticate(self) -> bool:
27
+ """No auth needed for public podcast feeds."""
28
+ return True
29
+
30
+ def _parse_feed(self) -> None:
31
+ """Fetch and parse the podcast feed for audio enclosures."""
32
+ if self._episodes:
33
+ return
34
+
35
+ import requests
36
+
37
+ resp = requests.get(self.feed_url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
38
+ resp.raise_for_status()
39
+
40
+ try:
41
+ import feedparser
42
+
43
+ feed = feedparser.parse(resp.text)
44
+ for entry in feed.entries[: self.max_episodes]:
45
+ audio_url = None
46
+ for link in entry.get("links", []):
47
+ if link.get("type", "").startswith("audio/"):
48
+ audio_url = link.get("href")
49
+ break
50
+ if not audio_url and entry.get("enclosures"):
51
+ audio_url = entry["enclosures"][0].get("href")
52
+ if audio_url:
53
+ self._episodes.append(
54
+ {
55
+ "title": entry.get("title", "Untitled"),
56
+ "url": audio_url,
57
+ "published": entry.get("published", ""),
58
+ "duration": entry.get("itunes_duration", ""),
59
+ }
60
+ )
61
+ except ImportError:
62
+ logger.debug("feedparser not available, using xml.etree fallback")
63
+ self._parse_xml(resp.text)
64
+
65
+ def _parse_xml(self, text: str) -> None:
66
+ """Fallback parser for podcast XML using stdlib."""
67
+ import xml.etree.ElementTree as ET
68
+
69
+ root = ET.fromstring(text)
70
+ items = root.findall(".//item")
71
+ for item in items[: self.max_episodes]:
72
+ enclosure = item.find("enclosure")
73
+ if enclosure is None:
74
+ continue
75
+ audio_url = enclosure.get("url", "")
76
+ if not audio_url:
77
+ continue
78
+ title = item.findtext("title") or "Untitled"
79
+ pub = item.findtext("pubDate") or ""
80
+ self._episodes.append(
81
+ {"title": title, "url": audio_url, "published": pub, "duration": ""}
82
+ )
83
+
84
+ def list_videos(
85
+ self,
86
+ folder_id: Optional[str] = None,
87
+ folder_path: Optional[str] = None,
88
+ patterns: Optional[List[str]] = None,
89
+ ) -> List[SourceFile]:
90
+ """List podcast episodes as SourceFiles."""
91
+ self._parse_feed()
92
+ return [
93
+ SourceFile(
94
+ name=ep["title"],
95
+ id=ep["url"],
96
+ mime_type="audio/mpeg",
97
+ modified_at=ep["published"],
98
+ )
99
+ for ep in self._episodes
100
+ ]
101
+
102
+ def download(self, file: SourceFile, destination: Path) -> Path:
103
+ """Download the podcast audio file."""
104
+ import requests
105
+
106
+ destination = Path(destination)
107
+ destination.parent.mkdir(parents=True, exist_ok=True)
108
+
109
+ resp = requests.get(
110
+ file.id, timeout=60, stream=True, headers={"User-Agent": "PlanOpticon/0.3"}
111
+ )
112
+ resp.raise_for_status()
113
+
114
+ with open(destination, "wb") as f:
115
+ for chunk in resp.iter_content(chunk_size=8192):
116
+ f.write(chunk)
117
+
118
+ logger.info(f"Downloaded podcast episode to {destination}")
119
+ return destination
--- a/video_processor/sources/podcast_source.py
+++ b/video_processor/sources/podcast_source.py
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/podcast_source.py
+++ b/video_processor/sources/podcast_source.py
@@ -0,0 +1,119 @@
1 """Podcast feed source connector -- extends RSS for audio enclosures."""
2
3 import logging
4 from pathlib import Path
5 from typing import List, Optional
6
7 from video_processor.sources.base import BaseSource, SourceFile
8
9 logger = logging.getLogger(__name__)
10
11
class PodcastSource(BaseSource):
    """
    Parse podcast RSS feeds and download audio episodes for pipeline processing.

    Extends the RSS pattern to extract <enclosure> audio URLs.
    Requires: pip install requests
    Optional: pip install feedparser
    """

    def __init__(self, feed_url: str, max_episodes: int = 10):
        """
        Parameters
        ----------
        feed_url : str
            URL of the podcast RSS feed.
        max_episodes : int
            Maximum number of episodes to parse from the feed.
        """
        self.feed_url = feed_url
        self.max_episodes = max_episodes
        # Parsed episode dicts: {"title", "url", "published", "duration"}.
        self._episodes: List[dict] = []

    def authenticate(self) -> bool:
        """No auth needed for public podcast feeds."""
        return True

    def _parse_feed(self) -> None:
        """Fetch and parse the podcast feed for audio enclosures (cached after first success)."""
        if self._episodes:
            return

        import requests

        resp = requests.get(self.feed_url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Guard ONLY the optional import. Previously the whole parsing loop sat
        # inside the try, so any ImportError raised during parsing was silently
        # rerouted to the XML fallback.
        try:
            import feedparser
        except ImportError:
            logger.debug("feedparser not available, using xml.etree fallback")
            self._parse_xml(resp.text)
            return

        feed = feedparser.parse(resp.text)
        for entry in feed.entries[: self.max_episodes]:
            # Prefer an explicit audio/* link; fall back to the first enclosure.
            audio_url = None
            for link in entry.get("links", []):
                if link.get("type", "").startswith("audio/"):
                    audio_url = link.get("href")
                    break
            if not audio_url and entry.get("enclosures"):
                audio_url = entry["enclosures"][0].get("href")
            if audio_url:
                self._episodes.append(
                    {
                        "title": entry.get("title", "Untitled"),
                        "url": audio_url,
                        "published": entry.get("published", ""),
                        "duration": entry.get("itunes_duration", ""),
                    }
                )

    def _parse_xml(self, text: str) -> None:
        """Fallback parser for podcast XML using stdlib."""
        import xml.etree.ElementTree as ET

        root = ET.fromstring(text)
        items = root.findall(".//item")
        for item in items[: self.max_episodes]:
            enclosure = item.find("enclosure")
            if enclosure is None:
                continue
            audio_url = enclosure.get("url", "")
            if not audio_url:
                continue
            title = item.findtext("title") or "Untitled"
            pub = item.findtext("pubDate") or ""
            self._episodes.append(
                {"title": title, "url": audio_url, "published": pub, "duration": ""}
            )

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List podcast episodes as SourceFiles (folder args are unused for feeds)."""
        self._parse_feed()
        return [
            SourceFile(
                name=ep["title"],
                id=ep["url"],
                mime_type="audio/mpeg",
                modified_at=ep["published"],
            )
            for ep in self._episodes
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download the podcast audio file (``file.id`` is the enclosure URL)."""
        import requests

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        # Use the response as a context manager so the streamed connection is
        # released even if writing to disk fails (the old code never closed it).
        with requests.get(
            file.id, timeout=60, stream=True, headers={"User-Agent": "PlanOpticon/0.3"}
        ) as resp:
            resp.raise_for_status()
            with open(destination, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)

        logger.info(f"Downloaded podcast episode to {destination}")
        return destination
--- a/video_processor/sources/reddit_source.py
+++ b/video_processor/sources/reddit_source.py
@@ -0,0 +1,103 @@
1
+"""Reddit source connector using the public JSON API."""
2
+
3
+import logging
4
+from pathlib import Path
5
+from typing import List, Optional
6
+
7
+from video_processor.sources.base import BaseSource, SourceFile
8
+
9
+logger = logging.getLogger(__name__)
10
+
11
+
12
+class RedditSource(BaseSource):
13
+ """
14
+ Fetch Reddit posts and comments via the public JSON API.
15
+
16
+ No auth required for public posts. Append .json to any Reddit URL.
17
+ Requires: pip install requests
18
+ """
19
+
20
+ def __init__(self, url: str):
21
+ """
22
+ Parameters
23
+ ----------
24
+ url : str
25
+ Reddit post or subreddit URL.
26
+ """
27
+ self.url = url.rstrip("/")
28
+
29
+ def authenticate(self) -> bool:
30
+ """No auth needed for public Reddit content."""
31
+ return True
32
+
33
+ def list_videos(
34
+ self,
35
+ folder_id: Optional[str] = None,
36
+ folder_path: Optional[str] = None,
37
+ patterns: Optional[List[str]] = None,
38
+ ) -> List[SourceFile]:
39
+ """Return a single SourceFile for the Reddit post."""
40
+ return [
41
+ SourceFile(
42
+ name=self.url.split("/")[-1] or "reddit_post",
43
+ id=self.url,
44
+ mime_type="text/plain",
45
+ )
46
+ ]
47
+
48
+ def download(self, file: SourceFile, destination: Path) -> Path:
49
+ """Download post and comments as plain text."""
50
+ destination = Path(destination)
51
+ destination.parent.mkdir(parents=True, exist_ok=True)
52
+ text = self.fetch_text()
53
+ destination.write_text(text, encoding="utf-8")
54
+ logger.info(f"Saved Reddit content to {destination}")
55
+ return destination
56
+
57
+ def fetch_text(self) -> str:
58
+ """Fetch the Reddit post and comments as structured text."""
59
+ import requests
60
+
61
+ json_url = self.url.rstrip("/") + ".json"
62
+ resp = requests.get(
63
+ json_url,
64
+ timeout=15,
65
+ headers={"User-Agent": "PlanOpticon/0.3 (source connector)"},
66
+ )
67
+ resp.raise_for_status()
68
+ data = resp.json()
69
+
70
+ lines = []
71
+ # Post data is in first listing
72
+ if isinstance(data, list) and len(data) > 0:
73
+ post = data[0]["data"]["children"][0]["data"]
74
+ lines.append(f"# {post.get('title', 'Untitled')}")
75
+ lines.append(f"by u/{post.get('author', '[deleted]')} | {post.get('score', 0)} points")
76
+ lines.append("")
77
+ if post.get("selftext"):
78
+ lines.append(post["selftext"])
79
+ lines.append("")
80
+
81
+ # Comments in second listing
82
+ if len(data) > 1:
83
+ lines.append("## Comments\n")
84
+ self._extract_comments(data[1]["data"]["children"], lines, depth=0)
85
+
86
+ return "\n".join(lines)
87
+
88
+ def _extract_comments(self, children: list, lines: list, depth: int) -> None:
89
+ """Recursively extract comment text."""
90
+ indent = " " * depth
91
+ for child in children:
92
+ if child.get("kind") != "t1":
93
+ continue
94
+ c = child["data"]
95
+ author = c.get("author", "[deleted]")
96
+ body = c.get("body", "")
97
+ lines.append(f"{indent}**{author}** ({c.get('score', 0)} pts):")
98
+ lines.append(f"{indent}{body}")
99
+ lines.append("")
100
+ # Recurse into replies
101
+ replies = c.get("replies")
102
+ if isinstance(replies, dict):
103
+ self._extract_comments(replies["data"]["children"], lines, depth + 1)
--- a/video_processor/sources/reddit_source.py
+++ b/video_processor/sources/reddit_source.py
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/reddit_source.py
+++ b/video_processor/sources/reddit_source.py
@@ -0,0 +1,103 @@
1 """Reddit source connector using the public JSON API."""
2
3 import logging
4 from pathlib import Path
5 from typing import List, Optional
6
7 from video_processor.sources.base import BaseSource, SourceFile
8
9 logger = logging.getLogger(__name__)
10
11
class RedditSource(BaseSource):
    """
    Fetch Reddit posts and comments via the public JSON API.

    No auth required for public posts. Append .json to any Reddit URL.
    Requires: pip install requests
    """

    def __init__(self, url: str):
        """
        Parameters
        ----------
        url : str
            Reddit post or subreddit URL.
        """
        self.url = url.rstrip("/")

    def authenticate(self) -> bool:
        """No auth needed for public Reddit content."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile for the Reddit post."""
        return [
            SourceFile(
                name=self.url.split("/")[-1] or "reddit_post",
                id=self.url,
                mime_type="text/plain",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download post and comments as plain text."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        text = self.fetch_text()
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved Reddit content to {destination}")
        return destination

    def fetch_text(self) -> str:
        """Fetch the Reddit post and comments as structured text."""
        import requests

        json_url = self.url.rstrip("/") + ".json"
        resp = requests.get(
            json_url,
            timeout=15,
            headers={"User-Agent": "PlanOpticon/0.3 (source connector)"},
        )
        resp.raise_for_status()
        data = resp.json()

        lines: List[str] = []
        # Post endpoints return a two-element list [post listing, comment
        # listing]; subreddit endpoints return a single dict. The old code
        # indexed data[1] outside the list check, so a subreddit URL raised
        # KeyError. Keep ALL indexing behind the isinstance guard.
        if isinstance(data, list) and data:
            children = data[0].get("data", {}).get("children", [])
            if children:
                post = children[0]["data"]
                lines.append(f"# {post.get('title', 'Untitled')}")
                lines.append(
                    f"by u/{post.get('author', '[deleted]')} | {post.get('score', 0)} points"
                )
                lines.append("")
                if post.get("selftext"):
                    lines.append(post["selftext"])
                    lines.append("")

            # Comments in second listing
            if len(data) > 1:
                lines.append("## Comments\n")
                self._extract_comments(data[1]["data"]["children"], lines, depth=0)

        return "\n".join(lines)

    def _extract_comments(self, children: list, lines: list, depth: int) -> None:
        """Recursively extract comment text, indenting two spaces per reply level."""
        indent = "  " * depth
        for child in children:
            if child.get("kind") != "t1":  # t1 = comment; skip "more" stubs etc.
                continue
            c = child["data"]
            author = c.get("author", "[deleted]")
            body = c.get("body", "")
            lines.append(f"{indent}**{author}** ({c.get('score', 0)} pts):")
            lines.append(f"{indent}{body}")
            lines.append("")
            # Recurse into replies ("replies" is "" when empty, dict otherwise).
            replies = c.get("replies")
            if isinstance(replies, dict):
                self._extract_comments(replies["data"]["children"], lines, depth + 1)
--- a/video_processor/sources/rss_source.py
+++ b/video_processor/sources/rss_source.py
@@ -0,0 +1,114 @@
1
+"""RSS/Atom feed source connector."""
2
+
3
+import logging
4
+from pathlib import Path
5
+from typing import List, Optional
6
+
7
+from video_processor.sources.base import BaseSource, SourceFile
8
+
9
+logger = logging.getLogger(__name__)
10
+
11
+
12
+class RSSSource(BaseSource):
13
+ """
14
+ Parse RSS/Atom feeds and extract entries as text documents.
15
+
16
+ Optional: pip install feedparser (falls back to xml.etree.ElementTree)
17
+ Requires: pip install requests
18
+ """
19
+
20
+ def __init__(self, url: str, max_entries: int = 50):
21
+ self.url = url
22
+ self.max_entries = max_entries
23
+ self._entries: List[dict] = []
24
+
25
+ def authenticate(self) -> bool:
26
+ """No auth needed for public feeds."""
27
+ return True
28
+
29
+ def _parse_feed(self) -> None:
30
+ """Fetch and parse the feed."""
31
+ if self._entries:
32
+ return
33
+
34
+ import requests
35
+
36
+ resp = requests.get(self.url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
37
+ resp.raise_for_status()
38
+
39
+ try:
40
+ import feedparser
41
+
42
+ feed = feedparser.parse(resp.text)
43
+ for entry in feed.entries[: self.max_entries]:
44
+ self._entries.append(
45
+ {
46
+ "title": entry.get("title", "Untitled"),
47
+ "link": entry.get("link", ""),
48
+ "summary": entry.get("summary", ""),
49
+ "published": entry.get("published", ""),
50
+ "id": entry.get("id", entry.get("link", "")),
51
+ }
52
+ )
53
+ except ImportError:
54
+ logger.debug("feedparser not available, using xml.etree fallback")
55
+ self._parse_xml(resp.text)
56
+
57
+ def _parse_xml(self, text: str) -> None:
58
+ """Fallback parser using stdlib xml.etree."""
59
+ import xml.etree.ElementTree as ET
60
+
61
+ root = ET.fromstring(text)
62
+ # Handle RSS 2.0
63
+ ns = {"atom": "http://www.w3.org/2005/Atom"}
64
+ items = root.findall(".//item") or root.findall(".//atom:entry", ns)
65
+ for item in items[: self.max_entries]:
66
+ title = (
67
+ item.findtext("title") or item.findtext("atom:title", namespaces=ns) or "Untitled"
68
+ )
69
+ link = item.findtext("link") or ""
70
+ if not link:
71
+ link_el = item.find("atom:link", ns)
72
+ link = link_el.get("href", "") if link_el is not None else ""
73
+ desc = (
74
+ item.findtext("description") or item.findtext("atom:summary", namespaces=ns) or ""
75
+ )
76
+ pub = item.findtext("pubDate") or item.findtext("atom:published", namespaces=ns) or ""
77
+ self._entries.append(
78
+ {"title": title, "link": link, "summary": desc, "published": pub, "id": link}
79
+ )
80
+
81
+ def list_videos(
82
+ self,
83
+ folder_id: Optional[str] = None,
84
+ folder_path: Optional[str] = None,
85
+ patterns: Optional[List[str]] = None,
86
+ ) -> List[SourceFile]:
87
+ """List feed entries as SourceFiles."""
88
+ self._parse_feed()
89
+ return [
90
+ SourceFile(
91
+ name=e["title"], id=e["id"], mime_type="text/plain", modified_at=e["published"]
92
+ )
93
+ for e in self._entries
94
+ ]
95
+
96
+ def download(self, file: SourceFile, destination: Path) -> Path:
97
+ """Write an entry's content as a text file."""
98
+ self._parse_feed()
99
+ destination = Path(destination)
100
+ destination.parent.mkdir(parents=True, exist_ok=True)
101
+
102
+ entry = next((e for e in self._entries if e["id"] == file.id), None)
103
+ if not entry:
104
+ raise ValueError(f"Entry not found: {file.id}")
105
+
106
+ text = (
107
+ f"# {entry['title']}\n\n"
108
+ f"Published: {entry['published']}\n"
109
+ f"Link: {entry['link']}\n\n"
110
+ f"{entry['summary']}"
111
+ )
112
+ destination.write_text(text, encoding="utf-8")
113
+ logger.info(f"Saved RSS entry to {destination}")
114
+ return destination
--- a/video_processor/sources/rss_source.py
+++ b/video_processor/sources/rss_source.py
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/rss_source.py
+++ b/video_processor/sources/rss_source.py
@@ -0,0 +1,114 @@
1 """RSS/Atom feed source connector."""
2
3 import logging
4 from pathlib import Path
5 from typing import List, Optional
6
7 from video_processor.sources.base import BaseSource, SourceFile
8
9 logger = logging.getLogger(__name__)
10
11
class RSSSource(BaseSource):
    """
    Parse RSS/Atom feeds and extract entries as text documents.

    Optional: pip install feedparser (falls back to xml.etree.ElementTree)
    Requires: pip install requests
    """

    def __init__(self, url: str, max_entries: int = 50):
        """
        Parameters
        ----------
        url : str
            Feed URL to fetch.
        max_entries : int
            Maximum number of entries to parse.
        """
        self.url = url
        self.max_entries = max_entries
        # Parsed entry dicts: {"title", "link", "summary", "published", "id"}.
        self._entries: List[dict] = []

    def authenticate(self) -> bool:
        """No auth needed for public feeds."""
        return True

    def _parse_feed(self) -> None:
        """Fetch and parse the feed (cached after first success)."""
        if self._entries:
            return

        import requests

        resp = requests.get(self.url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Guard ONLY the optional import. The old code wrapped the entire
        # parsing loop, so any ImportError raised during parsing was misread
        # as "feedparser missing" and rerouted to the XML fallback.
        try:
            import feedparser
        except ImportError:
            logger.debug("feedparser not available, using xml.etree fallback")
            self._parse_xml(resp.text)
            return

        feed = feedparser.parse(resp.text)
        for entry in feed.entries[: self.max_entries]:
            self._entries.append(
                {
                    "title": entry.get("title", "Untitled"),
                    "link": entry.get("link", ""),
                    "summary": entry.get("summary", ""),
                    "published": entry.get("published", ""),
                    "id": entry.get("id", entry.get("link", "")),
                }
            )

    def _parse_xml(self, text: str) -> None:
        """Fallback parser using stdlib xml.etree (handles RSS 2.0 and Atom)."""
        import xml.etree.ElementTree as ET

        root = ET.fromstring(text)
        ns = {"atom": "http://www.w3.org/2005/Atom"}
        # RSS 2.0 uses <item>; Atom uses namespaced <entry>.
        items = root.findall(".//item") or root.findall(".//atom:entry", ns)
        for item in items[: self.max_entries]:
            title = (
                item.findtext("title") or item.findtext("atom:title", namespaces=ns) or "Untitled"
            )
            link = item.findtext("link") or ""
            if not link:
                # Atom carries the URL in the <link href="..."> attribute.
                link_el = item.find("atom:link", ns)
                link = link_el.get("href", "") if link_el is not None else ""
            desc = (
                item.findtext("description") or item.findtext("atom:summary", namespaces=ns) or ""
            )
            pub = item.findtext("pubDate") or item.findtext("atom:published", namespaces=ns) or ""
            self._entries.append(
                {"title": title, "link": link, "summary": desc, "published": pub, "id": link}
            )

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List feed entries as SourceFiles (folder args are unused for feeds)."""
        self._parse_feed()
        return [
            SourceFile(
                name=e["title"], id=e["id"], mime_type="text/plain", modified_at=e["published"]
            )
            for e in self._entries
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Write an entry's content as a text file.

        Raises
        ------
        ValueError
            If ``file.id`` does not match any parsed entry.
        """
        self._parse_feed()
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        entry = next((e for e in self._entries if e["id"] == file.id), None)
        if not entry:
            raise ValueError(f"Entry not found: {file.id}")

        text = (
            f"# {entry['title']}\n\n"
            f"Published: {entry['published']}\n"
            f"Link: {entry['link']}\n\n"
            f"{entry['summary']}"
        )
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved RSS entry to {destination}")
        return destination
--- a/video_processor/sources/twitter_source.py
+++ b/video_processor/sources/twitter_source.py
@@ -0,0 +1,129 @@
1
+"""Twitter/X source connector -- stub requiring auth or gallery-dl."""
2
+
3
+import logging
4
+from pathlib import Path
5
+from typing import List, Optional
6
+
7
+from video_processor.sources.base import BaseSource, SourceFile
8
+
9
+logger = logging.getLogger(__name__)
10
+
11
+
12
+class TwitterSource(BaseSource):
13
+ """
14
+ Fetch Twitter/X posts and threads.
15
+
16
+ Twitter API v2 requires authentication. This connector attempts to use
17
+ gallery-dl as a fallback for public tweets.
18
+
19
+ Auth options:
20
+ - Set TWITTER_BEARER_TOKEN env var for API v2 access
21
+ - Install gallery-dl for scraping public tweets: pip install gallery-dl
22
+ """
23
+
24
+ def __init__(self, url: str):
25
+ self.url = url
26
+ self._bearer_token: Optional[str] = None
27
+
28
+ def authenticate(self) -> bool:
29
+ """Check for Twitter API token or gallery-dl availability."""
30
+ import os
31
+
32
+ self._bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
33
+ if self._bearer_token:
34
+ return True
35
+
36
+ # Check for gallery-dl fallback
37
+ try:
38
+ import gallery_dl # noqa: F401
39
+
40
+ logger.info("Using gallery-dl for Twitter content extraction")
41
+ return True
42
+ except ImportError:
43
+ pass
44
+
45
+ logger.error(
46
+ "Twitter source requires either:\n"
47
+ " 1. TWITTER_BEARER_TOKEN env var (Twitter API v2)\n"
48
+ " 2. gallery-dl installed: pip install gallery-dl\n"
49
+ "Twitter API access: https://developer.twitter.com/en/portal/dashboard"
50
+ )
51
+ return False
52
+
53
+ def list_videos(
54
+ self,
55
+ folder_id: Optional[str] = None,
56
+ folder_path: Optional[str] = None,
57
+ patterns: Optional[List[str]] = None,
58
+ ) -> List[SourceFile]:
59
+ """Return a single SourceFile for the tweet/thread."""
60
+ return [
61
+ SourceFile(
62
+ name=self.url.split("/")[-1] or "tweet",
63
+ id=self.url,
64
+ mime_type="text/plain",
65
+ )
66
+ ]
67
+
68
+ def download(self, file: SourceFile, destination: Path) -> Path:
69
+ """Download tweet content as text."""
70
+ destination = Path(destination)
71
+ destination.parent.mkdir(parents=True, exist_ok=True)
72
+ text = self.fetch_text()
73
+ destination.write_text(text, encoding="utf-8")
74
+ logger.info(f"Saved Twitter content to {destination}")
75
+ return destination
76
+
77
+ def fetch_text(self) -> str:
78
+ """Extract tweet text via API or gallery-dl."""
79
+ if self._bearer_token:
80
+ return self._fetch_via_api()
81
+
82
+ try:
83
+ return self._fetch_via_gallery_dl()
84
+ except ImportError:
85
+ raise RuntimeError(
86
+ "No Twitter extraction method available. See authenticate() for setup."
87
+ )
88
+
89
+ def _fetch_via_api(self) -> str:
90
+ """Fetch tweet via Twitter API v2."""
91
+ import re
92
+
93
+ import requests
94
+
95
+ match = re.search(r"/status/(\d+)", self.url)
96
+ if not match:
97
+ raise ValueError(f"Could not extract tweet ID from: {self.url}")
98
+
99
+ tweet_id = match.group(1)
100
+ resp = requests.get(
101
+ f"https://api.twitter.com/2/tweets/{tweet_id}",
102
+ headers={"Authorization": f"Bearer {self._bearer_token}"},
103
+ params={"tweet.fields": "author_id,created_at,text"},
104
+ timeout=15,
105
+ )
106
+ resp.raise_for_status()
107
+ data = resp.json().get("data", {})
108
+ return f"{data.get('text', '')}\n\nCreated: {data.get('created_at', 'unknown')}"
109
+
110
+ def _fetch_via_gallery_dl(self) -> str:
111
+ """Use gallery-dl to extract tweet metadata."""
112
+ import json
113
+ import subprocess
114
+
115
+ result = subprocess.run(
116
+ ["gallery-dl", "--dump-json", self.url],
117
+ capture_output=True,
118
+ text=True,
119
+ timeout=30,
120
+ )
121
+ if result.returncode != 0:
122
+ raise RuntimeError(f"gallery-dl failed: {result.stderr}")
123
+
124
+ items = json.loads(result.stdout)
125
+ texts = []
126
+ for item in items if isinstance(items, list) else [items]:
127
+ if isinstance(item, dict):
128
+ texts.append(item.get("content", item.get("text", str(item))))
129
+ return "\n\n".join(texts) if texts else "No text content extracted."
--- a/video_processor/sources/twitter_source.py
+++ b/video_processor/sources/twitter_source.py
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/twitter_source.py
+++ b/video_processor/sources/twitter_source.py
@@ -0,0 +1,129 @@
1 """Twitter/X source connector -- stub requiring auth or gallery-dl."""
2
3 import logging
4 from pathlib import Path
5 from typing import List, Optional
6
7 from video_processor.sources.base import BaseSource, SourceFile
8
9 logger = logging.getLogger(__name__)
10
11
class TwitterSource(BaseSource):
    """
    Fetch Twitter/X posts and threads.

    Twitter API v2 requires authentication. This connector attempts to use
    gallery-dl as a fallback for public tweets.

    Auth options:
    - Set TWITTER_BEARER_TOKEN env var for API v2 access
    - Install gallery-dl for scraping public tweets: pip install gallery-dl
    """

    def __init__(self, url: str):
        self.url = url
        # Populated from the environment by authenticate() / fetch_text().
        self._bearer_token: Optional[str] = None

    def authenticate(self) -> bool:
        """Check for Twitter API token or gallery-dl availability."""
        import os

        self._bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
        if self._bearer_token:
            return True

        # Check for gallery-dl fallback
        try:
            import gallery_dl  # noqa: F401

            logger.info("Using gallery-dl for Twitter content extraction")
            return True
        except ImportError:
            pass

        logger.error(
            "Twitter source requires either:\n"
            "  1. TWITTER_BEARER_TOKEN env var (Twitter API v2)\n"
            "  2. gallery-dl installed: pip install gallery-dl\n"
            "Twitter API access: https://developer.twitter.com/en/portal/dashboard"
        )
        return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile for the tweet/thread."""
        return [
            SourceFile(
                name=self.url.split("/")[-1] or "tweet",
                id=self.url,
                mime_type="text/plain",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download tweet content as text."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        text = self.fetch_text()
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved Twitter content to {destination}")
        return destination

    def fetch_text(self) -> str:
        """Extract tweet text via API or gallery-dl.

        Raises
        ------
        RuntimeError
            If neither the API token nor the gallery-dl executable is available.
        """
        if self._bearer_token is None:
            # Read the token lazily so fetch_text() works even when the caller
            # skipped authenticate().
            import os

            self._bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
        if self._bearer_token:
            return self._fetch_via_api()

        try:
            return self._fetch_via_gallery_dl()
        except FileNotFoundError as err:
            # BUG FIX: a missing gallery-dl executable surfaces as
            # FileNotFoundError from subprocess.run, not ImportError -- the
            # previous `except ImportError` handler could never fire.
            raise RuntimeError(
                "No Twitter extraction method available. See authenticate() for setup."
            ) from err

    def _fetch_via_api(self) -> str:
        """Fetch tweet via Twitter API v2.

        Raises
        ------
        ValueError
            If no /status/<id> segment is found in the URL.
        """
        import re

        import requests

        match = re.search(r"/status/(\d+)", self.url)
        if not match:
            raise ValueError(f"Could not extract tweet ID from: {self.url}")

        tweet_id = match.group(1)
        resp = requests.get(
            f"https://api.twitter.com/2/tweets/{tweet_id}",
            headers={"Authorization": f"Bearer {self._bearer_token}"},
            params={"tweet.fields": "author_id,created_at,text"},
            timeout=15,
        )
        resp.raise_for_status()
        data = resp.json().get("data", {})
        return f"{data.get('text', '')}\n\nCreated: {data.get('created_at', 'unknown')}"

    def _fetch_via_gallery_dl(self) -> str:
        """Use the gallery-dl CLI to extract tweet metadata.

        Raises
        ------
        RuntimeError
            If gallery-dl exits non-zero.
        FileNotFoundError
            If the gallery-dl executable is not installed (propagates from
            subprocess; handled by fetch_text()).
        """
        import json
        import subprocess

        result = subprocess.run(
            ["gallery-dl", "--dump-json", self.url],
            capture_output=True,
            text=True,
            timeout=30,
        )
        if result.returncode != 0:
            raise RuntimeError(f"gallery-dl failed: {result.stderr}")

        items = json.loads(result.stdout)
        texts = []
        for item in items if isinstance(items, list) else [items]:
            if isinstance(item, dict):
                texts.append(item.get("content", item.get("text", str(item))))
        return "\n\n".join(texts) if texts else "No text content extracted."
--- a/video_processor/sources/web_source.py
+++ b/video_processor/sources/web_source.py
@@ -0,0 +1,90 @@
1
+"""Web page source connector for fetching and extracting text from URLs."""
2
+
3
+import logging
4
+import re
5
+from pathlib import Path
6
+from typing import List, Optional
7
+
8
+from video_processor.sources.base import BaseSource, SourceFile
9
+
10
+logger = logging.getLogger(__name__)
11
+
12
+
13
+def _strip_html_tags(html: str) -> str:
14
+ """Minimal HTML tag stripper using stdlib only."""
15
+ text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
16
+ text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
17
+ text = re.sub(r"<(nav|footer|header)[^>]*>.*?</\1>", "", text, flags=re.DOTALL | re.IGNORECASE)
18
+ text = re.sub(r"<[^>]+>", " ", text)
19
+ text = re.sub(r"\s+", " ", text).strip()
20
+ return text
21
+
22
+
23
+class WebSource(BaseSource):
24
+ """
25
+ Fetch web pages and extract main text content.
26
+
27
+ Uses requests + BeautifulSoup (optional) for content extraction.
28
+ Falls back to regex-based tag stripping if bs4 is unavailable.
29
+
30
+ Requires: pip install requests (included in most environments)
31
+ Optional: pip install beautifulsoup4 lxml
32
+ """
33
+
34
+ def __init__(self, url: str):
35
+ self.url = url
36
+ self._content: Optional[str] = None
37
+
38
+ def authenticate(self) -> bool:
39
+ """No auth needed for public web pages."""
40
+ return True
41
+
42
+ def list_videos(
43
+ self,
44
+ folder_id: Optional[str] = None,
45
+ folder_path: Optional[str] = None,
46
+ patterns: Optional[List[str]] = None,
47
+ ) -> List[SourceFile]:
48
+ """Return a single SourceFile representing the web page."""
49
+ return [
50
+ SourceFile(
51
+ name=self.url.split("/")[-1] or "page",
52
+ id=self.url,
53
+ mime_type="text/html",
54
+ )
55
+ ]
56
+
57
+ def download(self, file: SourceFile, destination: Path) -> Path:
58
+ """Download and save the extracted text content."""
59
+ destination = Path(destination)
60
+ destination.parent.mkdir(parents=True, exist_ok=True)
61
+ text = self.fetch_text()
62
+ destination.write_text(text, encoding="utf-8")
63
+ logger.info(f"Saved web content to {destination}")
64
+ return destination
65
+
66
+ def fetch_text(self) -> str:
67
+ """Fetch the URL and extract main text content."""
68
+ if self._content is not None:
69
+ return self._content
70
+
71
+ import requests
72
+
73
+ resp = requests.get(self.url, timeout=30, headers={"User-Agent": "PlanOpticon/0.3"})
74
+ resp.raise_for_status()
75
+
76
+ try:
77
+ from bs4 import BeautifulSoup
78
+
79
+ soup = BeautifulSoup(resp.text, "html.parser")
80
+ # Remove non-content elements
81
+ for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
82
+ tag.decompose()
83
+ # Prefer <article> or <main> if present
84
+ main = soup.find("article") or soup.find("main") or soup.find("body")
85
+ self._content = main.get_text(separator="\n", strip=True) if main else soup.get_text()
86
+ except ImportError:
87
+ logger.debug("beautifulsoup4 not available, using regex fallback")
88
+ self._content = _strip_html_tags(resp.text)
89
+
90
+ return self._content
--- a/video_processor/sources/web_source.py
+++ b/video_processor/sources/web_source.py
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/web_source.py
+++ b/video_processor/sources/web_source.py
@@ -0,0 +1,90 @@
1 """Web page source connector for fetching and extracting text from URLs."""
2
3 import logging
4 import re
5 from pathlib import Path
6 from typing import List, Optional
7
8 from video_processor.sources.base import BaseSource, SourceFile
9
10 logger = logging.getLogger(__name__)
11
12
13 def _strip_html_tags(html: str) -> str:
14 """Minimal HTML tag stripper using stdlib only."""
15 text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
16 text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
17 text = re.sub(r"<(nav|footer|header)[^>]*>.*?</\1>", "", text, flags=re.DOTALL | re.IGNORECASE)
18 text = re.sub(r"<[^>]+>", " ", text)
19 text = re.sub(r"\s+", " ", text).strip()
20 return text
21
22
class WebSource(BaseSource):
    """
    Fetch web pages and extract main text content.

    Uses requests + BeautifulSoup (optional) for content extraction.
    Falls back to regex-based tag stripping if bs4 is unavailable.

    Requires: pip install requests (included in most environments)
    Optional: pip install beautifulsoup4 lxml
    """

    def __init__(self, url: str):
        self.url = url
        # Memoized extracted text; populated on first fetch_text() call.
        self._content: Optional[str] = None

    def authenticate(self) -> bool:
        """No auth needed for public web pages."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the web page."""
        return [
            SourceFile(
                name=self.url.split("/")[-1] or "page",
                id=self.url,
                mime_type="text/html",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download and save the extracted text content."""
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        text = self.fetch_text()
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved web content to {destination}")
        return destination

    def fetch_text(self) -> str:
        """Fetch the URL and extract main text content (memoized)."""
        if self._content is not None:
            return self._content

        import requests

        resp = requests.get(self.url, timeout=30, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Guard ONLY the optional import. Previously all bs4 parsing sat in
        # the try, so any ImportError during parsing was silently downgraded
        # to the much cruder regex fallback.
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            logger.debug("beautifulsoup4 not available, using regex fallback")
            self._content = _strip_html_tags(resp.text)
            return self._content

        soup = BeautifulSoup(resp.text, "html.parser")
        # Remove non-content elements
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()
        # Prefer <article> or <main> if present
        main = soup.find("article") or soup.find("main") or soup.find("body")
        self._content = main.get_text(separator="\n", strip=True) if main else soup.get_text()
        return self._content
--- a/video_processor/sources/youtube_source.py
+++ b/video_processor/sources/youtube_source.py
@@ -0,0 +1,118 @@
1
+"""YouTube source connector using yt-dlp for video/audio download and caption extraction."""
2
+
3
+import logging
4
+import re
5
+from pathlib import Path
6
+from typing import List, Optional
7
+
8
+from video_processor.sources.base import BaseSource, SourceFile
9
+
10
+logger = logging.getLogger(__name__)
11
+
12
+_YT_URL_PATTERN = re.compile(
13
+ r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/shorts/)([\w-]{11})"
14
+)
15
+
16
+
17
+def _extract_video_id(url: str) -> str:
18
+ """Extract the 11-character video ID from a YouTube URL."""
19
+ match = _YT_URL_PATTERN.search(url)
20
+ if not match:
21
+ raise ValueError(f"Could not extract YouTube video ID from: {url}")
22
+ return match.group(1)
23
+
24
+
25
+class YouTubeSource(BaseSource):
26
+ """
27
+ Download YouTube videos/audio and extract captions via yt-dlp.
28
+
29
+ Requires: pip install yt-dlp
30
+ """
31
+
32
+ def __init__(self, url: str, audio_only: bool = False):
33
+ self.url = url
34
+ self.video_id = _extract_video_id(url)
35
+ self.audio_only = audio_only
36
+
37
+ def authenticate(self) -> bool:
38
+ """No auth needed for public videos. Returns True if yt-dlp is available."""
39
+ try:
40
+ import yt_dlp # noqa: F401
41
+
42
+ return True
43
+ except ImportError:
44
+ logger.error("yt-dlp not installed. Run: pip install yt-dlp")
45
+ return False
46
+
47
+ def list_videos(
48
+ self,
49
+ folder_id: Optional[str] = None,
50
+ folder_path: Optional[str] = None,
51
+ patterns: Optional[List[str]] = None,
52
+ ) -> List[SourceFile]:
53
+ """Return a single SourceFile representing the YouTube video."""
54
+ import yt_dlp
55
+
56
+ with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
57
+ info = ydl.extract_info(self.url, download=False)
58
+
59
+ return [
60
+ SourceFile(
61
+ name=info.get("title", self.video_id),
62
+ id=self.video_id,
63
+ size_bytes=info.get("filesize"),
64
+ mime_type="audio/webm" if self.audio_only else "video/mp4",
65
+ )
66
+ ]
67
+
68
+ def download(self, file: SourceFile, destination: Path) -> Path:
69
+ """Download the video or audio to destination path."""
70
+ import yt_dlp
71
+
72
+ destination = Path(destination)
73
+ destination.parent.mkdir(parents=True, exist_ok=True)
74
+
75
+ opts = {
76
+ "outtmpl": str(destination),
77
+ "quiet": True,
78
+ }
79
+ if self.audio_only:
80
+ opts["format"] = "bestaudio/best"
81
+ opts["postprocessors"] = [{"key": "FFmpegExtractAudio", "preferredcodec": "mp3"}]
82
+ else:
83
+ opts["format"] = "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"
84
+
85
+ with yt_dlp.YoutubeDL(opts) as ydl:
86
+ ydl.download([self.url])
87
+
88
+ logger.info(f"Downloaded YouTube video {self.video_id} to {destination}")
89
+ return destination
90
+
91
+ def fetch_captions(self, lang: str = "en") -> Optional[str]:
92
+ """Extract auto-generated or manual captions as plain text."""
93
+ import yt_dlp
94
+
95
+ opts = {
96
+ "quiet": True,
97
+ "writeautomaticsub": True,
98
+ "writesubtitles": True,
99
+ "subtitleslangs": [lang],
100
+ "skip_download": True,
101
+ }
102
+ with yt_dlp.YoutubeDL(opts) as ydl:
103
+ info = ydl.extract_info(self.url, download=False)
104
+
105
+ subs = info.get("subtitles", {}).get(lang) or info.get("automatic_captions", {}).get(lang)
106
+ if not subs:
107
+ logger.warning(f"No captions found for language '{lang}'")
108
+ return None
109
+
110
+ # Prefer vtt/srv format for text extraction
111
+ for fmt in subs:
112
+ if fmt.get("ext") in ("vtt", "srv3", "json3"):
113
+ import requests
114
+
115
+ resp = requests.get(fmt["url"], timeout=30)
116
+ return resp.text
117
+
118
+ return None
--- a/video_processor/sources/youtube_source.py
+++ b/video_processor/sources/youtube_source.py
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/video_processor/sources/youtube_source.py
+++ b/video_processor/sources/youtube_source.py
@@ -0,0 +1,118 @@
1 """YouTube source connector using yt-dlp for video/audio download and caption extraction."""
2
3 import logging
4 import re
5 from pathlib import Path
6 from typing import List, Optional
7
8 from video_processor.sources.base import BaseSource, SourceFile
9
10 logger = logging.getLogger(__name__)
11
# Matches watch (v= in any query position), shorts, embed, live, and youtu.be
# URL forms; capture group 1 is the 11-character video ID.
_YT_URL_PATTERN = re.compile(
    r"(?:youtube\.com/(?:watch\?(?:[^#]*&)?v=|shorts/|embed/|live/)|youtu\.be/)"
    r"([\w-]{11})"
)


def _extract_video_id(url: str) -> str:
    """Extract the 11-character video ID from a YouTube URL.

    Generalized to accept ``v=`` anywhere in the query string (not only as the
    first parameter) and ``embed/`` / ``live/`` paths, in addition to the
    ``watch?v=``, ``youtu.be/`` and ``shorts/`` forms.

    Raises:
        ValueError: If no video ID can be found in *url*.
    """
    match = _YT_URL_PATTERN.search(url)
    if not match:
        raise ValueError(f"Could not extract YouTube video ID from: {url}")
    return match.group(1)
23
24
class YouTubeSource(BaseSource):
    """
    Download YouTube videos/audio and extract captions via yt-dlp.

    yt-dlp is imported lazily inside each method so importing this module does
    not require the optional dependency.

    Requires: pip install yt-dlp
    """

    def __init__(self, url: str, audio_only: bool = False):
        """
        Args:
            url: A YouTube watch/youtu.be/shorts URL; the video ID is parsed
                eagerly so an unrecognized URL fails fast with ValueError.
            audio_only: When True, download() extracts an mp3 audio track
                instead of the full video.
        """
        self.url = url
        self.video_id = _extract_video_id(url)
        self.audio_only = audio_only

    def authenticate(self) -> bool:
        """No auth needed for public videos. Returns True if yt-dlp is available."""
        try:
            import yt_dlp  # noqa: F401

            return True
        except ImportError:
            logger.error("yt-dlp not installed. Run: pip install yt-dlp")
            return False

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the YouTube video.

        The folder/pattern parameters exist only to satisfy the BaseSource
        interface and are ignored. Performs a metadata-only probe (no download).
        """
        import yt_dlp

        with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
            info = ydl.extract_info(self.url, download=False)

        return [
            SourceFile(
                name=info.get("title", self.video_id),
                id=self.video_id,
                size_bytes=info.get("filesize"),  # may be None when yt-dlp can't size it
                mime_type="audio/webm" if self.audio_only else "video/mp4",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download the video or audio to destination path.

        Args:
            file: Unused beyond the interface contract; the video is identified
                by ``self.url``.
            destination: Target path; parent directories are created.

        Returns:
            The destination path passed in.
        """
        import yt_dlp

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        opts = {
            "outtmpl": str(destination),
            "quiet": True,
        }
        if self.audio_only:
            opts["format"] = "bestaudio/best"
            # NOTE(review): FFmpegExtractAudio writes its output with an .mp3
            # extension, so the file on disk may not match *destination* when
            # its suffix differs — confirm callers pass an .mp3 destination.
            opts["postprocessors"] = [{"key": "FFmpegExtractAudio", "preferredcodec": "mp3"}]
        else:
            opts["format"] = "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"

        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([self.url])

        # Lazy %-args: the message is only formatted when INFO is enabled.
        logger.info("Downloaded YouTube video %s to %s", self.video_id, destination)
        return destination

    def fetch_captions(self, lang: str = "en") -> Optional[str]:
        """Fetch manual or auto-generated captions for *lang*.

        Manual subtitles are preferred over automatic captions when both exist.

        Returns:
            The raw subtitle document (vtt/srv3/json3 markup — NOT stripped to
            plain text), or None when no captions exist for *lang* or no
            supported format is offered.
        """
        import yt_dlp

        opts = {
            "quiet": True,
            "writeautomaticsub": True,
            "writesubtitles": True,
            "subtitleslangs": [lang],
            "skip_download": True,
        }
        with yt_dlp.YoutubeDL(opts) as ydl:
            info = ydl.extract_info(self.url, download=False)

        subs = info.get("subtitles", {}).get(lang) or info.get("automatic_captions", {}).get(lang)
        if not subs:
            logger.warning("No captions found for language '%s'", lang)
            return None

        # Prefer vtt/srv format for text extraction
        for fmt in subs:
            if fmt.get("ext") in ("vtt", "srv3", "json3"):
                # Deferred import: requests is only required when a caption
                # document is actually fetched.
                import requests

                resp = requests.get(fmt["url"], timeout=30)
                return resp.text

        return None

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button