PlanOpticon

planopticon / video_processor / sources / rss_source.py

Source Blame History 114 lines

0981a08…	noreply	1	"""RSS/Atom feed source connector."""
0981a08…	noreply	2
0981a08…	noreply	3	import logging
0981a08…	noreply	4	from pathlib import Path
0981a08…	noreply	5	from typing import List, Optional
0981a08…	noreply	6
0981a08…	noreply	7	from video_processor.sources.base import BaseSource, SourceFile
0981a08…	noreply	8
0981a08…	noreply	9	logger = logging.getLogger(__name__)
0981a08…	noreply	10
0981a08…	noreply	11
class RSSSource(BaseSource):
    """
    Parse RSS/Atom feeds and extract entries as text documents.

    Every feed entry is normalised to a dict with the keys ``title``,
    ``link``, ``summary``, ``published`` and ``id``. The feed is fetched
    lazily on the first call to :meth:`list_videos` or :meth:`download` and
    the parsed entries are cached on the instance afterwards.

    Optional: pip install feedparser (falls back to xml.etree.ElementTree)
    Requires: pip install requests
    """

    # Namespace map used by the stdlib fallback parser for Atom documents.
    _ATOM_NS = {"atom": "http://www.w3.org/2005/Atom"}

    def __init__(self, url: str, max_entries: int = 50):
        """
        Args:
            url: Address of the RSS/Atom feed.
            max_entries: Upper bound on the number of entries kept.
        """
        self.url = url
        self.max_entries = max_entries
        self._entries: List[dict] = []
        # Separate flag so that a feed with zero entries is not re-fetched
        # on every call.
        self._loaded = False

    def authenticate(self) -> bool:
        """No auth needed for public feeds."""
        return True

    def _parse_feed(self) -> None:
        """
        Fetch and parse the feed, populating ``self._entries``.

        Does nothing when the feed was already loaded (or entries were
        populated by other means).

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        if self._loaded or self._entries:
            return

        import requests

        resp = requests.get(self.url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Without an explicit charset in the HTTP header, requests only
        # guesses the encoding (ISO-8859-1 for text/* content types), which
        # garbles UTF-8 feeds. In that case give the parser the raw bytes so
        # it can honour the XML declaration instead.
        content_type = resp.headers.get("Content-Type", "")
        payload = resp.text if "charset" in content_type.lower() else resp.content

        # Keep the try body to the import alone so an ImportError raised
        # while parsing is not mistaken for "feedparser is not installed".
        try:
            import feedparser
        except ImportError:
            logger.debug("feedparser not available, using xml.etree fallback")
            self._parse_xml(payload)
        else:
            feed = feedparser.parse(payload)
            self._entries.extend(
                {
                    "title": entry.get("title", "Untitled"),
                    "link": entry.get("link", ""),
                    "summary": entry.get("summary", ""),
                    "published": entry.get("published", ""),
                    "id": entry.get("id", entry.get("link", "")),
                }
                for entry in feed.entries[: self.max_entries]
            )

        self._loaded = True

    @staticmethod
    def _first_text(item, paths, ns) -> str:
        """Return the first non-blank, stripped text among *paths*, or ""."""
        for path in paths:
            value = (item.findtext(path, namespaces=ns) or "").strip()
            if value:
                return value
        return ""

    @staticmethod
    def _atom_link(item, ns) -> str:
        """Return the href of the entry's alternate Atom link, or ""."""
        links = item.findall("atom:link", ns)
        for link_el in links:
            # A missing rel attribute means "alternate" in Atom.
            if link_el.get("rel", "alternate") == "alternate":
                return link_el.get("href", "")
        # No alternate link: fall back to whatever link comes first.
        return links[0].get("href", "") if links else ""

    def _parse_xml(self, text: "str | bytes") -> None:
        """
        Fallback parser using stdlib xml.etree.

        Handles RSS 2.0 (``<item>``) and Atom (``<entry>``) documents and
        appends at most ``max_entries`` normalised entries to
        ``self._entries``.

        Args:
            text: The feed document, either decoded text or raw bytes.

        Raises:
            xml.etree.ElementTree.ParseError: If the document is not
                well-formed XML.
        """
        import xml.etree.ElementTree as ET

        ns = self._ATOM_NS
        # NOTE(security): xml.etree is not hardened against maliciously
        # constructed documents (e.g. entity-expansion attacks). Feeds come
        # from the network; consider a hardened parser for untrusted URLs.
        root = ET.fromstring(text)

        # RSS 2.0 items first, Atom entries otherwise.
        items = root.findall(".//item") or root.findall(".//atom:entry", ns)
        for item in items[: self.max_entries]:
            title = self._first_text(item, ("title", "atom:title"), ns) or "Untitled"
            link = self._first_text(item, ("link",), ns) or self._atom_link(item, ns)
            desc = self._first_text(item, ("description", "atom:summary"), ns)
            pub = self._first_text(item, ("pubDate", "atom:published"), ns)
            # Prefer the feed's own identifier, as the feedparser path does,
            # and only fall back to the link when none is present.
            entry_id = self._first_text(item, ("guid", "atom:id"), ns) or link
            self._entries.append(
                {"title": title, "link": link, "summary": desc, "published": pub, "id": entry_id}
            )

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """
        List feed entries as SourceFiles.

        The ``folder_id``, ``folder_path`` and ``patterns`` arguments are
        accepted but not used by this method.

        Returns:
            One ``SourceFile`` per cached entry, with ``mime_type`` set to
            ``"text/plain"``.
        """
        self._parse_feed()
        return [
            SourceFile(
                name=e["title"], id=e["id"], mime_type="text/plain", modified_at=e["published"]
            )
            for e in self._entries
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """
        Write an entry's content as a text file.

        Args:
            file: The entry to write; matched against cached entries by id.
            destination: Target file path; parent directories are created.

        Returns:
            The path that was written.

        Raises:
            ValueError: If no cached entry has the id ``file.id``.
        """
        self._parse_feed()

        # Resolve the entry before touching the filesystem so a bad id does
        # not leave empty directories behind.
        entry = next((e for e in self._entries if e["id"] == file.id), None)
        if entry is None:
            raise ValueError(f"Entry not found: {file.id}")

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        text = (
            f"# {entry['title']}\n\n"
            f"Published: {entry['published']}\n"
            f"Link: {entry['link']}\n\n"
            f"{entry['summary']}"
        )
        destination.write_text(text, encoding="utf-8")
        logger.info("Saved RSS entry to %s", destination)
        return destination

PlanOpticon

Keyboard Shortcuts