|
1
|
"""RSS/Atom feed source connector.""" |
|
2
|
|
|
3
|
import logging |
|
4
|
from pathlib import Path |
|
5
|
from typing import List, Optional |
|
6
|
|
|
7
|
from video_processor.sources.base import BaseSource, SourceFile |
|
8
|
|
|
9
|
logger = logging.getLogger(__name__) |
|
10
|
|
|
11
|
|
|
12
|
class RSSSource(BaseSource):
    """
    Parse RSS/Atom feeds and extract entries as text documents.

    Entries are fetched once per instance and cached in ``self._entries``;
    each entry is a dict with ``title``, ``link``, ``summary``,
    ``published`` and ``id`` keys.

    Optional: pip install feedparser (falls back to xml.etree.ElementTree)
    Requires: pip install requests
    """

    def __init__(self, url: str, max_entries: int = 50):
        """
        Args:
            url: Feed URL (RSS 2.0 or Atom).
            max_entries: Maximum number of feed entries to keep.
        """
        self.url = url
        self.max_entries = max_entries
        self._entries: List[dict] = []
        # Distinguishes "not fetched yet" from "fetched but feed was empty",
        # so an empty feed is not re-downloaded on every list/download call.
        self._parsed: bool = False

    def authenticate(self) -> bool:
        """No auth needed for public feeds."""
        return True

    def _parse_feed(self) -> None:
        """Fetch and parse the feed once; results are cached in ``self._entries``.

        Raises:
            requests.HTTPError: If the feed URL returns an error status.
        """
        if self._parsed:
            return

        import requests

        resp = requests.get(self.url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Probe for feedparser availability up front so an ImportError raised
        # deep inside the parsing code is not mistaken for "feedparser missing".
        try:
            import feedparser
        except ImportError:
            logger.debug("feedparser not available, using xml.etree fallback")
            self._parse_xml(resp.text)
        else:
            feed = feedparser.parse(resp.text)
            for entry in feed.entries[: self.max_entries]:
                self._entries.append(
                    {
                        "title": entry.get("title", "Untitled"),
                        "link": entry.get("link", ""),
                        "summary": entry.get("summary", ""),
                        "published": entry.get("published", ""),
                        "id": entry.get("id", entry.get("link", "")),
                    }
                )
        self._parsed = True

    def _parse_xml(self, text: str) -> None:
        """Fallback parser using stdlib xml.etree for RSS 2.0 and Atom feeds.

        Args:
            text: Raw XML body of the feed.
        """
        import xml.etree.ElementTree as ET

        root = ET.fromstring(text)
        ns = {"atom": "http://www.w3.org/2005/Atom"}
        # RSS 2.0 uses <item>; Atom uses the namespaced <entry> element.
        items = root.findall(".//item") or root.findall(".//atom:entry", ns)
        for item in items[: self.max_entries]:
            title = (
                item.findtext("title") or item.findtext("atom:title", namespaces=ns) or "Untitled"
            )
            link = item.findtext("link") or ""
            if not link:
                # Atom carries the URL in <link href="...">, not in element text.
                link_el = item.find("atom:link", ns)
                link = link_el.get("href", "") if link_el is not None else ""
            desc = (
                item.findtext("description") or item.findtext("atom:summary", namespaces=ns) or ""
            )
            pub = item.findtext("pubDate") or item.findtext("atom:published", namespaces=ns) or ""
            self._entries.append(
                {"title": title, "link": link, "summary": desc, "published": pub, "id": link}
            )

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List feed entries as SourceFiles.

        Args:
            folder_id: Unused for feeds; accepted for interface compatibility.
            folder_path: Unused for feeds; accepted for interface compatibility.
            patterns: Unused for feeds; accepted for interface compatibility.

        Returns:
            One SourceFile per cached feed entry (mime type ``text/plain``).
        """
        self._parse_feed()
        return [
            SourceFile(
                name=e["title"], id=e["id"], mime_type="text/plain", modified_at=e["published"]
            )
            for e in self._entries
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Write an entry's content as a text file.

        Args:
            file: SourceFile whose ``id`` identifies the feed entry.
            destination: Target path for the text file (parents are created).

        Returns:
            The destination path.

        Raises:
            ValueError: If no cached entry matches ``file.id``.
        """
        self._parse_feed()
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        entry = next((e for e in self._entries if e["id"] == file.id), None)
        if entry is None:
            raise ValueError(f"Entry not found: {file.id}")

        text = (
            f"# {entry['title']}\n\n"
            f"Published: {entry['published']}\n"
            f"Link: {entry['link']}\n\n"
            f"{entry['summary']}"
        )
        destination.write_text(text, encoding="utf-8")
        # Lazy %-formatting: the message is only built if INFO is enabled.
        logger.info("Saved RSS entry to %s", destination)
        return destination