PlanOpticon

planopticon / video_processor / sources / rss_source.py
Blame History Raw 115 lines
1
"""RSS/Atom feed source connector."""
2
3
import logging
from pathlib import Path
from typing import List, Optional, Union

from video_processor.sources.base import BaseSource, SourceFile
8
9
logger = logging.getLogger(__name__)
10
11
12
class RSSSource(BaseSource):
    """
    Parse RSS/Atom feeds and expose their entries as text documents.

    Each entry is kept as a dict with the keys ``title``, ``link``,
    ``summary``, ``published`` and ``id``. The feed is downloaded lazily, the
    first time entries are listed or downloaded.

    Optional: pip install feedparser (falls back to xml.etree.ElementTree)
    Requires: pip install requests
    """

    # Prefix map for Atom element lookups in the stdlib fallback parser.
    _ATOM_NS = {"atom": "http://www.w3.org/2005/Atom"}

    def __init__(self, url: str, max_entries: int = 50):
        """
        Args:
            url: Address of the RSS or Atom feed.
            max_entries: Upper bound on the number of entries kept from the feed.
        """
        self.url = url
        self.max_entries = max_entries
        self._entries: List[dict] = []
        # Set after a successful fetch so an empty feed is not downloaded again
        # on every list/download call.
        self._fetched = False

    def authenticate(self) -> bool:
        """No auth needed for public feeds."""
        return True

    def _parse_feed(self) -> None:
        """Fetch and parse the feed, caching the entries on the instance.

        Does nothing when entries are already present or a previous fetch
        succeeded. Uses feedparser when it is importable, otherwise the
        xml.etree based fallback.

        Raises:
            requests.RequestException: If the HTTP request fails, including
                the HTTPError raised for an error status code.
        """
        if self._entries or self._fetched:
            return

        import requests

        resp = requests.get(self.url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Give the parsers the raw bytes rather than resp.text: requests falls
        # back to ISO-8859-1 for text/* responses that omit a charset, which
        # garbles UTF-8 feeds. With bytes, the XML declaration decides.
        payload = resp.content

        # Only the import is guarded, so an ImportError raised while parsing
        # is not mistaken for "feedparser is not installed".
        try:
            import feedparser
        except ImportError:
            logger.debug("feedparser not available, using xml.etree fallback")
            self._parse_xml(payload)
        else:
            feed = feedparser.parse(payload)
            self._entries.extend(
                {
                    "title": entry.get("title", "Untitled"),
                    "link": entry.get("link", ""),
                    "summary": entry.get("summary", ""),
                    "published": entry.get("published", ""),
                    "id": entry.get("id", entry.get("link", "")),
                }
                for entry in feed.entries[: self.max_entries]
            )

        self._fetched = True

    def _first_text(self, item, *paths: str) -> str:
        """Return the first non-blank, stripped text among ``paths`` under ``item``.

        Returns an empty string when none of the paths yields text.
        """
        for path in paths:
            value = (item.findtext(path, namespaces=self._ATOM_NS) or "").strip()
            if value:
                return value
        return ""

    def _atom_href(self, item) -> str:
        """Return the href of the entry's Atom link, preferring ``rel="alternate"``.

        A link without a ``rel`` attribute counts as "alternate". When no such
        link exists, the first ``<link>`` element's href is used, so feeds
        with a single link keep resolving as before.
        """
        links = item.findall("atom:link", self._ATOM_NS)
        for link_el in links:
            href = link_el.get("href", "")
            if href and link_el.get("rel", "alternate") == "alternate":
                return href
        return links[0].get("href", "") if links else ""

    def _parse_xml(self, text: Union[str, bytes]) -> None:
        """Fallback parser using stdlib xml.etree.

        Reads RSS ``<item>`` elements, or Atom ``<entry>`` elements when the
        document has no items, and appends at most ``max_entries`` of them to
        the entry cache.

        Args:
            text: The feed document, as raw bytes or an already decoded string.

        Raises:
            xml.etree.ElementTree.ParseError: If the document is not
                well-formed XML.
        """
        import xml.etree.ElementTree as ET

        # NOTE(review): xml.etree is not hardened against maliciously crafted
        # XML (e.g. entity expansion). Consider a hardened parser if feeds
        # from untrusted hosts are processed.
        root = ET.fromstring(text)

        # RSS items are tried first; Atom entries are the fallback.
        items = root.findall(".//item") or root.findall(".//atom:entry", self._ATOM_NS)
        for item in items[: self.max_entries]:
            link = self._first_text(item, "link") or self._atom_href(item)
            self._entries.append(
                {
                    "title": self._first_text(item, "title", "atom:title") or "Untitled",
                    "link": link,
                    "summary": self._first_text(item, "description", "atom:summary"),
                    "published": self._first_text(item, "pubDate", "atom:published"),
                    # Prefer the feed's own identifier, as the feedparser path
                    # does, so entries without a link do not all share the
                    # same empty id.
                    "id": self._first_text(item, "guid", "atom:id") or link,
                }
            )

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List feed entries as SourceFiles.

        Args:
            folder_id: Unused here; presumably kept to match the BaseSource
                signature.
            folder_path: Unused here, see ``folder_id``.
            patterns: Unused here, see ``folder_id``.

        Returns:
            One SourceFile per cached entry, with the entry title as name and
            the published string as modification time.
        """
        self._parse_feed()
        return [
            SourceFile(
                name=e["title"], id=e["id"], mime_type="text/plain", modified_at=e["published"]
            )
            for e in self._entries
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Write an entry's content as a text file.

        Args:
            file: The entry to write; it is looked up by ``file.id``.
            destination: Path of the file to create. Missing parent
                directories are created.

        Returns:
            The destination path.

        Raises:
            ValueError: If no cached entry has the requested id.
        """
        self._parse_feed()

        # Resolve the entry before touching the filesystem so a bad id does
        # not leave an empty directory behind.
        entry = next((e for e in self._entries if e["id"] == file.id), None)
        if entry is None:
            raise ValueError(f"Entry not found: {file.id}")

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        text = (
            f"# {entry['title']}\n\n"
            f"Published: {entry['published']}\n"
            f"Link: {entry['link']}\n\n"
            f"{entry['summary']}"
        )
        destination.write_text(text, encoding="utf-8")
        logger.info("Saved RSS entry to %s", destination)
        return destination
115

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button