|
1
|
"""RSS/Atom feed source connector.""" |
|
2
|
|
|
3
|
import logging |
|
4
|
from pathlib import Path |
|
5
|
from typing import List, Optional |
|
6
|
|
|
7
|
from video_processor.sources.base import BaseSource, SourceFile |
|
8
|
|
|
9
|
logger = logging.getLogger(__name__) |
|
10
|
|
|
11
|
|
|
12
|
class RSSSource(BaseSource):
    """
    Parse RSS/Atom feeds and extract entries as text documents.

    Entries are fetched once per instance and cached in ``self._entries``;
    each entry is a dict with ``title``, ``link``, ``summary``,
    ``published`` and ``id`` keys.

    Optional: pip install feedparser (falls back to xml.etree.ElementTree)
    Requires: pip install requests
    """

    def __init__(self, url: str, max_entries: int = 50):
        """
        Args:
            url: Feed URL (RSS 2.0 or Atom).
            max_entries: Maximum number of feed entries to keep.
        """
        self.url = url
        self.max_entries = max_entries
        self._entries: List[dict] = []
        # Distinguishes "not fetched yet" from "fetched but feed was empty",
        # so an empty feed is not re-downloaded on every list/download call.
        self._parsed: bool = False

    def authenticate(self) -> bool:
        """No auth needed for public feeds."""
        return True

    def _parse_feed(self) -> None:
        """Fetch and parse the feed once; results are cached in ``self._entries``.

        Raises:
            requests.HTTPError: If the feed URL returns an error status.
        """
        if self._parsed:
            return

        import requests

        resp = requests.get(self.url, timeout=15, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Probe for feedparser availability up front so an ImportError raised
        # deep inside the parsing code is not mistaken for "feedparser missing".
        try:
            import feedparser
        except ImportError:
            logger.debug("feedparser not available, using xml.etree fallback")
            self._parse_xml(resp.text)
        else:
            feed = feedparser.parse(resp.text)
            for entry in feed.entries[: self.max_entries]:
                self._entries.append(
                    {
                        "title": entry.get("title", "Untitled"),
                        "link": entry.get("link", ""),
                        "summary": entry.get("summary", ""),
                        "published": entry.get("published", ""),
                        "id": entry.get("id", entry.get("link", "")),
                    }
                )
        self._parsed = True

    def _parse_xml(self, text: str) -> None:
        """Fallback parser using stdlib xml.etree for RSS 2.0 and Atom feeds.

        Args:
            text: Raw XML body of the feed.
        """
        import xml.etree.ElementTree as ET

        root = ET.fromstring(text)
        ns = {"atom": "http://www.w3.org/2005/Atom"}
        # RSS 2.0 uses <item>; Atom uses the namespaced <entry> element.
        items = root.findall(".//item") or root.findall(".//atom:entry", ns)
        for item in items[: self.max_entries]:
            title = (
                item.findtext("title") or item.findtext("atom:title", namespaces=ns) or "Untitled"
            )
            link = item.findtext("link") or ""
            if not link:
                # Atom carries the URL in <link href="...">, not in element text.
                link_el = item.find("atom:link", ns)
                link = link_el.get("href", "") if link_el is not None else ""
            desc = (
                item.findtext("description") or item.findtext("atom:summary", namespaces=ns) or ""
            )
            pub = item.findtext("pubDate") or item.findtext("atom:published", namespaces=ns) or ""
            self._entries.append(
                {"title": title, "link": link, "summary": desc, "published": pub, "id": link}
            )

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """List feed entries as SourceFiles.

        Args:
            folder_id: Unused for feeds; accepted for interface compatibility.
            folder_path: Unused for feeds; accepted for interface compatibility.
            patterns: Unused for feeds; accepted for interface compatibility.

        Returns:
            One SourceFile per cached feed entry (mime type ``text/plain``).
        """
        self._parse_feed()
        return [
            SourceFile(
                name=e["title"], id=e["id"], mime_type="text/plain", modified_at=e["published"]
            )
            for e in self._entries
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Write an entry's content as a text file.

        Args:
            file: SourceFile whose ``id`` identifies the feed entry.
            destination: Target path for the text file (parents are created).

        Returns:
            The destination path.

        Raises:
            ValueError: If no cached entry matches ``file.id``.
        """
        self._parse_feed()
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)

        entry = next((e for e in self._entries if e["id"] == file.id), None)
        if entry is None:
            raise ValueError(f"Entry not found: {file.id}")

        text = (
            f"# {entry['title']}\n\n"
            f"Published: {entry['published']}\n"
            f"Link: {entry['link']}\n\n"
            f"{entry['summary']}"
        )
        destination.write_text(text, encoding="utf-8")
        # Lazy %-formatting: the message is only built if INFO is enabled.
        logger.info("Saved RSS entry to %s", destination)
        return destination