PlanOpticon

planopticon / video_processor / sources / web_source.py
Blame History Raw 91 lines
1
"""Web page source connector for fetching and extracting text from URLs."""
2
3
import logging
4
import re
5
from pathlib import Path
6
from typing import List, Optional
7
8
from video_processor.sources.base import BaseSource, SourceFile
9
10
logger = logging.getLogger(__name__)
11
12
13
def _strip_html_tags(html: str) -> str:
14
"""Minimal HTML tag stripper using stdlib only."""
15
text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
16
text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
17
text = re.sub(r"<(nav|footer|header)[^>]*>.*?</\1>", "", text, flags=re.DOTALL | re.IGNORECASE)
18
text = re.sub(r"<[^>]+>", " ", text)
19
text = re.sub(r"\s+", " ", text).strip()
20
return text
21
22
23
class WebSource(BaseSource):
    """
    Fetch a web page and extract its main text content.

    Uses requests + BeautifulSoup (optional) for content extraction.
    Falls back to regex-based tag stripping if bs4 is unavailable.

    Requires: pip install requests (included in most environments)
    Optional: pip install beautifulsoup4 lxml
    """

    def __init__(self, url: str):
        """Store the target URL; nothing is fetched until ``fetch_text``."""
        self.url = url
        # Extracted text, cached after the first successful fetch.
        self._content: Optional[str] = None

    def authenticate(self) -> bool:
        """No auth needed for public web pages."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the web page.

        The ``folder_id``, ``folder_path`` and ``patterns`` arguments are
        accepted but not used by this source.
        """
        return [
            SourceFile(
                # Last "/"-separated piece of the URL; "page" when the URL
                # ends with a slash.
                name=self.url.split("/")[-1] or "page",
                id=self.url,
                mime_type="text/html",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Write the extracted page text to ``destination`` as UTF-8.

        Parent directories are created if missing. The text always comes from
        ``self.url``; ``file`` is not consulted.

        Returns:
            The destination path.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        text = self.fetch_text()
        destination.write_text(text, encoding="utf-8")
        logger.info("Saved web content to %s", destination)
        return destination

    def fetch_text(self) -> str:
        """Fetch the URL and extract main text content.

        The result is cached on the instance, so the page is requested at
        most once per successful fetch.

        Returns:
            The extracted plain text.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        if self._content is not None:
            return self._content

        # Imported lazily so the module can be imported without requests.
        import requests

        resp = requests.get(self.url, timeout=30, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # For text/* responses whose headers declare no charset, requests
        # assumes ISO-8859-1, which garbles UTF-8 pages. In that case use the
        # encoding detected from the body instead.
        content_type = resp.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            resp.encoding = resp.apparent_encoding
        html = resp.text

        # Only the import is guarded: an ImportError here means bs4 is not
        # installed, and nothing else should trigger the fallback.
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            logger.debug("beautifulsoup4 not available, using regex fallback")
            self._content = _strip_html_tags(html)
            return self._content

        soup = BeautifulSoup(html, "html.parser")
        # Remove non-content elements
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()
        # Prefer <article> or <main> if present
        main = soup.find("article") or soup.find("main") or soup.find("body")
        self._content = main.get_text(separator="\n", strip=True) if main else soup.get_text()

        return self._content
91

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button