"""Web page source connector for fetching and extracting text from URLs."""
import logging
import re
from html import unescape
from pathlib import Path
from typing import List, Optional

from video_processor.sources.base import BaseSource, SourceFile
9
|
|
|
10
|
# Module-level logger; handlers and levels are configured by the application.
logger = logging.getLogger(__name__)
11
|
|
|
12
|
|
|
13
|
def _strip_html_tags(html: str) -> str: |
|
14
|
"""Minimal HTML tag stripper using stdlib only.""" |
|
15
|
text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE) |
|
16
|
text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE) |
|
17
|
text = re.sub(r"<(nav|footer|header)[^>]*>.*?</\1>", "", text, flags=re.DOTALL | re.IGNORECASE) |
|
18
|
text = re.sub(r"<[^>]+>", " ", text) |
|
19
|
text = re.sub(r"\s+", " ", text).strip() |
|
20
|
return text |
|
21
|
|
|
22
|
|
|
23
|
class WebSource(BaseSource):
    """
    Fetch web pages and extract main text content.

    Uses requests + BeautifulSoup (optional) for content extraction.
    Falls back to regex-based tag stripping if bs4 is unavailable.

    Requires: pip install requests (included in most environments)
    Optional: pip install beautifulsoup4 lxml
    """

    def __init__(self, url: str):
        """
        Args:
            url: Absolute URL of the page to fetch.
        """
        self.url = url
        # Cached extracted text; populated lazily by fetch_text().
        self._content: Optional[str] = None

    def authenticate(self) -> bool:
        """No auth needed for public web pages."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the web page.

        The folder/pattern arguments are accepted for interface
        compatibility with other sources but are ignored — a single web
        page has no folder structure to browse.
        """
        # Derive a filesystem-friendly name from the URL path only: drop any
        # query string or fragment so "page?id=1" does not leak into the name.
        path_part = self.url.split("?", 1)[0].split("#", 1)[0]
        return [
            SourceFile(
                name=path_part.split("/")[-1] or "page",
                id=self.url,
                mime_type="text/html",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download and save the extracted text content.

        Args:
            file: Accepted for interface compatibility; the URL given at
                construction time is what gets fetched.
            destination: Path to write the extracted UTF-8 text to.
                Parent directories are created as needed.

        Returns:
            The destination path, for chaining.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        text = self.fetch_text()
        destination.write_text(text, encoding="utf-8")
        # Lazy %-formatting so the message is only built if INFO is enabled.
        logger.info("Saved web content to %s", destination)
        return destination

    def fetch_text(self) -> str:
        """Fetch the URL and extract main text content.

        The extracted text is cached on the instance, so repeated calls
        perform only one HTTP request.

        Returns:
            The page's main text content.

        Raises:
            ImportError: If the required 'requests' package is missing.
            requests.HTTPError: On a non-2xx HTTP response.
        """
        if self._content is not None:
            return self._content

        try:
            import requests
        except ImportError as err:
            # Fail with an actionable message; the class docstring documents
            # this hard dependency but a bare ImportError obscures it.
            raise ImportError(
                "WebSource requires the 'requests' package: pip install requests"
            ) from err

        resp = requests.get(self.url, timeout=30, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        try:
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(resp.text, "html.parser")
            # Remove non-content elements
            for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
                tag.decompose()
            # Prefer <article> or <main> if present
            main = soup.find("article") or soup.find("main") or soup.find("body")
            self._content = main.get_text(separator="\n", strip=True) if main else soup.get_text()
        except ImportError:
            # bs4 is optional; degrade to the stdlib-only regex stripper.
            logger.debug("beautifulsoup4 not available, using regex fallback")
            self._content = _strip_html_tags(resp.text)

        return self._content