"""Web page source connector for fetching and extracting text from URLs."""
import logging
import re
from html import unescape
from pathlib import Path
from typing import List, Optional

from video_processor.sources.base import BaseSource, SourceFile
9
|
|
|
10
|
# Module-level logger; handlers and levels are configured by the application.
logger = logging.getLogger(__name__)
11
|
|
|
12
|
|
|
13
|
def _strip_html_tags(html: str) -> str: |
|
14
|
"""Minimal HTML tag stripper using stdlib only.""" |
|
15
|
text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE) |
|
16
|
text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE) |
|
17
|
text = re.sub(r"<(nav|footer|header)[^>]*>.*?</\1>", "", text, flags=re.DOTALL | re.IGNORECASE) |
|
18
|
text = re.sub(r"<[^>]+>", " ", text) |
|
19
|
text = re.sub(r"\s+", " ", text).strip() |
|
20
|
return text |
|
21
|
|
|
22
|
|
|
23
|
class WebSource(BaseSource):
    """
    Fetch web pages and extract main text content.

    Uses requests + BeautifulSoup (optional) for content extraction.
    Falls back to regex-based tag stripping if bs4 is unavailable.

    Requires: pip install requests (included in most environments)
    Optional: pip install beautifulsoup4 lxml
    """

    def __init__(self, url: str):
        """
        Args:
            url: Absolute URL of the page to fetch.
        """
        self.url = url
        # Cached extracted text; populated lazily by fetch_text().
        self._content: Optional[str] = None

    def authenticate(self) -> bool:
        """No auth needed for public web pages."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the web page.

        The folder/pattern arguments are accepted for interface
        compatibility with other sources but are ignored — a single web
        page has no folder structure to browse.
        """
        # Derive a filesystem-friendly name from the URL path only: drop any
        # query string or fragment so "page?id=1" does not leak into the name.
        path_part = self.url.split("?", 1)[0].split("#", 1)[0]
        return [
            SourceFile(
                name=path_part.split("/")[-1] or "page",
                id=self.url,
                mime_type="text/html",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download and save the extracted text content.

        Args:
            file: Accepted for interface compatibility; the URL given at
                construction time is what gets fetched.
            destination: Path to write the extracted UTF-8 text to.
                Parent directories are created as needed.

        Returns:
            The destination path, for chaining.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        text = self.fetch_text()
        destination.write_text(text, encoding="utf-8")
        # Lazy %-formatting so the message is only built if INFO is enabled.
        logger.info("Saved web content to %s", destination)
        return destination

    def fetch_text(self) -> str:
        """Fetch the URL and extract main text content.

        The extracted text is cached on the instance, so repeated calls
        perform only one HTTP request.

        Returns:
            The page's main text content.

        Raises:
            ImportError: If the required 'requests' package is missing.
            requests.HTTPError: On a non-2xx HTTP response.
        """
        if self._content is not None:
            return self._content

        try:
            import requests
        except ImportError as err:
            # Fail with an actionable message; the class docstring documents
            # this hard dependency but a bare ImportError obscures it.
            raise ImportError(
                "WebSource requires the 'requests' package: pip install requests"
            ) from err

        resp = requests.get(self.url, timeout=30, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        try:
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(resp.text, "html.parser")
            # Remove non-content elements
            for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
                tag.decompose()
            # Prefer <article> or <main> if present
            main = soup.find("article") or soup.find("main") or soup.find("body")
            self._content = main.get_text(separator="\n", strip=True) if main else soup.get_text()
        except ImportError:
            # bs4 is optional; degrade to the stdlib-only regex stripper.
            logger.debug("beautifulsoup4 not available, using regex fallback")
            self._content = _strip_html_tags(resp.text)

        return self._content