PlanOpticon

planopticon / video_processor / sources / web_source.py
Source Blame History 90 lines
0981a08… noreply 1 """Web page source connector for fetching and extracting text from URLs."""
0981a08… noreply 2
0981a08… noreply 3 import logging
0981a08… noreply 4 import re
0981a08… noreply 5 from pathlib import Path
0981a08… noreply 6 from typing import List, Optional
0981a08… noreply 7
0981a08… noreply 8 from video_processor.sources.base import BaseSource, SourceFile
0981a08… noreply 9
0981a08… noreply 10 logger = logging.getLogger(__name__)
0981a08… noreply 11
0981a08… noreply 12
def _strip_html_tags(html: str) -> str:
    """Reduce an HTML document to plain text using only the standard library.

    Used as the fallback extractor when BeautifulSoup is not installed.
    The steps, in order:

    1. Drop ``<script>`` and ``<style>`` elements together with their bodies.
    2. Drop HTML comments (including conditional comments).
    3. Drop ``<nav>``, ``<footer>`` and ``<header>`` elements with their bodies.
    4. Replace every remaining tag with a space.
    5. Decode character references (``&amp;``, ``&nbsp;``, ``&#39;`` ...).
    6. Collapse runs of whitespace into single spaces and trim the ends.

    Args:
        html: Raw HTML markup.

    Returns:
        The visible text on a single line, words separated by single spaces.
    """
    # Imported locally: the ``html`` parameter shadows the stdlib module name,
    # so only the function we need is bound here.
    from html import unescape

    flags = re.DOTALL | re.IGNORECASE
    text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=flags)
    text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=flags)
    # Comments must go before the generic tag pass: a ">" inside a comment
    # would otherwise end the "<[^>]+>" match early and leak comment text.
    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    text = re.sub(r"<(nav|footer|header)[^>]*>.*?</\1>", "", text, flags=flags)
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities only after the tags are gone so that an escaped "&lt;b&gt;"
    # is kept as literal text instead of being stripped as markup.
    text = unescape(text)
    # "\s" also matches the non-breaking space produced by "&nbsp;".
    return re.sub(r"\s+", " ", text).strip()
0981a08… noreply 21
0981a08… noreply 22
class WebSource(BaseSource):
    """
    Fetch a web page and extract its main text content.

    Uses requests + BeautifulSoup (optional) for content extraction.
    Falls back to regex-based tag stripping if bs4 is unavailable.
    The extracted text is cached on the instance after the first fetch.

    Requires: pip install requests (included in most environments)
    Optional: pip install beautifulsoup4 lxml
    """

    def __init__(self, url: str):
        """Store the target URL; nothing is fetched until ``fetch_text`` runs."""
        self.url = url
        # Extracted page text, filled in by the first ``fetch_text`` call.
        self._content: Optional[str] = None

    def authenticate(self) -> bool:
        """No auth needed for public web pages."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the web page.

        The three parameters are accepted but not used by this source.
        The file name is the last path segment of the URL, or ``"page"``
        when that segment is empty (for example a URL ending in ``/``).
        """
        # Drop the query string and fragment first so they never end up in
        # the name ("report?id=3" is not a usable file name).
        base = re.split(r"[?#]", self.url, maxsplit=1)[0]
        return [
            SourceFile(
                name=base.split("/")[-1] or "page",
                id=self.url,
                mime_type="text/html",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Fetch the page and save its extracted text to ``destination``.

        ``file`` is not consulted; the content always comes from ``self.url``.
        Missing parent directories are created.

        Returns:
            The destination path that was written (UTF-8 encoded).
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(self.fetch_text(), encoding="utf-8")
        logger.info("Saved web content to %s", destination)
        return destination

    def fetch_text(self) -> str:
        """Fetch the URL and extract main text content.

        The result is cached, so the network is hit at most once per
        instance. Any exception raised by ``requests.get`` or by
        ``raise_for_status`` (non-2xx response) propagates to the caller.

        Returns:
            The extracted page text.
        """
        if self._content is not None:
            return self._content

        import requests

        resp = requests.get(self.url, timeout=30, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # When a text/* response carries no charset, requests decodes it as
        # ISO-8859-1, which garbles UTF-8 pages. Use the encoding detected
        # from the body in that case.
        content_type = resp.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            resp.encoding = resp.apparent_encoding

        # Only the optional import is guarded, so an ImportError raised
        # during parsing is not mistaken for "bs4 is missing".
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            logger.debug("beautifulsoup4 not available, using regex fallback")
            self._content = _strip_html_tags(resp.text)
            return self._content

        soup = BeautifulSoup(resp.text, "html.parser")
        # Remove non-content elements
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()
        # Prefer <article> or <main> if present
        main = soup.find("article") or soup.find("main") or soup.find("body")
        if main:
            self._content = main.get_text(separator="\n", strip=True)
        else:
            self._content = soup.get_text()
        return self._content

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button