|
0981a08…
|
noreply
|
1 |
"""Web page source connector for fetching and extracting text from URLs.""" |
|
0981a08…
|
noreply
|
2 |
|
|
0981a08…
|
noreply
|
3 |
import logging |
|
0981a08…
|
noreply
|
4 |
import re |
|
0981a08…
|
noreply
|
5 |
from pathlib import Path |
|
0981a08…
|
noreply
|
6 |
from typing import List, Optional |
|
0981a08…
|
noreply
|
7 |
|
|
0981a08…
|
noreply
|
8 |
from video_processor.sources.base import BaseSource, SourceFile |
|
0981a08…
|
noreply
|
9 |
|
|
0981a08…
|
noreply
|
10 |
logger = logging.getLogger(__name__) |
|
0981a08…
|
noreply
|
11 |
|
|
0981a08…
|
noreply
|
12 |
|
|
0981a08…
|
noreply
|
13 |
def _strip_html_tags(html: str) -> str: |
|
0981a08…
|
noreply
|
14 |
"""Minimal HTML tag stripper using stdlib only.""" |
|
0981a08…
|
noreply
|
15 |
text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE) |
|
0981a08…
|
noreply
|
16 |
text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE) |
|
0981a08…
|
noreply
|
17 |
text = re.sub(r"<(nav|footer|header)[^>]*>.*?</\1>", "", text, flags=re.DOTALL | re.IGNORECASE) |
|
0981a08…
|
noreply
|
18 |
text = re.sub(r"<[^>]+>", " ", text) |
|
0981a08…
|
noreply
|
19 |
text = re.sub(r"\s+", " ", text).strip() |
|
0981a08…
|
noreply
|
20 |
return text |


class WebSource(BaseSource):
    """
    Fetch web pages and extract main text content.

    Uses requests + BeautifulSoup (optional) for content extraction.
    Falls back to regex-based tag stripping if bs4 is unavailable.

    Requires: pip install requests (included in most environments)
    Optional: pip install beautifulsoup4 lxml
    """

    def __init__(self, url: str):
        self.url = url
        # Cached extracted text; populated lazily by fetch_text().
        self._content: Optional[str] = None

    def authenticate(self) -> bool:
        """No auth needed for public web pages."""
        return True

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return a single SourceFile representing the web page.

        The folder/pattern arguments are part of the BaseSource interface
        and are ignored: a WebSource always represents exactly one page.
        """
        # Lazy stdlib import, consistent with the file's lazy `requests` import.
        from urllib.parse import urlparse

        # Derive the name from the URL *path* only, so query strings and
        # fragments ("page?id=1#top") don't leak into the file name. Fall
        # back to the hostname for domain-only URLs, then to "page".
        parsed = urlparse(self.url)
        name = parsed.path.rstrip("/").split("/")[-1] or parsed.netloc or "page"
        return [
            SourceFile(
                name=name,
                id=self.url,
                mime_type="text/html",
            )
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download and save the extracted text content.

        Args:
            file: Ignored; a WebSource always fetches its own URL.
            destination: File path to write the UTF-8 text to. Parent
                directories are created as needed.

        Returns:
            The destination path.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        text = self.fetch_text()
        destination.write_text(text, encoding="utf-8")
        logger.info(f"Saved web content to {destination}")
        return destination

    def fetch_text(self) -> str:
        """Fetch the URL and extract main text content.

        The extracted text is cached on the instance; subsequent calls
        return the cached value without re-fetching.

        Raises:
            requests.HTTPError: if the server responds with an error status.
        """
        if self._content is not None:
            return self._content

        # Imported lazily so the module can be imported without requests.
        import requests

        resp = requests.get(self.url, timeout=30, headers={"User-Agent": "PlanOpticon/0.3"})
        resp.raise_for_status()

        # Keep the try body minimal: only the optional import can raise
        # ImportError, so a genuine bs4 parsing failure is never swallowed
        # into the regex fallback.
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            logger.debug("beautifulsoup4 not available, using regex fallback")
            self._content = _strip_html_tags(resp.text)
            return self._content

        soup = BeautifulSoup(resp.text, "html.parser")
        # Remove non-content elements
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()
        # Prefer <article> or <main> if present
        main = soup.find("article") or soup.find("main") or soup.find("body")
        self._content = main.get_text(separator="\n", strip=True) if main else soup.get_text()
        return self._content