PlanOpticon

planopticon / video_processor / sources / arxiv_source.py
Source Blame History 117 lines
0981a08… noreply 1 """arXiv source connector for fetching paper metadata and PDFs."""
0981a08… noreply 2
0981a08… noreply 3 import logging
0981a08… noreply 4 import re
0981a08… noreply 5 from pathlib import Path
0981a08… noreply 6 from typing import List, Optional
0981a08… noreply 7
0981a08… noreply 8 from video_processor.sources.base import BaseSource, SourceFile
0981a08… noreply 9
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Matches four digits, a dot, and four or five digits, optionally followed by
# a "vN" version suffix (for example "2301.12345" or "2301.12345v2").
_ARXIV_ID_PATTERN = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?")
# Endpoint queried for paper metadata.
ARXIV_API = "http://export.arxiv.org/api/query"
0981a08… noreply 14
0981a08… noreply 15
def _extract_arxiv_id(url_or_id: str) -> str:
    """Return the arXiv identifier embedded in *url_or_id*.

    The first substring matching ``_ARXIV_ID_PATTERN`` is returned in full,
    including the optional version suffix when one is present.

    Raises:
        ValueError: If the input contains no matching identifier.
    """
    found = _ARXIV_ID_PATTERN.search(url_or_id)
    if found is not None:
        return found[0]
    raise ValueError(f"Could not extract arXiv ID from: {url_or_id}")
0981a08… noreply 22
0981a08… noreply 23
class ArxivSource(BaseSource):
    """
    Fetch arXiv paper metadata and PDF.

    Uses the arXiv API (Atom feed) for metadata and direct PDF download.
    A paper is exposed as two ``SourceFile`` entries: a plain-text metadata
    summary (id ``meta:<arxiv_id>``) and the PDF (id ``pdf:<arxiv_id>``).

    Requires: pip install requests
    """

    def __init__(self, url_or_id: str):
        """
        Create a source for a single arXiv paper.

        Args:
            url_or_id: An arXiv URL or a bare identifier string.

        Raises:
            ValueError: If no arXiv ID can be extracted from ``url_or_id``.
        """
        self.arxiv_id = _extract_arxiv_id(url_or_id)
        # Filled in lazily by _fetch_metadata(); None until the first fetch.
        self._metadata: Optional[dict] = None

    def authenticate(self) -> bool:
        """No auth needed for arXiv; always returns True."""
        return True

    @staticmethod
    def _collapse_whitespace(text: Optional[str]) -> str:
        """Return ``text`` with runs of whitespace squeezed to single spaces."""
        return " ".join((text or "").split())

    def _fetch_metadata(self) -> dict:
        """
        Fetch paper metadata from the arXiv API, caching the result.

        Returns:
            A dict with the keys ``title``, ``summary``, ``authors`` (list of
            names), ``published`` and ``pdf_url``.

        Raises:
            ValueError: If the API response contains no entry for the paper.
            requests.HTTPError: If the API responds with an error status.
        """
        # Explicit None check: the cache is "unset" only before the first fetch.
        if self._metadata is not None:
            return self._metadata

        import xml.etree.ElementTree as ET

        import requests

        resp = requests.get(ARXIV_API, params={"id_list": self.arxiv_id}, timeout=15)
        resp.raise_for_status()

        ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
        # Parse the raw bytes so the XML parser honours the document's own
        # encoding instead of whatever text encoding requests guessed.
        root = ET.fromstring(resp.content)
        entry = root.find("atom:entry", ns)
        if entry is None:
            raise ValueError(f"Paper not found: {self.arxiv_id}")

        self._metadata = {
            # Feed text can span several lines; collapse internal whitespace so
            # the title is usable in a single-line name or heading.
            "title": self._collapse_whitespace(entry.findtext("atom:title", namespaces=ns)),
            "summary": (entry.findtext("atom:summary", namespaces=ns) or "").strip(),
            "authors": [
                self._collapse_whitespace(a.findtext("atom:name", namespaces=ns))
                for a in entry.findall("atom:author", ns)
            ],
            "published": entry.findtext("atom:published", namespaces=ns) or "",
            "pdf_url": f"https://arxiv.org/pdf/{self.arxiv_id}.pdf",
        }
        return self._metadata

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """
        Return SourceFiles for the paper metadata and PDF.

        The ``folder_id``, ``folder_path`` and ``patterns`` arguments are
        accepted but not used: the result is always the same two entries for
        the paper this source was created with.
        """
        meta = self._fetch_metadata()
        return [
            SourceFile(
                name=f"{meta['title']} (metadata)",
                id=f"meta:{self.arxiv_id}",
                mime_type="text/plain",
            ),
            SourceFile(
                name=f"{meta['title']}.pdf",
                id=f"pdf:{self.arxiv_id}",
                mime_type="application/pdf",
            ),
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """
        Download paper metadata as text or the PDF file.

        Args:
            file: One of the entries returned by ``list_videos``; its ``id``
                prefix (``meta:`` or ``pdf:``) selects what is written.
            destination: Target file path; parent directories are created.

        Returns:
            The destination path that was written.

        Raises:
            ValueError: If ``file.id`` has neither the ``meta:`` nor the
                ``pdf:`` prefix.
            requests.HTTPError: If a request returns an error status.
        """
        import requests

        is_meta = file.id.startswith("meta:")
        is_pdf = file.id.startswith("pdf:")
        # Reject unknown ids up front: otherwise nothing would be written, yet
        # a success message would be logged and a non-existent path returned.
        if not (is_meta or is_pdf):
            raise ValueError(f"Unsupported arXiv file id: {file.id!r}")

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        meta = self._fetch_metadata()

        if is_meta:
            authors = ", ".join(meta["authors"])
            text = (
                f"# {meta['title']}\n\n"
                f"Authors: {authors}\n"
                f"Published: {meta['published']}\n"
                f"arXiv: {self.arxiv_id}\n\n"
                f"## Abstract\n\n{meta['summary']}"
            )
            destination.write_text(text, encoding="utf-8")
        else:
            # Context manager releases the streamed connection even if the
            # status check or the file write fails part-way through.
            with requests.get(meta["pdf_url"], timeout=60, stream=True) as resp:
                resp.raise_for_status()
                with open(destination, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        f.write(chunk)

        logger.info("Downloaded arXiv %s to %s", self.arxiv_id, destination)
        return destination

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button