|
1
|
"""arXiv source connector for fetching paper metadata and PDFs."""
|
2
|
|
|
3
|
import logging |
|
4
|
import re |
|
5
|
from pathlib import Path |
|
6
|
from typing import List, Optional |
|
7
|
|
|
8
|
from video_processor.sources.base import BaseSource, SourceFile |
|
9
|
|
|
10
|
# Module-level logger named after this module, per stdlib logging convention.
logger = logging.getLogger(__name__)
|
11
|
|
|
12
|
_ARXIV_ID_PATTERN = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?") |
|
13
|
ARXIV_API = "http://export.arxiv.org/api/query" |
|
14
|
|
|
15
|
|
|
16
|
def _extract_arxiv_id(url_or_id: str) -> str: |
|
17
|
"""Extract arXiv paper ID from a URL or bare ID string.""" |
|
18
|
match = _ARXIV_ID_PATTERN.search(url_or_id) |
|
19
|
if not match: |
|
20
|
raise ValueError(f"Could not extract arXiv ID from: {url_or_id}") |
|
21
|
return match.group(0) |
|
22
|
|
|
23
|
|
|
24
|
class ArxivSource(BaseSource):
    """
    Fetch arXiv paper metadata and PDF.

    Uses the arXiv API (Atom feed) for metadata and direct PDF download.
    Requires: pip install requests
    """

    def __init__(self, url_or_id: str):
        # Accepts a bare arXiv ID or any arXiv URL containing one.
        self.arxiv_id = _extract_arxiv_id(url_or_id)
        # Lazily-populated metadata cache; filled by _fetch_metadata().
        self._metadata: Optional[dict] = None

    def authenticate(self) -> bool:
        """No auth needed for arXiv."""
        return True

    def _fetch_metadata(self) -> dict:
        """Fetch paper metadata from the arXiv API.

        The result is cached on the instance, so the network round-trip
        happens at most once per ArxivSource.

        Returns:
            dict with keys "title", "summary", "authors", "published",
            and "pdf_url".

        Raises:
            ValueError: if the API response contains no entry for the ID.
            requests.HTTPError: if the API request itself fails.
        """
        # Compare against None (not truthiness) so a cached result is
        # never mistaken for "not fetched yet".
        if self._metadata is not None:
            return self._metadata

        # Imported locally so the module stays importable when requests
        # is absent (matches the note in the class docstring).
        import xml.etree.ElementTree as ET

        import requests

        resp = requests.get(ARXIV_API, params={"id_list": self.arxiv_id}, timeout=15)
        resp.raise_for_status()

        # Namespace map required by ElementTree's find/findall/findtext.
        ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
        root = ET.fromstring(resp.text)
        entry = root.find("atom:entry", ns)
        if entry is None:
            raise ValueError(f"Paper not found: {self.arxiv_id}")

        self._metadata = {
            "title": (entry.findtext("atom:title", namespaces=ns) or "").strip(),
            "summary": (entry.findtext("atom:summary", namespaces=ns) or "").strip(),
            "authors": [
                a.findtext("atom:name", namespaces=ns) or ""
                for a in entry.findall("atom:author", ns)
            ],
            "published": entry.findtext("atom:published", namespaces=ns) or "",
            "pdf_url": f"https://arxiv.org/pdf/{self.arxiv_id}.pdf",
        }
        return self._metadata

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return SourceFiles for the paper metadata and PDF.

        The folder/pattern arguments belong to the BaseSource interface
        and are ignored here: an arXiv source always exposes exactly two
        files — the metadata text and the PDF.
        """
        meta = self._fetch_metadata()
        return [
            SourceFile(
                name=f"{meta['title']} (metadata)",
                id=f"meta:{self.arxiv_id}",
                mime_type="text/plain",
            ),
            SourceFile(
                name=f"{meta['title']}.pdf",
                id=f"pdf:{self.arxiv_id}",
                mime_type="application/pdf",
            ),
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download paper metadata as text or the PDF file.

        Args:
            file: a SourceFile returned by list_videos(); its id prefix
                ("meta:" / "pdf:") selects what gets written.
            destination: target path; parent directories are created.

        Returns:
            The destination path.

        Raises:
            ValueError: if file.id has an unrecognized prefix.
        """
        import requests

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        meta = self._fetch_metadata()

        if file.id.startswith("meta:"):
            authors = ", ".join(meta["authors"])
            text = (
                f"# {meta['title']}\n\n"
                f"Authors: {authors}\n"
                f"Published: {meta['published']}\n"
                f"arXiv: {self.arxiv_id}\n\n"
                f"## Abstract\n\n{meta['summary']}"
            )
            destination.write_text(text, encoding="utf-8")
        elif file.id.startswith("pdf:"):
            # Stream to disk in chunks; the response context manager
            # releases the HTTP connection even if a write fails.
            with requests.get(meta["pdf_url"], timeout=60, stream=True) as resp:
                resp.raise_for_status()
                with open(destination, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        f.write(chunk)
        else:
            # Fix: previously an unknown id fell through silently and the
            # caller got back a path to a file that was never written.
            raise ValueError(f"Unknown file id: {file.id}")

        # Lazy %-style args avoid formatting when INFO is disabled.
        logger.info("Downloaded arXiv %s to %s", self.arxiv_id, destination)
        return destination
|
118
|
|