PlanOpticon

planopticon / video_processor / sources / arxiv_source.py
Blame History Raw 118 lines
1
"""arXiv source connector for fetching paper metadata and PDFs."""
2
3
import logging
4
import re
5
from pathlib import Path
6
from typing import List, Optional
7
8
from video_processor.sources.base import BaseSource, SourceFile
9
10
logger = logging.getLogger(__name__)
11
12
_ARXIV_ID_PATTERN = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?")
13
ARXIV_API = "http://export.arxiv.org/api/query"
14
15
16
def _extract_arxiv_id(url_or_id: str) -> str:
17
"""Extract arXiv paper ID from a URL or bare ID string."""
18
match = _ARXIV_ID_PATTERN.search(url_or_id)
19
if not match:
20
raise ValueError(f"Could not extract arXiv ID from: {url_or_id}")
21
return match.group(0)
22
23
24
class ArxivSource(BaseSource):
    """
    Fetch arXiv paper metadata and PDF.

    Uses the arXiv API (Atom feed) for metadata and direct PDF download.
    Requires: pip install requests
    """

    def __init__(self, url_or_id: str):
        """Accept an arXiv URL or bare ID (e.g. "2301.12345" or an abs/pdf URL).

        Raises:
            ValueError: If no arXiv ID can be extracted from the input.
        """
        self.arxiv_id = _extract_arxiv_id(url_or_id)
        # Cached API result; stays None until _fetch_metadata() succeeds.
        self._metadata: Optional[dict] = None

    def authenticate(self) -> bool:
        """No auth needed for arXiv."""
        return True

    def _fetch_metadata(self) -> dict:
        """Fetch (and cache) paper metadata from the arXiv API.

        Returns:
            dict with keys: "title", "summary", "authors", "published", "pdf_url".

        Raises:
            ValueError: If the API returns no entry for this ID.
            requests.HTTPError: On a non-2xx API response.
        """
        # Identity check, not truthiness, so a cached result is never refetched.
        if self._metadata is not None:
            return self._metadata

        import xml.etree.ElementTree as ET

        import requests

        resp = requests.get(ARXIV_API, params={"id_list": self.arxiv_id}, timeout=15)
        resp.raise_for_status()

        ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
        root = ET.fromstring(resp.text)
        # The API reports "not found" as a feed with no <entry>, not an HTTP error.
        entry = root.find("atom:entry", ns)
        if entry is None:
            raise ValueError(f"Paper not found: {self.arxiv_id}")

        self._metadata = {
            "title": (entry.findtext("atom:title", namespaces=ns) or "").strip(),
            "summary": (entry.findtext("atom:summary", namespaces=ns) or "").strip(),
            "authors": [
                a.findtext("atom:name", namespaces=ns) or ""
                for a in entry.findall("atom:author", ns)
            ],
            "published": entry.findtext("atom:published", namespaces=ns) or "",
            "pdf_url": f"https://arxiv.org/pdf/{self.arxiv_id}.pdf",
        }
        return self._metadata

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return SourceFiles for the paper metadata and PDF.

        The folder/pattern arguments are accepted for interface compatibility
        but have no meaning for a single-paper source.
        """
        meta = self._fetch_metadata()
        return [
            SourceFile(
                name=f"{meta['title']} (metadata)",
                id=f"meta:{self.arxiv_id}",
                mime_type="text/plain",
            ),
            SourceFile(
                name=f"{meta['title']}.pdf",
                id=f"pdf:{self.arxiv_id}",
                mime_type="application/pdf",
            ),
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download paper metadata as text or the PDF file.

        Args:
            file: A SourceFile from list_videos() ("meta:..." or "pdf:..." id).
            destination: Target path; parent directories are created as needed.

        Returns:
            The destination path.

        Raises:
            ValueError: If file.id has an unrecognized prefix.
            requests.HTTPError: On a failed PDF download.
        """
        import requests

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        meta = self._fetch_metadata()

        if file.id.startswith("meta:"):
            authors = ", ".join(meta["authors"])
            text = (
                f"# {meta['title']}\n\n"
                f"Authors: {authors}\n"
                f"Published: {meta['published']}\n"
                f"arXiv: {self.arxiv_id}\n\n"
                f"## Abstract\n\n{meta['summary']}"
            )
            destination.write_text(text, encoding="utf-8")
        elif file.id.startswith("pdf:"):
            # Context manager ensures the streamed connection is released
            # even if a chunk write fails mid-download.
            with requests.get(meta["pdf_url"], timeout=60, stream=True) as resp:
                resp.raise_for_status()
                with open(destination, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        f.write(chunk)
        else:
            # Previously an unknown id silently returned an unwritten path.
            raise ValueError(f"Unknown file id: {file.id}")

        logger.info("Downloaded arXiv %s to %s", self.arxiv_id, destination)
        return destination
118

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button