|
1
|
"""arXiv source connector for fetching paper metadata and PDFs."""
|
2
|
|
|
3
|
import logging |
|
4
|
import re |
|
5
|
from pathlib import Path |
|
6
|
from typing import List, Optional |
|
7
|
|
|
8
|
from video_processor.sources.base import BaseSource, SourceFile |
|
9
|
|
|
10
|
# Module-level logger named after this module, per stdlib logging convention.
logger = logging.getLogger(__name__)
|
11
|
|
|
12
|
_ARXIV_ID_PATTERN = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?") |
|
13
|
ARXIV_API = "http://export.arxiv.org/api/query" |
|
14
|
|
|
15
|
|
|
16
|
def _extract_arxiv_id(url_or_id: str) -> str: |
|
17
|
"""Extract arXiv paper ID from a URL or bare ID string.""" |
|
18
|
match = _ARXIV_ID_PATTERN.search(url_or_id) |
|
19
|
if not match: |
|
20
|
raise ValueError(f"Could not extract arXiv ID from: {url_or_id}") |
|
21
|
return match.group(0) |
|
22
|
|
|
23
|
|
|
24
|
class ArxivSource(BaseSource):
    """
    Fetch arXiv paper metadata and PDF.

    Uses the arXiv API (Atom feed) for metadata and direct PDF download.
    Requires: pip install requests
    """

    def __init__(self, url_or_id: str):
        # Accepts a bare arXiv ID or any arXiv URL containing one.
        self.arxiv_id = _extract_arxiv_id(url_or_id)
        # Lazily-populated metadata cache; filled by _fetch_metadata().
        self._metadata: Optional[dict] = None

    def authenticate(self) -> bool:
        """No auth needed for arXiv."""
        return True

    def _fetch_metadata(self) -> dict:
        """Fetch paper metadata from the arXiv API.

        The result is cached on the instance, so the network round-trip
        happens at most once per ArxivSource.

        Returns:
            dict with keys "title", "summary", "authors", "published",
            and "pdf_url".

        Raises:
            ValueError: if the API response contains no entry for the ID.
            requests.HTTPError: if the API request itself fails.
        """
        # Compare against None (not truthiness) so a cached result is
        # never mistaken for "not fetched yet".
        if self._metadata is not None:
            return self._metadata

        # Imported locally so the module stays importable when requests
        # is absent (matches the note in the class docstring).
        import xml.etree.ElementTree as ET

        import requests

        resp = requests.get(ARXIV_API, params={"id_list": self.arxiv_id}, timeout=15)
        resp.raise_for_status()

        # Namespace map required by ElementTree's find/findall/findtext.
        ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
        root = ET.fromstring(resp.text)
        entry = root.find("atom:entry", ns)
        if entry is None:
            raise ValueError(f"Paper not found: {self.arxiv_id}")

        self._metadata = {
            "title": (entry.findtext("atom:title", namespaces=ns) or "").strip(),
            "summary": (entry.findtext("atom:summary", namespaces=ns) or "").strip(),
            "authors": [
                a.findtext("atom:name", namespaces=ns) or ""
                for a in entry.findall("atom:author", ns)
            ],
            "published": entry.findtext("atom:published", namespaces=ns) or "",
            "pdf_url": f"https://arxiv.org/pdf/{self.arxiv_id}.pdf",
        }
        return self._metadata

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return SourceFiles for the paper metadata and PDF.

        The folder/pattern arguments belong to the BaseSource interface
        and are ignored here: an arXiv source always exposes exactly two
        files — the metadata text and the PDF.
        """
        meta = self._fetch_metadata()
        return [
            SourceFile(
                name=f"{meta['title']} (metadata)",
                id=f"meta:{self.arxiv_id}",
                mime_type="text/plain",
            ),
            SourceFile(
                name=f"{meta['title']}.pdf",
                id=f"pdf:{self.arxiv_id}",
                mime_type="application/pdf",
            ),
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download paper metadata as text or the PDF file.

        Args:
            file: a SourceFile returned by list_videos(); its id prefix
                ("meta:" / "pdf:") selects what gets written.
            destination: target path; parent directories are created.

        Returns:
            The destination path.

        Raises:
            ValueError: if file.id has an unrecognized prefix.
        """
        import requests

        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        meta = self._fetch_metadata()

        if file.id.startswith("meta:"):
            authors = ", ".join(meta["authors"])
            text = (
                f"# {meta['title']}\n\n"
                f"Authors: {authors}\n"
                f"Published: {meta['published']}\n"
                f"arXiv: {self.arxiv_id}\n\n"
                f"## Abstract\n\n{meta['summary']}"
            )
            destination.write_text(text, encoding="utf-8")
        elif file.id.startswith("pdf:"):
            # Stream to disk in chunks; the response context manager
            # releases the HTTP connection even if a write fails.
            with requests.get(meta["pdf_url"], timeout=60, stream=True) as resp:
                resp.raise_for_status()
                with open(destination, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        f.write(chunk)
        else:
            # Fix: previously an unknown id fell through silently and the
            # caller got back a path to a file that was never written.
            raise ValueError(f"Unknown file id: {file.id}")

        # Lazy %-style args avoid formatting when INFO is disabled.
        logger.info("Downloaded arXiv %s to %s", self.arxiv_id, destination)
        return destination
|
118
|
|