|
0981a08…
|
noreply
|
1 |
"""arXiv source connector for fetching paper metadata and PDFs.""" |
|
0981a08…
|
noreply
|
2 |
|
|
0981a08…
|
noreply
|
3 |
import logging |
|
0981a08…
|
noreply
|
4 |
import re |
|
0981a08…
|
noreply
|
5 |
from pathlib import Path |
|
0981a08…
|
noreply
|
6 |
from typing import List, Optional |
|
0981a08…
|
noreply
|
7 |
|
|
0981a08…
|
noreply
|
8 |
from video_processor.sources.base import BaseSource, SourceFile |
|
0981a08…
|
noreply
|
9 |
|
|
0981a08…
|
noreply
|
10 |
logger = logging.getLogger(__name__) |
|
0981a08…
|
noreply
|
11 |
|
|
0981a08…
|
noreply
|
12 |
_ARXIV_ID_PATTERN = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?") |
|
0981a08…
|
noreply
|
13 |
ARXIV_API = "http://export.arxiv.org/api/query" |
|
0981a08…
|
noreply
|
14 |
|
|
0981a08…
|
noreply
|
15 |
|
|
0981a08…
|
noreply
|
16 |
def _extract_arxiv_id(url_or_id: str) -> str: |
|
0981a08…
|
noreply
|
17 |
"""Extract arXiv paper ID from a URL or bare ID string.""" |
|
0981a08…
|
noreply
|
18 |
match = _ARXIV_ID_PATTERN.search(url_or_id) |
|
0981a08…
|
noreply
|
19 |
if not match: |
|
0981a08…
|
noreply
|
20 |
raise ValueError(f"Could not extract arXiv ID from: {url_or_id}") |
|
0981a08…
|
noreply
|
21 |
return match.group(0) |
|
0981a08…
|
noreply
|
22 |
|
|
0981a08…
|
noreply
|
23 |
|
|
0981a08…
|
noreply
|
24 |
class ArxivSource(BaseSource):
    """
    Fetch arXiv paper metadata and PDF.

    Uses the arXiv API (Atom feed) for metadata and direct PDF download.
    Requires: pip install requests
    """

    def __init__(self, url_or_id: str):
        # Accepts either a full arXiv URL or a bare paper ID.
        self.arxiv_id = _extract_arxiv_id(url_or_id)
        # Cache of the parsed API response; None until the first fetch.
        self._metadata: Optional[dict] = None

    def authenticate(self) -> bool:
        """No auth needed for arXiv."""
        return True

    def _fetch_metadata(self) -> dict:
        """Fetch and cache paper metadata from the arXiv API.

        Returns:
            dict with keys: title, summary, authors, published, pdf_url.

        Raises:
            ValueError: if the API returns no entry for this ID.
            requests.HTTPError: on a non-2xx API response.
        """
        # Compare against None, not truthiness, so a cached value is
        # never mistaken for "not fetched yet".
        if self._metadata is not None:
            return self._metadata

        # Imported lazily so the module imports even without requests installed.
        import xml.etree.ElementTree as ET

        import requests

        resp = requests.get(ARXIV_API, params={"id_list": self.arxiv_id}, timeout=15)
        resp.raise_for_status()

        ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
        root = ET.fromstring(resp.text)
        entry = root.find("atom:entry", ns)
        if entry is None:
            raise ValueError(f"Paper not found: {self.arxiv_id}")

        # Atom wraps long titles with embedded "\n  "; collapse all runs of
        # whitespace so the title is safe to use as a one-line name/filename.
        title = " ".join((entry.findtext("atom:title", namespaces=ns) or "").split())
        self._metadata = {
            "title": title,
            "summary": (entry.findtext("atom:summary", namespaces=ns) or "").strip(),
            "authors": [
                a.findtext("atom:name", namespaces=ns) or ""
                for a in entry.findall("atom:author", ns)
            ],
            "published": entry.findtext("atom:published", namespaces=ns) or "",
            "pdf_url": f"https://arxiv.org/pdf/{self.arxiv_id}.pdf",
        }
        return self._metadata

    def list_videos(
        self,
        folder_id: Optional[str] = None,
        folder_path: Optional[str] = None,
        patterns: Optional[List[str]] = None,
    ) -> List[SourceFile]:
        """Return SourceFiles for the paper metadata and PDF.

        The folder/pattern arguments are part of the BaseSource interface
        and are ignored here: an arXiv source is always exactly one paper.
        """
        meta = self._fetch_metadata()
        return [
            SourceFile(
                name=f"{meta['title']} (metadata)",
                id=f"meta:{self.arxiv_id}",
                mime_type="text/plain",
            ),
            SourceFile(
                name=f"{meta['title']}.pdf",
                id=f"pdf:{self.arxiv_id}",
                mime_type="application/pdf",
            ),
        ]

    def download(self, file: SourceFile, destination: Path) -> Path:
        """Download paper metadata as text or the PDF file.

        Args:
            file: one of the SourceFiles produced by list_videos().
            destination: target path; parent directories are created as needed.

        Returns:
            The destination path the content was written to.

        Raises:
            ValueError: if file.id is not a "meta:" or "pdf:" ID produced
                by this source.
        """
        destination = Path(destination)
        destination.parent.mkdir(parents=True, exist_ok=True)
        meta = self._fetch_metadata()

        if file.id.startswith("meta:"):
            self._write_metadata_text(meta, destination)
        elif file.id.startswith("pdf:"):
            self._download_pdf(meta["pdf_url"], destination)
        else:
            # Previously an unrecognized ID fell through, wrote nothing, and
            # still reported success; fail loudly instead.
            raise ValueError(f"Unknown file id for arXiv source: {file.id}")

        logger.info("Downloaded arXiv %s to %s", self.arxiv_id, destination)
        return destination

    def _write_metadata_text(self, meta: dict, destination: Path) -> None:
        """Write title/authors/abstract to *destination* as Markdown text."""
        authors = ", ".join(meta["authors"])
        text = (
            f"# {meta['title']}\n\n"
            f"Authors: {authors}\n"
            f"Published: {meta['published']}\n"
            f"arXiv: {self.arxiv_id}\n\n"
            f"## Abstract\n\n{meta['summary']}"
        )
        destination.write_text(text, encoding="utf-8")

    def _download_pdf(self, url: str, destination: Path) -> None:
        """Stream the PDF from *url* to disk in chunks to bound memory use."""
        import requests

        resp = requests.get(url, timeout=60, stream=True)
        resp.raise_for_status()
        with open(destination, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)