PlanOpticon

planopticon / video_processor / processors / pdf_processor.py
Blame History Raw 78 lines
1
"""PDF document processor with graceful fallback between extraction libraries."""
2
3
from pathlib import Path
4
from typing import List
5
6
from video_processor.processors.base import (
7
DocumentChunk,
8
DocumentProcessor,
9
register_processor,
10
)
11
12
13
class PdfProcessor(DocumentProcessor):
    """Process PDF files using pymupdf or pdfplumber.

    Text is extracted page by page. Each page whose text is not purely
    whitespace becomes one ``DocumentChunk``; blank pages are skipped.
    """

    supported_extensions = [".pdf"]

    def can_process(self, path: Path) -> bool:
        """Return True when *path* has a ``.pdf`` suffix (case-insensitive)."""
        return path.suffix.lower() in self.supported_extensions

    def process(self, path: Path) -> List[DocumentChunk]:
        """Process a PDF, trying pymupdf first, then pdfplumber.

        Only an ``ImportError`` triggers the fallback to pdfplumber; any
        other exception raised while extracting with pymupdf propagates
        to the caller.

        Args:
            path: Location of the PDF file to read.

        Returns:
            One chunk per page that contains non-whitespace text.

        Raises:
            ImportError: If neither pymupdf nor pdfplumber can be imported.
        """
        try:
            return self._process_pymupdf(path)
        except ImportError:
            # pymupdf is unavailable; fall through to pdfplumber.
            pass

        try:
            return self._process_pdfplumber(path)
        except ImportError as exc:
            # Chain explicitly so the underlying import failure stays visible.
            raise ImportError(
                "PDF processing requires pymupdf or pdfplumber. "
                "Install with: pip install 'planopticon[pdf]' OR pip install pdfplumber"
            ) from exc

    def _process_pymupdf(self, path: Path) -> List[DocumentChunk]:
        """Extract per-page text with pymupdf.

        ``chunk_index`` is the zero-based page index and ``page`` is the
        one-based page number, so indices may have gaps where blank pages
        were skipped.

        Raises:
            ImportError: If pymupdf is not installed.
        """
        # Imported lazily so the module loads without the optional dependency.
        import pymupdf

        doc = pymupdf.open(str(path))
        chunks: List[DocumentChunk] = []
        try:
            for page_num, page in enumerate(doc):
                text = page.get_text()
                if text.strip():
                    chunks.append(
                        DocumentChunk(
                            text=text,
                            source_file=str(path),
                            chunk_index=page_num,
                            page=page_num + 1,
                            metadata={"extraction_method": "pymupdf"},
                        )
                    )
        finally:
            # Always release the document, even if extraction fails part-way.
            doc.close()
        return chunks

    def _process_pdfplumber(self, path: Path) -> List[DocumentChunk]:
        """Extract per-page text with pdfplumber.

        ``chunk_index`` is the zero-based page index and ``page`` is the
        one-based page number, so indices may have gaps where blank pages
        were skipped.

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        # Imported lazily so the module loads without the optional dependency.
        import pdfplumber

        chunks: List[DocumentChunk] = []
        with pdfplumber.open(str(path)) as pdf:
            for page_num, page in enumerate(pdf.pages):
                # A falsy extract_text() result is treated as an empty page.
                text = page.extract_text() or ""
                if text.strip():
                    chunks.append(
                        DocumentChunk(
                            text=text,
                            source_file=str(path),
                            chunk_index=page_num,
                            page=page_num + 1,
                            metadata={"extraction_method": "pdfplumber"},
                        )
                    )
        return chunks
74
75
76
# Register processor
77
# Runs when this module is imported: passes PdfProcessor's extension list and
# the class itself to register_processor (from video_processor.processors.base).
# NOTE(review): presumably this makes PdfProcessor discoverable by file
# extension — confirm against the registry implementation.
register_processor(PdfProcessor.supported_extensions, PdfProcessor)
78

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button