PlanOpticon

planopticon / video_processor / processors / pdf_processor.py
Source Blame History 77 lines
0981a08… noreply 1 """PDF document processor with graceful fallback between extraction libraries."""
0981a08… noreply 2
0981a08… noreply 3 from pathlib import Path
0981a08… noreply 4 from typing import List
0981a08… noreply 5
0981a08… noreply 6 from video_processor.processors.base import (
0981a08… noreply 7 DocumentChunk,
0981a08… noreply 8 DocumentProcessor,
0981a08… noreply 9 register_processor,
0981a08… noreply 10 )
0981a08… noreply 11
0981a08… noreply 12
class PdfProcessor(DocumentProcessor):
    """Process PDF files using pymupdf, falling back to pdfplumber.

    Every page that yields non-whitespace text becomes one ``DocumentChunk``;
    pages without extractable text are skipped.
    """

    supported_extensions = [".pdf"]

    def can_process(self, path: Path) -> bool:
        """Return True when *path* has a ``.pdf`` suffix (case-insensitive)."""
        return path.suffix.lower() in self.supported_extensions

    def process(self, path: Path) -> List[DocumentChunk]:
        """Extract one chunk per text-bearing page of the PDF at *path*.

        pymupdf is tried first; pdfplumber is used only when the pymupdf
        attempt raises ``ImportError``.

        Raises:
            ImportError: If neither extraction attempt can import its library.
        """
        try:
            return self._process_pymupdf(path)
        except ImportError:
            # pymupdf is not importable; fall through to pdfplumber.
            pass

        try:
            return self._process_pdfplumber(path)
        except ImportError as exc:
            # Chain explicitly so the underlying import failure stays visible.
            raise ImportError(
                "PDF processing requires pymupdf or pdfplumber. "
                "Install with: pip install 'planopticon[pdf]' OR pip install pdfplumber"
            ) from exc

    def _chunks_from_page_texts(
        self, path: Path, page_texts: List[str], method: str
    ) -> List[DocumentChunk]:
        """Build chunks from per-page text, skipping whitespace-only pages.

        ``chunk_index`` is the zero-based position of the page in
        *page_texts* and ``page`` is the one-based page number, so skipped
        pages leave gaps in both sequences. *method* is recorded under the
        ``"extraction_method"`` metadata key.
        """
        return [
            DocumentChunk(
                text=text,
                source_file=str(path),
                chunk_index=page_num,
                page=page_num + 1,
                metadata={"extraction_method": method},
            )
            for page_num, text in enumerate(page_texts)
            if text.strip()
        ]

    def _process_pymupdf(self, path: Path) -> List[DocumentChunk]:
        """Extract page text with pymupdf.

        Raises:
            ImportError: If pymupdf is not installed.
        """
        import pymupdf

        # The context manager closes the document even when extraction fails
        # part-way through, instead of leaking the open handle.
        with pymupdf.open(str(path)) as doc:
            page_texts = [page.get_text() for page in doc]
        return self._chunks_from_page_texts(path, page_texts, "pymupdf")

    def _process_pdfplumber(self, path: Path) -> List[DocumentChunk]:
        """Extract page text with pdfplumber.

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        import pdfplumber

        with pdfplumber.open(str(path)) as pdf:
            # Normalize a falsy extract_text() result (e.g. None) to "".
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        return self._chunks_from_page_texts(path, page_texts, "pdfplumber")
0981a08… noreply 74
0981a08… noreply 75
# Register PdfProcessor for the extensions it declares (".pdf").
# This is a module-level call, so it runs whenever this module is imported.
# NOTE(review): presumably this adds the class to a registry kept in
# video_processor.processors.base -- confirm against register_processor.
register_processor(PdfProcessor.supported_extensions, PdfProcessor)

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button