PlanOpticon
| 0981a08… | noreply | 1 | """PDF document processor with graceful fallback between extraction libraries.""" |
| 0981a08… | noreply | 2 | |
| 0981a08… | noreply | 3 | from pathlib import Path |
| 0981a08… | noreply | 4 | from typing import List |
| 0981a08… | noreply | 5 | |
| 0981a08… | noreply | 6 | from video_processor.processors.base import ( |
| 0981a08… | noreply | 7 | DocumentChunk, |
| 0981a08… | noreply | 8 | DocumentProcessor, |
| 0981a08… | noreply | 9 | register_processor, |
| 0981a08… | noreply | 10 | ) |
| 0981a08… | noreply | 11 | |
| 0981a08… | noreply | 12 | |
class PdfProcessor(DocumentProcessor):
    """Extract per-page text chunks from PDF files.

    Two optional backends are supported. ``pymupdf`` is tried first and
    ``pdfplumber`` is used only when ``pymupdf`` cannot be imported. Both
    backends are imported lazily inside their own method so that this module
    can be imported even when neither library is installed.
    """

    supported_extensions = [".pdf"]

    def can_process(self, path: Path) -> bool:
        """Return True when *path* has a supported (case-insensitive) suffix."""
        return path.suffix.lower() in self.supported_extensions

    def process(self, path: Path) -> List[DocumentChunk]:
        """Process a PDF, trying pymupdf first, then pdfplumber.

        Args:
            path: Location of the PDF file to read.

        Returns:
            One ``DocumentChunk`` per page that contains non-whitespace text.

        Raises:
            ImportError: If neither pymupdf nor pdfplumber can be imported.
        """
        try:
            return self._process_pymupdf(path)
        except ImportError:
            # pymupdf is not installed; fall through to the pdfplumber backend.
            pass

        try:
            return self._process_pdfplumber(path)
        except ImportError as err:
            # Chain the underlying failure so the traceback shows which
            # import actually failed.
            raise ImportError(
                "PDF processing requires pymupdf or pdfplumber. "
                "Install with: pip install 'planopticon[pdf]' OR pip install pdfplumber"
            ) from err

    def _process_pymupdf(self, path: Path) -> List[DocumentChunk]:
        """Extract page text with pymupdf.

        Pages whose text is empty or whitespace-only are skipped.
        ``chunk_index`` is the zero-based page index and ``page`` is the
        one-based page number.

        Raises:
            ImportError: If pymupdf is not installed.
        """
        import pymupdf

        chunks: List[DocumentChunk] = []
        # Use the document as a context manager so the file handle is
        # released even if text extraction raises part-way through.
        with pymupdf.open(str(path)) as doc:
            for page_num, page in enumerate(doc):
                text = page.get_text()
                if text.strip():
                    chunks.append(
                        DocumentChunk(
                            text=text,
                            source_file=str(path),
                            chunk_index=page_num,
                            page=page_num + 1,
                            metadata={"extraction_method": "pymupdf"},
                        )
                    )
        return chunks

    def _process_pdfplumber(self, path: Path) -> List[DocumentChunk]:
        """Extract page text with pdfplumber.

        Pages whose text is missing, empty or whitespace-only are skipped.
        ``chunk_index`` is the zero-based page index and ``page`` is the
        one-based page number.

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        import pdfplumber

        chunks: List[DocumentChunk] = []
        with pdfplumber.open(str(path)) as pdf:
            for page_num, page in enumerate(pdf.pages):
                # Fall back to "" so a falsy result (e.g. None) cannot
                # break the .strip() call below.
                text = page.extract_text() or ""
                if text.strip():
                    chunks.append(
                        DocumentChunk(
                            text=text,
                            source_file=str(path),
                            chunk_index=page_num,
                            page=page_num + 1,
                            metadata={"extraction_method": "pdfplumber"},
                        )
                    )
        return chunks
| 0981a08… | noreply | 74 | |
| 0981a08… | noreply | 75 | |
| 0981a08… | noreply | 76 | # Register processor |
| 0981a08… | noreply | 77 | register_processor(PdfProcessor.supported_extensions, PdfProcessor) |