PlanOpticon
| 0981a08… | noreply | 1 | """Base classes and registry for document processors.""" |
| 0981a08… | noreply | 2 | |
| 0981a08… | noreply | 3 | from abc import ABC, abstractmethod |
| 0981a08… | noreply | 4 | from pathlib import Path |
| 0981a08… | noreply | 5 | from typing import Any, Dict, List, Optional |
| 0981a08… | noreply | 6 | |
| 0981a08… | noreply | 7 | from pydantic import BaseModel, Field |
| 0981a08… | noreply | 8 | |
| 0981a08… | noreply | 9 | |
class DocumentChunk(BaseModel):
    """One piece of text extracted from a processed document.

    Only ``text`` and ``source_file`` are required; every other field
    carries a default.
    """

    # The text content of the chunk.
    text: str
    # Identifies the document the chunk was taken from.
    source_file: str
    # Defaults to 0; presumably the chunk's position within its source
    # document -- confirm against the processors that populate it.
    chunk_index: int = 0
    # Optional page number; None when not supplied.
    page: Optional[int] = None
    # Optional section label; None when not supplied.
    section: Optional[str] = None
    # Free-form extra data; default_factory gives each instance its own dict.
    metadata: Dict[str, Any] = Field(default_factory=dict)
| 0981a08… | noreply | 19 | |
| 0981a08… | noreply | 20 | |
class DocumentProcessor(ABC):
    """Abstract interface implemented by every document processor.

    Concrete subclasses must provide both ``process`` and ``can_process``;
    the class cannot be instantiated until they do.
    """

    # Extensions associated with the processor; the base class declares none.
    supported_extensions: List[str] = []

    @abstractmethod
    def process(self, path: Path) -> List[DocumentChunk]:
        """Turn the document at *path* into a list of chunks."""

    @abstractmethod
    def can_process(self, path: Path) -> bool:
        """Report whether this processor is able to handle *path*."""
| 0981a08… | noreply | 35 | |
| 0981a08… | noreply | 36 | |
# Registry mapping a lowercase file extension, in the form produced by
# ``Path.suffix`` (leading dot included, e.g. ".pdf"), to the processor
# class registered for it.
_processors: Dict[str, type] = {}


def register_processor(extensions: List[str], processor_class: type) -> None:
    """Register a processor class for the given file extensions.

    Each extension is lowercased and, when non-empty, given a leading dot
    if it lacks one, so that ``"PDF"``, ``"pdf"`` and ``".pdf"`` all land
    on the key ``".pdf"``. Registering an extension that is already present
    replaces the earlier class.

    Args:
        extensions: Extensions to associate with ``processor_class``. A
            single bare string is accepted and treated as one extension.
        processor_class: Class stored in the registry for each extension.
    """
    # A bare string would otherwise be iterated character by character and
    # register one junk key per character.
    if isinstance(extensions, str):
        extensions = [extensions]

    for ext in extensions:
        key = ext.lower()
        # get_processor looks entries up by ``Path.suffix``, which always
        # carries the leading dot; a key stored without it could never match.
        # The empty string is left alone: it is the suffix of a file that
        # has no extension.
        if key and not key.startswith("."):
            key = "." + key
        _processors[key] = processor_class
| 0981a08… | noreply | 45 | |
| 0981a08… | noreply | 46 | |
def get_processor(path: Path) -> Optional[DocumentProcessor]:
    """Return a fresh processor instance for *path*, or None if unsupported.

    The lookup key is the lowercased suffix of *path*; the registered class
    is instantiated with no arguments.
    """
    processor_cls = _processors.get(path.suffix.lower())
    if not processor_cls:
        return None
    return processor_cls()
| 0981a08… | noreply | 52 | |
| 0981a08… | noreply | 53 | |
def list_supported_extensions() -> List[str]:
    """Return every registered file extension, in sorted order."""
    # Iterating a dict yields its keys, so no explicit .keys() is needed.
    return sorted(_processors)