PlanOpticon

Source Blame History 56 lines
0981a08… noreply 1 """Base classes and registry for document processors."""
0981a08… noreply 2
0981a08… noreply 3 from abc import ABC, abstractmethod
0981a08… noreply 4 from pathlib import Path
0981a08… noreply 5 from typing import Any, Dict, List, Optional
0981a08… noreply 6
0981a08… noreply 7 from pydantic import BaseModel, Field
0981a08… noreply 8
0981a08… noreply 9
class DocumentChunk(BaseModel):
    """A single piece of text produced by processing a document.

    Only ``text`` and ``source_file`` are required; every other field has a
    default.
    """

    # Text content of the chunk.
    text: str
    # Identifies the originating file, stored as a string.
    source_file: str
    # Integer index of the chunk; 0 unless set explicitly.
    # NOTE(review): presumably the chunk's position within its source file.
    chunk_index: int = 0
    # Optional page value; None when not supplied.
    page: Optional[int] = None
    # Optional section label; None when not supplied.
    section: Optional[str] = None
    # Free-form extra data. default_factory gives each instance its own dict.
    metadata: Dict[str, Any] = Field(default_factory=dict)
0981a08… noreply 19
0981a08… noreply 20
class DocumentProcessor(ABC):
    """Abstract interface implemented by every document processor.

    Both ``can_process`` and ``process`` are abstract, so a subclass cannot
    be instantiated until it overrides them.
    """

    # Class-level list with an empty default.
    # NOTE(review): presumably subclasses override this with the suffixes
    # they handle; nothing in the visible part of this module reads it.
    supported_extensions: List[str] = []

    @abstractmethod
    def can_process(self, path: Path) -> bool:
        """Report whether this processor is able to handle ``path``."""

    @abstractmethod
    def process(self, path: Path) -> List[DocumentChunk]:
        """Turn the document at ``path`` into a list of chunks."""
0981a08… noreply 35
0981a08… noreply 36
# Module-level registry mapping a lower-cased file extension to the processor
# class registered for it. Written by register_processor() and read by
# get_processor() and list_supported_extensions().
_processors: Dict[str, type] = dict()
0981a08… noreply 39
0981a08… noreply 40
def register_processor(extensions: List[str], processor_class: type) -> None:
    """Register a processor class for the given file extensions.

    Each extension is stored lower-cased. A non-empty extension given
    without its leading dot (``"pdf"``) is stored as ``".pdf"``, because
    ``get_processor`` looks entries up by ``Path.suffix``, which always
    includes the dot. An empty string is stored unchanged, since that is the
    suffix of a file with no extension. Registering an extension that is
    already present replaces the earlier entry.

    Args:
        extensions: File extensions to associate with ``processor_class``.
        processor_class: Class to register; ``get_processor`` instantiates
            it with no arguments.
    """
    for ext in extensions:
        key = ext.lower()
        # Without the dot the key could never equal a Path.suffix value.
        if key and not key.startswith("."):
            key = "." + key
        _processors[key] = processor_class
0981a08… noreply 45
0981a08… noreply 46
def get_processor(path: Path) -> Optional[DocumentProcessor]:
    """Instantiate the processor registered for the suffix of ``path``.

    The lookup key is the lower-cased ``path.suffix``. Returns None when no
    processor class is registered under that key; otherwise returns a new
    instance created by calling the class with no arguments.
    """
    processor_class = _processors.get(path.suffix.lower())
    if not processor_class:
        return None
    return processor_class()
0981a08… noreply 52
0981a08… noreply 53
def list_supported_extensions() -> List[str]:
    """Return every registered file extension as a new, sorted list."""
    extensions = list(_processors)
    extensions.sort()
    return extensions

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button