PlanOpticon

1
"""Base classes and registry for document processors."""
2
3
from abc import ABC, abstractmethod
4
from pathlib import Path
5
from typing import Any, Dict, List, Optional
6
7
from pydantic import BaseModel, Field
8
9
10
class DocumentChunk(BaseModel):
    """One piece of text taken from a processed document.

    Only ``text`` and ``source_file`` are required; every other field has a
    default.
    """

    # Required fields.
    text: str
    source_file: str

    # Index of this chunk; 0 when not supplied.
    chunk_index: int = 0

    # Optional location hints; None when not supplied.
    page: Optional[int] = None
    section: Optional[str] = None

    # Free-form extra data. default_factory builds a fresh dict per instance.
    metadata: Dict[str, Any] = Field(default_factory=dict)
19
20
21
class DocumentProcessor(ABC):
    """Abstract interface implemented by every document processor.

    Concrete subclasses must define both ``process`` and ``can_process``;
    the class cannot be instantiated until they do.
    """

    # Class-level default is an empty list.
    # NOTE(review): presumably subclasses override this with the suffixes they
    # handle rather than mutating the shared list -- confirm against the
    # concrete processors.
    supported_extensions: List[str] = []

    @abstractmethod
    def process(self, path: Path) -> List[DocumentChunk]:
        """Turn the document at ``path`` into a list of chunks."""

    @abstractmethod
    def can_process(self, path: Path) -> bool:
        """Report whether this processor is able to handle ``path``."""
35
36
37
# Module-level registry: lower-cased file extension -> processor class.
# Written by register_processor(); read by get_processor() and
# list_supported_extensions().
_processors: Dict[str, type] = {}
39
40
41
def register_processor(extensions: List[str], processor_class: type) -> None:
    """Register a processor class for the given file extensions.

    Each extension is lower-cased and stored with a leading dot so that it
    matches what ``Path.suffix`` yields at lookup time (``"PDF"`` and
    ``".pdf"`` both register ``".pdf"``). The empty string is kept as-is and
    stands for paths without a suffix. Registering an extension that is
    already present replaces the earlier class.

    Args:
        extensions: Extensions to register. A single bare string is treated
            as one extension rather than iterated character by character.
        processor_class: Class to associate with each extension.
    """
    if isinstance(extensions, str):
        # Guard against register_processor(".pdf", cls), which would otherwise
        # register ".", "p", "d" and "f" as separate keys.
        extensions = [extensions]

    for ext in extensions:
        key = ext.lower()
        if key and not key.startswith("."):
            # Path.suffix always carries the dot; an entry stored without it
            # could never be found by a suffix-based lookup.
            key = "." + key
        _processors[key] = processor_class
45
46
47
def get_processor(path: Path) -> Optional[DocumentProcessor]:
    """Look up and instantiate the processor registered for ``path``.

    The lookup key is the path's final suffix, lower-cased. Returns ``None``
    when nothing is registered under that key; otherwise returns a new
    instance made by calling the registered class with no arguments.
    """
    processor_cls = _processors.get(path.suffix.lower())
    if not processor_cls:
        return None
    return processor_cls()
52
53
54
def list_supported_extensions() -> List[str]:
    """Return every registered file extension, sorted in ascending order."""
    # Iterating the registry dict yields its keys.
    extensions = list(_processors)
    extensions.sort()
    return extensions
57

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button