|
1
|
"""Base classes and registry for document processors."""
|
2
|
|
|
3
|
from abc import ABC, abstractmethod |
|
4
|
from pathlib import Path |
|
5
|
from typing import Any, Dict, List, Optional |
|
6
|
|
|
7
|
from pydantic import BaseModel, Field |
|
8
|
|
|
9
|
|
|
10
|
class DocumentChunk(BaseModel):
    """A chunk of text from a processed document.

    Only ``text`` and ``source_file`` are required; every other field has a
    default value.
    """

    # The chunk's text content.
    text: str
    # Identifier of the originating file (presumably its path or name --
    # confirm against callers).
    source_file: str
    # Index of this chunk; defaults to 0 (presumably its position within the
    # source document -- confirm against callers).
    chunk_index: int = 0
    # Page number associated with the chunk, or None when not set.
    page: Optional[int] = None
    # Section label associated with the chunk, or None when not set.
    section: Optional[str] = None
    # Free-form extra data; default_factory gives each instance its own dict.
    metadata: Dict[str, Any] = Field(default_factory=dict)
|
19
|
|
|
20
|
|
|
21
|
class DocumentProcessor(ABC):
    """Base class for document processors.

    Concrete subclasses must implement both :meth:`process` and
    :meth:`can_process`; a subclass that leaves either one abstract cannot be
    instantiated.
    """

    # File extensions handled by the processor. The list lives on the class,
    # so subclasses should assign their own list instead of mutating this one
    # (mutation would be visible to every class that has not overridden it).
    supported_extensions: List[str] = []

    @abstractmethod
    def process(self, path: Path) -> List["DocumentChunk"]:
        """Process a document into chunks.

        Args:
            path: Location of the document to process.

        Returns:
            The chunks produced from the document.
        """
        ...

    @abstractmethod
    def can_process(self, path: Path) -> bool:
        """Check if this processor can handle the file.

        Args:
            path: Location of the candidate document.

        Returns:
            True when the processor accepts the file, False otherwise.
        """
        ...
|
35
|
|
|
36
|
|
|
37
|
# Registry: maps a lower-cased file extension to the processor class that was
# registered for it. Lookups use ``path.suffix.lower()``, so keys are expected
# to include the leading dot (e.g. ".pdf").
_processors: Dict[str, type] = {}
|
39
|
|
|
40
|
|
|
41
|
def register_processor(extensions: List[str], processor_class: type) -> None:
    """Register a processor class for the given file extensions.

    Each extension is lower-cased and given a leading dot when it lacks one,
    so that it matches the ``Path.suffix`` form used for lookups. Registering
    an extension that already exists replaces the previous class.

    Args:
        extensions: Extensions to register, e.g. ``[".pdf", "md"]``. A single
            string is treated as one extension rather than iterated
            character by character.
        processor_class: Class to associate with every listed extension.
    """
    if isinstance(extensions, str):
        extensions = [extensions]
    for ext in extensions:
        key = ext.lower()
        # Path.suffix always starts with "." (or is empty), so a key without
        # the dot could never be found by a suffix lookup.
        if key and not key.startswith("."):
            key = f".{key}"
        _processors[key] = processor_class
|
45
|
|
|
46
|
|
|
47
|
def get_processor(path: Path) -> Optional["DocumentProcessor"]:
    """Get a processor instance for the given file path, or None if unsupported.

    The lookup key is the path's final suffix, lower-cased, so matching is
    case-insensitive.

    Args:
        path: File whose extension selects the processor. A plain string or
            other path-like value is accepted and converted to ``Path``.

    Returns:
        A new instance of the registered class (constructed with no
        arguments), or None when no class is registered for the extension.
    """
    suffix = Path(path).suffix.lower()
    processor_class = _processors.get(suffix)
    if processor_class is None:
        return None
    return processor_class()
|
52
|
|
|
53
|
|
|
54
|
def list_supported_extensions() -> List[str]:
    """Return sorted list of all registered file extensions.

    Returns:
        A new list holding every key currently in the registry, in ascending
        order; empty when nothing has been registered.
    """
    # Iterating a dict yields its keys, so no explicit .keys() call is needed.
    return sorted(_processors)
|
57
|
|