|
0981a08…
|
noreply
|
1 |
"""Document ingestion — process files and add content to a knowledge graph.""" |
|
0981a08…
|
noreply
|
2 |
|
|
0981a08…
|
noreply
|
3 |
import hashlib |
|
0981a08…
|
noreply
|
4 |
import logging |
|
0981a08…
|
noreply
|
5 |
import mimetypes |
|
0981a08…
|
noreply
|
6 |
from datetime import datetime |
|
0981a08…
|
noreply
|
7 |
from pathlib import Path |
|
0981a08…
|
noreply
|
8 |
from typing import Dict, List, Optional |
|
0981a08…
|
noreply
|
9 |
|
|
0981a08…
|
noreply
|
10 |
from video_processor.integrators.knowledge_graph import KnowledgeGraph |
|
0981a08…
|
noreply
|
11 |
from video_processor.processors.base import get_processor, list_supported_extensions |
|
0981a08…
|
noreply
|
12 |
|
|
0981a08…
|
noreply
|
13 |
logger = logging.getLogger(__name__) |
|
0981a08…
|
noreply
|
14 |
|
|
0981a08…
|
noreply
|
15 |
|
|
0981a08…
|
noreply
|
16 |
def ingest_file(
    path: Path,
    knowledge_graph: KnowledgeGraph,
    source_id: Optional[str] = None,
) -> int:
    """Ingest one file into the knowledge graph.

    The file is split into chunks by the processor that ``get_processor``
    returns for it. A source record describing the file is registered on the
    graph, then each chunk's text is added under a ``document:<filename>``
    label, extended with the chunk's page or section when it has one.

    Args:
        path: File to ingest.
        knowledge_graph: Graph that receives the source record and content.
        source_id: Identifier for the source. When ``None``, the first 12 hex
            digits of the SHA-256 of the resolved path string are used.

    Returns:
        The number of chunks produced by the processor.

    Raises:
        ValueError: If ``get_processor`` returns ``None`` for ``path``.
    """
    handler = get_processor(path)
    if handler is None:
        raise ValueError(
            f"No processor for {path.suffix}. Supported: {', '.join(list_supported_extensions())}"
        )

    pieces = handler.process(path)

    if source_id is None:
        # Derive an id from the absolute location of the file.
        path_digest = hashlib.sha256(str(path.resolve()).encode()).hexdigest()
        source_id = path_digest[:12]

    guessed_type, _encoding = mimetypes.guess_type(str(path))
    source_record = {
        "source_id": source_id,
        "source_type": "document",
        "title": path.stem,
        "path": str(path),
        "mime_type": guessed_type or "application/octet-stream",
        "ingested_at": datetime.now().isoformat(),
        "metadata": {"chunks": len(pieces), "extension": path.suffix},
    }
    knowledge_graph.register_source(source_record)

    # The filename part of the label is the same for every chunk.
    base_label = f"document:{path.name}"
    for piece in pieces:
        # A page number takes precedence over a section name.
        if piece.page is not None:
            locator = f":page:{piece.page}"
        elif piece.section:
            locator = f":section:{piece.section}"
        else:
            locator = ""
        knowledge_graph.add_content(piece.text, base_label + locator)

    return len(pieces)
|
0981a08…
|
noreply
|
58 |
|
|
0981a08…
|
noreply
|
59 |
|
|
0981a08…
|
noreply
|
60 |
def ingest_directory(
    directory: Path,
    knowledge_graph: KnowledgeGraph,
    recursive: bool = True,
    extensions: Optional[List[str]] = None,
) -> Dict[str, int]:
    """Process all supported files in a directory.

    Args:
        directory: Directory to scan.
        knowledge_graph: Graph passed to ``ingest_file`` for every file.
        recursive: When true, subdirectories are scanned as well; otherwise
            only the directory's direct children are considered.
        extensions: Optional suffix filter. Entries are matched
            case-insensitively and may be written with or without the leading
            dot (``".pdf"``, ``"PDF"`` and ``"pdf"`` are equivalent). When
            omitted or empty, ``list_supported_extensions()`` is used.

    Returns:
        A dict mapping each matched file's path (as a string) to its chunk
        count. A file whose ingestion raised is logged and recorded with 0,
        so 0 can mean either "failed" or "produced no chunks".

    Raises:
        ValueError: If ``directory`` is not a directory.
    """
    if not directory.is_dir():
        raise ValueError(f"Not a directory: {directory}")

    if extensions:
        # File suffixes are compared lower-cased and include the dot, so bring
        # caller-supplied entries into the same form; otherwise ".PDF" or "pdf"
        # would silently match nothing. An empty entry is kept as-is so it
        # still selects files that have no suffix.
        supported = set()
        for ext in extensions:
            ext = ext.lower()
            if ext and not ext.startswith("."):
                ext = f".{ext}"
            supported.add(ext)
    else:
        supported = set(list_supported_extensions())

    results: Dict[str, int] = {}

    glob_fn = directory.rglob if recursive else directory.glob
    # Sorted so files are ingested in a deterministic order.
    files = sorted(f for f in glob_fn("*") if f.is_file() and f.suffix.lower() in supported)

    for file_path in files:
        key = str(file_path)
        try:
            count = ingest_file(file_path, knowledge_graph)
        except Exception as e:
            # Deliberately broad: one bad file must not abort the whole batch.
            logger.warning("Failed to ingest %s: %s", file_path.name, e)
            results[key] = 0
        else:
            results[key] = count
            logger.info("Ingested %s: %s chunks", file_path.name, count)

    return results