PlanOpticon

Source Blame History 88 lines
0981a08… noreply 1 """Document ingestion — process files and add content to a knowledge graph."""
0981a08… noreply 2
0981a08… noreply 3 import hashlib
0981a08… noreply 4 import logging
0981a08… noreply 5 import mimetypes
0981a08… noreply 6 from datetime import datetime
0981a08… noreply 7 from pathlib import Path
0981a08… noreply 8 from typing import Dict, List, Optional
0981a08… noreply 9
0981a08… noreply 10 from video_processor.integrators.knowledge_graph import KnowledgeGraph
0981a08… noreply 11 from video_processor.processors.base import get_processor, list_supported_extensions
0981a08… noreply 12
0981a08… noreply 13 logger = logging.getLogger(__name__)
0981a08… noreply 14
0981a08… noreply 15
def ingest_file(
    path: Path,
    knowledge_graph: KnowledgeGraph,
    source_id: Optional[str] = None,
) -> int:
    """Run one file through its processor and load the result into the graph.

    The file is registered as a ``"document"`` source and every chunk the
    processor yields is added as content, labelled with the file name plus
    the chunk's page or section when one is available.

    Args:
        path: File to ingest.
        knowledge_graph: Graph that receives the source record and the chunks.
        source_id: Identifier to register the source under. When ``None``,
            the first 12 hex digits of the SHA-256 of the resolved path
            are used.

    Returns:
        The number of chunks produced by the processor.

    Raises:
        ValueError: If ``get_processor`` returns ``None`` for ``path``.
    """
    processor = get_processor(path)
    if processor is None:
        supported = ", ".join(list_supported_extensions())
        raise ValueError(f"No processor for {path.suffix}. Supported: {supported}")

    chunks = processor.process(path)

    if source_id is None:
        # Derive the id from the resolved path string.
        resolved = str(path.resolve())
        source_id = hashlib.sha256(resolved.encode()).hexdigest()[:12]

    guessed_type, _encoding = mimetypes.guess_type(str(path))
    source_record = {
        "source_id": source_id,
        "source_type": "document",
        "title": path.stem,
        "path": str(path),
        "mime_type": guessed_type or "application/octet-stream",
        "ingested_at": datetime.now().isoformat(),
        "metadata": {"chunks": len(chunks), "extension": path.suffix},
    }
    knowledge_graph.register_source(source_record)

    base_label = f"document:{path.name}"
    for chunk in chunks:
        # A page number takes precedence over a section name.
        if chunk.page is not None:
            label = f"{base_label}:page:{chunk.page}"
        elif chunk.section:
            label = f"{base_label}:section:{chunk.section}"
        else:
            label = base_label
        knowledge_graph.add_content(chunk.text, label)

    return len(chunks)
0981a08… noreply 58
0981a08… noreply 59
def ingest_directory(
    directory: Path,
    knowledge_graph: KnowledgeGraph,
    recursive: bool = True,
    extensions: Optional[List[str]] = None,
) -> Dict[str, int]:
    """Process all supported files in a directory.

    Files are visited in sorted path order. A failure on one file is logged
    as a warning and recorded as ``0`` chunks; it does not stop the run.

    Args:
        directory: Directory to scan.
        knowledge_graph: Graph passed through to ``ingest_file`` for each file.
        recursive: Scan subdirectories too (``rglob``) when true, otherwise
            only the directory itself (``glob``).
        extensions: Optional list of file suffixes to accept. Matching is
            case-insensitive and a missing leading dot is tolerated, so
            ``".PDF"``, ``".pdf"`` and ``"pdf"`` are equivalent. When omitted
            or empty, ``list_supported_extensions()`` is used instead.

    Returns:
        A dict mapping each selected file's path (as a string) to the number
        of chunks ingested from it, or ``0`` if ingestion raised.

    Raises:
        ValueError: If ``directory`` is not a directory.
    """
    if not directory.is_dir():
        raise ValueError(f"Not a directory: {directory}")

    if extensions:
        # File suffixes are lower-cased before the membership test below, so
        # caller-supplied extensions must be normalised the same way or they
        # can never match.
        supported = set()
        for ext in extensions:
            ext = ext.lower()
            if ext and not ext.startswith("."):
                ext = f".{ext}"
            supported.add(ext)
    else:
        supported = set(list_supported_extensions())

    results: Dict[str, int] = {}

    glob_fn = directory.rglob if recursive else directory.glob
    files = sorted(f for f in glob_fn("*") if f.is_file() and f.suffix.lower() in supported)

    for file_path in files:
        try:
            count = ingest_file(file_path, knowledge_graph)
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the batch.
            logger.warning("Failed to ingest %s: %s", file_path.name, e)
            results[str(file_path)] = 0
        else:
            results[str(file_path)] = count
            logger.info("Ingested %s: %s chunks", file_path.name, count)

    return results

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button