|
1
|
"""Document ingestion — process files and add content to a knowledge graph.""" |
|
2
|
|
|
3
|
import hashlib |
|
4
|
import logging |
|
5
|
import mimetypes |
|
6
|
from datetime import datetime |
|
7
|
from pathlib import Path |
|
8
|
from typing import Dict, List, Optional |
|
9
|
|
|
10
|
from video_processor.integrators.knowledge_graph import KnowledgeGraph |
|
11
|
from video_processor.processors.base import get_processor, list_supported_extensions |
|
12
|
|
|
13
|
logger = logging.getLogger(__name__) |
|
14
|
|
|
15
|
|
|
16
|
def ingest_file(
    path: Path,
    knowledge_graph: KnowledgeGraph,
    source_id: Optional[str] = None,
) -> int:
    """Process a single file and add its content to the knowledge graph.

    Args:
        path: File to ingest; its suffix selects the processor.
        knowledge_graph: Graph that receives the source record and each
            chunk's text.
        source_id: Stable identifier for this source. Defaults to the first
            12 hex chars of the SHA-256 of the resolved path, so re-ingesting
            the same file reuses the same id.

    Returns:
        The number of chunks extracted and added.

    Raises:
        ValueError: If no processor handles ``path.suffix``.
    """
    processor = get_processor(path)
    if processor is None:
        raise ValueError(
            f"No processor for {path.suffix}. Supported: {', '.join(list_supported_extensions())}"
        )

    chunks = processor.process(path)

    if source_id is None:
        # Deterministic id from the absolute path: repeated ingestion of the
        # same file maps to the same source record.
        source_id = hashlib.sha256(str(path.resolve()).encode()).hexdigest()[:12]

    mime = mimetypes.guess_type(str(path))[0] or "application/octet-stream"
    knowledge_graph.register_source(
        {
            "source_id": source_id,
            "source_type": "document",
            "title": path.stem,
            "path": str(path),
            "mime_type": mime,
            # Timezone-aware local time (was a naive datetime.now()), so the
            # UTC offset survives the isoformat() round-trip.
            "ingested_at": datetime.now().astimezone().isoformat(),
            "metadata": {"chunks": len(chunks), "extension": path.suffix},
        }
    )

    for chunk in chunks:
        # Tag each chunk with its origin (page or section when the processor
        # reports one) so graph content stays traceable to the document.
        content_source = f"document:{path.name}"
        if chunk.page is not None:
            content_source += f":page:{chunk.page}"
        elif chunk.section:
            content_source += f":section:{chunk.section}"
        knowledge_graph.add_content(chunk.text, content_source)

    return len(chunks)
|
58
|
|
|
59
|
|
|
60
|
def ingest_directory(
    directory: Path,
    knowledge_graph: KnowledgeGraph,
    recursive: bool = True,
    extensions: Optional[List[str]] = None,
) -> Dict[str, int]:
    """Process all supported files in a directory.

    Args:
        directory: Directory to scan.
        knowledge_graph: Graph passed through to :func:`ingest_file`.
        recursive: Descend into subdirectories when True.
        extensions: Restrict ingestion to these extensions. Entries are
            normalized to lowercase with a leading dot, so ``"PDF"``,
            ``"pdf"`` and ``".pdf"`` are equivalent. Defaults to every
            supported extension.

    Returns:
        Mapping of file path (as str) to chunk count; files that failed to
        ingest map to 0.

    Raises:
        ValueError: If ``directory`` is not a directory.
    """
    if not directory.is_dir():
        raise ValueError(f"Not a directory: {directory}")

    if extensions:
        # Normalize caller-supplied extensions so they match f.suffix.lower()
        # below; previously "PDF" or "pdf" (no dot) silently matched nothing.
        supported = {
            ext.lower() if ext.startswith(".") else f".{ext.lower()}"
            for ext in extensions
        }
    else:
        supported = set(list_supported_extensions())
    results: Dict[str, int] = {}

    glob_fn = directory.rglob if recursive else directory.glob
    # Sort for deterministic ingestion order across runs/platforms.
    files = sorted(f for f in glob_fn("*") if f.is_file() and f.suffix.lower() in supported)

    for file_path in files:
        try:
            count = ingest_file(file_path, knowledge_graph)
            results[str(file_path)] = count
            # Lazy %-args: no string formatting when INFO is disabled.
            logger.info("Ingested %s: %s chunks", file_path.name, count)
        except Exception as e:
            # Best-effort batch: one bad file must not abort the rest,
            # but it is recorded (count 0) and logged.
            logger.warning("Failed to ingest %s: %s", file_path.name, e)
            results[str(file_path)] = 0

    return results
|
89
|
|