|
0981a08…
|
noreply
|
1 |
"""Tests for document processors and ingestion pipeline.""" |
|
0981a08…
|
noreply
|
2 |
|
|
0981a08…
|
noreply
|
3 |
import textwrap |
|
0981a08…
|
noreply
|
4 |
from pathlib import Path |
|
0981a08…
|
noreply
|
5 |
from unittest.mock import MagicMock, patch |
|
0981a08…
|
noreply
|
6 |
|
|
0981a08…
|
noreply
|
7 |
import pytest |
|
0981a08…
|
noreply
|
8 |
|
|
0981a08…
|
noreply
|
9 |
from video_processor.processors.base import ( |
|
0981a08…
|
noreply
|
10 |
DocumentChunk, |
|
0981a08…
|
noreply
|
11 |
DocumentProcessor, |
|
0981a08…
|
noreply
|
12 |
get_processor, |
|
0981a08…
|
noreply
|
13 |
list_supported_extensions, |
|
0981a08…
|
noreply
|
14 |
register_processor, |
|
0981a08…
|
noreply
|
15 |
) |
|
0981a08…
|
noreply
|
16 |
from video_processor.processors.markdown_processor import ( |
|
0981a08…
|
noreply
|
17 |
MarkdownProcessor, |
|
0981a08…
|
noreply
|
18 |
PlaintextProcessor, |
|
0981a08…
|
noreply
|
19 |
_chunk_by_paragraphs, |
|
0981a08…
|
noreply
|
20 |
) |
|
0981a08…
|
noreply
|
21 |
from video_processor.processors.pdf_processor import PdfProcessor |
|
0981a08…
|
noreply
|
22 |
|
|
0981a08…
|
noreply
|
23 |
# --- Base / Registry --- |
|
0981a08…
|
noreply
|
24 |
|
|
0981a08…
|
noreply
|
25 |
|
|
0981a08…
|
noreply
|
26 |
class TestRegistry:
    """Processor registry: extension listing, lookup by path, custom registration."""

    def test_list_supported_extensions_includes_builtins(self):
        """The markdown, plaintext and PDF extensions are all reported as supported."""
        supported = list_supported_extensions()
        for builtin_ext in (".md", ".txt", ".pdf"):
            assert builtin_ext in supported

    def test_get_processor_markdown(self, tmp_path):
        """A ``.md`` path resolves to a MarkdownProcessor instance."""
        doc_path = tmp_path / "doc.md"
        doc_path.write_text("hello")
        resolved = get_processor(doc_path)
        assert isinstance(resolved, MarkdownProcessor)

    def test_get_processor_txt(self, tmp_path):
        """A ``.txt`` path resolves to a PlaintextProcessor instance."""
        doc_path = tmp_path / "doc.txt"
        doc_path.write_text("hello")
        resolved = get_processor(doc_path)
        assert isinstance(resolved, PlaintextProcessor)

    def test_get_processor_pdf(self, tmp_path):
        """A ``.pdf`` path resolves to a PdfProcessor, even for an empty file."""
        doc_path = tmp_path / "doc.pdf"
        doc_path.write_text("")
        resolved = get_processor(doc_path)
        assert isinstance(resolved, PdfProcessor)

    def test_get_processor_unknown(self, tmp_path):
        """An extension nobody registered yields ``None`` rather than a processor."""
        doc_path = tmp_path / "doc.xyz"
        doc_path.write_text("")
        assert get_processor(doc_path) is None

    def test_register_custom_processor(self, tmp_path):
        """A processor registered for ``.custom`` is found by lookup and can process."""

        class CustomProcessor(DocumentProcessor):
            supported_extensions = [".custom"]

            def can_process(self, path):
                return path.suffix == ".custom"

            def process(self, path):
                only_chunk = DocumentChunk(text="custom", source_file=str(path), chunk_index=0)
                return [only_chunk]

        # NOTE(review): this registration is never undone; if the registry is
        # module-global it presumably stays visible to later tests -- confirm.
        register_processor([".custom"], CustomProcessor)

        custom_path = tmp_path / "test.custom"
        custom_path.write_text("data")

        resolved = get_processor(custom_path)
        assert isinstance(resolved, CustomProcessor)

        produced = resolved.process(custom_path)
        assert len(produced) == 1
        assert produced[0].text == "custom"
|
0981a08…
|
noreply
|
74 |
|
|
0981a08…
|
noreply
|
75 |
|
|
0981a08…
|
noreply
|
76 |
# --- Markdown --- |
|
0981a08…
|
noreply
|
77 |
|
|
0981a08…
|
noreply
|
78 |
|
|
0981a08…
|
noreply
|
79 |
class TestMarkdownProcessor:
    """MarkdownProcessor chunking for files with and without headings."""

    def test_splits_by_headings(self, tmp_path):
        """Three headings produce three chunks whose sections carry the heading text."""
        md_path = tmp_path / "test.md"
        body = textwrap.dedent("""\
            # Introduction
            Some intro text.

            ## Details
            More details here.

            ## Conclusion
            Final thoughts.
        """)
        md_path.write_text(body)

        processor = MarkdownProcessor()
        assert processor.can_process(md_path)
        sections = processor.process(md_path)

        assert len(sections) == 3
        intro, details, conclusion = sections
        assert intro.section == "Introduction"
        assert "intro text" in intro.text
        assert details.section == "Details"
        assert conclusion.section == "Conclusion"

    def test_preamble_before_first_heading(self, tmp_path):
        """Text ahead of the first heading becomes its own ``(preamble)`` chunk."""
        md_path = tmp_path / "test.md"
        body = textwrap.dedent("""\
            Some preamble text.

            # First Heading
            Content here.
        """)
        md_path.write_text(body)

        sections = MarkdownProcessor().process(md_path)
        assert len(sections) == 2
        leading = sections[0]
        assert leading.section == "(preamble)"
        assert "preamble" in leading.text

    def test_no_headings_falls_back_to_paragraphs(self, tmp_path):
        """A heading-free file still yields chunks covering the first and last paragraph."""
        md_path = tmp_path / "test.md"
        md_path.write_text("Paragraph one.\n\nParagraph two.\n\nParagraph three.")

        pieces = MarkdownProcessor().process(md_path)
        assert len(pieces) >= 1
        # All text should be captured, whichever chunk it landed in.
        combined = " ".join(piece.text for piece in pieces)
        assert "Paragraph one" in combined
        assert "Paragraph three" in combined

    def test_chunk_index_increments(self, tmp_path):
        """Chunk indices run 0, 1, 2, ... in the order the chunks are returned."""
        md_path = tmp_path / "test.md"
        md_path.write_text("# A\ntext\n# B\ntext\n# C\ntext")

        pieces = MarkdownProcessor().process(md_path)
        observed = [piece.chunk_index for piece in pieces]
        expected = list(range(len(pieces)))
        assert observed == expected

    def test_source_file_set(self, tmp_path):
        """Each chunk records the stringified path of the file it came from."""
        md_path = tmp_path / "test.md"
        md_path.write_text("# Heading\nContent")

        pieces = MarkdownProcessor().process(md_path)
        assert pieces[0].source_file == str(md_path)
|
0981a08…
|
noreply
|
145 |
|
|
0981a08…
|
noreply
|
146 |
|
|
0981a08…
|
noreply
|
147 |
# --- Plaintext --- |
|
0981a08…
|
noreply
|
148 |
|
|
0981a08…
|
noreply
|
149 |
|
|
0981a08…
|
noreply
|
150 |
class TestPlaintextProcessor:
    """PlaintextProcessor on ``.txt``, ``.log`` and ``.csv`` inputs, plus an empty file."""

    def test_basic_paragraphs(self, tmp_path):
        """Blank-line separated paragraphs are chunked without losing first or last."""
        txt_path = tmp_path / "test.txt"
        txt_path.write_text("First paragraph.\n\nSecond paragraph.\n\nThird paragraph.")

        processor = PlaintextProcessor()
        assert processor.can_process(txt_path)

        pieces = processor.process(txt_path)
        assert len(pieces) >= 1
        combined = " ".join(piece.text for piece in pieces)
        assert "First paragraph" in combined
        assert "Third paragraph" in combined

    def test_handles_log_files(self, tmp_path):
        """A ``.log`` file is accepted and produces at least one chunk."""
        log_path = tmp_path / "app.log"
        log_path.write_text("line 1\nline 2\nline 3")

        processor = PlaintextProcessor()
        assert processor.can_process(log_path)
        pieces = processor.process(log_path)
        assert len(pieces) >= 1

    def test_handles_csv(self, tmp_path):
        """A ``.csv`` file is accepted and produces at least one chunk."""
        csv_path = tmp_path / "data.csv"
        csv_path.write_text("a,b,c\n1,2,3\n4,5,6")

        processor = PlaintextProcessor()
        assert processor.can_process(csv_path)
        pieces = processor.process(csv_path)
        assert len(pieces) >= 1

    def test_empty_file(self, tmp_path):
        """An empty file yields an empty list of chunks."""
        empty_path = tmp_path / "empty.txt"
        empty_path.write_text("")

        pieces = PlaintextProcessor().process(empty_path)
        assert pieces == []
|
0981a08…
|
noreply
|
184 |
|
|
0981a08…
|
noreply
|
185 |
|
|
0981a08…
|
noreply
|
186 |
class TestChunkByParagraphs:
    """Size-limit and overlap checks for ``_chunk_by_paragraphs``."""

    def test_respects_max_chunk_size(self):
        """Ten 500-character paragraphs are split into several bounded chunks."""
        # Create text with many paragraphs that together exceed the max size.
        paragraphs = ["A" * 500 for _ in range(10)]
        text = "\n\n".join(paragraphs)
        chunks = _chunk_by_paragraphs(text, "test.txt", max_chunk_size=1200, overlap=100)
        assert len(chunks) > 1
        for chunk in chunks:
            # Loose bound: leaves headroom above max_chunk_size for overlap text.
            assert len(chunk.text) < 2000

    def test_overlap(self):
        """Oversized paragraphs are split and the second chunk exceeds the overlap size."""
        # Each paragraph is 2100 characters, so no two fit in one 2500-char chunk.
        text = "Para A " * 300 + "\n\n" + "Para B " * 300 + "\n\n" + "Para C " * 300
        chunks = _chunk_by_paragraphs(text, "test.txt", max_chunk_size=2500, overlap=200)
        # Assert the split explicitly: guarding the check below with
        # `if len(chunks) > 1` let this test pass without verifying anything
        # whenever the chunker returned a single chunk.
        assert len(chunks) > 1, "expected the text to be split into multiple chunks"
        # The second chunk should contain some overlap from the first, so it
        # must be longer than the overlap alone (size-based proxy only).
        assert len(chunks[1].text) > 200
|
0981a08…
|
noreply
|
203 |
|
|
0981a08…
|
noreply
|
204 |
|
|
0981a08…
|
noreply
|
205 |
# --- PDF --- |
|
0981a08…
|
noreply
|
206 |
|
|
0981a08…
|
noreply
|
207 |
|
|
0981a08…
|
noreply
|
208 |
class TestPdfProcessor:
    """PdfProcessor extension matching and both extraction back-ends (mocked)."""

    def test_can_process(self, tmp_path):
        """``.pdf`` paths are accepted and ``.txt`` paths are rejected."""
        f = tmp_path / "doc.pdf"
        f.write_text("")
        proc = PdfProcessor()
        assert proc.can_process(f)
        assert not proc.can_process(tmp_path / "doc.txt")

    def test_process_pymupdf(self, tmp_path):
        """With a fake ``pymupdf`` module, one page becomes one chunk tagged ``pymupdf``."""
        f = tmp_path / "doc.pdf"
        f.write_text("")

        mock_page = MagicMock()
        mock_page.get_text.return_value = "Page 1 content"

        mock_doc = MagicMock()
        # Configure the pre-wired magic methods instead of replacing them.
        # A list return value is re-wrapped with iter() on every iteration,
        # whereas a pre-built iterator would be exhausted after the first
        # pass and any later iteration would silently see zero pages.
        mock_doc.__iter__.return_value = [mock_page]
        mock_doc.__enter__.return_value = mock_doc
        mock_doc.__exit__.return_value = False  # do not swallow exceptions

        mock_pymupdf = MagicMock()
        mock_pymupdf.open.return_value = mock_doc

        # Inject the fake module so an `import pymupdf` resolves to the mock.
        with patch.dict("sys.modules", {"pymupdf": mock_pymupdf}):
            proc = PdfProcessor()
            chunks = proc._process_pymupdf(f)
            assert len(chunks) == 1
            assert chunks[0].text == "Page 1 content"
            assert chunks[0].page == 1
            assert chunks[0].metadata["extraction_method"] == "pymupdf"

    def test_process_pdfplumber(self, tmp_path):
        """With a fake ``pdfplumber`` module, one page becomes one chunk tagged ``pdfplumber``."""
        f = tmp_path / "doc.pdf"
        f.write_text("")

        mock_page = MagicMock()
        mock_page.extract_text.return_value = "Page 1 via pdfplumber"

        mock_pdf = MagicMock()
        mock_pdf.pages = [mock_page]
        mock_pdf.__enter__.return_value = mock_pdf
        mock_pdf.__exit__.return_value = False  # do not swallow exceptions

        mock_pdfplumber = MagicMock()
        mock_pdfplumber.open.return_value = mock_pdf

        # Inject the fake module so an `import pdfplumber` resolves to the mock.
        with patch.dict("sys.modules", {"pdfplumber": mock_pdfplumber}):
            proc = PdfProcessor()
            chunks = proc._process_pdfplumber(f)
            assert len(chunks) == 1
            assert chunks[0].text == "Page 1 via pdfplumber"
            assert chunks[0].metadata["extraction_method"] == "pdfplumber"

    def test_raises_if_no_library(self, tmp_path):
        """``process`` raises ImportError naming both libraries when both back-ends fail."""
        f = tmp_path / "doc.pdf"
        f.write_text("")
        proc = PdfProcessor()

        # Make both back-ends fail as if neither library were installed.
        with patch.object(proc, "_process_pymupdf", side_effect=ImportError):
            with patch.object(proc, "_process_pdfplumber", side_effect=ImportError):
                with pytest.raises(ImportError, match="pymupdf or pdfplumber"):
                    proc.process(f)
|
0981a08…
|
noreply
|
268 |
|
|
0981a08…
|
noreply
|
269 |
|
|
0981a08…
|
noreply
|
270 |
# --- Ingest --- |
|
0981a08…
|
noreply
|
271 |
|
|
0981a08…
|
noreply
|
272 |
|
|
0981a08…
|
noreply
|
273 |
class TestIngest:
    """``ingest_file`` / ``ingest_directory`` against a mocked knowledge graph."""

    def test_ingest_file(self, tmp_path):
        """One-section markdown: count 1, one source registered, one content call."""
        doc_path = tmp_path / "doc.md"
        doc_path.write_text("# Title\nSome content here.")

        graph = MagicMock()
        graph.register_source = MagicMock()
        graph.add_content = MagicMock()

        from video_processor.processors.ingest import ingest_file

        ingested = ingest_file(doc_path, graph)
        assert ingested == 1

        graph.register_source.assert_called_once()
        # First positional argument of the single register_source call.
        registered = graph.register_source.call_args[0][0]
        assert registered["source_type"] == "document"
        assert registered["title"] == "doc"
        graph.add_content.assert_called_once()

    def test_ingest_file_unsupported(self, tmp_path):
        """An extension without a processor raises ValueError matching ``No processor``."""
        unknown_path = tmp_path / "data.xyz"
        unknown_path.write_text("stuff")
        graph = MagicMock()

        from video_processor.processors.ingest import ingest_file

        with pytest.raises(ValueError, match="No processor"):
            ingest_file(unknown_path, graph)

    def test_ingest_directory(self, tmp_path):
        """Non-recursive ingest reports the two supported files and skips ``c.xyz``."""
        fixtures = {
            "a.md": "# A\nContent A",
            "b.txt": "Content B",
            "c.xyz": "Ignored",
        }
        for filename, contents in fixtures.items():
            (tmp_path / filename).write_text(contents)

        graph = MagicMock()

        from video_processor.processors.ingest import ingest_directory

        outcome = ingest_directory(tmp_path, graph, recursive=False)
        # Should process a.md and b.txt but not c.xyz
        assert len(outcome) == 2
        seen_names = {Path(entry).name for entry in outcome}
        assert "a.md" in seen_names
        assert "b.txt" in seen_names

    def test_ingest_directory_recursive(self, tmp_path):
        """Recursive ingest reports both the top-level and the nested file."""
        nested_dir = tmp_path / "sub"
        nested_dir.mkdir()
        (tmp_path / "top.md").write_text("# Top\nTop level")
        (nested_dir / "nested.md").write_text("# Nested\nNested content")

        graph = MagicMock()

        from video_processor.processors.ingest import ingest_directory

        outcome = ingest_directory(tmp_path, graph, recursive=True)
        assert len(outcome) == 2
        seen_names = {Path(entry).name for entry in outcome}
        assert "top.md" in seen_names
        assert "nested.md" in seen_names

    def test_ingest_file_custom_source_id(self, tmp_path):
        """An explicit ``source_id`` is passed through to ``register_source``."""
        doc_path = tmp_path / "doc.md"
        doc_path.write_text("# Title\nContent")

        graph = MagicMock()

        from video_processor.processors.ingest import ingest_file

        ingest_file(doc_path, graph, source_id="custom-123")
        registered = graph.register_source.call_args[0][0]
        assert registered["source_id"] == "custom-123"

    def test_ingest_content_source_format_with_section(self, tmp_path):
        """Each ``add_content`` call's second positional arg names the file and section."""
        doc_path = tmp_path / "doc.md"
        doc_path.write_text("# Introduction\nSome text\n\n## Details\nMore text")

        graph = MagicMock()

        from video_processor.processors.ingest import ingest_file

        ingest_file(doc_path, graph)
        # Check content_source includes section info
        recorded = graph.add_content.call_args_list
        assert len(recorded) == 2
        first_call, second_call = recorded
        assert "document:doc.md:section:Introduction" in first_call[0][1]
        assert "document:doc.md:section:Details" in second_call[0][1]