PlanOpticon

planopticon / tests / test_processors.py
Source Blame History 359 lines
0981a08… noreply 1 """Tests for document processors and ingestion pipeline."""
0981a08… noreply 2
0981a08… noreply 3 import textwrap
0981a08… noreply 4 from pathlib import Path
0981a08… noreply 5 from unittest.mock import MagicMock, patch
0981a08… noreply 6
0981a08… noreply 7 import pytest
0981a08… noreply 8
0981a08… noreply 9 from video_processor.processors.base import (
0981a08… noreply 10 DocumentChunk,
0981a08… noreply 11 DocumentProcessor,
0981a08… noreply 12 get_processor,
0981a08… noreply 13 list_supported_extensions,
0981a08… noreply 14 register_processor,
0981a08… noreply 15 )
0981a08… noreply 16 from video_processor.processors.markdown_processor import (
0981a08… noreply 17 MarkdownProcessor,
0981a08… noreply 18 PlaintextProcessor,
0981a08… noreply 19 _chunk_by_paragraphs,
0981a08… noreply 20 )
0981a08… noreply 21 from video_processor.processors.pdf_processor import PdfProcessor
0981a08… noreply 22
0981a08… noreply 23 # --- Base / Registry ---
0981a08… noreply 24
0981a08… noreply 25
class TestRegistry:
    """Checks for processor lookup, extension listing, and custom registration."""

    def test_list_supported_extensions_includes_builtins(self):
        """The markdown, plaintext, and PDF extensions are all reported."""
        supported = list_supported_extensions()
        for extension in (".md", ".txt", ".pdf"):
            assert extension in supported

    def test_get_processor_markdown(self, tmp_path):
        """A ``.md`` path resolves to a MarkdownProcessor."""
        doc_path = tmp_path / "doc.md"
        doc_path.write_text("hello")
        assert isinstance(get_processor(doc_path), MarkdownProcessor)

    def test_get_processor_txt(self, tmp_path):
        """A ``.txt`` path resolves to a PlaintextProcessor."""
        doc_path = tmp_path / "doc.txt"
        doc_path.write_text("hello")
        assert isinstance(get_processor(doc_path), PlaintextProcessor)

    def test_get_processor_pdf(self, tmp_path):
        """A ``.pdf`` path resolves to a PdfProcessor (the file itself is empty)."""
        doc_path = tmp_path / "doc.pdf"
        doc_path.write_text("")
        assert isinstance(get_processor(doc_path), PdfProcessor)

    def test_get_processor_unknown(self, tmp_path):
        """An extension nobody registered yields ``None``."""
        doc_path = tmp_path / "doc.xyz"
        doc_path.write_text("")
        resolved = get_processor(doc_path)
        assert resolved is None

    def test_register_custom_processor(self, tmp_path):
        """A processor registered for ``.custom`` is found by lookup and can be run."""

        class CustomProcessor(DocumentProcessor):
            supported_extensions = [".custom"]

            def can_process(self, path):
                return path.suffix == ".custom"

            def process(self, path):
                return [DocumentChunk(text="custom", source_file=str(path), chunk_index=0)]

        # NOTE(review): the registration is never undone; if the registry is
        # shared module state this leaks into later tests — confirm.
        register_processor([".custom"], CustomProcessor)

        custom_file = tmp_path / "test.custom"
        custom_file.write_text("data")

        resolved = get_processor(custom_file)
        assert isinstance(resolved, CustomProcessor)

        produced = resolved.process(custom_file)
        assert len(produced) == 1
        assert produced[0].text == "custom"
0981a08… noreply 74
0981a08… noreply 75
0981a08… noreply 76 # --- Markdown ---
0981a08… noreply 77
0981a08… noreply 78
class TestMarkdownProcessor:
    """Checks how MarkdownProcessor turns markdown files into chunks."""

    def test_splits_by_headings(self, tmp_path):
        """Three headings give three chunks, each labelled with its heading text."""
        content = textwrap.dedent(
            """\
            # Introduction
            Some intro text.

            ## Details
            More details here.

            ## Conclusion
            Final thoughts.
            """
        )
        md_path = tmp_path / "test.md"
        md_path.write_text(content)

        processor = MarkdownProcessor()
        assert processor.can_process(md_path)
        sections = processor.process(md_path)

        assert len(sections) == 3
        intro, details, conclusion = sections
        assert intro.section == "Introduction"
        assert "intro text" in intro.text
        assert details.section == "Details"
        assert conclusion.section == "Conclusion"

    def test_preamble_before_first_heading(self, tmp_path):
        """Text ahead of the first heading becomes its own ``(preamble)`` chunk."""
        content = textwrap.dedent(
            """\
            Some preamble text.

            # First Heading
            Content here.
            """
        )
        md_path = tmp_path / "test.md"
        md_path.write_text(content)

        sections = MarkdownProcessor().process(md_path)
        assert len(sections) == 2
        leading = sections[0]
        assert leading.section == "(preamble)"
        assert "preamble" in leading.text

    def test_no_headings_falls_back_to_paragraphs(self, tmp_path):
        """A heading-free file still yields chunks covering first and last paragraphs."""
        md_path = tmp_path / "test.md"
        md_path.write_text("Paragraph one.\n\nParagraph two.\n\nParagraph three.")

        sections = MarkdownProcessor().process(md_path)
        assert len(sections) >= 1

        # Join every chunk so the check does not depend on where the splits fall
        combined = " ".join(section.text for section in sections)
        assert "Paragraph one" in combined
        assert "Paragraph three" in combined

    def test_chunk_index_increments(self, tmp_path):
        """``chunk_index`` values run 0, 1, 2, ... in output order."""
        md_path = tmp_path / "test.md"
        md_path.write_text("# A\ntext\n# B\ntext\n# C\ntext")

        sections = MarkdownProcessor().process(md_path)
        observed = [section.chunk_index for section in sections]
        expected = list(range(len(sections)))
        assert observed == expected

    def test_source_file_set(self, tmp_path):
        """The first chunk records the string form of the path it came from."""
        md_path = tmp_path / "test.md"
        md_path.write_text("# Heading\nContent")

        sections = MarkdownProcessor().process(md_path)
        assert sections[0].source_file == str(md_path)
0981a08… noreply 145
0981a08… noreply 146
0981a08… noreply 147 # --- Plaintext ---
0981a08… noreply 148
0981a08… noreply 149
class TestPlaintextProcessor:
    """Checks PlaintextProcessor against text, log, CSV, and empty inputs."""

    def test_basic_paragraphs(self, tmp_path):
        """A three-paragraph file yields chunks holding the first and last paragraphs."""
        text_path = tmp_path / "test.txt"
        text_path.write_text("First paragraph.\n\nSecond paragraph.\n\nThird paragraph.")

        processor = PlaintextProcessor()
        assert processor.can_process(text_path)

        pieces = processor.process(text_path)
        assert len(pieces) >= 1

        combined = " ".join(piece.text for piece in pieces)
        assert "First paragraph" in combined
        assert "Third paragraph" in combined

    def test_handles_log_files(self, tmp_path):
        """A ``.log`` file is accepted and produces at least one chunk."""
        log_path = tmp_path / "app.log"
        log_path.write_text("line 1\nline 2\nline 3")

        processor = PlaintextProcessor()
        assert processor.can_process(log_path)

        pieces = processor.process(log_path)
        assert len(pieces) >= 1

    def test_handles_csv(self, tmp_path):
        """A ``.csv`` file is accepted and produces at least one chunk."""
        csv_path = tmp_path / "data.csv"
        csv_path.write_text("a,b,c\n1,2,3\n4,5,6")

        processor = PlaintextProcessor()
        assert processor.can_process(csv_path)

        pieces = processor.process(csv_path)
        assert len(pieces) >= 1

    def test_empty_file(self, tmp_path):
        """An empty file produces an empty list of chunks."""
        empty_path = tmp_path / "empty.txt"
        empty_path.write_text("")

        pieces = PlaintextProcessor().process(empty_path)
        assert pieces == []
0981a08… noreply 184
0981a08… noreply 185
class TestChunkByParagraphs:
    """Checks the ``_chunk_by_paragraphs`` helper's size limit and overlap handling."""

    def test_respects_max_chunk_size(self):
        """Ten 500-character paragraphs split into several chunks of bounded length."""
        # Create text with many paragraphs that exceed max size
        paragraphs = ["A" * 500 for _ in range(10)]
        text = "\n\n".join(paragraphs)
        chunks = _chunk_by_paragraphs(text, "test.txt", max_chunk_size=1200, overlap=100)
        assert len(chunks) > 1
        for chunk in chunks:
            # Each chunk should be reasonably sized (allowing for overlap)
            assert len(chunk.text) < 2000

    def test_overlap(self):
        """Oversized input splits, and the second chunk is longer than the overlap."""
        # Each paragraph is 7 * 300 = 2100 characters, so no two of them fit
        # together under max_chunk_size=2500 and the text is expected to split.
        text = "Para A " * 300 + "\n\n" + "Para B " * 300 + "\n\n" + "Para C " * 300
        chunks = _chunk_by_paragraphs(text, "test.txt", max_chunk_size=2500, overlap=200)

        # Assert the split outright: guarding the check below with
        # `if len(chunks) > 1` would let this test pass without verifying
        # anything if the chunker ever stopped splitting.
        assert len(chunks) > 1

        # The second chunk should contain some overlap from the first
        assert len(chunks[1].text) > 200
0981a08… noreply 203
0981a08… noreply 204
0981a08… noreply 205 # --- PDF ---
0981a08… noreply 206
0981a08… noreply 207
class TestPdfProcessor:
    """Checks PdfProcessor using mocked ``pymupdf`` / ``pdfplumber`` modules."""

    def test_can_process(self, tmp_path):
        """``.pdf`` paths are accepted; a ``.txt`` path is rejected."""
        pdf_path = tmp_path / "doc.pdf"
        pdf_path.write_text("")
        other_path = tmp_path / "doc.txt"

        processor = PdfProcessor()
        assert processor.can_process(pdf_path)
        assert not processor.can_process(other_path)

    def test_process_pymupdf(self, tmp_path):
        """The pymupdf path returns one chunk per page, tagged with its method."""
        pdf_path = tmp_path / "doc.pdf"
        pdf_path.write_text("")

        # Fake document: iterable over a single page and usable as a context manager
        fake_page = MagicMock()
        fake_page.get_text.return_value = "Page 1 content"
        fake_doc = MagicMock()
        fake_doc.__iter__.return_value = iter([fake_page])
        fake_doc.__enter__.return_value = fake_doc
        fake_doc.__exit__.return_value = False

        fake_module = MagicMock()
        fake_module.open.return_value = fake_doc

        # Swap the fake in for the real module for the duration of the call
        with patch.dict("sys.modules", {"pymupdf": fake_module}):
            pages = PdfProcessor()._process_pymupdf(pdf_path)
            assert len(pages) == 1
            only_page = pages[0]
            assert only_page.text == "Page 1 content"
            assert only_page.page == 1
            assert only_page.metadata["extraction_method"] == "pymupdf"

    def test_process_pdfplumber(self, tmp_path):
        """The pdfplumber path returns the page text, tagged with its method."""
        pdf_path = tmp_path / "doc.pdf"
        pdf_path.write_text("")

        # Fake PDF: exposes a one-element ``pages`` list and acts as a context manager
        fake_page = MagicMock()
        fake_page.extract_text.return_value = "Page 1 via pdfplumber"
        fake_pdf = MagicMock()
        fake_pdf.pages = [fake_page]
        fake_pdf.__enter__.return_value = fake_pdf
        fake_pdf.__exit__.return_value = False

        fake_module = MagicMock()
        fake_module.open.return_value = fake_pdf

        with patch.dict("sys.modules", {"pdfplumber": fake_module}):
            pages = PdfProcessor()._process_pdfplumber(pdf_path)
            assert len(pages) == 1
            only_page = pages[0]
            assert only_page.text == "Page 1 via pdfplumber"
            assert only_page.metadata["extraction_method"] == "pdfplumber"

    def test_raises_if_no_library(self, tmp_path):
        """``process`` raises ImportError when both extraction paths raise ImportError."""
        pdf_path = tmp_path / "doc.pdf"
        pdf_path.write_text("")
        processor = PdfProcessor()

        # Make each backend-specific method fail as if its library were missing
        no_pymupdf = patch.object(processor, "_process_pymupdf", side_effect=ImportError)
        no_pdfplumber = patch.object(processor, "_process_pdfplumber", side_effect=ImportError)
        expect_error = pytest.raises(ImportError, match="pymupdf or pdfplumber")

        with no_pymupdf, no_pdfplumber, expect_error:
            processor.process(pdf_path)
0981a08… noreply 268
0981a08… noreply 269
0981a08… noreply 270 # --- Ingest ---
0981a08… noreply 271
0981a08… noreply 272
class TestIngest:
    """Checks ``ingest_file`` / ``ingest_directory`` against a mocked knowledge graph."""

    def test_ingest_file(self, tmp_path):
        """One markdown section: a single source registration and one content call."""
        doc_path = tmp_path / "doc.md"
        doc_path.write_text("# Title\nSome content here.")

        graph = MagicMock()
        graph.register_source = MagicMock()
        graph.add_content = MagicMock()

        from video_processor.processors.ingest import ingest_file

        ingested = ingest_file(doc_path, graph)
        assert ingested == 1

        graph.register_source.assert_called_once()
        positional_args = graph.register_source.call_args[0]
        source_info = positional_args[0]
        assert source_info["source_type"] == "document"
        assert source_info["title"] == "doc"

        graph.add_content.assert_called_once()

    def test_ingest_file_unsupported(self, tmp_path):
        """A file with no matching processor raises ValueError."""
        unknown_path = tmp_path / "data.xyz"
        unknown_path.write_text("stuff")
        graph = MagicMock()

        from video_processor.processors.ingest import ingest_file

        with pytest.raises(ValueError, match="No processor"):
            ingest_file(unknown_path, graph)

    def test_ingest_directory(self, tmp_path):
        """Non-recursive ingest reports the two supported files and skips ``c.xyz``."""
        fixtures = {
            "a.md": "# A\nContent A",
            "b.txt": "Content B",
            "c.xyz": "Ignored",
        }
        for file_name, body in fixtures.items():
            (tmp_path / file_name).write_text(body)

        graph = MagicMock()

        from video_processor.processors.ingest import ingest_directory

        outcome = ingest_directory(tmp_path, graph, recursive=False)

        # Should process a.md and b.txt but not c.xyz
        assert len(outcome) == 2
        seen_names = {Path(entry).name for entry in outcome}
        assert "a.md" in seen_names
        assert "b.txt" in seen_names

    def test_ingest_directory_recursive(self, tmp_path):
        """Recursive ingest reports files from the top level and a subdirectory."""
        nested_dir = tmp_path / "sub"
        nested_dir.mkdir()
        (tmp_path / "top.md").write_text("# Top\nTop level")
        (nested_dir / "nested.md").write_text("# Nested\nNested content")

        graph = MagicMock()

        from video_processor.processors.ingest import ingest_directory

        outcome = ingest_directory(tmp_path, graph, recursive=True)
        assert len(outcome) == 2
        seen_names = {Path(entry).name for entry in outcome}
        assert "top.md" in seen_names
        assert "nested.md" in seen_names

    def test_ingest_file_custom_source_id(self, tmp_path):
        """An explicit ``source_id`` is passed through to ``register_source``."""
        doc_path = tmp_path / "doc.md"
        doc_path.write_text("# Title\nContent")

        graph = MagicMock()

        from video_processor.processors.ingest import ingest_file

        ingest_file(doc_path, graph, source_id="custom-123")

        positional_args = graph.register_source.call_args[0]
        source_info = positional_args[0]
        assert source_info["source_id"] == "custom-123"

    def test_ingest_content_source_format_with_section(self, tmp_path):
        """Each ``add_content`` call's second argument names the file and section."""
        doc_path = tmp_path / "doc.md"
        doc_path.write_text("# Introduction\nSome text\n\n## Details\nMore text")

        graph = MagicMock()

        from video_processor.processors.ingest import ingest_file

        ingest_file(doc_path, graph)

        # Check content_source includes section info
        recorded = graph.add_content.call_args_list
        assert len(recorded) == 2

        # Each recorded call is (args, kwargs); the label is the second positional arg
        first_label = recorded[0][0][1]
        second_label = recorded[1][0][1]
        assert "document:doc.md:section:Introduction" in first_label
        assert "document:doc.md:section:Details" in second_label

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button