PlanOpticon

planopticon / tests / test_processors.py

Source Blame History 359 lines

0981a08…	noreply	1	"""Tests for document processors and ingestion pipeline."""
0981a08…	noreply	2
0981a08…	noreply	3	import textwrap
0981a08…	noreply	4	from pathlib import Path
0981a08…	noreply	5	from unittest.mock import MagicMock, patch
0981a08…	noreply	6
0981a08…	noreply	7	import pytest
0981a08…	noreply	8
0981a08…	noreply	9	from video_processor.processors.base import (
0981a08…	noreply	10	DocumentChunk,
0981a08…	noreply	11	DocumentProcessor,
0981a08…	noreply	12	get_processor,
0981a08…	noreply	13	list_supported_extensions,
0981a08…	noreply	14	register_processor,
0981a08…	noreply	15	)
0981a08…	noreply	16	from video_processor.processors.markdown_processor import (
0981a08…	noreply	17	MarkdownProcessor,
0981a08…	noreply	18	PlaintextProcessor,
0981a08…	noreply	19	_chunk_by_paragraphs,
0981a08…	noreply	20	)
0981a08…	noreply	21	from video_processor.processors.pdf_processor import PdfProcessor
0981a08…	noreply	22
0981a08…	noreply	23	# --- Base / Registry ---
0981a08…	noreply	24
0981a08…	noreply	25
class TestRegistry:
    """Tests for the processor registry helpers."""

    def test_list_supported_extensions_includes_builtins(self):
        """The ``.md``, ``.txt`` and ``.pdf`` extensions are reported."""
        supported = list_supported_extensions()
        for extension in (".md", ".txt", ".pdf"):
            assert extension in supported

    def test_get_processor_markdown(self, tmp_path):
        """A ``.md`` path resolves to a ``MarkdownProcessor``."""
        doc_path = tmp_path / "doc.md"
        doc_path.write_text("hello")
        assert isinstance(get_processor(doc_path), MarkdownProcessor)

    def test_get_processor_txt(self, tmp_path):
        """A ``.txt`` path resolves to a ``PlaintextProcessor``."""
        doc_path = tmp_path / "doc.txt"
        doc_path.write_text("hello")
        assert isinstance(get_processor(doc_path), PlaintextProcessor)

    def test_get_processor_pdf(self, tmp_path):
        """A ``.pdf`` path resolves to a ``PdfProcessor``."""
        doc_path = tmp_path / "doc.pdf"
        doc_path.write_text("")
        assert isinstance(get_processor(doc_path), PdfProcessor)

    def test_get_processor_unknown(self, tmp_path):
        """An extension nobody registered yields ``None``."""
        doc_path = tmp_path / "doc.xyz"
        doc_path.write_text("")
        resolved = get_processor(doc_path)
        assert resolved is None

    def test_register_custom_processor(self, tmp_path):
        """A processor registered for ``.custom`` is returned and usable."""

        class CustomProcessor(DocumentProcessor):
            supported_extensions = [".custom"]

            def can_process(self, path):
                return path.suffix == ".custom"

            def process(self, path):
                only_chunk = DocumentChunk(
                    text="custom", source_file=str(path), chunk_index=0
                )
                return [only_chunk]

        register_processor([".custom"], CustomProcessor)

        custom_path = tmp_path / "test.custom"
        custom_path.write_text("data")

        resolved = get_processor(custom_path)
        assert isinstance(resolved, CustomProcessor)

        produced = resolved.process(custom_path)
        assert len(produced) == 1
        assert produced[0].text == "custom"
0981a08…	noreply	74
0981a08…	noreply	75
0981a08…	noreply	76	# --- Markdown ---
0981a08…	noreply	77
0981a08…	noreply	78
class TestMarkdownProcessor:
    """Tests for heading-based chunking in ``MarkdownProcessor``."""

    def test_splits_by_headings(self, tmp_path):
        """Three headings produce three chunks named after the headings."""
        source = textwrap.dedent(
            """\
            # Introduction
            Some intro text.

            ## Details
            More details here.

            ## Conclusion
            Final thoughts.
            """
        )
        md_path = tmp_path / "test.md"
        md_path.write_text(source)

        processor = MarkdownProcessor()
        assert processor.can_process(md_path)
        sections = processor.process(md_path)

        assert len(sections) == 3
        intro, details, conclusion = sections
        assert intro.section == "Introduction"
        assert "intro text" in intro.text
        assert details.section == "Details"
        assert conclusion.section == "Conclusion"

    def test_preamble_before_first_heading(self, tmp_path):
        """Text ahead of the first heading lands in a ``(preamble)`` chunk."""
        source = textwrap.dedent(
            """\
            Some preamble text.

            # First Heading
            Content here.
            """
        )
        md_path = tmp_path / "test.md"
        md_path.write_text(source)

        sections = MarkdownProcessor().process(md_path)
        assert len(sections) == 2
        leading = sections[0]
        assert leading.section == "(preamble)"
        assert "preamble" in leading.text

    def test_no_headings_falls_back_to_paragraphs(self, tmp_path):
        """A heading-free document still yields chunks covering its text."""
        md_path = tmp_path / "test.md"
        md_path.write_text("Paragraph one.\n\nParagraph two.\n\nParagraph three.")

        sections = MarkdownProcessor().process(md_path)
        assert len(sections) >= 1

        # The first and last paragraphs must both survive chunking.
        combined = " ".join(section.text for section in sections)
        assert "Paragraph one" in combined
        assert "Paragraph three" in combined

    def test_chunk_index_increments(self, tmp_path):
        """``chunk_index`` values count up from zero in output order."""
        md_path = tmp_path / "test.md"
        md_path.write_text("# A\ntext\n# B\ntext\n# C\ntext")

        sections = MarkdownProcessor().process(md_path)
        observed = [section.chunk_index for section in sections]
        expected = list(range(len(sections)))
        assert observed == expected

    def test_source_file_set(self, tmp_path):
        """``source_file`` holds the stringified input path."""
        md_path = tmp_path / "test.md"
        md_path.write_text("# Heading\nContent")

        first = MarkdownProcessor().process(md_path)[0]
        assert first.source_file == str(md_path)
0981a08…	noreply	145
0981a08…	noreply	146
0981a08…	noreply	147	# --- Plaintext ---
0981a08…	noreply	148
0981a08…	noreply	149
class TestPlaintextProcessor:
    """Tests for ``PlaintextProcessor`` on several plain-text file types."""

    def test_basic_paragraphs(self, tmp_path):
        """Paragraph text is preserved across the produced chunks."""
        txt_path = tmp_path / "test.txt"
        txt_path.write_text("First paragraph.\n\nSecond paragraph.\n\nThird paragraph.")

        processor = PlaintextProcessor()
        assert processor.can_process(txt_path)

        pieces = processor.process(txt_path)
        assert len(pieces) >= 1
        combined = " ".join(piece.text for piece in pieces)
        assert "First paragraph" in combined
        assert "Third paragraph" in combined

    def test_handles_log_files(self, tmp_path):
        """``.log`` files are accepted and produce at least one chunk."""
        log_path = tmp_path / "app.log"
        log_path.write_text("line 1\nline 2\nline 3")

        processor = PlaintextProcessor()
        assert processor.can_process(log_path)
        assert len(processor.process(log_path)) >= 1

    def test_handles_csv(self, tmp_path):
        """``.csv`` files are accepted and produce at least one chunk."""
        csv_path = tmp_path / "data.csv"
        csv_path.write_text("a,b,c\n1,2,3\n4,5,6")

        processor = PlaintextProcessor()
        assert processor.can_process(csv_path)
        assert len(processor.process(csv_path)) >= 1

    def test_empty_file(self, tmp_path):
        """An empty file yields an empty chunk list."""
        empty_path = tmp_path / "empty.txt"
        empty_path.write_text("")

        pieces = PlaintextProcessor().process(empty_path)
        assert pieces == []
0981a08…	noreply	184
0981a08…	noreply	185
class TestChunkByParagraphs:
    """Tests for the ``_chunk_by_paragraphs`` helper."""

    def test_respects_max_chunk_size(self):
        """Ten 500-character paragraphs are split into bounded chunks."""
        # Create text with many paragraphs that exceed max size
        paragraphs = ["A" * 500 for _ in range(10)]
        text = "\n\n".join(paragraphs)
        chunks = _chunk_by_paragraphs(text, "test.txt", max_chunk_size=1200, overlap=100)
        assert len(chunks) > 1
        for chunk in chunks:
            # Each chunk should be reasonably sized (allowing for overlap)
            assert len(chunk.text) < 2000

    def test_overlap(self):
        """The second chunk is longer than the requested overlap."""
        text = "Para A " * 300 + "\n\n" + "Para B " * 300 + "\n\n" + "Para C " * 300
        chunks = _chunk_by_paragraphs(text, "test.txt", max_chunk_size=2500, overlap=200)
        # Each paragraph is about 2100 characters, so two of them are expected
        # not to fit under max_chunk_size=2500. Asserting the split explicitly
        # (instead of guarding with ``if``) stops the check below from being
        # skipped silently when only one chunk comes back.
        assert len(chunks) > 1
        # The second chunk should contain some overlap from the first
        assert len(chunks[1].text) > 200
0981a08…	noreply	203
0981a08…	noreply	204
0981a08…	noreply	205	# --- PDF ---
0981a08…	noreply	206
0981a08…	noreply	207
class TestPdfProcessor:
    """Tests for ``PdfProcessor`` using mocked PDF libraries."""

    def test_can_process(self, tmp_path):
        """``.pdf`` paths are accepted and ``.txt`` paths are rejected."""
        pdf_path = tmp_path / "doc.pdf"
        pdf_path.write_text("")

        processor = PdfProcessor()
        assert processor.can_process(pdf_path)
        assert not processor.can_process(tmp_path / "doc.txt")

    def test_process_pymupdf(self, tmp_path):
        """The pymupdf path returns one chunk per page with its metadata."""
        pdf_path = tmp_path / "doc.pdf"
        pdf_path.write_text("")

        # A fake one-page document that also works as a context manager.
        fake_page = MagicMock()
        fake_page.get_text.return_value = "Page 1 content"
        fake_doc = MagicMock()
        fake_doc.__iter__ = MagicMock(return_value=iter([fake_page]))
        fake_doc.__enter__ = MagicMock(return_value=fake_doc)
        fake_doc.__exit__ = MagicMock(return_value=False)

        fake_module = MagicMock()
        fake_module.open.return_value = fake_doc

        # Substitute the fake module for any real ``pymupdf`` import.
        with patch.dict("sys.modules", {"pymupdf": fake_module}):
            extracted = PdfProcessor()._process_pymupdf(pdf_path)
            assert len(extracted) == 1
            only = extracted[0]
            assert only.text == "Page 1 content"
            assert only.page == 1
            assert only.metadata["extraction_method"] == "pymupdf"

    def test_process_pdfplumber(self, tmp_path):
        """The pdfplumber path returns the page text and its metadata."""
        pdf_path = tmp_path / "doc.pdf"
        pdf_path.write_text("")

        # A fake one-page PDF object that also works as a context manager.
        fake_page = MagicMock()
        fake_page.extract_text.return_value = "Page 1 via pdfplumber"
        fake_pdf = MagicMock()
        fake_pdf.pages = [fake_page]
        fake_pdf.__enter__ = MagicMock(return_value=fake_pdf)
        fake_pdf.__exit__ = MagicMock(return_value=False)

        fake_module = MagicMock()
        fake_module.open.return_value = fake_pdf

        # Substitute the fake module for any real ``pdfplumber`` import.
        with patch.dict("sys.modules", {"pdfplumber": fake_module}):
            extracted = PdfProcessor()._process_pdfplumber(pdf_path)
            assert len(extracted) == 1
            only = extracted[0]
            assert only.text == "Page 1 via pdfplumber"
            assert only.metadata["extraction_method"] == "pdfplumber"

    def test_raises_if_no_library(self, tmp_path):
        """``process`` raises ``ImportError`` when both backends fail to import."""
        pdf_path = tmp_path / "doc.pdf"
        pdf_path.write_text("")
        processor = PdfProcessor()

        # Both extraction helpers are forced to raise ImportError.
        with patch.object(processor, "_process_pymupdf", side_effect=ImportError), \
                patch.object(processor, "_process_pdfplumber", side_effect=ImportError), \
                pytest.raises(ImportError, match="pymupdf or pdfplumber"):
            processor.process(pdf_path)
0981a08…	noreply	268
0981a08…	noreply	269
0981a08…	noreply	270	# --- Ingest ---
0981a08…	noreply	271
0981a08…	noreply	272
class TestIngest:
    """Tests for ``ingest_file`` and ``ingest_directory`` against a mock graph."""

    def test_ingest_file(self, tmp_path):
        """One markdown section registers one source and adds one content item."""
        from video_processor.processors.ingest import ingest_file

        doc_path = tmp_path / "doc.md"
        doc_path.write_text("# Title\nSome content here.")

        graph = MagicMock()
        graph.register_source = MagicMock()
        graph.add_content = MagicMock()

        ingested = ingest_file(doc_path, graph)
        assert ingested == 1

        graph.register_source.assert_called_once()
        positional, _ = graph.register_source.call_args
        registered = positional[0]
        assert registered["source_type"] == "document"
        assert registered["title"] == "doc"
        graph.add_content.assert_called_once()

    def test_ingest_file_unsupported(self, tmp_path):
        """A file with no matching processor raises ``ValueError``."""
        from video_processor.processors.ingest import ingest_file

        unknown_path = tmp_path / "data.xyz"
        unknown_path.write_text("stuff")
        graph = MagicMock()

        with pytest.raises(ValueError, match="No processor"):
            ingest_file(unknown_path, graph)

    def test_ingest_directory(self, tmp_path):
        """Non-recursive ingestion handles supported files and skips the rest."""
        from video_processor.processors.ingest import ingest_directory

        fixtures = {
            "a.md": "# A\nContent A",
            "b.txt": "Content B",
            "c.xyz": "Ignored",
        }
        for file_name, body in fixtures.items():
            (tmp_path / file_name).write_text(body)

        graph = MagicMock()
        outcome = ingest_directory(tmp_path, graph, recursive=False)

        # Should process a.md and b.txt but not c.xyz
        assert len(outcome) == 2
        seen = {Path(entry).name for entry in outcome}
        assert "a.md" in seen
        assert "b.txt" in seen

    def test_ingest_directory_recursive(self, tmp_path):
        """Recursive ingestion also picks up files in a subdirectory."""
        from video_processor.processors.ingest import ingest_directory

        nested_dir = tmp_path / "sub"
        nested_dir.mkdir()
        (tmp_path / "top.md").write_text("# Top\nTop level")
        (nested_dir / "nested.md").write_text("# Nested\nNested content")

        graph = MagicMock()
        outcome = ingest_directory(tmp_path, graph, recursive=True)

        assert len(outcome) == 2
        seen = {Path(entry).name for entry in outcome}
        assert "top.md" in seen
        assert "nested.md" in seen

    def test_ingest_file_custom_source_id(self, tmp_path):
        """An explicit ``source_id`` is passed through to ``register_source``."""
        from video_processor.processors.ingest import ingest_file

        doc_path = tmp_path / "doc.md"
        doc_path.write_text("# Title\nContent")
        graph = MagicMock()

        ingest_file(doc_path, graph, source_id="custom-123")

        positional, _ = graph.register_source.call_args
        assert positional[0]["source_id"] == "custom-123"

    def test_ingest_content_source_format_with_section(self, tmp_path):
        """Each ``add_content`` call names the file and section it came from."""
        from video_processor.processors.ingest import ingest_file

        doc_path = tmp_path / "doc.md"
        doc_path.write_text("# Introduction\nSome text\n\n## Details\nMore text")
        graph = MagicMock()

        ingest_file(doc_path, graph)

        # Check content_source includes section info
        recorded = graph.add_content.call_args_list
        assert len(recorded) == 2
        first_args, _ = recorded[0]
        second_args, _ = recorded[1]
        assert "document:doc.md:section:Introduction" in first_args[1]
        assert "document:doc.md:section:Details" in second_args[1]

PlanOpticon

Keyboard Shortcuts