Navegador

navegador / tests / test_ingestion_wiki.py
Source Blame History 298 lines
b663b12… lmata 1 """Tests for navegador.ingestion.wiki — WikiIngester."""
b663b12… lmata 2
b663b12… lmata 3 import tempfile
b663b12… lmata 4 from pathlib import Path
b663b12… lmata 5 from unittest.mock import MagicMock, patch
b663b12… lmata 6
b663b12… lmata 7 import pytest
b663b12… lmata 8
b663b12… lmata 9 from navegador.graph.schema import NodeLabel
b663b12… lmata 10 from navegador.ingestion.wiki import WikiIngester, _extract_terms
b663b12… lmata 11
b663b12… lmata 12 # ── Unit: _extract_terms ──────────────────────────────────────────────────────
b663b12… lmata 13
class TestExtractTerms:
    """Checks on ``_extract_terms`` using small inline Markdown snippets."""

    def test_extracts_headings(self):
        """Heading text at levels one to three shows up in the result."""
        found = _extract_terms("# Introduction\n## Getting Started\n### Deep Dive\n")
        for heading in ("Introduction", "Getting Started", "Deep Dive"):
            assert heading in found

    def test_extracts_bold_asterisk(self):
        """Text wrapped in ``**`` is picked up."""
        found = _extract_terms("Use **GraphStore** for all persistence.")
        assert "GraphStore" in found

    def test_extracts_bold_underscore(self):
        """Text wrapped in ``__`` is picked up."""
        found = _extract_terms("The __FalkorDB__ module is required.")
        assert "FalkorDB" in found

    def test_deduplicates(self):
        """A term appearing as both heading and bold text is listed once."""
        found = _extract_terms("# GraphStore\nUse **GraphStore** here too.")
        occurrences = found.count("GraphStore")
        assert occurrences == 1

    def test_empty_markdown(self):
        """An empty document produces an empty list."""
        found = _extract_terms("")
        assert found == []

    def test_no_headings_no_bold(self):
        """Plain text without any markup produces an empty list."""
        assert _extract_terms("plain text with no markup") == []

    def test_preserves_order(self):
        """Terms come back in the order they appear in the document."""
        found = _extract_terms("# Alpha\n# Beta\n**Gamma**")
        assert found == ["Alpha", "Beta", "Gamma"]
b663b12… lmata 48
b663b12… lmata 49
b663b12… lmata 50 # ── Unit: ingest_local ────────────────────────────────────────────────────────
b663b12… lmata 51
class TestIngestLocal:
    """``WikiIngester.ingest_local`` run on temporary directories with a mocked store."""

    def _make_store(self):
        """Return a mock store whose ``query`` result carries an empty ``result_set``."""
        empty_result = MagicMock(result_set=[])
        mock_store = MagicMock()
        mock_store.query.return_value = empty_result
        return mock_store

    def test_ingests_markdown_files(self):
        """Two ``.md`` files are reported as two pages."""
        wiki = WikiIngester(self._make_store())
        with tempfile.TemporaryDirectory() as tmpdir:
            root = Path(tmpdir)
            root.joinpath("home.md").write_text("# Welcome\nThis is home.")
            root.joinpath("guide.md").write_text("## Usage\nSome guide.")
            result = wiki.ingest_local(tmpdir)
        assert result["pages"] == 2

    def test_skips_non_markdown(self):
        """A ``.png`` sitting next to a ``.md`` file is not counted as a page."""
        wiki = WikiIngester(self._make_store())
        with tempfile.TemporaryDirectory() as tmpdir:
            root = Path(tmpdir)
            root.joinpath("readme.md").write_text("# Readme")
            root.joinpath("image.png").write_bytes(b"\x89PNG")
            result = wiki.ingest_local(tmpdir)
        assert result["pages"] == 1

    def test_raises_if_dir_missing(self):
        """A path that does not exist raises ``FileNotFoundError``."""
        wiki = WikiIngester(self._make_store())
        with pytest.raises(FileNotFoundError):
            wiki.ingest_local("/nonexistent/path")

    def test_creates_wiki_page_node(self):
        """One file yields one ``create_node`` call with the WikiPage label and local source."""
        mock_store = self._make_store()
        wiki = WikiIngester(mock_store)
        with tempfile.TemporaryDirectory() as tmpdir:
            Path(tmpdir, "arch.md").write_text("# Architecture")
            wiki.ingest_local(tmpdir)
        mock_store.create_node.assert_called_once()
        # call_args[0] is the tuple of positional arguments: (label, properties, ...).
        positional = mock_store.create_node.call_args[0]
        assert positional[0] == NodeLabel.WikiPage
        node_props = positional[1]
        assert node_props["name"] == "arch"
        assert node_props["source"] == "local"

    def test_page_name_normalisation(self):
        """``getting-started.md`` is stored under the name ``getting started``."""
        mock_store = self._make_store()
        wiki = WikiIngester(mock_store)
        with tempfile.TemporaryDirectory() as tmpdir:
            Path(tmpdir, "getting-started.md").write_text("# Hi")
            wiki.ingest_local(tmpdir)
        node_props = mock_store.create_node.call_args[0][1]
        assert node_props["name"] == "getting started"

    def test_creates_documents_edge_when_term_matches(self):
        """When the store query returns a row, at least one link and edge are created."""
        mock_store = MagicMock()
        mock_store.query.return_value = MagicMock(result_set=[["Concept", "GraphStore"]])
        wiki = WikiIngester(mock_store)
        with tempfile.TemporaryDirectory() as tmpdir:
            Path(tmpdir, "page.md").write_text("# GraphStore\nSome text.")
            result = wiki.ingest_local(tmpdir)
        assert result["links"] >= 1
        mock_store.create_edge.assert_called()

    def test_no_links_when_no_term_match(self):
        """With an empty query result no link is counted and no edge is created."""
        mock_store = self._make_store()  # query yields an empty result_set
        wiki = WikiIngester(mock_store)
        with tempfile.TemporaryDirectory() as tmpdir:
            Path(tmpdir, "page.md").write_text("# UnknownTerm\nText.")
            result = wiki.ingest_local(tmpdir)
        assert result["links"] == 0
        mock_store.create_edge.assert_not_called()

    def test_content_capped_at_4000_chars(self):
        """A 10000-character file is stored with at most 4000 characters of content."""
        mock_store = self._make_store()
        wiki = WikiIngester(mock_store)
        with tempfile.TemporaryDirectory() as tmpdir:
            Path(tmpdir, "long.md").write_text("x" * 10000)
            wiki.ingest_local(tmpdir)
        node_props = mock_store.create_node.call_args[0][1]
        assert len(node_props["content"]) <= 4000

    def test_returns_stats_dict(self):
        """An empty directory still returns ``pages`` and ``links`` keys, with zero pages."""
        wiki = WikiIngester(self._make_store())
        with tempfile.TemporaryDirectory() as tmpdir:
            result = wiki.ingest_local(tmpdir)
        assert "pages" in result
        assert "links" in result
        assert result["pages"] == 0

    def test_recursive_glob(self):
        """A Markdown file inside a sub-directory is counted."""
        wiki = WikiIngester(self._make_store())
        with tempfile.TemporaryDirectory() as tmpdir:
            nested_dir = Path(tmpdir, "sub")
            nested_dir.mkdir()
            nested_dir.joinpath("nested.md").write_text("# Nested")
            result = wiki.ingest_local(tmpdir)
        assert result["pages"] == 1
b663b12… lmata 150
b663b12… lmata 151
b663b12… lmata 152 # ── Unit: _try_link edge-type handling ────────────────────────────────────────
b663b12… lmata 153
class TestTryLink:
    """``WikiIngester._try_link`` with different rows returned by the store query."""

    @staticmethod
    def _store_returning(rows):
        """Build a mock store whose ``query`` result has ``rows`` as its ``result_set``."""
        mock_store = MagicMock()
        mock_store.query.return_value = MagicMock(result_set=rows)
        return mock_store

    def test_handles_invalid_label_gracefully(self):
        """A row labelled ``InvalidLabel`` gives a return value of 0."""
        wiki = WikiIngester(self._store_returning([["InvalidLabel", "foo"]]))
        linked = wiki._try_link("page", "foo")
        assert linked == 0

    def test_creates_edge_for_valid_label(self):
        """A ``Concept`` row gives a return value of 1 and exactly one ``create_edge`` call."""
        mock_store = self._store_returning([["Concept", "MyService"]])
        wiki = WikiIngester(mock_store)
        linked = wiki._try_link("wiki page", "MyService")
        assert linked == 1
        mock_store.create_edge.assert_called_once()

    def test_returns_zero_on_unknown_label(self):
        """A row labelled ``UnknownLabel`` gives a return value of 0."""
        wiki = WikiIngester(self._store_returning([["UnknownLabel", "node"]]))
        linked = wiki._try_link("page", "node")
        assert linked == 0

    def test_propagates_store_error(self):
        """An exception raised by ``create_edge`` reaches the caller."""
        mock_store = self._store_returning([["Concept", "node"]])
        mock_store.create_edge.side_effect = Exception("DB error")
        wiki = WikiIngester(mock_store)
        with pytest.raises(Exception, match="DB error"):
            wiki._try_link("page", "node")
b663b12… lmata 184
b663b12… lmata 185
b663b12… lmata 186 # ── GitHub clone (ingest_github) ──────────────────────────────────────────────
b663b12… lmata 187
class TestIngestGithub:
    """``WikiIngester.ingest_github`` with ``subprocess.run`` patched out (no real clone)."""

    def _make_store(self):
        """Return a mock store whose ``query`` result carries an empty ``result_set``."""
        store = MagicMock()
        store.query.return_value = MagicMock(result_set=[])
        return store

    def test_clones_wiki_and_ingests_local(self):
        """A successful clone runs git once against the wiki URL and ingests the page."""
        store = self._make_store()
        ingester = WikiIngester(store)

        with tempfile.TemporaryDirectory() as tmpdir:
            # Pre-populate the directory that the patched mkdtemp will hand out,
            # standing in for the checkout that the patched git never makes.
            (Path(tmpdir) / "home.md").write_text("# Home\nWelcome.")

            mock_result = MagicMock()
            mock_result.returncode = 0

            with patch("subprocess.run", return_value=mock_result) as mock_run, \
                    patch("tempfile.mkdtemp", return_value=tmpdir):
                stats = ingester.ingest_github("owner/repo")

            mock_run.assert_called_once()
            cmd = mock_run.call_args[0][0]
            assert "git" in cmd
            assert "clone" in cmd
            assert "https://github.com/owner/repo.wiki.git" in cmd
            assert stats["pages"] == 1

    def test_returns_empty_on_clone_failure(self):
        """A non-zero git return code produces zeroed stats."""
        store = self._make_store()
        ingester = WikiIngester(store)

        mock_result = MagicMock()
        mock_result.returncode = 1
        mock_result.stderr = "fatal: repository not found"

        with patch("subprocess.run", return_value=mock_result):
            stats = ingester.ingest_github("owner/empty-repo")
        assert stats == {"pages": 0, "links": 0}

    def test_uses_token_in_url(self):
        """A supplied token appears in front of the host in the clone URL argument."""
        store = self._make_store()
        ingester = WikiIngester(store)

        # The clone is made to fail; only the command line handed to git matters here.
        mock_result = MagicMock()
        mock_result.returncode = 1
        mock_result.stderr = "auth error"

        with patch("subprocess.run", return_value=mock_result) as mock_run:
            ingester.ingest_github("owner/repo", token="mytoken")
        cmd = mock_run.call_args[0][0]
        # The test expects the URL at index 3 of the argument list.
        assert "mytoken@github.com" in cmd[3]

    def test_uses_explicit_clone_dir(self):
        """Passing ``clone_dir`` with a successful clone does not raise."""
        store = self._make_store()
        ingester = WikiIngester(store)

        with tempfile.TemporaryDirectory() as tmpdir:
            mock_result = MagicMock()
            mock_result.returncode = 0

            with patch("subprocess.run", return_value=mock_result):
                # No assertion: the test passes as long as this call does not raise.
                ingester.ingest_github("owner/repo", clone_dir=tmpdir)
b663b12… lmata 251
b663b12… lmata 252
b663b12… lmata 253 # ── GitHub API (ingest_github_api) ────────────────────────────────────────────
b663b12… lmata 254
class TestIngestGithubApi:
    """``WikiIngester.ingest_github_api`` with ``urllib.request`` patched out."""

    def _make_store(self):
        """Return a mock store whose ``query`` result carries an empty ``result_set``."""
        empty_result = MagicMock(result_set=[])
        mock_store = MagicMock()
        mock_store.query.return_value = empty_result
        return mock_store

    def test_fetches_readme_and_ingests(self):
        """A JSON response with base64 content results in at least one page."""
        import base64
        import json as _json

        wiki = WikiIngester(self._make_store())

        payload = {
            "content": base64.b64encode(b"# README\nSome content").decode(),
            "html_url": "https://github.com/owner/repo/blob/main/README.md",
        }

        # The response mock works as a context manager and returns the JSON body from read().
        response = MagicMock()
        response.read.return_value = _json.dumps(payload).encode()
        response.__enter__ = lambda ctx: ctx
        response.__exit__ = MagicMock(return_value=False)

        with patch("urllib.request.urlopen", return_value=response):
            result = wiki.ingest_github_api("owner/repo")
        assert result["pages"] >= 1

    def test_skips_missing_files_gracefully(self):
        """When every ``urlopen`` call raises, the stats come back zeroed."""
        wiki = WikiIngester(self._make_store())

        with patch("urllib.request.urlopen", side_effect=Exception("404")):
            result = wiki.ingest_github_api("owner/repo")
        assert result == {"pages": 0, "links": 0}

    def test_uses_auth_header_with_token(self):
        """With a token supplied, the call completes and a ``Request`` is constructed."""
        wiki = WikiIngester(self._make_store())

        with patch("urllib.request.urlopen", side_effect=Exception("skip")):
            with patch("urllib.request.Request") as request_cls:
                wiki.ingest_github_api("owner/repo", token="mytoken")
        # Only checks that a Request was built; the header itself is not inspected.
        assert request_cls.called

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button