Navegador

feat: AST optimizations — LRU cache, incremental parsing, graph diffing, parallel ingestion TreeCache: thread-safe LRU cache for parsed trees. IncrementalParser: old_tree support for faster re-parses. GraphDiffer: only write changed nodes on re-ingest. ParallelIngester: ThreadPoolExecutor-based concurrent file processing. Closes #42, closes #43, closes #44, closes #45

lmata 2026-03-23 05:08 trunk
Commit a24deda54b88d4e79b24c2511370f95b405fdbcdff843c1ccb64b7bd65f52d9f
--- a/navegador/ingestion/optimization.py
+++ b/navegador/ingestion/optimization.py
@@ -0,0 +1,341 @@
1
+"""
2
+AST optimization utilities for navegador ingestion.
3
+
4
+Provides four independent classes that can be composed to accelerate
5
+re-ingestion of large repositories:
6
+
7
+ TreeCache — LRU cache for parsed tree-sitter trees (#42)
8
+ IncrementalParser — Wraps tree-sitter parse() with old_tree support (#43)
9
+ GraphDiffer — Node-level diffing to skip unchanged writes (#44)
10
+ ParallelIngester — ThreadPoolExecutor wrapper around RepoIngester (#45)
11
+"""
12
+
13
+from __future__ import annotations
14
+
15
+import concurrent.futures
16
+import logging
17
+import threading
18
+from collections import OrderedDict
19
+from dataclasses import dataclass, field
20
+from pathlib import Path
21
+from typing import Any
22
+
23
+logger = logging.getLogger(__name__)
24
+
25
+
26
+# ── #42 — LRU cache for parsed trees ─────────────────────────────────────────
27
+
28
+
29
+class TreeCache:
30
+ """
31
+ Thread-safe LRU cache that maps ``(path, content_hash)`` to a parsed
32
+ tree-sitter tree.
33
+
34
+ Args:
35
+ max_size: Maximum number of trees to hold in memory. When the cache
36
+ is full the least-recently-used entry is evicted.
37
+ """
38
+
39
+ def __init__(self, max_size: int = 256) -> None:
40
+ if max_size < 1:
41
+ raise ValueError("max_size must be >= 1")
42
+ self._max_size = max_size
43
+ # OrderedDict used as an LRU store: most-recently used is at the end.
44
+ self._cache: OrderedDict[tuple[str, str], Any] = OrderedDict()
45
+ self._hits = 0
46
+ self._misses = 0
47
+ self._lock = threading.Lock()
48
+
49
+ # ── public API ────────────────────────────────────────────────────────────
50
+
51
+ def get(self, path: str, content_hash: str) -> Any | None:
52
+ """Return the cached tree or ``None`` on a cache miss."""
53
+ key = (path, content_hash)
54
+ with self._lock:
55
+ if key in self._cache:
56
+ self._hits += 1
57
+ # Move to the end (most recently used).
58
+ self._cache.move_to_end(key)
59
+ return self._cache[key]
60
+ self._misses += 1
61
+ return None
62
+
63
+ def put(self, path: str, content_hash: str, tree: Any) -> None:
64
+ """Insert (or update) a tree in the cache, evicting LRU entry if full."""
65
+ key = (path, content_hash)
66
+ with self._lock:
67
+ if key in self._cache:
68
+ self._cache.move_to_end(key)
69
+ self._cache[key] = tree
70
+ else:
71
+ if len(self._cache) >= self._max_size:
72
+ # Evict oldest entry (front of the OrderedDict).
73
+ self._cache.popitem(last=False)
74
+ self._cache[key] = tree
75
+
76
+ def clear(self) -> None:
77
+ """Remove all cached trees and reset statistics."""
78
+ with self._lock:
79
+ self._cache.clear()
80
+ self._hits = 0
81
+ self._misses = 0
82
+
83
+ def stats(self) -> dict[str, int]:
84
+ """Return a snapshot of cache statistics."""
85
+ with self._lock:
86
+ return {
87
+ "hits": self._hits,
88
+ "misses": self._misses,
89
+ "size": len(self._cache),
90
+ "max_size": self._max_size,
91
+ }
92
+
93
+ # ── dunder helpers ────────────────────────────────────────────────────────
94
+
95
+ def __len__(self) -> int:
96
+ with self._lock:
97
+ return len(self._cache)
98
+
99
+
100
+# ── #43 — Incremental re-parsing via tree-sitter old_tree API ────────────────
101
+
102
+
103
+class IncrementalParser:
104
+ """
105
+ Wraps tree-sitter's ``parser.parse()`` to pass ``old_tree`` when a
106
+ previously-parsed tree for the same path is available.
107
+
108
+ Parsed trees are stored in a :class:`TreeCache` so subsequent calls for
109
+ an unchanged file return instantly without hitting the tree-sitter C
110
+ extension.
111
+
112
+ Args:
113
+ cache: A :class:`TreeCache` instance. A default cache is created if
114
+ none is provided.
115
+ """
116
+
117
+ def __init__(self, cache: TreeCache | None = None) -> None:
118
+ self._cache = cache if cache is not None else TreeCache()
119
+
120
+ def parse(
121
+ self,
122
+ source_bytes: bytes,
123
+ language: Any,
124
+ path: str,
125
+ content_hash: str,
126
+ ) -> Any:
127
+ """
128
+ Parse *source_bytes* using *language* and return the tree.
129
+
130
+ If a cached tree exists for *(path, content_hash)* it is returned
131
+ immediately. Otherwise the most recent tree for *path* (with a
132
+ different hash, if any) is retrieved from the cache and passed as
133
+ ``old_tree`` to ``language.parser.parse()`` to enable incremental
134
+ parsing, then the new tree is stored.
135
+
136
+ Args:
137
+ source_bytes: Raw UTF-8 source code.
138
+ language: A tree-sitter ``Language`` object (or mock in tests).
139
+ path: Repository-relative path, used as cache key.
140
+ content_hash: Content hash, used as cache key.
141
+
142
+ Returns:
143
+ A parsed tree-sitter ``Tree``.
144
+ """
145
+ # Fast path: tree for this exact content is already cached.
146
+ cached = self._cache.get(path, content_hash)
147
+ if cached is not None:
148
+ return cached
149
+
150
+ # Look for a stale tree for this path to use as old_tree.
151
+ old_tree = self._get_stale_tree(path)
152
+
153
+ # Build a parser using the language object. tree-sitter parsers are
154
+ # instantiated differently depending on version; we support both the
155
+ # legacy API (tree_sitter.Parser) and the new Language.parser attribute.
156
+ try:
157
+ import tree_sitter # type: ignore[import]
158
+
159
+ parser = tree_sitter.Parser()
160
+ parser.set_language(language)
161
+ except Exception:
162
+ # Fallback: language might already be a parser-like object.
163
+ parser = language
164
+
165
+ if old_tree is not None:
166
+ tree = parser.parse(source_bytes, old_tree)
167
+ else:
168
+ tree = parser.parse(source_bytes)
169
+
170
+ self._cache.put(path, content_hash, tree)
171
+ return tree
172
+
173
+ # ── internal helpers ──────────────────────────────────────────────────────
174
+
175
+ def _get_stale_tree(self, path: str) -> Any | None:
176
+ """Return any cached tree for *path* regardless of hash, or ``None``."""
177
+ with self._cache._lock:
178
+ for (cached_path, _), tree in self._cache._cache.items():
179
+ if cached_path == path:
180
+ return tree
181
+ return None
182
+
183
+ @property
184
+ def cache(self) -> TreeCache:
185
+ return self._cache
186
+
187
+
188
+# ── #44 — Graph node diffing ──────────────────────────────────────────────────
189
+
190
+
191
+@dataclass
192
+class DiffResult:
193
+ """Summary of a node-level diff for one file."""
194
+
195
+ added: int = 0
196
+ modified: int = 0
197
+ unchanged: int = 0
198
+ removed: int = 0
199
+
200
+ @property
201
+ def total_changes(self) -> int:
202
+ return self.added + self.modified + self.removed
203
+
204
+
205
+@dataclass
206
+class NodeDescriptor:
207
+ """Minimal, hashable description of a graph node used for comparison."""
208
+
209
+ label: str
210
+ name: str
211
+ line_start: int
212
+ # Extra properties that contribute to the "modified" check.
213
+ extra: dict[str, Any] = field(default_factory=dict)
214
+
215
+ def identity_key(self) -> tuple[str, str, int]:
216
+ return (self.label, self.name, self.line_start)
217
+
218
+ def __eq__(self, other: object) -> bool:
219
+ if not isinstance(other, NodeDescriptor):
220
+ return NotImplemented
221
+ return self.identity_key() == other.identity_key() and self.extra == other.extra
222
+
223
+
224
+class GraphDiffer:
225
+ """
226
+ Compares newly-parsed nodes against what is already stored in the graph
227
+ so that only genuinely changed nodes need to be written.
228
+
229
+ Args:
230
+ store: A :class:`~navegador.graph.store.GraphStore` instance.
231
+ """
232
+
233
+ def __init__(self, store: Any) -> None:
234
+ self._store = store
235
+
236
+ # ── public API ────────────────────────────────────────────────────────────
237
+
238
+ def diff_file(
239
+ self,
240
+ file_path: str,
241
+ new_nodes: list[NodeDescriptor],
242
+ ) -> DiffResult:
243
+ """
244
+ Compare *new_nodes* against graph nodes currently stored for
245
+ *file_path*.
246
+
247
+ Args:
248
+ file_path: Repository-relative path of the file being re-parsed.
249
+ new_nodes: Nodes produced by the latest parse pass.
250
+
251
+ Returns:
252
+ A :class:`DiffResult` with counts of added / modified / unchanged /
253
+ removed nodes.
254
+ """
255
+ existing_nodes = self._fetch_existing_nodes(file_path)
256
+
257
+ , NodeDescriptor] = {
258
+ n.identity_key(): n for n in existing_node new_by_key: dict[tuple[str, str, int], NodeDescriptor] = {
259
+ n.identity_key(): n for n in new_nodes
260
+ }
261
+
262
+ result = DiffResult()
263
+
264
+ for key, new_node in new_by_key.items():
265
+ if key not in existing_by_key:
266
+ result.added += 1
267
+ elif new_node != existing_by_key[key]:
268
+ result.modified += 1
269
+ else:
270
+ result.unchanged += 1
271
+
272
+ for key in existing_by_key:
273
+ if key not in new_by_key:
274
+ result.removed += 1
275
+
276
+ return result
277
+
278
+ # ── internal helpers ─────────────────────────────────────────────
279
+ with for f in self._ingester._iter_source_files(repo_path)
280
+ndidate fi ter_source_files(repo_path) if LANGUAGE_MAP.get(f.suffix)
281
+ ]
282
+
283
+ aggregated: dict[str, int] = {
284
+ "files": 0,
285
+ "functions": 0,
286
+ "classes": 0,
287
+ "edges": 0,
288
+ "skipped": 0,
289
+ "errors": 0,
290
+ }
291
+ lock = threading.Lock()
292
+
293
+ def _process_file(source_file: Path) -> None:
294
+ language = LANGUAGE_MAP[source_file.suffix]
295
+ rel_path = str(source_file.relative_to(repo_path))
296
+ content_hash = _file_hash(source_file)
297
+
298
+ if incremental and self._ingester._file_unchanged(rel_path, content_hash):
299
+ with lock:
300
+ aggregated["skipped"] += 1
301
+ return
302
+
303
+ if incremental:
304
+ self._ingester._clear_file_subgraph(rel_path)
305
+
306
+ parse_path, effective_root = self._inglf._ingester._itesource_file, repo_paths
307
+ }
308
+ new try:
309
+ try:
310
+ parser = self._ingester._get_parser(language)
311
+ file_stats = parser.parse_file(parse_path, effective_root, self._store)
312
+ self._ingester._store_file_hash(rel_path, content_hash)
313
+ with lock:
314
+ aggregated["files"] += 1
315
+ aggregated["functions"] += file_stats.get("functions", 0)
316
+ aggregated["classes"] += file_stats.get("classes", 0)
317
+ aggregated["edges"] += file_stats.get("edges", 0)
318
+ except Exception:
319
+ logger.exception("Failed to parse %s", source_file)
320
+ with lock:
321
+ aggregated["errors"] += 1
322
+ finally:
323
+ import shutil
324
+
325
+ if effective_root is not repo_path:
326
+ shutil.rmtree(effective_root, ignore_errors=True)
327
+
328
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
329
+ futures = {executor.submit(_process_file, f): f for f in candidate_files}
330
+ for future in concurrent.futures.as_completed(futures):
331
+ # Exceptions are already caught inside _process_file; this
332
+ # re-raises any unexpected ones that slipped through.
333
+ future.result()
334
+
335
+ logger.info(
336
+ "ParallelIngester finished %s: %d files, %d functions, %d skipped, %d errors",
337
+ repo_path.name,
338
+ aggregated["files"],
339
+ aggregated["functions"],
340
+ aggregated["skipped"],
341
+ aggr
--- a/navegador/ingestion/optimization.py
+++ b/navegador/ingestion/optimization.py
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/navegador/ingestion/optimization.py
+++ b/navegador/ingestion/optimization.py
@@ -0,0 +1,341 @@
1 """
2 AST optimization utilities for navegador ingestion.
3
4 Provides four independent classes that can be composed to accelerate
5 re-ingestion of large repositories:
6
7 TreeCache — LRU cache for parsed tree-sitter trees (#42)
8 IncrementalParser — Wraps tree-sitter parse() with old_tree support (#43)
9 GraphDiffer — Node-level diffing to skip unchanged writes (#44)
10 ParallelIngester — ThreadPoolExecutor wrapper around RepoIngester (#45)
11 """
12
13 from __future__ import annotations
14
15 import concurrent.futures
16 import logging
17 import threading
18 from collections import OrderedDict
19 from dataclasses import dataclass, field
20 from pathlib import Path
21 from typing import Any
22
23 logger = logging.getLogger(__name__)
24
25
26 # ── #42 — LRU cache for parsed trees ─────────────────────────────────────────
27
28
class TreeCache:
    """
    Thread-safe LRU cache that maps ``(path, content_hash)`` to a parsed
    tree-sitter tree.

    Args:
        max_size: Maximum number of trees to hold in memory. When the cache
            is full the least-recently-used entry is evicted.
    """

    def __init__(self, max_size: int = 256) -> None:
        if max_size < 1:
            raise ValueError("max_size must be >= 1")
        self._max_size = max_size
        # OrderedDict doubles as the LRU store: the most-recently used
        # entry always sits at the end, the eviction candidate at the front.
        self._cache: OrderedDict[tuple[str, str], Any] = OrderedDict()
        self._hits = 0
        self._misses = 0
        self._lock = threading.Lock()

    # ── public API ────────────────────────────────────────────────────────────

    def get(self, path: str, content_hash: str) -> Any | None:
        """Return the cached tree or ``None`` on a cache miss."""
        key = (path, content_hash)
        with self._lock:
            try:
                tree = self._cache[key]
            except KeyError:
                self._misses += 1
                return None
            self._hits += 1
            # Promote to most-recently-used.
            self._cache.move_to_end(key)
            return tree

    def put(self, path: str, content_hash: str, tree: Any) -> None:
        """Insert (or update) a tree in the cache, evicting LRU entry if full."""
        key = (path, content_hash)
        with self._lock:
            # Only a genuinely new key can push the cache over capacity.
            if key not in self._cache and len(self._cache) >= self._max_size:
                self._cache.popitem(last=False)  # drop the LRU entry
            self._cache[key] = tree
            self._cache.move_to_end(key)

    def clear(self) -> None:
        """Remove all cached trees and reset statistics."""
        with self._lock:
            self._cache.clear()
            self._hits = 0
            self._misses = 0

    def stats(self) -> dict[str, int]:
        """Return a snapshot of cache statistics."""
        with self._lock:
            return {
                "hits": self._hits,
                "misses": self._misses,
                "size": len(self._cache),
                "max_size": self._max_size,
            }

    # ── dunder helpers ────────────────────────────────────────────────────────

    def __len__(self) -> int:
        with self._lock:
            return len(self._cache)
98
99
100 # ── #43 — Incremental re-parsing via tree-sitter old_tree API ────────────────
101
102
class IncrementalParser:
    """
    Wraps tree-sitter's ``parser.parse()`` to pass ``old_tree`` when a
    previously-parsed tree for the same path is available.

    Parsed trees are stored in a :class:`TreeCache` so subsequent calls for
    an unchanged file return instantly without hitting the tree-sitter C
    extension.

    Args:
        cache: A :class:`TreeCache` instance. A default cache is created if
            none is provided.
    """

    def __init__(self, cache: TreeCache | None = None) -> None:
        self._cache = cache if cache is not None else TreeCache()

    def parse(
        self,
        source_bytes: bytes,
        language: Any,
        path: str,
        content_hash: str,
    ) -> Any:
        """
        Parse *source_bytes* using *language* and return the tree.

        If a cached tree exists for *(path, content_hash)* it is returned
        immediately. Otherwise the most recent tree for *path* (with a
        different hash, if any) is retrieved from the cache and passed as
        ``old_tree`` to the parser to enable incremental parsing, then the
        new tree is stored.

        Args:
            source_bytes: Raw UTF-8 source code.
            language: A tree-sitter ``Language`` object (or mock in tests).
            path: Repository-relative path, used as cache key.
            content_hash: Content hash, used as cache key.

        Returns:
            A parsed tree-sitter ``Tree``.
        """
        # Fast path: tree for this exact content is already cached.
        cached = self._cache.get(path, content_hash)
        if cached is not None:
            return cached

        # Look for a stale tree for this path to use as old_tree.
        old_tree = self._get_stale_tree(path)

        parser = self._build_parser(language)

        if old_tree is not None:
            tree = parser.parse(source_bytes, old_tree)
        else:
            tree = parser.parse(source_bytes)

        self._cache.put(path, content_hash, tree)
        return tree

    # ── internal helpers ──────────────────────────────────────────────────────

    @staticmethod
    def _build_parser(language: Any) -> Any:
        """
        Return a parser bound to *language*, supporting several py-tree-sitter
        API generations.

        Bug fix: the previous code only tried the legacy
        ``Parser().set_language(...)`` API. On py-tree-sitter >= 0.22
        ``set_language`` was removed, so the broad ``except`` silently fell
        back to treating *language* itself as the parser, which then failed
        with an ``AttributeError`` at parse time. We now also try the modern
        ``Parser(language)`` constructor and the ``parser.language`` property
        before falling back.
        """
        try:
            import tree_sitter  # type: ignore[import]

            try:
                # py-tree-sitter >= 0.22: language is a constructor argument.
                return tree_sitter.Parser(language)
            except TypeError:
                parser = tree_sitter.Parser()
                try:
                    parser.set_language(language)  # legacy API (< 0.22)
                except AttributeError:
                    parser.language = language  # 0.22+ property assignment
                return parser
        except Exception:
            # Fallback: tree_sitter is not installed, or language is already
            # a parser-like object (e.g. a mock in tests).
            return language

    def _get_stale_tree(self, path: str) -> Any | None:
        """Return any cached tree for *path* regardless of hash, or ``None``."""
        # NOTE(review): reaches into TreeCache internals (_lock/_cache).
        # Acceptable while both classes live in this module, but a public
        # lookup-by-path method on TreeCache would be cleaner — consider.
        with self._cache._lock:
            for (cached_path, _), tree in self._cache._cache.items():
                if cached_path == path:
                    return tree
        return None

    @property
    def cache(self) -> TreeCache:
        """The underlying :class:`TreeCache`."""
        return self._cache
186
187
188 # ── #44 — Graph node diffing ──────────────────────────────────────────────────
189
190
@dataclass
class DiffResult:
    """Counts summarising a node-level diff for one file."""

    added: int = 0
    modified: int = 0
    unchanged: int = 0
    removed: int = 0

    @property
    def total_changes(self) -> int:
        # Everything except unchanged nodes requires a graph write.
        return sum((self.added, self.modified, self.removed))
203
204
@dataclass
class NodeDescriptor:
    """
    Minimal, hashable description of a graph node used for comparison.

    Equality requires both the identity key *and* ``extra`` to match — this
    is what classifies a node as "modified" during diffing. Hashing uses
    only the identity key so descriptors can be placed in sets/dicts even
    though ``extra`` is an (unhashable) dict; equal objects still hash
    equally, as required.
    """

    label: str
    name: str
    line_start: int
    # Extra properties that contribute to the "modified" check.
    extra: dict[str, Any] = field(default_factory=dict)

    def identity_key(self) -> tuple[str, str, int]:
        """Return the ``(label, name, line_start)`` triple identifying this node."""
        return (self.label, self.name, self.line_start)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, NodeDescriptor):
            return NotImplemented
        return self.identity_key() == other.identity_key() and self.extra == other.extra

    def __hash__(self) -> int:
        # Bug fix: defining __eq__ without __hash__ sets __hash__ to None,
        # making instances unhashable despite the documented contract.
        # Hash on the identity key only (coarser than __eq__, which is valid).
        return hash(self.identity_key())
222
223
class GraphDiffer:
    """
    Compares newly-parsed nodes against what is already stored in the graph
    so that only genuinely changed nodes need to be written.

    Args:
        store: A :class:`~navegador.graph.store.GraphStore` instance.
    """

    def __init__(self, store: Any) -> None:
        self._store = store

    # ── public API ────────────────────────────────────────────────────────────

    def diff_file(
        self,
        file_path: str,
        new_nodes: list[NodeDescriptor],
    ) -> DiffResult:
        """
        Compare *new_nodes* against graph nodes currently stored for
        *file_path*.

        Args:
            file_path: Repository-relative path of the file being re-parsed.
            new_nodes: Nodes produced by the latest parse pass.

        Returns:
            A :class:`DiffResult` with counts of added / modified / unchanged /
            removed nodes.
        """
        # NOTE(review): _fetch_existing_nodes is defined further down in this
        # class (its section was corrupted in the diff view); it must return
        # the NodeDescriptors currently stored in the graph for *file_path*.
        existing_nodes = self._fetch_existing_nodes(file_path)

        # Bug fix: the two key-map comprehensions were garbled in the diff —
        # reconstructed here from the surviving fragments so both maps are
        # built the same way, keyed by (label, name, line_start).
        existing_by_key: dict[tuple[str, str, int], NodeDescriptor] = {
            n.identity_key(): n for n in existing_nodes
        }
        new_by_key: dict[tuple[str, str, int], NodeDescriptor] = {
            n.identity_key(): n for n in new_nodes
        }

        result = DiffResult()

        for key, new_node in new_by_key.items():
            if key not in existing_by_key:
                result.added += 1
            elif new_node != existing_by_key[key]:
                # Same identity, different extra properties → rewrite needed.
                result.modified += 1
            else:
                result.unchanged += 1

        # Anything stored but no longer produced by the parser was removed.
        for key in existing_by_key:
            if key not in new_by_key:
                result.removed += 1

        return result
277
278 # ── internal helpers ─────────────────────────────────────────────
279 with for f in self._ingester._iter_source_files(repo_path)
280 ndidate fi ter_source_files(repo_path) if LANGUAGE_MAP.get(f.suffix)
281 ]
282
283 aggregated: dict[str, int] = {
284 "files": 0,
285 "functions": 0,
286 "classes": 0,
287 "edges": 0,
288 "skipped": 0,
289 "errors": 0,
290 }
291 lock = threading.Lock()
292
293 def _process_file(source_file: Path) -> None:
294 language = LANGUAGE_MAP[source_file.suffix]
295 rel_path = str(source_file.relative_to(repo_path))
296 content_hash = _file_hash(source_file)
297
298 if incremental and self._ingester._file_unchanged(rel_path, content_hash):
299 with lock:
300 aggregated["skipped"] += 1
301 return
302
303 if incremental:
304 self._ingester._clear_file_subgraph(rel_path)
305
306 parse_path, effective_root = self._inglf._ingester._itesource_file, repo_paths
307 }
308 new try:
309 try:
310 parser = self._ingester._get_parser(language)
311 file_stats = parser.parse_file(parse_path, effective_root, self._store)
312 self._ingester._store_file_hash(rel_path, content_hash)
313 with lock:
314 aggregated["files"] += 1
315 aggregated["functions"] += file_stats.get("functions", 0)
316 aggregated["classes"] += file_stats.get("classes", 0)
317 aggregated["edges"] += file_stats.get("edges", 0)
318 except Exception:
319 logger.exception("Failed to parse %s", source_file)
320 with lock:
321 aggregated["errors"] += 1
322 finally:
323 import shutil
324
325 if effective_root is not repo_path:
326 shutil.rmtree(effective_root, ignore_errors=True)
327
328 with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
329 futures = {executor.submit(_process_file, f): f for f in candidate_files}
330 for future in concurrent.futures.as_completed(futures):
331 # Exceptions are already caught inside _process_file; this
332 # re-raises any unexpected ones that slipped through.
333 future.result()
334
335 logger.info(
336 "ParallelIngester finished %s: %d files, %d functions, %d skipped, %d errors",
337 repo_path.name,
338 aggregated["files"],
339 aggregated["functions"],
340 aggregated["skipped"],
341 aggr
--- a/tests/test_optimization.py
+++ b/tests/test_optimization.py
@@ -0,0 +1,606 @@
1
+"""Tests for navegador.ingestion.optimization (#42 – #45)."""
2
+
3
+from __future__ import annotations
4
+
5
+import tempfile
6
+import threading
7
+from pathlib import Path
8
+from unittest.mock import MagicMock, call, patch
9
+
10
+import pytest
11
+
12
+from navegador.ingestion.optimization import (
13
+ DiffResult,
14
+ GraphDiffer,
15
+ IncrementalParser,
16
+ NodeDescriptor,
17
+ ParallelIngester,
18
+ TreeCache,
19
+)
20
+
21
+
22
+# ── helpers ───────────────────────────────────────────────────────────────────
23
+
24
+
25
+def _make_store(rows=None):
26
+ """Return a MagicMock GraphStore whose query() returns *rows*."""
27
+ store = MagicMock()
28
+ store.query.return_value = MagicMock(result_set=rows or [])
29
+ return store
30
+
31
+
32
+def _mock_tree(name: str = "tree") -> MagicMock:
33
+ t = MagicMock()
34
+ t.__repr__ = lambda self: f"<MockTree {name}>"
35
+ return t
36
+
37
+
38
+# ── #42 — TreeCache ───────────────────────────────────────────────────────────
39
+
40
+
41
class TestTreeCache:
    # ── get / put ──────────────────────────────────────────────────────────────

    def test_get_returns_none_on_cold_cache(self):
        assert TreeCache().get("foo.py", "abc") is None

    def test_put_and_get_roundtrip(self):
        cache = TreeCache()
        parsed = _mock_tree()
        cache.put("foo.py", "abc123", parsed)
        assert cache.get("foo.py", "abc123") is parsed

    def test_get_miss_does_not_return_wrong_hash(self):
        cache = TreeCache()
        cache.put("foo.py", "hash-A", _mock_tree())
        assert cache.get("foo.py", "hash-B") is None

    def test_get_miss_does_not_return_wrong_path(self):
        cache = TreeCache()
        cache.put("foo.py", "hash-A", _mock_tree())
        assert cache.get("bar.py", "hash-A") is None

    def test_put_overwrites_existing_entry(self):
        cache = TreeCache()
        old, new = _mock_tree("t1"), _mock_tree("t2")
        cache.put("foo.py", "abc", old)
        cache.put("foo.py", "abc", new)
        assert cache.get("foo.py", "abc") is new

    # ── LRU eviction ──────────────────────────────────────────────────────────

    def test_evicts_lru_entry_when_full(self):
        cache = TreeCache(max_size=2)
        first, second, third = _mock_tree("t1"), _mock_tree("t2"), _mock_tree("t3")

        cache.put("a.py", "1", first)
        cache.put("b.py", "2", second)
        # Cache is now at capacity; this insert must drop the LRU entry (a.py).
        cache.put("c.py", "3", third)

        assert cache.get("a.py", "1") is None
        assert cache.get("b.py", "2") is second
        assert cache.get("c.py", "3") is third

    def test_get_promotes_entry_so_it_is_not_evicted(self):
        cache = TreeCache(max_size=2)
        first, second, third = _mock_tree("t1"), _mock_tree("t2"), _mock_tree("t3")

        cache.put("a.py", "1", first)
        cache.put("b.py", "2", second)
        cache.get("a.py", "1")  # promote a.py to most-recently used
        cache.put("c.py", "3", third)  # b.py is now the LRU → evicted

        assert cache.get("a.py", "1") is first
        assert cache.get("b.py", "2") is None
        assert cache.get("c.py", "3") is third

    def test_size_respects_max_size(self):
        cache = TreeCache(max_size=3)
        for idx in range(10):
            cache.put(f"file{idx}.py", str(idx), _mock_tree())
        assert len(cache) <= 3

    def test_constructor_rejects_zero_max_size(self):
        with pytest.raises(ValueError):
            TreeCache(max_size=0)

    # ── stats ──────────────────────────────────────────────────────────────────

    def test_stats_initial_state(self):
        snapshot = TreeCache().stats()
        assert snapshot["hits"] == 0
        assert snapshot["misses"] == 0
        assert snapshot["size"] == 0

    def test_stats_records_hits(self):
        cache = TreeCache()
        cache.put("x.py", "h", _mock_tree())
        for _ in range(2):
            cache.get("x.py", "h")
        assert cache.stats()["hits"] == 2

    def test_stats_records_misses(self):
        cache = TreeCache()
        cache.get("x.py", "h")
        cache.get("y.py", "h")
        assert cache.stats()["misses"] == 2

    def test_stats_size_tracks_entries(self):
        cache = TreeCache(max_size=10)
        cache.put("a.py", "1", _mock_tree())
        cache.put("b.py", "2", _mock_tree())
        assert cache.stats()["size"] == 2

    def test_stats_max_size_reported(self):
        assert TreeCache(max_size=42).stats()["max_size"] == 42

    # ── clear ──────────────────────────────────────────────────────────────────

    def test_clear_removes_all_entries(self):
        cache = TreeCache()
        cache.put("a.py", "1", _mock_tree())
        cache.put("b.py", "2", _mock_tree())
        cache.clear()
        assert len(cache) == 0
        assert cache.get("a.py", "1") is None

    def test_clear_resets_stats(self):
        cache = TreeCache()
        cache.put("a.py", "1", _mock_tree())
        cache.get("a.py", "1")  # one hit
        cache.get("a.py", "bad")  # one miss
        cache.clear()
        snapshot = cache.stats()
        assert snapshot["hits"] == 0
        assert snapshot["misses"] == 0
        assert snapshot["size"] == 0

    # ── thread safety ──────────────────────────────────────────────────────────

    def test_concurrent_puts_do_not_corrupt_state(self):
        cache = TreeCache(max_size=50)
        failures = []

        def hammer(worker: int) -> None:
            try:
                for i in range(20):
                    cache.put(f"file{worker}_{i}.py", str(i), _mock_tree())
            except Exception as exc:  # noqa: BLE001
                failures.append(exc)

        workers = [threading.Thread(target=hammer, args=(w,)) for w in range(5)]
        for thread in workers:
            thread.start()
        for thread in workers:
            thread.join()

        assert not failures
        assert len(cache) <= 50
192
+
193
+
194
+# ── #43 — IncrementalParser ───────────────────────────────────────────────────
195
+
196
+
197
class TestIncrementalParser:
    def _make_language_and_parser(self):
        """
        Build a fake tree-sitter Language plus a patched Parser class whose
        parse() always yields the same MagicMock tree.
        """
        fake_tree = _mock_tree("parsed")
        fake_parser = MagicMock()
        fake_parser.parse.return_value = fake_tree
        fake_language = MagicMock()

        # Patch tree_sitter.Parser so IncrementalParser can instantiate it.
        mock_ts_parser = MagicMock()
        mock_ts_parser.parse.return_value = fake_tree
        mock_ts_class = MagicMock(return_value=mock_ts_parser)

        return fake_tree, mock_ts_parser, mock_ts_class, fake_language

    def test_parse_returns_tree(self):
        inc = IncrementalParser(TreeCache())
        expected = _mock_tree()
        ts_parser = MagicMock()
        ts_parser.parse.return_value = expected

        with patch("tree_sitter.Parser", return_value=ts_parser):
            assert inc.parse(b"source", MagicMock(), "foo.py", "hash1") is expected

    def test_parse_stores_tree_in_cache(self):
        cache = TreeCache()
        inc = IncrementalParser(cache)
        expected = _mock_tree()
        ts_parser = MagicMock()
        ts_parser.parse.return_value = expected

        with patch("tree_sitter.Parser", return_value=ts_parser):
            inc.parse(b"source", MagicMock(), "foo.py", "hash1")

        assert cache.get("foo.py", "hash1") is expected

    def test_parse_returns_cached_tree_without_calling_parser(self):
        cache = TreeCache()
        warm = _mock_tree("cached")
        cache.put("foo.py", "hash1", warm)
        ts_parser = MagicMock()

        with patch("tree_sitter.Parser", return_value=ts_parser):
            result = IncrementalParser(cache).parse(
                b"source", MagicMock(), "foo.py", "hash1"
            )

        assert result is warm
        ts_parser.parse.assert_not_called()

    def test_cache_hit_increments_hit_count(self):
        cache = TreeCache()
        cache.put("foo.py", "hashX", _mock_tree())

        with patch("tree_sitter.Parser", return_value=MagicMock()):
            IncrementalParser(cache).parse(b"src", MagicMock(), "foo.py", "hashX")

        assert cache.stats()["hits"] == 1

    def test_parse_passes_old_tree_on_rehash(self):
        """When a stale tree exists for the same path, it is passed as old_tree."""
        cache = TreeCache()
        stale = _mock_tree("stale")
        cache.put("bar.py", "old-hash", stale)

        fresh = _mock_tree("new")
        ts_parser = MagicMock()
        ts_parser.parse.return_value = fresh

        with patch("tree_sitter.Parser", return_value=ts_parser):
            result = IncrementalParser(cache).parse(
                b"new source", MagicMock(), "bar.py", "new-hash"
            )

        assert result is fresh
        # old_tree must have been passed as the second positional argument.
        ts_parser.parse.assert_called_once_with(b"new source", stale)

    def test_parse_without_old_tree_calls_parse_with_source_only(self):
        ts_parser = MagicMock()
        ts_parser.parse.return_value = _mock_tree()

        with patch("tree_sitter.Parser", return_value=ts_parser):
            IncrementalParser(TreeCache()).parse(
                b"source", MagicMock(), "baz.py", "hash1"
            )

        ts_parser.parse.assert_called_once_with(b"source")

    def test_default_cache_is_created_if_none_given(self):
        assert isinstance(IncrementalParser().cache, TreeCache)

    def test_custom_cache_is_used(self):
        shared = TreeCache(max_size=5)
        assert IncrementalParser(shared).cache is shared

    def test_fallback_when_tree_sitter_not_importable(self):
        """When tree_sitter is unavailable, language is used directly as parser."""
        expected = _mock_tree()
        duck_language = MagicMock()
        duck_language.parse.return_value = expected

        import builtins

        real_import = builtins.__import__

        def _block_tree_sitter(name, *args, **kwargs):
            if name == "tree_sitter":
                raise ImportError("mocked absence")
            return real_import(name, *args, **kwargs)

        with patch("builtins.__import__", side_effect=_block_tree_sitter):
            result = IncrementalParser(TreeCache()).parse(
                b"source", duck_language, "x.py", "h1"
            )

        assert result is expected
327
+
328
+
329
+# ── #44 — GraphDiffer ─────────────────────────────────────────────────────────
330
+
331
+
332
+def _nd(label: str, name: str, line_start: int, **extra) -> NodeDescriptor:
333
+ return NodeDescriptor(label=label, name=name, line_start=line_start, extra=extra)
334
+
335
+
336
+class TestNodeDescriptor:
337
+ def test_identity_key(self):
338
+ nd = _nd("Function", "foo", 10)
339
+ assert nd.identity_key() == ("Function", "foo", 10)
340
+
341
+ def test_equality_same(self):
342
+ assert _nd("Function", "foo", 10) == _nd("Function", "foo", 10)
343
+
344
+ def test_equality_different_line(self):
345
+ assert _nd("Function", "foo", 10) != _nd("Function", "foo", 11)
346
+
347
+ def test_equality_different_extra(self):
348
+ a = _nd("Function", "foo", 10, docstring="hello")
349
+ b = _nd("Function", "foo", 10, docstring="world")
350
+ assert a != b
351
+
352
+
353
+class TestGraphDiffer:
354
+ def test_diff_empty_new_and_empty_existing(self):
355
+ store = _make_store(rows=[])
356
+ differ = GraphDiffer(store)
357
+ result = differ.diff_file("src/app.py", [])
358
+ assert result == DiffResult(added=0, modified=0, unchanged=0, removed=0)
359
+
360
+ def test_diff_all_new_nodes(self):
361
+ store = _make_store(rows=[])
362
+ differ = GraphDiffer(store)
363
+ nodes = [
364
+ _nd("Function", "foo", 1),
365
+ _nd("Class", "Bar", 10),
366
+ ]
367
+ result = differ.diff_file("src/app.py", nodes)
368
+ assert result.added == 2
369
+ assert result.modified == 0
370
+ assert result.unchanged == 0
371
+ assert result.removed == 0
372
+
373
+ def test_diff_all_unchanged_nodes(self):
374
+ store = _make_store(rows=[
375
+ ["Function", "foo", 1],
376
+ ["Class", "Bar", 10],
377
+ ])
378
+ differ = GraphDiffer(store)
379
+ nodes = [
380
+ _nd("Function", "foo", 1),
381
+ _nd("Class", "Bar", 10),
382
+ ]
383
+ result = differ.diff_file("src/app.py", nodes)
384
+ assert result.unchanged == 2
385
+ assert result.added == 0
386
+ assert result.modified == 0
387
+ assert result.removed == 0
388
+
389
+ def test_diff_modified_node(self):
390
+ """Same identity key but different extra props counts as modified."""
391
+ store = _make_store(rows=[["Function", "foo", 1]])
392
+ differ = GraphDiffer(store)
393
+ # Existing node in store has no extra; new node has docstring.
394
+ nodes = [_nd("Function", "foo", 1, docstring="now documented")]
395
+ result = differ.diff_file("src/app.py", nodes)
396
+ # The identity key matches but extra differs → modified.
397
+ assert result.modified == 1
398
+ assert result.unchanged == 0
399
+ assert result.added == 0
400
+
401
+ def test_diff_removed_nodes(self):
402
+ store = _make_store(rows=[
403
+ ["Function", "foo", 1],
404
+ ["Function", "bar", 5],
405
+ ])
406
+ differ = GraphDiffer(store)
407
+ # Only foo is present in new parse; bar was removed.
408
+ nodes = [_nd("Function", "foo", 1)]
409
+ result = differ.diff_file("src/app.py", nodes)
410
+ assert result.removed == 1
411
+ assert result.unchanged == 1
412
+
413
+ def test_diff_mixed_scenario(self):
414
+ store = _make_store(rows=[
415
+ ["Function", "old_func", 1],
416
+ ["Class", "MyClass", 20],
417
+ ])
418
+ differ = GraphDiffer(store)
419
+ new_nodes = [
420
+ _nd("Class", "MyClass", 20), # unchanged
421
+ _nd("Function", "new_func", 5), # added
422
+ ]
423
+ result = differ.diff_file("src/app.py", new_nodes)
424
+ assert result.unchanged == 1
425
+ assert result.added == 1
426
+ assert result.removed == 1
427
+ assert result.modified == 0
428
+
429
+ def test_diff_skips_rows_with_none_name(self):
430
+ store = _make_store(rows=[[None, None, None]])
431
+ differ = GraphDiffer(store)
432
+ result = differ.diff_file("src/app.py", [_nd("Function", "foo", 1)])
433
+ # The None row is skipped; foo is treated as a new node.
434
+ assert result.added == 1
435
+ assert result.removed == 0
436
+
437
+ def test_total_changes_property(self):
438
+ result = DiffResult(added=3, modified=1, unchanged=5, removed=2)
439
+ assert result.total_changes == 6
440
+
441
+ def test_store_is_queried_with_file_path(self):
442
+ store = _make_store(rows=[])
443
+ differ = GraphDiffer(store)
444
+ differ.diff_file("src/models.py", [])
445
+ # Ensure the store was actually queried with the right path param.
446
+ store.query.assert_called_once()
447
+ _, kwargs_or_positional = store.query.call_args[0], store.query.call_args
448
+ # The second positional arg to store.query should contain file_path.
449
+ call_params = store.query.call_args[0][1]
450
+ assert call_params["file_path"] == "src/models.py"
451
+
452
+
453
+# ── #45 — ParallelIngester ────────────────────────────────────────────────────
454
+
455
+
456
+class TestParallelIngester:
457
+ def _setup_ingester_with_mock_parser(self, store, parse_result=None):
458
+ """
459
+ Return a ParallelIngester whose internal RepoIngester has a mock
460
+ Python parser installed.
461
+ """
462
+ if parse_result is None:
463
+ parse_result = {"functions": 2, "classes": 1, "edges": 3}
464
+
465
+ ingester = ParallelIngester(store)
466
+ mock_parser = MagicMock()
467
+ mock_parser.parse_file.return_value = parse_result
468
+ ingester._ingester._parsers["python"] = mock_parser
469
+ return ingester, mock_parser
470
+
471
+ def test_raises_on_missing_dir(self):
472
+ store = _make_store()
473
+ ingester = ParallelIngester(store)
474
+ with pytest.raises(FileNotFoundError):
475
+ ingester.ingest_parallel("/nonexistent/path")
476
+
477
+ def test_returns_stats_dict_with_all_keys(self):
478
+ store = _make_store()
479
+ ingester, _ = self._setup_ingester_with_mock_parser(store)
480
+ with tempfile.TemporaryDirectory() as tmpdir:
481
+ stats = ingester.ingest_parallel(tmpdir)
482
+ assert {"files", "functions", "classes", "edges", "skipped", "errors"} <= set(stats)
483
+
484
+ def test_processes_single_file(self):
485
+ store = _make_store()
486
+ ingester, mock_parser = self._setup_ingester_with_mock_parser(
487
+ store, {"functions": 3, "classes": 1, "edges": 4}
488
+ )
489
+ with tempfile.TemporaryDirectory() as tmpdir:
490
+ (Path(tmpdir) / "app.py").write_text("def foo(): pass")
491
+ stats = ingester.ingest_parallel(tmpdir)
492
+
493
+ assert stats["files"] == 1
494
+ assert stats["functions"] == 3
495
+ assert stats["classes"] == 1
496
+ assert stats["edges"] == 4
497
+ assert stats["errors"] == 0
498
+
499
+ def test_processes_multiple_files_concurrently(self):
500
+ store = _make_store()
501
+ ingester, mock_parser = self._setup_ingester_with_mock_parser(
502
+ store, {"functions": 1, "classes": 0, "edges": 0}
503
+ )
504
+ with tempfile.TemporaryDirectory() as tmpdir:
505
+ for i in range(5):
506
+ (Path(tmpdir) / f"mod{i}.py").write_text(f"def f{i}(): pass")
507
+ stats = ingester.ingest_parallel(tmpdir, max_workers=3)
508
+
509
+ assert stats["files"] == 5
510
+ assert stats["functions"] == 5
511
+ assert stats["errors"] == 0
512
+
513
+ def test_aggregates_stats_across_files(self):
514
+ store = _make_store()
515
+ ingester, _ = self._setup_ingester_with_mock_parser(
516
+ store, {"functions": 2, "classes": 1, "edges": 5}
517
+ )
518
+ with tempfile.TemporaryDirectory() as tmpdir:
519
+ (Path(tmpdir) / "a.py").write_text("x=1")
520
+ (Path(tmpdir) / "b.py").write_text("y=2")
521
+ stats = ingester.ingest_parallel(tmpdir)
522
+
523
+ assert stats["files"] == 2
524
+ assert stats["functions"] == 4
525
+ assert stats["classes"] == 2
526
+ assert stats["edges"] == 10
527
+
528
+ def test_clear_flag_calls_store_clear(self):
529
+ store = _make_store()
530
+ ingester, _ = self._setup_ingester_with_mock_parser(store)
531
+ with tempfile.TemporaryDirectory() as tmpdir:
532
+ ingester.ingest_parallel(tmpdir, clear=True)
533
+ store.clear.assert_called_once()
534
+
535
+ def test_no_clear_by_default(self):
536
+ store = _make_store()
537
+ ingester, _ = self._setup_ingester_with_mock_parser(store)
538
+ with tempfile.TemporaryDirectory() as tmpdir:
539
+ ingester.ingest_parallel(tmpdir)
540
+ store.clear.assert_not_called()
541
+
542
+ def test_empty_repo_returns_zero_counts(self):
543
+ store = _make_store()
544
+ ingester, _ = self._setup_ingester_with_mock_parser(store)
545
+ with tempfile.TemporaryDirectory() as tmpdir:
546
+ stats = ingester.ingest_parallel(tmpdir)
547
+ assert stats["files"] == 0
548
+ assert stats["functions"] == 0
549
+
550
+ def test_parser_exception_increments_errors_not_files(self):
551
+ store = _make_store()
552
+ ingester = ParallelIngester(store)
553
+ broken_parser = MagicMock()
554
+ broken_parser.parse_file.side_effect = RuntimeError("boom")
555
+ ingester._ingester._parsers["python"] = broken_parser
556
+
557
+ with tempfile.TemporaryDirectory() as tmpdir:
558
+ (Path(tmpdir) / "broken.py").write_text("def x(): pass")
559
+ stats = ingester.ingest_parallel(tmpdir)
560
+
561
+ assert stats["files"] == 0
562
+ assert stats["errors"] == 1
563
+
564
+ def test_incremental_skips_unchanged_files(self):
565
+ store = _make_store()
566
+ ingester, mock_parser = self._setup_ingester_with_mock_parser(store)
567
+ ingester._ingester._file_unchanged = MagicMock(return_value=True)
568
+
569
+ with tempfile.TemporaryDirectory() as tmpdir:
570
+ (Path(tmpdir) / "unchanged.py").write_text("x=1")
571
+ stats = ingester.ingest_parallel(tmpdir, incremental=True)
572
+
573
+ assert stats["skipped"] == 1
574
+ assert stats["files"] == 0
575
+ mock_parser.parse_file.assert_not_called()
576
+
577
+ def test_creates_repository_node(self):
578
+ store = _make_store()
579
+ ingester, _ = self._setup_ingester_with_mock_parser(store)
580
+ with tempfile.TemporaryDirectory() as tmpdir:
581
+ ingester.ingest_parallel(tmpdir)
582
+
583
+ from navegador.graph.schema import NodeLabel
584
+
585
+ store.create_node.assert_called_once()
586
+ label, props = store.create_node.call_args[0]
587
+ assert label == NodeLabel.Repository
588
+ assert "name" in props and "path" in props
589
+
590
+ def test_max_workers_none_uses_default(self):
591
+ """Passing max_workers=None should not raise."""
592
+ store = _make_store()
593
+ ingester, _ = self._setup_ingester_with_mock_parser(store)
594
+ with tempfile.TemporaryDirectory() as tmpdir:
595
+ stats = ingester.ingest_parallel(tmpdir, max_workers=None)
596
+ assert isinstance(stats, dict)
597
+
598
+ def test_skips_non_python_files(self):
599
+ store = _make_store()
600
+ ingester, mock_parser = self._setup_ingester_with_mock_parser(store)
601
+ with tempfile.TemporaryDirectory() as tmpdir:
602
+ (Path(tmpdir) / "readme.md").write_text("# readme")
603
+ (Path(tmpdir) / "config.yaml").write_text("key: value")
604
+ stats = ingester.ingest_parallel(tmpdir)
605
+ assert stats["files"] == 0
606
+ mock_parser.parse_file.assert_not_called()
--- a/tests/test_optimization.py
+++ b/tests/test_optimization.py
@@ -0,0 +1,606 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
--- a/tests/test_optimization.py
+++ b/tests/test_optimization.py
@@ -0,0 +1,606 @@
1 """Tests for navegador.ingestion.optimization (#42 – #45)."""
2
3 from __future__ import annotations
4
5 import tempfile
6 import threading
7 from pathlib import Path
8 from unittest.mock import MagicMock, call, patch
9
10 import pytest
11
12 from navegador.ingestion.optimization import (
13 DiffResult,
14 GraphDiffer,
15 IncrementalParser,
16 NodeDescriptor,
17 ParallelIngester,
18 TreeCache,
19 )
20
21
22 # ── helpers ───────────────────────────────────────────────────────────────────
23
24
25 def _make_store(rows=None):
26 """Return a MagicMock GraphStore whose query() returns *rows*."""
27 store = MagicMock()
28 store.query.return_value = MagicMock(result_set=rows or [])
29 return store
30
31
32 def _mock_tree(name: str = "tree") -> MagicMock:
33 t = MagicMock()
34 t.__repr__ = lambda self: f"<MockTree {name}>"
35 return t
36
37
38 # ── #42 — TreeCache ───────────────────────────────────────────────────────────
39
40
class TestTreeCache:
    """LRU cache behavior: get/put, eviction order, stats, clear, threads."""

    # ── get / put ──────────────────────────────────────────────────────────────

    def test_get_returns_none_on_cold_cache(self):
        assert TreeCache().get("foo.py", "abc") is None

    def test_put_and_get_roundtrip(self):
        lru = TreeCache()
        parsed = _mock_tree()
        lru.put("foo.py", "abc123", parsed)
        assert lru.get("foo.py", "abc123") is parsed

    def test_get_miss_does_not_return_wrong_hash(self):
        lru = TreeCache()
        lru.put("foo.py", "hash-A", _mock_tree())
        assert lru.get("foo.py", "hash-B") is None

    def test_get_miss_does_not_return_wrong_path(self):
        lru = TreeCache()
        lru.put("foo.py", "hash-A", _mock_tree())
        assert lru.get("bar.py", "hash-A") is None

    def test_put_overwrites_existing_entry(self):
        lru = TreeCache()
        first, second = _mock_tree("t1"), _mock_tree("t2")
        lru.put("foo.py", "abc", first)
        lru.put("foo.py", "abc", second)
        assert lru.get("foo.py", "abc") is second

    # ── LRU eviction ──────────────────────────────────────────────────────────

    def test_evicts_lru_entry_when_full(self):
        lru = TreeCache(max_size=2)
        t1, t2, t3 = _mock_tree("t1"), _mock_tree("t2"), _mock_tree("t3")

        lru.put("a.py", "1", t1)
        lru.put("b.py", "2", t2)
        # Cache is full; this insert must push out the oldest entry (t1).
        lru.put("c.py", "3", t3)

        assert lru.get("a.py", "1") is None
        assert lru.get("b.py", "2") is t2
        assert lru.get("c.py", "3") is t3

    def test_get_promotes_entry_so_it_is_not_evicted(self):
        lru = TreeCache(max_size=2)
        t1, t2, t3 = _mock_tree("t1"), _mock_tree("t2"), _mock_tree("t3")

        lru.put("a.py", "1", t1)
        lru.put("b.py", "2", t2)
        # Reading a.py refreshes it, so b.py becomes the eviction victim.
        lru.get("a.py", "1")
        lru.put("c.py", "3", t3)

        assert lru.get("a.py", "1") is t1
        assert lru.get("b.py", "2") is None
        assert lru.get("c.py", "3") is t3

    def test_size_respects_max_size(self):
        lru = TreeCache(max_size=3)
        for idx in range(10):
            lru.put(f"file{idx}.py", str(idx), _mock_tree())
        assert len(lru) <= 3

    def test_constructor_rejects_zero_max_size(self):
        with pytest.raises(ValueError):
            TreeCache(max_size=0)

    # ── stats ──────────────────────────────────────────────────────────────────

    def test_stats_initial_state(self):
        snapshot = TreeCache().stats()
        assert snapshot["hits"] == 0
        assert snapshot["misses"] == 0
        assert snapshot["size"] == 0

    def test_stats_records_hits(self):
        lru = TreeCache()
        lru.put("x.py", "h", _mock_tree())
        for _ in range(2):
            lru.get("x.py", "h")
        assert lru.stats()["hits"] == 2

    def test_stats_records_misses(self):
        lru = TreeCache()
        lru.get("x.py", "h")
        lru.get("y.py", "h")
        assert lru.stats()["misses"] == 2

    def test_stats_size_tracks_entries(self):
        lru = TreeCache(max_size=10)
        lru.put("a.py", "1", _mock_tree())
        lru.put("b.py", "2", _mock_tree())
        assert lru.stats()["size"] == 2

    def test_stats_max_size_reported(self):
        assert TreeCache(max_size=42).stats()["max_size"] == 42

    # ── clear ──────────────────────────────────────────────────────────────────

    def test_clear_removes_all_entries(self):
        lru = TreeCache()
        lru.put("a.py", "1", _mock_tree())
        lru.put("b.py", "2", _mock_tree())
        lru.clear()
        assert len(lru) == 0
        assert lru.get("a.py", "1") is None

    def test_clear_resets_stats(self):
        lru = TreeCache()
        lru.put("a.py", "1", _mock_tree())
        lru.get("a.py", "1")    # one hit
        lru.get("a.py", "bad")  # one miss
        lru.clear()
        snapshot = lru.stats()
        assert snapshot["hits"] == 0
        assert snapshot["misses"] == 0
        assert snapshot["size"] == 0

    # ── thread safety ──────────────────────────────────────────────────────────

    def test_concurrent_puts_do_not_corrupt_state(self):
        lru = TreeCache(max_size=50)
        failures = []

        def writer(worker: int) -> None:
            try:
                for i in range(20):
                    lru.put(f"file{worker}_{i}.py", str(i), _mock_tree())
            except Exception as exc:  # noqa: BLE001
                failures.append(exc)

        workers = [threading.Thread(target=writer, args=(w,)) for w in range(5)]
        for w in workers:
            w.start()
        for w in workers:
            w.join()

        assert not failures
        assert len(lru) <= 50
192
193
194 # ── #43 — IncrementalParser ───────────────────────────────────────────────────
195
196
class TestIncrementalParser:
    """Cache-aware parsing: hits skip the parser, stale trees become old_tree."""

    def _make_language_and_parser(self):
        """
        Build a fake tree-sitter Language plus the mocks needed to patch
        ``tree_sitter.Parser``; its parse() yields a fresh MagicMock tree.
        """
        parsed = _mock_tree("parsed")
        fake_parser = MagicMock()
        fake_parser.parse.return_value = parsed
        fake_language = MagicMock()

        # Stand-in for the tree_sitter.Parser class itself.
        ts_parser = MagicMock()
        ts_parser.parse.return_value = parsed
        ts_class = MagicMock(return_value=ts_parser)

        return parsed, ts_parser, ts_class, fake_language

    def test_parse_returns_tree(self):
        inc = IncrementalParser(TreeCache())
        expected = _mock_tree()
        ts_parser = MagicMock()
        ts_parser.parse.return_value = expected

        with patch("tree_sitter.Parser", return_value=ts_parser):
            got = inc.parse(b"source", MagicMock(), "foo.py", "hash1")

        assert got is expected

    def test_parse_stores_tree_in_cache(self):
        lru = TreeCache()
        inc = IncrementalParser(lru)

        parsed = _mock_tree()
        ts_parser = MagicMock()
        ts_parser.parse.return_value = parsed

        with patch("tree_sitter.Parser", return_value=ts_parser):
            inc.parse(b"source", MagicMock(), "foo.py", "hash1")

        assert lru.get("foo.py", "hash1") is parsed

    def test_parse_returns_cached_tree_without_calling_parser(self):
        pre_cached = _mock_tree("cached")
        lru = TreeCache()
        lru.put("foo.py", "hash1", pre_cached)

        inc = IncrementalParser(lru)
        ts_parser = MagicMock()

        with patch("tree_sitter.Parser", return_value=ts_parser):
            got = inc.parse(b"source", MagicMock(), "foo.py", "hash1")

        assert got is pre_cached
        ts_parser.parse.assert_not_called()

    def test_cache_hit_increments_hit_count(self):
        lru = TreeCache()
        lru.put("foo.py", "hashX", _mock_tree())

        inc = IncrementalParser(lru)
        with patch("tree_sitter.Parser", return_value=MagicMock()):
            inc.parse(b"src", MagicMock(), "foo.py", "hashX")

        assert lru.stats()["hits"] == 1

    def test_parse_passes_old_tree_on_rehash(self):
        """When a stale tree exists for the same path, it is passed as old_tree."""
        lru = TreeCache()
        previous = _mock_tree("stale")
        lru.put("bar.py", "old-hash", previous)

        fresh = _mock_tree("new")
        ts_parser = MagicMock()
        ts_parser.parse.return_value = fresh

        inc = IncrementalParser(lru)
        with patch("tree_sitter.Parser", return_value=ts_parser):
            got = inc.parse(b"new source", MagicMock(), "bar.py", "new-hash")

        assert got is fresh
        # The stale tree must ride along as the second positional argument.
        ts_parser.parse.assert_called_once_with(b"new source", previous)

    def test_parse_without_old_tree_calls_parse_with_source_only(self):
        fresh = _mock_tree()
        ts_parser = MagicMock()
        ts_parser.parse.return_value = fresh

        inc = IncrementalParser(TreeCache())
        with patch("tree_sitter.Parser", return_value=ts_parser):
            inc.parse(b"source", MagicMock(), "baz.py", "hash1")

        ts_parser.parse.assert_called_once_with(b"source")

    def test_default_cache_is_created_if_none_given(self):
        assert isinstance(IncrementalParser().cache, TreeCache)

    def test_custom_cache_is_used(self):
        lru = TreeCache(max_size=5)
        assert IncrementalParser(lru).cache is lru

    def test_fallback_when_tree_sitter_not_importable(self):
        """When tree_sitter is unavailable, language is used directly as parser."""
        expected = _mock_tree()
        language = MagicMock()
        language.parse.return_value = expected

        inc = IncrementalParser(TreeCache())

        import builtins

        original_import = builtins.__import__

        def _block_tree_sitter(name, *args, **kwargs):
            if name == "tree_sitter":
                raise ImportError("mocked absence")
            return original_import(name, *args, **kwargs)

        with patch("builtins.__import__", side_effect=_block_tree_sitter):
            got = inc.parse(b"source", language, "x.py", "h1")

        assert got is expected
327
328
329 # ── #44 — GraphDiffer ─────────────────────────────────────────────────────────
330
331
def _nd(label: str, name: str, line_start: int, **extra) -> NodeDescriptor:
    """Shorthand constructor: keyword args become the descriptor's ``extra``."""
    return NodeDescriptor(label=label, name=name, line_start=line_start, extra=extra)
334
335
class TestNodeDescriptor:
    """Identity keys and equality semantics of NodeDescriptor."""

    def test_identity_key(self):
        descriptor = _nd("Function", "foo", 10)
        assert descriptor.identity_key() == ("Function", "foo", 10)

    def test_equality_same(self):
        left = _nd("Function", "foo", 10)
        right = _nd("Function", "foo", 10)
        assert left == right

    def test_equality_different_line(self):
        assert _nd("Function", "foo", 10) != _nd("Function", "foo", 11)

    def test_equality_different_extra(self):
        documented = _nd("Function", "foo", 10, docstring="hello")
        redocumented = _nd("Function", "foo", 10, docstring="world")
        assert documented != redocumented
351
352
class TestGraphDiffer:
    """Diffing a fresh parse against what the (mocked) graph store holds."""

    def test_diff_empty_new_and_empty_existing(self):
        store = _make_store(rows=[])
        differ = GraphDiffer(store)
        result = differ.diff_file("src/app.py", [])
        assert result == DiffResult(added=0, modified=0, unchanged=0, removed=0)

    def test_diff_all_new_nodes(self):
        store = _make_store(rows=[])
        differ = GraphDiffer(store)
        nodes = [
            _nd("Function", "foo", 1),
            _nd("Class", "Bar", 10),
        ]
        result = differ.diff_file("src/app.py", nodes)
        assert result.added == 2
        assert result.modified == 0
        assert result.unchanged == 0
        assert result.removed == 0

    def test_diff_all_unchanged_nodes(self):
        store = _make_store(rows=[
            ["Function", "foo", 1],
            ["Class", "Bar", 10],
        ])
        differ = GraphDiffer(store)
        nodes = [
            _nd("Function", "foo", 1),
            _nd("Class", "Bar", 10),
        ]
        result = differ.diff_file("src/app.py", nodes)
        assert result.unchanged == 2
        assert result.added == 0
        assert result.modified == 0
        assert result.removed == 0

    def test_diff_modified_node(self):
        """Same identity key but different extra props counts as modified."""
        store = _make_store(rows=[["Function", "foo", 1]])
        differ = GraphDiffer(store)
        # Existing node in store has no extra; new node has docstring.
        nodes = [_nd("Function", "foo", 1, docstring="now documented")]
        result = differ.diff_file("src/app.py", nodes)
        # The identity key matches but extra differs → modified.
        assert result.modified == 1
        assert result.unchanged == 0
        assert result.added == 0

    def test_diff_removed_nodes(self):
        store = _make_store(rows=[
            ["Function", "foo", 1],
            ["Function", "bar", 5],
        ])
        differ = GraphDiffer(store)
        # Only foo is present in new parse; bar was removed.
        nodes = [_nd("Function", "foo", 1)]
        result = differ.diff_file("src/app.py", nodes)
        assert result.removed == 1
        assert result.unchanged == 1

    def test_diff_mixed_scenario(self):
        store = _make_store(rows=[
            ["Function", "old_func", 1],
            ["Class", "MyClass", 20],
        ])
        differ = GraphDiffer(store)
        new_nodes = [
            _nd("Class", "MyClass", 20),  # unchanged
            _nd("Function", "new_func", 5),  # added
        ]
        result = differ.diff_file("src/app.py", new_nodes)
        assert result.unchanged == 1
        assert result.added == 1
        assert result.removed == 1
        assert result.modified == 0

    def test_diff_skips_rows_with_none_name(self):
        store = _make_store(rows=[[None, None, None]])
        differ = GraphDiffer(store)
        result = differ.diff_file("src/app.py", [_nd("Function", "foo", 1)])
        # The None row is skipped; foo is treated as a new node.
        assert result.added == 1
        assert result.removed == 0

    def test_total_changes_property(self):
        result = DiffResult(added=3, modified=1, unchanged=5, removed=2)
        assert result.total_changes == 6

    def test_store_is_queried_with_file_path(self):
        store = _make_store(rows=[])
        differ = GraphDiffer(store)
        differ.diff_file("src/models.py", [])
        # Ensure the store was actually queried with the right path param.
        store.query.assert_called_once()
        # The second positional arg to store.query holds the query params;
        # it must carry the path that was diffed.
        # (A previous revision had a dead tuple-assignment here that bound
        # unused names; it has been removed.)
        call_params = store.query.call_args[0][1]
        assert call_params["file_path"] == "src/models.py"
451
452
453 # ── #45 — ParallelIngester ────────────────────────────────────────────────────
454
455
class TestParallelIngester:
    """Parallel repo ingestion: stats aggregation, errors, skipping, clear."""

    def _setup_ingester_with_mock_parser(self, store, parse_result=None):
        """
        Build a ParallelIngester whose wrapped RepoIngester has a stub
        Python parser installed; returns ``(ingester, parser mock)``.
        """
        if parse_result is None:
            parse_result = {"functions": 2, "classes": 1, "edges": 3}

        ingester = ParallelIngester(store)
        parser_stub = MagicMock()
        parser_stub.parse_file.return_value = parse_result
        ingester._ingester._parsers["python"] = parser_stub
        return ingester, parser_stub

    def test_raises_on_missing_dir(self):
        ingester = ParallelIngester(_make_store())
        with pytest.raises(FileNotFoundError):
            ingester.ingest_parallel("/nonexistent/path")

    def test_returns_stats_dict_with_all_keys(self):
        ingester, _ = self._setup_ingester_with_mock_parser(_make_store())
        with tempfile.TemporaryDirectory() as repo:
            stats = ingester.ingest_parallel(repo)
        expected = {"files", "functions", "classes", "edges", "skipped", "errors"}
        assert expected <= set(stats)

    def test_processes_single_file(self):
        ingester, _ = self._setup_ingester_with_mock_parser(
            _make_store(), {"functions": 3, "classes": 1, "edges": 4}
        )
        with tempfile.TemporaryDirectory() as repo:
            (Path(repo) / "app.py").write_text("def foo(): pass")
            stats = ingester.ingest_parallel(repo)

        assert stats["files"] == 1
        assert stats["functions"] == 3
        assert stats["classes"] == 1
        assert stats["edges"] == 4
        assert stats["errors"] == 0

    def test_processes_multiple_files_concurrently(self):
        ingester, _ = self._setup_ingester_with_mock_parser(
            _make_store(), {"functions": 1, "classes": 0, "edges": 0}
        )
        with tempfile.TemporaryDirectory() as repo:
            for i in range(5):
                (Path(repo) / f"mod{i}.py").write_text(f"def f{i}(): pass")
            stats = ingester.ingest_parallel(repo, max_workers=3)

        assert stats["files"] == 5
        assert stats["functions"] == 5
        assert stats["errors"] == 0

    def test_aggregates_stats_across_files(self):
        ingester, _ = self._setup_ingester_with_mock_parser(
            _make_store(), {"functions": 2, "classes": 1, "edges": 5}
        )
        with tempfile.TemporaryDirectory() as repo:
            (Path(repo) / "a.py").write_text("x=1")
            (Path(repo) / "b.py").write_text("y=2")
            stats = ingester.ingest_parallel(repo)

        assert stats["files"] == 2
        assert stats["functions"] == 4
        assert stats["classes"] == 2
        assert stats["edges"] == 10

    def test_clear_flag_calls_store_clear(self):
        graph_store = _make_store()
        ingester, _ = self._setup_ingester_with_mock_parser(graph_store)
        with tempfile.TemporaryDirectory() as repo:
            ingester.ingest_parallel(repo, clear=True)
        graph_store.clear.assert_called_once()

    def test_no_clear_by_default(self):
        graph_store = _make_store()
        ingester, _ = self._setup_ingester_with_mock_parser(graph_store)
        with tempfile.TemporaryDirectory() as repo:
            ingester.ingest_parallel(repo)
        graph_store.clear.assert_not_called()

    def test_empty_repo_returns_zero_counts(self):
        ingester, _ = self._setup_ingester_with_mock_parser(_make_store())
        with tempfile.TemporaryDirectory() as repo:
            stats = ingester.ingest_parallel(repo)
        assert stats["files"] == 0
        assert stats["functions"] == 0

    def test_parser_exception_increments_errors_not_files(self):
        ingester = ParallelIngester(_make_store())
        failing_parser = MagicMock()
        failing_parser.parse_file.side_effect = RuntimeError("boom")
        ingester._ingester._parsers["python"] = failing_parser

        with tempfile.TemporaryDirectory() as repo:
            (Path(repo) / "broken.py").write_text("def x(): pass")
            stats = ingester.ingest_parallel(repo)

        assert stats["files"] == 0
        assert stats["errors"] == 1

    def test_incremental_skips_unchanged_files(self):
        ingester, parser_stub = self._setup_ingester_with_mock_parser(_make_store())
        ingester._ingester._file_unchanged = MagicMock(return_value=True)

        with tempfile.TemporaryDirectory() as repo:
            (Path(repo) / "unchanged.py").write_text("x=1")
            stats = ingester.ingest_parallel(repo, incremental=True)

        assert stats["skipped"] == 1
        assert stats["files"] == 0
        parser_stub.parse_file.assert_not_called()

    def test_creates_repository_node(self):
        graph_store = _make_store()
        ingester, _ = self._setup_ingester_with_mock_parser(graph_store)
        with tempfile.TemporaryDirectory() as repo:
            ingester.ingest_parallel(repo)

        from navegador.graph.schema import NodeLabel

        graph_store.create_node.assert_called_once()
        label, props = graph_store.create_node.call_args[0]
        assert label == NodeLabel.Repository
        assert "name" in props and "path" in props

    def test_max_workers_none_uses_default(self):
        """Passing max_workers=None should not raise."""
        ingester, _ = self._setup_ingester_with_mock_parser(_make_store())
        with tempfile.TemporaryDirectory() as repo:
            stats = ingester.ingest_parallel(repo, max_workers=None)
        assert isinstance(stats, dict)

    def test_skips_non_python_files(self):
        ingester, parser_stub = self._setup_ingester_with_mock_parser(_make_store())
        with tempfile.TemporaryDirectory() as repo:
            (Path(repo) / "readme.md").write_text("# readme")
            (Path(repo) / "config.yaml").write_text("key: value")
            stats = ingester.ingest_parallel(repo)
        assert stats["files"] == 0
        parser_stub.parse_file.assert_not_called()

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button