diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f3aca6..347ee4e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,44 @@ All notable changes to this project are documented here. The format is based on ## [Unreleased] +### Changed — retrieval ranking & fusion (requires a one-time reindex) +- **RRF fusion rescaled and re-keyed.** Fused scores were ~`w/k` (≈0.017), an order + of magnitude below the reranker's bounded bonuses, so rerank silently became the + primary ranker. RRF is now scaled by `k` (a pure monotonic rescale — order is + unchanged) so fused scores and rerank bonuses share an O(1) scale. Fusion also + merges on a coarse `(path, line-bucket)` key instead of an exact `(path, start, + end)` one: different retrievers report different ranges for the same place, so the + exact key almost never coincided and cross-source agreement never fired. + `agreeing_sources` is now counted at file granularity. +- **Confidence uses a scale-invariant relative gap** instead of absolute thresholds. +- **Per-file diversification**: at most 3 hits per file stay on the page; the rest + are pushed to the tail (nothing is dropped). Combined with bucketing this removes + the "same small file returned six times at different line slivers" noise. +- **FTS recall on natural-language queries**: stopwords (`how`, `does`, `the`, …) + are dropped before building the FTS `MATCH`, so a query like "how does auth work" + no longer AND-s in filler that code chunks never contain. +- **Symbol names are FTS-indexed.** `chunks` gained a denormalized `symbol_names` + column (mirrored verbatim by the FTS sync triggers, so external-content delete/ + update stays consistent) — a query matching a symbol's name now hits even when the + body text doesn't repeat it. **Bumps `SCHEMA_VERSION` 1 → 2.** Older indexes are + still readable; `index`/`update` detect the mismatch and rebuild from scratch. 
+- **Centrality fallback for ambiguous names**: symbols whose name isn't globally + unique never get a resolved `in_degree`; they now receive a damped, half-capped + bonus from a name-reference count so common names (`run`, `handle`, …) aren't + flatly zeroed. Precise `in_degree`, where present, still takes precedence. +- **Test-file demotion is word-boundary aware**: `contest/`, `latest.py`, + `testimonials.tsx` are no longer mistaken for test files. +- **Language-aware import resolution**: `import './base'` from a `.ts` file resolves + to `base.ts` rather than a same-named `base.py` earlier in the fallback order. +- **Freshness is content-aware**: a bare `touch` (mtime change, identical bytes) is + a no-op for `update`, so it no longer reports the index as stale — freshness now + mirrors the sha-based incremental decision. + +### Removed +- Dead legacy lexical-search path in `retrieval/searchers.py` (`fts_response`, + `fts_search`, the second `Candidate` dataclass and `_confidence`/`_fallbacks`/ + `_trim`) — the live path goes through `pipeline.search` → `fts_candidates`. 
def _remove_db_files(db_path: Path) -> None:
    """Delete the SQLite db and its WAL/SHM sidecars (used to force a clean rebuild).

    Targets are ``db_path`` itself plus ``<name>-wal`` and ``<name>-shm`` in the
    same directory. A target that is already absent is skipped, so the call is
    idempotent. Any other ``OSError`` (e.g. permission denied) propagates.
    """
    sidecars = (db_path.with_name(db_path.name + suffix) for suffix in ("-wal", "-shm"))
    for target in (db_path, *sidecars):
        # EAFP instead of exists()+unlink(): a sidecar can disappear between the
        # check and the delete, which would otherwise abort the rebuild.
        try:
            target.unlink()
        except FileNotFoundError:
            continue
+ if rebuild or (db_path.exists() and peek_schema_version(db_path) < SCHEMA_VERSION): + _remove_db_files(db_path) with Database(db_path) as db: stats = build_index(cfg, db, root=Path(cfg.root)) @@ -321,8 +330,8 @@ def update( import json as _json from .config import load - from .indexer.pipeline import update_index - from .storage.db import Database + from .indexer.pipeline import build_index, update_index + from .storage.db import SCHEMA_VERSION, Database, peek_schema_version is_json = bool(ctx.obj and ctx.obj.get("json")) quiet = bool(ctx.obj and ctx.obj.get("quiet")) @@ -336,8 +345,15 @@ def update( typer.echo("No index found. Run `codebase-index index` first.") raise typer.Exit(code=0) - with Database(db_path) as db: - stats = update_index(cfg, db, root=Path(cfg.root), since=since, all_files=all_files) + if peek_schema_version(db_path) < SCHEMA_VERSION: + # Schema changed under the index; an incremental write would target old + # tables. Upgrade by rebuilding from scratch (the index is a derived cache). + _remove_db_files(db_path) + with Database(db_path) as db: + stats = build_index(cfg, db, root=Path(cfg.root)) + else: + with Database(db_path) as db: + stats = update_index(cfg, db, root=Path(cfg.root), since=since, all_files=all_files) if is_json: typer.echo( diff --git a/src/codebase_index/discovery/classify.py b/src/codebase_index/discovery/classify.py index f41be35..d9983ef 100644 --- a/src/codebase_index/discovery/classify.py +++ b/src/codebase_index/discovery/classify.py @@ -132,3 +132,20 @@ def is_generated(path: str) -> bool: or name.endswith(".min.js") or name.endswith(".min.css") ) + + +# Directory names that mark a test tree, and filename patterns for test modules. +# Matched on whole path segments / filename stems — NOT a bare substring — so +# `contest/`, `latest.py`, or `testimonials.ts` are never mistaken for tests. 
# Directory names that mark a test tree. Compared against whole path segments,
# never as a substring, so `contest/` or `latest/` do not match.
_TEST_DIRS = {"test", "tests", "__tests__", "__test__", "testing", "spec", "specs", "e2e"}


def is_test_path(path: str) -> bool:
    """Return True when `path` looks like a test file.

    Backslashes are normalised to `/` and matching is case-insensitive. A path
    is a test path when any of the following holds:

    * one of its directory segments (everything but the final component) is in
      `_TEST_DIRS`;
    * the filename stem (text before the first `.`) is `test` or `tests`, starts
      with `test_`, or ends with `_test` or `_spec`;
    * the filename contains `.test.` or `.spec.` (e.g. `button.spec.tsx`).

    Whole segments and stems are compared, so `contest/`, `latest.py` and
    `testimonials.ts` are not treated as tests.
    """
    pure = PurePosixPath(path.replace("\\", "/"))
    if any(part.lower() in _TEST_DIRS for part in pure.parts[:-1]):
        return True
    name = pure.name.lower()
    stem = name.split(".", 1)[0]
    # `tests.py` and `*_spec` were caught by the old substring check; keep them
    # recognised now that matching is word-boundary aware.
    if stem in ("test", "tests") or stem.startswith("test_") or stem.endswith(("_test", "_spec")):
        return True
    return ".test." in name or ".spec." in name
Optional[int]: """Resolve a module/import path to a unique file id, or None. Handles Python, TypeScript/JavaScript, Java/Kotlin/Scala, Rust (:: separator), - Go (last path segment), C#, Ruby, and PHP import conventions. + Go (last path segment), C#, Ruby, and PHP import conventions. The importing + file's `lang` is tried first so that, in a polyglot repo, `import './base'` from + a .ts file resolves to base.ts rather than a same-named base.py earlier in the + fixed fallback order. The fallback order is unchanged, so single-language repos + and the lang-unknown path behave exactly as before. """ base = module.lower().replace(".", "/").strip("/") rust_base = module.lower().replace("::", "/").strip("/") @@ -85,7 +108,7 @@ def _module_to_file_id( # Last segment used for Go package-level resolution go_pkg = base.rsplit("/", 1)[-1] if "/" in base else base - for suffix in ( + fallback = ( # Python f"{base}.py", f"{base}/__init__.py", @@ -113,7 +136,8 @@ def _module_to_file_id( f"{base}.rb", # PHP f"{base}.php", - ): + ) + for suffix in (*_lang_suffixes(lang, base, rust_base, go_pkg), *fallback): file_id = suffix_map.get(suffix) if file_id is not None: return file_id diff --git a/src/codebase_index/indexer/freshness.py b/src/codebase_index/indexer/freshness.py index b70f6ea..fbdb678 100644 --- a/src/codebase_index/indexer/freshness.py +++ b/src/codebase_index/indexer/freshness.py @@ -15,6 +15,7 @@ from __future__ import annotations +import hashlib import subprocess from pathlib import Path @@ -47,25 +48,47 @@ def compute_freshness(conn, root: Path, config: Config) -> IndexFreshness: def _changed_count(conn, root: Path, config: Config) -> int: - """Added + removed + mtime-modified indexable files vs. the index.""" - current: dict[str, int] = {} + """Added + removed + content-modified indexable files vs. the index. 
def _sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at `path`.

    The file is opened in binary mode and consumed in 64 KiB reads, so the whole
    content is never held in memory at once. I/O errors propagate to the caller.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(65536)
            if not piece:  # empty read == end of file
                break
            digest.update(piece)
    return digest.hexdigest()
-On merge, the candidate carrying the most signal (symbol > fts > path) is kept -as the representative so downstream rerank/snippet logic has the richest fields. +RRF(d) = Σ_r w_r · k / (k + rank_r(d)) — robust to incomparable raw scores. + +Two deliberate departures from the textbook formula: + +* Scaled by k. Raw RRF tops out at w/k (≈0.017 for k=60), an order of magnitude + below the bounded bonuses the reranker layers on top, so rerank would silently + become the primary ranker and RRF a mere tiebreak. Multiplying by k is a pure + monotonic rescale (fusion order is identical) that lifts the top contribution to + ≈w, putting fused scores and rerank bonuses on the same O(1) scale. +* Fused on a coarse (path, line-bucket) key, not (path, start, end). Different + retrievers report different line ranges for the same place; an exact key almost + never coincides across sources, so cross-source agreement — RRF's whole point — + would never fire. `agreeing_sources` is therefore counted at file granularity. + +On merge, the candidate carrying the most signal (symbol > fts > path) is kept as +the representative so downstream rerank/snippet logic has the richest fields. """ from __future__ import annotations @@ -26,18 +39,24 @@ def fuse( ) -> list[Candidate]: accum: dict[tuple, float] = {} rep: dict[tuple, Candidate] = {} - agree: dict[tuple, set[str]] = {} + seen: set[tuple] = set() + file_sources: dict[str, set[str]] = {} for source, candidates in lists.items(): w = weights.get(source, 0.0) if w <= 0.0: continue for rank, cand in enumerate(candidates): - key = cand.key() - accum[key] = accum.get(key, 0.0) + w / (k + rank) - agree.setdefault(key, set()).add(source) + file_sources.setdefault(cand.path, set()).add(source) + key = cand.fuse_key() + # One contribution per source per locator: a file matching three FTS + # chunks in the same bucket is one lexical signal, not three. 
+ if (source, key) in seen: + continue + seen.add((source, key)) + accum[key] = accum.get(key, 0.0) + w * k / (k + rank) rep[key] = _richer(rep[key], cand) if key in rep else cand fused = [_replace(rep[key], score=score) for key, score in accum.items()] fused.sort(key=lambda c: c.score, reverse=True) - return [_replace(c, agreeing_sources=len(agree[c.key()])) for c in fused] + return [_replace(c, agreeing_sources=len(file_sources[c.path])) for c in fused] diff --git a/src/codebase_index/retrieval/pipeline.py b/src/codebase_index/retrieval/pipeline.py index 0264bc8..8a82096 100644 --- a/src/codebase_index/retrieval/pipeline.py +++ b/src/codebase_index/retrieval/pipeline.py @@ -22,6 +22,10 @@ _TERM_RE = re.compile(r"[A-Za-z0-9_]+") _RRF_K = 60 +# Max results kept per file before extras are pushed to the tail. Bucketed fusion +# already collapses co-located hits; this caps the long tail of one big file +# dominating the page so distinct files get surfaced. +_MAX_PER_FILE = 3 _KIND_ALIASES = { "method": "method", "methods": "method", @@ -67,9 +71,14 @@ def _confidence(ranked) -> Confidence: if not ranked: return Confidence.LOW top = ranked[0] + if top.score <= 0: + return Confidence.LOW if len(ranked) == 1: - return Confidence.MEDIUM if top.score > 0 else Confidence.LOW - gap = top.score - ranked[1].score + return Confidence.MEDIUM + # Relative gap, not absolute: scale-invariant, so it stays meaningful regardless + # of fusion's score magnitude. agreeing_sources is file-level (how many retrievers + # surfaced the winning file at all), the signal RRF agreement is meant to capture. 
def _diversify(ranked: list, *, per_file: int) -> list:
    """Cap how many hits one file may hold near the top, without dropping any.

    Walks `ranked` in order. The first `per_file` hits seen for each `path` keep
    their relative position; every later hit for that path is moved to the end,
    again in its original relative order. The result has the same elements as
    the input.
    """
    seen_per_path: dict[str, int] = {}
    head: list = []
    tail: list = []
    for hit in ranked:
        nth = seen_per_path.get(hit.path, 0) + 1
        seen_per_path[hit.path] = nth
        if nth > per_file:
            tail.append(hit)
        else:
            head.append(hit)
    return head + tail
scaled_budget = token_budget * fetch_limit // max(limit, 1) if offset > 0 else token_budget diff --git a/src/codebase_index/retrieval/rerank.py b/src/codebase_index/retrieval/rerank.py index 551c485..32a6ac0 100644 --- a/src/codebase_index/retrieval/rerank.py +++ b/src/codebase_index/retrieval/rerank.py @@ -10,6 +10,7 @@ import math import re +from ..discovery.classify import is_test_path from .types import Candidate, Intent _TERM_RE = re.compile(r"[A-Za-z0-9_]+") @@ -45,11 +46,19 @@ def rerank(candidates: list[Candidate], *, query: str, intent: Intent) -> list[C if c.in_degree: bonus += min(_DEGREE_CAP, math.log1p(c.in_degree) * _DEGREE_SCALE) reasons.append(f"{c.in_degree} callers") + elif c.ref_count: + # Precise in_degree is only computed for globally-unique symbol names + # (ambiguous names never resolve), so common names like `run`/`handle` + # always score 0. Fall back to a damped name-reference count — half the + # scale and cap — so centrality still breaks ties without overriding the + # precise signal where it exists. 
+ bonus += min(_DEGREE_CAP / 2, math.log1p(c.ref_count) * (_DEGREE_SCALE / 2)) + reasons.append(f"~{c.ref_count} refs by name") if intent is Intent.ARCHITECTURE and (c.in_degree + c.out_degree): bonus += min(_DEGREE_CAP, math.log1p(c.in_degree + c.out_degree) * (_DEGREE_SCALE / 2)) wants_tests = "test" in terms or "tests" in terms - if c.is_generated or (("test" in c.path.lower()) and not wants_tests): + if c.is_generated or (is_test_path(c.path) and not wants_tests): bonus -= 0.15 reasons.append("generated/test demoted") diff --git a/src/codebase_index/retrieval/searchers.py b/src/codebase_index/retrieval/searchers.py index 0cb0d00..138bf80 100644 --- a/src/codebase_index/retrieval/searchers.py +++ b/src/codebase_index/retrieval/searchers.py @@ -8,31 +8,24 @@ import re import sqlite3 -from dataclasses import dataclass from pathlib import Path from typing import Optional from ..config import Config from ..indexer.freshness import compute_freshness from ..models import ( - Confidence, GraphCoverage, IndexFreshness, - ReadRange, RefSite, RefsResponse, - Result, - SearchResponse, SymbolDef, SymbolResponse, ) -from ..output.redact import redact_snippet from ..storage import repo from .types import Candidate as M4Candidate _WORD_RE = re.compile(r"[A-Za-z0-9_]+") _CAMEL_RE = re.compile(r"[A-Z]+(?![a-z])|[A-Z]?[a-z0-9]+") -_SNIPPET_MAX_LINES = 18 def fts_candidates(conn: sqlite3.Connection, query: str, *, limit: int) -> list[M4Candidate]: @@ -142,6 +135,15 @@ def symbol_candidates( exact_symbol=exact, ) ) + + # Damped centrality fallback: symbols whose name is not globally unique never + # get a resolved in_degree, so back-fill a name-reference count for the zero ones. 
+ zero_deg = [c.symbol for c in out if not c.in_degree and c.symbol] + if zero_deg: + counts = repo.name_ref_counts(conn, zero_deg) + for c in out: + if not c.in_degree and c.symbol: + c.ref_count = counts.get(c.symbol, 0) return out @@ -161,18 +163,6 @@ def path_candidates(conn: sqlite3.Connection, query: str, *, limit: int) -> list return out -@dataclass -class Candidate: - chunk_id: int - path: str - line_start: int - line_end: int - content: str - token_est: int - bm25: float - kind: str = "window" - - def _subtokens(term: str) -> list[str]: parts: list[str] = [] for piece in term.split("_"): @@ -181,115 +171,30 @@ def _subtokens(term: str) -> list[str]: def build_match_query(query: str) -> str: + """Build the FTS5 MATCH expression for `query`. + + Each whitespace term expands to an OR group over the term and its + camelCase/snake_case subtokens; groups are AND-ed. Natural-language filler + ("how does X work") is dropped first: otherwise FTS would AND-in stopwords + that code chunks never contain, collapsing recall to zero on the very intents + (HOW_IT_WORKS / DEBUG_ERROR) that weight FTS highest. If *every* term is a + stopword we fall back to the full set rather than emit an empty match. + """ groups: list[str] = [] + salient: list[str] = [] for term in _WORD_RE.findall(query): variants = {term, *_subtokens(term)} variants = {v for v in variants if len(v) >= 2} if not variants: continue ored = " OR ".join(f'"{v}"' for v in sorted(variants, key=str.lower)) - groups.append(f"({ored})" if len(variants) > 1 else ored) - # FTS5 rejects implicit AND (space) when a group contains parenthesised OR - # expressions; explicit AND is required between all groups. 
- return " AND ".join(groups) - - -def fts_search(conn: sqlite3.Connection, query: str, *, limit: int) -> list[Candidate]: - match = build_match_query(query) - rows = repo.fts_search(conn, match, limit=limit) - return [ - Candidate( - chunk_id=r["chunk_id"], - path=r["path"], - line_start=r["line_start"], - line_end=r["line_end"], - content=r["content"], - token_est=r["token_est"], - bm25=r["bm25"], - kind=r.get("kind", "window"), # type: ignore[attr-defined] - ) - for r in rows - ] - - -def fts_response( - conn: sqlite3.Connection, - query: str, - *, - limit: int, - token_budget: int, - root: Path, - config: Optional[Config] = None, -) -> SearchResponse: - candidates = fts_search(conn, query, limit=limit) - results: list[Result] = [] - recommended: list[ReadRange] = [] - spent = 0 - - for rank, candidate in enumerate(candidates, start=1): - recommended.append( - ReadRange( - path=candidate.path, - line_start=candidate.line_start, - line_end=candidate.line_end, - ) - ) - snippet: Optional[str] = None - if spent + candidate.token_est <= token_budget: - snippet = redact_snippet(_trim(candidate.content)) - spent += candidate.token_est - results.append( - Result( - rank=rank, - path=candidate.path, - line_start=candidate.line_start, - line_end=candidate.line_end, - symbols=[], - score=round(1.0 / rank, 4), - reason="doc match" if candidate.kind == "doc" else "lexical match (bm25)", - snippet=snippet, - ) - ) - - confidence = _confidence(candidates) - return SearchResponse( - query=query, - intent="keyword", - index=_freshness(conn, root, config), - confidence=confidence, - results=results, - recommended_reads=recommended, - fallback_suggestions=_fallbacks(query) if confidence != "high" else {}, - ) - - -def _trim(content: str) -> str: - lines = content.splitlines() - if len(lines) <= _SNIPPET_MAX_LINES: - return content - return "\n".join(lines[:_SNIPPET_MAX_LINES]) + "\n..." 
- - -def _confidence(candidates: list[Candidate]) -> Confidence: - if not candidates: - return "low" - if len(candidates) == 1: - return "medium" - top, second = candidates[0], candidates[1] - gap = abs(second.bm25 - top.bm25) - n = len(candidates) - # Clear BM25 separation, or moderate gap with several corroborating results - if gap >= 2.0 or (gap >= 1.0 and n >= 3): - return "high" - if gap >= 0.3 or n >= 3: - return "medium" - return "low" - - -def _fallbacks(query: str) -> dict[str, list[str]]: - terms = _WORD_RE.findall(query) - primary = terms[0] if terms else query - return {"ripgrep": [f'rg -n "{primary}"', f'rg -ni "{primary}"']} + # FTS5 rejects implicit AND (space) when a group contains parenthesised OR + # expressions; explicit AND is required between all groups. + group = f"({ored})" if len(variants) > 1 else ored + groups.append(group) + if term.lower() not in _SYMBOL_STOPWORDS: + salient.append(group) + return " AND ".join(salient or groups) def _freshness( diff --git a/src/codebase_index/retrieval/types.py b/src/codebase_index/retrieval/types.py index 6f184a9..3fcf71d 100644 --- a/src/codebase_index/retrieval/types.py +++ b/src/codebase_index/retrieval/types.py @@ -6,6 +6,11 @@ from enum import Enum from typing import Optional +# Line window used by Candidate.fuse_key to group co-located hits across retrievers. +# Wide enough to merge a symbol body and the FTS window that overlaps it, narrow +# enough to keep distinct regions of a large file separate. +_FUSE_BUCKET_LINES = 40 + class Intent(str, Enum): LOCATE_IMPL = "locate_impl" @@ -39,6 +44,7 @@ class Candidate: token_est: int = 0 in_degree: int = 0 out_degree: int = 0 + ref_count: int = 0 is_generated: bool = False exact_symbol: bool = False reason: str = "" @@ -47,6 +53,19 @@ class Candidate: def key(self) -> tuple[str, int, int]: return (self.path, self.line_start, self.line_end) + def fuse_key(self) -> tuple[str, int]: + """Coarse locator for RRF fusion: path + line bucket. 
def peek_schema_version(path: Path | str) -> int:
    """Read meta.schema_version without applying schema or running the guard.

    Returns 0 when the file, the meta table, or the key is absent, when the
    database cannot be read, or when the stored value is NULL or not an
    integer. Callers compare the result with ``SCHEMA_VERSION``; since 0 is
    below every real version, a missing or unreadable index is treated the same
    as an outdated one.
    """
    p = Path(path)
    if not p.exists():
        return 0
    try:
        conn = sqlite3.connect(p)
        try:
            row = conn.execute(
                "SELECT value FROM meta WHERE key = 'schema_version'"
            ).fetchone()
        finally:
            conn.close()
    except (sqlite3.Error, OSError):
        # Not a database, missing meta table, locked file, etc.
        return 0
    if row is None or row[0] is None:
        return 0
    try:
        return int(row[0])
    except (TypeError, ValueError):
        # A value that is not integer-like counts as "unreadable", not a crash.
        return 0
def name_ref_counts(conn: sqlite3.Connection, names: Sequence[str]) -> dict[str, int]:
    """Count edges targeting each name (any resolution state), keyed by dst_name.

    A damped centrality proxy for symbols whose precise in_degree is 0 because their
    name is not globally unique (ambiguous edges never resolve). Over-counts across
    same-named symbols by design — it is only used as a weak tiebreak fallback.

    Empty names and duplicates in `names` are ignored. Names with no matching
    edge are absent from the result. The lookup is issued in batches so the
    number of bound parameters per statement stays below SQLite's
    host-parameter limit however many names are passed.
    """
    uniq = [n for n in dict.fromkeys(names) if n]
    batch_size = 500  # comfortably under SQLite's historical 999-variable default
    counts: dict[str, int] = {}
    for start in range(0, len(uniq), batch_size):
        batch = uniq[start : start + batch_size]
        placeholders = ",".join("?" * len(batch))
        rows = conn.execute(
            f"SELECT dst_name, COUNT(*) AS c FROM edges "
            f"WHERE dst_name IN ({placeholders}) GROUP BY dst_name",
            tuple(batch),
        )
        # Positional access works for both sqlite3.Row and plain tuple rows.
        # `uniq` is de-duplicated, so a name can only appear in one batch.
        counts.update((row[0], int(row[1])) for row in rows)
    return counts
CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN INSERT INTO fts_chunks(rowid, content, symbol_names, path) - VALUES (new.id, new.content, '', (SELECT path FROM files WHERE id = new.file_id)); + VALUES (new.id, new.content, new.symbol_names, (SELECT path FROM files WHERE id = new.file_id)); END; CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN INSERT INTO fts_chunks(fts_chunks, rowid, content, symbol_names, path) - VALUES ('delete', old.id, old.content, '', ''); + VALUES ('delete', old.id, old.content, old.symbol_names, ''); END; CREATE TRIGGER IF NOT EXISTS chunks_au AFTER UPDATE ON chunks BEGIN INSERT INTO fts_chunks(fts_chunks, rowid, content, symbol_names, path) - VALUES ('delete', old.id, old.content, '', ''); + VALUES ('delete', old.id, old.content, old.symbol_names, ''); INSERT INTO fts_chunks(rowid, content, symbol_names, path) - VALUES (new.id, new.content, '', (SELECT path FROM files WHERE id = new.file_id)); + VALUES (new.id, new.content, new.symbol_names, (SELECT path FROM files WHERE id = new.file_id)); END; -- vec_chunks (sqlite-vec) is created at runtime ONLY when embeddings.enabled = true. 
diff --git a/tests/golden/mcp_search_code.json b/tests/golden/mcp_search_code.json index 7f81862..d213911 100644 --- a/tests/golden/mcp_search_code.json +++ b/tests/golden/mcp_search_code.json @@ -16,36 +16,6 @@ "line_end": 6, "line_start": 4, "path": "src/auth/token.py" - }, - { - "line_end": 5, - "line_start": 4, - "path": "src/auth/token.py" - }, - { - "line_end": 11, - "line_start": 9, - "path": "src/auth/token.py" - }, - { - "line_end": 2, - "line_start": 1, - "path": "src/auth/token.py" - }, - { - "line_end": 10, - "line_start": 9, - "path": "src/auth/token.py" - }, - { - "line_end": 1, - "line_start": 1, - "path": "src/auth/token.py" - }, - { - "line_end": 5, - "line_start": 1, - "path": "src/api/service.py" } ], "results": [ @@ -55,89 +25,23 @@ "path": "src/auth/token.py", "rank": 1, "reason": "in src/auth/ · 2 callers", - "score": 0.1596, + "score": 2.233, "snippet": "def refresh_access_token(refresh_token: str) -> str:", "symbols": [ "refresh_access_token" ], "token_est": 13 }, - { - "line_end": 5, - "line_start": 4, - "path": "src/auth/token.py", - "rank": 2, - "reason": "in src/auth/", - "score": 0.0664, - "snippet": "Exchange a refresh token for a new access token.", - "symbols": [], - "token_est": 12 - }, - { - "line_end": 11, - "line_start": 9, - "path": "src/auth/token.py", - "rank": 3, - "reason": "in src/auth/", - "score": 0.0661, - "snippet": "def login(refresh_token: str) -> str:\n \"\"\"Calls refresh_access_token so refs/impact tests have an edge.\"\"\"\n return refresh_access_token(refresh_token)", - "symbols": [], - "token_est": 39 - }, - { - "line_end": 2, - "line_start": 1, - "path": "src/auth/token.py", - "rank": 4, - "reason": "in src/auth/", - "score": 0.0659, - "snippet": "\"\"\"Token helpers (fixture).\"\"\"\n", - "symbols": [], - "token_est": 8 - }, - { - "line_end": 10, - "line_start": 9, - "path": "src/auth/token.py", - "rank": 5, - "reason": "in src/auth/", - "score": 0.0652, - "snippet": "Calls refresh_access_token so refs/impact 
tests have an edge.", - "symbols": [], - "token_est": 15 - }, - { - "line_end": 1, - "line_start": 1, - "path": "src/auth/token.py", - "rank": 6, - "reason": "in src/auth/", - "score": 0.0583, - "snippet": null, - "symbols": [], - "token_est": 0 - }, { "line_end": 11, "line_start": 7, "path": "src/api/service.py", - "rank": 7, + "rank": 2, "reason": "fts", - "score": 0.0156, + "score": 0.9375, "snippet": "class AdminUser(User):\n \"\"\"Subclass of User; imported-from edge target for impact tests.\"\"\"\n\n def renew(self, refresh_token: str) -> str:\n return refresh_access_token(refresh_token)", "symbols": [], "token_est": 48 - }, - { - "line_end": 5, - "line_start": 1, - "path": "src/api/service.py", - "rank": 8, - "reason": "fts", - "score": 0.0154, - "snippet": "\"\"\"Service layer (fixture) - exercises cross-file edges for impact tests.\"\"\"\n\nfrom auth.token import refresh_access_token\nfrom models.user import User\n", - "symbols": [], - "token_est": 38 } ], "schema_version": 1, diff --git a/tests/golden/search_token.json b/tests/golden/search_token.json index 732f6c4..a04d5fe 100644 --- a/tests/golden/search_token.json +++ b/tests/golden/search_token.json @@ -16,36 +16,6 @@ "line_end": 6, "line_start": 4, "path": "src/auth/token.py" - }, - { - "line_end": 5, - "line_start": 4, - "path": "src/auth/token.py" - }, - { - "line_end": 11, - "line_start": 9, - "path": "src/auth/token.py" - }, - { - "line_end": 2, - "line_start": 1, - "path": "src/auth/token.py" - }, - { - "line_end": 10, - "line_start": 9, - "path": "src/auth/token.py" - }, - { - "line_end": 1, - "line_start": 1, - "path": "src/auth/token.py" - }, - { - "line_end": 5, - "line_start": 1, - "path": "src/api/service.py" } ], "results": [ @@ -55,89 +25,23 @@ "path": "src/auth/token.py", "rank": 1, "reason": "in src/auth/ · 2 callers", - "score": 0.1596, + "score": 2.233, "snippet": "def refresh_access_token(refresh_token: str) -> str:", "symbols": [ "refresh_access_token" ], "token_est": 13 }, - { - 
"line_end": 5, - "line_start": 4, - "path": "src/auth/token.py", - "rank": 2, - "reason": "in src/auth/", - "score": 0.0664, - "snippet": "Exchange a refresh token for a new access token.", - "symbols": [], - "token_est": 12 - }, - { - "line_end": 11, - "line_start": 9, - "path": "src/auth/token.py", - "rank": 3, - "reason": "in src/auth/", - "score": 0.0661, - "snippet": "def login(refresh_token: str) -> str:\n \"\"\"Calls refresh_access_token so refs/impact tests have an edge.\"\"\"\n return refresh_access_token(refresh_token)", - "symbols": [], - "token_est": 39 - }, - { - "line_end": 2, - "line_start": 1, - "path": "src/auth/token.py", - "rank": 4, - "reason": "in src/auth/", - "score": 0.0659, - "snippet": "\"\"\"Token helpers (fixture).\"\"\"\n", - "symbols": [], - "token_est": 8 - }, - { - "line_end": 10, - "line_start": 9, - "path": "src/auth/token.py", - "rank": 5, - "reason": "in src/auth/", - "score": 0.0652, - "snippet": "Calls refresh_access_token so refs/impact tests have an edge.", - "symbols": [], - "token_est": 15 - }, - { - "line_end": 1, - "line_start": 1, - "path": "src/auth/token.py", - "rank": 6, - "reason": "in src/auth/", - "score": 0.0583, - "snippet": null, - "symbols": [], - "token_est": 0 - }, { "line_end": 11, "line_start": 7, "path": "src/api/service.py", - "rank": 7, + "rank": 2, "reason": "fts", - "score": 0.0156, + "score": 0.9375, "snippet": "class AdminUser(User):\n \"\"\"Subclass of User; imported-from edge target for impact tests.\"\"\"\n\n def renew(self, refresh_token: str) -> str:\n return refresh_access_token(refresh_token)", "symbols": [], "token_est": 48 - }, - { - "line_end": 5, - "line_start": 1, - "path": "src/api/service.py", - "rank": 8, - "reason": "fts", - "score": 0.0154, - "snippet": "\"\"\"Service layer (fixture) - exercises cross-file edges for impact tests.\"\"\"\n\nfrom auth.token import refresh_access_token\nfrom models.user import User\n", - "symbols": [], - "token_est": 38 } ] } diff --git 
a/tests/test_classify.py b/tests/test_classify.py index 87aa43c..547e518 100644 --- a/tests/test_classify.py +++ b/tests/test_classify.py @@ -4,6 +4,7 @@ detect_language, is_generated, is_secret_filename, + is_test_path, looks_binary, parser_for, ) @@ -65,4 +66,28 @@ def test_binary_detection(): def test_generated_detection(): assert is_generated("src/schema.generated.ts") assert is_generated("web/app.min.js") - assert not is_generated("web/app.ts") \ No newline at end of file + assert not is_generated("web/app.ts") + + +def test_is_test_path_matches_test_trees_and_modules(): + for path in [ + "tests/test_auth.py", + "src/__tests__/user.test.ts", + "pkg/foo_test.go", + "app/user.spec.ts", + "e2e/login.py", + "project/test/Thing.java", + ]: + assert is_test_path(path), path + + +def test_is_test_path_does_not_match_substring_lookalikes(): + # Word-boundary, not bare substring: these contain "test" but are not tests. + for path in [ + "src/contest/leaderboard.py", + "lib/latest.py", + "util/fastest_path.ts", + "web/testimonials.tsx", + "src/attestation.py", + ]: + assert not is_test_path(path), path \ No newline at end of file diff --git a/tests/test_freshness.py b/tests/test_freshness.py index 7a7ee89..af83cac 100644 --- a/tests/test_freshness.py +++ b/tests/test_freshness.py @@ -30,18 +30,40 @@ def test_freshly_built_index_is_not_stale(sample_repo, tmp_path): db.close() -def test_edited_file_makes_index_stale(sample_repo, tmp_path, monkeypatch): - """An indexed file whose mtime advanced past the build is counted as changed.""" +def test_edited_file_content_makes_index_stale(sample_repo, tmp_path): + """A file whose indexed content (sha256) no longer matches disk is stale.""" cfg, db = _indexed(sample_repo, tmp_path) from codebase_index.storage import repo - indexed = repo.path_mtimes(db.conn) + indexed = repo.fingerprints(db.conn) a_path = next(iter(indexed)) - repo.set_meta(db.conn, "head_commit", "deadbeef") - db.conn.execute("UPDATE files SET mtime_ns = 1 WHERE 
path = ?", (a_path,)) + repo.set_meta(db.conn, "head_commit", "deadbeef") # force the accurate (non-git) path + # Corrupt the stored fingerprint so the on-disk content hashes differently; + # mtime is bumped so the (mtime,size) fast-equal check can't short-circuit. + db.conn.execute( + "UPDATE files SET mtime_ns = 1, sha256 = 'stale-sha' WHERE path = ?", (a_path,) + ) db.conn.commit() fr = compute_freshness(db.conn, sample_repo, cfg) assert fr.stale is True assert fr.files_changed_since_build >= 1 db.close() + + +def test_touch_without_content_change_is_not_stale(sample_repo, tmp_path): + """A bare mtime bump with unchanged bytes is a no-op for update_index, so + freshness must not flag it as stale (it mirrors the sha-based decision).""" + cfg, db = _indexed(sample_repo, tmp_path) + + from codebase_index.storage import repo + indexed = repo.fingerprints(db.conn) + a_path = next(iter(indexed)) + repo.set_meta(db.conn, "head_commit", "deadbeef") # force the accurate (non-git) path + db.conn.execute("UPDATE files SET mtime_ns = 1 WHERE path = ?", (a_path,)) + db.conn.commit() + + fr = compute_freshness(db.conn, sample_repo, cfg) + assert fr.stale is False + assert fr.files_changed_since_build == 0 + db.close() diff --git a/tests/test_fusion.py b/tests/test_fusion.py index c7e30aa..693068f 100644 --- a/tests/test_fusion.py +++ b/tests/test_fusion.py @@ -34,3 +34,33 @@ def test_vector_source_participates_in_fusion(): vec = [Candidate(path="v.py", line_start=1, line_end=2, source="vector", score=0.9)] fused = fuse({"vector": vec}, weights={"vector": 1.0}, k=60) assert fused and fused[0].path == "v.py" + + +def test_fuse_merges_co_located_hits_across_line_ranges(): + """Different retrievers report different line ranges for the same place; fusion + buckets line_start so co-located cross-source hits still reinforce each other.""" + fts = [Candidate(path="a.py", line_start=10, line_end=80, source="fts", score=0.9)] + sym = [Candidate(path="a.py", line_start=12, line_end=20, 
source="symbol", score=0.8)] + fused = fuse({"fts": fts, "symbol": sym}, weights={"fts": 1.0, "symbol": 1.0}, k=60) + assert len(fused) == 1 # merged despite differing ranges + assert fused[0].agreeing_sources == 2 # file-level agreement counted + + +def test_fuse_scores_are_order_one(): + """RRF is rescaled by k so the top contribution is ~weight (≈1), not ~w/k (≈0.017), + keeping fused scores on the same scale as the reranker's bounded bonuses.""" + fts = [Candidate(path="a.py", line_start=1, line_end=2, source="fts", score=0.9)] + fused = fuse({"fts": fts}, weights={"fts": 1.0}, k=60) + assert 0.9 < fused[0].score <= 1.0 + + +def test_fuse_dedupes_repeated_source_hits_in_one_bucket(): + """Three FTS chunks of the same file/bucket are one lexical signal, not three.""" + fts = [ + Candidate(path="a.py", line_start=1, line_end=10, source="fts", score=0.9), + Candidate(path="a.py", line_start=11, line_end=20, source="fts", score=0.8), + Candidate(path="a.py", line_start=21, line_end=30, source="fts", score=0.7), + ] + fused = fuse({"fts": fts}, weights={"fts": 1.0}, k=60) + assert len(fused) == 1 + assert fused[0].score <= 1.0 # single best-rank contribution, not summed 3x diff --git a/tests/test_rerank.py b/tests/test_rerank.py index 08fe1c5..208557f 100644 --- a/tests/test_rerank.py +++ b/tests/test_rerank.py @@ -40,6 +40,31 @@ def test_in_degree_bonus_is_sublinear_and_capped(): assert scores[2] < 2 * scores[1] # 100 callers nowhere near 10x of 10 +def test_ref_count_is_damped_fallback_when_in_degree_zero(): + """A symbol with no resolved in_degree (ambiguous name) still gets a small + centrality nudge from its name-reference count — but capped below the precise + in_degree bonus so it can never override real callers.""" + no_signal = _c("a.py", "symbol", 0.0) + by_name = _c("b.py", "symbol", 0.0, ref_count=50) + rerank([no_signal, by_name], query="zzz", intent=Intent.KEYWORD) + assert by_name.score > no_signal.score + assert by_name.score <= _DEGREE_CAP / 2 + 
1e-9 # damped: half the cap + + # Precise in_degree, when present, takes precedence over the name-based proxy. + precise = _c("c.py", "symbol", 0.0, in_degree=3) + proxy = _c("d.py", "symbol", 0.0, ref_count=3) + rerank([precise, proxy], query="zzz", intent=Intent.KEYWORD) + assert precise.score > proxy.score + + +def test_contest_path_is_not_demoted_as_test(): + """The test demotion is word-boundary aware: 'contest' is not a test path.""" + contest = _c("src/contest/board.py", "fts", 0.5) + real_test = _c("tests/test_board.py", "fts", 0.5) + rerank([contest, real_test], query="board", intent=Intent.KEYWORD) + assert contest.score > real_test.score + + def test_god_class_does_not_outrank_relevant_match_on_stray_term(): """A high-in_degree 'god class' that matches only a stray term must not float above a genuinely relevant (name/path) match with a slightly lower base score. diff --git a/tests/test_search_cli.py b/tests/test_search_cli.py index d0896a4..905891a 100644 --- a/tests/test_search_cli.py +++ b/tests/test_search_cli.py @@ -115,7 +115,9 @@ def test_search_reports_stale_after_edit(sample_repo, tmp_path, monkeypatch): db_path = sample_repo / ".claude" / "cache" / "codebase-index" / "index.sqlite" conn = sqlite3.connect(str(db_path)) - conn.execute("UPDATE files SET mtime_ns = 1") + # Freshness is content-aware (sha), not mtime-only: corrupt the stored hash so + # the recomputed on-disk content no longer matches the index → genuinely stale. 
+ conn.execute("UPDATE files SET mtime_ns = 1, sha256 = 'stale-sha'") conn.execute("DELETE FROM meta WHERE key = 'head_commit'") conn.commit() conn.close() @@ -147,7 +149,9 @@ def test_explain_reports_stale_after_edit(sample_repo): db_path = sample_repo / ".claude" / "cache" / "codebase-index" / "index.sqlite" conn = sqlite3.connect(str(db_path)) - conn.execute("UPDATE files SET mtime_ns = 1") + # Freshness is content-aware (sha), not mtime-only: corrupt the stored hash so + # the recomputed on-disk content no longer matches the index → genuinely stale. + conn.execute("UPDATE files SET mtime_ns = 1, sha256 = 'stale-sha'") conn.execute("DELETE FROM meta WHERE key = 'head_commit'") conn.commit() conn.close() diff --git a/tests/test_searchers.py b/tests/test_searchers.py index 05772f3..285337d 100644 --- a/tests/test_searchers.py +++ b/tests/test_searchers.py @@ -1,8 +1,22 @@ from codebase_index.retrieval.searchers import ( - fts_candidates, path_candidates, symbol_candidates, + build_match_query, fts_candidates, path_candidates, symbol_candidates, ) +def test_build_match_query_drops_stopwords(): + # Natural-language filler must not be AND-ed into the match (it kills recall). + q = build_match_query("how does authentication work") + assert "how" not in q.lower() and "does" not in q.lower() + assert "authentication" in q.lower() and "work" in q.lower() + assert " AND " in q # salient terms are still AND-ed together + + +def test_build_match_query_falls_back_when_all_stopwords(): + # If every term is a stopword we must still emit a (non-empty) match, not "". 
+ q = build_match_query("how does it") + assert q.strip() != "" + + def test_fts_candidates_uniform_shape(seeded_index): cands = fts_candidates(seeded_index.conn, "token", limit=10) assert cands and all(c.source == "fts" for c in cands) diff --git a/tests/test_storage.py b/tests/test_storage.py index 92d1a8c..c43effa 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -269,6 +269,99 @@ def test_replace_chunks_with_symbol_ids(tmp_path): db.close() +def test_chunk_symbol_names_populated_and_searchable(tmp_path): + """The chunk's symbol name is denormalized into symbol_names and indexed by FTS, + so a query matching only the symbol name (not the body text) still hits.""" + db = _open(tmp_path) + fid = repo.upsert_file( + db.conn, path="m.py", lang="python", size_bytes=1, sha256="h", mtime_ns=1, + git_status=None, parser="treesitter", indexed_at="t", is_generated=False, + ) + sids = repo.replace_symbols( + db.conn, fid, + [Symbol(name="refresh_access_token", kind="function", + line_start=1, line_end=2, qualified="refresh_access_token")], + ) + # Body text deliberately omits the symbol name, isolating the symbol_names column. + repo.replace_chunks( + db.conn, fid, + [Chunk(line_start=1, line_end=2, content="def f():\n return 1", + token_est=3, kind="symbol_body", symbol_index=0)], + symbol_ids=sids, + ) + assert repo.chunks_for_file(db.conn, fid)[0]["symbol_names"] == "refresh_access_token" + hit = repo.fts_search(db.conn, "refresh_access_token", limit=10) + assert len(hit) == 1 and hit[0]["path"] == "m.py" + db.close() + + +def test_chunk_symbol_names_delete_keeps_fts_consistent(tmp_path): + """External-content FTS corrupts if a delete replays the wrong indexed value. 
+ Replacing chunks (and cascading symbol deletes) must leave a consistent index.""" + db = _open(tmp_path) + fid = repo.upsert_file( + db.conn, path="m.py", lang="python", size_bytes=1, sha256="h", mtime_ns=1, + git_status=None, parser="treesitter", indexed_at="t", is_generated=False, + ) + sids = repo.replace_symbols( + db.conn, fid, + [Symbol(name="alpha_symbol", kind="function", line_start=1, line_end=2, + qualified="alpha_symbol")], + ) + repo.replace_chunks( + db.conn, fid, + [Chunk(line_start=1, line_end=2, content="body one", token_est=2, + kind="symbol_body", symbol_index=0)], + symbol_ids=sids, + ) + assert repo.fts_search(db.conn, "alpha_symbol", limit=10) + + # Re-index the file: deletes the old symbol (cascades chunk.symbol_id -> NULL) + # and old chunk, inserts a fresh one with a different symbol name. + sids2 = repo.replace_symbols( + db.conn, fid, + [Symbol(name="beta_symbol", kind="function", line_start=1, line_end=2, + qualified="beta_symbol")], + ) + repo.replace_chunks( + db.conn, fid, + [Chunk(line_start=1, line_end=2, content="body two", token_est=2, + kind="symbol_body", symbol_index=0)], + symbol_ids=sids2, + ) + assert repo.fts_search(db.conn, "alpha_symbol", limit=10) == [] # old name gone + assert repo.fts_search(db.conn, "beta_symbol", limit=10) # new name present + assert db.conn.execute("PRAGMA integrity_check").fetchone()[0] == "ok" + db.close() + + +def test_name_ref_counts(tmp_path): + db = _open(tmp_path) + fid = repo.upsert_file( + db.conn, path="m.py", lang="python", size_bytes=1, sha256="h", mtime_ns=1, + git_status=None, parser="treesitter", indexed_at="t", is_generated=False, + ) + sids = repo.replace_symbols( + db.conn, fid, + [Symbol(name="caller", kind="function", line_start=1, line_end=2, qualified="caller")], + ) + repo.replace_edges( + db.conn, fid, + [ + {"edge_type": "call", "src_kind": "symbol", "src_id": sids[0], + "dst_kind": None, "dst_id": None, "dst_name": "run", "line": 1, "resolved": 0}, + {"edge_type": "call", 
"src_kind": "symbol", "src_id": sids[0], + "dst_kind": None, "dst_id": None, "dst_name": "run", "line": 2, "resolved": 0}, + {"edge_type": "call", "src_kind": "symbol", "src_id": sids[0], + "dst_kind": None, "dst_id": None, "dst_name": "once", "line": 3, "resolved": 0}, + ], + ) + counts = repo.name_ref_counts(db.conn, ["run", "once", "absent"]) + assert counts == {"run": 2, "once": 1} + assert repo.name_ref_counts(db.conn, []) == {} + db.close() + + def test_replace_edges_and_refs_for_name(tmp_path): db = _open(tmp_path) fid = repo.upsert_file(