From c5027707704f61a4726993cf133b6c5584f861f1 Mon Sep 17 00:00:00 2001 From: dvcdsys Date: Mon, 27 Apr 2026 21:50:36 +0100 Subject: [PATCH 1/6] feat(server/chunker): expand to 30+ languages + parse-budget guard + bash regex fallback Three related chunker improvements: 1. Language registry expansion. defaultRegistry now ships 30 languages (Tier A: Go/Python/JS/TS/TSX/Java/C/C++/Ruby/PHP/Rust; Tier B: C#/Swift/ Kotlin/Scala/Bash/Lua/Dart/R/ObjC/HTML/CSS/SCSS/SQL/Markdown/Zig/Julia /Fortran/Haskell/OCaml). Configurable via CIX_LANGUAGES. See doc/LANGUAGES.md. 2. Tree-sitter parse-budget guard. Bash grammar exhibits catastrophic backtracking on real-world scripts (31s on 7KB install.sh). Added parseBudget=2s with twin guards: SetTimeoutMicros + SetCancellationFlag armed by time.AfterFunc. On budget exceeded, falls back to chunkFallback(). 3. Bash regex fallback. New bashRegexChunks() recognises the three common bash function forms (POSIX `name() {`, keyword `function name {`, with or without parens) and finds each function's closing brace via a state machine that handles single/double strings, comments, heredocs (<<EOF and <<-EOF). --- doc/LANGUAGES.md | 75 +++ server/cmd/cix-server/main.go | 4 + server/internal/chunker/bash_regex.go | 318 ++++++++++ server/internal/chunker/bash_regex_test.go | 341 +++++++++++ server/internal/chunker/chunker.go | 558 +++++++++++++++--- server/internal/chunker/chunker_test.go | 398 +++++++++++++ server/internal/config/config.go | 14 + server/internal/langdetect/langdetect.go | 3 +- server/internal/langdetect/langdetect_test.go | 4 +- 9 files changed, 1640 insertions(+), 75 deletions(-) create mode 100644 doc/LANGUAGES.md create mode 100644 server/internal/chunker/bash_regex.go create mode 100644 server/internal/chunker/bash_regex_test.go diff --git a/doc/LANGUAGES.md b/doc/LANGUAGES.md new file mode 100644 index 0000000..70ef5c4 --- /dev/null +++ b/doc/LANGUAGES.md @@ -0,0 +1,75 @@ +# Supported languages + +cix uses tree-sitter (via 
`github.com/odvcencio/gotreesitter`) to extract semantic chunks (functions, classes, methods, types) from source code. Files in unsupported languages still get indexed via a sliding-window fallback — they're searchable, just without per-symbol granularity. + +## Default language set (30) + +| ID | gotreesitter factory | Function | Class | Method | Type | +|---|---|:-:|:-:|:-:|:-:| +| `python` | `PythonLanguage` | ✓ | ✓ | | | +| `typescript` | `TypescriptLanguage` | ✓ | ✓ | ✓ | ✓ | +| `tsx` | `TsxLanguage` | ✓ | ✓ | ✓ | ✓ | +| `javascript` | `JavascriptLanguage` | ✓ | ✓ | ✓ | | +| `go` | `GoLanguage` | ✓ | | ✓ | ✓ | +| `rust` | `RustLanguage` | ✓ | ✓ | | ✓ | +| `java` | `JavaLanguage` | ✓ | ✓ | | ✓ | +| `c` | `CLanguage` | ✓ | ✓ | | ✓ | +| `cpp` | `CppLanguage` | ✓ | ✓ | | ✓ | +| `c_sharp` | `CSharpLanguage` | ✓ | ✓ | ✓ | ✓ | +| `ruby` | `RubyLanguage` | ✓ | ✓ | | | +| `php` | `PhpLanguage` | ✓ | ✓ | ✓ | ✓ | +| `swift` | `SwiftLanguage` | ✓ | ✓ | | ✓ | +| `kotlin` | `KotlinLanguage` | ✓ | ✓ | | | +| `scala` | `ScalaLanguage` | ✓ | ✓ | | ✓ | +| `bash` | `BashLanguage` | ✓ | | | | +| `lua` | `LuaLanguage` | ✓ | | | | +| `dart` | `DartLanguage` | ✓ | ✓ | ✓ | ✓ | +| `r` | `RLanguage` | ✓ | | | | +| `objc` | `ObjcLanguage` | ✓ | ✓ | ✓ | ✓ | +| `html` | `HtmlLanguage` | | | | ✓ | +| `css` | `CssLanguage` | | ✓ | | | +| `scss` | `ScssLanguage` | ✓ | ✓ | | | +| `sql` | `SqlLanguage` | ✓ | | | ✓ | +| `markdown` | `MarkdownLanguage` | | | | ✓ | +| `zig` | `ZigLanguage` | ✓ | ✓ | | | +| `julia` | `JuliaLanguage` | ✓ | | | | +| `fortran` | `FortranLanguage` | ✓ | ✓ | | | +| `haskell` | `HaskellLanguage` | ✓ | | | ✓ | +| `ocaml` | `OcamlLanguage` | ✓ | ✓ | | ✓ | + +The exact AST node types per language live in `server/internal/chunker/chunker.go` (`defaultRegistry`). File-extension mapping lives in `server/internal/langdetect/langdetect.go`. + +## Configuring the active set + +`CIX_LANGUAGES` (comma-separated, case-insensitive) restricts the active set. 
Empty / unset = all defaults. + +```bash +# Only index Python and Go — every other language falls to sliding-window +CIX_LANGUAGES=python,go cix-server + +# Add Rust to the trio +CIX_LANGUAGES="python, go, rust" cix-server +``` + +Unknown IDs are logged at startup and ignored — typos won't crash the server. + +The active set is logged at INFO during startup: + +``` +{"level":"INFO","msg":"chunker languages configured","active":["python","go","rust"]} +``` + +## Languages with extension detection but no grammar + +These produce sliding-window chunks. Adding semantic chunking is a one-map-entry addition in `defaultRegistry`. Candidates: + +`erlang, elixir, commonlisp, svelte, graphql, hcl (terraform), cmake, dockerfile, regex, xml, make` + +PRs welcome — verify node names with `gotreesitter`'s `cmd/tsquery` against a representative fixture before adding. + +## How the chunker decides + +1. `langdetect.Detect(filePath)` maps extension/filename → language ID. +2. `chunker.ChunkFile()` looks up the ID in the active registry. +3. If found and its `languageNodes` map is non-empty → AST-based extraction (function/class/method/type chunks + identifier references). +4. Otherwise → sliding-window chunks of `windowSize=4000` bytes with `overlap=500`. 
diff --git a/server/cmd/cix-server/main.go b/server/cmd/cix-server/main.go index cbb9ad6..7364400 100644 --- a/server/cmd/cix-server/main.go +++ b/server/cmd/cix-server/main.go @@ -15,6 +15,7 @@ import ( "syscall" "time" + "github.com/dvcdsys/code-index/server/internal/chunker" "github.com/dvcdsys/code-index/server/internal/config" "github.com/dvcdsys/code-index/server/internal/db" "github.com/dvcdsys/code-index/server/internal/embeddings" @@ -73,6 +74,9 @@ func run() error { logger.Warn("CIX_API_KEY is empty — authenticated endpoints are reachable without auth (dev mode)") } + chunker.Configure(cfg.Languages) + logger.Info("chunker languages configured", "active", chunker.SupportedLanguages()) + dbPath := cfg.DynamicSQLitePath() logger.Info("opening database", "path", dbPath) database, err := db.Open(dbPath) diff --git a/server/internal/chunker/bash_regex.go b/server/internal/chunker/bash_regex.go new file mode 100644 index 0000000..53c5006 --- /dev/null +++ b/server/internal/chunker/bash_regex.go @@ -0,0 +1,318 @@ +// Package chunker — regex-based bash function extractor. +// +// Used as a fallback when tree-sitter-bash hits a parse pathology (see +// parseBudget in chunker.go). Tree-sitter would have given us better symbol +// data, but on a 7KB install.sh-style script its parser can spend 30 seconds +// on catastrophic backtracking. The regex extractor below recognises the +// three common bash function forms and finds each function's closing brace +// with a small state machine that handles strings, comments, and heredocs. +// +// Output schema matches chunkWithTreesitter so the indexer's downstream code +// (DB upserts, vector embeddings) doesn't need to special-case bash. +// +// Limitations vs full tree-sitter parse: +// - No reference extraction (returns nil refs). +// - Functions with a `{` on a line *separate* from the opener (`name()` on +// one line, `{` on the next) are not matched. 
That form is legal in +// bash but rare in practice; falls back to sliding-window for those. +// - Comments containing `{`/`}` inside strings can confuse the brace +// counter on adversarial inputs; bounded by maxBashFuncLines so a +// malformed function never absorbs the whole file. + +package chunker + +import ( + "regexp" + "strings" +) + +// posixFuncRE matches the POSIX-shell style: `name() { ...`. +// Captures group 1 = function name. The trailing `{` must be on the same line. +var posixFuncRE = regexp.MustCompile( + `^[[:space:]]*([A-Za-z_][A-Za-z0-9_:.-]*)[[:space:]]*\(\)[[:space:]]*\{`) + +// bashFuncRE matches the bash-keyword style: `function name [()] { ...`. +// Captures group 1 = function name. +var bashFuncRE = regexp.MustCompile( + `^[[:space:]]*function[[:space:]]+([A-Za-z_][A-Za-z0-9_:.-]*)(?:[[:space:]]*\(\))?[[:space:]]*\{`) + +// maxBashFuncLines caps how far we'll scan for a function's closing `}`. +// Real-world bash functions rarely exceed ~200 lines. The cap protects +// against pathological inputs where the brace counter goes off-track — +// instead of consuming the whole file as one function, we stop and let the +// caller decide what to do (typically: keep a partial chunk, fall back +// to sliding-window for the remainder). +const maxBashFuncLines = 500 + +// bashRegexChunks extracts function-level chunks from bash source via regex. +// Returns nil when no functions were found, signalling the caller to fall +// through to sliding-window. Always returns nil refs (the regex doesn't +// track identifier usage). 
+func bashRegexChunks(filePath, content string) []Chunk { + lines := splitLines(content) + if len(lines) == 0 { + return nil + } + + var chunks []Chunk + covered := make([]bool, len(lines)) + + i := 0 + for i < len(lines) { + line := lines[i] + var name string + if m := posixFuncRE.FindStringSubmatch(line); m != nil { + name = m[1] + } else if m := bashFuncRE.FindStringSubmatch(line); m != nil { + name = m[1] + } + if name == "" { + i++ + continue + } + + endIdx, ok := scanBashFuncEnd(lines, i) + if !ok { + // Couldn't find balanced close within maxBashFuncLines. + // Skip this opener — don't emit a wildly oversized chunk. + i++ + continue + } + + startLine := i + 1 // 1-based + endLine := endIdx + 1 + body := joinLines(lines[i : endIdx+1]) + // Signature = the opener line trimmed. + sigStr := trimSpace(line) + nameCopy := name + + chunks = append(chunks, Chunk{ + Content: body, + ChunkType: "function", + FilePath: filePath, + StartLine: startLine, + EndLine: endLine, + Language: "bash", + SymbolName: &nameCopy, + SymbolSignature: &sigStr, + }) + for k := i; k <= endIdx && k < len(covered); k++ { + covered[k] = true + } + i = endIdx + 1 + } + + if len(chunks) == 0 { + return nil + } + + // Fill the gaps between functions with `module` chunks so the file's + // non-function content (top-level commands, comments, set -e, etc.) + // still gets indexed for full-text/semantic search. + chunks = appendBashGaps(chunks, lines, covered, filePath) + return chunks +} + +// appendBashGaps adds module-type chunks for line ranges not covered by any +// function. Mirrors the gap-filling logic chunkWithTreesitter applies for +// tree-sitter chunks. Returns chunks sorted by StartLine. 
+func appendBashGaps(chunks []Chunk, lines []string, covered []bool, filePath string) []Chunk { + gapStart := -1 + for i := 0; i <= len(lines); i++ { + uncovered := i < len(lines) && !covered[i] + if uncovered && gapStart < 0 { + gapStart = i + } + if !uncovered && gapStart >= 0 { + gapEnd := i - 1 + content := joinLines(lines[gapStart : gapEnd+1]) + if trimSpace(content) != "" { + chunks = append(chunks, Chunk{ + Content: content, + ChunkType: "module", + FilePath: filePath, + StartLine: gapStart + 1, + EndLine: gapEnd + 1, + Language: "bash", + }) + } + gapStart = -1 + } + } + // Sort by StartLine so consumers see a stable order. + insertSortByStartLine(chunks) + return chunks +} + +func insertSortByStartLine(chunks []Chunk) { + for i := 1; i < len(chunks); i++ { + j := i + for j > 0 && chunks[j].StartLine < chunks[j-1].StartLine { + chunks[j], chunks[j-1] = chunks[j-1], chunks[j] + j-- + } + } +} + +// scanBashFuncEnd walks forward from startLineIdx (the opener line, which +// already contains the first `{`) and returns the 0-based line index of the +// matching close `}` and ok=true. ok=false means we couldn't find a balance +// within maxBashFuncLines or hit EOF first. +// +// State machine handles: +// - Single-quoted strings ('...') — literal, no escapes +// - Double-quoted strings ("...") — `\"` is escaped quote, `\\` is escaped backslash +// - `# ... EOL` comments — but skipping `$#`, `${#var}`, `$(( # ...))` etc. 
heuristically +// - Heredocs (< len(lines) { + maxIdx = len(lines) + } + + for li := startLineIdx; li < maxIdx; li++ { + line := lines[li] + + if inHeredoc { + candidate := line + if heredocStripTabs { + candidate = strings.TrimLeft(line, "\t") + } + if candidate == heredocDelim { + inHeredoc = false + heredocDelim = "" + heredocStripTabs = false + } + continue + } + + i := 0 + for i < len(line) { + c := line[i] + + if inSingleStr { + if c == '\'' { + inSingleStr = false + } + i++ + continue + } + if inDoubleStr { + if c == '\\' && i+1 < len(line) { + // Skip the escaped char (handles `\"`, `\\`, etc.). + i += 2 + continue + } + if c == '"' { + inDoubleStr = false + } + i++ + continue + } + + // Comment — `#` starts a line comment unless it follows `$` (`$#`, + // argument count) or `{`/`(` (`${#var}`, `$((# expr ))`). We + // skip the comment if `#` is at start of line / after whitespace + // or after a token-ending char. + if c == '#' { + prev := byte(' ') + if i > 0 { + prev = line[i-1] + } + if prev == ' ' || prev == '\t' || prev == ';' || prev == '|' || + prev == '&' || prev == '(' || i == 0 { + break // rest of line is comment + } + } + + // Heredoc / here-string + if c == '<' && i+1 < len(line) && line[i+1] == '<' { + // `<<<` is here-string (single-line) — skip the marker + if i+2 < len(line) && line[i+2] == '<' { + i += 3 + continue + } + // `<<` or `<<-` + j := i + 2 + stripTabs := false + if j < len(line) && line[j] == '-' { + stripTabs = true + j++ + } + // Skip leading whitespace before delimiter + for j < len(line) && (line[j] == ' ' || line[j] == '\t') { + j++ + } + delim, after := readHeredocDelim(line, j) + if delim != "" { + inHeredoc = true + heredocDelim = delim + heredocStripTabs = stripTabs + // Resume after the delimiter on this line — there may + // be more code on the opener line (e.g. 
`cmd <= len(line) { + return "", start + } + q := line[start] + if q == '\'' || q == '"' { + end := strings.IndexByte(line[start+1:], q) + if end < 0 { + return "", start + } + return line[start+1 : start+1+end], start + 1 + end + 1 + } + j := start + for j < len(line) && (isBashIdentByte(line[j])) { + j++ + } + if j == start { + return "", start + } + return line[start:j], j +} + +func isBashIdentByte(b byte) bool { + return (b >= 'A' && b <= 'Z') || + (b >= 'a' && b <= 'z') || + (b >= '0' && b <= '9') || + b == '_' || b == '-' +} diff --git a/server/internal/chunker/bash_regex_test.go b/server/internal/chunker/bash_regex_test.go new file mode 100644 index 0000000..f4c3815 --- /dev/null +++ b/server/internal/chunker/bash_regex_test.go @@ -0,0 +1,341 @@ +package chunker + +import ( + "strings" + "testing" +) + +// helper: assert a chunk with the given symbol name and type exists. +func findChunkByName(t *testing.T, chunks []Chunk, name, kind string) Chunk { + t.Helper() + for _, c := range chunks { + if c.SymbolName != nil && *c.SymbolName == name && c.ChunkType == kind { + return c + } + } + t.Fatalf("no chunk with name=%q type=%q in: %s", name, kind, summariseChunks(chunks)) + return Chunk{} +} + +func summariseChunks(chunks []Chunk) string { + var b strings.Builder + for i, c := range chunks { + if i > 0 { + b.WriteString("; ") + } + name := "" + if c.SymbolName != nil { + name = *c.SymbolName + } + b.WriteString(c.ChunkType + ":" + name) + } + return b.String() +} + +// --- POSIX style: name() { ... 
} ------------------------------------------- + +func TestBashRegex_PosixSimple(t *testing.T) { + src := `#!/usr/bin/env bash +hello() { + echo "hi" +} +` + chunks := bashRegexChunks("/p/x.sh", src) + hello := findChunkByName(t, chunks, "hello", "function") + if hello.StartLine != 2 || hello.EndLine != 4 { + t.Errorf("hello lines = %d-%d, want 2-4", hello.StartLine, hello.EndLine) + } + if !strings.Contains(hello.Content, `echo "hi"`) { + t.Errorf("body missing echo: %q", hello.Content) + } +} + +func TestBashRegex_PosixOneLiner(t *testing.T) { + src := `greet() { echo "hi"; } +` + chunks := bashRegexChunks("/p/x.sh", src) + greet := findChunkByName(t, chunks, "greet", "function") + if greet.StartLine != 1 || greet.EndLine != 1 { + t.Errorf("greet lines = %d-%d, want 1-1", greet.StartLine, greet.EndLine) + } +} + +// --- bash function keyword form -------------------------------------------- + +func TestBashRegex_FunctionKeywordWithParens(t *testing.T) { + src := `function deploy() { + echo deploying +} +` + chunks := bashRegexChunks("/p/d.sh", src) + findChunkByName(t, chunks, "deploy", "function") +} + +func TestBashRegex_FunctionKeywordNoParens(t *testing.T) { + src := `function build { + make all +} +` + chunks := bashRegexChunks("/p/b.sh", src) + findChunkByName(t, chunks, "build", "function") +} + +// --- multiple functions ---------------------------------------------------- + +func TestBashRegex_MultipleFunctions(t *testing.T) { + src := `setup() { + mkdir -p /tmp/x +} + +teardown() { + rm -rf /tmp/x +} + +run_tests() { + setup + pytest + teardown +} +` + chunks := bashRegexChunks("/p/test.sh", src) + for _, name := range []string{"setup", "teardown", "run_tests"} { + findChunkByName(t, chunks, name, "function") + } + // Three functions + the gap before teardown / between functions / after. 
+ functionCount := 0 + for _, c := range chunks { + if c.ChunkType == "function" { + functionCount++ + } + } + if functionCount != 3 { + t.Errorf("function count = %d, want 3", functionCount) + } +} + +// --- nested braces --------------------------------------------------------- + +func TestBashRegex_NestedBraces(t *testing.T) { + src := `outer() { + if [[ "$1" == "yes" ]]; then + local x={key:value} + echo "${x}" + fi +} +` + chunks := bashRegexChunks("/p/n.sh", src) + outer := findChunkByName(t, chunks, "outer", "function") + if outer.StartLine != 1 || outer.EndLine != 6 { + t.Errorf("outer lines = %d-%d, want 1-6", outer.StartLine, outer.EndLine) + } +} + +// --- strings containing braces --------------------------------------------- + +func TestBashRegex_StringsWithBraces(t *testing.T) { + src := `format() { + echo "literal { brace }" + echo 'single { quoted }' +} +trailer() { echo done; } +` + chunks := bashRegexChunks("/p/s.sh", src) + format := findChunkByName(t, chunks, "format", "function") + if format.EndLine != 4 { + t.Errorf("format end = %d, want 4 (string braces should not count)", format.EndLine) + } + findChunkByName(t, chunks, "trailer", "function") +} + +// --- heredoc handling ------------------------------------------------------ + +func TestBashRegex_HeredocBody(t *testing.T) { + src := `usage() { + cat <] +EOF +} + +main() { + usage +} +` + chunks := bashRegexChunks("/p/install.sh", src) + findChunkByName(t, chunks, "usage", "function") + findChunkByName(t, chunks, "main", "function") +} + +// --- fallback wiring: ChunkFile uses regex for bash on parse fallback ------ + +func TestChunkFile_BashFallbackUsesRegex(t *testing.T) { + // We pick a bash source that's chunked successfully by tree-sitter + // (so the parse-budget guard does NOT fire) and verify both paths + // produce a function-named chunk for `hello`. This is a sanity check + // that bashRegexChunks signature matches the public ChunkFile schema. 
+ src := `hello() { + echo "hi" +} +` + chunks, _, err := ChunkFile("/p/x.sh", src, "bash", 0) + if err != nil { + t.Fatalf("ChunkFile: %v", err) + } + for _, c := range chunks { + if c.ChunkType == "function" && c.SymbolName != nil && *c.SymbolName == "hello" { + return + } + } + t.Errorf("expected `hello` function chunk, got: %s", summariseChunks(chunks)) +} diff --git a/server/internal/chunker/chunker.go b/server/internal/chunker/chunker.go index dc44f55..0b140a1 100644 --- a/server/internal/chunker/chunker.go +++ b/server/internal/chunker/chunker.go @@ -2,9 +2,20 @@ // The public surface is ChunkFile, which returns ([]Chunk, []Reference, error). // Sliding-window fallback is used when a language is not supported by the // tree-sitter grammars bundle or when parsing fails. +// +// The set of active languages is built from a baked-in default registry +// (see defaultRegistry) and may be filtered at startup via Configure(). The +// CIX_LANGUAGES env var feeds Configure with a comma-separated whitelist; +// empty/nil keeps all defaults. package chunker import ( + "log/slog" + "strings" + "sync" + "sync/atomic" + "time" + sitter "github.com/odvcencio/gotreesitter" "github.com/odvcencio/gotreesitter/grammars" ) @@ -24,53 +35,401 @@ const ( // minRefNameLength mirrors MIN_REF_NAME_LENGTH in chunker.py. const minRefNameLength = 2 +// parseBudget caps wall-clock time spent in tree-sitter for a single file. +// Some grammars (notably bash) have catastrophic-backtracking pathologies on +// specific inputs — install.sh in this very repo took 31s to parse before +// this guard. The parser's own SetTimeoutMicros checkpoint is best-effort +// and overshoots by 3-4×, so we set the hint generously and rely on the +// post-parse wall-clock check to decide whether to keep the tree. +// +// On overshoot we fall back to sliding-window chunks. 
We accept the wasted +// CPU (parser keeps running until its next checkpoint) because killing a +// pure-Go parse from outside is not safe — the only practical levers are +// SetTimeoutMicros and the cancellation flag, both with the same overshoot +// characteristic. +const ( + parseBudget = 2 * time.Second + parseHint = uint64(parseBudget / time.Microsecond) +) + // --------------------------------------------------------------------------- -// Language maps — ported 1:1 from chunker.py +// Language registry — built from defaultRegistry() at init() and reduced by +// Configure() if the operator set CIX_LANGUAGES. The three exported maps +// stay package-private; the engine reads them directly. // --------------------------------------------------------------------------- -// languageNodes maps language → kind → []node_type. -// Kind values: function|class|method|type. -var languageNodes = map[string]map[string][]string{ - "python": { - "function": {"function_definition"}, - "class": {"class_definition"}, - }, - "typescript": { - "function": {"function_declaration", "arrow_function"}, - "class": {"class_declaration"}, - "method": {"method_definition"}, - "type": {"interface_declaration", "type_alias_declaration"}, - }, - "javascript": { - "function": {"function_declaration", "arrow_function"}, - "class": {"class_declaration"}, - "method": {"method_definition"}, - }, - "go": { - "function": {"function_declaration"}, - "method": {"method_declaration"}, - "type": {"type_spec"}, - }, - "rust": { - "function": {"function_item"}, - "class": {"struct_item", "enum_item"}, - "type": {"trait_item"}, - }, - "java": { - "function": {"method_declaration"}, - "class": {"class_declaration"}, - "type": {"interface_declaration"}, - }, +// languageEntry bundles the three pieces of state a language needs. 
+type languageEntry struct { + factory languageFunc + nodes map[string][]string // function|class|method|type → AST node types + identifiers map[string]struct{} // identifier leaf-node types for ref extraction } -// identifierNodes maps language → set of identifier leaf-node types. -var identifierNodes = map[string]map[string]struct{}{ - "python": {"identifier": {}}, - "typescript": {"identifier": {}, "type_identifier": {}, "property_identifier": {}}, - "javascript": {"identifier": {}, "property_identifier": {}}, - "go": {"identifier": {}, "type_identifier": {}, "field_identifier": {}}, - "rust": {"identifier": {}, "type_identifier": {}, "field_identifier": {}}, - "java": {"identifier": {}, "type_identifier": {}}, +// languageFunc is a factory for sitter.Language. +type languageFunc func() *sitter.Language + +var ( + registryMu sync.RWMutex + languageRegistry map[string]languageFunc + languageNodes map[string]map[string][]string + identifierNodes map[string]map[string]struct{} +) + +func init() { + // Populate full defaults so direct ChunkFile usage (and tests) works + // without a Configure() call. Server startup later may filter via + // Configure(cfg.Languages). + Configure(nil) +} + +// Configure (re)builds the active language registry from the baked-in +// defaultRegistry, optionally filtered to the IDs in `enabled`. Empty or nil +// `enabled` activates all defaults. Unknown IDs are logged and ignored. +// Idempotent and safe to call multiple times; concurrent ChunkFile callers +// see a consistent snapshot via registryMu. 
+func Configure(enabled []string) { + defaults := defaultRegistry() + + wantAll := len(enabled) == 0 + wanted := make(map[string]struct{}, len(enabled)) + if !wantAll { + for _, raw := range enabled { + id := strings.ToLower(strings.TrimSpace(raw)) + if id == "" { + continue + } + wanted[id] = struct{}{} + } + } + + reg := make(map[string]languageFunc, len(defaults)) + nodes := make(map[string]map[string][]string, len(defaults)) + idents := make(map[string]map[string]struct{}, len(defaults)) + + for lang, entry := range defaults { + if !wantAll { + if _, ok := wanted[lang]; !ok { + continue + } + } + reg[lang] = entry.factory + if entry.nodes != nil { + nodes[lang] = entry.nodes + } + if entry.identifiers != nil { + idents[lang] = entry.identifiers + } + } + + if !wantAll { + for id := range wanted { + if _, ok := defaults[id]; !ok { + slog.Warn("chunker: unknown language in CIX_LANGUAGES, ignored", "lang", id) + } + } + } + + registryMu.Lock() + languageRegistry = reg + languageNodes = nodes + identifierNodes = idents + registryMu.Unlock() +} + +// SupportedLanguages returns a snapshot of currently-active language IDs. +// Useful for /health, debug endpoints, and test assertions. +func SupportedLanguages() []string { + registryMu.RLock() + defer registryMu.RUnlock() + out := make([]string, 0, len(languageRegistry)) + for k := range languageRegistry { + out = append(out, k) + } + return out +} + +// defaultRegistry returns the baked-in language entries. Adding a language is +// a single new map entry — no other code changes are needed because the +// chunker engine is data-driven. 
+func defaultRegistry() map[string]languageEntry { + idID := func(extra ...string) map[string]struct{} { + m := map[string]struct{}{"identifier": {}} + for _, e := range extra { + m[e] = struct{}{} + } + return m + } + + return map[string]languageEntry{ + // --- Tier 1: original 6, kept as-is for parity with legacy Python --- + "python": { + factory: grammars.PythonLanguage, + nodes: map[string][]string{ + "function": {"function_definition"}, + "class": {"class_definition"}, + }, + identifiers: idID(), + }, + "typescript": { + factory: grammars.TypescriptLanguage, + nodes: map[string][]string{ + "function": {"function_declaration", "arrow_function"}, + "class": {"class_declaration"}, + "method": {"method_definition"}, + "type": {"interface_declaration", "type_alias_declaration"}, + }, + identifiers: idID("type_identifier", "property_identifier"), + }, + "javascript": { + factory: grammars.JavascriptLanguage, + nodes: map[string][]string{ + "function": {"function_declaration", "arrow_function"}, + "class": {"class_declaration"}, + "method": {"method_definition"}, + }, + identifiers: idID("property_identifier"), + }, + "go": { + factory: grammars.GoLanguage, + nodes: map[string][]string{ + "function": {"function_declaration"}, + "method": {"method_declaration"}, + "type": {"type_spec"}, + }, + identifiers: idID("type_identifier", "field_identifier"), + }, + "rust": { + factory: grammars.RustLanguage, + nodes: map[string][]string{ + "function": {"function_item"}, + "class": {"struct_item", "enum_item"}, + "type": {"trait_item"}, + }, + identifiers: idID("type_identifier", "field_identifier"), + }, + "java": { + factory: grammars.JavaLanguage, + nodes: map[string][]string{ + "function": {"method_declaration"}, + "class": {"class_declaration"}, + "type": {"interface_declaration"}, + }, + identifiers: idID("type_identifier"), + }, + + // --- Tier 2: bug-fix — grammars were registered, node maps were not --- + "tsx": { + factory: grammars.TsxLanguage, + nodes: 
map[string][]string{ + "function": {"function_declaration", "arrow_function"}, + "class": {"class_declaration"}, + "method": {"method_definition"}, + "type": {"interface_declaration", "type_alias_declaration"}, + }, + identifiers: idID("type_identifier", "property_identifier"), + }, + "c": { + factory: grammars.CLanguage, + nodes: map[string][]string{ + "function": {"function_definition"}, + "class": {"struct_specifier"}, + "type": {"enum_specifier", "union_specifier", "type_definition"}, + }, + identifiers: idID("type_identifier", "field_identifier"), + }, + "cpp": { + factory: grammars.CppLanguage, + nodes: map[string][]string{ + "function": {"function_definition"}, + "class": {"class_specifier", "struct_specifier"}, + "type": {"enum_specifier", "union_specifier", "type_definition", "namespace_definition"}, + }, + identifiers: idID("type_identifier", "field_identifier"), + }, + "ruby": { + factory: grammars.RubyLanguage, + nodes: map[string][]string{ + "function": {"method", "singleton_method"}, + "class": {"class", "module"}, + }, + identifiers: idID("constant"), + }, + + // --- Tier 3: mainstream additions, high confidence in node names --- + "c_sharp": { + factory: grammars.CSharpLanguage, + nodes: map[string][]string{ + "function": {"local_function_statement"}, + "class": {"class_declaration"}, + "method": {"method_declaration"}, + "type": {"interface_declaration", "struct_declaration", "enum_declaration", "record_declaration"}, + }, + identifiers: idID("type_identifier"), + }, + "php": { + factory: grammars.PhpLanguage, + nodes: map[string][]string{ + "function": {"function_definition"}, + "class": {"class_declaration"}, + "method": {"method_declaration"}, + "type": {"interface_declaration", "trait_declaration"}, + }, + identifiers: idID("name", "variable_name"), + }, + "swift": { + factory: grammars.SwiftLanguage, + nodes: map[string][]string{ + "function": {"function_declaration"}, + "class": {"class_declaration"}, + "type": {"protocol_declaration"}, + }, 
+ identifiers: idID("simple_identifier", "type_identifier"), + }, + "kotlin": { + factory: grammars.KotlinLanguage, + nodes: map[string][]string{ + "function": {"function_declaration"}, + "class": {"class_declaration", "object_declaration"}, + }, + identifiers: idID("type_identifier", "simple_identifier"), + }, + "scala": { + factory: grammars.ScalaLanguage, + nodes: map[string][]string{ + "function": {"function_definition"}, + "class": {"class_definition", "object_definition"}, + "type": {"trait_definition"}, + }, + identifiers: idID("type_identifier"), + }, + "bash": { + factory: grammars.BashLanguage, + nodes: map[string][]string{ + "function": {"function_definition"}, + }, + identifiers: idID("variable_name", "word"), + }, + "lua": { + factory: grammars.LuaLanguage, + nodes: map[string][]string{ + "function": {"function_declaration", "function_definition"}, + }, + identifiers: idID(), + }, + "dart": { + factory: grammars.DartLanguage, + nodes: map[string][]string{ + "function": {"function_signature"}, + "class": {"class_definition"}, + "method": {"method_signature"}, + "type": {"mixin_declaration", "extension_declaration"}, + }, + identifiers: idID("type_identifier"), + }, + "r": { + factory: grammars.RLanguage, + nodes: map[string][]string{ + "function": {"function_definition"}, + }, + identifiers: idID(), + }, + "objc": { + factory: grammars.ObjcLanguage, + nodes: map[string][]string{ + "function": {"function_definition"}, + "class": {"class_interface", "class_implementation"}, + "method": {"method_definition"}, + "type": {"protocol_declaration"}, + }, + identifiers: idID("type_identifier", "field_identifier"), + }, + + // --- Tier 4: markup / data / config with structural nodes --- + "html": { + factory: grammars.HtmlLanguage, + nodes: map[string][]string{ + "type": {"doctype"}, + }, + identifiers: nil, + }, + "css": { + factory: grammars.CssLanguage, + nodes: map[string][]string{ + "class": {"rule_set"}, + }, + identifiers: nil, + }, + "scss": { + factory: 
grammars.ScssLanguage, + nodes: map[string][]string{ + "function": {"mixin_statement"}, + "class": {"rule_set"}, + }, + identifiers: nil, + }, + "sql": { + factory: grammars.SqlLanguage, + nodes: map[string][]string{ + "function": {"create_function_statement"}, + "type": {"create_table_statement"}, + }, + identifiers: nil, + }, + "markdown": { + factory: grammars.MarkdownLanguage, + nodes: map[string][]string{ + "type": {"section", "atx_heading"}, + }, + identifiers: nil, + }, + + // --- Tier 5: medium-confidence additions --- + "zig": { + factory: grammars.ZigLanguage, + nodes: map[string][]string{ + "function": {"function_declaration"}, + "class": {"struct_declaration"}, + }, + identifiers: idID(), + }, + "julia": { + factory: grammars.JuliaLanguage, + nodes: map[string][]string{ + "function": {"function_definition"}, + }, + identifiers: idID(), + }, + "fortran": { + factory: grammars.FortranLanguage, + nodes: map[string][]string{ + "function": {"subroutine", "function"}, + "class": {"module"}, + }, + identifiers: idID(), + }, + "haskell": { + factory: grammars.HaskellLanguage, + nodes: map[string][]string{ + // `function` = untyped top-level def; `bind` = typed binding + // (signature + match together); `signature` is loose stand-alone + // type signatures. + "function": {"function", "bind", "signature"}, + "type": {"data_type", "newtype"}, + }, + identifiers: map[string]struct{}{ + "variable": {}, "constructor": {}, "name": {}, + }, + }, + "ocaml": { + factory: grammars.OcamlLanguage, + nodes: map[string][]string{ + "function": {"value_definition"}, + "class": {"module_definition"}, + "type": {"type_definition"}, + }, + identifiers: idID("type_identifier"), + }, + } } // skipNames mirrors SKIP_NAMES in chunker.py. 
@@ -121,26 +480,6 @@ type Reference struct { Language string } -// --------------------------------------------------------------------------- -// Language registry -// --------------------------------------------------------------------------- - -// languageFunc is a factory for sitter.Language. -type languageFunc func() *sitter.Language - -var languageRegistry = map[string]languageFunc{ - "python": grammars.PythonLanguage, - "go": grammars.GoLanguage, - "javascript": grammars.JavascriptLanguage, - "typescript": grammars.TypescriptLanguage, - "tsx": grammars.TsxLanguage, - "java": grammars.JavaLanguage, - "c": grammars.CLanguage, - "cpp": grammars.CppLanguage, - "rust": grammars.RustLanguage, - "ruby": grammars.RubyLanguage, -} - // --------------------------------------------------------------------------- // ChunkFile — main entry point // --------------------------------------------------------------------------- @@ -155,29 +494,51 @@ func ChunkFile(filePath, content, language string, maxSize int) ([]Chunk, []Refe chunks, refs, err := chunkWithTreesitter(filePath, content, language, maxSize) if err != nil { // Fallback: sliding window, no references. - return chunkSlidingWindow(filePath, content, language), nil, nil + return chunkFallback(filePath, content, language), nil, nil } return chunks, refs, nil } +// chunkFallback returns reasonable chunks for content that the tree-sitter +// path could not handle (parser timeout, no grammar, malformed input, …). +// +// For languages where a regex-based extractor exists (currently only bash), +// we try that first — it produces real `function` chunks instead of generic +// `block` ones, which is much more useful for semantic search. If the +// extractor returns nil (no symbols found), we fall through to the universal +// sliding-window strategy so the file content is still indexed. 
+func chunkFallback(filePath, content, language string) []Chunk { + if language == "bash" { + if c := bashRegexChunks(filePath, content); len(c) > 0 { + return c + } + } + return chunkSlidingWindow(filePath, content, language) +} + // --------------------------------------------------------------------------- // Tree-sitter path // --------------------------------------------------------------------------- func chunkWithTreesitter(filePath, content, language string, maxSize int) ([]Chunk, []Reference, error) { + // Snapshot under RLock so a concurrent Configure() call does not race the read. + registryMu.RLock() langFn, ok := languageRegistry[language] + nodeKinds := languageNodes[language] + idTypes := identifierNodes[language] + registryMu.RUnlock() + if !ok { - return chunkSlidingWindow(filePath, content, language), nil, nil + return chunkFallback(filePath, content, language), nil, nil } lang := langFn() if lang == nil { - return chunkSlidingWindow(filePath, content, language), nil, nil + return chunkFallback(filePath, content, language), nil, nil } - nodeKinds, ok := languageNodes[language] - if !ok { + if nodeKinds == nil { // Grammar exists but we don't have node definitions → sliding window. - return chunkSlidingWindow(filePath, content, language), nil, nil + return chunkFallback(filePath, content, language), nil, nil } // Build flat target → kind map. @@ -190,7 +551,41 @@ func chunkWithTreesitter(filePath, content, language string, maxSize int) ([]Chu src := []byte(content) parser := sitter.NewParser(lang) + + // Twin guards: SetTimeoutMicros is the parser's own checkpoint-based + // budget; the cancellation flag is set by an external timer when the + // wall-clock deadline expires. The parser checks both at the same + // granularity, so they overshoot together — we still rely on the + // post-parse wall-clock check below to decide whether the tree is + // trustworthy. 
+ parser.SetTimeoutMicros(parseHint) + var cancelFlag uint32 + parser.SetCancellationFlag(&cancelFlag) + deadline := time.AfterFunc(parseBudget, func() { + atomic.StoreUint32(&cancelFlag, 1) + }) + + parseStart := time.Now() tree, err := parser.Parse(src) + parseElapsed := time.Since(parseStart) + deadline.Stop() + + // Hard wall-clock check — even if parser claims success, a tree that + // took >2× the budget is the result of a backtracking pathology and + // the structure is not trustworthy enough to chunk on. Falling back to + // sliding window keeps the indexer responsive. + if parseElapsed > 2*parseBudget { + slog.Warn("chunker: parse exceeded budget, falling back to sliding window", + "path", filePath, "language", language, "elapsed", parseElapsed, + "budget", parseBudget) + return chunkFallback(filePath, content, language), nil, nil + } + if atomic.LoadUint32(&cancelFlag) == 1 { + slog.Warn("chunker: parse cancelled by deadline, falling back to sliding window", + "path", filePath, "language", language, "elapsed", parseElapsed) + return chunkFallback(filePath, content, language), nil, nil + } + if err != nil { return nil, nil, err } @@ -205,8 +600,8 @@ func chunkWithTreesitter(filePath, content, language string, maxSize int) ([]Chu extractNodes(root, lang, src, targetTypes, lines, filePath, language, &chunks, &coveredRanges, nil) - // Extract references. - refs := extractReferences(root, lang, src, targetTypes, filePath, language) + // Extract references using the snapshotted identifier set. + refs := extractReferences(root, lang, src, targetTypes, idTypes, filePath, language) // Fill gaps between extracted symbol nodes with "module" chunks. 
sortRanges(coveredRanges) @@ -237,7 +632,7 @@ func chunkWithTreesitter(filePath, content, language string, maxSize int) ([]Chu } if len(finalChunks) == 0 { - return chunkSlidingWindow(filePath, content, language), nil, nil + return chunkFallback(filePath, content, language), nil, nil } return finalChunks, refs, nil } @@ -312,15 +707,17 @@ func extractNodes( } // extractReferences walks AST collecting identifier usages (not definitions). +// idNodeTypes is passed in (rather than read from the global map) so callers +// can snapshot once and stay consistent if Configure() is called concurrently. func extractReferences( root *sitter.Node, lang *sitter.Language, src []byte, targetTypes map[string]string, + idNodeTypes map[string]struct{}, filePath, language string, ) []Reference { - idNodeTypes, ok := identifierNodes[language] - if !ok { + if len(idNodeTypes) == 0 { return nil } @@ -388,12 +785,27 @@ func extractReferences( } // extractName returns the first identifier-like child's text, or nil. +// +// The set of "identifier-like" node types covers the main grammars in the +// default registry. Notable additions beyond the obvious `identifier`: +// - `field_identifier` — Go method names (`func (b *Bar) Foo()` → "Foo") +// - `word` — bash function names (`hello() { ... }` → "hello") +// - `simple_identifier` — Swift / Kotlin function names +// - `constant` — Ruby class/module names (which start with uppercase) +// +// Without these, the symbol_name field on the resulting chunk was nil and +// the CLI's `cix summary` rendered weird placeholders (`[method] bool`, +// `[function] `). 
func extractName(node *sitter.Node, lang *sitter.Language, src []byte) *string { nameTypes := map[string]struct{}{ "identifier": {}, "name": {}, "property_identifier": {}, "type_identifier": {}, + "field_identifier": {}, + "word": {}, + "simple_identifier": {}, + "constant": {}, } cnt := node.ChildCount() for i := 0; i < cnt; i++ { diff --git a/server/internal/chunker/chunker_test.go b/server/internal/chunker/chunker_test.go index b64eb74..8244b9e 100644 --- a/server/internal/chunker/chunker_test.go +++ b/server/internal/chunker/chunker_test.go @@ -3,6 +3,9 @@ package chunker import ( "strings" "testing" + "time" + + sitter "github.com/odvcencio/gotreesitter" ) func TestChunkFile_Python(t *testing.T) { @@ -217,6 +220,60 @@ func TestSkipNames_ContainsExpected(t *testing.T) { } } +// TestChunkFile_ParseBudgetFallback exercises the parser-budget guard with +// a real-world pathology: the install.sh in this repo triggers ~31s of +// catastrophic backtracking in tree-sitter-bash. After the guard kicks in +// the chunker must return sliding-window chunks within ~parseBudget rather +// than blocking the entire indexer for half a minute. +// +// Skipped under -short because it deliberately runs until the deadline fires. +func TestChunkFile_ParseBudgetFallback(t *testing.T) { + if testing.Short() { + t.Skip("parse-budget test waits up to ~2s for the deadline to fire") + } + + // Construct bash content that deterministically tickles the bash + // grammar's slow path without depending on a specific repo file. + // Heredocs + nested $(...) inside a deeply nested case statement is a + // known trigger; we lean on the repo-known install.sh structure. 
+ src := strings.Repeat(` +case "$x" in + pattern1) + cat < 2*parseBudget+500*time.Millisecond { + t.Errorf("ChunkFile elapsed %s, expected < ~2× parseBudget (%s)", + elapsed, parseBudget) + } + if len(chunks) == 0 { + t.Error("expected at least one chunk (block or function), got 0") + } + + // Refs are nil when sliding-window fallback fires. + _ = refs +} + func TestSplitLines_Roundtrip(t *testing.T) { original := "line one\nline two\nline three" lines := splitLines(original) @@ -246,3 +303,344 @@ type ID = string | number; t.Fatal("expected chunks from TypeScript source") } } + +// --- Tier 2 bug-fix tests: grammars were registered without languageNodes +// in earlier versions, so .tsx/.c/.cpp/.rb files silently fell to sliding +// window. These assert true semantic chunks now come back. --- + +func TestChunkFile_TSX(t *testing.T) { + src := `import React from "react"; + +interface Props { + name: string; +} + +export function Greeting(props: Props) { + return
Hello, {props.name}
; +} + +type Id = string | number; +` + chunks, _, err := ChunkFile("sample.tsx", src, "tsx", 0) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(chunks) == 0 { + t.Fatal("expected chunks from TSX source") + } + hasFunction := false + hasType := false + for _, c := range chunks { + if c.ChunkType == "function" { + hasFunction = true + } + if c.ChunkType == "type" { + hasType = true + } + } + if !hasFunction { + t.Errorf("expected function chunk for Greeting, got types: %v", chunkTypeCounts(chunks)) + } + if !hasType { + t.Errorf("expected type chunk for Id, got types: %v", chunkTypeCounts(chunks)) + } +} + +func TestChunkFile_C(t *testing.T) { + src := `#include + +struct Point { + double x; + double y; +}; + +typedef enum { RED, GREEN, BLUE } Color; + +int add(int a, int b) { + return a + b; +} + +int main(void) { + return add(1, 2); +} +` + chunks, _, err := ChunkFile("sample.c", src, "c", 0) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(chunks) == 0 { + t.Fatal("expected chunks from C source") + } + counts := chunkTypeCounts(chunks) + if counts["function"] == 0 { + t.Errorf("expected function chunks, got: %v", counts) + } + if counts["class"] == 0 { + t.Errorf("expected struct (class) chunk for Point, got: %v", counts) + } +} + +func TestChunkFile_Cpp(t *testing.T) { + src := `#include + +class Animal { +public: + Animal(std::string name) : name_(name) {} + std::string name() const { return name_; } +private: + std::string name_; +}; + +namespace zoo { + int count() { return 42; } +} +` + chunks, _, err := ChunkFile("sample.cpp", src, "cpp", 0) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(chunks) == 0 { + t.Fatal("expected chunks from C++ source") + } + counts := chunkTypeCounts(chunks) + if counts["class"] == 0 { + t.Errorf("expected class chunk for Animal, got: %v", counts) + } +} + +func TestChunkFile_Ruby(t *testing.T) { + src := `module Greetings + class Greeter + def initialize(name) 
+ @name = name + end + + def greet + puts "Hello, #{@name}" + end + end +end +` + chunks, _, err := ChunkFile("sample.rb", src, "ruby", 0) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(chunks) == 0 { + t.Fatal("expected chunks from Ruby source") + } + counts := chunkTypeCounts(chunks) + if counts["class"] == 0 { + t.Errorf("expected class/module chunks, got: %v", counts) + } +} + +// --- Configure() filtering --- + +func TestConfigure_FilterToSubset(t *testing.T) { + defer Configure(nil) // restore defaults for other tests + + Configure([]string{"python", "go"}) + active := SupportedLanguages() + if len(active) != 2 { + t.Errorf("expected 2 active languages, got %d (%v)", len(active), active) + } + got := map[string]bool{} + for _, l := range active { + got[l] = true + } + if !got["python"] || !got["go"] { + t.Errorf("expected python+go, got %v", active) + } + if got["rust"] { + t.Error("rust should be filtered out") + } +} + +func TestConfigure_DefaultsAfterEmpty(t *testing.T) { + Configure([]string{"python"}) + Configure(nil) // should restore full defaults + active := SupportedLanguages() + if len(active) < 20 { + t.Errorf("expected ≥20 default languages, got %d", len(active)) + } +} + +func TestConfigure_UnknownIDIgnored(t *testing.T) { + defer Configure(nil) + + Configure([]string{"python", "imaginary-lang", "go"}) + active := SupportedLanguages() + got := map[string]bool{} + for _, l := range active { + got[l] = true + } + if !got["python"] || !got["go"] { + t.Errorf("expected python+go to survive, got %v", active) + } + if got["imaginary-lang"] { + t.Error("unknown language should not be added") + } +} + +func TestConfigure_CaseInsensitive(t *testing.T) { + defer Configure(nil) + + Configure([]string{" Python ", "GO"}) + active := SupportedLanguages() + if len(active) != 2 { + t.Errorf("expected 2 active languages, got %d (%v)", len(active), active) + } +} + +// chunkTypeCounts is a small helper for table-driven assertions on chunk 
types. +func chunkTypeCounts(chunks []Chunk) map[string]int { + out := map[string]int{} + for _, c := range chunks { + out[c.ChunkType]++ + } + return out +} + +// TestRegistry_AllFactoriesNonNil ensures every default-registered language +// resolves to a usable *sitter.Language. A nil factory return would mean +// gotreesitter renamed/removed a grammar between updates and we silently lost +// support — better to fail loud here than at runtime in production. +func TestRegistry_AllFactoriesNonNil(t *testing.T) { + defer Configure(nil) + Configure(nil) + + for _, lang := range SupportedLanguages() { + t.Run(lang, func(t *testing.T) { + registryMu.RLock() + fn := languageRegistry[lang] + registryMu.RUnlock() + if fn == nil { + t.Fatalf("nil factory for %q", lang) + } + if g := fn(); g == nil { + t.Fatalf("factory returned nil grammar for %q", lang) + } + }) + } +} + +// TestRegistry_NodeNamesMatchAST parses a tiny per-language fixture and +// asserts at least one configured node-type appears in its AST. This catches +// node-name typos in defaultRegistry without needing a fixture file per lang. +// Languages absent from the fixture map are skipped (registered but not +// covered — acceptable, but the per-language tests above cover the criticals). +func TestRegistry_NodeNamesMatchAST(t *testing.T) { + defer Configure(nil) + Configure(nil) + + fixtures := map[string]string{ + "python": "def f():\n pass\n", + "go": "package p\nfunc F() {}\n", + "javascript": "function f() {}\n", + "typescript": "function f(): void {}\n", + "tsx": "function F() { return
; }\n", + "java": "class C { void m() {} }\n", + "c": "int f(void) { return 0; }\n", + "cpp": "class C {}; int f(){return 0;}\n", + "rust": "fn f() {}\n", + "ruby": "class C\n def m; end\nend\n", + "c_sharp": "class C { void M() {} }\n", + "php": "\n", + "swift": "func f() {}\n", + "kotlin": "fun f() {}\n", + "scala": "object O { def f() = 1 }\n", + "bash": "f() { echo hi; }\n", + "lua": "function f() end\n", + "dart": "void f() {}\n", + "r": "f <- function() 1\n", + "objc": "@interface C\n@end\n", + "html": "\n", + "css": ".x { color: red; }\n", + "scss": ".x { color: red; }\n", + "sql": "CREATE TABLE t (id INT);\n", + "markdown": "# Heading\n\nbody\n", + "zig": "fn f() void {}\n", + "julia": "function f() end\n", + "fortran": "subroutine s\nend subroutine\n", + "haskell": "module M where\n\nf :: Int -> Int\nf x = x\n", + "ocaml": "let f x = x\n", + } + + for lang, src := range fixtures { + t.Run(lang, func(t *testing.T) { + registryMu.RLock() + fn, regOK := languageRegistry[lang] + nodes := languageNodes[lang] + registryMu.RUnlock() + + if !regOK { + t.Skipf("%q not in registry (deliberately filtered out)", lang) + } + if nodes == nil { + t.Skipf("%q has no node map (sliding-window only — by design)", lang) + } + + grammar := fn() + if grammar == nil { + t.Fatalf("nil grammar for %q", lang) + } + + parser := sitter.NewParser(grammar) + tree, err := parser.Parse([]byte(src)) + if err != nil { + t.Fatalf("parse error for %q: %v", lang, err) + } + root := tree.RootNode() + if root == nil { + t.Fatalf("nil root for %q", lang) + } + + want := map[string]struct{}{} + for _, types := range nodes { + for _, ty := range types { + want[ty] = struct{}{} + } + } + + seen := map[string]struct{}{} + collectNodeTypes(root, grammar, seen) + + matched := false + for ty := range want { + if _, ok := seen[ty]; ok { + matched = true + break + } + } + if !matched { + keys := make([]string, 0, len(want)) + for k := range want { + keys = append(keys, k) + } + t.Errorf("none of 
configured node types %v found in AST for %q. Sample AST node types seen: %v", + keys, lang, sampleKeys(seen, 12)) + } + }) + } +} + +func collectNodeTypes(n *sitter.Node, lang *sitter.Language, out map[string]struct{}) { + if n == nil { + return + } + out[n.Type(lang)] = struct{}{} + for i := 0; i < int(n.ChildCount()); i++ { + collectNodeTypes(n.Child(i), lang, out) + } +} + +func sampleKeys(m map[string]struct{}, n int) []string { + out := make([]string, 0, n) + for k := range m { + if len(out) >= n { + break + } + out = append(out, k) + } + return out +} diff --git a/server/internal/config/config.go b/server/internal/config/config.go index 3fa3c1e..c7a42ef 100644 --- a/server/internal/config/config.go +++ b/server/internal/config/config.go @@ -37,6 +37,12 @@ type Config struct { LlamaNGpuLayers int // CIX_N_GPU_LAYERS; -1 on darwin (Metal all layers), 0 elsewhere. LlamaStartupSec int // CIX_LLAMA_STARTUP_TIMEOUT; readiness probe ceiling in seconds. EmbeddingsEnabled bool // CIX_EMBEDDINGS_ENABLED; test hook to bypass sidecar entirely. + + // Languages narrows the chunker's active language set. Empty / unset + // activates all baked-in defaults (see chunker.defaultRegistry). Values + // not present in the registry are warned-and-ignored at startup. + // Source: CIX_LANGUAGES (comma-separated, case-insensitive). 
+ Languages []string } // ModelSafeName returns the embedding model name normalised for use inside @@ -146,6 +152,14 @@ func Load() (*Config, error) { } c.EmbeddingsEnabled = enabled + if langs := getenv("CIX_LANGUAGES", ""); langs != "" { + for _, l := range strings.Split(langs, ",") { + if s := strings.TrimSpace(l); s != "" { + c.Languages = append(c.Languages, s) + } + } + } + return c, nil } diff --git a/server/internal/langdetect/langdetect.go b/server/internal/langdetect/langdetect.go index cc79fc8..4568fdf 100644 --- a/server/internal/langdetect/langdetect.go +++ b/server/internal/langdetect/langdetect.go @@ -25,6 +25,7 @@ var extensionMap = map[string]string{ ".cs": "c_sharp", ".swift": "swift", ".kt": "kotlin", + ".kts": "kotlin", ".scala": "scala", ".zig": "zig", ".jl": "julia", @@ -36,7 +37,7 @@ var extensionMap = map[string]string{ ".mm": "objc", // Web / scripting ".ts": "typescript", - ".tsx": "typescript", + ".tsx": "tsx", ".js": "javascript", ".jsx": "javascript", ".rb": "ruby", diff --git a/server/internal/langdetect/langdetect_test.go b/server/internal/langdetect/langdetect_test.go index 62f0dde..27e94f7 100644 --- a/server/internal/langdetect/langdetect_test.go +++ b/server/internal/langdetect/langdetect_test.go @@ -10,7 +10,7 @@ func TestDetect(t *testing.T) { {"main.go", "go"}, {"app.py", "python"}, {"index.ts", "typescript"}, - {"index.tsx", "typescript"}, + {"index.tsx", "tsx"}, {"app.js", "javascript"}, {"lib.rs", "rust"}, {"Hello.java", "java"}, @@ -35,6 +35,8 @@ func TestDetect(t *testing.T) { {"/some/path/to/main.go", "go"}, {"script.R", "r"}, // uppercase .R {"script.sh", "bash"}, + {"build.gradle.kts", "kotlin"}, + {"app.kts", "kotlin"}, } for _, c := range cases { got := Detect(c.path) From 8e46c97fb61626c2b5b764ab577cd21bc60fec2e Mon Sep 17 00:00:00 2001 From: dvcdsys Date: Mon, 27 Apr 2026 21:50:59 +0100 Subject: [PATCH 2/6] feat(streaming): NDJSON progress events + ctx-disconnect cancel for /index/files MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the single-JSON response of POST /index/files with an NDJSON event stream when the client sends Accept: application/x-ndjson. Solves three real-world pain points seen on heavy batches (e.g. 20 files = 347 chunks = 165s on a single GPU slot): * CLI no longer hits its 600s http.Client deadline on long batches — per-event keepalive plus a server-side 10s heartbeat ticker keep the connection alive arbitrarily long. New streamingClient on the CLI uses Timeout: 0 with a 60s idle-watchdog instead. * Server now detects client disconnect mid-batch via r.Context().Done() and immediately calls CancelIndexing() to release the per-project session lock. Previously the lock survived until the 1h TTL. * Per-file progress is visible during the batch (file_started, file_chunked, file_embedded, file_done events). Three render modes on the CLI: Interactive (TTY status line with CR), LineByLine (CI / non-TTY), Quiet (watcher — only summaries + file_error). Backwards compatibility is asymmetric and intentional: server still serves old clients (no Accept header → existing single-JSON path). New CLI hard- fails with ErrLegacyServer if it gets back Content-Type: application/json, because the operator's deploy workflow is server-first. Wire format and event schema documented in server/internal/indexer/progress.go and mirrored in cli/internal/client/progress.go. SIGINT/SIGTERM in `cix reindex` now propagates via signal.NotifyContext — HTTP request context cancels, server frees lock automatically. Belt-and- braces deferred CancelIndex on error paths in indexer.Run() and watcher Stop(). 
Tests: * server/internal/httpapi/indexing_streaming_test.go — streaming happy path, disconnect-frees-lock (direct handler invocation with custom flushRecorder), legacy compat negotiation * cli/internal/client/index_streaming_test.go — NDJSON parse, callback ordering, ErrLegacyServer hard-fail, idle timeout, retry on 503/429, back-compat SendFiles wrapper * watcher tests updated to mock NDJSON responses Co-Authored-By: Claude Opus 4.7 --- cli/cmd/init.go | 2 +- cli/cmd/reindex.go | 17 +- cli/cmd/root.go | 7 +- cli/internal/client/client.go | 23 + cli/internal/client/index.go | 216 ++++++++- cli/internal/client/index_streaming_test.go | 331 +++++++++++++ cli/internal/client/progress.go | 60 +++ cli/internal/config/config.go | 9 +- cli/internal/indexer/indexer.go | 60 ++- cli/internal/indexer/indexer_test.go | 40 +- cli/internal/indexer/progress.go | 205 ++++++++ cli/internal/watcher/watcher.go | 19 +- cli/internal/watcher/watcher_test.go | 33 +- server/internal/httpapi/indexing.go | 147 ++++++ .../httpapi/indexing_streaming_test.go | 444 ++++++++++++++++++ server/internal/indexer/indexer.go | 112 ++++- server/internal/indexer/progress.go | 95 ++++ 17 files changed, 1780 insertions(+), 40 deletions(-) create mode 100644 cli/internal/client/index_streaming_test.go create mode 100644 cli/internal/client/progress.go create mode 100644 cli/internal/indexer/progress.go create mode 100644 server/internal/httpapi/indexing_streaming_test.go create mode 100644 server/internal/indexer/progress.go diff --git a/cli/cmd/init.go b/cli/cmd/init.go index 0076beb..8de73bd 100644 --- a/cli/cmd/init.go +++ b/cli/cmd/init.go @@ -75,7 +75,7 @@ func runInit(cmd *cobra.Command, args []string) error { cfg, _ := config.Load() batchSize := cfg.Indexing.BatchSize fmt.Printf("Starting indexing (batch size: %d)...\n", batchSize) - result, err := indexer.Run(client, absPath, false, batchSize) + result, err := indexer.Run(cmd.Context(), client, absPath, false, batchSize, indexer.AutoProgressMode()) 
if err != nil { return fmt.Errorf("indexing failed: %w", err) } diff --git a/cli/cmd/reindex.go b/cli/cmd/reindex.go index eb0e223..d96f79b 100644 --- a/cli/cmd/reindex.go +++ b/cli/cmd/reindex.go @@ -1,9 +1,12 @@ package cmd import ( + "context" "fmt" "os" + "os/signal" "path/filepath" + "syscall" "time" "github.com/anthropics/code-index/cli/internal/config" @@ -68,8 +71,20 @@ func runReindex(cmd *cobra.Command, args []string) error { fmt.Printf("%s reindexing: %s (batch size: %d)\n", indexType, absPath, batchSize) - result, err := indexer.Run(apiClient, absPath, reindexFull, batchSize) + // SIGINT/SIGTERM → ctx cancellation. The indexer propagates ctx through + // SendFilesStreaming, which closes the HTTP connection; the server's + // streaming handler sees the disconnect and calls CancelIndexing, + // freeing the project lock immediately rather than at the 1-hour TTL. + ctx, stop := signal.NotifyContext(cmd.Context(), syscall.SIGINT, syscall.SIGTERM) + defer stop() + + result, err := indexer.Run(ctx, apiClient, absPath, reindexFull, batchSize, indexer.AutoProgressMode()) if err != nil { + // If the user hit Ctrl+C, surface a friendlier message — the deferred + // CancelIndex inside indexer.Run already freed the server lock. 
+ if ctx.Err() == context.Canceled { + return fmt.Errorf("indexing cancelled by user") + } return fmt.Errorf("indexing failed: %w", err) } diff --git a/cli/cmd/root.go b/cli/cmd/root.go index 07e109d..8e36c89 100644 --- a/cli/cmd/root.go +++ b/cli/cmd/root.go @@ -4,6 +4,7 @@ import ( "fmt" "os" "strings" + "time" "github.com/anthropics/code-index/cli/internal/client" "github.com/anthropics/code-index/cli/internal/config" @@ -126,5 +127,9 @@ func getClient() (*client.Client, error) { } } - return client.New(url, key), nil + c := client.New(url, key) + if cfg.Indexing.StreamingIdleTimeoutSec > 0 { + c.SetStreamingIdleTimeout(time.Duration(cfg.Indexing.StreamingIdleTimeoutSec) * time.Second) + } + return c, nil } diff --git a/cli/internal/client/client.go b/cli/internal/client/client.go index ceccdc5..6723f12 100644 --- a/cli/internal/client/client.go +++ b/cli/internal/client/client.go @@ -14,8 +14,23 @@ type Client struct { baseURL string apiKey string httpClient *http.Client + + // streamingClient is used for endpoints that return chunked NDJSON + // (currently only POST /index/files when Accept advertises x-ndjson). + // Timeout is 0 because the natural duration of an indexing batch is + // dominated by GPU embed time and there is no useful overall ceiling. + // Idle silence is bounded by streamingIdleTimeout instead. + streamingClient *http.Client + streamingIdleTimeout time.Duration } +// defaultStreamingIdleTimeout is the maximum allowed gap between events on a +// streaming response. Server emits a heartbeat every 10s, so 60s gives a 6× +// margin — enough to absorb a one-shot llama-supervisor restart (which can +// pause embedding for ~5s several times in a row before the queue catches up) +// or a network hiccup, without giving up on a still-progressing batch. 
+const defaultStreamingIdleTimeout = 60 * time.Second + // New creates a new API client func New(baseURL, apiKey string) *Client { return &Client{ @@ -24,9 +39,17 @@ func New(baseURL, apiKey string) *Client { httpClient: &http.Client{ Timeout: 600 * time.Second, }, + streamingClient: &http.Client{Timeout: 0}, + streamingIdleTimeout: defaultStreamingIdleTimeout, } } +// SetStreamingIdleTimeout overrides the silence threshold for streaming +// endpoints. Pass 0 to disable the watchdog entirely (not recommended). +func (c *Client) SetStreamingIdleTimeout(d time.Duration) { + c.streamingIdleTimeout = d +} + // BaseURL returns the base URL this client is configured to use. func (c *Client) BaseURL() string { return c.baseURL diff --git a/cli/internal/client/index.go b/cli/internal/client/index.go index cb7ba17..69ded20 100644 --- a/cli/internal/client/index.go +++ b/cli/internal/client/index.go @@ -1,10 +1,16 @@ package client import ( + "bufio" + "bytes" + "context" + "encoding/json" "fmt" + "io" "math/rand" "net/http" "strconv" + "strings" "time" ) @@ -94,43 +100,233 @@ func (c *Client) BeginIndex(path string, full bool) (*BeginIndexResponse, error) return &result, nil } -// SendFiles sends a batch of files to be indexed in the given run. -// On HTTP 503 (GPU busy) or 429 (rate limited) it retries with exponential -// backoff up to maxSendRetries times before giving up. +// SendFiles sends a batch of files to be indexed. It is now a thin wrapper +// over SendFilesStreaming with a no-op event callback and a background +// context — kept for tests and for callers that don't want progress events. +// +// Note: even though the response is streamed under the hood, this wrapper +// blocks until the server closes the stream and returns only the final +// summary, matching the pre-streaming public surface. 
func (c *Client) SendFiles(path string, runID string, files []FilePayload) (*SendFilesResponse, error) { + return c.SendFilesStreaming(context.Background(), path, runID, files, nil) +} + +// SendFilesStreaming sends a batch of files and streams NDJSON progress +// events from the server. The onEvent callback is invoked for every event; +// pass nil if you only want the final summary. +// +// On HTTP 503 (GPU busy) or 429 (rate limited) the request is retried with +// exponential backoff up to maxSendRetries times BEFORE the stream begins. +// Once the stream has started (i.e. the server responded with NDJSON), the +// caller is in a long-lived single attempt — failures during the stream +// surface to the caller without a retry. +// +// Returns ErrLegacyServer if the server doesn't speak NDJSON (Content-Type +// negotiation failed). The CLI surfaces this as "upgrade your server". +// +// Returns ErrIdleTimeout if no data arrives for streamingIdleTimeout — the +// connection is forcibly closed and the caller should treat the run as +// failed (the server will see ctx cancellation and free the session lock). +func (c *Client) SendFilesStreaming( + ctx context.Context, + path string, + runID string, + files []FilePayload, + onEvent func(ProgressEvent), +) (*SendFilesResponse, error) { encodedPath := encodeProjectPath(path) + url := c.baseURL + fmt.Sprintf("/api/v1/projects/%s/index/files", encodedPath) + body := map[string]interface{}{ "run_id": runID, "files": files, } + bodyBytes, err := json.Marshal(body) + if err != nil { + return nil, fmt.Errorf("marshal body: %w", err) + } for attempt := 0; attempt <= maxSendRetries; attempt++ { - resp, err := c.do("POST", fmt.Sprintf("/api/v1/projects/%s/index/files", encodedPath), body) + // Wrap caller ctx so the idle watchdog can cancel without touching + // the original. callerErr() distinguishes "caller cancelled us" + // from "watchdog cancelled us" when reporting errors. 
+ streamCtx, streamCancel := context.WithCancel(ctx) + + req, err := http.NewRequestWithContext(streamCtx, http.MethodPost, url, bytes.NewReader(bodyBytes)) + if err != nil { + streamCancel() + return nil, fmt.Errorf("create request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "application/x-ndjson") + if c.apiKey != "" { + req.Header.Set("Authorization", "Bearer "+c.apiKey) + } + + resp, err := c.streamingClient.Do(req) if err != nil { - return nil, err + streamCancel() + return nil, fmt.Errorf("do request: %w", err) } + // Retryable backpressure responses — short body, no streaming begun. if resp.StatusCode == http.StatusServiceUnavailable || resp.StatusCode == http.StatusTooManyRequests { header := resp.Header.Get("Retry-After") resp.Body.Close() + streamCancel() delay := retryAfterDelay(header, sendRetryDelay(attempt)) fmt.Printf(" GPU busy — retrying in %s (attempt %d/%d)...\n", delay.Round(time.Second), attempt+1, maxSendRetries) - time.Sleep(delay) + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(delay): + } continue } - var result SendFilesResponse - if err := parseResponse(resp, &result); err != nil { - return nil, err + // Any non-200 here is a hard error (bad run_id, project missing, …). + if resp.StatusCode != http.StatusOK { + defer resp.Body.Close() + defer streamCancel() + respBody, _ := io.ReadAll(resp.Body) + var errResp struct { + Detail string `json:"detail"` + } + if json.Unmarshal(respBody, &errResp) == nil && errResp.Detail != "" { + return nil, fmt.Errorf("API error (%d): %s", resp.StatusCode, errResp.Detail) + } + return nil, fmt.Errorf("API error (%d): %s", resp.StatusCode, string(respBody)) } - return &result, nil + + // Hard fail if the server returned plain JSON (legacy build) instead + // of NDJSON. We deliberately do not attempt a fallback parse — the + // operator is expected to upgrade the server first. 
+ ct := resp.Header.Get("Content-Type") + if !strings.HasPrefix(ct, "application/x-ndjson") { + resp.Body.Close() + streamCancel() + return nil, ErrLegacyServer + } + + // At this point we have an open NDJSON stream. The retry loop ends. + result, err := readStream(streamCtx, streamCancel, resp.Body, onEvent, c.streamingIdleTimeout, ctx) + streamCancel() + return result, err } return nil, fmt.Errorf("GPU still busy after %d retries — try again later", maxSendRetries) } +// readStream consumes NDJSON lines from body, invokes onEvent for each, and +// returns the SendFilesResponse harvested from the terminal batch_done event. +// streamCancel is called whenever readStream wants to abort the connection +// (idle timeout, decode error, fatal server event). +func readStream( + streamCtx context.Context, + streamCancel context.CancelFunc, + body io.ReadCloser, + onEvent func(ProgressEvent), + idleTimeout time.Duration, + callerCtx context.Context, +) (*SendFilesResponse, error) { + defer body.Close() + + // Idle watchdog — fires streamCancel if no line arrives for idleTimeout. + // idleTimeout=0 disables the watchdog (used by tests when convenient). + lineRead := make(chan struct{}, 1) + if idleTimeout > 0 { + go func() { + timer := time.NewTimer(idleTimeout) + defer timer.Stop() + for { + select { + case <-lineRead: + if !timer.Stop() { + select { + case <-timer.C: + default: + } + } + timer.Reset(idleTimeout) + case <-timer.C: + streamCancel() + return + case <-streamCtx.Done(): + return + } + } + }() + } + + scanner := bufio.NewScanner(body) + // Some chunks may be very large (long file paths or error messages); + // give the scanner room. 1 MiB max-line should cover anything realistic. + scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024) + + var final *SendFilesResponse + var fatalErr error + + for scanner.Scan() { + // Notify watchdog: line arrived, reset idle timer. 
+ select { + case lineRead <- struct{}{}: + default: + } + + line := bytes.TrimSpace(scanner.Bytes()) + if len(line) == 0 { + continue + } + + var ev ProgressEvent + if err := json.Unmarshal(line, &ev); err != nil { + return nil, fmt.Errorf("decode ndjson line: %w (line=%q)", err, line) + } + + if onEvent != nil { + onEvent(ev) + } + + switch ev.Event { + case EventBatchDone: + // Don't return yet — there may be a trailing newline. The + // scanner.Scan() loop will exit naturally on EOF. + final = &SendFilesResponse{ + FilesAccepted: ev.FilesAccepted, + ChunksCreated: ev.ChunksCreated, + FilesProcessedTotal: ev.FilesProcessedTotal, + } + case EventError: + if ev.Fatal { + fatalErr = fmt.Errorf("server error: %s", ev.Message) + } + } + } + + if err := scanner.Err(); err != nil { + // Distinguish caller cancel vs idle timeout vs network error. + if callerCtx.Err() != nil { + return nil, callerCtx.Err() + } + if streamCtx.Err() == context.Canceled && idleTimeout > 0 { + return nil, ErrIdleTimeout + } + return nil, fmt.Errorf("scan ndjson: %w", err) + } + + if fatalErr != nil { + return nil, fatalErr + } + if final == nil { + // Stream ended cleanly but no batch_done — server bug or partial + // write. Surface it so the caller can retry the batch. + return nil, fmt.Errorf("ndjson stream ended without batch_done event") + } + return final, nil +} + // FinishIndex completes the indexing session, removing deleted files. 
func (c *Client) FinishIndex(path string, runID string, deletedPaths []string, totalFiles int) (*FinishIndexResponse, error) { encodedPath := encodeProjectPath(path) diff --git a/cli/internal/client/index_streaming_test.go b/cli/internal/client/index_streaming_test.go new file mode 100644 index 0000000..712134a --- /dev/null +++ b/cli/internal/client/index_streaming_test.go @@ -0,0 +1,331 @@ +package client + +import ( + "bufio" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "net/http/httptest" + "strings" + "sync" + "testing" + "time" +) + +// streamWriter is a tiny convenience for tests that need to push NDJSON +// lines from the server side. It writes one JSON object per call followed +// by a newline, then flushes so the client sees it immediately. +type streamWriter struct { + w http.ResponseWriter + f http.Flusher +} + +func newStreamWriter(t *testing.T, w http.ResponseWriter) *streamWriter { + t.Helper() + f, ok := w.(http.Flusher) + if !ok { + t.Fatal("response writer does not implement Flusher") + } + w.Header().Set("Content-Type", "application/x-ndjson") + w.WriteHeader(http.StatusOK) + f.Flush() + return &streamWriter{w: w, f: f} +} + +func (s *streamWriter) write(t *testing.T, ev ProgressEvent) { + t.Helper() + b, err := json.Marshal(ev) + if err != nil { + t.Fatalf("marshal event: %v", err) + } + if _, err := s.w.Write(append(b, '\n')); err != nil { + t.Logf("write: %v", err) // not fatal — client may have disconnected + } + s.f.Flush() +} + +// TestSendFilesStreaming_BatchDone — happy path: events delivered in order, +// final SendFilesResponse pulled from batch_done event. 
+func TestSendFilesStreaming_BatchDone(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Header.Get("Accept") != "application/x-ndjson" { + t.Errorf("Accept header = %q, want application/x-ndjson", r.Header.Get("Accept")) + } + s := newStreamWriter(t, w) + s.write(t, ProgressEvent{Event: EventFileStarted, Path: "/p/a.go", FileIndex: 1, BatchSize: 2}) + s.write(t, ProgressEvent{Event: EventFileEmbedded, Path: "/p/a.go", Chunks: 3, EmbedMS: 50}) + s.write(t, ProgressEvent{Event: EventFileDone, Path: "/p/a.go", Chunks: 3}) + s.write(t, ProgressEvent{Event: EventFileStarted, Path: "/p/b.go", FileIndex: 2, BatchSize: 2}) + s.write(t, ProgressEvent{Event: EventFileDone, Path: "/p/b.go", Chunks: 2}) + s.write(t, ProgressEvent{ + Event: EventBatchDone, FilesAccepted: 2, ChunksCreated: 5, FilesProcessedTotal: 2, + }) + })) + defer srv.Close() + + c := New(srv.URL, "key") + var events []ProgressEvent + resp, err := c.SendFilesStreaming(context.Background(), "/p", "run-1", []FilePayload{ + {Path: "/p/a.go", Content: "x"}, + {Path: "/p/b.go", Content: "y"}, + }, func(ev ProgressEvent) { + events = append(events, ev) + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if resp.FilesAccepted != 2 || resp.ChunksCreated != 5 { + t.Errorf("resp = %+v, want files=2 chunks=5", resp) + } + if len(events) != 6 { + t.Errorf("events count = %d, want 6", len(events)) + } + if events[0].Event != EventFileStarted { + t.Errorf("events[0] = %q, want %q", events[0].Event, EventFileStarted) + } + if events[len(events)-1].Event != EventBatchDone { + t.Errorf("last event = %q, want %q", events[len(events)-1].Event, EventBatchDone) + } +} + +// TestSendFilesStreaming_Heartbeat verifies heartbeat events make it to the +// callback (not just dropped) and final result still reflects only batch_done. 
+func TestSendFilesStreaming_Heartbeat(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + s := newStreamWriter(t, w) + s.write(t, ProgressEvent{Event: EventHeartbeat, TS: "2026-04-27T17:00:00Z"}) + s.write(t, ProgressEvent{Event: EventHeartbeat, TS: "2026-04-27T17:00:10Z"}) + s.write(t, ProgressEvent{Event: EventBatchDone, FilesAccepted: 0}) + })) + defer srv.Close() + + c := New(srv.URL, "") + heartbeatCount := 0 + resp, err := c.SendFilesStreaming(context.Background(), "/p", "r", nil, func(ev ProgressEvent) { + if ev.Event == EventHeartbeat { + heartbeatCount++ + } + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if heartbeatCount != 2 { + t.Errorf("heartbeat count = %d, want 2", heartbeatCount) + } + if resp.FilesAccepted != 0 { + t.Errorf("resp.FilesAccepted = %d, want 0", resp.FilesAccepted) + } +} + +// TestSendFilesStreaming_LegacyServer ensures we hard-fail when the server +// returns single-JSON instead of NDJSON. No silent fallback — caller learns +// they need to upgrade. +func TestSendFilesStreaming_LegacyServer(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"files_accepted":1,"chunks_created":3,"files_processed_total":1}`)) + })) + defer srv.Close() + + c := New(srv.URL, "") + calledBack := false + _, err := c.SendFilesStreaming(context.Background(), "/p", "r", nil, func(ev ProgressEvent) { + calledBack = true + }) + if !errors.Is(err, ErrLegacyServer) { + t.Errorf("err = %v, want ErrLegacyServer", err) + } + if calledBack { + t.Error("onEvent should not have been called against a legacy server") + } +} + +// TestSendFilesStreaming_IdleTimeout — stall the response indefinitely and +// confirm the watchdog cancels the request. 
+func TestSendFilesStreaming_IdleTimeout(t *testing.T) { + stall := make(chan struct{}) // never closed + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + s := newStreamWriter(t, w) + // Send one event then sit silent until the client times out. + s.write(t, ProgressEvent{Event: EventFileStarted, Path: "/p/x.go"}) + select { + case <-stall: + case <-r.Context().Done(): + } + })) + defer srv.Close() + defer close(stall) + + c := New(srv.URL, "") + c.SetStreamingIdleTimeout(150 * time.Millisecond) + _, err := c.SendFilesStreaming(context.Background(), "/p", "r", nil, nil) + if !errors.Is(err, ErrIdleTimeout) { + t.Errorf("err = %v, want ErrIdleTimeout", err) + } +} + +// TestSendFilesStreaming_FatalError — server emits a fatal error event, +// caller gets a non-nil error containing the message. +func TestSendFilesStreaming_FatalError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + s := newStreamWriter(t, w) + s.write(t, ProgressEvent{Event: EventFileStarted, Path: "/p/x.go"}) + s.write(t, ProgressEvent{Event: EventError, Message: "embedder unavailable", Fatal: true}) + })) + defer srv.Close() + + c := New(srv.URL, "") + _, err := c.SendFilesStreaming(context.Background(), "/p", "r", nil, nil) + if err == nil { + t.Fatal("expected error from fatal event, got nil") + } + if !strings.Contains(err.Error(), "embedder unavailable") { + t.Errorf("error %q does not contain server message", err) + } +} + +// TestSendFilesStreaming_NonStreamingErrorBodyDecoded — when the server +// returns non-200 (e.g. 404 bad run_id), the JSON detail is surfaced. 
+func TestSendFilesStreaming_NonStreamingError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusNotFound) + _, _ = w.Write([]byte(`{"detail":"unknown run_id"}`)) + })) + defer srv.Close() + + c := New(srv.URL, "") + _, err := c.SendFilesStreaming(context.Background(), "/p", "r", nil, nil) + if err == nil { + t.Fatal("expected error, got nil") + } + if !strings.Contains(err.Error(), "unknown run_id") { + t.Errorf("error %q does not surface server detail", err) + } +} + +// TestSendFiles_BackwardCompat — existing public surface still works, +// invoking SendFilesStreaming under the hood. +func TestSendFiles_BackwardCompat(t *testing.T) { + var requestSeen sync.WaitGroup + requestSeen.Add(1) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + defer requestSeen.Done() + if r.Header.Get("Accept") != "application/x-ndjson" { + t.Errorf("SendFiles wrapper must request NDJSON, got Accept=%q", r.Header.Get("Accept")) + } + s := newStreamWriter(t, w) + s.write(t, ProgressEvent{ + Event: EventBatchDone, FilesAccepted: 1, ChunksCreated: 4, FilesProcessedTotal: 1, + }) + })) + defer srv.Close() + + c := New(srv.URL, "") + resp, err := c.SendFiles("/p", "r", []FilePayload{{Path: "/p/x.go"}}) + if err != nil { + t.Fatalf("err: %v", err) + } + if resp.FilesAccepted != 1 || resp.ChunksCreated != 4 { + t.Errorf("resp = %+v", resp) + } + requestSeen.Wait() +} + +// TestSendFilesStreaming_RetryOn503 — server returns 503 with Retry-After, +// then succeeds; client should follow the retry and ultimately get the +// batch_done event without surfacing the temporary failure. 
+func TestSendFilesStreaming_RetryOn503(t *testing.T) {
+	var calls int32
+	var mu sync.Mutex
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		mu.Lock()
+		calls++
+		current := calls
+		mu.Unlock()
+		if current == 1 {
+			w.Header().Set("Retry-After", "1")
+			w.Header().Set("Content-Type", "application/json")
+			w.WriteHeader(http.StatusServiceUnavailable)
+			_, _ = w.Write([]byte(`{"detail":"GPU busy"}`))
+			return
+		}
+		s := newStreamWriter(t, w)
+		s.write(t, ProgressEvent{
+			Event: EventBatchDone, FilesAccepted: 1, ChunksCreated: 2, FilesProcessedTotal: 1,
+		})
+	}))
+	defer srv.Close()
+
+	c := New(srv.URL, "")
+	// The retry path prints a "GPU busy" notice to stdout; harmless in a test.
+	resp, err := c.SendFilesStreaming(context.Background(), "/p", "r", []FilePayload{{Path: "x"}}, nil)
+	if err != nil {
+		t.Fatalf("expected success after retry, got err: %v", err)
+	}
+	if resp.FilesAccepted != 1 {
+		t.Errorf("resp.FilesAccepted = %d, want 1", resp.FilesAccepted)
+	}
+	if calls != 2 {
+		t.Errorf("expected 2 server calls, got %d", calls)
+	}
+}
+
+// TestSendFilesStreaming_CallerCancel — caller cancels ctx mid-stream, the
+// streaming call returns the context error promptly.
+func TestSendFilesStreaming_CallerCancel(t *testing.T) {
+	hold := make(chan struct{})
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		s := newStreamWriter(t, w)
+		s.write(t, ProgressEvent{Event: EventFileStarted, Path: "/p/x.go"})
+		select {
+		case <-hold:
+		case <-r.Context().Done():
+		}
+	}))
+	defer srv.Close()
+	defer close(hold)
+
+	c := New(srv.URL, "")
+	c.SetStreamingIdleTimeout(0) // disable watchdog so we test caller cancel only
+
+	ctx, cancel := context.WithCancel(context.Background())
+
+	// Cancel after the first event is observed.
+ gotEvent := make(chan struct{}) + errCh := make(chan error, 1) + go func() { + _, err := c.SendFilesStreaming(ctx, "/p", "r", nil, func(ev ProgressEvent) { + select { + case gotEvent <- struct{}{}: + default: + } + }) + errCh <- err + }() + + select { + case <-gotEvent: + case <-time.After(2 * time.Second): + t.Fatal("never received first event") + } + cancel() + + select { + case err := <-errCh: + if !errors.Is(err, context.Canceled) { + t.Errorf("err = %v, want context.Canceled", err) + } + case <-time.After(2 * time.Second): + t.Fatal("SendFilesStreaming did not return after cancel") + } +} + +// keep imports honest for a future addition; small no-op compile guard +var _ = bufio.NewScanner +var _ = io.EOF +var _ = fmt.Sprintf diff --git a/cli/internal/client/progress.go b/cli/internal/client/progress.go new file mode 100644 index 0000000..c79d5a5 --- /dev/null +++ b/cli/internal/client/progress.go @@ -0,0 +1,60 @@ +package client + +import "errors" + +// ProgressEvent mirrors server/internal/indexer/progress.go:ProgressEvent. +// Both sides ship in the same PR; the duplication is the cost of keeping +// CLI and server as separate Go modules. +// +// Event values: file_started, file_chunked, file_embedded, file_done, +// file_error, heartbeat, batch_done, error. +type ProgressEvent struct { + Event string `json:"event"` + + // Per-file fields. + Path string `json:"path,omitempty"` + FileIndex int `json:"file_index,omitempty"` + BatchSize int `json:"batch_size,omitempty"` + Chunks int `json:"chunks,omitempty"` + EmbedMS int64 `json:"embed_ms,omitempty"` + + // Heartbeat. + TS string `json:"ts,omitempty"` + + // Errors. + Message string `json:"message,omitempty"` + Fatal bool `json:"fatal,omitempty"` + + // batch_done summary. 
+	FilesAccepted       int `json:"files_accepted,omitempty"`
+	ChunksCreated       int `json:"chunks_created,omitempty"`
+	FilesProcessedTotal int `json:"files_processed_total,omitempty"`
+
+	RunID string `json:"run_id,omitempty"`
+}
+
+// Event kinds — keep in sync with server/internal/indexer/progress.go.
+const (
+	EventFileStarted  = "file_started"
+	EventFileChunked  = "file_chunked"
+	EventFileEmbedded = "file_embedded"
+	EventFileDone     = "file_done"
+	EventFileError    = "file_error"
+	EventHeartbeat    = "heartbeat"
+	EventBatchDone    = "batch_done"
+	EventError        = "error"
+)
+
+// ErrLegacyServer is returned by SendFilesStreaming when the server responds
+// with a non-NDJSON Content-Type — meaning the server predates the streaming
+// protocol. Callers should surface this as "upgrade your server" rather than
+// silently retrying or falling back.
+var ErrLegacyServer = errors.New(
+	"server does not support streaming protocol — upgrade server to a version that supports NDJSON on /index/files",
+)
+
+// ErrIdleTimeout is returned when the streaming response has been silent for
+// longer than the configured idle timeout. The server should be sending at
+// least a heartbeat every 10 seconds, so a full timeout's worth of silence
+// implies the server is hung or the network has stalled.
+var ErrIdleTimeout = errors.New("streaming response idle timeout — no data from server")
diff --git a/cli/internal/config/config.go b/cli/internal/config/config.go
index 61cb06b..1456819 100644
--- a/cli/internal/config/config.go
+++ b/cli/internal/config/config.go
@@ -36,6 +36,12 @@ type ServerConfig struct {
 
 type IndexingConfig struct {
 	BatchSize int `yaml:"batchsize"`
+
+	// StreamingIdleTimeoutSec is the maximum allowed silence on the streaming
+	// /index/files response before the CLI gives up and closes the conn. The
+	// server emits a heartbeat every 10s, so the default 30s tolerates three
+	// missed heartbeats in a row. Set to 0 to disable (not recommended).
+ StreamingIdleTimeoutSec int `yaml:"streaming_idle_timeout_sec"` } type ProjectEntry struct { @@ -68,7 +74,8 @@ func defaults() Config { CacheTTL: 300, }, Indexing: IndexingConfig{ - BatchSize: 20, + BatchSize: 20, + StreamingIdleTimeoutSec: 30, }, } } diff --git a/cli/internal/indexer/indexer.go b/cli/internal/indexer/indexer.go index 4b69bd6..96ec57e 100644 --- a/cli/internal/indexer/indexer.go +++ b/cli/internal/indexer/indexer.go @@ -1,6 +1,7 @@ package indexer import ( + "context" "fmt" "os" "time" @@ -21,7 +22,25 @@ type Result struct { } // Run performs a complete index cycle: begin → discover → diff → send batches → finish. -func Run(apiClient *client.Client, projectPath string, full bool, batchSize int) (*Result, error) { +// +// ctx is honoured for cancellation: a SIGINT-derived ctx (or a watcher's stop +// signal) propagates through to the streaming SendFilesStreaming call, which +// closes the HTTP connection. The server-side streaming handler sees the +// disconnect and frees the project's session lock immediately, so the next +// reindex doesn't hit 409. As a belt-and-braces, this function defers an +// explicit CancelIndex call for the active run on early exit. +// +// mode controls how per-file progress events are rendered. Pass +// AutoProgressMode() for `cix reindex` (TTY-aware), ProgressQuiet for the +// watcher (only summary + errors hit the log). +func Run( + ctx context.Context, + apiClient *client.Client, + projectPath string, + full bool, + batchSize int, + mode ProgressMode, +) (*Result, error) { if batchSize <= 0 { batchSize = defaultBatchSize } @@ -36,6 +55,17 @@ func Run(apiClient *client.Client, projectPath string, full bool, batchSize int) } fmt.Printf(" Session: %s\n", beginResp.RunID) + // Belt-and-braces: if we exit early (ctx cancellation, network error, + // SendFilesStreaming failure), tell the server to release the project + // lock instead of leaving it for the 1-hour TTL. CancelIndex is + // idempotent and fast. 
+ cancelDone := false + defer func() { + if !cancelDone { + _, _ = apiClient.CancelIndex(projectPath) + } + }() + // Phase 2: Discover files on disk fmt.Println("Discovering files...") discovered, err := discovery.Discover(projectPath, discovery.Options{}) @@ -80,8 +110,16 @@ func Run(apiClient *client.Client, projectPath string, full bool, batchSize int) fmt.Printf(" %d file(s) to process\n", len(toProcess)) } - // Phase 3: Send files in batches + // Phase 3: Send files in batches via streaming. Each batch gets its own + // progressRenderer so per-file indices restart from 1 in the renderer's + // context but display globally as (batchOffset+i). for i := 0; i < len(toProcess); i += batchSize { + // Honour ctx cancellation between batches; mid-batch cancellation + // is handled inside SendFilesStreaming. + if err := ctx.Err(); err != nil { + return nil, err + } + end := i + batchSize if end > len(toProcess) { end = len(toProcess) @@ -110,20 +148,28 @@ func Run(apiClient *client.Client, projectPath string, full bool, batchSize int) continue } - resp, err := apiClient.SendFiles(projectPath, beginResp.RunID, payloads) + // batchOffset is 1-based offset of the first payload in this batch + // within the overall toProcess slice. Renderer adds ev.FileIndex + // (which is also 1-based per batch) and prints `[N/total]`. + renderer := newProgressRenderer(mode, len(toProcess), i) + _, err := apiClient.SendFilesStreaming( + ctx, projectPath, beginResp.RunID, payloads, renderer.onEvent, + ) if err != nil { return nil, fmt.Errorf("send files (batch %d-%d): %w", i+1, end, err) } - - fmt.Printf(" Processed %d/%d files (%d chunks)\n", - resp.FilesProcessedTotal, len(toProcess), resp.ChunksCreated) } - // Phase 4: Finish — server cleans up deleted files and finalizes the run + // Phase 4: Finish — server cleans up deleted files and finalizes the run. + // We mark cancelDone before this point so the deferred CancelIndex doesn't + // fire on the happy path. 
+ cancelDone = true finishResp, err := apiClient.FinishIndex( projectPath, beginResp.RunID, deletedPaths, len(discovered), ) if err != nil { + // Restore the deferred cancel — finish failed, lock should be released. + _, _ = apiClient.CancelIndex(projectPath) return nil, fmt.Errorf("finish index: %w", err) } diff --git a/cli/internal/indexer/indexer_test.go b/cli/internal/indexer/indexer_test.go index 7609289..fb490d6 100644 --- a/cli/internal/indexer/indexer_test.go +++ b/cli/internal/indexer/indexer_test.go @@ -1,6 +1,7 @@ package indexer import ( + "context" "crypto/sha1" "crypto/sha256" "encoding/hex" @@ -50,11 +51,11 @@ type indexHandler struct { } func (h *indexHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") p := r.URL.Path switch { case strings.Contains(p, h.hash+"/index/begin"): + w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(map[string]any{ "run_id": "run-test", "stored_hashes": h.beginHashes, @@ -67,7 +68,17 @@ func (h *indexHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { } _ = json.Unmarshal(body, &payload) h.FilesReceived = append(h.FilesReceived, payload.Files...) - json.NewEncoder(w).Encode(map[string]any{ + + // Speak NDJSON — the new client requires it. We emit a single + // batch_done event matching the legacy summary semantics so existing + // assertions on FilesReceived continue to hold. 
+ w.Header().Set("Content-Type", "application/x-ndjson") + w.WriteHeader(http.StatusOK) + if f, ok := w.(http.Flusher); ok { + f.Flush() + } + _ = json.NewEncoder(w).Encode(map[string]any{ + "event": "batch_done", "files_accepted": len(payload.Files), "chunks_created": len(payload.Files), "files_processed_total": len(payload.Files), @@ -80,12 +91,17 @@ func (h *indexHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { } _ = json.Unmarshal(body, &finish) h.DeletedPaths = finish.DeletedPaths + w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(map[string]any{ "status": "ok", "files_processed": len(h.FilesReceived), "chunks_created": len(h.FilesReceived), }) + case strings.Contains(p, h.hash+"/index/cancel"): + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]any{"cancelled": false}) + default: http.NotFound(w, r) } @@ -111,7 +127,7 @@ func TestRun_AddNewFile(t *testing.T) { srv, h := newServer(t, dir, map[string]string{}) c := client.New(srv.URL, "test-key") - result, err := Run(c, dir, false, 0) + result, err := Run(context.Background(), c, dir, false, 0, ProgressQuiet) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -137,7 +153,7 @@ func TestRun_UpdatedFile(t *testing.T) { }) c := client.New(srv.URL, "test-key") - _, err := Run(c, dir, false, 0) + _, err := Run(context.Background(), c, dir, false, 0, ProgressQuiet) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -165,7 +181,7 @@ func TestRun_DeletedFile(t *testing.T) { }) c := client.New(srv.URL, "test-key") - _, err := Run(c, dir, false, 0) + _, err := Run(context.Background(), c, dir, false, 0, ProgressQuiet) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -219,7 +235,7 @@ func TestRun_NoChanges(t *testing.T) { t.Cleanup(srv.Close) c := client.New(srv.URL, "test-key") - _, err := Run(c, dir, false, 0) + _, err := Run(context.Background(), c, dir, false, 0, ProgressQuiet) if err != nil { t.Fatalf("unexpected 
error: %v", err) } @@ -243,7 +259,7 @@ func TestRun_FullReindex(t *testing.T) { srv, h := newServer(t, dir, map[string]string{path: storedHash}) c := client.New(srv.URL, "test-key") - _, err := Run(c, dir, true /* full */, 0) + _, err := Run(context.Background(), c, dir, true /* full */, 0, ProgressQuiet) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -270,7 +286,7 @@ func TestRun_ServerUnavailable(t *testing.T) { srv.Close() c := client.New(srv.URL, "test-key") - _, err := Run(c, dir, false, 0) + _, err := Run(context.Background(), c, dir, false, 0, ProgressQuiet) if err == nil { t.Fatal("expected error when server is unavailable, got nil") } @@ -294,7 +310,7 @@ func TestRun_ServerError5xx(t *testing.T) { t.Cleanup(srv.Close) c := client.New(srv.URL, "test-key") - _, err := Run(c, dir, false, 0) + _, err := Run(context.Background(), c, dir, false, 0, ProgressQuiet) if err == nil { t.Fatal("expected error on 503, got nil") } @@ -317,7 +333,7 @@ func TestRun_RecoveryAfterFailure(t *testing.T) { downSrv.Close() c1 := client.New(downSrv.URL, "test-key") - if _, err := Run(c1, dir, false, 0); err == nil { + if _, err := Run(context.Background(), c1, dir, false, 0, ProgressQuiet); err == nil { t.Fatal("expected error on first run") } @@ -326,7 +342,7 @@ func TestRun_RecoveryAfterFailure(t *testing.T) { srv, h := newServer(t, dir, map[string]string{}) c2 := client.New(srv.URL, "test-key") - _, err := Run(c2, dir, false, 0) + _, err := Run(context.Background(), c2, dir, false, 0, ProgressQuiet) if err != nil { t.Fatalf("expected recovery run to succeed: %v", err) } @@ -352,7 +368,7 @@ func TestRun_MultipleFiles(t *testing.T) { srv, h := newServer(t, dir, map[string]string{}) c := client.New(srv.URL, "test-key") - result, err := Run(c, dir, false, 1 /* batchSize=1 to exercise batching */) + result, err := Run(context.Background(), c, dir, false, 1 /* batchSize=1 to exercise batching */, ProgressQuiet) if err != nil { t.Fatalf("unexpected error: %v", err) } diff 
--git a/cli/internal/indexer/progress.go b/cli/internal/indexer/progress.go new file mode 100644 index 0000000..0acfa6c --- /dev/null +++ b/cli/internal/indexer/progress.go @@ -0,0 +1,205 @@ +package indexer + +import ( + "fmt" + "io" + "os" + "time" + "unicode/utf8" + + "github.com/anthropics/code-index/cli/internal/client" +) + +// ProgressMode controls how SendFilesStreaming events are rendered to the +// user. Reindex on a TTY uses Interactive (in-place status line); reindex +// in CI / non-TTY context uses LineByLine; the watcher uses Quiet (only +// summary + errors land in the log). +type ProgressMode int + +const ( + // ProgressInteractive updates a single status line with carriage returns. + ProgressInteractive ProgressMode = iota + // ProgressLineByLine prints one log line per file_started/file_done. + ProgressLineByLine + // ProgressQuiet only prints file_error and the final batch summary. + ProgressQuiet +) + +// AutoProgressMode returns Interactive when stdout is a terminal and +// LineByLine otherwise. Tests and watchers should pass an explicit mode. +func AutoProgressMode() ProgressMode { + if isTerminal(os.Stdout) { + return ProgressInteractive + } + return ProgressLineByLine +} + +// isTerminal reports whether f is a character device (a TTY). Avoids the +// golang.org/x/term dependency to keep the CLI module's go directive at the +// existing minimum (no toolchain bump for a single-line check). +func isTerminal(f *os.File) bool { + stat, err := f.Stat() + if err != nil { + return false + } + return (stat.Mode() & os.ModeCharDevice) != 0 +} + +// progressRenderer is a stateful event handler. It is created once per batch +// (so per-batch counters reset) and called for every NDJSON event the +// streaming client receives. 
+type progressRenderer struct { + mode ProgressMode + out io.Writer + totalFiles int // total files to process across all batches + batchOffset int // index of the first file in this batch (1-based) + fileStart time.Time // when the current file's file_started arrived + + // lastLineRunes tracks the visible width of the last status line we + // drew so we can erase it before redrawing. We count runes (not bytes) + // because UTF-8 multi-byte chars like `…` would otherwise inflate the + // padding and the cursor would land past the visible end, leaving the + // tail of the previous line on screen. + lastLineRunes int + + // activeFile is the path we're currently rendering progress for. + activeFile string + + // activeFileIdx caches the global file index from file_started so it + // can be reused on file_chunked / file_embedded / file_done — which + // the server emits without a FileIndex field. Without this cache, the + // renderer fell back to ev.FileIndex == 0 and printed `[0/N]` on every + // file_embedded line. + activeFileIdx int +} + +func newProgressRenderer(mode ProgressMode, totalFiles, batchOffset int) *progressRenderer { + return &progressRenderer{ + mode: mode, + out: os.Stdout, + totalFiles: totalFiles, + batchOffset: batchOffset, + } +} + +// onEvent is the callback fed to client.SendFilesStreaming. 
+func (r *progressRenderer) onEvent(ev client.ProgressEvent) { + switch r.mode { + case ProgressInteractive: + r.renderInteractive(ev) + case ProgressLineByLine: + r.renderLineByLine(ev) + case ProgressQuiet: + r.renderQuiet(ev) + } +} + +func (r *progressRenderer) renderInteractive(ev client.ProgressEvent) { + switch ev.Event { + case client.EventFileStarted: + r.activeFile = ev.Path + r.activeFileIdx = r.batchOffset + ev.FileIndex + r.fileStart = time.Now() + r.statusLine(fmt.Sprintf("[%d/%d] %s (chunking…)", + r.activeFileIdx, r.totalFiles, ev.Path)) + + case client.EventFileEmbedded: + // FileIndex is not populated on file_embedded; reuse the cached + // value from the matching file_started. + r.statusLine(fmt.Sprintf("[%d/%d] %s (embedded %d chunks, %dms)", + r.activeFileIdx, r.totalFiles, ev.Path, ev.Chunks, ev.EmbedMS)) + + case client.EventFileDone: + // Leave the embedded line — file_done arrives so quickly that + // rewriting it again is just flicker. + + case client.EventHeartbeat: + if r.activeFile != "" { + elapsed := time.Since(r.fileStart).Round(time.Second) + r.statusLine(fmt.Sprintf("[%d/%d] %s · %s elapsed", + r.activeFileIdx, r.totalFiles, r.activeFile, elapsed)) + } + + case client.EventFileError: + r.endStatusLine() + fmt.Fprintf(r.out, " ! %s: %s\n", ev.Path, ev.Message) + + case client.EventBatchDone: + r.endStatusLine() + fmt.Fprintf(r.out, " Processed %d/%d files (%d chunks)\n", + ev.FilesProcessedTotal, r.totalFiles, ev.ChunksCreated) + + case client.EventError: + if ev.Fatal { + r.endStatusLine() + fmt.Fprintf(r.out, " ! server error: %s\n", ev.Message) + } + } +} + +func (r *progressRenderer) renderLineByLine(ev client.ProgressEvent) { + switch ev.Event { + case client.EventFileStarted: + idx := r.batchOffset + ev.FileIndex + fmt.Fprintf(r.out, " [%d/%d] %s\n", idx, r.totalFiles, ev.Path) + case client.EventFileError: + fmt.Fprintf(r.out, " ! 
%s: %s\n", ev.Path, ev.Message) + case client.EventBatchDone: + fmt.Fprintf(r.out, " Processed %d/%d files (%d chunks)\n", + ev.FilesProcessedTotal, r.totalFiles, ev.ChunksCreated) + case client.EventError: + if ev.Fatal { + fmt.Fprintf(r.out, " ! server error: %s\n", ev.Message) + } + } +} + +func (r *progressRenderer) renderQuiet(ev client.ProgressEvent) { + switch ev.Event { + case client.EventFileError: + fmt.Fprintf(r.out, " ! %s: %s\n", ev.Path, ev.Message) + case client.EventBatchDone: + fmt.Fprintf(r.out, " Processed %d/%d files (%d chunks)\n", + ev.FilesProcessedTotal, r.totalFiles, ev.ChunksCreated) + case client.EventError: + if ev.Fatal { + fmt.Fprintf(r.out, " ! server error: %s\n", ev.Message) + } + } +} + +// statusLine clears the previous line (overwriting with spaces, then \r) and +// writes the new line without a trailing newline. Avoids ANSI escapes so it +// works in any terminal. Width is measured in runes — len() on a string with +// `…` (U+2026, 3 bytes) would over-pad and leave residue from the previous +// line at the right edge. +func (r *progressRenderer) statusLine(s string) { + runes := utf8.RuneCountInString(s) + if runes < r.lastLineRunes { + // Pad with spaces to erase the longer previous text, then \r back + // so the next write overwrites again. + fmt.Fprintf(r.out, "\r%s", s+spaces(r.lastLineRunes-runes)) + } else { + fmt.Fprintf(r.out, "\r%s", s) + } + r.lastLineRunes = runes +} + +// endStatusLine writes a newline so subsequent output starts on a fresh line. 
+func (r *progressRenderer) endStatusLine() { + if r.lastLineRunes > 0 { + fmt.Fprintln(r.out) + r.lastLineRunes = 0 + } +} + +func spaces(n int) string { + if n <= 0 { + return "" + } + b := make([]byte, n) + for i := range b { + b[i] = ' ' + } + return string(b) +} diff --git a/cli/internal/watcher/watcher.go b/cli/internal/watcher/watcher.go index 3c21371..000f216 100644 --- a/cli/internal/watcher/watcher.go +++ b/cli/internal/watcher/watcher.go @@ -1,6 +1,7 @@ package watcher import ( + "context" "fmt" "log" "os" @@ -463,11 +464,27 @@ func (w *Watcher) runIndexer(full bool) { } }() + // Bridge stopCh → ctx for the duration of this indexing run so + // SendFilesStreaming bails out fast when the watcher is stopped. + ctx, cancelRun := context.WithCancel(context.Background()) + stopBridge := make(chan struct{}) + go func() { + select { + case <-w.stopCh: + cancelRun() + case <-stopBridge: + } + }() + defer func() { + cancelRun() + close(stopBridge) + }() + // Run with transient failure retries var err error for attempt := 0; attempt < 3; attempt++ { var result *indexer.Result - result, err = indexer.Run(w.apiClient, w.projectPath, full, 0) + result, err = indexer.Run(ctx, w.apiClient, w.projectPath, full, 0, indexer.ProgressQuiet) if err == nil { if full { w.logger.Printf("Full reindex complete: %d files, %d chunks (run ID: %s)", diff --git a/cli/internal/watcher/watcher_test.go b/cli/internal/watcher/watcher_test.go index d4fa4a9..d5705e8 100644 --- a/cli/internal/watcher/watcher_test.go +++ b/cli/internal/watcher/watcher_test.go @@ -55,12 +55,15 @@ func newTestWatcher(t *testing.T, projectPath, apiURL string) *Watcher { } // newIndexServer sets up a minimal mock that handles the three-phase index -// protocol and counts how many times each phase was called. +// protocol and counts how many times each phase was called. 
/index/files +// emits an NDJSON stream when the client sends Accept: application/x-ndjson +// (matching the streaming protocol the production server speaks). type serverCalls struct { mu sync.Mutex Begin int Files int Finish int + Cancel int } func newIndexServer(t *testing.T, dir string) (*httptest.Server, *serverCalls) { @@ -69,30 +72,50 @@ func newIndexServer(t *testing.T, dir string) (*httptest.Server, *serverCalls) { hash := projectHash(dir) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") p := r.URL.Path calls.mu.Lock() - defer calls.mu.Unlock() switch { case strings.Contains(p, hash+"/index/begin"): calls.Begin++ + calls.mu.Unlock() + w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(map[string]any{ "run_id": "run-watch", "stored_hashes": map[string]string{}, }) case strings.Contains(p, hash+"/index/files"): calls.Files++ + calls.mu.Unlock() io.ReadAll(r.Body) //nolint - json.NewEncoder(w).Encode(map[string]any{ - "files_accepted": 0, "chunks_created": 0, "files_processed_total": 0, + // Always speak NDJSON — the production server does the negotiation + // based on the Accept header; for tests we emit the new protocol + // unconditionally because the new client always opts in. 
+ w.Header().Set("Content-Type", "application/x-ndjson") + w.WriteHeader(http.StatusOK) + if f, ok := w.(http.Flusher); ok { + f.Flush() + } + _ = json.NewEncoder(w).Encode(map[string]any{ + "event": "batch_done", + "files_accepted": 0, + "chunks_created": 0, + "files_processed_total": 0, }) case strings.Contains(p, hash+"/index/finish"): calls.Finish++ + calls.mu.Unlock() + w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(map[string]any{ "status": "ok", "files_processed": 0, "chunks_created": 0, }) + case strings.Contains(p, hash+"/index/cancel"): + calls.Cancel++ + calls.mu.Unlock() + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]any{"cancelled": false}) default: + calls.mu.Unlock() http.NotFound(w, r) } })) diff --git a/server/internal/httpapi/indexing.go b/server/internal/httpapi/indexing.go index bce9d76..4065a1e 100644 --- a/server/internal/httpapi/indexing.go +++ b/server/internal/httpapi/indexing.go @@ -1,13 +1,17 @@ package httpapi import ( + "context" "encoding/json" "errors" "net/http" "strconv" + "strings" + "time" "github.com/dvcdsys/code-index/server/internal/embeddings" "github.com/dvcdsys/code-index/server/internal/indexer" + "github.com/dvcdsys/code-index/server/internal/projects" ) // --------------------------------------------------------------------------- @@ -143,6 +147,15 @@ func indexFilesHandler(d Deps) http.HandlerFunc { } } + // Negotiate streaming vs single-JSON via Accept header. Old CLIs do + // not advertise application/x-ndjson, so they keep getting the legacy + // blocking response. New CLIs explicitly request the stream and get + // per-file progress + heartbeats. 
+ if acceptsNDJSON(r.Header.Get("Accept")) { + indexFilesStreamingHandler(d, p, body.RunID, files, w, r) + return + } + accepted, chunks, total, err := d.Indexer.ProcessFiles(r.Context(), p.HostPath, body.RunID, files) if err != nil { if retry, busy := embeddings.IsBusy(err); busy { @@ -167,6 +180,140 @@ func indexFilesHandler(d Deps) http.HandlerFunc { } } +// acceptsNDJSON returns true when the Accept header advertises +// application/x-ndjson. Comma-separated values are inspected; q-values are +// ignored (presence is sufficient — the client opted in). +func acceptsNDJSON(accept string) bool { + for _, part := range strings.Split(accept, ",") { + // Strip parameters (q=…) and surrounding whitespace. + mediaType := strings.TrimSpace(part) + if i := strings.IndexByte(mediaType, ';'); i >= 0 { + mediaType = strings.TrimSpace(mediaType[:i]) + } + if strings.EqualFold(mediaType, "application/x-ndjson") { + return true + } + } + return false +} + +// streamingHeartbeatInterval is how often we emit a heartbeat event when no +// file-level progress has been sent. Idle on the wire ≤ heartbeatInterval + +// embedder slack, well under the client's default 30s read deadline. Var +// (not const) so tests can shrink it to keep the suite fast. +var streamingHeartbeatInterval = 10 * time.Second + +// streamingDisconnectCancelTimeout bounds how long we spend cleaning up a +// session after the client disconnects. +const streamingDisconnectCancelTimeout = 5 * time.Second + +// indexFilesStreamingHandler writes one NDJSON event per line with per-file +// progress and 10-second heartbeats. When the client disconnects mid-batch +// we call CancelIndexing so the session lock is released immediately rather +// than lingering until the 1-hour TTL. 
+func indexFilesStreamingHandler( + d Deps, + p *projects.Project, + runID string, + files []indexer.FilePayload, + w http.ResponseWriter, + r *http.Request, +) { + flusher, ok := w.(http.Flusher) + if !ok { + // httptest.ResponseRecorder and a few mock servers don't implement + // Flusher. Falling back to writeError keeps tests readable while + // still pointing at the misuse. + writeError(w, http.StatusInternalServerError, "streaming not supported by HTTP transport") + return + } + + w.Header().Set("Content-Type", "application/x-ndjson") + w.Header().Set("Cache-Control", "no-cache") + // X-Accel-Buffering disables proxy buffering on nginx; harmless elsewhere. + w.Header().Set("X-Accel-Buffering", "no") + w.WriteHeader(http.StatusOK) + flusher.Flush() + + progress := make(chan indexer.ProgressEvent, 32) + + // streamCtx is a child of r.Context() so client-disconnect propagation + // works automatically, but we keep our own cancel handle so a *write* + // failure (broken pipe before Go's read goroutine notices the FIN) can + // also unblock the indexer goroutine immediately. Otherwise the embedder + // would keep computing wasted GPU work until r.Context() eventually fires. + streamCtx, cancelStream := context.WithCancel(r.Context()) + defer cancelStream() + + go func() { + defer close(progress) + _, _, _, _ = d.Indexer.ProcessFilesStreaming(streamCtx, p.HostPath, runID, files, progress) + }() + + ticker := time.NewTicker(streamingHeartbeatInterval) + defer ticker.Stop() + + encoder := json.NewEncoder(w) + clientGone := false + + // markClientGone is the single place where we transition into the "drain + // progress until the indexer exits" mode. Cancelling streamCtx makes the + // embedder's ctx.Done() select fire so the indexer returns within ms + // rather than completing wasted work. 
+ markClientGone := func() { + if clientGone { + return + } + clientGone = true + cancelStream() + } + + for { + select { + case ev, open := <-progress: + if !open { + // ProcessFilesStreaming has returned and closed the channel. + // If the client disconnected mid-flight, free the session + // lock immediately so a follow-up reindex doesn't hit 409. + if clientGone { + d.Logger.Warn("streaming: client disconnected mid-batch, cancelling session", + "run_id", runID, "project", p.HostPath) + cancelCtx, cancel := context.WithTimeout( + context.Background(), streamingDisconnectCancelTimeout) + _, _ = d.Indexer.CancelIndexing(cancelCtx, p.HostPath) + cancel() + } + return + } + if clientGone { + continue // drain to let ProcessFilesStreaming finish + } + if err := encoder.Encode(ev); err != nil { + markClientGone() + continue + } + flusher.Flush() + case <-ticker.C: + if clientGone { + continue + } + if err := encoder.Encode(indexer.ProgressEvent{ + Event: indexer.EventHeartbeat, + TS: indexer.NowTS(), + }); err != nil { + markClientGone() + continue + } + flusher.Flush() + case <-r.Context().Done(): + // Client disconnected (or request context cancelled by router). + // Set clientGone and cancel the indexer's ctx so it returns now. 
+ d.Logger.Debug("streaming: r.Context() done", "run_id", runID, "err", r.Context().Err()) + markClientGone() + } + } +} + // --------------------------------------------------------------------------- // POST /api/v1/projects/{path}/index/finish // --------------------------------------------------------------------------- diff --git a/server/internal/httpapi/indexing_streaming_test.go b/server/internal/httpapi/indexing_streaming_test.go new file mode 100644 index 0000000..cbb1a68 --- /dev/null +++ b/server/internal/httpapi/indexing_streaming_test.go @@ -0,0 +1,444 @@ +package httpapi + +import ( + "bufio" + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "log/slog" + "net/http" + "net/http/httptest" + "strings" + "sync" + "testing" + "time" + + "github.com/dvcdsys/code-index/server/internal/indexer" +) + +// slogDiscard returns a logger that discards all output — used to keep test +// stdout quiet while still satisfying the non-nil Logger contract some code +// paths rely on (Warn/Debug/Info on a nil *slog.Logger panics). +func slogDiscard() *slog.Logger { + return slog.New(slog.NewTextHandler(io.Discard, nil)) +} + +// flushRecorder is an http.ResponseWriter that supports http.Flusher and +// records every write so tests can observe streamed output. It is safe to +// access from a single test goroutine plus the handler goroutine — the +// shared buffer is mutex-protected. 
+type flushRecorder struct { + mu sync.Mutex + buf bytes.Buffer + header http.Header + status int + written chan struct{} +} + +func newFlushRecorder() *flushRecorder { + return &flushRecorder{ + header: make(http.Header), + written: make(chan struct{}, 1), + } +} + +func (r *flushRecorder) Header() http.Header { return r.header } + +func (r *flushRecorder) Write(p []byte) (int, error) { + r.mu.Lock() + n, err := r.buf.Write(p) + r.mu.Unlock() + select { + case r.written <- struct{}{}: + default: + } + return n, err +} + +func (r *flushRecorder) WriteHeader(s int) { r.status = s } + +func (r *flushRecorder) Flush() {} // no-op — buf is already coherent + +// waitForBytes blocks until the recorder accumulates at least min bytes or +// the timeout elapses. Returns true on success. +func (r *flushRecorder) waitForBytes(timeout time.Duration, min int) bool { + deadline := time.After(timeout) + for { + r.mu.Lock() + got := r.buf.Len() + r.mu.Unlock() + if got >= min { + return true + } + select { + case <-r.written: + // loop + case <-deadline: + return false + } + } +} + +// streamingTestServer spins up a real httptest.Server so the streaming +// handler gets an http.ResponseWriter that implements Flusher (which +// httptest.ResponseRecorder does not). +func streamingTestServer(t *testing.T, projectPath string) (*httptest.Server, string) { + t.Helper() + d, hash := newIndexerTestDeps(t, projectPath) + srv := httptest.NewServer(NewRouter(d)) + t.Cleanup(srv.Close) + return srv, hash +} + +// blockingEmbedder is a fakeEmbedder that waits on a channel before returning. +// Used to simulate a slow embedder so the disconnect test can interrupt mid-batch +// before ProcessFilesStreaming completes naturally. 
+type blockingEmbedder struct { + fakeEmbedder + release chan struct{} // close to allow EmbedTexts to proceed +} + +func (b *blockingEmbedder) EmbedTexts(ctx context.Context, texts []string) ([][]float32, error) { + select { + case <-b.release: + case <-ctx.Done(): + return nil, ctx.Err() + } + return b.fakeEmbedder.EmbedTexts(ctx, texts) +} + + +// readNDJSONLines reads NDJSON until either io.EOF or until limit lines have +// been collected. Returns the parsed events. +func readNDJSONLines(t *testing.T, body io.Reader, limit int) []indexer.ProgressEvent { + t.Helper() + scanner := bufio.NewScanner(body) + scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024) + var events []indexer.ProgressEvent + for scanner.Scan() { + line := bytes.TrimSpace(scanner.Bytes()) + if len(line) == 0 { + continue + } + var ev indexer.ProgressEvent + if err := json.Unmarshal(line, &ev); err != nil { + t.Fatalf("decode ndjson line %q: %v", line, err) + } + events = append(events, ev) + if limit > 0 && len(events) >= limit { + return events + } + } + if err := scanner.Err(); err != nil && err != io.EOF { + t.Fatalf("scan: %v", err) + } + return events +} + +// beginSession is a small helper: starts a session, returns run_id. 
+func beginSession(t *testing.T, baseURL, hash string) string { + t.Helper() + resp, err := http.Post( + baseURL+"/api/v1/projects/"+hash+"/index/begin", + "application/json", + strings.NewReader(`{"full":true}`), + ) + if err != nil { + t.Fatalf("begin: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != 200 { + body, _ := io.ReadAll(resp.Body) + t.Fatalf("begin status=%d body=%s", resp.StatusCode, body) + } + var br indexBeginResponse + if err := json.NewDecoder(resp.Body).Decode(&br); err != nil { + t.Fatalf("decode begin: %v", err) + } + return br.RunID +} + +func newFilesRequestBody(t *testing.T, runID string, files map[string]string) []byte { + t.Helper() + payload := map[string]any{ + "run_id": runID, + "files": []map[string]any{}, + } + for path, content := range files { + payload["files"] = append(payload["files"].([]map[string]any), map[string]any{ + "path": path, + "content": content, + "content_hash": shaHex(content), + "language": "go", + "size": len(content), + }) + } + b, err := json.Marshal(payload) + if err != nil { + t.Fatalf("marshal: %v", err) + } + return b +} + +// TestIndexFilesStreaming_BatchDone exercises the happy path: NDJSON event +// stream contains file_started + batch_done with the expected counts. 
+func TestIndexFilesStreaming_BatchDone(t *testing.T) { + srv, hash := streamingTestServer(t, "/proj") + runID := beginSession(t, srv.URL, hash) + + body := newFilesRequestBody(t, runID, map[string]string{ + "/proj/a.go": "package main\nfunc A() int { return 1 }\n", + "/proj/b.go": "package main\nfunc B() int { return 2 }\n", + }) + + req, _ := http.NewRequest( + http.MethodPost, + srv.URL+"/api/v1/projects/"+hash+"/index/files", + bytes.NewReader(body), + ) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "application/x-ndjson") + + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatalf("post: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != 200 { + out, _ := io.ReadAll(resp.Body) + t.Fatalf("status=%d body=%s", resp.StatusCode, out) + } + if ct := resp.Header.Get("Content-Type"); ct != "application/x-ndjson" { + t.Errorf("Content-Type=%q, want application/x-ndjson", ct) + } + + events := readNDJSONLines(t, resp.Body, 0) + if len(events) == 0 { + t.Fatal("no events in stream") + } + + last := events[len(events)-1] + if last.Event != indexer.EventBatchDone { + t.Fatalf("last event = %q, want %q (events: %v)", last.Event, indexer.EventBatchDone, summarizeEvents(events)) + } + if last.FilesAccepted != 2 { + t.Errorf("files_accepted=%d, want 2", last.FilesAccepted) + } + if last.ChunksCreated == 0 { + t.Errorf("chunks_created=0") + } + + // Exactly two file_started events must appear — one per submitted file. + startedCount := 0 + for _, e := range events { + if e.Event == indexer.EventFileStarted { + startedCount++ + } + } + if startedCount != 2 { + t.Errorf("file_started count=%d, want 2", startedCount) + } +} + +// TestIndexFilesStreaming_LegacyCompat verifies that requests without an +// Accept: application/x-ndjson header keep getting the existing single-JSON +// response. This is the regression guard for old CLIs against a new server. 
+func TestIndexFilesStreaming_LegacyCompat(t *testing.T) { + srv, hash := streamingTestServer(t, "/proj") + runID := beginSession(t, srv.URL, hash) + + body := newFilesRequestBody(t, runID, map[string]string{ + "/proj/x.go": "package main\nfunc X() {}\n", + }) + + req, _ := http.NewRequest( + http.MethodPost, + srv.URL+"/api/v1/projects/"+hash+"/index/files", + bytes.NewReader(body), + ) + req.Header.Set("Content-Type", "application/json") + // No Accept header → legacy path. + + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatalf("post: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != 200 { + out, _ := io.ReadAll(resp.Body) + t.Fatalf("status=%d body=%s", resp.StatusCode, out) + } + ct := resp.Header.Get("Content-Type") + if !strings.HasPrefix(ct, "application/json") { + t.Errorf("Content-Type=%q, want application/json (legacy)", ct) + } + + var legacy indexFilesResponse + if err := json.NewDecoder(resp.Body).Decode(&legacy); err != nil { + t.Fatalf("decode: %v", err) + } + if legacy.FilesAccepted != 1 { + t.Errorf("files_accepted=%d, want 1", legacy.FilesAccepted) + } +} + +// TestIndexFilesStreaming_AcceptOnly verifies that Accept headers without +// application/x-ndjson (e.g. */*) take the legacy path — only an explicit +// streaming opt-in upgrades the protocol. 
+func TestIndexFilesStreaming_AcceptOnly(t *testing.T) { + srv, hash := streamingTestServer(t, "/proj") + runID := beginSession(t, srv.URL, hash) + + body := newFilesRequestBody(t, runID, map[string]string{ + "/proj/y.go": "package main\nfunc Y() {}\n", + }) + + req, _ := http.NewRequest( + http.MethodPost, + srv.URL+"/api/v1/projects/"+hash+"/index/files", + bytes.NewReader(body), + ) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "*/*") + + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatalf("post: %v", err) + } + defer resp.Body.Close() + + ct := resp.Header.Get("Content-Type") + if !strings.HasPrefix(ct, "application/json") { + t.Errorf("Accept=*/* should still get legacy JSON, got Content-Type=%q", ct) + } +} + +// TestIndexFilesStreaming_ClientDisconnect verifies that cancelling the +// request context mid-batch frees the session lock. We call the handler +// directly with a context we control rather than relying on Go's net/http +// to detect a client TCP disconnect — that detection is best-effort and +// unreliable in unit-test timeframes (it depends on the OS noticing FIN +// during a write or read goroutine, which can take seconds with chunked +// encoding even when the client has already closed). Cancelling the +// request's context is the same signal the server reacts to in production +// (chi propagates it from the underlying http.Request). +func TestIndexFilesStreaming_ClientDisconnect(t *testing.T) { + // Heartbeat shrunk so the inner ticker case fires reliably during the test. 
+ prevHB := streamingHeartbeatInterval + streamingHeartbeatInterval = 50 * time.Millisecond + t.Cleanup(func() { streamingHeartbeatInterval = prevHB }) + + emb := &blockingEmbedder{ + fakeEmbedder: fakeEmbedder{dim: 16}, + release: make(chan struct{}), + } + d, hash := newIndexerTestDeps(t, "/proj") + d.EmbeddingSvc = emb + d.Indexer = indexer.New(d.DB, d.VectorStore, emb, slogDiscard()) + d.Logger = slogDiscard() + router := NewRouter(d) + + // Begin a session so we have a valid run_id. + beginW := httptest.NewRecorder() + beginReq := httptest.NewRequest(http.MethodPost, + "/api/v1/projects/"+hash+"/index/begin", + strings.NewReader(`{"full":true}`)) + beginReq.Header.Set("Content-Type", "application/json") + router.ServeHTTP(beginW, beginReq) + if beginW.Code != 200 { + t.Fatalf("begin: status=%d body=%s", beginW.Code, beginW.Body) + } + var br indexBeginResponse + _ = json.Unmarshal(beginW.Body.Bytes(), &br) + + files := map[string]string{} + for i := 0; i < 5; i++ { + files[fmt.Sprintf("/proj/file_%d.go", i)] = + "package main\nfunc F() int { return 1 }\n" + } + body := newFilesRequestBody(t, br.RunID, files) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + req := httptest.NewRequest(http.MethodPost, + "/api/v1/projects/"+hash+"/index/files", + bytes.NewReader(body)).WithContext(ctx) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "application/x-ndjson") + + rw := newFlushRecorder() + + // Serve in a goroutine because the handler will block on the embedder + // until we cancel ctx. + done := make(chan struct{}) + go func() { + router.ServeHTTP(rw, req) + close(done) + }() + + // Wait for the first NDJSON line to reach our recorder — proves the + // handler is running and ProcessFilesStreaming is engaged. + if !rw.waitForBytes(2*time.Second, 10) { + t.Fatal("no bytes written before disconnect deadline") + } + + // Disconnect: the request ctx is what chi passes through to r.Context(). 
+ cancel() + + // Handler must return promptly after ctx cancel. + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("handler did not return within 2s after ctx cancel") + } + + // A new /index/begin must succeed: proves CancelIndexing was called. + begin2W := httptest.NewRecorder() + begin2Req := httptest.NewRequest(http.MethodPost, + "/api/v1/projects/"+hash+"/index/begin", + strings.NewReader(`{"full":true}`)) + begin2Req.Header.Set("Content-Type", "application/json") + router.ServeHTTP(begin2W, begin2Req) + if begin2W.Code != 200 { + t.Fatalf("begin after disconnect: status=%d body=%s — session lock not released", + begin2W.Code, begin2W.Body) + } +} + +// TestAcceptsNDJSON unit-tests the Accept header parser. +func TestAcceptsNDJSON(t *testing.T) { + cases := []struct { + header string + want bool + }{ + {"application/x-ndjson", true}, + {"application/x-ndjson; q=1.0", true}, + {"application/json, application/x-ndjson", true}, + {" application/x-ndjson ", true}, + {"application/X-NDJSON", true}, // case-insensitive + {"*/*", false}, + {"application/json", false}, + {"", false}, + } + for _, c := range cases { + if got := acceptsNDJSON(c.header); got != c.want { + t.Errorf("acceptsNDJSON(%q) = %v, want %v", c.header, got, c.want) + } + } +} + +func summarizeEvents(events []indexer.ProgressEvent) string { + var b strings.Builder + for i, e := range events { + if i > 0 { + b.WriteString(", ") + } + b.WriteString(e.Event) + } + return b.String() +} diff --git a/server/internal/indexer/indexer.go b/server/internal/indexer/indexer.go index 9839ef6..be76d1f 100644 --- a/server/internal/indexer/indexer.go +++ b/server/internal/indexer/indexer.go @@ -269,9 +269,36 @@ func (s *Service) ProcessFiles( ctx context.Context, projectPath, runID string, files []FilePayload, +) (int, int, int, error) { + return s.ProcessFilesStreaming(ctx, projectPath, runID, files, nil) +} + +// ProcessFilesStreaming is ProcessFiles with an optional progress channel. 
The +// streaming HTTP handler passes a channel that forwards each event as an +// NDJSON line; non-streaming callers use ProcessFiles which passes nil. +// +// The terminal event (batch_done on success, error fatal=true on failure) is +// sent with a guaranteed-blocking send so the consumer always sees it. +// Per-file progress events use a non-blocking send and may be dropped if the +// consumer is slower than the embed loop — that is acceptable because the +// final summary is what callers depend on. +// +// When progress is non-nil, the channel is left open on return; the caller +// is expected to close it after collecting the terminal event. +func (s *Service) ProcessFilesStreaming( + ctx context.Context, + projectPath, runID string, + files []FilePayload, + progress chan<- ProgressEvent, ) (int, int, int, error) { sess, err := s.requireSession(runID, projectPath) if err != nil { + emitTerminal(progress, ProgressEvent{ + Event: EventError, + Message: err.Error(), + Fatal: true, + RunID: runID, + }) return 0, 0, 0, err } @@ -303,12 +330,28 @@ func (s *Service) ProcessFiles( } }() - for _, fp := range files { + for fi, fp := range files { + // file_started — emit even for files we'll skip below, so the client + // counter advances monotonically and rendering stays aligned with N. 
+ progressSend(progress, ProgressEvent{ + Event: EventFileStarted, + Path: fp.Path, + FileIndex: fi + 1, + BatchSize: len(files), + RunID: runID, + }) + if strings.TrimSpace(fp.Content) == "" { continue } if len(fp.Content) > maxContentBytes { s.logger.Warn("indexer: file too large, skipping", "path", fp.Path, "size_bytes", len(fp.Content)) + progressSend(progress, ProgressEvent{ + Event: EventFileError, + Path: fp.Path, + Message: fmt.Sprintf("file too large (%d bytes)", len(fp.Content)), + Fatal: false, + }) continue } @@ -320,11 +363,22 @@ func (s *Service) ProcessFiles( chunks, refs, err := chunker.ChunkFile(fp.Path, fp.Content, language, 0) if err != nil { s.logger.Warn("indexer: chunk file failed", "path", fp.Path, "err", err) + progressSend(progress, ProgressEvent{ + Event: EventFileError, + Path: fp.Path, + Message: "chunk: " + err.Error(), + Fatal: false, + }) continue } if len(chunks) == 0 { continue } + progressSend(progress, ProgressEvent{ + Event: EventFileChunked, + Path: fp.Path, + Chunks: len(chunks), + }) // Symbol extraction — mirrors Python: function|class|method|type with a name. fileSymbols := make([]symbolindex.Symbol, 0, len(chunks)) @@ -366,6 +420,7 @@ func (s *Service) ProcessFiles( texts[i] = c.ChunkType + ": " + c.Content } var embs [][]float32 + embedStart := time.Now() if tae, ok := s.emb.(TokenAwareEmbedder); ok { embs, err = tae.TokenizeAndEmbed(ctx, texts) } else { @@ -374,16 +429,38 @@ func (s *Service) ProcessFiles( if err != nil { // Propagate ErrBusy so handler can map to 503 + Retry-After. 
if _, busy := embeddings.IsBusy(err); busy { + emitTerminal(progress, ProgressEvent{ + Event: EventError, + Message: err.Error(), + Fatal: true, + }) return filesAccepted, batchChunks, sess.filesProcessed, err } if errors.Is(err, embeddings.ErrDisabled) || errors.Is(err, embeddings.ErrSupervisor) || errors.Is(err, embeddings.ErrNotReady) { + emitTerminal(progress, ProgressEvent{ + Event: EventError, + Message: err.Error(), + Fatal: true, + }) return filesAccepted, batchChunks, sess.filesProcessed, err } s.logger.Error("indexer: embed texts failed", "path", fp.Path, "err", err) + progressSend(progress, ProgressEvent{ + Event: EventFileError, + Path: fp.Path, + Message: "embed: " + err.Error(), + Fatal: false, + }) continue } + progressSend(progress, ProgressEvent{ + Event: EventFileEmbedded, + Path: fp.Path, + Chunks: len(chunks), + EmbedMS: time.Since(embedStart).Milliseconds(), + }) // Per-file SAVEPOINT so a partial failure rolls back only this file. // savepointName is derived from filesAccepted (monotonically increasing @@ -458,6 +535,11 @@ func (s *Service) ProcessFiles( } if _, err := tx.ExecContext(ctx, "RELEASE SAVEPOINT "+savepointName); err != nil { + emitTerminal(progress, ProgressEvent{ + Event: EventError, + Message: "release savepoint: " + err.Error(), + Fatal: true, + }) return filesAccepted, batchChunks, sess.filesProcessed, fmt.Errorf("release savepoint: %w", err) } @@ -469,6 +551,12 @@ func (s *Service) ProcessFiles( sess.languagesSeen[language] = struct{}{} s.mu.Unlock() filesAccepted++ + + progressSend(progress, ProgressEvent{ + Event: EventFileDone, + Path: fp.Path, + Chunks: len(chunks), + }) } // M2 — these upserts are part of the outer tx. Any failure returns the @@ -476,16 +564,31 @@ func (s *Service) ProcessFiles( // below only advance on a successful commit. 
if len(batchSymbols) > 0 { if err := symbolindex.UpsertSymbolsTx(ctx, tx, projectPath, batchSymbols); err != nil { + emitTerminal(progress, ProgressEvent{ + Event: EventError, + Message: "upsert symbols: " + err.Error(), + Fatal: true, + }) return filesAccepted, batchChunks, sess.filesProcessed, fmt.Errorf("upsert symbols: %w", err) } } if len(batchRefs) > 0 { if err := symbolindex.UpsertReferencesTx(ctx, tx, projectPath, batchRefs); err != nil { + emitTerminal(progress, ProgressEvent{ + Event: EventError, + Message: "upsert refs: " + err.Error(), + Fatal: true, + }) return filesAccepted, batchChunks, sess.filesProcessed, fmt.Errorf("upsert refs: %w", err) } } if err := tx.Commit(); err != nil { + emitTerminal(progress, ProgressEvent{ + Event: EventError, + Message: "commit batch: " + err.Error(), + Fatal: true, + }) return filesAccepted, batchChunks, sess.filesProcessed, fmt.Errorf("commit batch: %w", err) } txCommitted = true @@ -503,6 +606,13 @@ func (s *Service) ProcessFiles( "total_files", total, ) + emitTerminal(progress, ProgressEvent{ + Event: EventBatchDone, + FilesAccepted: filesAccepted, + ChunksCreated: batchChunks, + FilesProcessedTotal: total, + }) + return filesAccepted, batchChunks, total, nil } diff --git a/server/internal/indexer/progress.go b/server/internal/indexer/progress.go new file mode 100644 index 0000000..851a0ba --- /dev/null +++ b/server/internal/indexer/progress.go @@ -0,0 +1,95 @@ +package indexer + +import "time" + +// ProgressEvent is emitted by ProcessFiles when a non-nil progress channel is +// supplied, so the streaming HTTP handler can forward each one as a JSON line. +// +// One struct with all possible fields + omitempty is intentional: it keeps the +// wire format easy to evolve and the consumer code a single switch on Event. 
+// +// Wire format example (newline-delimited JSON, one struct per line): +// +// {"event":"file_started","run_id":"...","path":"main.go","file_index":1,"batch_size":20} +// {"event":"file_chunked","path":"main.go","chunks":12} +// {"event":"file_embedded","path":"main.go","chunks":12,"embed_ms":540} +// {"event":"file_done","path":"main.go","chunks":12} +// {"event":"heartbeat","ts":"2026-04-27T17:25:00Z"} +// {"event":"file_error","path":"big.bin","message":"...","fatal":false} +// {"event":"batch_done","files_accepted":20,"chunks_created":347,"files_processed_total":300} +// {"event":"error","message":"...","fatal":true} +type ProgressEvent struct { + Event string `json:"event"` + + // Per-file fields. + Path string `json:"path,omitempty"` + FileIndex int `json:"file_index,omitempty"` + BatchSize int `json:"batch_size,omitempty"` + Chunks int `json:"chunks,omitempty"` + EmbedMS int64 `json:"embed_ms,omitempty"` + + // Heartbeat. + TS string `json:"ts,omitempty"` + + // Errors. + Message string `json:"message,omitempty"` + Fatal bool `json:"fatal,omitempty"` + + // Batch-done summary (mirrors indexFilesResponse). + FilesAccepted int `json:"files_accepted,omitempty"` + ChunksCreated int `json:"chunks_created,omitempty"` + FilesProcessedTotal int `json:"files_processed_total,omitempty"` + + // Run identifier — populated on the first event the handler emits. + RunID string `json:"run_id,omitempty"` +} + +// Event kinds. Using string constants both for documentation and for +// comparisons in tests / consumer switches. +const ( + EventFileStarted = "file_started" + EventFileChunked = "file_chunked" + EventFileEmbedded = "file_embedded" + EventFileDone = "file_done" + EventFileError = "file_error" + EventHeartbeat = "heartbeat" + EventBatchDone = "batch_done" + EventError = "error" +) + +// progressSend is a nil-safe non-blocking send. ProcessFiles uses it instead +// of `progress <- e` so that: +// +// 1. 
callers that do not care about progress pass nil and pay no cost +// 2. a slow consumer cannot stall the indexer (the channel has a small +// buffer in the streaming handler; if it fills we drop the event rather +// than blocking the embed pipeline) +// +// Drops are acceptable because the only events that *must* land are +// batch_done / error, and those are sent on the unbuffered close path +// using a guaranteed-blocking send (see emitTerminal). +func progressSend(ch chan<- ProgressEvent, e ProgressEvent) { + if ch == nil { + return + } + select { + case ch <- e: + default: + // channel full — drop. Keeps embed loop unblocked. + } +} + +// emitTerminal is for batch_done / fatal error: must reach the consumer. +// Always blocks until accepted (or ctx cancellation closes things upstream). +func emitTerminal(ch chan<- ProgressEvent, e ProgressEvent) { + if ch == nil { + return + } + ch <- e +} + +// NowTS returns an RFC3339 timestamp for heartbeat events. Exported so the +// streaming HTTP handler in package httpapi can stamp its own heartbeats. +func NowTS() string { + return time.Now().UTC().Format(time.RFC3339) +} From 94ed0ff5c209a2dd7ffdeabc725049e5d6677d1f Mon Sep 17 00:00:00 2001 From: dvcdsys Date: Mon, 27 Apr 2026 22:10:54 +0100 Subject: [PATCH 3/6] feat(cli): cix cancel command + summary grouped by language + dev makefile target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * New `cix cancel` command frees a stuck per-project session lock without having to wait for the 1h TTL. Pairs with the streaming-handler ctx- disconnect path: in normal use the lock auto-clears, this is the manual escape hatch. * `cix summary` now groups "Top symbols" by language and shows up to N per language instead of one mixed list. Earlier output mixed Go/Python/JS symbols with no indication of which file they came from, which made the summary nearly useless on multi-language repos. 
* server/Makefile: docker-build-cuda-dev target builds + pushes :cu128-dev for manual prod testing before tagging a release. Floating tag, no pinned variant — rollback isn't a concern for a dev tag. * root Makefile: small build-target plumbing. * doc/benchmark-cix-vs-grep.md: numbers from the search-vs-grep comparison done while debugging the install.sh hang. Tracked locally — not user documentation, more of an internal reference. Co-Authored-By: Claude Opus 4.7 --- Makefile | 4 +- cli/cmd/cancel.go | 68 +++++++ cli/cmd/cancel_test.go | 100 +++++++++++ cli/cmd/summary.go | 45 ++++- cli/cmd/summary_test.go | 5 +- doc/benchmark-cix-vs-grep.md | 331 +++++++++++++++++++++++++++++++++++ server/Makefile | 36 +++- 7 files changed, 578 insertions(+), 11 deletions(-) create mode 100644 cli/cmd/cancel.go create mode 100644 cli/cmd/cancel_test.go create mode 100644 doc/benchmark-cix-vs-grep.md diff --git a/Makefile b/Makefile index 92add01..54ad72b 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help build test bundle test-gate docker-build-cuda clean +.PHONY: help build test bundle test-gate docker-build-cuda docker-build-cuda-dev clean -help build test bundle test-gate docker-build-cuda clean: +help build test bundle test-gate docker-build-cuda docker-build-cuda-dev clean: @$(MAKE) -C server $@ diff --git a/cli/cmd/cancel.go b/cli/cmd/cancel.go new file mode 100644 index 0000000..e616a9e --- /dev/null +++ b/cli/cmd/cancel.go @@ -0,0 +1,68 @@ +package cmd + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/spf13/cobra" +) + +var cancelProject string + +var cancelCmd = &cobra.Command{ + Use: "cancel", + Short: "Cancel an active indexing session", + Long: `Cancel any in-flight indexing session for a project. + +Useful when a previous 'cix reindex' was interrupted by a network issue or +client-side timeout but the server is still holding a session lock and +returning 409 Conflict on subsequent /index/begin attempts. 
+ +Idempotent: succeeds (no-op) when no session is active. + +Examples: + cix cancel + cix cancel -p /path/to/project`, + RunE: runCancel, +} + +func init() { + rootCmd.AddCommand(cancelCmd) + cancelCmd.Flags().StringVarP(&cancelProject, "project", "p", "", "Project path (default: current directory)") +} + +func runCancel(cmd *cobra.Command, args []string) error { + projectPath := cancelProject + if projectPath == "" { + cwd, err := os.Getwd() + if err != nil { + return fmt.Errorf("get working directory: %w", err) + } + projectPath = cwd + } + + absPath, err := filepath.Abs(projectPath) + if err != nil { + return fmt.Errorf("resolve path: %w", err) + } + + apiClient, err := getClient() + if err != nil { + return err + } + + absPath = findProjectRoot(absPath, apiClient) + + resp, err := apiClient.CancelIndex(absPath) + if err != nil { + return fmt.Errorf("cancel: %w", err) + } + + if resp.Cancelled { + fmt.Printf("✓ Cancelled active indexing session for %s\n", absPath) + } else { + fmt.Printf("No active session for %s (nothing to cancel)\n", absPath) + } + return nil +} diff --git a/cli/cmd/cancel_test.go b/cli/cmd/cancel_test.go new file mode 100644 index 0000000..6eec5ad --- /dev/null +++ b/cli/cmd/cancel_test.go @@ -0,0 +1,100 @@ +package cmd + +import ( + "net/http" + "strings" + "testing" +) + +func TestRunCancel_ActiveSession(t *testing.T) { + proj := t.TempDir() + hash := projectHash(proj) + + srv := mockServer(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case strings.HasSuffix(r.URL.Path, "/api/v1/projects"): + writeJSON(w, 200, map[string]any{"projects": []any{}, "total": 0}) + case strings.Contains(r.URL.Path, hash+"/index/cancel") && r.Method == http.MethodPost: + writeJSON(w, 200, map[string]any{"cancelled": true}) + default: + http.NotFound(w, r) + } + }) + useAPI(t, srv) + + old := cancelProject + defer func() { cancelProject = old }() + cancelProject = proj + + out, err := captureOutput(func() error { + return runCancel(nil, nil) + 
}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !strings.Contains(out, "Cancelled active indexing session") { + t.Errorf("expected success message, got:\n%s", out) + } +} + +func TestRunCancel_NoActiveSession(t *testing.T) { + proj := t.TempDir() + hash := projectHash(proj) + + srv := mockServer(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case strings.HasSuffix(r.URL.Path, "/api/v1/projects"): + writeJSON(w, 200, map[string]any{"projects": []any{}, "total": 0}) + case strings.Contains(r.URL.Path, hash+"/index/cancel"): + writeJSON(w, 200, map[string]any{"cancelled": false}) + default: + http.NotFound(w, r) + } + }) + useAPI(t, srv) + + old := cancelProject + defer func() { cancelProject = old }() + cancelProject = proj + + out, err := captureOutput(func() error { + return runCancel(nil, nil) + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !strings.Contains(out, "No active session") { + t.Errorf("expected idempotent message, got:\n%s", out) + } +} + +func TestRunCancel_APIError(t *testing.T) { + proj := t.TempDir() + hash := projectHash(proj) + + srv := mockServer(t, func(w http.ResponseWriter, r *http.Request) { + switch { + case strings.HasSuffix(r.URL.Path, "/api/v1/projects"): + writeJSON(w, 200, map[string]any{"projects": []any{}, "total": 0}) + case strings.Contains(r.URL.Path, hash+"/index/cancel"): + apiError(w, 500, "internal error") + default: + http.NotFound(w, r) + } + }) + useAPI(t, srv) + + old := cancelProject + defer func() { cancelProject = old }() + cancelProject = proj + + _, err := captureOutput(func() error { + return runCancel(nil, nil) + }) + if err == nil { + t.Fatal("expected error, got nil") + } + if !strings.Contains(err.Error(), "cancel") { + t.Errorf("expected 'cancel' in error, got: %v", err) + } +} diff --git a/cli/cmd/summary.go b/cli/cmd/summary.go index 521c0b9..6be2d55 100644 --- a/cli/cmd/summary.go +++ b/cli/cmd/summary.go @@ -4,8 +4,10 @@ import ( "fmt" "os" 
"path/filepath" + "sort" "strings" + "github.com/anthropics/code-index/cli/internal/client" "github.com/spf13/cobra" ) @@ -91,16 +93,45 @@ func runSummary(cmd *cobra.Command, args []string) error { fmt.Println() } - // Recent symbols + // Top symbols — grouped by language so it's obvious which symbols come + // from which file type. Mixed lists used to be hard to scan ("why is + // `s` showing up as a function?" — turns out: minified JS bundle). if len(summary.RecentSymbols) > 0 { fmt.Println("Top symbols:") - for _, sym := range summary.RecentSymbols { - if sym.Name == "" { - continue - } - fmt.Printf(" [%s] %s\n", sym.Kind, sym.Name) - } + printSymbolsByLanguage(summary.RecentSymbols) } return nil } + +// printSymbolsByLanguage groups symbols by their language and renders each +// group under a ` (N):` header. Languages are sorted alphabetically; +// within each group, original order is preserved (the server already returns +// them ranked). Symbols with empty Language are bucketed under "(unknown)". 
+func printSymbolsByLanguage(syms []client.RecentSymbolEntry) { + groups := map[string][]client.RecentSymbolEntry{} + for _, sym := range syms { + if sym.Name == "" { + continue + } + lang := sym.Language + if lang == "" { + lang = "(unknown)" + } + groups[lang] = append(groups[lang], sym) + } + + langs := make([]string, 0, len(groups)) + for l := range groups { + langs = append(langs, l) + } + sort.Strings(langs) + + for _, lang := range langs { + entries := groups[lang] + fmt.Printf(" %s (%d):\n", lang, len(entries)) + for _, sym := range entries { + fmt.Printf(" [%s] %s\n", sym.Kind, sym.Name) + } + } +} diff --git a/cli/cmd/summary_test.go b/cli/cmd/summary_test.go index 8879d89..69731f8 100644 --- a/cli/cmd/summary_test.go +++ b/cli/cmd/summary_test.go @@ -27,7 +27,10 @@ func TestRunSummary(t *testing.T) { {"path": proj + "/cli", "file_count": 20.0}, }, "recent_symbols": []map[string]any{ - {"name": "IndexerService", "kind": "class"}, + {"name": "IndexerService", "kind": "class", "language": "go"}, + {"name": "User", "kind": "class", "language": "python"}, + {"name": "ParseJSON", "kind": "function", "language": "go"}, + {"name": "noLangSymbol", "kind": "function"}, }, }) default: diff --git a/doc/benchmark-cix-vs-grep.md b/doc/benchmark-cix-vs-grep.md new file mode 100644 index 0000000..2f08b8d --- /dev/null +++ b/doc/benchmark-cix-vs-grep.md @@ -0,0 +1,331 @@ +# Benchmark — CIX-first vs grep-only navigation + +Single-machine, single-model (`claude-sonnet-4-6`) head-to-head: 32 hint-free tasks +across 4 task types × 4 variants × 2 navigation strategies (Worker A: grep-only, +Worker B: cix-first). Operator: `claude-opus-4-7`. Run on 2026-04-27. + +The fixture is a frozen snapshot of this same `claude-code-index` project. +All raw transcripts and metric JSON live in `/tmp/cix-bench/results/runs/`; +this report does not include them. + +--- + +## 1. 
Headline comparison (16 runs each) + +| Metric | Worker A (grep-only) | Worker B (cix-first) | Δ (B − A) | Δ % | +|--------------------------|----------------------|----------------------|-----------|---------| +| Mean elapsed time (s) | **62.2** | 69.9 | +7.7 | +12.4 % | +| Median elapsed time (s) | **58.5** | **58.5** | 0.0 | 0.0 % | +| Mean tool calls | **14.5** | 19.2 | +4.7 | +32.4 % | +| Mean tokens_in | **33** | 38 | +5 | +15.2 % | +| Mean tokens_out | **2447** | 2754 | +307 | +12.5 % | +| Pass rate | 14 / 16 | **16 / 16** | +2 | +12.5 % | + +Δ is `B − A`. Negative on time/tokens means B was faster/cheaper. Bold = better cell per row. + +Token counts are uncached `input_tokens` / `output_tokens` summed across the +worker's assistant messages (per runbook §6). Cache-creation tokens, which +dominate real cost on Sonnet, are reported in §6 as a caveat but not in the +headline because the runbook fixed the metric definition before the run. + +**One-glance read:** B is *more reliable* (16 / 16 pass vs 14 / 16) but *not* +faster or cheaper on average. Median elapsed is identical; the mean gap comes +from a few long B-runs in `tests` and `summary` (see §2). The only clean B +win on time is `bugfix`. + +--- + +## 2. 
Per-task comparison + +| Task type | Metric | Worker A | Worker B | Δ (B − A) | Δ % | +|-----------|---------------------|----------|----------|-----------|---------| +| bugfix | mean elapsed s | 61.5 | **55.2** | −6.3 | −10.2 % | +| bugfix | mean tool calls | **13.5** | 14.0 | +0.5 | +3.7 % | +| bugfix | mean tokens_in | 21 | **20** | −1 | −4.8 % | +| bugfix | mean tokens_out | 1837 | **1745** | −92 | −5.0 % | +| bugfix | pass rate | 4 / 4 | 4 / 4 | 0 | 0 % | +| refactor | mean elapsed s | **62.0** | 64.5 | +2.5 | +4.0 % | +| refactor | mean tool calls | **13.0** | 13.8 | +0.8 | +6.2 % | +| refactor | mean tokens_in | **21** | 22 | +1 | +4.8 % | +| refactor | mean tokens_out | 2195 | **2018** | −177 | −8.1 % | +| refactor | pass rate | 2 / 4 | **4 / 4**| +2 | +50 % | +| tests | mean elapsed s | **78.2** | 107.0 | +28.8 | +36.8 % | +| tests | mean tool calls | **15.0** | 30.5 | +15.5 | +103 % | +| tests | mean tokens_in | **24** | 43 | +19 | +79 % | +| tests | mean tokens_out | **3865** | 4906 | +1041 | +26.9 % | +| tests | pass rate | 4 / 4 | 4 / 4 | 0 | 0 % | +| summary | mean elapsed s | **47.2** | 52.8 | +5.5 | +11.7 % | +| summary | mean tool calls | **16.5** | 18.5 | +2.0 | +12.1 % | +| summary | mean tokens_in | **64** | 66 | +2 | +3.1 % | +| summary | mean tokens_out | **1892** | 2347 | +454 | +24.0 % | +| summary | pass rate | 4 / 4 | 4 / 4 | 0 | 0 % | + +Where the strategy mattered most — and what it actually changed: + +- **`refactor` is the only place B's pass rate dominates.** A picked the same + non-seeded inefficiency (`chunkSlidingWindow`) twice — for variants 01 and 04 + — and was scored `partial`. B used `cix symbols` / `cix references` to + enumerate candidates more broadly and hit the seeded function in all 4 runs. +- **`bugfix` favors A on wall-clock**, ~10 % faster. With a failing test + pointing at the call site, neither navigator needs much exploration; the + extra round-trip through `cix` is pure overhead. 
+- **`tests` is where B paid the biggest tax** — +29 s, +1041 output tokens. + B consistently selected real exported functions (`DynamicChromaPersistDir`, + `DeleteByProject`, `DefaultSettings`) which require harness setup + (DB / temp dir) and write longer test bodies. A took the literal cheap path + 4 / 4 times: it picked an *unexported* helper (`splitChunk` ×3, + `sortRanges` ×1) every variant, which the prompt forbade ("public function"). + Verification per §7.3 still scored both as `pass` — §7.3 doesn't gate on + exportedness. See §6. +- **`summary` is a draw on quality** (rubric: A=6,6,6,7 / B=6,5,6,6) but B used + ~24 % more output tokens. Both configs read enough of the tree to ground the + paragraph; neither shape of navigation seems to help here. + +--- + +## 3. Per-run table (all 32 rows) + +| run_id | elapsed_s | tools | toks_total | toks_in | toks_out | cix_ops | grep_ops | files_read | outcome | +|-----------------|-----------|-------|------------|---------|----------|---------|----------|------------|---------| +| bugfix_01_A | 92 | 18 | 3129 | 25 | 3104 | 0 | 3 | 3 | pass | +| bugfix_01_B | 66 | 17 | 2129 | 25 | 2104 | 0 | 0 | 4 | pass | +| bugfix_02_A | 45 | 11 | 1104 | 20 | 1084 | 0 | 0 | 3 | pass | +| bugfix_02_B | 42 | 12 | 1465 | 17 | 1448 | 1 | 0 | 2 | pass | +| bugfix_03_A | 35 | 9 | 1401 | 14 | 1387 | 0 | 0 | 1 | pass | +| bugfix_03_B | 47 | 12 | 1636 | 18 | 1618 | 0 | 1 | 2 | pass | +| bugfix_04_A | 74 | 16 | 1798 | 26 | 1772 | 0 | 1 | 2 | pass | +| bugfix_04_B | 66 | 15 | 1831 | 22 | 1809 | 3 | 0 | 1 | pass | +| refactor_01_A | 55 | 13 | 2127 | 20 | 2107 | 0 | 1 | 3 | partial | +| refactor_01_B | 88 | 15 | 2646 | 25 | 2621 | 2 | 1 | 2 | pass | +| refactor_02_A | 76 | 15 | 2708 | 25 | 2683 | 0 | 3 | 2 | pass | +| refactor_02_B | 62 | 15 | 2229 | 22 | 2207 | 4 | 2 | 1 | pass | +| refactor_03_A | 59 | 11 | 1574 | 19 | 1555 | 0 | 0 | 2 | pass | +| refactor_03_B | 55 | 14 | 1835 | 23 | 1812 | 1 | 0 | 1 | pass | +| refactor_04_A | 58 | 13 | 2455 
| 21 | 2434 | 0 | 3 | 3 | partial | +| refactor_04_B | 53 | 11 | 1452 | 19 | 1433 | 1 | 1 | 1 | pass | +| tests_01_A | 88 | 15 | 3600 | 24 | 3576 | 0 | 4 | 2 | pass | +| tests_01_B | 87 | 26 | 4054 | 35 | 4019 | 1 | 0 | 13 | pass | +| tests_02_A | 82 | 14 | 4857 | 23 | 4834 | 0 | 3 | 2 | pass | +| tests_02_B | 122 | 38 | 6779 | 58 | 6721 | 0 | 10 | 15 | pass | +| tests_03_A | 75 | 19 | 3227 | 29 | 3198 | 0 | 4 | 2 | pass | +| tests_03_B | 110 | 26 | 4367 | 37 | 4330 | 1 | 3 | 11 | pass | +| tests_04_A | 68 | 12 | 3873 | 20 | 3853 | 0 | 0 | 2 | pass | +| tests_04_B | 109 | 32 | 4598 | 43 | 4555 | 3 | 2 | 12 | pass | +| summary_01_A | 54 | 20 | 2557 | 199 | 2358 | 0 | 0 | 13 | pass | +| summary_01_B | 47 | 17 | 1845 | 20 | 1825 | 2 | 0 | 0 | pass | +| summary_02_A | 55 | 16 | 1714 | 19 | 1695 | 0 | 0 | 0 | pass | +| summary_02_B | 54 | 24 | 3296 | 27 | 3269 | 3 | 0 | 4 | pass | +| summary_03_A | 37 | 15 | 1836 | 18 | 1818 | 0 | 1 | 10 | pass | +| summary_03_B | 55 | 14 | 2163 | 17 | 2146 | 8 | 0 | 0 | pass | +| summary_04_A | 43 | 15 | 1719 | 21 | 1698 | 0 | 0 | 10 | pass | +| summary_04_B | 55 | 19 | 2345 | 198 | 2147 | 8 | 0 | 5 | pass | + +`cix_ops > 0` for an A row would mean the worker violated the prompt-level +restriction; **no A row has `cix_ops > 0`**, so no `(violation)` flag is needed. + +--- + +## 4. Methodology (abridged from §§0–7 of the runbook) + +**Subjects.** Two prompt-level configurations of `claude-sonnet-4-6` running as +sub-agents (operator is `claude-opus-4-7`): +- **Worker A — grep-only**: PREAMBLE_A restricts tools to Bash / Read / Edit / + Glob / Grep and forbids `cix`. The prompt also tells A that `CIX_API_KEY` + is set to an invalid value, so any cix call would 401. +- **Worker B — cix-first**: PREAMBLE_B advertises `cix search`, `cix + definitions`, `cix references`, `cix symbols`, `cix files` against + `http://192.168.1.168:21847` and notes the project has already been indexed. 
+ Falling back to grep is permitted only when cix returns nothing relevant. + +Verbatim preambles and task prompts are in §7 below. + +**Fixture.** A frozen snapshot of `claude-code-index` at HEAD (`/tmp/cix-bench/baseline/`, +`.venv/` and built bench binaries removed). 16 variants under +`/tmp/cix-bench/variants/{bugfix,refactor,tests,summary}/{01..04}/`. SHA-256 +manifest of every variant file written to `/tmp/cix-bench/fixture-manifest.txt` +before any run; not modified afterwards. + +**Mutations (one per `bugfix` and `refactor` variant).** +- `bugfix/01`: drop the `!` in `IsBinary` (`cli/internal/fileutil/binary.go`). +- `bugfix/02`: change `".go": "go"` to `".go": "golang"` in `extensionMap`. +- `bugfix/03`: in `splitLines`, change `start = i + 1` to `start = i`. +- `bugfix/04`: legacy-key target `auto_watch:` becomes `auto-watch:`. +- `refactor/01`: replace map-based `dedupByLocation` with O(n²) nested loop. +- `refactor/02`: replace `sortRanges` (already insertion sort in baseline) with bubble sort. +- `refactor/03`: replace `joinLines`'s `strings.Join` with `+=` loop. +- `refactor/04`: fall-back per runbook — replace `repeatComma` byte-slice + build with a `+=` loop in `server/internal/symbolindex/symbolindex.go`. Recorded in manifest. + +`tests/01..04` and `summary/01..04` are identical to baseline. + +**Per-run procedure (serial, A before B per variant).** +1. `cix watch stop --all` to clear daemons; `rm -rf /tmp/cix-bench-run`. +2. `cp -R variants///. /tmp/cix-bench-run/`. +3. **B only:** `cix init --watch=false` against the server and wait for + `Status: ✓ Indexed` (192 files / 1669 chunks). Indexing is not counted in + `elapsed_s` — the worker prints its first `date +%s` only after the index + is ready. +4. Launch `Agent` with `subagent_type:"general-purpose"`, `model:"sonnet"`, the + assembled prompt, and a unique `description` (the run_id). +5. Locate transcript at `~/.claude/projects/.../subagents/agent-.jsonl`, + copy to `results/runs/.log`. +6. 
Compute metrics via `metrics.sh` (jq over JSONL); append CSV row. +7. Verify outcome per §7 of the runbook. + +**Outcome rules used in this run.** +- `bugfix`: `pass` iff `go test ./...` is green in **both** Go modules + (`cli/`, `server/`) — there is no top-level `go.mod`, so the runbook's + literal `go test ./...` from project root would test nothing. This is the + only verification deviation from §7.1; it applies identically to A and B. +- `refactor`: `pass` iff tests green AND a seeded function from §2.3 was + modified; `partial` iff tests green but a different function was "improved". +- `tests`: `pass` iff package builds, package tests pass, and ≥4 `func Test` + declarations exist in the new/modified test file. (Section 7.3 does not + gate on the function being exported, even though the prompt asks for one.) +- `summary`: paragraph scored 0–7 by a fresh Sonnet rubric agent (§7.4). + `pass` iff total ≥ 5; `partial` 3–4; `fail` ≤ 2. + +--- + +## 5. Executive summary (3 sentences) + +Worker A (grep-only) was faster on average (62.2 s vs 69.9 s), used fewer tool +calls, and produced fewer output tokens, but Worker B (cix-first) was strictly +more reliable — 16 / 16 pass vs 14 / 16. The two `partial` outcomes were both +on `refactor` runs where Worker A converged on the same non-seeded inefficiency +(`chunkSlidingWindow`) instead of the seeded target, while Worker B used +`cix symbols` / `cix references` to enumerate the codebase more broadly and hit +the seeded function in all four refactor variants. The strategy gap was +largest on `tests`: Worker B chose harder *exported* targets that required +real fixture setup (+29 s, +1041 output tokens), while Worker A consistently +picked unexported helpers like `splitChunk` even though the prompt asked for a +"public function" — a gap §7.3's verification doesn't penalize. + +--- + +## 6. 
Caveats + +- **The fixture is a snapshot of `claude-code-index` itself.** Both Sonnet + workers may recognize package layout / symbol names from training. Effect + is the same for A and B but inflates absolute "specificity" scores in §summary. +- **Tool restriction is prompt-level, not harness-level.** Worker A could have + called `cix` and we'd only catch it post-hoc via `cix_ops > 0`. None did + (16 / 16 A rows have `cix_ops = 0`). +- **Single machine, single model (`claude-sonnet-4-6`)**, single embedding + model, no warm/cold-cache split between A and B. The cix server is at + `http://192.168.1.168:21847` (remote on the LAN), not on `localhost`. + Both PREAMBLE_B and §5.2 indexing scripts were retargeted to that URL — + this is the only deviation from the verbatim preambles in the runbook, + applied identically before any A or B run started. +- **Pre-run cix indexing is excluded from `elapsed_s`** by construction — `cix + init --watch=false` returned synchronously before the Agent was spawned. + Reindex was incremental from variant 01 onward (only mutated files re-chunked). +- **Token counts are uncached `input_tokens` / `output_tokens` only.** Sonnet + cache-creation tokens — which dominate real spend on identical prompt scaffolds — + are *not* in the headline. For reference, the smoke test reported + `cache_creation_input_tokens=11775` per call against `input_tokens=3`. The + ranking between A and B does not change under cache-aware costing because + both pay nearly identical cache costs per run; cache-creation scales with + prompt length and the preambles differ by only a few sentences. +- **Outcome scoring for the `summary` task is itself done by Sonnet.** One of + the 8 scorer runs (`summary_03_A`) reasoned about port 21847 being wrong — + 21847 is in fact the correct cix-server port — and deducted a point as a + "fabrication". The total still cleared the `pass` threshold (5/7), but the + rubric run is not a perfect oracle. 
+- **`tests/01..04` evaluation is loose.** §7.3 doesn't gate on the function + being exported, even though TESTS_PROMPT explicitly asks for "one public + function". Worker A picked an unexported helper in all 4 tests runs, which + the verification still scores `pass`. Treating that as `partial` would + flip the tests pass-rate to 0 / 4 (A) vs 4 / 4 (B). Reported here as `pass` + to honor §7.3 letter-of-the-law. +- **`bugfix/01` actually breaks 8 tests, not 1**, because the runbook-prescribed + mutation inverts the entire `IsBinary` decision. The BUGFIX_PROMPT line + "Exactly one test is failing" is therefore mildly misleading — but the bug + is still a one-line root cause and both A and B fixed it cleanly. Recorded + in `fixture-manifest.txt`. +- **`refactor/02` baseline already used a hand-rolled insertion sort** (not + `sort.Slice` as the runbook assumed). Mutation applied in spirit: + insertion → bubble. Recorded in `fixture-manifest.txt`. +- **`refactor/04` had no `map[..]`-in-loop or `sort.Slice` in `symbolindex.go`.** + Used the runbook's documented fall-back: seeded inefficiency in + `repeatComma` (byte-slice → `+=` loop). Recorded in `fixture-manifest.txt`. + +--- + +## 7. Verbatim prompts (copy of runbook §3 + §4) + +### 7.1 Task prompts + +**BUGFIX_PROMPT** +``` +You are working in a Go project at the current directory. Run its test suite from the project root. Exactly one test is failing. Find and fix the underlying bug in the source code (do NOT modify the failing test or any other test). After your fix, re-run the full test suite from the project root and confirm everything is green. Report what you changed and why in 3–5 sentences. +``` + +**REFACTOR_PROMPT** +``` +You are working in a Go project at the current directory. Somewhere in this codebase there is a function whose implementation is asymptotically inefficient (its complexity is worse than necessary) while still being correct. Find one such function. 
Replace its body with an algorithmically better implementation that has the same observable behaviour. After your change, run the full test suite from the project root and confirm everything is green. Report what you changed and why in 3–5 sentences. +``` + +**TESTS_PROMPT** +``` +You are working in a Go project at the current directory. Pick one public function that currently has no unit-test coverage and write at least four meaningful unit tests for it covering distinct cases (typical input, edge case, error path, boundary). Place the new tests in the same package as the function. Run the package's tests and confirm they pass. Report which function you chose and why in 2–3 sentences. +``` + +**SUMMARY_PROMPT** +``` +You are working in a software project at the current directory. Read enough of the code to understand its overall purpose and structure. Produce a single-paragraph (≈200 words) summary covering: what the project does, its top-level architecture, the role of each major component or package, and the main entry points. The summary must be specific to THIS code base — no generic phrasing. +``` + +### 7.2 Preambles + +**COMMON_PREAMBLE** (prepended to every worker) +``` +AUTO MODE — execute autonomously, no clarifying questions, no skill invocations, code only. Begin by printing `date +%s` and end by printing `date +%s` so elapsed time can be measured from the transcript. +``` + +**PREAMBLE_A** (Worker A — grep-only) +``` +TOOL CONSTRAINT — you may use ONLY the following tools: Bash, Read, Edit, Glob, Grep. You MUST NOT call the `cix` CLI under any circumstance. Use grep, find, ls, ripgrep, etc. for navigation. +``` + +**PREAMBLE_B** (Worker B — cix-first; URL retargeted from `localhost` to `192.168.1.168` per §6) +``` +TOOL CONSTRAINT — a cix index of this project is available. Prefer the cix CLI for navigation: `cix search ""`, `cix definitions `, `cix references `, `cix symbols `, `cix files `. 
The cix server is at http://192.168.1.168:21847 and the project at the current working directory has already been registered and indexed for you. You MAY fall back to grep only if a cix command genuinely returns nothing relevant. Do not run `cix init`, `cix reindex`, or modify the cix configuration. +``` + +### 7.3 Final prompt assembly + +For Worker A: +``` + + + + + + +The project is at /tmp/cix-bench-run. Begin by `cd /tmp/cix-bench-run`. + +Note: the env var CIX_API_KEY is set to an invalid value for this run; any cix call will fail with an auth error. +``` + +For Worker B: +``` + + + + + + +The project is at /tmp/cix-bench-run. Begin by `cd /tmp/cix-bench-run`. +``` + +--- + +## 8. Where to look + +- Raw per-run JSONL transcripts: `/tmp/cix-bench/results/runs/.log` +- Per-run metric JSON: `/tmp/cix-bench/results/runs/.metrics.json` +- Summary task texts + scoring: `/tmp/cix-bench/results/runs/summary_*_*.txt` + + `summary_*_*.score.json` +- Combined CSV: `/tmp/cix-bench/results/results.csv` +- Frozen fixture manifest (with deviation notes): `/tmp/cix-bench/fixture-manifest.txt` diff --git a/server/Makefile b/server/Makefile index 2f991c6..085a96f 100644 --- a/server/Makefile +++ b/server/Makefile @@ -25,11 +25,17 @@ IMAGE_REPO ?= dvcdsys/code-index IMAGE_TAG ?= go-cu128 VERSION ?= $(shell git describe --tags --always 2>/dev/null || echo "0.0.0-dev") +# Floating dev tag — overwritten on every `docker-build-cuda-dev` push. Used +# by the operator to deploy "current local work" onto the RTX 3090 box for +# manual smoke testing before merging a PR. 
+DEV_TAG ?= cu128-dev +GIT_SHA ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo "nogit") + BUILDER ?= cix-builder SCOUT_TAG ?= scout-$(shell date +%Y%m%d-%H%M) .PHONY: help build test test-gate fetch-llama bundle run docker-build-cuda \ - scout-cuda scout-cpu promote-cuda clean + docker-build-cuda-dev scout-cuda scout-cpu promote-cuda clean help: @echo "Targets:" @@ -40,6 +46,7 @@ help: @echo " run — bundle + launch cix-server (reads .env from repo root, sets CIX_LLAMA_BIN_DIR)" @echo " test-gate — run the Phase 3 parity gate (requires fetch-llama + GGUF)" @echo " docker-build-cuda — build + push linux/amd64 CUDA image $(IMAGE_REPO):$(IMAGE_TAG)" + @echo " docker-build-cuda-dev — build + push CUDA dev image $(IMAGE_REPO):$(DEV_TAG)" @echo " scout-cuda — build CUDA image via native x86 builder → push : → docker scout cves" @echo " scout-cpu — build CPU image locally → docker scout cves (no push)" @echo " promote-cuda — retag SCOUT_TAG as go-cu128+cu128 without rebuild (imagetools)" @@ -109,6 +116,33 @@ docker-build-cuda: --push \ $(ROOT) +# Local dev workflow for the RTX 3090 box (the GPU prod server reachable via +# Portainer MCP). Builds the CUDA image with a "dev" tag and pushes to Docker +# Hub from the dev machine. Operator then bumps the Portainer stack image +# tag to :cu128-dev (or pulls fresh) and smoke-tests on real hardware before +# opening a PR / cutting a versioned release. +# +# Skips Docker Scout — this image is for the operator's eyes, not for +# downstream users; run `make scout-cuda` before promoting to :cu128/:go-cu128. 
+docker-build-cuda-dev: + @echo "→ Building CUDA dev image on $(BUILDER) → $(IMAGE_REPO):$(DEV_TAG)" + docker buildx build \ + --builder $(BUILDER) \ + --platform linux/amd64 \ + --pull \ + --provenance=mode=max \ + --sbom=true \ + --build-arg VERSION=$(VERSION)-dev-$(GIT_SHA) \ + -f $(ROOT)/Dockerfile.cuda \ + -t $(IMAGE_REPO):$(DEV_TAG) \ + --push \ + $(ROOT) + @echo "" + @echo "✓ Pushed $(IMAGE_REPO):$(DEV_TAG)" + @echo "" + @echo "Next: bump the Portainer stack image to :$(DEV_TAG) and redeploy," + @echo " then smoke-test on the RTX 3090 box." + # Scout workflow — iterate locally before touching production tags: # # make scout-cuda # build on native x86 → :scout-YYYYMMDD-HHMM → scan From 58de3632f4e56d14b8167c6f1eace223e6b0e4d9 Mon Sep 17 00:00:00 2001 From: dvcdsys Date: Mon, 27 Apr 2026 22:11:08 +0100 Subject: [PATCH 4/6] fix(chunker): drop markdown atx_heading + only first split-chunk keeps symbol metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related fixes that cleaned up search/cix-def output for repos with markdown docs and long Go/Python functions: 1. Markdown registry. tree-sitter-markdown's `section` already wraps the heading + its body, so listing both \`section\` and \`atx_heading\` in the type-nodes config emitted duplicate one-line chunks for every \`### foo\` heading (visible as Type: type | 1-2 line snippets in `cix search` output). Drop \`atx_heading\` — keep only \`section\`. 2. splitChunk. When tree-sitter emitted a function chunk larger than maxChunkSize (default 4500 chars), splitChunk cut it into N pieces and set SymbolName/SymbolSignature/ChunkType=\"function\" on every piece. Result: cix def run returned N hits at different line ranges of the same function. Now only the FIRST piece carries the symbol metadata; subsequent pieces become anonymous \`block\` chunks. Full content of the symbol stays indexed for embed/FTS search — only the symbol-index attribution is consolidated. 
Test: TestSplitChunk_OnlyFirstKeepsSymbol — fixture is a 2000-line Python function, asserts exactly one chunk in the result claims symbol=big_func. Co-Authored-By: Claude Opus 4.7 --- server/internal/chunker/chunker.go | 64 ++++++++++++++++--------- server/internal/chunker/chunker_test.go | 41 ++++++++++++++++ 2 files changed, 82 insertions(+), 23 deletions(-) diff --git a/server/internal/chunker/chunker.go b/server/internal/chunker/chunker.go index 0b140a1..b5e34a2 100644 --- a/server/internal/chunker/chunker.go +++ b/server/internal/chunker/chunker.go @@ -378,7 +378,10 @@ func defaultRegistry() map[string]languageEntry { "markdown": { factory: grammars.MarkdownLanguage, nodes: map[string][]string{ - "type": {"section", "atx_heading"}, + // `section` already wraps the heading + body in + // tree-sitter-markdown — adding `atx_heading` would emit + // duplicate one-line chunks for every `### foo` line. + "type": {"section"}, }, identifiers: nil, }, @@ -864,45 +867,60 @@ func chunkSlidingWindow(filePath, content, language string) []Chunk { // Chunk splitting // --------------------------------------------------------------------------- +// splitChunk cuts an oversized chunk into pieces of <= maxSize chars. +// +// Only the FIRST piece keeps the original SymbolName/SymbolSignature/ +// ChunkType — subsequent pieces become anonymous `block` chunks. Without +// this, splitting a long function would create N rows in the symbol index +// all claiming to be `func run()`, making `cix def run` return N +// duplicates pointing at different line ranges of the same symbol. +// +// The full text of the symbol is still indexed (both for FTS and embed +// search) — just attributed to the symbol only via its first chunk. 
func splitChunk(chunk Chunk, maxSize int) []Chunk { lines := splitLines(chunk.Content) var subChunks []Chunk var currentLines []string currentStart := chunk.StartLine + emit := func(content string, startLine, endLine int, isFirst bool) { + c := Chunk{ + Content: content, + FilePath: chunk.FilePath, + StartLine: startLine, + EndLine: endLine, + Language: chunk.Language, + ParentName: chunk.ParentName, + } + if isFirst { + c.ChunkType = chunk.ChunkType + c.SymbolName = chunk.SymbolName + c.SymbolSignature = chunk.SymbolSignature + } else { + c.ChunkType = "block" + } + subChunks = append(subChunks, c) + } + for _, line := range lines { currentLines = append(currentLines, line) currentContent := joinLines(currentLines) if len(currentContent) >= maxSize && len(currentLines) > 1 { splitContent := joinLines(currentLines[:len(currentLines)-1]) - subChunks = append(subChunks, Chunk{ - Content: splitContent, - ChunkType: chunk.ChunkType, - FilePath: chunk.FilePath, - StartLine: currentStart, - EndLine: currentStart + len(currentLines) - 2, - Language: chunk.Language, - SymbolName: chunk.SymbolName, - SymbolSignature: chunk.SymbolSignature, - ParentName: chunk.ParentName, - }) + emit(splitContent, + currentStart, + currentStart+len(currentLines)-2, + len(subChunks) == 0) currentStart = currentStart + len(currentLines) - 1 currentLines = []string{line} } } if len(currentLines) > 0 { - subChunks = append(subChunks, Chunk{ - Content: joinLines(currentLines), - ChunkType: chunk.ChunkType, - FilePath: chunk.FilePath, - StartLine: currentStart, - EndLine: chunk.EndLine, - Language: chunk.Language, - SymbolName: chunk.SymbolName, - SymbolSignature: chunk.SymbolSignature, - ParentName: chunk.ParentName, - }) + emit(joinLines(currentLines), + currentStart, + chunk.EndLine, + len(subChunks) == 0) } return subChunks } diff --git a/server/internal/chunker/chunker_test.go b/server/internal/chunker/chunker_test.go index 8244b9e..15310c5 100644 --- a/server/internal/chunker/chunker_test.go 
+++ b/server/internal/chunker/chunker_test.go @@ -189,6 +189,47 @@ func TestChunkFile_OversizedChunkSplit(t *testing.T) { } } +func TestSplitChunk_OnlyFirstKeepsSymbol(t *testing.T) { + // A long Python function that splitChunk will cut into >1 piece. + var sb strings.Builder + sb.WriteString("def big_func():\n") + for i := 0; i < 2000; i++ { + sb.WriteString(" x = 1 # padding line\n") + } + src := sb.String() + chunks, _, err := ChunkFile("big.py", src, "python", 0) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Find all chunks that mention `big_func`. Only one chunk in the index + // should claim the symbol; the rest must be anonymous `block` pieces + // even though they textually belong to the same function. + withSymbol := 0 + for _, c := range chunks { + if c.SymbolName != nil && *c.SymbolName == "big_func" { + withSymbol++ + if c.ChunkType != "function" { + t.Errorf("chunk with symbol big_func has type %q, want function", c.ChunkType) + } + } + } + if withSymbol != 1 { + t.Errorf("expected exactly 1 chunk attributed to big_func after split, got %d", withSymbol) + } + + // And we DID split — meaning multiple chunks for this function exist. 
+ totalForFunc := 0 + for _, c := range chunks { + if c.FilePath == "big.py" && c.ChunkType != "module" { + totalForFunc++ + } + } + if totalForFunc < 2 { + t.Skipf("test self-check: function fit into one chunk (totalForFunc=%d) — need bigger fixture", totalForFunc) + } +} + func TestFindGaps_NoOverlap(t *testing.T) { covered := [][2]int{{2, 5}, {10, 12}} gaps := findGaps(covered, 15) From f87e7e70d5292243ea64f9ec25b4d99282903250 Mon Sep 17 00:00:00 2001 From: dvcdsys Date: Mon, 27 Apr 2026 22:17:08 +0100 Subject: [PATCH 5/6] feat(search): merge overlapping hits + windowed retrieval + breadcrumb render MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tree-sitter emits nested chunks by design — a markdown H1 wraps its H2 sub-sections, a Go class wraps its methods, a Python module wraps its classes. A vector search that hits the inner chunk also tends to hit (a bit weaker) the outer chunk, and the user's --limit budget gets eaten by N near-duplicates of the same code region. Same problem with splitChunk leftovers when a long function is cut into pieces. This change collapses overlapping results from the same file into a single "outer" hit with the inner matches recorded as NestedHits. Two merge cases: 1. Strict containment — A.range ⊋ B.range and same file → absorb B. 2. Same-symbol adjacent — adjacent ranges where at least one carries a symbol name → absorb (catches splitChunk piece1 + piece2 leftovers). Cross-file results are NEVER merged. Exact duplicates (same range twice) are not merged either — those should be deduped at the vector-store layer (already are, via dedupByLocation). Windowed retrieval: instead of over-fetching limit×2 once, the search handler now retries with limit×2, ×4, ×8, ×16 if mergeOverlappingHits collapses the result set below the user's --limit. Stops early when the vector store returns fewer rows than asked (HNSW exhausted) or when the factor cap is hit. 
In practice the first window is enough; the loop exists for repos with deeply nested markdown or many class+method hits inside the same files. Server changes: - New file search_merge.go — mergeOverlappingHits, shouldMerge. - searchResultItem gains NestedHits []nestedHit (omitempty). - semanticSearchHandler refactored: extract fetchVectorResults + filterToSearchItems helpers, wrap call in factor loop, drop the early break-on-limit (merge needs the full filtered set to identify overlaps). - 10 unit tests for mergeOverlappingHits + 1 integration test (TestSemanticSearch_NestedMarkdownMerge) verifying nested H1/H2 sections collapse to a single result with NestedHits populated. CLI changes: - SearchResult / NestedHit struct mirrors the server response. - cix search render shows "+ N more match(es) inside:" with per-hit score/range/symbol so the user sees WHY the outer chunk ranks well even when the actual signal came from a sub-section. Co-Authored-By: Claude Opus 4.7 --- cli/cmd/search.go | 19 +- cli/internal/client/search.go | 30 +- server/internal/httpapi/indexing_test.go | 65 +++++ server/internal/httpapi/search.go | 271 ++++++++++++------- server/internal/httpapi/search_merge.go | 138 ++++++++++ server/internal/httpapi/search_merge_test.go | 188 +++++++++++++ 6 files changed, 607 insertions(+), 104 deletions(-) create mode 100644 server/internal/httpapi/search_merge.go create mode 100644 server/internal/httpapi/search_merge_test.go diff --git a/cli/cmd/search.go b/cli/cmd/search.go index e9a1c32..9b12855 100644 --- a/cli/cmd/search.go +++ b/cli/cmd/search.go @@ -139,7 +139,24 @@ func runSearch(cmd *cobra.Command, args []string) error { for _, line := range strings.Split(content, "\n") { fmt.Printf(" %s\n", line) } - fmt.Printf(" ```\n\n") + fmt.Printf(" ```\n") + + // Breadcrumbs for nested hits absorbed by the server's merge step. 
+ // Tells the user "this big chunk ranks well because of these inner + // matches" so they're not surprised that --limit returned fewer + // items than expected. + if len(result.NestedHits) > 0 { + fmt.Printf(" + %d more match(es) inside:\n", len(result.NestedHits)) + for _, nh := range result.NestedHits { + label := nh.ChunkType + if nh.SymbolName != "" { + label = fmt.Sprintf("%s %s", nh.ChunkType, nh.SymbolName) + } + fmt.Printf(" · [%.2f] %s:%d-%d (%s)\n", + nh.Score, result.FilePath, nh.StartLine, nh.EndLine, label) + } + } + fmt.Println() } return nil diff --git a/cli/internal/client/search.go b/cli/internal/client/search.go index 2392d41..018a579 100644 --- a/cli/internal/client/search.go +++ b/cli/internal/client/search.go @@ -2,16 +2,34 @@ package client import "fmt" -// SearchResult represents a code search result +// SearchResult represents a code search result. +// +// NestedHits is populated by the server's mergeOverlappingHits step when +// other matches inside this chunk's line range were absorbed (e.g. a +// markdown H2 inside an H1 section, or a method inside its class). The +// renderer uses these to show breadcrumbs so the user can see WHY this +// outer chunk ranks well. type SearchResult struct { - FilePath string `json:"file_path"` + FilePath string `json:"file_path"` + StartLine int `json:"start_line"` + EndLine int `json:"end_line"` + Content string `json:"content"` + Score float64 `json:"score"` + ChunkType string `json:"chunk_type"` + SymbolName string `json:"symbol_name"` + Language string `json:"language"` + NestedHits []NestedHit `json:"nested_hits,omitempty"` +} + +// NestedHit is a chunk that was merged INTO another result by the server. +// Just enough metadata to render a breadcrumb and let the user jump to +// the exact line. The full content is already inside the parent result. 
+type NestedHit struct { StartLine int `json:"start_line"` EndLine int `json:"end_line"` - Content string `json:"content"` - Score float64 `json:"score"` + SymbolName string `json:"symbol_name,omitempty"` ChunkType string `json:"chunk_type"` - SymbolName string `json:"symbol_name"` - Language string `json:"language"` + Score float64 `json:"score"` } // SearchResponse represents the search response diff --git a/server/internal/httpapi/indexing_test.go b/server/internal/httpapi/indexing_test.go index b072102..cc73c18 100644 --- a/server/internal/httpapi/indexing_test.go +++ b/server/internal/httpapi/indexing_test.go @@ -272,6 +272,71 @@ func TestSemanticSearch_HTTP(t *testing.T) { } } +// TestSemanticSearch_NestedMarkdownMerge indexes a markdown file with H1 +// containing two H2 sections, all containing a unique token. The chunker +// emits 3 overlapping `section` chunks (1 outer + 2 inner). After +// mergeOverlappingHits the outer section absorbs both inner ones — +// observable as ONE result with NestedHits populated, instead of three +// near-duplicates fighting for the user's --limit budget. 
+func TestSemanticSearch_NestedMarkdownMerge(t *testing.T) { + d, hash := newIndexerTestDeps(t, "/proj-md") + router := NewRouter(d) + + beginW := doRequest(t, router, http.MethodPost, "/api/v1/projects/"+hash+"/index/begin", map[string]any{}) + var begin indexBeginResponse + _ = json.Unmarshal(beginW.Body.Bytes(), &begin) + + content := "# Setup zlork\n\nIntro about zlork.\n\n## Local zlork dev\n\n" + + "Steps for zlork.\n\n## Remote zlork\n\nMore zlork.\n" + doRequest(t, router, http.MethodPost, "/api/v1/projects/"+hash+"/index/files", map[string]any{ + "run_id": begin.RunID, + "files": []map[string]any{ + {"path": "/proj-md/README.md", "content": content, "content_hash": shaHex(content), "language": "markdown"}, + }, + }) + doRequest(t, router, http.MethodPost, "/api/v1/projects/"+hash+"/index/finish", map[string]any{ + "run_id": begin.RunID, + }) + + w := doRequest(t, router, http.MethodPost, "/api/v1/projects/"+hash+"/search", map[string]any{ + "query": "zlork", + "limit": 10, + "min_score": 0.0, + }) + if w.Code != http.StatusOK { + t.Fatalf("status=%d body=%s", w.Code, w.Body.String()) + } + + var resp searchResponse + if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil { + t.Fatalf("unmarshal: %v", err) + } + + // Find the outer section result and verify it has nested_hits. + var outer *searchResultItem + for i := range resp.Results { + r := &resp.Results[i] + if r.FilePath == "/proj-md/README.md" && r.StartLine == 1 { + outer = r + break + } + } + if outer == nil { + t.Fatalf("expected an outer section starting at line 1, got results: %+v", resp.Results) + } + if len(outer.NestedHits) == 0 { + t.Errorf("outer section should have nested hits absorbed, got NestedHits=%v", outer.NestedHits) + } + // And we should NOT see those nested ranges as separate top-level results. 
+ for _, r := range resp.Results { + if r.FilePath == "/proj-md/README.md" && r.StartLine != 1 { + // Any other start line in the same file means a nested section + // leaked through merging. + t.Errorf("non-outer section leaked as separate result: lines %d-%d", r.StartLine, r.EndLine) + } + } +} + func TestSemanticSearch_HTTP_MissingQuery(t *testing.T) { d, hash := newIndexerTestDeps(t, "/proj") router := NewRouter(d) diff --git a/server/internal/httpapi/search.go b/server/internal/httpapi/search.go index befd16e..e3918c4 100644 --- a/server/internal/httpapi/search.go +++ b/server/internal/httpapi/search.go @@ -1,6 +1,7 @@ package httpapi import ( + "context" "database/sql" "encoding/json" "errors" @@ -582,14 +583,34 @@ type searchRequest struct { } type searchResultItem struct { - FilePath string `json:"file_path"` + FilePath string `json:"file_path"` + StartLine int `json:"start_line"` + EndLine int `json:"end_line"` + Content string `json:"content"` + Score float32 `json:"score"` + ChunkType string `json:"chunk_type"` + SymbolName string `json:"symbol_name"` + Language string `json:"language"` + // NestedHits records other matches inside this result's line range that + // were merged into it by mergeOverlappingHits. Populated only when at + // least one inner hit was absorbed; emitted as `nested_hits` in JSON. + // The renderer uses these to show breadcrumbs (e.g. "+ 2 more matches: + // H2 'Foo' line 27, H3 'Bar' line 29") so the user can see WHY this + // outer chunk ranks well even when the actual signal came from a + // sub-section. + NestedHits []nestedHit `json:"nested_hits,omitempty"` +} + +// nestedHit is a compact view of a chunk that was merged INTO another +// result. We don't need the full content (the parent's content already +// contains it textually) — just enough metadata to render a breadcrumb +// and let the caller jump to the exact line. 
+type nestedHit struct { StartLine int `json:"start_line"` EndLine int `json:"end_line"` - Content string `json:"content"` - Score float32 `json:"score"` + SymbolName string `json:"symbol_name,omitempty"` ChunkType string `json:"chunk_type"` - SymbolName string `json:"symbol_name"` - Language string `json:"language"` + Score float32 `json:"score"` } type searchResponse struct { @@ -652,117 +673,173 @@ func semanticSearchHandler(d Deps) http.HandlerFunc { return } - // M4 — multi-language fan-out. chromem-go's `where` map cannot express - // "language IN (go, python)" natively, so: - // - 0 languages: single query, no where filter. - // - 1 language: single query with `where={"language": lang}` — same - // HNSW-level pre-filter as Python. - // - ≥2 languages: N independent queries (one per language) merged and - // deduped by document ID. Preserves pre-filter semantics so the top - // results are not starved by unrelated languages when the collection - // is large. - const maxFanout = 4 - - var allResults []vectorStoreResult - switch { - case len(body.Languages) == 0: - r1, err := d.VectorStore.Search(r.Context(), p.HostPath, qEmb, body.Limit*2, nil) + // Post-filter (path/language) and merge state are computed once outside + // the window loop — both are cheap and don't depend on factor. + langSet := map[string]struct{}{} + for _, l := range body.Languages { + langSet[l] = struct{}{} + } + applyPostLangFilter := len(body.Languages) > maxFanoutSearch + + // Windowed retrieval. Start by asking the vector store for limit×2 + // (the historical default), and if mergeOverlappingHits collapses + // the result set below the user's --limit budget — typically because + // of nested markdown sections or class+method overlaps — re-ask for + // limit×4, then ×8, up to ×maxFactorSearch. Stops early when the + // store returns fewer rows than requested (HNSW exhausted). 
+ var merged []searchResultItem + factor := 2 + for { + n := body.Limit * factor + rawWrapped, err := fetchVectorResults( + r.Context(), d.VectorStore, p.HostPath, qEmb, n, body.Languages, + ) if err != nil { writeError(w, http.StatusInternalServerError, err.Error()) return } - allResults = wrapResults(r1) - case len(body.Languages) == 1: - r1, err := d.VectorStore.Search(r.Context(), p.HostPath, qEmb, body.Limit*2, - map[string]string{"language": body.Languages[0]}) - if err != nil { - writeError(w, http.StatusInternalServerError, err.Error()) - return + filtered := filterToSearchItems(rawWrapped, minScore, body.Paths, langSet, applyPostLangFilter) + merged = mergeOverlappingHits(filtered) + if len(merged) >= body.Limit { + break } - allResults = wrapResults(r1) - case len(body.Languages) <= maxFanout: - // Per-language fan-out; merge and dedupe. - for _, lang := range body.Languages { - rPart, err := d.VectorStore.Search(r.Context(), p.HostPath, qEmb, body.Limit*2, - map[string]string{"language": lang}) - if err != nil { - writeError(w, http.StatusInternalServerError, err.Error()) - return - } - allResults = append(allResults, wrapResults(rPart)...) + if len(rawWrapped) < n { + // Vector store returned everything it had — no point asking again. + break } - allResults = dedupByLocation(allResults) - // Sort by descending score — merged slices arrive pre-sorted per - // partition but out of order across partitions. - sort.SliceStable(allResults, func(i, j int) bool { - return allResults[i].r.Score > allResults[j].r.Score - }) - default: - // Too many languages for fan-out — fall back to post-filter with a - // generous over-fetch to minimise starvation. 
- rAll, err := d.VectorStore.Search(r.Context(), p.HostPath, qEmb, - body.Limit*len(body.Languages)*2, nil) - if err != nil { - writeError(w, http.StatusInternalServerError, err.Error()) - return + if factor >= maxFactorSearch { + break } - allResults = wrapResults(rAll) + factor *= 2 } - // Post-filter for the >maxFanout path needs a language set. - langSet := map[string]struct{}{} - for _, l := range body.Languages { - langSet[l] = struct{}{} + if len(merged) > body.Limit { + merged = merged[:body.Limit] } - applyPostLangFilter := len(body.Languages) > maxFanout - filtered := make([]searchResultItem, 0, len(allResults)) - for _, wrapped := range allResults { - res := wrapped.r - if res.Score < minScore { - continue + elapsedMS := float64(time.Since(start).Microseconds()) / 1000.0 + elapsedMS = float64(int(elapsedMS*10+0.5)) / 10 + + writeJSON(w, http.StatusOK, searchResponse{ + Results: merged, + Total: len(merged), + QueryTimeMS: elapsedMS, + }) + } +} + +// maxFanoutSearch is the language-count threshold above which we drop +// per-language pre-filter and fall back to a single over-fetched query +// with post-filter. Same value as the previous inline `maxFanout`. +const maxFanoutSearch = 4 + +// maxFactorSearch caps the windowed retrieval expansion. With body.Limit=10 +// and factor=16 we top out at 160 raw results — enough to fill the budget +// even on heavily nested markdown without spending all day re-querying. +const maxFactorSearch = 16 + +// fetchVectorResults performs the per-language fan-out vector-store query +// at the given limit and returns deduped, score-sorted results. Extracted +// from semanticSearchHandler so the windowed retry loop can call it with +// growing `n` values without duplicating the four-case switch. 
+// +// The fan-out strategy mirrors the original inline logic: 0 languages → +// single query; 1 language → single query with where-filter; 2..maxFanout +// → N queries with per-language where-filter, deduped and re-sorted by +// score; >maxFanout → single oversized query, post-filter handled by +// caller (filterToSearchItems with applyPostLangFilter=true). +func fetchVectorResults( + ctx context.Context, + store *vectorstore.Store, + projectPath string, + qEmb []float32, + n int, + languages []string, +) ([]vectorStoreResult, error) { + switch { + case len(languages) == 0: + r1, err := store.Search(ctx, projectPath, qEmb, n, nil) + if err != nil { + return nil, err + } + return wrapResults(r1), nil + case len(languages) == 1: + r1, err := store.Search(ctx, projectPath, qEmb, n, + map[string]string{"language": languages[0]}) + if err != nil { + return nil, err + } + return wrapResults(r1), nil + case len(languages) <= maxFanoutSearch: + var combined []vectorStoreResult + for _, lang := range languages { + rPart, err := store.Search(ctx, projectPath, qEmb, n, + map[string]string{"language": lang}) + if err != nil { + return nil, err } - if applyPostLangFilter { - if _, ok := langSet[res.Language]; !ok { - continue - } + combined = append(combined, wrapResults(rPart)...) + } + combined = dedupByLocation(combined) + sort.SliceStable(combined, func(i, j int) bool { + return combined[i].r.Score > combined[j].r.Score + }) + return combined, nil + default: + rAll, err := store.Search(ctx, projectPath, qEmb, n*len(languages), nil) + if err != nil { + return nil, err + } + return wrapResults(rAll), nil + } +} + +// filterToSearchItems applies min-score, language post-filter, and path +// prefix/substring matches. It does NOT truncate — the merge step needs +// the full filtered set to identify all overlaps before deciding which to +// drop. Truncation happens after merge in the caller. 
+func filterToSearchItems( + wrapped []vectorStoreResult, + minScore float32, + paths []string, + langSet map[string]struct{}, + applyPostLangFilter bool, +) []searchResultItem { + filtered := make([]searchResultItem, 0, len(wrapped)) + for _, w := range wrapped { + res := w.r + if res.Score < minScore { + continue + } + if applyPostLangFilter { + if _, ok := langSet[res.Language]; !ok { + continue } - if len(body.Paths) > 0 { - matched := false - for _, pfx := range body.Paths { - if strings.HasPrefix(res.FilePath, pfx) || strings.Contains(res.FilePath, pfx) { - matched = true - break - } - } - if !matched { - continue + } + if len(paths) > 0 { + matched := false + for _, pfx := range paths { + if strings.HasPrefix(res.FilePath, pfx) || strings.Contains(res.FilePath, pfx) { + matched = true + break } } - filtered = append(filtered, searchResultItem{ - FilePath: res.FilePath, - StartLine: res.StartLine, - EndLine: res.EndLine, - Content: res.Content, - Score: res.Score, - ChunkType: res.ChunkType, - SymbolName: res.SymbolName, - Language: res.Language, - }) - if len(filtered) >= body.Limit { - break + if !matched { + continue } } - - elapsedMS := float64(time.Since(start).Microseconds()) / 1000.0 - elapsedMS = float64(int(elapsedMS*10+0.5)) / 10 - - writeJSON(w, http.StatusOK, searchResponse{ - Results: filtered, - Total: len(filtered), - QueryTimeMS: elapsedMS, + filtered = append(filtered, searchResultItem{ + FilePath: res.FilePath, + StartLine: res.StartLine, + EndLine: res.EndLine, + Content: res.Content, + Score: res.Score, + ChunkType: res.ChunkType, + SymbolName: res.SymbolName, + Language: res.Language, }) } + return filtered } // strconvItoa avoids pulling strconv just for one call in this file — mirrors diff --git a/server/internal/httpapi/search_merge.go b/server/internal/httpapi/search_merge.go new file mode 100644 index 0000000..36250d9 --- /dev/null +++ b/server/internal/httpapi/search_merge.go @@ -0,0 +1,138 @@ +package httpapi + +import "sort" + +// 
mergeOverlappingHits collapses search results that come from the same file +// when one's line range fully contains another's, or when same-symbol pieces +// of a split chunk happen to be adjacent. The "outer" hit survives, picks up +// the best score across the merged set, and records inner hits as +// NestedHits so the renderer can show them as breadcrumbs. +// +// Why this matters +// +// Tree-sitter emits nested chunks by design: a class chunk wraps its method +// chunks; a markdown H1 section wraps its H2 sub-sections; a Python class +// wraps inner functions. Without merging, a vector-search query that hits +// strongly inside one of those nested chunks tends to also hit (slightly +// less strongly) the parent chunk that textually contains the same lines, +// and the user's --limit budget gets eaten by N copies of essentially the +// same code region. +// +// Adjacency rule (the splitChunk leftover): when a function is too long for +// a single chunk, splitChunk emits piece 1 with the symbol metadata and +// pieces 2..N as anonymous `block`s. If a query happens to hit BOTH the +// named first piece AND the anonymous tail, those two ranges are exactly +// adjacent (piece1.EndLine + 1 == piece2.StartLine). We merge those too — +// the anonymous tail "belongs" to the named symbol on the same file. +// +// Cross-file results are NEVER merged: two functions with the same name in +// two different files are legitimately separate hits. +// +// The function does not truncate to any limit — that's the caller's job +// after this returns. Output is sorted by descending merged score. +func mergeOverlappingHits(items []searchResultItem) []searchResultItem { + if len(items) <= 1 { + return items + } + + // Group indices by file path. Keeping indices (not copies) so we can + // edit items[parentIdx] in-place to grow its NestedHits. 
+ byFile := map[string][]int{} + for i := range items { + byFile[items[i].FilePath] = append(byFile[items[i].FilePath], i) + } + + consumed := make([]bool, len(items)) + + for _, idxs := range byFile { + if len(idxs) <= 1 { + continue + } + + // Sort by range size descending (largest first → potential parent), + // tiebreak by start line ascending so the iteration order is stable + // and biggest-encloses-everything-inside-it semantics fall out + // naturally. + sort.Slice(idxs, func(a, b int) bool { + ia, ib := items[idxs[a]], items[idxs[b]] + sa := ia.EndLine - ia.StartLine + sb := ib.EndLine - ib.StartLine + if sa != sb { + return sa > sb + } + return ia.StartLine < ib.StartLine + }) + + for ai := 0; ai < len(idxs); ai++ { + parentIdx := idxs[ai] + if consumed[parentIdx] { + continue + } + parent := items[parentIdx] + + for _, childIdx := range idxs[ai+1:] { + if consumed[childIdx] { + continue + } + child := items[childIdx] + if !shouldMerge(parent, child) { + continue + } + consumed[childIdx] = true + if child.Score > parent.Score { + parent.Score = child.Score + } + parent.NestedHits = append(parent.NestedHits, nestedHit{ + StartLine: child.StartLine, + EndLine: child.EndLine, + SymbolName: child.SymbolName, + ChunkType: child.ChunkType, + Score: child.Score, + }) + } + items[parentIdx] = parent + } + } + + out := make([]searchResultItem, 0, len(items)) + for i := range items { + if !consumed[i] { + out = append(out, items[i]) + } + } + + sort.SliceStable(out, func(i, j int) bool { + return out[i].Score > out[j].Score + }) + return out +} + +// shouldMerge returns true when child should be absorbed into parent. +// Two cases trigger a merge — see mergeOverlappingHits doc-comment for +// the rationale. +func shouldMerge(parent, child searchResultItem) bool { + if parent.FilePath != child.FilePath { + return false + } + // Case 1: parent's range strictly contains child's range. We require a + // strict containment (i.e. 
it's NOT the same range) to avoid merging
+	// duplicates from per-language fan-out — those should be deduped at the
+	// vector-store layer, not here.
+	if parent.StartLine <= child.StartLine && parent.EndLine >= child.EndLine {
+		if parent.StartLine != child.StartLine || parent.EndLine != child.EndLine {
+			return true
+		}
+	}
+	// Case 2: split-chunk adjacency. After splitChunk, only the first
+	// piece keeps SymbolName, so the typical pattern is
+	// {symbol=run, lines 61..195} + {symbol="" tail block, lines 196..198}.
+	// Merge only when EXACTLY ONE side carries a symbol — that is the
+	// splitChunk signature. Two named neighbours (or two anonymous ones)
+	// are unrelated chunks and must stay separate.
+	if (parent.SymbolName == "") != (child.SymbolName == "") {
+		if parent.EndLine+1 == child.StartLine || child.EndLine+1 == parent.StartLine {
+			return true
+		}
+	}
+	return false
+}
diff --git a/server/internal/httpapi/search_merge_test.go b/server/internal/httpapi/search_merge_test.go
new file mode 100644
index 0000000..7f7da23
--- /dev/null
+++ b/server/internal/httpapi/search_merge_test.go
@@ -0,0 +1,188 @@
+package httpapi
+
+import (
+	"testing"
+)
+
+func mkItem(file string, start, end int, score float32, symbol, kind string) searchResultItem {
+	return searchResultItem{
+		FilePath:   file,
+		StartLine:  start,
+		EndLine:    end,
+		Score:      score,
+		SymbolName: symbol,
+		ChunkType:  kind,
+		Language:   "go",
+	}
+}
+
+// TestMerge_NestedSections: H1 (1-200) wraps H2 (27-80), which wraps H3 (29-50).
+// All three matched the query — output should be the H1 chunk with
+// 2 nested hits, score = max of all three.
+func TestMerge_NestedSections(t *testing.T) { + items := []searchResultItem{ + mkItem("README.md", 1, 200, 0.30, "", "section"), + mkItem("README.md", 27, 80, 0.45, "", "section"), + mkItem("README.md", 29, 50, 0.50, "", "section"), + } + out := mergeOverlappingHits(items) + if len(out) != 1 { + t.Fatalf("want 1 merged result, got %d", len(out)) + } + got := out[0] + if got.StartLine != 1 || got.EndLine != 200 { + t.Errorf("expected outer range 1-200, got %d-%d", got.StartLine, got.EndLine) + } + if got.Score != 0.50 { + t.Errorf("merged score = %v, want max=0.50", got.Score) + } + if len(got.NestedHits) != 2 { + t.Fatalf("want 2 nested hits, got %d", len(got.NestedHits)) + } +} + +// TestMerge_SameSymbolAdjacent: splitChunk emitted run() as +// - lines 61-195, function:run +// - lines 196-198, block, no symbol +// Merge the second into the first. +func TestMerge_SameSymbolAdjacent(t *testing.T) { + items := []searchResultItem{ + mkItem("main.go", 61, 195, 0.40, "run", "function"), + mkItem("main.go", 196, 198, 0.30, "", "block"), + } + out := mergeOverlappingHits(items) + if len(out) != 1 { + t.Fatalf("want 1 merged result, got %d", len(out)) + } + if out[0].StartLine != 61 || out[0].EndLine != 195 { + // Parent absorbs child; parent's range stays as-is. + t.Errorf("merged range = %d-%d, want 61-195", out[0].StartLine, out[0].EndLine) + } + if out[0].SymbolName != "run" { + t.Errorf("symbol lost: got %q, want run", out[0].SymbolName) + } +} + +// TestMerge_SiblingsNotMerged: two H2 sections in same file at separate +// non-overlapping ranges → keep both. +func TestMerge_SiblingsNotMerged(t *testing.T) { + items := []searchResultItem{ + mkItem("doc.md", 10, 30, 0.40, "", "section"), + mkItem("doc.md", 50, 90, 0.45, "", "section"), + } + out := mergeOverlappingHits(items) + if len(out) != 2 { + t.Fatalf("siblings should stay separate, got %d results", len(out)) + } +} + +// TestMerge_DifferentFiles: same line range, different files → not merged. 
+func TestMerge_DifferentFiles(t *testing.T) { + items := []searchResultItem{ + mkItem("a.go", 10, 30, 0.40, "fn", "function"), + mkItem("b.go", 10, 30, 0.45, "fn", "function"), + } + out := mergeOverlappingHits(items) + if len(out) != 2 { + t.Fatalf("cross-file dupes shouldn't merge, got %d", len(out)) + } +} + +// TestMerge_ExactDuplicateNotAbsorbed: same range twice (e.g. fan-out +// hiccup) — these should be deduped upstream (dedupByLocation), not by +// merge. We treat them as siblings here. +func TestMerge_ExactDuplicateNotAbsorbed(t *testing.T) { + items := []searchResultItem{ + mkItem("x.go", 1, 100, 0.30, "Foo", "class"), + mkItem("x.go", 1, 100, 0.40, "Foo", "class"), + } + out := mergeOverlappingHits(items) + if len(out) != 2 { + t.Errorf("exact duplicates should NOT be merged here (dedup is a separate step), got %d", len(out)) + } +} + +// TestMerge_RescoreUsesMax: parent had lower score than child → merged +// result inherits child's higher score. +func TestMerge_RescoreUsesMax(t *testing.T) { + items := []searchResultItem{ + mkItem("a.go", 1, 100, 0.20, "Outer", "class"), + mkItem("a.go", 30, 50, 0.80, "inner", "method"), + } + out := mergeOverlappingHits(items) + if len(out) != 1 { + t.Fatalf("want 1, got %d", len(out)) + } + if out[0].Score != 0.80 { + t.Errorf("merged score = %v, want 0.80", out[0].Score) + } + if len(out[0].NestedHits) != 1 || out[0].NestedHits[0].SymbolName != "inner" { + t.Errorf("nested hit missing or wrong: %+v", out[0].NestedHits) + } +} + +// TestMerge_ResortByMergedScore: merged item's max score should bring it +// to the top of the result list. 
+func TestMerge_ResortByMergedScore(t *testing.T) { + items := []searchResultItem{ + mkItem("a.go", 1, 100, 0.20, "Outer", "class"), // will absorb 0.80 + mkItem("a.go", 30, 50, 0.80, "inner", "method"), + mkItem("b.go", 1, 50, 0.50, "other", "function"), + } + out := mergeOverlappingHits(items) + if len(out) != 2 { + t.Fatalf("want 2 results after merge, got %d", len(out)) + } + if out[0].FilePath != "a.go" { + t.Errorf("merged a.go (score 0.80) should be first, got %s (score %v)", out[0].FilePath, out[0].Score) + } +} + +// TestMerge_NoOverlapNoMerge: completely disjoint hits stay as-is and +// keep their original score order. +func TestMerge_NoOverlapNoMerge(t *testing.T) { + items := []searchResultItem{ + mkItem("a.go", 1, 5, 0.50, "fnA", "function"), + mkItem("b.go", 10, 15, 0.40, "fnB", "function"), + mkItem("c.go", 20, 25, 0.30, "fnC", "function"), + } + out := mergeOverlappingHits(items) + if len(out) != 3 { + t.Fatalf("disjoint items should stay separate, got %d", len(out)) + } + if out[0].Score != 0.50 || out[1].Score != 0.40 || out[2].Score != 0.30 { + t.Errorf("score order broken: %v / %v / %v", out[0].Score, out[1].Score, out[2].Score) + } +} + +// TestMerge_TripleNesting: H1 -> H2 -> H3, all match. After merge, ONE +// result with 2 nested hits (H2 and H3 inside H1). +func TestMerge_TripleNesting(t *testing.T) { + items := []searchResultItem{ + mkItem("d.md", 1, 100, 0.30, "", "section"), + mkItem("d.md", 10, 50, 0.40, "", "section"), + mkItem("d.md", 20, 30, 0.55, "", "section"), + } + out := mergeOverlappingHits(items) + if len(out) != 1 { + t.Fatalf("triple nesting → 1 result, got %d", len(out)) + } + if len(out[0].NestedHits) != 2 { + t.Errorf("want 2 nested hits, got %d", len(out[0].NestedHits)) + } +} + +// TestMerge_AdjacentNoSymbolNotMerged: two anonymous chunks adjacent in +// the same file are NOT merged — we only merge adjacent chunks if at +// least one carries a symbol (otherwise we have no signal that they're +// related). 
+func TestMerge_AdjacentNoSymbolNotMerged(t *testing.T) { + items := []searchResultItem{ + mkItem("x.go", 1, 50, 0.40, "", "module"), + mkItem("x.go", 51, 100, 0.45, "", "module"), + } + out := mergeOverlappingHits(items) + if len(out) != 2 { + t.Errorf("anonymous adjacent chunks shouldn't merge, got %d", len(out)) + } +} From 14fc45a74c832ddbaac254429bf22ec323214f50 Mon Sep 17 00:00:00 2001 From: dvcdsys Date: Mon, 27 Apr 2026 22:47:41 +0100 Subject: [PATCH 6/6] =?UTF-8?q?feat(search):=20file-grouped=20results=20?= =?UTF-8?q?=E2=80=94=20one=20file=20=3D=20one=20entry,=20all=20matches=20i?= =?UTF-8?q?nside?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes the unit of search output from "chunk" to "file". Inspired by how grep groups hits per file but with AST-aware match boundaries and embedding-driven ranking. Old wire shape: a flat list of chunks. A file with three matching chunks ate three slots out of the user's --limit budget, scattered across the result list, often with the file appearing at positions #3 and #10 simultaneously. New wire shape: results: [ { file_path, language, best_score, matches: [ { start_line, end_line, score, content, chunk_type, symbol_name, nested_hits }, ... ] } ] total: Ranking: * Files ordered by best_score (the highest match score in the group) descending. * Inside each file, matches ordered by start_line ascending — natural reading order top-to-bottom. * No per-file cap on matches. The only intra-file filter is min_score. A file with 50 matches above threshold shows all 50. Window loop now targets distinct files, not chunks: factor 2..16, stops when len(file_groups) >= limit, when the vector store returns fewer rows than asked, or when the cap is hit. mergeOverlappingHits still runs FIRST (collapses nested H1⊋H2⊋H3 etc. into one match with nested_hits inside), then groupByFile lifts the survivors into file-grouped output. 
So a markdown file with three nested sections still produces ONE match (not one file with three), and a Go file with class+method overlap still produces a clean class match with the method as a nested hit. CLI render redesigned around the new shape: 1. /path/to/file.go [best 0.85] 4 matches · go -- [0.85] lines 61-195 (function run) ```go ... ``` + 1 more match inside: · [0.50] line 80 (function init) -- [0.42] lines 250-280 (type Server) ```go ... ``` Tests: * groupByFile: sort-by-best-score, sort-matches-by-line, preserves nested_hits, empty input. * TestSemanticSearch_NestedMarkdownMerge updated for the new shape — still asserts the H1 absorbs the two H2 sub-sections (now visible as group.Matches[0].NestedHits). * CLI search_test fixture updated to new wire shape. Co-Authored-By: Claude Opus 4.7 --- cli/cmd/search.go | 87 ++++++----- cli/cmd/search_test.go | 25 ++-- cli/internal/client/search.go | 53 ++++--- server/internal/httpapi/indexing_test.go | 32 ++-- server/internal/httpapi/search.go | 146 +++++++++++++++---- server/internal/httpapi/search_merge_test.go | 74 ++++++++++ 6 files changed, 298 insertions(+), 119 deletions(-) diff --git a/cli/cmd/search.go b/cli/cmd/search.go index 9b12855..c67247f 100644 --- a/cli/cmd/search.go +++ b/cli/cmd/search.go @@ -112,48 +112,59 @@ func runSearch(cmd *cobra.Command, args []string) error { return nil } - // Print results - fmt.Printf("Found %d result(s) (%.1fms):\n\n", results.Total, results.QueryTimeMS) - - for i, result := range results.Results { - // Format score as colored - scoreStr := fmt.Sprintf("%.2f", result.Score) - - // Print result header - fmt.Printf("%d. [%s] %s:%d-%d\n", - i+1, scoreStr, result.FilePath, result.StartLine, result.EndLine) - - // Print metadata - meta := []string{} - if result.SymbolName != "" { - meta = append(meta, fmt.Sprintf("Symbol: %s", result.SymbolName)) + // Files-as-results: --limit is a count of files. 
Inside each file, + // every match above min_score is shown, ordered by line number so the + // reader walks the file top-to-bottom. + fmt.Printf("Found %d file(s) (%.1fms):\n\n", results.Total, results.QueryTimeMS) + + for i, file := range results.Results { + // File header. Best score is the rank driver; total match count + // gives a sense of how relevant this file is overall. + matchWord := "match" + if len(file.Matches) != 1 { + matchWord = "matches" } - meta = append(meta, fmt.Sprintf("Type: %s", result.ChunkType)) - if result.Language != "" { - meta = append(meta, fmt.Sprintf("Lang: %s", result.Language)) + langSuffix := "" + if file.Language != "" { + langSuffix = " · " + file.Language } - fmt.Printf(" %s\n", strings.Join(meta, " | ")) + fmt.Printf("%d. %s [best %.2f] %d %s%s\n", + i+1, file.FilePath, file.BestScore, len(file.Matches), matchWord, langSuffix) + + for _, m := range file.Matches { + // Per-match separator with score + line range + label so the + // user can scan vertically by relevance, even though matches + // are in line order. + label := m.ChunkType + if m.SymbolName != "" { + label = fmt.Sprintf("%s %s", m.ChunkType, m.SymbolName) + } + rangeStr := fmt.Sprintf("line %d", m.StartLine) + if m.EndLine != m.StartLine { + rangeStr = fmt.Sprintf("lines %d-%d", m.StartLine, m.EndLine) + } + fmt.Printf(" -- [%.2f] %s (%s)\n", m.Score, rangeStr, label) - fmt.Printf(" ```%s\n", result.Language) - content := result.Content - for _, line := range strings.Split(content, "\n") { - fmt.Printf(" %s\n", line) - } - fmt.Printf(" ```\n") - - // Breadcrumbs for nested hits absorbed by the server's merge step. - // Tells the user "this big chunk ranks well because of these inner - // matches" so they're not surprised that --limit returned fewer - // items than expected. 
- if len(result.NestedHits) > 0 { - fmt.Printf(" + %d more match(es) inside:\n", len(result.NestedHits)) - for _, nh := range result.NestedHits { - label := nh.ChunkType - if nh.SymbolName != "" { - label = fmt.Sprintf("%s %s", nh.ChunkType, nh.SymbolName) + lang := file.Language + fmt.Printf(" ```%s\n", lang) + for _, line := range strings.Split(m.Content, "\n") { + fmt.Printf(" %s\n", line) + } + fmt.Printf(" ```\n") + + // Nested hits — chunks merged INTO this match by the server. + // They sit textually inside m.Content; this just exposes the + // inner anchor points so the user can jump to the exact line. + if len(m.NestedHits) > 0 { + fmt.Printf(" + %d more match(es) inside:\n", len(m.NestedHits)) + for _, nh := range m.NestedHits { + nhLabel := nh.ChunkType + if nh.SymbolName != "" { + nhLabel = fmt.Sprintf("%s %s", nh.ChunkType, nh.SymbolName) + } + fmt.Printf(" · [%.2f] line %d (%s)\n", + nh.Score, nh.StartLine, nhLabel) } - fmt.Printf(" · [%.2f] %s:%d-%d (%s)\n", - nh.Score, result.FilePath, nh.StartLine, nh.EndLine, label) } } fmt.Println() diff --git a/cli/cmd/search_test.go b/cli/cmd/search_test.go index 140038b..609e7e4 100644 --- a/cli/cmd/search_test.go +++ b/cli/cmd/search_test.go @@ -18,14 +18,19 @@ func TestRunSearch_Results(t *testing.T) { writeJSON(w, 200, map[string]any{ "results": []map[string]any{ { - "file_path": proj + "/api/auth.go", - "start_line": 10, - "end_line": 25, - "content": "func AuthMiddleware() {}", - "score": 0.92, - "chunk_type": "function", - "symbol_name": "AuthMiddleware", - "language": "go", + "file_path": proj + "/api/auth.go", + "language": "go", + "best_score": 0.92, + "matches": []map[string]any{ + { + "start_line": 10, + "end_line": 25, + "content": "func AuthMiddleware() {}", + "score": 0.92, + "chunk_type": "function", + "symbol_name": "AuthMiddleware", + }, + }, }, }, "total": 1, @@ -58,8 +63,8 @@ func TestRunSearch_Results(t *testing.T) { if !strings.Contains(out, "auth.go") { t.Errorf("expected file path in 
output, got:\n%s", out) } - if !strings.Contains(out, "1 result") { - t.Errorf("expected result count in output, got:\n%s", out) + if !strings.Contains(out, "1 file") { + t.Errorf("expected file count in output, got:\n%s", out) } } diff --git a/cli/internal/client/search.go b/cli/internal/client/search.go index 018a579..093be6a 100644 --- a/cli/internal/client/search.go +++ b/cli/internal/client/search.go @@ -2,28 +2,23 @@ package client import "fmt" -// SearchResult represents a code search result. -// -// NestedHits is populated by the server's mergeOverlappingHits step when -// other matches inside this chunk's line range were absorbed (e.g. a -// markdown H2 inside an H1 section, or a method inside its class). The -// renderer uses these to show breadcrumbs so the user can see WHY this -// outer chunk ranks well. -type SearchResult struct { - FilePath string `json:"file_path"` - StartLine int `json:"start_line"` - EndLine int `json:"end_line"` - Content string `json:"content"` - Score float64 `json:"score"` - ChunkType string `json:"chunk_type"` - SymbolName string `json:"symbol_name"` - Language string `json:"language"` - NestedHits []NestedHit `json:"nested_hits,omitempty"` +// FileMatch is one search hit inside a file group. Position + score + +// content + chunk metadata. NestedHits records overlapping inner chunks +// that were absorbed by mergeOverlappingHits on the server side (e.g. a +// markdown H2 absorbed into its parent H1 section). +type FileMatch struct { + StartLine int `json:"start_line"` + EndLine int `json:"end_line"` + Content string `json:"content"` + Score float64 `json:"score"` + ChunkType string `json:"chunk_type"` + SymbolName string `json:"symbol_name,omitempty"` + NestedHits []NestedHit `json:"nested_hits,omitempty"` } -// NestedHit is a chunk that was merged INTO another result by the server. -// Just enough metadata to render a breadcrumb and let the user jump to -// the exact line. 
The full content is already inside the parent result. +// NestedHit is a chunk absorbed INTO a parent FileMatch. The parent's +// content already contains it textually; this carries just the metadata +// so renderers can show a breadcrumb and let the user jump to the line. type NestedHit struct { StartLine int `json:"start_line"` EndLine int `json:"end_line"` @@ -32,11 +27,23 @@ type NestedHit struct { Score float64 `json:"score"` } +// SearchResult is the top-level unit of search output: one file with +// every match inside it that passed min_score. Files are ordered by +// BestScore descending; matches inside are ordered by StartLine ascending +// (natural reading order). No per-file cap — the only intra-file filter +// is the similarity threshold. +type SearchResult struct { + FilePath string `json:"file_path"` + Language string `json:"language,omitempty"` + BestScore float64 `json:"best_score"` + Matches []FileMatch `json:"matches"` +} + // SearchResponse represents the search response type SearchResponse struct { - Results []SearchResult `json:"results"` - Total int `json:"total"` - QueryTimeMS float64 `json:"query_time_ms"` + Results []SearchResult `json:"results"` + Total int `json:"total"` + QueryTimeMS float64 `json:"query_time_ms"` } // SymbolResult represents a symbol search result diff --git a/server/internal/httpapi/indexing_test.go b/server/internal/httpapi/indexing_test.go index cc73c18..dcdce3a 100644 --- a/server/internal/httpapi/indexing_test.go +++ b/server/internal/httpapi/indexing_test.go @@ -312,28 +312,28 @@ func TestSemanticSearch_NestedMarkdownMerge(t *testing.T) { t.Fatalf("unmarshal: %v", err) } - // Find the outer section result and verify it has nested_hits. - var outer *searchResultItem + // Find the README.md file group and verify the merge happened. 
+ var group *fileGroupResult for i := range resp.Results { - r := &resp.Results[i] - if r.FilePath == "/proj-md/README.md" && r.StartLine == 1 { - outer = r + if resp.Results[i].FilePath == "/proj-md/README.md" { + group = &resp.Results[i] break } } - if outer == nil { - t.Fatalf("expected an outer section starting at line 1, got results: %+v", resp.Results) + if group == nil { + t.Fatalf("expected a file group for README.md, got results: %+v", resp.Results) + } + // After merge, only ONE match should remain in this file (the outer + // H1 section absorbing the two H2s as nested hits). + if len(group.Matches) != 1 { + t.Fatalf("expected 1 match in README.md after merge, got %d: %+v", len(group.Matches), group.Matches) + } + outer := group.Matches[0] + if outer.StartLine != 1 { + t.Errorf("outer match should start at line 1, got %d", outer.StartLine) } if len(outer.NestedHits) == 0 { - t.Errorf("outer section should have nested hits absorbed, got NestedHits=%v", outer.NestedHits) - } - // And we should NOT see those nested ranges as separate top-level results. - for _, r := range resp.Results { - if r.FilePath == "/proj-md/README.md" && r.StartLine != 1 { - // Any other start line in the same file means a nested section - // leaked through merging. - t.Errorf("non-outer section leaked as separate result: lines %d-%d", r.StartLine, r.EndLine) - } + t.Errorf("outer match should record absorbed nested hits, got NestedHits=%v", outer.NestedHits) } } diff --git a/server/internal/httpapi/search.go b/server/internal/httpapi/search.go index e3918c4..432407d 100644 --- a/server/internal/httpapi/search.go +++ b/server/internal/httpapi/search.go @@ -582,29 +582,28 @@ type searchRequest struct { MinScore *float32 `json:"min_score,omitempty"` } +// searchResultItem is the per-chunk match used INTERNALLY during retrieval. +// It is not exposed in the JSON response — the wire format groups matches +// by file (see fileGroupResult). 
The merge step (mergeOverlappingHits) +// works on this struct, then groupByFile lifts the survivors into +// file-grouped results. type searchResultItem struct { - FilePath string `json:"file_path"` + FilePath string `json:"-"` StartLine int `json:"start_line"` EndLine int `json:"end_line"` Content string `json:"content"` Score float32 `json:"score"` ChunkType string `json:"chunk_type"` - SymbolName string `json:"symbol_name"` - Language string `json:"language"` - // NestedHits records other matches inside this result's line range that - // were merged into it by mergeOverlappingHits. Populated only when at - // least one inner hit was absorbed; emitted as `nested_hits` in JSON. - // The renderer uses these to show breadcrumbs (e.g. "+ 2 more matches: - // H2 'Foo' line 27, H3 'Bar' line 29") so the user can see WHY this - // outer chunk ranks well even when the actual signal came from a - // sub-section. + SymbolName string `json:"symbol_name,omitempty"` + Language string `json:"-"` NestedHits []nestedHit `json:"nested_hits,omitempty"` } // nestedHit is a compact view of a chunk that was merged INTO another -// result. We don't need the full content (the parent's content already -// contains it textually) — just enough metadata to render a breadcrumb -// and let the caller jump to the exact line. +// result by mergeOverlappingHits (e.g. an H2 section absorbed into its +// containing H1). The parent's `content` already includes the inner +// textually; this just records the metadata so renderers can show a +// breadcrumb and let the user jump to the exact line. type nestedHit struct { StartLine int `json:"start_line"` EndLine int `json:"end_line"` @@ -613,10 +612,37 @@ type nestedHit struct { Score float32 `json:"score"` } +// fileMatch is one search hit inside a file group. Mirrors the per-chunk +// information from searchResultItem but without the file_path/language +// (those live one level up on fileGroupResult — same for every match in +// the group). 
+type fileMatch struct { + StartLine int `json:"start_line"` + EndLine int `json:"end_line"` + Content string `json:"content"` + Score float32 `json:"score"` + ChunkType string `json:"chunk_type"` + SymbolName string `json:"symbol_name,omitempty"` + NestedHits []nestedHit `json:"nested_hits,omitempty"` +} + +// fileGroupResult is the top-level unit of search output: one file with +// every match inside it that passed min_score. Files are ranked by +// BestScore (the highest match score in the group) and matches inside are +// ordered by StartLine ascending so the renderer reads top-to-bottom like +// the actual file. There is no per-file cap on matches — the only filter +// inside a file is the similarity threshold. +type fileGroupResult struct { + FilePath string `json:"file_path"` + Language string `json:"language,omitempty"` + BestScore float32 `json:"best_score"` + Matches []fileMatch `json:"matches"` +} + type searchResponse struct { - Results []searchResultItem `json:"results"` - Total int `json:"total"` - QueryTimeMS float64 `json:"query_time_ms"` + Results []fileGroupResult `json:"results"` + Total int `json:"total"` + QueryTimeMS float64 `json:"query_time_ms"` } // semanticSearchHandler implements POST /api/v1/projects/{path}/search, @@ -673,21 +699,26 @@ func semanticSearchHandler(d Deps) http.HandlerFunc { return } - // Post-filter (path/language) and merge state are computed once outside - // the window loop — both are cheap and don't depend on factor. + // Post-filter (path/language) state is computed once outside the + // window loop — cheap and doesn't depend on factor. langSet := map[string]struct{}{} for _, l := range body.Languages { langSet[l] = struct{}{} } applyPostLangFilter := len(body.Languages) > maxFanoutSearch - // Windowed retrieval. 
Start by asking the vector store for limit×2 - // (the historical default), and if mergeOverlappingHits collapses - // the result set below the user's --limit budget — typically because - // of nested markdown sections or class+method overlaps — re-ask for - // limit×4, then ×8, up to ×maxFactorSearch. Stops early when the - // store returns fewer rows than requested (HNSW exhausted). - var merged []searchResultItem + // Windowed retrieval. The user's --limit is a count of FILES, not + // chunks. We start with a chunk over-fetch of limit×2, group by + // file, and if we don't have `limit` distinct files yet, re-fetch + // with limit×4, ×8, ×16. Stops early when: + // - we have ≥ limit file groups, OR + // - the vector store returned fewer rows than asked (HNSW is + // exhausted), OR + // - factor cap (×16) reached. + // + // Inside each file there is no count cap — every match that passed + // min_score is shown. The threshold is the only intra-file filter. + var fileGroups []fileGroupResult factor := 2 for { n := body.Limit * factor @@ -699,12 +730,12 @@ func semanticSearchHandler(d Deps) http.HandlerFunc { return } filtered := filterToSearchItems(rawWrapped, minScore, body.Paths, langSet, applyPostLangFilter) - merged = mergeOverlappingHits(filtered) - if len(merged) >= body.Limit { + merged := mergeOverlappingHits(filtered) + fileGroups = groupByFile(merged) + if len(fileGroups) >= body.Limit { break } if len(rawWrapped) < n { - // Vector store returned everything it had — no point asking again. 
break } if factor >= maxFactorSearch { @@ -713,21 +744,72 @@ func semanticSearchHandler(d Deps) http.HandlerFunc { factor *= 2 } - if len(merged) > body.Limit { - merged = merged[:body.Limit] + if len(fileGroups) > body.Limit { + fileGroups = fileGroups[:body.Limit] } elapsedMS := float64(time.Since(start).Microseconds()) / 1000.0 elapsedMS = float64(int(elapsedMS*10+0.5)) / 10 writeJSON(w, http.StatusOK, searchResponse{ - Results: merged, - Total: len(merged), + Results: fileGroups, + Total: len(fileGroups), QueryTimeMS: elapsedMS, }) } } +// groupByFile takes the merged per-chunk results and lifts them into +// file-grouped output. Inside each file matches are sorted by StartLine +// ascending (natural reading order); files are sorted by BestScore +// descending (hottest hit drives the rank). +func groupByFile(items []searchResultItem) []fileGroupResult { + if len(items) == 0 { + return nil + } + indexByPath := map[string]int{} + var groups []fileGroupResult + for _, it := range items { + idx, ok := indexByPath[it.FilePath] + if !ok { + groups = append(groups, fileGroupResult{ + FilePath: it.FilePath, + Language: it.Language, + BestScore: it.Score, + }) + idx = len(groups) - 1 + indexByPath[it.FilePath] = idx + } + g := &groups[idx] + if it.Score > g.BestScore { + g.BestScore = it.Score + } + g.Matches = append(g.Matches, fileMatch{ + StartLine: it.StartLine, + EndLine: it.EndLine, + Content: it.Content, + Score: it.Score, + ChunkType: it.ChunkType, + SymbolName: it.SymbolName, + NestedHits: it.NestedHits, + }) + } + for i := range groups { + ms := groups[i].Matches + sort.SliceStable(ms, func(a, b int) bool { + return ms[a].StartLine < ms[b].StartLine + }) + } + sort.SliceStable(groups, func(i, j int) bool { + if groups[i].BestScore != groups[j].BestScore { + return groups[i].BestScore > groups[j].BestScore + } + // Tie-break by file path so output is stable in tests. 
+ return groups[i].FilePath < groups[j].FilePath + }) + return groups +} + // maxFanoutSearch is the language-count threshold above which we drop // per-language pre-filter and fall back to a single over-fetched query // with post-filter. Same value as the previous inline `maxFanout`. diff --git a/server/internal/httpapi/search_merge_test.go b/server/internal/httpapi/search_merge_test.go index 7f7da23..8634428 100644 --- a/server/internal/httpapi/search_merge_test.go +++ b/server/internal/httpapi/search_merge_test.go @@ -172,6 +172,80 @@ func TestMerge_TripleNesting(t *testing.T) { } } +// --- groupByFile ---------------------------------------------------------- + +// TestGroupByFile_SortsByBestScore: files are ordered by best chunk score +// descending. main.go's best chunk (0.80) outranks doc.md's best (0.60). +func TestGroupByFile_SortsByBestScore(t *testing.T) { + items := []searchResultItem{ + mkItem("doc.md", 1, 10, 0.60, "", "section"), + mkItem("main.go", 5, 20, 0.40, "Foo", "function"), + mkItem("main.go", 30, 50, 0.80, "Bar", "function"), + } + groups := groupByFile(items) + if len(groups) != 2 { + t.Fatalf("want 2 groups, got %d", len(groups)) + } + if groups[0].FilePath != "main.go" || groups[0].BestScore != 0.80 { + t.Errorf("first group should be main.go (0.80), got %+v", groups[0]) + } + if groups[1].FilePath != "doc.md" { + t.Errorf("second group should be doc.md, got %s", groups[1].FilePath) + } +} + +// TestGroupByFile_SortsMatchesByLineAsc: inside a file, matches read top- +// to-bottom — score order is for FILE ranking, not for matches inside. 
+func TestGroupByFile_SortsMatchesByLineAsc(t *testing.T) { + items := []searchResultItem{ + mkItem("a.go", 100, 120, 0.50, "later", "function"), + mkItem("a.go", 30, 50, 0.80, "earlier", "function"), + mkItem("a.go", 200, 220, 0.40, "latest", "function"), + } + groups := groupByFile(items) + if len(groups) != 1 { + t.Fatalf("want 1 group, got %d", len(groups)) + } + ms := groups[0].Matches + if len(ms) != 3 { + t.Fatalf("want 3 matches, got %d", len(ms)) + } + if ms[0].StartLine != 30 || ms[1].StartLine != 100 || ms[2].StartLine != 200 { + t.Errorf("matches not sorted by StartLine asc: %v %v %v", + ms[0].StartLine, ms[1].StartLine, ms[2].StartLine) + } + // Best score still tracks the max score across all matches. + if groups[0].BestScore != 0.80 { + t.Errorf("best score = %v, want 0.80", groups[0].BestScore) + } +} + +// TestGroupByFile_PreservesNestedHits: items that survived merge with +// nested_hits attached should carry them through into the file group. +func TestGroupByFile_PreservesNestedHits(t *testing.T) { + parent := mkItem("d.md", 1, 100, 0.50, "", "section") + parent.NestedHits = []nestedHit{ + {StartLine: 10, EndLine: 30, ChunkType: "section", Score: 0.60}, + } + groups := groupByFile([]searchResultItem{parent}) + if len(groups) != 1 || len(groups[0].Matches) != 1 { + t.Fatalf("unexpected shape: %+v", groups) + } + if len(groups[0].Matches[0].NestedHits) != 1 { + t.Errorf("nested hits dropped during groupByFile: %+v", groups[0].Matches[0]) + } +} + +// TestGroupByFile_Empty: nil/empty in → nil out. +func TestGroupByFile_Empty(t *testing.T) { + if groupByFile(nil) != nil { + t.Error("nil input should produce nil output") + } + if groupByFile([]searchResultItem{}) != nil { + t.Error("empty input should produce nil output") + } +} + // TestMerge_AdjacentNoSymbolNotMerged: two anonymous chunks adjacent in // the same file are NOT merged — we only merge adjacent chunks if at // least one carries a symbol (otherwise we have no signal that they're