diff --git a/.github/workflows/install-script.yml b/.github/workflows/install-script.yml index c15ea229..77b54fc7 100644 --- a/.github/workflows/install-script.yml +++ b/.github/workflows/install-script.yml @@ -1,8 +1,10 @@ # Smoke-test scripts/install.sh on Linux + macOS against the latest published # release. The script is the public install path served at get.gortex.dev, so -# any change must prove it still produces a working `gortex` binary on both -# OSes and both architectures GitHub-hosted runners cover (x64 always; arm64 -# darwin via the macos-14 runner — Linux arm64 is exercised in C5 release). +# any change must prove it still produces a working `gortex` binary. Coverage: +# Linux x64 (ubuntu-latest) and macOS arm64 (macos-14). Intel macOS is not +# tested here — GitHub retired its Intel (macos-13) runners, and install.sh is +# arch-agnostic (only the downloaded artifact differs). Linux arm64 is +# exercised in the release flow. name: install-script on: @@ -35,7 +37,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest, macos-13, macos-14] + os: [ubuntu-latest, macos-14] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a6b01497..fc8ac207 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -98,7 +98,6 @@ jobs: -w /go/src/gortex \ -e GITHUB_TOKEN \ -e HOMEBREW_TAP_TOKEN \ - -e SCOOP_BUCKET_TOKEN \ ghcr.io/goreleaser/goreleaser-cross:v1.26 \ release --clean env: @@ -106,10 +105,9 @@ jobs: # Personal access token with `repo` scope on zzet/homebrew-tap. # GITHUB_TOKEN can only push to the source repo, not the tap. HOMEBREW_TAP_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN }} - # Same story for the Scoop bucket — a PAT with `repo` scope on - # gortexhq/scoop-bucket. Required for the `scoops` block in - # .goreleaser.yml to push the Windows manifest. 
- SCOOP_BUCKET_TOKEN: ${{ secrets.SCOOP_BUCKET_TOKEN }} + # (SCOOP_BUCKET_TOKEN is consumed by the release-windows job's + # "Publish Scoop manifest" step, not here — goreleaser builds no + # windows artifact, so it has no scoop manifest to push.) # goreleaser-cross runs as root inside the container, so everything # in dist/ is owned by root:root on the host. The subsequent cosign @@ -208,6 +206,164 @@ jobs: rm -rf /tmp/macos-signing fi + # Windows is built on a NATIVE windows runner: the CGo tree-sitter + # bindings need a real C/C++ toolchain (mingw-w64 ships on PATH there), + # and goreleaser-cross targets unix only. This job builds a statically + # linked, self-contained .exe (no runtime DLLs to ship), zips it, + # cosign-signs, and appends the zip to the release the `release` job + # already created. + release-windows: + needs: release + runs-on: windows-latest + permissions: + contents: write + id-token: write + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + + - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6 + with: + go-version: '1.26' + + - uses: sigstore/cosign-installer@6f9f17788090df1f26f669e9d70d6ae9567deba6 # v4.1.2 + with: + cosign-release: v2.4.1 + + - name: Build gortex.exe (static mingw runtime) + shell: bash + env: + CGO_ENABLED: "1" + run: | + set -euo pipefail + VER="${GITHUB_REF#refs/tags/}"; VER="${VER#v}" # drop the leading v, matching the unix goreleaser builds + # -extldflags=-static folds the mingw C/C++ runtime (libstdc++, + # libgcc, libwinpthread) into the .exe so it ships as a single + # self-contained binary — nothing to bundle alongside. The C++ + # stdlib is in the link at all because some tree-sitter grammars + # carry C++ external scanners (e.g. go-sitter-forest norg); static + # linking just puts it inside the .exe instead of a DLL. 
+ go build \ + -ldflags "-s -w -X main.version=${VER} -X main.commit=$(git rev-parse --short HEAD) -X main.date=$(date -u +%Y-%m-%dT%H:%M:%SZ) -extldflags=-static" \ + -o gortex.exe ./cmd/gortex/ + + - name: Verify gortex.exe is self-contained + shell: bash + run: | + set -euo pipefail + # The static link must leave no dependency on a mingw runtime DLL; + # a partially static .exe would fail to start where that DLL is + # absent. If objdump is available, fail the release on any leaked + # mingw runtime import. + objdump="" + for cand in objdump x86_64-w64-mingw32-objdump; do + command -v "$cand" >/dev/null 2>&1 && { objdump="$cand"; break; } + done + if [ -n "$objdump" ]; then + echo "imported DLLs:"; "$objdump" -p gortex.exe | grep -i 'DLL Name' || true + if "$objdump" -p gortex.exe | grep -iqE 'libstdc\+\+|libgcc_s|libwinpthread'; then + echo "FATAL: gortex.exe still imports a mingw runtime DLL — static link incomplete" + exit 1 + fi + echo "ok: no mingw runtime DLL imports" + else + echo "WARN: objdump not found; skipping self-containment check" + fi + + - name: Zip (gortex_windows_amd64.zip) + shell: pwsh + run: Compress-Archive -Path gortex.exe -DestinationPath gortex_windows_amd64.zip -Force + + - name: Sign + upload to release + shell: bash + env: + COSIGN_YES: "true" + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -euo pipefail + cosign sign-blob \ + --output-signature gortex_windows_amd64.zip.sig \ + --output-certificate gortex_windows_amd64.zip.pem \ + gortex_windows_amd64.zip + TAG="${GITHUB_REF#refs/tags/}" + gh release upload "$TAG" \ + gortex_windows_amd64.zip \ + gortex_windows_amd64.zip.sig \ + gortex_windows_amd64.zip.pem \ + --clobber + + # Append the windows zip's sha256 to the release checksums.txt so + # the one-line installer (scripts/install.ps1, which verifies + # against checksums.txt) covers windows too — the unix goreleaser + # run only hashed its own artifacts. needs:release guarantees + # checksums.txt already exists. 
+ sha="$(sha256sum gortex_windows_amd64.zip | awk '{print $1}')" + gh release download "$TAG" --pattern checksums.txt --clobber + { grep -v 'gortex_windows_amd64\.zip$' checksums.txt || true; } > checksums.new + printf '%s gortex_windows_amd64.zip\n' "$sha" >> checksums.new + mv -f checksums.new checksums.txt + gh release upload "$TAG" checksums.txt --clobber + + - name: Publish Scoop manifest + # Push a refreshed `gortex` manifest to gortexhq/scoop-bucket so + # `scoop install gortex` resolves this release. SCOOP_BUCKET_TOKEN is + # a PAT with `repo` scope on that bucket (GITHUB_TOKEN can only push + # to the source repo). Non-blocking + self-skipping: a bucket hiccup + # must not fail a release whose binary already shipped, and a + # token-less fork just skips it. + continue-on-error: true + shell: bash + env: + SCOOP_BUCKET_TOKEN: ${{ secrets.SCOOP_BUCKET_TOKEN }} + run: | + set -euo pipefail + if [ -z "${SCOOP_BUCKET_TOKEN:-}" ]; then + echo "SCOOP_BUCKET_TOKEN not set; skipping scoop manifest publish" + exit 0 + fi + TAG="${GITHUB_REF#refs/tags/}" + VER="${TAG#v}" + URL="https://github.com/${GITHUB_REPOSITORY}/releases/download/${TAG}/gortex_windows_amd64.zip" + SHA="$(sha256sum gortex_windows_amd64.zip | awk '{print $1}')" + + # Build the manifest with jq so escaping + validity are guaranteed. + # `bin` shims gortex.exe; checkver/autoupdate let scoop's tooling + # track future releases (the $version token is literal on purpose). 
+ jq -n \ + --arg version "$VER" \ + --arg url "$URL" \ + --arg hash "$SHA" \ + --arg homepage "https://github.com/${GITHUB_REPOSITORY}" \ + --arg autourl "https://github.com/${GITHUB_REPOSITORY}/releases/download/v\$version/gortex_windows_amd64.zip" \ + '{ + version: $version, + description: "Code intelligence engine that indexes repositories into an in-memory knowledge graph.", + homepage: $homepage, + license: "Apache-2.0", + architecture: { "64bit": { url: $url, hash: $hash } }, + bin: "gortex.exe", + checkver: "github", + autoupdate: { architecture: { "64bit": { url: $autourl } } } + }' > gortex.json + + # Token in the clone URL — GitHub Actions masks the secret in logs. + git clone "https://x-access-token:${SCOOP_BUCKET_TOKEN}@github.com/gortexhq/scoop-bucket.git" scoop-bucket + cd scoop-bucket + # Honour the bucket's layout: scoop reads manifests from the repo + # root or a bucket/ subdir. Update in place if one exists, else use + # the conventional bucket/ subdir. + if [ -f gortex.json ]; then dest="gortex.json"; else mkdir -p bucket; dest="bucket/gortex.json"; fi + cp ../gortex.json "$dest" + git add "$dest" + if git diff --cached --quiet; then + echo "scoop manifest already current for ${VER}" + else + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + git commit -m "gortex: ${VER}" + git push + echo "published scoop manifest ${VER} -> $dest" + fi + # SLSA-3 provenance via the OpenSSF reusable workflow. This runs in a # separate, isolated job that the `release` job can't tamper with — # that isolation is what elevates us from SLSA-2 to SLSA-3. Output is diff --git a/.gitignore b/.gitignore index 293c1886..52c71f33 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,14 @@ eval/scripts/ eval/logs/ internal_docs/ + +# Ad-hoc bench/probe tooling — kept locally, not part of the repo. 
+bench/all-tools-bench/ +bench/daemon-bench/ +bench/edge-diff/ +bench/multi-repo-bench/ +bench/node-diff/ +bench/store-bench/ +bench/unresolved-audit/ +bench/run-linux.sh +bench/run-linux-rest.sh diff --git a/.goreleaser.yml b/.goreleaser.yml index 787e87fc..1b1a4eb4 100644 --- a/.goreleaser.yml +++ b/.goreleaser.yml @@ -1,8 +1,11 @@ version: 2 # Run inside ghcr.io/goreleaser/goreleaser-cross — the Docker image ships -# cross-compile toolchains for all four targets below so CGO (tree-sitter) -# links cleanly on a single Linux runner. See .github/workflows/release.yml. +# cross-compile toolchains so CGO (tree-sitter) links cleanly on a single +# Linux runner. This config builds the UNIX targets only (linux + darwin). +# Windows is built separately on a native windows runner (see the +# `release-windows` job in release.yml) because the CGo tree-sitter +# bindings need a real C/C++ toolchain there. before: hooks: - go mod tidy @@ -19,30 +22,14 @@ builds: # Version (see internal/version). Commit lands in the +build slot so # `gortex version` output round-trips as canonical semver. - -s -w -X main.version={{.Version}} -X main.commit={{.ShortCommit}} -X main.date={{.Date}} - # Statically link the mingw-w64 C/C++ runtime (libstdc++, libgcc, - # winpthread) into the Windows binary. CGO is on for tree-sitter and - # some grammar scanners ship C++; without -static the released - # gortex.exe dynamically links libstdc++-6.dll et al., which are not - # present on a stock Windows box — the binary fails to start with a - # missing-DLL error. No-op on linux/darwin, which keep their normal - # dynamic libc/libc++. - - '{{ if eq .Os "windows" }}-extldflags "-static"{{ end }}' env: - CGO_ENABLED=1 goos: - linux - darwin - - windows goarch: - amd64 - arm64 - ignore: - # windows/arm64 needs an aarch64-w64-mingw32 cross-toolchain that - # the goreleaser-cross image doesn't ship; windows/amd64 covers - # every mainstream Windows dev box. 
Revisit when the image gains - # the llvm-mingw arm64 target. - - goos: windows - goarch: arm64 # Per-target CC + CXX. goreleaser-cross exposes these cross-toolchains # on PATH; CGO needs both set per target triple because some deps # (tree-sitter yaml scanner, etc.) ship C++. Without CXX, the system @@ -69,11 +56,6 @@ builds: env: - CC=aarch64-linux-gnu-gcc - CXX=aarch64-linux-gnu-g++ - - goos: windows - goarch: amd64 - env: - - CC=x86_64-w64-mingw32-gcc - - CXX=x86_64-w64-mingw32-g++ # Per-target build hook. Fires after each Mach-O / ELF is linked, # before the archive step. The script is a no-op for non-darwin # targets, so we don't need a per-override hook list. @@ -151,20 +133,8 @@ homebrew_casks: executable: gortex shell_parameter_format: cobra -# Scoop manifest — `scoop install gortex` on Windows. goreleaser commits -# the generated manifest (pointing at the signed windows/amd64 .zip in -# this release) to a separate bucket repo on every tagged release, -# exactly like the Homebrew cask above. -scoops: - - name: gortex - repository: - owner: gortexhq - name: scoop-bucket - # GITHUB_TOKEN can only push to the source repo, so the bucket - # needs its own PAT with `repo` scope on gortexhq/scoop-bucket, - # stored as SCOOP_BUCKET_TOKEN in repo secrets. release.yml wires - # it in. - token: "{{ .Env.SCOOP_BUCKET_TOKEN }}" - homepage: "https://github.com/zzet/gortex" - description: "Code intelligence engine that indexes repositories into an in-memory knowledge graph." - license: "Custom" +# NOTE: goreleaser does NOT generate the Scoop manifest — windows is built by +# the separate `release-windows` job (native runner) and isn't an artifact of +# this goreleaser-cross run, so goreleaser has no windows zip to point at. That +# job publishes the manifest to gortexhq/scoop-bucket itself (see its "Publish +# Scoop manifest" step in release.yml). 
diff --git a/bench/perf/main.go b/bench/perf/main.go index 75c4a3e0..49b35654 100644 --- a/bench/perf/main.go +++ b/bench/perf/main.go @@ -32,7 +32,7 @@ var benchCacheDir string func main() { repos := flag.String("repos", "gin,nestjs,react", "comma-separated repo set. Forms: preset slug (gin/nestjs/react/linux), owner/repo, https URL, or local:/path") includeLinux := flag.Bool("include-linux", false, "include the linux kernel preset (multi-GB clone; skipped by default)") - cacheDir := flag.String("cache-dir", "", "cache directory for clones (default ~/.cache/gortex/bench)") + cacheDir := flag.String("cache-dir", "", "cache directory for clones (default ~/.gortex/cache/bench)") queriesPath := flag.String("queries", "bench/perf/queries.json", "JSON file with the search-bench query set") out := flag.String("out", "", "output table path (default stdout)") format := flag.String("format", "markdown", "markdown | csv | json") diff --git a/cmd/gortex/backend.go b/cmd/gortex/backend.go new file mode 100644 index 00000000..a26e5531 --- /dev/null +++ b/cmd/gortex/backend.go @@ -0,0 +1,73 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "go.uber.org/zap" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/platform" +) + +// openBackend constructs the graph.Store the daemon will run +// against. Picks the implementation by the --backend flag: +// +// - "memory" (default) — in-process *graph.Graph; nothing +// persists across runs; matches every existing test fixture. +// +// Returns the store, a cleanup func the caller must defer (closes +// the underlying handle on disk-backed stores), and any error +// constructing or opening the store. +// +// The actual per-backend Open* helpers live in their own +// build-tagged files (backend_memory.go is always built; the +// disk-backed ones are gated by build tags). This file is the +// shared dispatch. 
+func openBackend(name, path string, bufferPoolMB uint64, logger *zap.Logger) (graph.Store, func(), error) { + switch strings.ToLower(strings.TrimSpace(name)) { + case "", "memory", "mem", "in-memory": + s := graph.New() + return s, func() {}, nil + case "sqlite", "sqlite3": + resolved, err := resolveBackendPath(path, "store.sqlite") + if err != nil { + return nil, nil, err + } + logger.Info("opening sqlite backend", zap.String("path", resolved)) + return openSqliteBackend(resolved, bufferPoolMB) + default: + return nil, nil, fmt.Errorf("unknown --backend %q (expected: memory, sqlite)", name) + } +} + +// resolveBackendPath turns an empty --backend-path into a default +// under the unified store directory (~/.gortex/store/, or the +// XDG_DATA_HOME equivalent). Otherwise expands ~ and returns the +// absolute path. Creates the parent directory if missing — the +// disk-backed stores expect the parent dir to exist. +func resolveBackendPath(in, filename string) (string, error) { + in = strings.TrimSpace(in) + if in == "" { + in = filepath.Join(platform.StoreDir(), filename) + } else if strings.HasPrefix(in, "~/") { + home, err := os.UserHomeDir() + if err != nil { + return "", fmt.Errorf("resolve home dir: %w", err) + } + in = filepath.Join(home, in[2:]) + } + abs, err := filepath.Abs(in) + if err != nil { + return "", fmt.Errorf("abs path %q: %w", in, err) + } + // The on-disk store opens the leaf path (file or directory). We + // MkdirAll the parent so the path is reachable; the store itself + // creates the leaf. 
+ if err := os.MkdirAll(filepath.Dir(abs), 0o755); err != nil { + return "", fmt.Errorf("mkdir parent %q: %w", filepath.Dir(abs), err) + } + return abs, nil +} diff --git a/cmd/gortex/backend_sqlite.go b/cmd/gortex/backend_sqlite.go new file mode 100644 index 00000000..9149705e --- /dev/null +++ b/cmd/gortex/backend_sqlite.go @@ -0,0 +1,30 @@ +package main + +import ( + "fmt" + + "github.com/zzet/gortex/internal/daemon" + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/graph/store_sqlite" +) + +// openSqliteBackend opens (or creates) the SQLite store at path. It uses +// the pure-Go modernc.org/sqlite driver, so this backend keeps the binary +// CGo-free while still getting a real query planner that drives the graph's +// secondary indexes. Returns a cleanup func that closes the handle. +// +// bufferPoolMB is accepted for signature parity with the other on-disk +// backends but is unused — SQLite sizes its page cache via the cache_size +// pragma set in store_sqlite.Open, not a single fixed pool. +func openSqliteBackend(path string, bufferPoolMB uint64) (graph.Store, func(), error) { + _ = bufferPoolMB + s, err := store_sqlite.Open(path) + if err != nil { + hint := "if another gortex daemon or server is using this store, stop it first (`gortex daemon status` / `gortex daemon stop`)" + if pid, ok := daemon.RunningPID(); ok { + hint = fmt.Sprintf("a gortex daemon is already running (pid %d) — stop it with `gortex daemon stop`, or use `gortex daemon restart`", pid) + } + return nil, nil, fmt.Errorf("open sqlite store at %q: %w (%s)", path, err, hint) + } + return s, func() { _ = s.Close() }, nil +} diff --git a/cmd/gortex/bench.go b/cmd/gortex/bench.go index 883f4a49..7fd96f1a 100644 --- a/cmd/gortex/bench.go +++ b/cmd/gortex/bench.go @@ -338,7 +338,7 @@ claim as a budget gate; --strict turns gate violations into a non-zero exit so CI catches regressions. 
Default behavior: - - Clones each repo to ~/.cache/gortex/bench// on first run + - Clones each repo to ~/.gortex/cache/bench// on first run - Reuses the clone on subsequent runs (rm -rf to refresh) - Honors --out-dir (artifacts land at /perf.{md,json,csv}) diff --git a/cmd/gortex/cli_progress.go b/cmd/gortex/cli_progress.go index 67294bc6..e09a886f 100644 --- a/cmd/gortex/cli_progress.go +++ b/cmd/gortex/cli_progress.go @@ -1,13 +1,9 @@ package main import ( - "context" - "fmt" - "github.com/spf13/cobra" "go.uber.org/zap" - "github.com/zzet/gortex/internal/indexer" "github.com/zzet/gortex/internal/progress" ) @@ -36,21 +32,3 @@ func loggerForSpinner(cmd *cobra.Command, real *zap.Logger) *zap.Logger { } return zap.NewNop() } - -// indexWithSpinner runs the indexer with a progress spinner attached, reporting -// stage transitions as the sub-status. Used by every enrich subcommand that -// needs an in-memory graph before running its enrichment pass. -func indexWithSpinner(cmd *cobra.Command, idx *indexer.Indexer, path string) error { - sp := newCLISpinner(cmd, "Indexing repository") - sp.Set("", path) - ctx := progress.WithReporter(context.Background(), sp) - result, err := idx.IndexCtx(ctx, path) - if err != nil { - sp.Fail(err) - return fmt.Errorf("index %s: %w", path, err) - } - sp.Set("", fmt.Sprintf("%s files · %s nodes · %s edges · %dms", - humanizeInt(result.FileCount), humanizeInt(result.NodeCount), humanizeInt(result.EdgeCount), result.DurationMs)) - sp.Done() - return nil -} diff --git a/cmd/gortex/config_cmd.go b/cmd/gortex/config_cmd.go index b3ba9f08..d7f635be 100644 --- a/cmd/gortex/config_cmd.go +++ b/cmd/gortex/config_cmd.go @@ -35,7 +35,7 @@ indexing and watching. Patterns follow .gitignore semantics. Targets (in precedence order; later layers override earlier): 1. Builtin baseline (read-only) - 2. Global - ~/.config/gortex/config.yaml (--global) + 2. Global - ~/.gortex/config.yaml (--global) 3. Repo - GlobalConfig.repos[].exclude (--repo ) 4. 
Workspace - ./.gortex.yaml at the repo root (default) @@ -76,7 +76,7 @@ func init() { for _, c := range []*cobra.Command{configExcludeAddCmd, configExcludeRemoveCmd} { c.Flags().BoolVar(&excludeGlobalFlag, "global", false, - "write to ~/.config/gortex/config.yaml (GlobalConfig.exclude)") + "write to ~/.gortex/config.yaml (GlobalConfig.exclude)") c.Flags().StringVar(&excludeRepoFlag, "repo", "", "write to the named RepoEntry in the global config") } diff --git a/cmd/gortex/daemon.go b/cmd/gortex/daemon.go index c04b469a..76901abe 100644 --- a/cmd/gortex/daemon.go +++ b/cmd/gortex/daemon.go @@ -34,11 +34,14 @@ var ( // (the function has no *cobra.Command of its own) to decide whether // the flag overrides the `embedding:` config block. Set once in // runDaemonStart before buildDaemonState runs. - daemonEmbeddingsChanged bool - daemonStatusWatch bool - daemonStatusInterval time.Duration - daemonHTTPAddr string - daemonHTTPAuthToken string + daemonEmbeddingsChanged bool + daemonStatusWatch bool + daemonStatusInterval time.Duration + daemonHTTPAddr string + daemonHTTPAuthToken string + daemonBackend string + daemonBackendPath string + daemonBackendBufferPoolMB uint64 ) var daemonCmd = &cobra.Command{ @@ -97,6 +100,12 @@ func init() { "also expose the MCP 2026 Streamable HTTP transport on this TCP address (e.g. 
127.0.0.1:7411); empty disables") daemonStartCmd.Flags().StringVar(&daemonHTTPAuthToken, "http-auth-token", "", "bearer token required on every Streamable HTTP request (default: read $GORTEX_DAEMON_HTTP_TOKEN; empty allows unauthenticated localhost binds)") + daemonStartCmd.Flags().StringVar(&daemonBackend, "backend", "sqlite", + "storage backend: sqlite (default — pure-Go embedded SQL, persists to --backend-path so warm restarts skip re-indexing) | memory (in-process, no persistence — fastest per-op but pays the full cold-warmup cost on every restart)") + daemonStartCmd.Flags().StringVar(&daemonBackendPath, "backend-path", "", + "path where the on-disk backend persists its store (ignored by --backend=memory). Default: ~/.gortex/store/store.sqlite") + daemonStartCmd.Flags().Uint64Var(&daemonBackendBufferPoolMB, "backend-buffer-pool-mb", 0, + "advisory page-cache cap (MiB) for on-disk backends. 0 reads $GORTEX_DAEMON_BUFFER_POOL_MB or lets the backend choose its own default; backends that manage their own cache (e.g. sqlite) ignore it") daemonLogsCmd.Flags().IntVarP(&daemonTail, "tail", "n", 50, "show only the last N log lines") daemonStatusCmd.Flags().BoolVarP(&daemonStatusWatch, "watch", "w", false, @@ -121,6 +130,17 @@ func runDaemonStart(cmd *cobra.Command, _ []string) error { if daemon.IsRunning() { return fmt.Errorf("daemon already running (socket: %s)", daemon.SocketPath()) } + // IsRunning only probes the socket. A daemon that is mid-shutdown — or + // one whose socket wedged — still owns the PID file and, crucially, still + // holds the store's on-disk lock. Starting over the top of it makes the + // backend open fail with an opaque "failed to open database" lock + // conflict, so refuse early with the PID and an actionable next step. The + // detached child reaches here too, but it hasn't written its own PID file + // yet (that happens in the serve loop), so this can't false-positive on + // the daemon we're in the middle of starting. 
+ if pid, ok := daemon.RunningPID(); ok { + return fmt.Errorf("daemon already running (pid %d) — stop it with `gortex daemon stop`, or use `gortex daemon restart`", pid) + } if daemonDetach && os.Getenv("GORTEX_DAEMON_CHILD") != "1" { return spawnDetachedDaemon() } @@ -174,7 +194,18 @@ func runDaemonStart(cmd *cobra.Command, _ []string) error { if mw != nil { _ = mw.Stop() } - saveSnapshot(state.graph, collectSnapshotRepos(state.multiIndexer), collectSnapshotContracts(state.multiIndexer), collectSnapshotVector(state.multiIndexer), version, logger) + if mg, ok := state.graph.(*graph.Graph); ok { + // Memory backend — snapshot the full in-memory graph; + // the next warmup replays nodes/edges from the gob+gzip + // dump because there's no other persistence layer. + saveSnapshot(mg, collectSnapshotRepos(state.multiIndexer), collectSnapshotContracts(state.multiIndexer), collectSnapshotVector(state.multiIndexer), version, logger) + } + // Persistent backends (sqlite) no longer write a metadata + // snapshot: per-file mtimes live in the FileMtime sidecar + // table, contract records ride on KindContract.Meta, and the + // vector index is persisted by the backend itself. Warm + // restart reads everything it needs from the on-disk store — + // no gob+gzip round-trip required. if state.mcpServer != nil { _ = state.mcpServer.FlushSavings() } @@ -309,7 +340,15 @@ func runDaemonStart(cmd *cobra.Command, _ []string) error { // the GC then has to clean up. Skipping snapshots until ready cleared // a stall observed in profile #5 where saveSnapshotTo was the only // runnable goroutine on a daemon mid-warmup. - stopSnapshotter := startPeriodicSnapshots(state.graph, state.multiIndexer, version, 10*time.Minute, controller.IsReady, logger) + // Periodic snapshots fire only for the memory backend — that's + // the path that has no other persistence layer for the graph + // itself. 
Persistent backends (sqlite) rely on the backend's own + // durability (graph + FileMtimes + contracts + vectors all live + // on disk) so the gob+gzip snapshot is dead weight in that mode. + stopSnapshotter := func() {} + if mg, ok := state.graph.(*graph.Graph); ok { + stopSnapshotter = startPeriodicSnapshots(mg, state.multiIndexer, version, 10*time.Minute, controller.IsReady, logger) + } defer stopSnapshotter() // Periodic savings flush — 5 minute interval. Bounds on-crash data @@ -364,6 +403,16 @@ func runDaemonStart(cmd *cobra.Command, _ []string) error { // first" against a fully populated state. if state.mcpServer != nil { state.mcpServer.RunAnalysis() + // Co-change pre-warm: fire the git-history mine in the + // background so the first user-visible + // find_co_changing_symbols / search-rerank call sees a + // populated cache. On a persistent backend the mine is + // dominated by the AllNodes + per-pair AddEdge disk-persist + // step that mineCoChange already defers into its own + // goroutine — but even the git log itself can take 10–30s + // on a large history, and we want that off every request + // path. + state.mcpServer.PrewarmCoChange() } elapsed := time.Since(start) controller.MarkReady(elapsed) @@ -609,6 +658,13 @@ func emitDaemonStartSummary(w io.Writer, pid int, elapsed time.Duration) { func runDaemonStop(cmd *cobra.Command, _ []string) error { w := cmd.ErrOrStderr() if !daemon.IsRunning() { + // The socket is gone, but a process may still be alive and holding + // the store lock — a daemon mid-shutdown, or one whose socket wedged. + // killByPID terminates it AND blocks until it has actually exited, + // which is what `daemon restart` relies on to not race the lock. + if _, ok := daemon.RunningPID(); ok { + return killByPID() + } emitDaemonStopAlreadyDown(w) return nil } @@ -617,6 +673,13 @@ func runDaemonStop(cmd *cobra.Command, _ []string) error { // post-stop summary (the socket file vanishes on clean shutdown). 
socket := daemon.SocketPath() uptime := daemonUptimeBeforeStop() + // Capture the PID too. ControlShutdown only *acks* — the daemon then + // flushes and closes the store (releasing its on-disk lock) and exits + // asynchronously (see server.go: the handler Shutdown()s ~100ms later in + // a goroutine). We must block until that process is gone, or a following + // `daemon start` races the still-held lock and dies with the opaque + // "failed to open database with status 1". + pid, havePID := daemon.RunningPID() c, err := daemon.Dial(daemon.Handshake{Mode: daemon.ModeControl, ClientName: "cli"}) if err != nil { @@ -632,10 +695,39 @@ func runDaemonStop(cmd *cobra.Command, _ []string) error { if !resp.OK { return fmt.Errorf("shutdown rejected: %s %s", resp.ErrorCode, resp.ErrorMsg) } + if havePID { + waitForDaemonExit(pid) + } emitDaemonStopSummary(w, socket, uptime) return nil } +// waitForDaemonExit blocks until the daemon process pid has exited — and thus +// released the store's on-disk lock — force-killing it if a graceful shutdown +// stalls. This is what makes `daemon stop` honest: when it returns, the store +// is free for the next process, which is the foundation `daemon restart` +// stands on. Polls cheaply; the common case (a clean flush) clears in well +// under a second. +func waitForDaemonExit(pid int) { + deadline := time.Now().Add(15 * time.Second) + for time.Now().Before(deadline) { + if !platform.ProcessAlive(pid) { + return + } + time.Sleep(50 * time.Millisecond) + } + // Graceful shutdown stalled (e.g. a wedged cgo call). Don't leave a + // half-exited daemon clutching the lock — force it, then clean up the + // socket/PID so the next start isn't tripped by stale files. 
+ fmt.Fprintln(os.Stderr, "[gortex daemon] graceful shutdown timed out — force-killing") + _ = platform.KillProcess(pid) + for i := 0; i < 60 && platform.ProcessAlive(pid); i++ { + time.Sleep(50 * time.Millisecond) + } + _ = os.Remove(daemon.PIDFilePath()) + _ = os.Remove(daemon.SocketPath()) +} + // daemonUptimeBeforeStop best-effort-fetches the daemon's reported uptime via // a Status control before shutdown so the summary card can show how long the // process ran. Returns 0 on any error — we'd rather degrade the card than @@ -709,15 +801,17 @@ func runDaemonRestart(cmd *cobra.Command, args []string) error { emitDaemonRestartBanner(cmd.ErrOrStderr()) - // Stop is idempotent when not running. + // Stop is idempotent when not running and now blocks until the old + // process has fully exited — releasing the store's on-disk lock — before + // returning. That's what lets the start below reuse the store without + // racing the lock. The old code polled `daemon.IsRunning()` here, which + // watched the wrong resource: the socket is torn down ~100ms after the + // shutdown ack, long before the process exits and the lock clears, so the + // poll fell through early and the restart died on "failed to open + // database with status 1". if err := runDaemonStop(cmd, args); err != nil { return err } - // Give the OS a moment to release the socket file. 
- deadline := time.Now().Add(3 * time.Second) - for time.Now().Before(deadline) && daemon.IsRunning() { - time.Sleep(50 * time.Millisecond) - } daemonDetach = true return runDaemonStart(cmd, args) } @@ -812,8 +906,10 @@ func renderDaemonHeader(w io.Writer, st daemon.StatusResponse) { t.AppendRow(table.Row{"socket", st.SocketPath}) t.AppendRow(table.Row{"uptime", formatDuration(time.Duration(st.UptimeSeconds) * time.Second)}) if st.Ready { - t.AppendRow(table.Row{"state", - fmt.Sprintf("ready (warmup %s)", formatDuration(time.Duration(st.WarmupSeconds)*time.Second))}) + t.AppendRow(table.Row{ + "state", + fmt.Sprintf("ready (warmup %s)", formatDuration(time.Duration(st.WarmupSeconds)*time.Second)), + }) } else { t.AppendRow(table.Row{"state", "warming up (socket reachable, background re-index in progress)"}) } @@ -1131,6 +1227,21 @@ func daemonControlClient() (*daemon.Client, error) { return c, nil } +// resolveDaemonBufferPoolMB returns the effective buffer-pool cap. +// Precedence: --backend-buffer-pool-mb flag > GORTEX_DAEMON_BUFFER_POOL_MB env > 0 +// (which Open then maps to DefaultBufferPoolMB inside the store). +func resolveDaemonBufferPoolMB() uint64 { + if daemonBackendBufferPoolMB != 0 { + return daemonBackendBufferPoolMB + } + if env := strings.TrimSpace(os.Getenv("GORTEX_DAEMON_BUFFER_POOL_MB")); env != "" { + if v, err := strconv.ParseUint(env, 10, 64); err == nil { + return v + } + } + return 0 +} + // killByPID is the fallback stop path for stale daemons that have a PID // file but don't respond on the socket. Asks the process to terminate, // waits, then force-kills. 
Silently returns nil if the PID no longer diff --git a/cmd/gortex/daemon_controller.go b/cmd/gortex/daemon_controller.go index 630f0e94..9b421c82 100644 --- a/cmd/gortex/daemon_controller.go +++ b/cmd/gortex/daemon_controller.go @@ -14,10 +14,15 @@ import ( "go.uber.org/zap" + "github.com/zzet/gortex/internal/blame" + "github.com/zzet/gortex/internal/churn" + "github.com/zzet/gortex/internal/cochange" "github.com/zzet/gortex/internal/config" + "github.com/zzet/gortex/internal/coverage" "github.com/zzet/gortex/internal/daemon" "github.com/zzet/gortex/internal/graph" "github.com/zzet/gortex/internal/indexer" + "github.com/zzet/gortex/internal/releases" "github.com/zzet/gortex/internal/search" "github.com/zzet/gortex/internal/semantic/lsp" ) @@ -31,7 +36,7 @@ import ( // otherwise. The mutex is coarse; finer locking is a later optimization. type realController struct { mu sync.Mutex - graph *graph.Graph + graph graph.Store indexer *indexer.Indexer multiIndexer *indexer.MultiIndexer configManager *config.ConfigManager @@ -76,7 +81,7 @@ func (c *realController) Track(ctx context.Context, p daemon.TrackParams) (json. // Project association from TrackParams.Project isn't wired yet — the // config package doesn't expose an AddRepoToProject helper. Callers - // who need project scoping can edit ~/.config/gortex/config.yaml and + // who need project scoping can edit ~/.gortex/config.yaml and // run `gortex daemon reload`; track from the daemon-v1 surface just // adds to the top-level repo list. @@ -112,6 +117,244 @@ func (c *realController) Track(ctx context.Context, p daemon.TrackParams) (json. }) } +// EnrichChurn runs the churn enricher in-process against the daemon's +// graph. We hold c.mu for the duration so a concurrent Track/Untrack +// can't reshape the set of files while the enricher walks them. 
The +// caller (CLI / git hook) picks the params; an empty Path means "every +// tracked repo", an empty Branch means "resolve each repo's default +// branch from its working tree". +func (c *realController) EnrichChurn(ctx context.Context, p daemon.EnrichChurnParams) (daemon.EnrichChurnResult, error) { + c.mu.Lock() + defer c.mu.Unlock() + + if c.graph == nil { + return daemon.EnrichChurnResult{}, fmt.Errorf("graph not initialized") + } + if c.multiIndexer == nil { + return daemon.EnrichChurnResult{}, fmt.Errorf("multi-repo indexer not initialized") + } + + // Resolve the set of repo roots the call targets. Empty Path = + // every tracked repo. A path or prefix narrows to one. + type target struct { + prefix string + root string + } + var targets []target + want := strings.TrimSpace(p.Path) + for prefix, meta := range c.multiIndexer.AllMetadata() { + if want != "" && want != prefix && want != meta.RootPath { + continue + } + targets = append(targets, target{prefix: prefix, root: meta.RootPath}) + } + if len(targets) == 0 { + return daemon.EnrichChurnResult{}, fmt.Errorf("no tracked repo matches %q", p.Path) + } + + started := time.Now() + var combined daemon.EnrichChurnResult + for _, t := range targets { + branch := strings.TrimSpace(p.Branch) + if branch == "" { + branch = gitDefaultBranch(t.root) + } + if branch == "" { + c.logger.Warn("enrich churn: no default branch resolved", + zap.String("prefix", t.prefix), zap.String("root", t.root)) + continue + } + res, err := churn.EnrichGraph(ctx, c.graph, t.root, churn.Options{Branch: branch}) + if err != nil { + return daemon.EnrichChurnResult{}, fmt.Errorf("enrich %s: %w", t.prefix, err) + } + combined.Files += res.Files + combined.Symbols += res.Symbols + combined.Branch = res.Branch + combined.HeadSHA = res.HeadSHA + } + combined.DurationMS = time.Since(started).Milliseconds() + return combined, nil +} + +// EnrichReleases runs the per-file release enricher against the +// daemon's graph. 
Mirrors EnrichChurn — c.mu is held for the duration, +// targets resolve via the multi-indexer, and an empty Branch lets +// each repo's default branch be resolved on demand (so feature-branch +// tags don't leak into the timeline). +func (c *realController) EnrichReleases(ctx context.Context, p daemon.EnrichReleasesParams) (daemon.EnrichReleasesResult, error) { + c.mu.Lock() + defer c.mu.Unlock() + + if c.graph == nil { + return daemon.EnrichReleasesResult{}, fmt.Errorf("graph not initialized") + } + if c.multiIndexer == nil { + return daemon.EnrichReleasesResult{}, fmt.Errorf("multi-repo indexer not initialized") + } + + type target struct { + prefix string + root string + } + var targets []target + want := strings.TrimSpace(p.Path) + for prefix, meta := range c.multiIndexer.AllMetadata() { + if want != "" && want != prefix && want != meta.RootPath { + continue + } + targets = append(targets, target{prefix: prefix, root: meta.RootPath}) + } + if len(targets) == 0 { + return daemon.EnrichReleasesResult{}, fmt.Errorf("no tracked repo matches %q", p.Path) + } + _ = ctx // graph mutation is synchronous; no cancellation surface today + + started := time.Now() + var combined daemon.EnrichReleasesResult + for _, t := range targets { + branch := strings.TrimSpace(p.Branch) + if branch == "" { + branch = gitDefaultBranch(t.root) + // Empty branch is still legal — releases.EnrichGraphForBranch + // treats "" as "every tag", which is the right default when + // no default branch can be resolved (e.g. a clone without + // origin/HEAD set yet). + } + count, err := releases.EnrichGraphForBranch(c.graph, t.root, t.prefix, branch) + if err != nil { + return daemon.EnrichReleasesResult{}, fmt.Errorf("enrich %s: %w", t.prefix, err) + } + combined.Files += count + combined.Branch = branch + } + combined.DurationMS = time.Since(started).Milliseconds() + return combined, nil +} + +// enrichTarget is one (prefix, root) pair the enrichers run against. 
+type enrichTarget struct { + prefix string + root string +} + +// resolveEnrichTargets maps the caller-supplied path scope onto the set +// of tracked repos to enrich. An empty path means "every tracked repo"; +// a non-empty path narrows to the one repo whose prefix or root matches. +// Returns an error when nothing matches so the control caller gets a +// clear "no tracked repo" message rather than a silent zero-count +// success. Caller must hold c.mu. +func (c *realController) resolveEnrichTargets(path string) ([]enrichTarget, error) { + if c.graph == nil { + return nil, fmt.Errorf("graph not initialized") + } + if c.multiIndexer == nil { + return nil, fmt.Errorf("multi-repo indexer not initialized") + } + var targets []enrichTarget + want := strings.TrimSpace(path) + for prefix, meta := range c.multiIndexer.AllMetadata() { + if meta == nil || meta.RootPath == "" { + continue + } + if want != "" && want != prefix && want != meta.RootPath { + continue + } + targets = append(targets, enrichTarget{prefix: prefix, root: meta.RootPath}) + } + if len(targets) == 0 { + return nil, fmt.Errorf("no tracked repo matches %q", path) + } + return targets, nil +} + +// EnrichBlame runs the git-blame authorship enricher against the +// daemon's graph. Mirrors EnrichChurn — c.mu is held for the duration +// and targets resolve via the multi-indexer. 
+func (c *realController) EnrichBlame(_ context.Context, p daemon.EnrichBlameParams) (daemon.EnrichBlameResult, error) { + c.mu.Lock() + defer c.mu.Unlock() + + targets, err := c.resolveEnrichTargets(p.Path) + if err != nil { + return daemon.EnrichBlameResult{}, err + } + + started := time.Now() + var combined daemon.EnrichBlameResult + for _, t := range targets { + count, err := blame.EnrichGraph(c.graph, t.root) + if err != nil { + return daemon.EnrichBlameResult{}, fmt.Errorf("enrich %s: %w", t.prefix, err) + } + combined.Nodes += count + } + combined.DurationMS = time.Since(started).Milliseconds() + return combined, nil +} + +// EnrichCoverage projects the caller-parsed cover-profile segments onto +// the daemon's graph. The CLI parses the profile (the path is relative +// to the caller's cwd, not the daemon's), so the daemon only needs the +// segments and resolves each repo's module path from its working tree. +func (c *realController) EnrichCoverage(_ context.Context, p daemon.EnrichCoverageParams) (daemon.EnrichCoverageResult, error) { + c.mu.Lock() + defer c.mu.Unlock() + + targets, err := c.resolveEnrichTargets(p.Path) + if err != nil { + return daemon.EnrichCoverageResult{}, err + } + + segments := make([]coverage.Segment, len(p.Segments)) + for i, s := range p.Segments { + segments[i] = coverage.Segment{ + File: s.File, + StartLine: s.StartLine, + EndLine: s.EndLine, + NumStmt: s.NumStmt, + Count: s.Count, + } + } + + started := time.Now() + var combined daemon.EnrichCoverageResult + combined.Segments = len(segments) + for _, t := range targets { + modulePath := coverage.ReadModulePath(t.root) + combined.Symbols += coverage.EnrichGraph(c.graph, segments, modulePath) + } + combined.DurationMS = time.Since(started).Milliseconds() + return combined, nil +} + +// EnrichCochange mines co-change edges against the daemon's graph. +// Mirrors EnrichChurn — c.mu is held for the duration and targets +// resolve via the multi-indexer. 
The repo prefix scopes the file-node +// match in multi-repo graphs. +func (c *realController) EnrichCochange(ctx context.Context, p daemon.EnrichCochangeParams) (daemon.EnrichCochangeResult, error) { + c.mu.Lock() + defer c.mu.Unlock() + + targets, err := c.resolveEnrichTargets(p.Path) + if err != nil { + return daemon.EnrichCochangeResult{}, err + } + _ = ctx // mining is synchronous; no cancellation surface today + + started := time.Now() + var combined daemon.EnrichCochangeResult + for _, t := range targets { + count, err := cochange.EnrichGraph(c.graph, t.root, t.prefix) + if err != nil { + return daemon.EnrichCochangeResult{}, fmt.Errorf("enrich %s: %w", t.prefix, err) + } + combined.Edges += count + } + combined.DurationMS = time.Since(started).Milliseconds() + return combined, nil +} + // Untrack evicts a repo from the graph and drops it from config. // PathOrPrefix accepts either an absolute path or a repo prefix. func (c *realController) Untrack(_ context.Context, p daemon.UntrackParams) (json.RawMessage, error) { diff --git a/cmd/gortex/daemon_rebuild_test.go b/cmd/gortex/daemon_rebuild_test.go new file mode 100644 index 00000000..990b0b8c --- /dev/null +++ b/cmd/gortex/daemon_rebuild_test.go @@ -0,0 +1,33 @@ +package main + +import "testing" + +type fakeRebuildYes struct{} + +func (fakeRebuildYes) NeedsRebuild() bool { return true } + +type fakeRebuildNo struct{} + +func (fakeRebuildNo) NeedsRebuild() bool { return false } + +// storeNeedsRebuild must detect the optional NeedsRebuild capability and +// default to false for backends that don't implement it (the in-memory +// store), so the warm-restart fast path is bypassed only on an explicit +// rebuild signal. 
+func TestStoreNeedsRebuild(t *testing.T) { + cases := []struct { + name string + g any + want bool + }{ + {"implements true", fakeRebuildYes{}, true}, + {"implements false", fakeRebuildNo{}, false}, + {"no capability", struct{}{}, false}, + {"nil", nil, false}, + } + for _, c := range cases { + if got := storeNeedsRebuild(c.g); got != c.want { + t.Errorf("%s: storeNeedsRebuild = %v, want %v", c.name, got, c.want) + } + } +} diff --git a/cmd/gortex/daemon_snapshot.go b/cmd/gortex/daemon_snapshot.go index c263d346..68350ebb 100644 --- a/cmd/gortex/daemon_snapshot.go +++ b/cmd/gortex/daemon_snapshot.go @@ -338,7 +338,12 @@ func migrateSnapshotFile(path string, fromVersion int) (io.Reader, error) { // The vec argument carries the workspace-global vector-search index so // a default-on daemon does not re-embed the whole graph on restart. func saveSnapshot(g *graph.Graph, repos []snapshotRepo, snapContracts []snapshotContract, vec snapshotVector, version string, logger *zap.Logger) { - _ = saveSnapshotTo(g, repos, snapContracts, vec, version, daemon.SnapshotPath(), logger) + // Memory backend: the gob+gzip dump IS the persistence layer, so + // route to the per-backend path so a future disk-backed daemon + // can't accidentally pick up this snapshot at startup. See + // daemon.BackendSnapshotPath for the memory ↔ disk-backend switch + // rationale. + _ = saveSnapshotTo(g, repos, snapContracts, vec, version, daemon.BackendSnapshotPath("memory"), logger) } // saveSnapshotTo writes the snapshot to an explicit path. Used by @@ -585,6 +590,14 @@ func fromSnapshotContract(s snapshotContract) contracts.Contract { // trades "one bad byte poisons the entire cache" for "N bad records // cost at most N files being re-indexed on next warmup." func loadSnapshot(g *graph.Graph, logger *zap.Logger) (snapshotLoadResult, error) { + // Memory backend reads from its own backend-tagged path. 
Falls + // back transparently to the legacy unsuffixed daemon.gob.gz when + // the override env is set or the new file doesn't exist yet, so + // users upgrading across this change don't have to re-warm. + res, err := loadSnapshotFrom(g, daemon.BackendSnapshotPath("memory"), logger) + if err == nil && (res.Loaded || res.Partial) { + return res, nil + } return loadSnapshotFrom(g, daemon.SnapshotPath(), logger) } @@ -592,7 +605,7 @@ func loadSnapshot(g *graph.Graph, logger *zap.Logger) (snapshotLoadResult, error // Used by `gortex server --snapshot ` so a per-workspace // process can boot from a specific snapshot file produced by the // cloud indexer worker. -func loadSnapshotFrom(g *graph.Graph, path string, logger *zap.Logger) (snapshotLoadResult, error) { +func loadSnapshotFrom(g graph.Store, path string, logger *zap.Logger) (snapshotLoadResult, error) { // Allocate Contracts up front so every early-return path (missing // file, gzip error, header decode error, schema mismatch) hands the // caller a safe-to-read zero-value instead of a nil map. The warmup @@ -664,7 +677,7 @@ func loadSnapshotFrom(g *graph.Graph, path string, logger *zap.Logger) (snapshot // rewrites edges whose source file's mtime changed, and most files // stay untouched across daemon restarts). Bumping any resolver // behaviour without bumping snapshotSchemaVersion silently degrades - // query quality until the user thinks to wipe ~/.cache/gortex. + // query quality until the user thinks to wipe ~/.gortex/cache. // // Cheap fix: if the binary that wrote the snapshot has a different // version string than the binary loading it, discard. 
Cost is one diff --git a/cmd/gortex/daemon_state.go b/cmd/gortex/daemon_state.go index 48ff7e27..b4a885e1 100644 --- a/cmd/gortex/daemon_state.go +++ b/cmd/gortex/daemon_state.go @@ -9,6 +9,7 @@ import ( "sort" "strings" "sync" + "sync/atomic" "time" "go.uber.org/zap" @@ -36,7 +37,7 @@ import ( // instance per running daemon; every session the daemon accepts shares // these pointers. type daemonState struct { - graph *graph.Graph + graph graph.Store indexer *indexer.Indexer multiIndexer *indexer.MultiIndexer configManager *config.ConfigManager @@ -95,7 +96,7 @@ type daemonState struct { // stdio transport wiring — the daemon hands frames to MCPServer.HandleMessage // via the mcpDispatcher rather than going through server.ServeStdio. // -// Any previously-tracked repos (from ~/.config/gortex/config.yaml) are +// Any previously-tracked repos (from ~/.gortex/config.yaml) are // loaded on startup so the daemon restarts pick up where it left off. // isFalsyEnv returns true when the env var is explicitly set to one // of the "no" spellings: "0", "false", "no", "off", "n". An unset or @@ -177,7 +178,20 @@ func buildDaemonState(logger *zap.Logger) (*daemonState, error) { } } - g := graph.New() + g, backendCleanup, err := openBackend(daemonBackend, daemonBackendPath, resolveDaemonBufferPoolMB(), logger) + if err != nil { + return nil, fmt.Errorf("opening backend %q: %w", daemonBackend, err) + } + // Cleanup runs at daemon shutdown via the returned state's + // teardown chain (see DaemonState.Close); store it on the + // state so deferred close fires after every other shutdown + // step (snapshot save, etc.). 
+ defer func() { + if err != nil { + backendCleanup() + } + }() + reg := parser.NewRegistry() languages.RegisterAll(reg) languages.RegisterCustomGrammars(reg, cfg.Index.Grammars, logger) @@ -190,10 +204,35 @@ func buildDaemonState(logger *zap.Logger) (*daemonState, error) { // have no signal to distinguish "indexed and unchanged" from "new // on disk", treat everything as stale, and produce duplicate // nodes/edges on every restart (bug B1). - loadResult, err := loadSnapshot(g, logger) - if err != nil { - logger.Warn("daemon: snapshot load failed", zap.Error(err)) + // + // Two snapshot shapes: + // + // - Memory backend: full graph replay (loadSnapshot). The + // gob+gzip dump IS the persistence layer; nodes + edges are + // replayed into the empty *graph.Graph. + // + // - Persistent backend (sqlite): metadata-only load + // (loadSnapshotMetadata). The graph already lives in the + // backend's own on-disk store, so the snapshot only needs to + // carry the data the backend doesn't track — per-repo + // FileMtimes, contract registries, vector index. Skipping the + // load entirely (the previous behaviour) left priorMtimes + // empty and routed every warm restart through a full + // TrackRepoCtx → BulkUpsertSymbolFTS reindex path. + var loadResult snapshotLoadResult + if mg, ok := g.(*graph.Graph); ok { + loadResult, err = loadSnapshot(mg, logger) + if err != nil { + logger.Warn("daemon: snapshot load failed", zap.Error(err)) + } } + // Disk-backed daemons don't read a metadata snapshot: per- + // repo FileMtimes live in the FileMtime sidecar table (loaded + // per-repo by priorMtimesFromStore in the parallel_parse loop + // below), KindContract nodes carry the rich contract record on + // Node.Meta (rehydrated via contracts.LoadRegistryFromGraph), + // and vector queries route to the backend's native vector index. + // The legacy gob round-trip is now memory-backend-only. 
idx := indexer.New(g, reg, cfg.Index, logger) @@ -457,14 +496,20 @@ func buildDaemonState(logger *zap.Logger) (*daemonState, error) { srv.SetLSPDiagnosticsBroadcasting() } srv.InitFeedback("", "") - srv.InitNotes("", "") - srv.InitMemories("", "") - // Daemon mode has no single repo to anchor a per-repo notebook - // against, but the agent still wants persistence across daemon - // restarts and shared visibility across sessions. Fall back to a - // global notebook under the legacy data dir; CLI mode keeps the - // per-repo .gortex/notebook/ path wired in cmd/gortex/mcp.go. - srv.InitNotebook(filepath.Join(platform.LegacyDataDir(), "notebook-cache")) + // Daemon mode has no single repo to anchor the per-repo side-stores + // against, but notes/memories must still persist across daemon + // restarts and compactions (they are independent of the graph + // backend). Wire them to the shared sidecar DB under the data dir + // with a stable "daemon" partition key; per-call WorkspaceID / + // SessionID filtering keeps repos' notes distinct at query time. + // The per-repo `gortex mcp` subprocess persists under its own + // cache dir (cmd/gortex/mcp.go). + srv.InitNotes(platform.DataDir(), "daemon") + srv.InitMemories(platform.DataDir(), "daemon") + // Notebook: a global notebook under the data dir so entries survive + // daemon restarts and are shared across sessions; CLI mode keeps the + // per-repo .gortex/ path wired in cmd/gortex/mcp.go. + srv.InitNotebook(filepath.Join(platform.DataDir(), "notebook-cache")) srv.InitCombo("", "", gortexmcp.ModeAI) srv.InitFrecency("", "", gortexmcp.ModeAI) @@ -475,7 +520,7 @@ func buildDaemonState(logger *zap.Logger) (*daemonState, error) { } // LLM service (opt-in via the `.gortex.yaml` `llm:` block, - // `~/.config/gortex/config.yaml::llm:`, or GORTEX_LLM_* env vars). + // `~/.gortex/config.yaml::llm:`, or GORTEX_LLM_* env vars). 
// Repo-local config wins per non-zero field; the global config // fills the rest; env overrides land last inside SetupLLM via // MergeEnv. The active provider is chosen by `llm.provider` @@ -652,53 +697,113 @@ func warmupDaemonState(state *daemonState, logger *zap.Logger) *indexer.MultiWat jobs := make(chan config.RepoEntry, len(repos)) var wg sync.WaitGroup + // changedRepos counts repos that actually did indexing work this + // warmup: a cold full-track, or a reconcile that re-indexed / evicted + // at least one file. When it stays zero, NOTHING on disk changed + // since the last shutdown, so the persisted graph already holds every + // resolved and derived edge — the global resolution passes below + // (RunDeferredPassesAll / RunGlobalResolve / RunGlobalGraphPasses) are + // pure recomputation and get skipped, which is what makes a true warm + // restart near-instant instead of replaying the full cold-warmup cost. + var changedRepos atomic.Int64 for i := 0; i < workers; i++ { wg.Add(1) go func() { defer wg.Done() for entry := range jobs { - // Route repos whose nodes came from the snapshot through - // ReconcileRepoCtx — it calls IncrementalReindex, which - // evicts files deleted while the daemon was down and - // re-indexes only files whose mtime changed. Repos not in - // the snapshot (newly tracked, or first startup after a - // schema bump) fall back to TrackRepoCtx, which does a - // full walk. Both paths end with the repo registered on - // the MultiIndexer; contract reconciliation is deferred - // to the single RunGlobalResolve call below. - // - // snapshotPartial == true forces the full-walk path even - // when prior mtimes exist: the partial-load signal means - // the persisted resolution state is no longer trustworthy - // (stale edges were dropped because their targets vanished), - // and the incremental path only re-resolves files whose - // mtime changed — so the dropped edges would never come - // back. 
Without this override every restart progressively - // erodes the graph until exported methods show zero - // callers despite having dozens of real call sites. - repoStart := time.Now() - priorMtimes := priorMtimesForEntry(state.snapshotRepos, entry) - if state.snapshotPartial { - priorMtimes = nil - } - pathFn := "track" - if priorMtimes != nil { - pathFn = "reconcile" - if _, err := state.multiIndexer.ReconcileRepoCtx(ctx, entry, priorMtimes); err != nil { - logger.Warn("daemon: startup reconcile failed", - zap.String("path", entry.Path), zap.Error(err)) + // Per-entry panic guard so one repo's crash during + // reindex doesn't kill the worker — the bad repo logs + // and skips, the worker proceeds to the next job, and + // warmup completes. + func(entry config.RepoEntry) { + defer func() { + if r := recover(); r != nil { + logger.Error("daemon: warmup repo panic recovered", + zap.String("path", entry.Path), + zap.Any("panic", r)) + } + }() + // Route repos whose nodes came from the snapshot through + // ReconcileRepoCtx — it calls IncrementalReindex, which + // evicts files deleted while the daemon was down and + // re-indexes only files whose mtime changed. Repos not in + // the snapshot (newly tracked, or first startup after a + // schema bump) fall back to TrackRepoCtx, which does a + // full walk. Both paths end with the repo registered on + // the MultiIndexer; contract reconciliation is deferred + // to the single RunGlobalResolve call below. + // + // snapshotPartial == true forces the full-walk path even + // when prior mtimes exist: the partial-load signal means + // the persisted resolution state is no longer trustworthy + // (stale edges were dropped because their targets vanished), + // and the incremental path only re-resolves files whose + // mtime changed — so the dropped edges would never come + // back. 
Without this override every restart progressively + // erodes the graph until exported methods show zero + // callers despite having dozens of real call sites. + repoStart := time.Now() + // Prefer mtimes stored in the backend's FileMtime + // sidecar table — that lifts the persistence off the + // gob snapshot for disk-backed backends, which is the + // path that actually rebuilds across restarts. Falls + // back to the snapshot's per-repo FileMtimes when the + // backend doesn't implement the reader (memory) or + // hasn't seen this repo yet. + priorMtimes := priorMtimesFromStore(state.graph, entry, logger) + if len(priorMtimes) == 0 { + priorMtimes = priorMtimesForEntry(state.snapshotRepos, entry) } - } else if _, err := state.multiIndexer.TrackRepoCtx(ctx, entry); err != nil { - logger.Warn("daemon: startup track failed", - zap.String("path", entry.Path), zap.Error(err)) - } - elapsed := time.Since(repoStart) - if elapsed > 2*time.Second { - logger.Info("daemon: warmup repo elapsed", - zap.String("path", entry.Path), - zap.String("path_fn", pathFn), - zap.Duration("elapsed", elapsed)) - } + if state.snapshotPartial { + priorMtimes = nil + } + // A backend that crossed a schema-rebuild migration rung + // (NeedsRebuild) has on-disk rows in the old shape that an + // incremental reconcile cannot fix. Drop prior mtimes so every + // file re-indexes into the new schema (the nil branch below + // runs a full TrackRepoCtx and marks the repo changed, so the + // global resolve/derivation passes re-run too). No-op for + // backends without the capability and whenever no rebuild rung + // was crossed — the common case. 
+ if storeNeedsRebuild(state.graph) { + if len(priorMtimes) > 0 { + logger.Info("daemon: backend signalled schema rebuild; forcing full re-index", + zap.String("path", entry.Path)) + } + priorMtimes = nil + } + pathFn := "track" + if priorMtimes != nil { + pathFn = "reconcile" + res, err := state.multiIndexer.ReconcileRepoCtx(ctx, entry, priorMtimes) + switch { + case err != nil: + logger.Warn("daemon: startup reconcile failed", + zap.String("path", entry.Path), zap.Error(err)) + // Treat a failed reconcile as "changed" so the global + // passes still run — degrade toward correctness, not + // toward the fast path, when we can't trust the delta. + changedRepos.Add(1) + case res != nil && (res.StaleFileCount > 0 || res.DeletedFileCount > 0 || len(res.FailedFiles) > 0): + changedRepos.Add(1) + } + } else { + // No prior mtimes → full cold (re)index of this repo, + // which is "changed" by definition. + changedRepos.Add(1) + if _, err := state.multiIndexer.TrackRepoCtx(ctx, entry); err != nil { + logger.Warn("daemon: startup track failed", + zap.String("path", entry.Path), zap.Error(err)) + } + } + elapsed := time.Since(repoStart) + if elapsed > 2*time.Second { + logger.Info("daemon: warmup repo elapsed", + zap.String("path", entry.Path), + zap.String("path_fn", pathFn), + zap.Duration("elapsed", elapsed)) + } + }(entry) } }() } @@ -715,19 +820,36 @@ func warmupDaemonState(state *daemonState, logger *zap.Logger) *indexer.MultiWat "elapsed_ms": time.Since(phaseStart).Milliseconds(), }) + // Warm-restart fast path. When the reconcile loop above re-indexed + // nothing, the persistent backend already carries every resolved and + // derived edge from the prior run; the deferred per-repo passes, the + // cross-repo resolve, and the graph-wide derivation passes would all + // just recompute what's on disk. 
Skipping them is what turns a warm + // restart from a multi-minute replay of the cold-warmup cost into a + // near-instant "open store, reconcile zero files, start watching". + // The in-memory backend reaches here too, but its snapshot replay + // already restored the derived edges, so the skip is equally safe. + anyChanged := changedRepos.Load() > 0 + logger.Info("daemon: warmup change detection", + zap.Int64("changed_repos", changedRepos.Load()), + zap.Int("tracked_repos", len(repos)), + zap.Bool("global_passes", anyChanged)) + // Drain deferred per-repo passes (ResolveAll / semantic enrich / // contract extract+commit) serially across the indexers the parallel // loop populated. Must run before RunGlobalResolve so cross-repo // resolution sees fully-lifted per-repo placeholder edges. - phaseStart = time.Now() - publishReadinessPhase(state, "deferred_passes_all", false, nil) - state.multiIndexer.RunDeferredPassesAll(ctx) - logger.Info("daemon: warmup phase done", - zap.String("phase", "deferred_passes_all"), - zap.Duration("elapsed", time.Since(phaseStart))) - publishReadinessPhase(state, "deferred_passes_all_done", false, map[string]any{ - "elapsed_ms": time.Since(phaseStart).Milliseconds(), - }) + if anyChanged { + phaseStart = time.Now() + publishReadinessPhase(state, "deferred_passes_all", false, nil) + state.multiIndexer.RunDeferredPassesAll(ctx) + logger.Info("daemon: warmup phase done", + zap.String("phase", "deferred_passes_all"), + zap.Duration("elapsed", time.Since(phaseStart))) + publishReadinessPhase(state, "deferred_passes_all_done", false, map[string]any{ + "elapsed_ms": time.Since(phaseStart).Milliseconds(), + }) + } // Rehydrate per-repo contract registries from the snapshot. 
Only // target indexers whose registry is still nil — a non-nil registry @@ -737,7 +859,7 @@ func warmupDaemonState(state *daemonState, logger *zap.Logger) *indexer.MultiWat // MergedContractRegistry skips them, so `contracts` returns only // the contracts of repos whose files happened to change since the // last shutdown. - if len(state.snapshotContracts) > 0 { + { phaseStart = time.Now() injectedRepos, injectedCount := 0, 0 for prefix := range state.multiIndexer.AllMetadata() { @@ -745,20 +867,32 @@ func warmupDaemonState(state *daemonState, logger *zap.Logger) *indexer.MultiWat if idx == nil || idx.ContractRegistry() != nil { continue } - cs, ok := state.snapshotContracts[prefix] - if !ok || len(cs) == 0 { - continue - } - reg := contracts.NewRegistry() - for _, c := range cs { - reg.Add(c) + // Primary path: rebuild the per-repo registry from + // KindContract nodes already in the backend's graph. + // The indexer stamps every contract record onto + // Node.Meta at commit time, so the graph is the + // authoritative source — no gob round-trip needed. + reg := contracts.LoadRegistryFromGraph(state.graph, prefix) + if reg == nil { + // Fallback to the legacy gob-snapshot path for + // daemons upgrading across this change. The + // snapshot copy is read-only by this point so the + // two sources can't drift mid-flight. 
+ cs, ok := state.snapshotContracts[prefix] + if !ok || len(cs) == 0 { + continue + } + reg = contracts.NewRegistry() + for _, c := range cs { + reg.Add(c) + } } idx.SetContractRegistry(reg) injectedRepos++ - injectedCount += len(cs) + injectedCount += len(reg.All()) } if injectedRepos > 0 { - logger.Info("daemon: rehydrated contract registries from snapshot", + logger.Info("daemon: rehydrated contract registries from graph/snapshot", zap.Int("repos", injectedRepos), zap.Int("contracts", injectedCount), zap.Duration("elapsed", time.Since(phaseStart))) @@ -788,24 +922,33 @@ func warmupDaemonState(state *daemonState, logger *zap.Logger) *indexer.MultiWat // for a fresh-start daemon (where there's no snapshot to reconcile // against). After resolution, contract bridge edges may have // changed too, so ReconcileContractEdges runs again. - phaseStart = time.Now() - publishReadinessPhase(state, "global_resolve", false, nil) - state.multiIndexer.RunGlobalResolve() - logger.Info("daemon: warmup phase done", - zap.String("phase", "global_resolve"), - zap.Duration("elapsed", time.Since(phaseStart))) - publishReadinessPhase(state, "global_resolve_done", false, map[string]any{ - "elapsed_ms": time.Since(phaseStart).Milliseconds(), - }) + if anyChanged { + phaseStart = time.Now() + publishReadinessPhase(state, "global_resolve", false, nil) + state.multiIndexer.RunGlobalResolve() + logger.Info("daemon: warmup phase done", + zap.String("phase", "global_resolve"), + zap.Duration("elapsed", time.Since(phaseStart))) + publishReadinessPhase(state, "global_resolve_done", false, map[string]any{ + "elapsed_ms": time.Since(phaseStart).Milliseconds(), + }) + } // Finish the batch: turn off the per-repo skip flag and run the // graph-wide derivation passes once. RunGlobalResolve above just // lifted the last cross-repo placeholder EdgeCalls, so EdgeTests // derivation here picks up cross-repo test→subject pairs that - // were unresolved during the per-repo loop. 
+ // were unresolved during the per-repo loop. On the warm-restart fast + // path (nothing changed) ResetBatch clears the deferred-batch flags + // without re-running those passes — the persisted graph already has + // the derived edges. phaseStart = time.Now() publishReadinessPhase(state, "end_batch", false, nil) - state.multiIndexer.EndBatch() + if anyChanged { + state.multiIndexer.EndBatch() + } else { + state.multiIndexer.ResetBatch() + } logger.Info("daemon: warmup phase done", zap.String("phase", "end_batch"), zap.Duration("elapsed", time.Since(phaseStart))) @@ -865,6 +1008,51 @@ func publishReadinessPhase(state *daemonState, phase string, ready bool, extra m state.mcpServer.PublishReadiness(phase, ready, extra) } +// priorMtimesFromStore asks the backend for its persisted FileMtime +// rows for the repo described by entry. Returns nil when the backend +// doesn't implement the reader (in-memory backend) or has no recorded +// mtimes for the repo (fresh cold start). When non-nil it short- +// circuits the gob-snapshot lookup so the warm path is driven by +// data the backend persisted itself. 
+func priorMtimesFromStore(g graph.Store, entry config.RepoEntry, logger *zap.Logger) map[string]int64 { + reader, ok := g.(graph.FileMtimeReader) + if !ok { + if logger != nil { + logger.Info("daemon: priorMtimesFromStore: store does not implement FileMtimeReader") + } + return nil + } + prefix := strings.TrimPrefix(config.ResolvePrefix(entry), "/") + if prefix == "" { + if logger != nil { + logger.Info("daemon: priorMtimesFromStore: empty prefix", + zap.String("entry_path", entry.Path), + zap.String("entry_name", entry.Name)) + } + return nil + } + mtimes := reader.LoadFileMtimes(prefix) + if logger != nil { + logger.Info("daemon: priorMtimesFromStore loaded", + zap.String("prefix", prefix), + zap.Int("count", len(mtimes))) + } + return mtimes +} + +// storeNeedsRebuild reports whether the backend signalled, via the optional +// NeedsRebuild capability, that a schema migration crossed a rung an ALTER +// could not satisfy — so its persisted rows are in an old shape and the +// warm/incremental reconcile must be bypassed for a full re-index. This is a +// generic, opt-in capability probe: a backend implements NeedsRebuild() bool +// to participate. No backend currently does, so this always reports false; +// it stays as a hook for any future on-disk store that needs schema-version +// gating on warm restart. +func storeNeedsRebuild(g any) bool { + rb, ok := g.(interface{ NeedsRebuild() bool }) + return ok && rb.NeedsRebuild() +} + // priorMtimesForEntry finds the snapshotted FileMtimes map for a // configured repo entry, matching on absolute RootPath. 
Falls back to // prefix-based lookup when no path match is found — useful if the diff --git a/cmd/gortex/enrich.go b/cmd/gortex/enrich.go index f2d1743c..416eacb8 100644 --- a/cmd/gortex/enrich.go +++ b/cmd/gortex/enrich.go @@ -2,46 +2,43 @@ package main import ( "encoding/json" + "errors" "fmt" "os" + "path/filepath" "github.com/spf13/cobra" - "github.com/zzet/gortex/internal/blame" - "github.com/zzet/gortex/internal/cochange" - "github.com/zzet/gortex/internal/config" "github.com/zzet/gortex/internal/coverage" - "github.com/zzet/gortex/internal/graph" - "github.com/zzet/gortex/internal/indexer" - "github.com/zzet/gortex/internal/parser" - "github.com/zzet/gortex/internal/parser/languages" + "github.com/zzet/gortex/internal/daemon" "github.com/zzet/gortex/internal/progress" - "github.com/zzet/gortex/internal/releases" ) var enrichCmd = &cobra.Command{ Use: "enrich", - Short: "Run one-shot enrichments (blame, coverage) against an indexed repo", - Long: `Enrich indexes a repository in-process and stamps additional metadata -onto graph nodes from external data sources — git blame for authorship, -Go cover profiles for test coverage. Useful for CI pipelines or one-off -snapshots where the daemon isn't running. Equivalent to invoking the -` + "`analyze kind=blame`" + ` / ` + "`analyze kind=coverage`" + ` MCP tools against a fresh -index.`, + Short: "Run one-shot enrichments (churn, blame, coverage, releases, cochange) via the running daemon", + Long: `Enrich stamps additional metadata onto the daemon's graph from +external data sources — git blame for authorship, git history for churn +and co-change, git tags for release timelines, and Go cover profiles for +test coverage. + +Every enrichment is forwarded to the running daemon, which owns the warm +graph and its on-disk store write lock. The daemon runs the enricher +in-process against that graph so the persisted metadata is immediately +queryable by the analyze / get_churn_rate / coverage tools. 
+ +A daemon must be running. If none is, the command exits with an error +rather than building a throwaway in-memory graph that nothing would +read — start one with ` + "`gortex daemon start`" + ` and re-run.`, } -var ( - enrichBlameSnapshot string - enrichCoverageSnapshot string - enrichReleasesSnapshot string - enrichCochangeSnapshot string +var enrichReleasesBranch string - enrichAllSnapshot string - enrichAllBlame bool - enrichAllReleases bool - enrichAllCochange bool - enrichAllProfile string -) +// errNoDaemon is the single clean error every enrich subcommand returns +// when no daemon is reachable. The enrichers only make sense against the +// daemon's warm, prefix-stamped graph; a standalone in-memory pass would +// be discarded and a direct on-disk write would race the daemon's writer. +var errNoDaemon = errors.New("enrich requires a running daemon; start it with `gortex daemon start`") var enrichBlameCmd = &cobra.Command{ Use: "blame [path]", @@ -71,33 +68,35 @@ var enrichCochangeCmd = &cobra.Command{ RunE: runEnrichCochange, } +var ( + enrichAllBlame bool + enrichAllReleases bool + enrichAllCochange bool + enrichAllChurn bool + enrichAllProfile string +) + var enrichAllCmd = &cobra.Command{ Use: "all [path]", - Short: "Index once and run multiple enrichments in a single pass", - Long: `Combined enrichment that indexes the target path once, then runs -the requested enrichments against the same in-memory graph. Avoids -the ~3x indexing cost of running blame, coverage, and releases as -three separate subcommand invocations. - -By default runs blame and releases (both git-only, no extra data -needed). Pass --coverage to also run coverage enrichment. -Each enrichment is independently optional via --no-blame / ---no-releases flags should you want a subset.`, + Short: "Run every enrichment against the daemon's graph in one invocation", + Long: `Combined enrichment that runs the requested enrichers against the +daemon's graph via successive control calls. 
+ +By default runs churn, blame, releases, and co-change (all git-only, no +extra data needed). Pass --coverage to also project a Go cover +profile. Each enrichment is independently toggleable via the +--no-churn / --no-blame / --no-releases / --no-cochange flags. + +Like every enrich subcommand, this requires a running daemon.`, Args: cobra.MaximumNArgs(1), RunE: runEnrichAll, } func init() { - enrichBlameCmd.Flags().StringVar(&enrichBlameSnapshot, "snapshot", "", - "write the enriched graph as a gob.gz snapshot to this path") - enrichCoverageCmd.Flags().StringVar(&enrichCoverageSnapshot, "snapshot", "", - "write the enriched graph as a gob.gz snapshot to this path") - enrichReleasesCmd.Flags().StringVar(&enrichReleasesSnapshot, "snapshot", "", - "write the enriched graph as a gob.gz snapshot to this path") - enrichCochangeCmd.Flags().StringVar(&enrichCochangeSnapshot, "snapshot", "", - "write the enriched graph as a gob.gz snapshot to this path") - enrichAllCmd.Flags().StringVar(&enrichAllSnapshot, "snapshot", "", - "write the enriched graph as a gob.gz snapshot to this path") + enrichReleasesCmd.Flags().StringVar(&enrichReleasesBranch, "branch", "", + "restrict to tags reachable from this branch (default: resolve origin/main/master). Empty means every tag in the repo") + enrichAllCmd.Flags().BoolVar(&enrichAllChurn, "churn", true, + "run churn enrichment (default: on)") enrichAllCmd.Flags().BoolVar(&enrichAllBlame, "blame", true, "run blame enrichment (default: on)") enrichAllCmd.Flags().BoolVar(&enrichAllReleases, "releases", true, @@ -114,274 +113,291 @@ func init() { rootCmd.AddCommand(enrichCmd) } -func runEnrichAll(cmd *cobra.Command, args []string) error { - logger := newLogger() - defer func() { _ = logger.Sync() }() - +// enrichAbsPath resolves the optional [path] argument to an absolute +// path. 
Empty args default to the current directory; the abs path is the +// repo scope handed to the daemon (matched against tracked prefixes / +// roots, or "" for "every tracked repo"). +func enrichAbsPath(args []string) (string, error) { path := "." if len(args) >= 1 { path = args[0] } - - cfg, err := config.Load(cfgFile) + abs, err := filepath.Abs(path) if err != nil { - return err - } - - g := graph.New() - reg := parser.NewRegistry() - languages.RegisterAll(reg) - idx := indexer.New(g, reg, cfg.Index, loggerForSpinner(cmd, logger)) - - if err := indexWithSpinner(cmd, idx, path); err != nil { - return err - } - - result := map[string]any{ - "root": idx.RootPath(), + return "", fmt.Errorf("abs path %q: %w", path, err) } + return abs, nil +} - if enrichAllBlame { - sp := newCLISpinner(cmd, "Stamping blame") - count, err := blame.EnrichGraph(g, idx.RootPath()) - if err != nil { - sp.Fail(err) - return fmt.Errorf("blame: %w", err) - } - sp.Set("", fmt.Sprintf("%d nodes stamped", count)) - sp.Done() - result["blame_enriched"] = count - } - if enrichAllReleases { - sp := newCLISpinner(cmd, "Stamping releases") - count, err := releases.EnrichGraph(g, idx.RootPath()) - if err != nil { - sp.Fail(err) - return fmt.Errorf("releases: %w", err) +// dialEnrichDaemon opens a control connection to the running daemon for +// the given client name. Callers must have already checked +// daemon.IsRunning(); a dial failure here means the socket was present +// but unusable (a dying daemon) — surfaced as a clear error. 
+func dialEnrichDaemon(clientName string) (*daemon.Client, error) { + c, err := daemon.Dial(daemon.Handshake{Mode: daemon.ModeControl, ClientName: clientName}) + if err != nil { + if errors.Is(err, daemon.ErrDaemonUnavailable) { + return nil, fmt.Errorf("daemon socket detected but dial failed; restart it with `gortex daemon restart`") } - sp.Set("", fmt.Sprintf("%d files stamped", count)) - sp.Done() - result["releases_enriched"] = count + return nil, fmt.Errorf("dial daemon: %w", err) } - if enrichAllCochange { - sp := newCLISpinner(cmd, "Mining co-change") - count, err := cochange.EnrichGraph(g, idx.RootPath(), "") - if err != nil { - sp.Fail(err) - return fmt.Errorf("cochange: %w", err) - } - sp.Set("", fmt.Sprintf("%d edges added", count)) - sp.Done() - result["cochange_edges"] = count + return c, nil +} + +// controlEnrich sends one control request on c, validates the daemon +// accepted it, and decodes the typed result into out (which must be a +// pointer). Centralises the OK / error-code handling every forwarder +// repeats. 
+func controlEnrich(c *daemon.Client, kind string, params, out any) error { + resp, err := c.Control(kind, params) + if err != nil { + return fmt.Errorf("control %s: %w", kind, err) } - if enrichAllProfile != "" { - sp := newCLISpinner(cmd, "Stamping coverage") - sp.Set("", enrichAllProfile) - segments, err := coverage.ParseFile(enrichAllProfile) - if err != nil { - sp.Fail(err) - return fmt.Errorf("read profile: %w", err) - } - modulePath := coverage.ReadModulePath(idx.RootPath()) - count := coverage.EnrichGraph(g, segments, modulePath) - sp.Set("", fmt.Sprintf("%d symbols · %d segments", count, len(segments))) - sp.Done() - result["coverage_enriched"] = count - result["coverage_segments"] = len(segments) + if !resp.OK { + return fmt.Errorf("daemon rejected %s [%s]: %s", kind, resp.ErrorCode, resp.ErrorMsg) } - - if enrichAllSnapshot != "" { - if err := saveSnapshotTo(g, nil, nil, snapshotVector{}, "gortex-enrich-all", enrichAllSnapshot, logger); err != nil { - return fmt.Errorf("write snapshot %s: %w", enrichAllSnapshot, err) + if out != nil && len(resp.Result) > 0 { + if err := json.Unmarshal(resp.Result, out); err != nil { + return fmt.Errorf("parse daemon %s response: %w", kind, err) } - result["snapshot"] = enrichAllSnapshot } - return printEnrichResult(result) + return nil } -func runEnrichReleases(cmd *cobra.Command, args []string) error { - logger := newLogger() - defer func() { _ = logger.Sync() }() - - path := "." 
- if len(args) >= 1 { - path = args[0] +func runEnrichBlame(cmd *cobra.Command, args []string) error { + abs, err := enrichAbsPath(args) + if err != nil { + return err } - - cfg, err := config.Load(cfgFile) + if !daemon.IsRunning() { + return errNoDaemon + } + c, err := dialEnrichDaemon("cli-enrich-blame") if err != nil { return err } + defer func() { _ = c.Close() }() - g := graph.New() - reg := parser.NewRegistry() - languages.RegisterAll(reg) - idx := indexer.New(g, reg, cfg.Index, loggerForSpinner(cmd, logger)) - - if err := indexWithSpinner(cmd, idx, path); err != nil { + var out daemon.EnrichBlameResult + if err := controlEnrich(c, daemon.ControlEnrichBlame, daemon.EnrichBlameParams{Path: abs}, &out); err != nil { return err } + sp := newCLISpinner(cmd, "Enriched via daemon") + sp.Set("", fmt.Sprintf("%d nodes stamped", out.Nodes)) + sp.Done() + return printEnrichResult(map[string]any{ + "enriched": out.Nodes, + "duration_ms": out.DurationMS, + "path": abs, + "mode": "daemon", + }) +} - sp := newCLISpinner(cmd, "Stamping releases") - count, err := releases.EnrichGraph(g, idx.RootPath()) +func runEnrichCoverage(cmd *cobra.Command, args []string) error { + profilePath := args[0] + abs, err := enrichAbsPath(args[1:]) if err != nil { - sp.Fail(err) - return fmt.Errorf("releases: %w", err) + return err } - sp.Set("", fmt.Sprintf("%d files stamped", count)) - sp.Done() - - result := map[string]any{ - "enriched": count, - "root": idx.RootPath(), + // Parse the profile CLI-side: the path is relative to the caller's + // cwd, not the daemon's, so the daemon can't read it. We hand the + // daemon the parsed segments instead. 
+ segments, err := coverage.ParseFile(profilePath) + if err != nil { + return fmt.Errorf("read profile: %w", err) } - if enrichReleasesSnapshot != "" { - if err := saveSnapshotTo(g, nil, nil, snapshotVector{}, "gortex-enrich-releases", enrichReleasesSnapshot, logger); err != nil { - return fmt.Errorf("write snapshot %s: %w", enrichReleasesSnapshot, err) - } - result["snapshot"] = enrichReleasesSnapshot + if !daemon.IsRunning() { + return errNoDaemon } - return printEnrichResult(result) -} - -func runEnrichCochange(cmd *cobra.Command, args []string) error { - logger := newLogger() - defer func() { _ = logger.Sync() }() - - path := "." - if len(args) >= 1 { - path = args[0] - } - - cfg, err := config.Load(cfgFile) + c, err := dialEnrichDaemon("cli-enrich-coverage") if err != nil { return err } + defer func() { _ = c.Close() }() - g := graph.New() - reg := parser.NewRegistry() - languages.RegisterAll(reg) - idx := indexer.New(g, reg, cfg.Index, loggerForSpinner(cmd, logger)) + wire := make([]daemon.EnrichCoverageSegment, len(segments)) + for i, s := range segments { + wire[i] = daemon.EnrichCoverageSegment{ + File: s.File, + StartLine: s.StartLine, + EndLine: s.EndLine, + NumStmt: s.NumStmt, + Count: s.Count, + } + } - if err := indexWithSpinner(cmd, idx, path); err != nil { + var out daemon.EnrichCoverageResult + if err := controlEnrich(c, daemon.ControlEnrichCoverage, daemon.EnrichCoverageParams{Path: abs, Segments: wire}, &out); err != nil { return err } + sp := newCLISpinner(cmd, "Enriched via daemon") + sp.Set("", fmt.Sprintf("%d symbols · %d segments", out.Symbols, out.Segments)) + sp.Done() + return printEnrichResult(map[string]any{ + "enriched": out.Symbols, + "segments": out.Segments, + "profile": profilePath, + "duration_ms": out.DurationMS, + "path": abs, + "mode": "daemon", + }) +} - sp := newCLISpinner(cmd, "Mining co-change") - count, err := cochange.EnrichGraph(g, idx.RootPath(), "") +func runEnrichReleases(cmd *cobra.Command, args []string) error { + 
abs, err := enrichAbsPath(args) if err != nil { - sp.Fail(err) - return fmt.Errorf("cochange: %w", err) + return err } - sp.Set("", fmt.Sprintf("%d edges added", count)) - sp.Done() - - result := map[string]any{ - "enriched": count, - "root": idx.RootPath(), + if !daemon.IsRunning() { + return errNoDaemon } - if enrichCochangeSnapshot != "" { - if err := saveSnapshotTo(g, nil, nil, snapshotVector{}, "gortex-enrich-cochange", enrichCochangeSnapshot, logger); err != nil { - return fmt.Errorf("write snapshot %s: %w", enrichCochangeSnapshot, err) - } - result["snapshot"] = enrichCochangeSnapshot + c, err := dialEnrichDaemon("cli-enrich-releases") + if err != nil { + return err } - return printEnrichResult(result) -} - -func runEnrichBlame(cmd *cobra.Command, args []string) error { - logger := newLogger() - defer func() { _ = logger.Sync() }() + defer func() { _ = c.Close() }() - path := "." - if len(args) >= 1 { - path = args[0] + var out daemon.EnrichReleasesResult + if err := controlEnrich(c, daemon.ControlEnrichReleases, daemon.EnrichReleasesParams{Path: abs, Branch: enrichReleasesBranch}, &out); err != nil { + return err } + sp := newCLISpinner(cmd, "Enriched via daemon") + sp.Set("", fmt.Sprintf("%d files · %s", out.Files, out.Branch)) + sp.Done() + return printEnrichResult(map[string]any{ + "enriched": out.Files, + "branch": out.Branch, + "duration_ms": out.DurationMS, + "path": abs, + "mode": "daemon", + }) +} - cfg, err := config.Load(cfgFile) +func runEnrichCochange(cmd *cobra.Command, args []string) error { + abs, err := enrichAbsPath(args) if err != nil { return err } - - g := graph.New() - reg := parser.NewRegistry() - languages.RegisterAll(reg) - idx := indexer.New(g, reg, cfg.Index, loggerForSpinner(cmd, logger)) - - if err := indexWithSpinner(cmd, idx, path); err != nil { + if !daemon.IsRunning() { + return errNoDaemon + } + c, err := dialEnrichDaemon("cli-enrich-cochange") + if err != nil { return err } + defer func() { _ = c.Close() }() - sp := 
newCLISpinner(cmd, "Stamping blame") - count, err := blame.EnrichGraph(g, idx.RootPath()) - if err != nil { - sp.Fail(err) - return fmt.Errorf("blame: %w", err) + var out daemon.EnrichCochangeResult + if err := controlEnrich(c, daemon.ControlEnrichCochange, daemon.EnrichCochangeParams{Path: abs}, &out); err != nil { + return err } - sp.Set("", fmt.Sprintf("%d nodes stamped", count)) + sp := newCLISpinner(cmd, "Enriched via daemon") + sp.Set("", fmt.Sprintf("%d edges added", out.Edges)) sp.Done() + return printEnrichResult(map[string]any{ + "enriched": out.Edges, + "duration_ms": out.DurationMS, + "path": abs, + "mode": "daemon", + }) +} - result := map[string]any{ - "enriched": count, - "root": idx.RootPath(), +func runEnrichAll(cmd *cobra.Command, args []string) error { + abs, err := enrichAbsPath(args) + if err != nil { + return err } - if enrichBlameSnapshot != "" { - if err := saveSnapshotTo(g, nil, nil, snapshotVector{}, "gortex-enrich-blame", enrichBlameSnapshot, logger); err != nil { - return fmt.Errorf("write snapshot %s: %w", enrichBlameSnapshot, err) + // Parse the coverage profile (if any) up front so a bad path fails + // before we touch the daemon. + var covSegments []daemon.EnrichCoverageSegment + if enrichAllProfile != "" { + segments, err := coverage.ParseFile(enrichAllProfile) + if err != nil { + return fmt.Errorf("read profile: %w", err) + } + covSegments = make([]daemon.EnrichCoverageSegment, len(segments)) + for i, s := range segments { + covSegments[i] = daemon.EnrichCoverageSegment{ + File: s.File, + StartLine: s.StartLine, + EndLine: s.EndLine, + NumStmt: s.NumStmt, + Count: s.Count, + } } - result["snapshot"] = enrichBlameSnapshot } - return printEnrichResult(result) -} - -func runEnrichCoverage(cmd *cobra.Command, args []string) error { - logger := newLogger() - defer func() { _ = logger.Sync() }() - - profilePath := args[0] - path := "." 
- if len(args) >= 2 { - path = args[1] + if !daemon.IsRunning() { + return errNoDaemon } - - cfg, err := config.Load(cfgFile) + c, err := dialEnrichDaemon("cli-enrich-all") if err != nil { return err } + defer func() { _ = c.Close() }() - g := graph.New() - reg := parser.NewRegistry() - languages.RegisterAll(reg) - idx := indexer.New(g, reg, cfg.Index, loggerForSpinner(cmd, logger)) - - if err := indexWithSpinner(cmd, idx, path); err != nil { - return err + result := map[string]any{ + "path": abs, + "mode": "daemon", } - sp := newCLISpinner(cmd, "Stamping coverage") - sp.Set("", profilePath) - segments, err := coverage.ParseFile(profilePath) - if err != nil { - sp.Fail(err) - return fmt.Errorf("read profile: %w", err) + if enrichAllChurn { + sp := newCLISpinner(cmd, "Stamping churn") + var out daemon.EnrichChurnResult + if err := controlEnrich(c, daemon.ControlEnrichChurn, daemon.EnrichChurnParams{Path: abs}, &out); err != nil { + sp.Fail(err) + return err + } + sp.Set("", fmt.Sprintf("%d files · %d symbols", out.Files, out.Symbols)) + sp.Done() + result["churn_files"] = out.Files + result["churn_symbols"] = out.Symbols + result["churn_branch"] = out.Branch } - modulePath := coverage.ReadModulePath(idx.RootPath()) - count := coverage.EnrichGraph(g, segments, modulePath) - sp.Set("", fmt.Sprintf("%d symbols · %d segments", count, len(segments))) - sp.Done() - - result := map[string]any{ - "enriched": count, - "segments": len(segments), - "profile": profilePath, - "module_path": modulePath, - "root": idx.RootPath(), + if enrichAllBlame { + sp := newCLISpinner(cmd, "Stamping blame") + var out daemon.EnrichBlameResult + if err := controlEnrich(c, daemon.ControlEnrichBlame, daemon.EnrichBlameParams{Path: abs}, &out); err != nil { + sp.Fail(err) + return err + } + sp.Set("", fmt.Sprintf("%d nodes stamped", out.Nodes)) + sp.Done() + result["blame_enriched"] = out.Nodes + } + if enrichAllReleases { + sp := newCLISpinner(cmd, "Stamping releases") + var out 
daemon.EnrichReleasesResult + if err := controlEnrich(c, daemon.ControlEnrichReleases, daemon.EnrichReleasesParams{Path: abs}, &out); err != nil { + sp.Fail(err) + return err + } + sp.Set("", fmt.Sprintf("%d files stamped", out.Files)) + sp.Done() + result["releases_enriched"] = out.Files } - if enrichCoverageSnapshot != "" { - if err := saveSnapshotTo(g, nil, nil, snapshotVector{}, "gortex-enrich-coverage", enrichCoverageSnapshot, logger); err != nil { - return fmt.Errorf("write snapshot %s: %w", enrichCoverageSnapshot, err) + if enrichAllCochange { + sp := newCLISpinner(cmd, "Mining co-change") + var out daemon.EnrichCochangeResult + if err := controlEnrich(c, daemon.ControlEnrichCochange, daemon.EnrichCochangeParams{Path: abs}, &out); err != nil { + sp.Fail(err) + return err } - result["snapshot"] = enrichCoverageSnapshot + sp.Set("", fmt.Sprintf("%d edges added", out.Edges)) + sp.Done() + result["cochange_edges"] = out.Edges + } + if len(covSegments) > 0 { + sp := newCLISpinner(cmd, "Stamping coverage") + sp.Set("", enrichAllProfile) + var out daemon.EnrichCoverageResult + if err := controlEnrich(c, daemon.ControlEnrichCoverage, daemon.EnrichCoverageParams{Path: abs, Segments: covSegments}, &out); err != nil { + sp.Fail(err) + return err + } + sp.Set("", fmt.Sprintf("%d symbols · %d segments", out.Symbols, out.Segments)) + sp.Done() + result["coverage_enriched"] = out.Symbols + result["coverage_segments"] = out.Segments } return printEnrichResult(result) } @@ -389,15 +405,12 @@ func runEnrichCoverage(cmd *cobra.Command, args []string) error { // printEnrichResult emits the enrichment summary as JSON when stdout // is captured by a script and as a one-line human-readable text // when invoked interactively. On a terminal we keep stdout quiet — the -// spinner already showed the per-pass count — and just caption the root / -// snapshot path. On a pipe / redirect we still emit JSON for scripts. 
+// spinner already showed the per-pass count — and just caption the path / +// profile. On a pipe / redirect we still emit JSON for scripts. func printEnrichResult(payload map[string]any) error { if progress.IsTTY(os.Stdout) { - if v, ok := payload["root"]; ok { - _, _ = fmt.Fprintln(os.Stdout, " "+progress.Caption("root: "+fmt.Sprint(v))) - } - if v, ok := payload["snapshot"]; ok { - _, _ = fmt.Fprintln(os.Stdout, " "+progress.Caption("snapshot: "+fmt.Sprint(v))) + if v, ok := payload["path"]; ok { + _, _ = fmt.Fprintln(os.Stdout, " "+progress.Caption("path: "+fmt.Sprint(v))) } if v, ok := payload["profile"]; ok { _, _ = fmt.Fprintln(os.Stdout, " "+progress.Caption("profile: "+fmt.Sprint(v))) diff --git a/cmd/gortex/enrich_churn.go b/cmd/gortex/enrich_churn.go new file mode 100644 index 00000000..190d5c84 --- /dev/null +++ b/cmd/gortex/enrich_churn.go @@ -0,0 +1,72 @@ +package main + +import ( + "fmt" + + "github.com/spf13/cobra" + + "github.com/zzet/gortex/internal/daemon" +) + +var enrichChurnBranch string + +var enrichChurnCmd = &cobra.Command{ + Use: "churn [path]", + Short: "Pre-compute per-symbol git churn from a fixed branch (default: origin/main)", + Long: `Walks the daemon's graph and stamps meta.churn on every file and +function/method with the commit_count / age_days / churn_rate / +last_author / last_commit_at metrics the get_churn_rate MCP tool reads. + +The signal is computed against a single branch — typically the +repository's default branch — so feature-branch work-in-progress +doesn't pollute the persisted data. Pass --branch to override. + +The enrichment is forwarded to the running daemon, which runs it against +its in-process graph and persists the result (avoiding the on-disk store +write-lock collision a direct CLI write would cause). 
A daemon must be +running; if none is, the command exits with an error — start one with +` + "`gortex daemon start`" + `.`, + Args: cobra.MaximumNArgs(1), + RunE: runEnrichChurn, +} + +func init() { + enrichChurnCmd.Flags().StringVar(&enrichChurnBranch, "branch", "", + "branch / tag / SHA to compute churn against (default: origin/main, falls back to local main/master)") + enrichCmd.AddCommand(enrichChurnCmd) +} + +func runEnrichChurn(cmd *cobra.Command, args []string) error { + abs, err := enrichAbsPath(args) + if err != nil { + return err + } + if !daemon.IsRunning() { + return errNoDaemon + } + c, err := dialEnrichDaemon("cli-enrich-churn") + if err != nil { + return err + } + defer func() { _ = c.Close() }() + + var out daemon.EnrichChurnResult + if err := controlEnrich(c, daemon.ControlEnrichChurn, daemon.EnrichChurnParams{ + Path: abs, + Branch: enrichChurnBranch, + }, &out); err != nil { + return err + } + sp := newCLISpinner(cmd, "Enriched via daemon") + sp.Set("", fmt.Sprintf("%d files · %d symbols · %s", out.Files, out.Symbols, out.Branch)) + sp.Done() + return printEnrichResult(map[string]any{ + "files": out.Files, + "symbols": out.Symbols, + "branch": out.Branch, + "head_sha": out.HeadSHA, + "duration_ms": out.DurationMS, + "path": abs, + "mode": "daemon", + }) +} diff --git a/cmd/gortex/enrich_test.go b/cmd/gortex/enrich_test.go new file mode 100644 index 00000000..e0ce4fca --- /dev/null +++ b/cmd/gortex/enrich_test.go @@ -0,0 +1,84 @@ +package main + +import ( + "errors" + "os" + "path/filepath" + "testing" + + "github.com/spf13/cobra" +) + +// noDaemonSocket points GORTEX_DAEMON_SOCKET at a path with no listener +// so daemon.IsRunning() reports false for the duration of the test. 
+func noDaemonSocket(t *testing.T) { + t.Helper() + dir, err := os.MkdirTemp("/tmp", "gx-enrich") + if err != nil { + t.Fatalf("mktemp: %v", err) + } + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + t.Setenv("GORTEX_DAEMON_SOCKET", filepath.Join(dir, "no-such-socket")) +} + +// TestEnrichSubcommands_NoDaemon_Errors confirms every enrich subcommand +// refuses to run when no daemon is reachable, returning the single clean +// errNoDaemon rather than silently building a throwaway in-memory graph. +func TestEnrichSubcommands_NoDaemon_Errors(t *testing.T) { + noDaemonSocket(t) + + cases := []struct { + name string + run func(*cobra.Command, []string) error + args []string + }{ + {"churn", runEnrichChurn, nil}, + {"blame", runEnrichBlame, nil}, + {"releases", runEnrichReleases, nil}, + {"cochange", runEnrichCochange, nil}, + {"all", runEnrichAll, nil}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + err := tc.run(&cobra.Command{}, tc.args) + if !errors.Is(err, errNoDaemon) { + t.Fatalf("expected errNoDaemon, got %v", err) + } + }) + } +} + +// TestEnrichCoverage_NoDaemon_Errors confirms coverage also requires a +// daemon. The profile is parsed first (a real cover.out on disk), so the +// no-daemon guard fires after a successful parse — proving the error is +// the daemon check, not a parse failure. +func TestEnrichCoverage_NoDaemon_Errors(t *testing.T) { + noDaemonSocket(t) + + dir := t.TempDir() + profile := filepath.Join(dir, "cover.out") + const body = "mode: set\nexample.com/m/a.go:1.1,3.2 2 1\n" + if err := os.WriteFile(profile, []byte(body), 0o600); err != nil { + t.Fatalf("write profile: %v", err) + } + + err := runEnrichCoverage(&cobra.Command{}, []string{profile}) + if !errors.Is(err, errNoDaemon) { + t.Fatalf("expected errNoDaemon, got %v", err) + } +} + +// TestEnrichCoverage_BadProfile_Errors confirms a missing profile path +// fails before the daemon check, with a read error rather than the +// no-daemon error. 
+func TestEnrichCoverage_BadProfile_Errors(t *testing.T) { + noDaemonSocket(t) + + err := runEnrichCoverage(&cobra.Command{}, []string{"/no/such/profile.out"}) + if err == nil { + t.Fatal("expected an error for a missing profile") + } + if errors.Is(err, errNoDaemon) { + t.Fatalf("expected a profile read error, got the no-daemon error: %v", err) + } +} diff --git a/cmd/gortex/eval_embedders.go b/cmd/gortex/eval_embedders.go index 006b350e..bc48d5fd 100644 --- a/cmd/gortex/eval_embedders.go +++ b/cmd/gortex/eval_embedders.go @@ -287,7 +287,7 @@ func onnxSizeMB(spec embedding.HugotVariant) float64 { break } } - modelDir := filepath.Join(platform.CacheDir(), "models", cacheDir) + modelDir := filepath.Join(platform.ModelsDir(), cacheDir) candidates := []string{ filepath.Join(modelDir, spec.OnnxFile), filepath.Join(modelDir, filepath.Base(spec.OnnxFile)), diff --git a/cmd/gortex/eval_recall.go b/cmd/gortex/eval_recall.go index 90a01562..ce7d3f83 100644 --- a/cmd/gortex/eval_recall.go +++ b/cmd/gortex/eval_recall.go @@ -337,7 +337,7 @@ func newRecallLogger() *zap.Logger { // chooseEmbedder honours --embeddings-url > --embedder > --embeddings > off. // Default with --embeddings is the best local provider (Hugot MiniLM-L6-v2 -// auto-downloads to ~/.cache/gortex/models/ on first use). Users can force +// auto-downloads to ~/.gortex/models/ on first use). Users can force // static GloVe with --embedder static. func chooseEmbedder() embedding.Provider { if evalRecallEmbeddingsURL != "" { diff --git a/cmd/gortex/gain.go b/cmd/gortex/gain.go index 3b787834..ecc80fb8 100644 --- a/cmd/gortex/gain.go +++ b/cmd/gortex/gain.go @@ -50,7 +50,7 @@ Default behavior: 1. Find the most recent gortex bench tokens output (auto-discovery under bench/results/, then a transparent re-run when none). 2. Render a USD-per-model card scaled to --responses-per-day. - 3. Append a short "Your history" section from ~/.cache/gortex/savings.json + 3. 
Append a short "Your history" section from ~/.gortex/cache/savings.json when --since's window has any tracked calls. Flags: diff --git a/cmd/gortex/git.go b/cmd/gortex/git.go index 3cebfc41..f0ff7405 100644 --- a/cmd/gortex/git.go +++ b/cmd/gortex/git.go @@ -5,6 +5,7 @@ import ( "os/exec" "strings" + "github.com/zzet/gortex/internal/churn" "github.com/zzet/gortex/internal/indexer" ) @@ -50,3 +51,12 @@ func gitBranch(dir string) string { func canonicalRepo(dir string) string { return indexer.ResolveWorktree(dir).MainRepoPath } + +// gitDefaultBranch returns the repository's default branch as a +// rev-parseable reference. Thin wrapper over churn.DefaultBranch so +// the CLI, daemon controller, and MCP tool resolve the same branch +// the same way. +func gitDefaultBranch(dir string) string { + return churn.DefaultBranch(dir) +} + diff --git a/cmd/gortex/githook.go b/cmd/gortex/githook.go index 58531dca..26ba9da1 100644 --- a/cmd/gortex/githook.go +++ b/cmd/gortex/githook.go @@ -4,6 +4,7 @@ import ( "fmt" "os" "path/filepath" + "strings" "github.com/spf13/cobra" @@ -11,20 +12,24 @@ import ( ) var ( - githookRegenMermaid bool - githookRegenWiki bool - githookRegenDocs bool - githookMermaidOutDir string - githookWikiOutDir string - githookDocsOutPath string - githookBinary string + githookRegenMermaid bool + githookRegenWiki bool + githookRegenDocs bool + githookRegenChurn bool + githookRegenReleases bool + githookMermaidOutDir string + githookWikiOutDir string + githookDocsOutPath string + githookChurnBranch string + githookReleasesBranch string + githookBinary string ) var githookCmd = &cobra.Command{ Use: "githook", Short: "Manage local git hooks that regenerate gortex artefacts", - Long: `Install, uninstall, and inspect the post-commit hook that re-runs -gortex commands after each commit. + Long: `Install, uninstall, and inspect git hooks that re-run gortex +commands. Supported hooks: post-commit, post-merge. 
The hook is idempotent: re-running install replaces only the gortex block, leaving any other hook content intact. Uninstall removes the @@ -33,7 +38,7 @@ block and deletes the hook file when it contains nothing else.`, var githookInstallCmd = &cobra.Command{ Use: "install ", - Short: "Install a git hook (currently: post-commit)", + Short: "Install a git hook (post-commit or post-merge)", Args: cobra.ExactArgs(1), RunE: runGithookInstall, } @@ -46,8 +51,9 @@ var githookUninstallCmd = &cobra.Command{ } var githookStatusCmd = &cobra.Command{ - Use: "status", - Short: "Report whether the post-commit hook is gortex-managed", + Use: "status [hook]", + Short: "Report whether the named hook is gortex-managed (default: post-commit)", + Args: cobra.MaximumNArgs(1), RunE: runGithookStatus, } @@ -58,6 +64,14 @@ func init() { "include `gortex wiki .` in the hook") githookInstallCmd.Flags().BoolVar(&githookRegenDocs, "regen-docs", false, "include `gortex docs . --out CHANGELOG_AUTO.md` in the hook") + githookInstallCmd.Flags().BoolVar(&githookRegenChurn, "regen-churn", false, + "include `gortex enrich churn` so get_churn_rate stays fresh without an at-read-time git subprocess") + githookInstallCmd.Flags().StringVar(&githookChurnBranch, "churn-branch", "", + "branch / tag / SHA the churn enricher pins to (default: resolve at hook run-time)") + githookInstallCmd.Flags().BoolVar(&githookRegenReleases, "regen-releases", false, + "include `gortex enrich releases` so analyze kind=releases reads pre-computed Meta") + githookInstallCmd.Flags().StringVar(&githookReleasesBranch, "releases-branch", "", + "branch / tag / SHA the releases enricher restricts to (default: resolve at hook run-time)") githookInstallCmd.Flags().StringVar(&githookMermaidOutDir, "mermaid-out-dir", "docs/architecture/", "output directory for mermaid diagrams") githookInstallCmd.Flags().StringVar(&githookWikiOutDir, "wiki-out-dir", "wiki", @@ -73,48 +87,62 @@ func init() { rootCmd.AddCommand(githookCmd) } +// 
supportedHook validates the hook arg. We mirror the package-level +// SupportedHooks list rather than importing it so the CLI surface +// stays decoupled from the install package's internals. +func supportedHook(name string) error { + if name == "post-commit" || name == "post-merge" { + return nil + } + return fmt.Errorf("unsupported hook %q (supported: post-commit, post-merge)", name) +} + func runGithookInstall(cmd *cobra.Command, args []string) error { hook := args[0] - if hook != "post-commit" { - return fmt.Errorf("only the post-commit hook is supported (got %q)", hook) + if err := supportedHook(hook); err != nil { + return err } repoRoot, err := resolveGithookRepoRoot() if err != nil { return err } - if !githookRegenMermaid && !githookRegenWiki && !githookRegenDocs { + if !githookRegenMermaid && !githookRegenWiki && !githookRegenDocs && !githookRegenChurn && !githookRegenReleases { // Default to mermaid when nothing was chosen — minimum // useful behaviour. githookRegenMermaid = true } - path, err := githooks.InstallPostCommit(repoRoot, githooks.InstallOpts{ - Binary: githookBinary, - RegenMermaid: githookRegenMermaid, - RegenWiki: githookRegenWiki, - RegenDocs: githookRegenDocs, - MermaidOutDir: githookMermaidOutDir, - WikiOutDir: githookWikiOutDir, - DocsOutPath: githookDocsOutPath, + path, err := githooks.InstallHook(repoRoot, hook, githooks.InstallOpts{ + Binary: githookBinary, + RegenMermaid: githookRegenMermaid, + RegenWiki: githookRegenWiki, + RegenDocs: githookRegenDocs, + RegenChurn: githookRegenChurn, + ChurnBranch: githookChurnBranch, + RegenReleases: githookRegenReleases, + ReleasesBranch: githookReleasesBranch, + MermaidOutDir: githookMermaidOutDir, + WikiOutDir: githookWikiOutDir, + DocsOutPath: githookDocsOutPath, }) if err != nil { return err } _, _ = fmt.Fprintf(cmd.OutOrStdout(), - "installed post-commit hook at %s\nactions: mermaid=%t wiki=%t docs=%t\n", - path, githookRegenMermaid, githookRegenWiki, githookRegenDocs) + "installed %s hook 
at %s\nactions: mermaid=%t wiki=%t docs=%t churn=%t releases=%t\n", + hook, path, githookRegenMermaid, githookRegenWiki, githookRegenDocs, githookRegenChurn, githookRegenReleases) return nil } func runGithookUninstall(cmd *cobra.Command, args []string) error { hook := args[0] - if hook != "post-commit" { - return fmt.Errorf("only the post-commit hook is supported (got %q)", hook) + if err := supportedHook(hook); err != nil { + return err } repoRoot, err := resolveGithookRepoRoot() if err != nil { return err } - path, removed, err := githooks.UninstallPostCommit(repoRoot) + path, removed, err := githooks.UninstallHook(repoRoot, hook) if err != nil { return err } @@ -126,19 +154,40 @@ func runGithookUninstall(cmd *cobra.Command, args []string) error { return nil } -func runGithookStatus(cmd *cobra.Command, _ []string) error { +func runGithookStatus(cmd *cobra.Command, args []string) error { + hook := "post-commit" + if len(args) > 0 { + if err := supportedHook(args[0]); err != nil { + return err + } + hook = args[0] + } repoRoot, err := resolveGithookRepoRoot() if err != nil { return err } - rep, err := githooks.Status(repoRoot) + hookPath, err := githooks.HookPathFor(repoRoot, hook) if err != nil { return err } + // Read directly; Status() is post-commit-locked and we want per-hook + // detail. Mirrors Status() but parameterised on hook. 
+ body, ferr := os.ReadFile(hookPath) + exists := ferr == nil + managed := false + if exists { + bs := string(body) + begin := "# gortex-managed:" + hook + ":begin" + end := "# gortex-managed:" + hook + ":end" + if strings.Contains(bs, begin) && strings.Contains(bs, end) { + managed = true + } + } out := cmd.OutOrStdout() - _, _ = fmt.Fprintf(out, "hook_path: %s\n", rep.HookPath) - _, _ = fmt.Fprintf(out, "exists: %t\n", rep.Exists) - _, _ = fmt.Fprintf(out, "managed: %t\n", rep.Managed) + _, _ = fmt.Fprintf(out, "hook: %s\n", hook) + _, _ = fmt.Fprintf(out, "hook_path: %s\n", hookPath) + _, _ = fmt.Fprintf(out, "exists: %t\n", exists) + _, _ = fmt.Fprintf(out, "managed: %t\n", managed) return nil } diff --git a/cmd/gortex/init.go b/cmd/gortex/init.go index 07ad692f..2385d7d9 100644 --- a/cmd/gortex/init.go +++ b/cmd/gortex/init.go @@ -421,7 +421,7 @@ func ensureProjectMarker(root string, w io.Writer) error { return nil } -// ensureGlobalConfig adds this repo to ~/.config/gortex/config.yaml +// ensureGlobalConfig adds this repo to ~/.gortex/config.yaml // so the daemon picks it up on its next restart. Skipped in --dry-run. func ensureGlobalConfig(root string) error { gc, err := config.LoadGlobal() diff --git a/cmd/gortex/init_global.go b/cmd/gortex/init_global.go index 56331683..276abcc7 100644 --- a/cmd/gortex/init_global.go +++ b/cmd/gortex/init_global.go @@ -15,7 +15,7 @@ import ( // daemon config). They don't fit the Adapter interface because they // touch the daemon's RPC protocol, not on-disk agent config. -// ensureGlobalConfigExists creates an empty ~/.config/gortex/config.yaml +// ensureGlobalConfigExists creates an empty ~/.gortex/config.yaml // when none is present. The daemon needs a writable path on first // Track; creating it now surfaces any permission problems at install // time instead of on the first use. 
diff --git a/cmd/gortex/mcp.go b/cmd/gortex/mcp.go index 95513ae3..e40b1f78 100644 --- a/cmd/gortex/mcp.go +++ b/cmd/gortex/mcp.go @@ -74,7 +74,7 @@ func init() { mcpCmd.Flags().StringVar(&mcpCORSOrigin, "cors-origin", "*", "allowed CORS origin for server API") mcpCmd.Flags().StringSliceVar(&mcpTrack, "track", nil, "additional repository paths to track") mcpCmd.Flags().StringVar(&mcpProject, "project", "", "active project name") - mcpCmd.Flags().StringVar(&mcpCacheDir, "cache-dir", "", "graph cache directory (default ~/.cache/gortex/)") + mcpCmd.Flags().StringVar(&mcpCacheDir, "cache-dir", "", "graph cache directory (default ~/.gortex/cache/)") mcpCmd.Flags().BoolVar(&mcpNoCache, "no-cache", false, "disable graph caching") mcpCmd.Flags().BoolVar(&mcpEmbeddings, "embeddings", false, "enable semantic search (built-in word vectors or transformer if compiled in)") mcpCmd.Flags().StringVar(&mcpEmbeddingsURL, "embeddings-url", "", "embedding API URL (e.g. http://localhost:11434 for Ollama)") @@ -410,17 +410,27 @@ func runMCP(cmd *cobra.Command, args []string) error { srv.SetLSPDiagnosticsBroadcasting() } + // Resolve the side-store cache dir. When --cache-dir is unset, fall + // back to the shared cache dir so notes / memories / notebooks still + // persist via the sidecar DB (the side-stores are independent of the + // graph backend, so they persist even under --backend memory). + sideStoreCacheDir := mcpCacheDir + if sideStoreCacheDir == "" { + sideStoreCacheDir = platform.CacheDir() + } + // Initialize feedback persistence for cross-session context learning. srv.InitFeedback(mcpCacheDir, mcpIndex) // Notes: per-repo session memory store backing save_note / - // query_notes / distill_session. Persisted alongside feedback so - // notes survive daemon restarts and compactions. - srv.InitNotes(mcpCacheDir, mcpIndex) + // query_notes / distill_session. Persisted in the sidecar DB so + // notes survive daemon restarts and compactions, independent of the + // graph backend. 
+ srv.InitNotes(sideStoreCacheDir, mcpIndex) // Memories: cross-session development-memory store backing // store_memory / query_memories / surface_memories. Shares the - // per-repo cache directory with notes; entries are workspace-wide - // and durable across sessions, compounding team knowledge. - srv.InitMemories(mcpCacheDir, mcpIndex) + // sidecar DB with notes; entries are workspace-wide and durable + // across sessions, compounding team knowledge. + srv.InitMemories(sideStoreCacheDir, mcpIndex) // Notebook: repository-local persistent notebook at // /.gortex/notebook/. Entries are committed alongside the // repo so they're visible in PR reviews and travel with the @@ -435,7 +445,7 @@ func runMCP(cmd *cobra.Command, args []string) error { srv.InitFrecency(mcpCacheDir, mcpIndex, gortexmcp.ModeAI) // Initialize cumulative token-savings persistence. Path defaults to - // ~/.cache/gortex/savings.json; the store operates in-memory when the + // ~/.gortex/cache/savings.json; the store operates in-memory when the // cache dir is unavailable. savingsPath := savings.DefaultPath() if mcpCacheDir != "" { @@ -451,7 +461,7 @@ func runMCP(cmd *cobra.Command, args []string) error { } // LLM service — same wiring as the daemon path: repo config wins - // per non-zero field, global ~/.config/gortex/config.yaml fills the + // per non-zero field, global ~/.gortex/config.yaml fills the // rest, env vars override last inside SetupLLM. The active provider // is chosen by `llm.provider` (local / anthropic / openai / ollama / // claudecli / gemini / bedrock / deepseek). diff --git a/cmd/gortex/repos_cmd.go b/cmd/gortex/repos_cmd.go index aede59d4..1e7fed7b 100644 --- a/cmd/gortex/repos_cmd.go +++ b/cmd/gortex/repos_cmd.go @@ -21,7 +21,7 @@ var reposJSON bool // reposCacheDir is the persistence-store directory `gortex repos` // inspects for index freshness. 
Empty resolves to the default -// (~/.cache/gortex/) — the same slot `gortex server` / `gortex mcp` +// (~/.gortex/cache/) — the same slot `gortex server` / `gortex mcp` // persist to. Overridable so tests can point at a temp store. var reposCacheDir string @@ -29,7 +29,7 @@ var reposCmd = &cobra.Command{ Use: "repos", Short: "List every tracked repository with its git head and index freshness", Long: `Lists the repositories registered in the global config -(~/.config/gortex/config.yaml). +(~/.gortex/config.yaml). For each repo the command reports the current git HEAD commit and an index-freshness indicator: when the persisted index was last built and @@ -88,7 +88,7 @@ func runRepos(cmd *cobra.Command, _ []string) error { // The persistence store is read-only here — we only inspect what // `gortex server` / `gortex mcp` already persisted. An empty - // cache dir resolves to the default (~/.cache/gortex/), the same + // cache dir resolves to the default (~/.gortex/cache/), the same // slot those commands write to. store, err := persistence.NewFileStore(reposCacheDir, version) if err != nil { diff --git a/cmd/gortex/root.go b/cmd/gortex/root.go index 57015342..57784a40 100644 --- a/cmd/gortex/root.go +++ b/cmd/gortex/root.go @@ -1,9 +1,11 @@ package main import ( + "fmt" "os" "github.com/spf13/cobra" + "github.com/zzet/gortex/internal/platform" "go.uber.org/zap" "go.uber.org/zap/zapcore" ) @@ -17,6 +19,17 @@ var ( var rootCmd = &cobra.Command{ Use: "gortex", Short: "Code intelligence engine — indexes repos into a queryable knowledge graph", + // Runs before every subcommand (cobra walks to the nearest + // PersistentPreRun; no subcommand defines its own). Fold any state + // left by older versions in the split ~/.config / ~/.cache / flat + // ~/.gortex layout into the unified ~/.gortex tree before a command + // opens the store or reads config. Best-effort + idempotent, so it's + // cheap on every run and silent after the first. 
+ PersistentPreRun: func(cmd *cobra.Command, args []string) { + platform.MigrateToUnifiedHome(func(format string, a ...any) { + fmt.Fprintf(os.Stderr, format+"\n", a...) + }) + }, } func init() { diff --git a/cmd/gortex/savings.go b/cmd/gortex/savings.go index 69a27c72..6ccbf015 100644 --- a/cmd/gortex/savings.go +++ b/cmd/gortex/savings.go @@ -36,8 +36,8 @@ the tokens avoided (priced against popular models). Savings accumulate every time a source-reading MCP tool (get_symbol_source, batch_symbols, smart_context) returns a symbol or compressed view instead of -a full-file read. Cumulative totals live at ~/.cache/gortex/savings.json and -per-call events at the sibling ~/.cache/gortex/savings.jsonl — Today / 7-day +a full-file read. Cumulative totals live at ~/.gortex/cache/savings.json and +per-call events at the sibling ~/.gortex/cache/savings.jsonl — Today / 7-day buckets come from the JSONL log, All time from the cumulative file. Override the cache dir with --cache-dir, override pricing by exporting diff --git a/cmd/gortex/server.go b/cmd/gortex/server.go index 2ca2cdb1..67167845 100644 --- a/cmd/gortex/server.go +++ b/cmd/gortex/server.go @@ -1,6 +1,7 @@ package main import ( + "context" "fmt" "net" "net/http" @@ -16,7 +17,6 @@ import ( "github.com/zzet/gortex/internal/config" "github.com/zzet/gortex/internal/contracts" "github.com/zzet/gortex/internal/daemon" - "github.com/zzet/gortex/internal/graph" "github.com/zzet/gortex/internal/indexer" gortexmcp "github.com/zzet/gortex/internal/mcp" "github.com/zzet/gortex/internal/mcp/streamable" @@ -24,6 +24,7 @@ import ( "github.com/zzet/gortex/internal/parser/languages" "github.com/zzet/gortex/internal/persistence" "github.com/zzet/gortex/internal/platform" + "github.com/zzet/gortex/internal/progress" "github.com/zzet/gortex/internal/query" "github.com/zzet/gortex/internal/semantic" "github.com/zzet/gortex/internal/semantic/goanalysis" @@ -66,7 +67,10 @@ var ( // the in-memory graph before the HTTP listener accepts 
traffic. // Used by gortex-cloud's per-workspace supervisor to boot a // hosted gortex server from R2/Hetzner-OS-cached state. - serverSnapshot string + serverSnapshot string + serverBackend string + serverBackendPath string + serverBackendBufferPoolMB uint64 ) var serverCmd = &cobra.Command{ @@ -87,7 +91,7 @@ func init() { serverCmd.Flags().StringVar(&serverProject, "project", "", "active project name (GlobalConfig group of repos)") serverCmd.Flags().StringVar(&serverWorkspace, "workspace", "", "workspace slug — restricts BOTH indexing and queries to repos whose resolved workspace matches (RepoEntry override → .gortex.yaml::workspace → repo prefix). Empty means all workspaces.") serverCmd.Flags().StringVar(&serverScopeProject, "scope-project", "", "project slug — narrows further inside --workspace (also gates indexing). No effect without --workspace.") - serverCmd.Flags().StringVar(&serverCacheDir, "cache-dir", "", "graph cache directory (default ~/.cache/gortex/)") + serverCmd.Flags().StringVar(&serverCacheDir, "cache-dir", "", "graph cache directory (default ~/.gortex/cache/)") serverCmd.Flags().BoolVar(&serverNoCache, "no-cache", false, "disable graph caching") serverCmd.Flags().BoolVar(&serverEmbeddings, "embeddings", false, "enable semantic search") serverCmd.Flags().StringVar(&serverEmbeddingsURL, "embeddings-url", "", "embedding API URL (e.g. http://localhost:11434 for Ollama)") @@ -96,6 +100,10 @@ func init() { serverCmd.Flags().BoolVar(&serverNoSemantic, "no-semantic", false, "disable semantic enrichment") serverCmd.Flags().StringVar(&serverSemanticMode, "semantic-mode", "typecheck", "Go analysis mode: typecheck or callgraph") serverCmd.Flags().StringVar(&serverSnapshot, "snapshot", "", "load a snapshot file at startup (gob+gzip; the format `gortex index --snapshot` writes). 
Used by gortex-cloud's per-workspace supervisor to boot from a precomputed snapshot.") + serverCmd.Flags().StringVar(&serverBackend, "backend", "memory", "storage backend: memory (in-process, default — fastest, no persistence) | sqlite (pure-Go embedded SQL — persists to --backend-path, cold-loads from disk)") + serverCmd.Flags().Uint64Var(&serverBackendBufferPoolMB, "backend-buffer-pool-mb", 0, + "advisory page-cache cap (MiB) for on-disk backends. 0 lets the backend choose its own default; backends that manage their own cache (e.g. sqlite) ignore it") + serverCmd.Flags().StringVar(&serverBackendPath, "backend-path", "", "directory where the on-disk backend persists its store. Required when --backend != memory. Default: ~/.gortex/store/.store") rootCmd.AddCommand(serverCmd) } @@ -137,7 +145,11 @@ func runServer(cmd *cobra.Command, _ []string) error { } // Build graph/parser/indexer/query/MCP stack. - g := graph.New() + g, backendCleanup, err := openBackend(serverBackend, serverBackendPath, serverBackendBufferPoolMB, logger) + if err != nil { + return fmt.Errorf("opening backend %q: %w", serverBackend, err) + } + defer backendCleanup() reg := parser.NewRegistry() languages.RegisterAll(reg) languages.RegisterCustomGrammars(reg, cfg.Index.Grammars, logger) @@ -321,7 +333,7 @@ func runServer(cmd *cobra.Command, _ []string) error { } // Multi-repo support. - cm, err := config.NewConfigManager("") + cm, err := config.NewConfigManager(cfgFile) if err != nil { fmt.Fprintf(os.Stderr, "[gortex] warning: could not load global config: %v\n", err) } @@ -415,11 +427,23 @@ func runServer(cmd *cobra.Command, _ []string) error { srv.SetLSPDiagnosticsBroadcasting() } - // Create persistence store. + // Create persistence store. The snapshot cache exists for the + // in-memory backend, where heap state is lost on restart — load + // from snapshot skips the parse phase on a warm restart. 
For an + // on-disk backend (sqlite) the store IS already persistent + // across restarts: re-opening the same path hands back the + // previous run's graph, and replaying a snapshot via per-row + // g.AddNode would just re-write everything we already have. Skip + // the cache entirely on those backends. var store persistence.Store - if serverNoCache { + persistentBackend := !strings.EqualFold(strings.TrimSpace(serverBackend), "memory") && strings.TrimSpace(serverBackend) != "" + switch { + case serverNoCache: store = persistence.NopStore{} - } else { + case persistentBackend: + fmt.Fprintf(os.Stderr, "[gortex] server: snapshot cache disabled (backend=%s persists across restarts)\n", serverBackend) + store = persistence.NopStore{} + default: var err error store, err = persistence.NewFileStore(serverCacheDir, version) if err != nil { @@ -587,9 +611,35 @@ func runServer(cmd *cobra.Command, _ []string) error { // Background: index, multi-repo, analyze — graph populates while HTTP is live. go func() { - // When MultiIndexer is available (global config has repos), use it exclusively. - // Single --index flag is only used when no multi-repo config exists. - if mi != nil { + // Live progress logging — the daemon runs without a TTY so + // the Spinner reporter is silent. Hook a zap-logging reporter + // + a graph-size heartbeat so the log shows what's happening. + hbCtx, hbCancel := context.WithCancel(context.Background()) + defer hbCancel() + progress.StartHeartbeat(hbCtx, logger, "indexing", 5*time.Second, func() map[string]any { + // idx.Graph() follows the indexer's active store — + // during cold-start the indexer swaps to an in-memory + // shadow, so reading via idx.Graph() shows the live + // growing count. g.NodeCount() would always read the + // disk store and stay at 0 until FlushBulk drains. 
+ cur := idx.Graph() + if cur == nil { + cur = g + } + return map[string]any{ + "nodes": cur.NodeCount(), + "edges": cur.EdgeCount(), + "disk_nodes": g.NodeCount(), + "disk_edges": g.EdgeCount(), + } + }) + // When the active config has repos AND no explicit --index was + // requested, use MultiIndexer (it handles the per-repo flow). + // When --index is set the user wants single-repo behaviour, + // even when a multi-repo config exists — bypass MultiIndexer. + hasActiveRepos := cm != nil && len(cm.ActiveRepos()) > 0 + useMulti := mi != nil && hasActiveRepos && serverIndex == "" + if useMulti { if serverWorkspace != "" || serverScopeProject != "" { fmt.Fprintf(os.Stderr, "[gortex] server: multi-repo indexing (scope: workspace=%q project=%q)...\n", serverWorkspace, serverScopeProject) } else { @@ -707,7 +757,7 @@ func isLocalhostBind(bind string) bool { // resolveServerID loads or creates the per-machine server id. When // cacheDir is empty the id lives alongside other gortex cache files -// (~/.cache/gortex/server.id); otherwise cacheDir/server.id. +// (~/.gortex/cache/server.id); otherwise cacheDir/server.id. func resolveServerID(cacheDir string) (string, error) { path := filepath.Join(cacheDir, "server.id") if cacheDir == "" { diff --git a/cmd/gortex/workspace_cmd.go b/cmd/gortex/workspace_cmd.go index 36f7b34e..42193f1a 100644 --- a/cmd/gortex/workspace_cmd.go +++ b/cmd/gortex/workspace_cmd.go @@ -55,7 +55,7 @@ the cwd. Project defaults to the workspace slug when omitted. Without --global the value is written to the repo's .gortex.yaml. With --global the value is written to -~/.config/gortex/config.yaml (your user-level config), which is +~/.gortex/config.yaml (your user-level config), which is the right choice for OSS / read-only repos where you don't want to leave any artifact in the repo. 
Global overrides win over .gortex.yaml at resolution time, so you can also use --global to @@ -80,7 +80,7 @@ By default the command prints the planned changes and asks for confirmation. Pass --yes to skip the prompt (CI / scripted use). --root restricts the bulk update to repos under that prefix (e.g. only your "work" repos). --global writes to -~/.config/gortex/config.yaml instead of touching each repo's +~/.gortex/config.yaml instead of touching each repo's .gortex.yaml — the OSS-friendly path.`, Args: cobra.ExactArgs(1), RunE: runWorkspaceSetAll, @@ -91,14 +91,14 @@ func init() { workspaceCmd.AddCommand(workspaceSetCmd) workspaceCmd.AddCommand(workspaceSetAllCmd) workspaceListCmd.Flags().BoolVar(&workspaceListJSON, "json", false, "emit machine-readable JSON instead of a table") - workspaceSetCmd.Flags().BoolVar(&workspaceSetGlobal, "global", false, "write to ~/.config/gortex/config.yaml instead of the repo's .gortex.yaml (OSS-friendly)") + workspaceSetCmd.Flags().BoolVar(&workspaceSetGlobal, "global", false, "write to ~/.gortex/config.yaml instead of the repo's .gortex.yaml (OSS-friendly)") workspaceSetAllCmd.Flags().BoolVarP(&workspaceSetAll, "yes", "y", false, "skip interactive confirmation") workspaceSetAllCmd.Flags().StringVar(&workspaceSetRoot, "root", "", "only stamp repos whose path starts with this prefix") - workspaceSetAllCmd.Flags().BoolVar(&workspaceSetGlobal, "global", false, "write to ~/.config/gortex/config.yaml instead of each repo's .gortex.yaml") + workspaceSetAllCmd.Flags().BoolVar(&workspaceSetGlobal, "global", false, "write to ~/.gortex/config.yaml instead of each repo's .gortex.yaml") rootCmd.AddCommand(workspaceCmd) } -// loadGlobalRepos reads the global config (~/.config/gortex/config.yaml +// loadGlobalRepos reads the global config (~/.gortex/config.yaml // by default, or whatever --config points at) and returns the tracked // repo entries. 
Failure to read the config returns an error rather // than an empty list — silently doing nothing on a typo'd config @@ -436,7 +436,7 @@ func stampWorkspace(repoPath, workspace, project string) error { } // stampWorkspaceGlobal writes the workspace/project override onto -// the matching RepoEntry in `~/.config/gortex/config.yaml`. Returns +// the matching RepoEntry in `~/.gortex/config.yaml`. Returns // the path of the file modified for the user-facing message. Used // when the user passes --global — the OSS-friendly path that // leaves no trace in the repo itself. diff --git a/docs/architecture.md b/docs/architecture.md index fe87b813..8ab86879 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -54,7 +54,7 @@ gortex binary Gortex snapshots the graph to disk on shutdown and restores it on startup, with incremental re-indexing of only changed files: ```bash -# Default cache directory: ~/.cache/gortex/ +# Default cache directory: ~/.gortex/cache/ gortex mcp --index /path/to/repo # Custom cache directory diff --git a/docs/landing-pages/per-tool-savings.md b/docs/landing-pages/per-tool-savings.md index 3882a1f1..b5d88a35 100644 --- a/docs/landing-pages/per-tool-savings.md +++ b/docs/landing-pages/per-tool-savings.md @@ -2,7 +2,7 @@ **Last regenerated**: 2026-05-18T22:20:29Z · Source: `gortex savings --verbose --json` against the operator's cumulative store -(`~/.cache/gortex/savings.json` + `~/.cache/gortex/savings.jsonl`). +(`~/.gortex/cache/savings.json` + `~/.gortex/cache/savings.jsonl`). ## Headline diff --git a/docs/llm.md b/docs/llm.md index 3be3df18..42d8ad83 100644 --- a/docs/llm.md +++ b/docs/llm.md @@ -23,10 +23,10 @@ The backend is chosen by the `llm.provider` key. Eight of the nine providers are ## Configuration -The `llm:` block goes in `~/.config/gortex/config.yaml` or a per-repo `.gortex.yaml` (repo-local wins per field, global fills the rest). 
Configure only the provider you use: +The `llm:` block goes in `~/.gortex/config.yaml` or a per-repo `.gortex.yaml` (repo-local wins per field, global fills the rest). Configure only the provider you use: ```yaml -# ~/.config/gortex/config.yaml (or per-repo .gortex.yaml) +# ~/.gortex/config.yaml (or per-repo .gortex.yaml) llm: provider: local # local | anthropic | openai | ollama | claudecli | codex | gemini | bedrock | deepseek max_steps: 16 # agent tool-loop cap (provider-agnostic) diff --git a/docs/multi-repo.md b/docs/multi-repo.md index 929a3ed8..a4291f99 100644 --- a/docs/multi-repo.md +++ b/docs/multi-repo.md @@ -8,7 +8,7 @@ Every node and contract is keyed on a **workspace slug**, which is the hard grap Slug resolution precedence (first match wins): -1. `RepoEntry.workspace` in `~/.config/gortex/config.yaml` — overrides everything, ideal for OSS / read-only repos where you don't want to leave an artifact in the tree +1. `RepoEntry.workspace` in `~/.gortex/config.yaml` — overrides everything, ideal for OSS / read-only repos where you don't want to leave an artifact in the tree 2. `workspace:` in the repo's own `.gortex.yaml` — the default for first-party repos 3. The repo prefix — fallback when neither is set, so each unconfigured repo gets its own isolated workspace @@ -18,13 +18,13 @@ The same chain applies to the optional `project:` slug (a sub-bucket inside a wo Two-tier config hierarchy: -- **Global config** (`~/.config/gortex/config.yaml`) — projects, repo lists, active project, reference tags +- **Global config** (`~/.gortex/config.yaml`) — projects, repo lists, active project, reference tags - **Workspace config** (`.gortex.yaml` per repo) — guards, excludes, local overrides Excludes are layered — builtin → repo's own `.gitignore` → global → per-repo entry → workspace — with gitignore semantics. 
The repo's `.gitignore` is respected by default so you don't have to re-declare entries already curated for git; opt out per-workspace with `respect_gitignore: false` in `.gortex.yaml`. Use `!pattern` in a later layer to re-include something an earlier layer excluded. Beyond `.gitignore`, the index walk also honors per-directory `.gortexignore` files (Gortex's own ignore file, a sibling to `.gitignore`) and ripgrep's `.ignore` / `.rgignore` — each scoped to the directory that contains it. ```yaml -# ~/.config/gortex/config.yaml +# ~/.gortex/config.yaml active_project: my-saas exclude: # Applies to every tracked repo @@ -58,7 +58,7 @@ projects: The daemon's defaults handle typical workflows without configuration. These knobs exist for monorepos, branch-heavy workflows, or filesystems without fsnotify support. ```yaml -# ~/.config/gortex/config.yaml (or per-repo .gortex.yaml) +# ~/.gortex/config.yaml (or per-repo .gortex.yaml) watch: debounce_ms: 150 # per-file patch debounce (default 150) @@ -94,13 +94,13 @@ gortex repos --json # Same, machine-readable (for scripts / CI) gortex workspace list # Show what each tracked repo currently declares gortex workspace list --json # Same, machine-readable gortex workspace set backend api # Write workspace=api to backend's .gortex.yaml -gortex workspace set upstream-lib api --global # OSS-friendly: pin to api in ~/.config/gortex/config.yaml +gortex workspace set upstream-lib api --global # OSS-friendly: pin to api in ~/.gortex/config.yaml gortex workspace set-all api --root ~/projects/work --yes # Bulk: stamp every tracked repo under a prefix # Manage the effective ignore list used by indexing + watching gortex config exclude list # Show all layers (builtin, global, repo entry, workspace) gortex config exclude add pkg/generated # Default target: workspace .gortex.yaml -gortex config exclude add '**/*.bak' --global # Write to ~/.config/gortex/config.yaml +gortex config exclude add '**/*.bak' --global # Write to 
~/.gortex/config.yaml gortex config exclude add testdata/ --repo backend # Write to a RepoEntry gortex config exclude remove pkg/generated # Remove from the same target ``` diff --git a/docs/onboarding.md b/docs/onboarding.md index e9ed547a..ef0fa092 100644 --- a/docs/onboarding.md +++ b/docs/onboarding.md @@ -91,7 +91,7 @@ Two ways — pick whichever fits your workflow. gortex mcp --index . --watch ``` -`--watch` re-indexes changed files live via fsnotify. `--cache-dir ~/.cache/gortex` (default) saves snapshots between restarts so subsequent starts are ~200ms instead of 3-5s. +`--watch` re-indexes changed files live via fsnotify. `--cache-dir ~/.gortex/cache` (default) saves snapshots between restarts so subsequent starts are ~200ms instead of 3-5s. To also get the HTTP server API (the UI is a separate Next.js app in `web/` that talks to it over HTTP): @@ -169,7 +169,7 @@ The index is empty. Either `gortex mcp` isn't watching the right directory, or ` First-time index of a 100k-symbol repo is ~20-30 seconds. On restart, it's ~200ms because the snapshot gets restored and only changed files re-index. Make sure `--cache-dir` isn't being deleted between runs. **Semantic search isn't working.** -On first use, Gortex downloads the MiniLM-L6-v2 model (~90 MB) to `~/.cache/gortex/models/`. Needs network the first time; after that, fully offline. Check `~/.cache/gortex/models/sentence-transformers_all-MiniLM-L6-v2/` exists. +On first use, Gortex downloads the MiniLM-L6-v2 model (~90 MB) to `~/.gortex/models/`. Needs network the first time; after that, fully offline. Check `~/.gortex/models/sentence-transformers_all-MiniLM-L6-v2/` exists. **"Cannot be opened because Apple cannot check it for malicious software" on macOS.** You bypassed the curl installer and downloaded the binary by hand — `curl -fsSL https://get.gortex.dev | sh` strips the quarantine xattr automatically (and on macOS routes through Homebrew when `brew` is on PATH). 
To fix an existing manual install, re-run the installer, reinstall via Homebrew (`brew install zzet/tap/gortex`), or run once: `xattr -d com.apple.quarantine /usr/local/bin/gortex`. @@ -239,9 +239,9 @@ On macOS the unit lands at `~/Library/LaunchAgents/com.zzet.gortex.plist`; on Li - `gortex mcp` (what Claude Code spawns via `.mcp.json`) auto-detects the daemon. If reachable, it acts as a thin stdio ↔ socket proxy (~5 MB per client). If not, it falls back to the embedded server — global mode is never "required." - Every tracked repo gets its own fsnotify watcher so edits on disk flow into the graph live; no manual reload needed. `gortex track` attaches a watcher as part of the track operation; `gortex untrack` detaches it before evicting nodes. -- Graph state is snapshotted to `~/.cache/gortex/daemon.gob.gz` on shutdown and every 10 minutes. Daemon restarts load it back and re-index only changed files. +- Graph state is snapshotted to `~/.gortex/cache/daemon.gob.gz` on shutdown and every 10 minutes. Daemon restarts load it back and re-index only changed files. - Opening Claude Code in an untracked directory returns a structured `repo_not_tracked` error on every tool call. The agent surfaces it; you run `gortex track .` to include it. -- Per-session state is isolated by a handshake-assigned session ID — two Claude Code windows see their own recent-activity and token-savings counters, not a merged view. Cumulative savings in `~/.cache/gortex/savings.json` are still shared. +- Per-session state is isolated by a handshake-assigned session ID — two Claude Code windows see their own recent-activity and token-savings counters, not a merged view. Cumulative savings in `~/.gortex/cache/savings.json` are still shared. 
### Fallback rules @@ -279,7 +279,7 @@ gortex workspace set backend my-saas # write workspace=my gortex workspace set-all my-saas --root ~/work --yes # bulk-stamp every repo under ~/work ``` -For OSS / read-only repos where you don't want a `.gortex.yaml` artifact in the tree, pass `--global` to record the slug in `~/.config/gortex/config.yaml` instead. +For OSS / read-only repos where you don't want a `.gortex.yaml` artifact in the tree, pass `--global` to record the slug in `~/.gortex/config.yaml` instead. ### Projects (optional sub-buckets) and active scope diff --git a/docs/savings.md b/docs/savings.md index 2e4a2971..442b8a2e 100644 --- a/docs/savings.md +++ b/docs/savings.md @@ -4,7 +4,7 @@ Gortex tracks how many tokens it saves compared to naive file reads — per-call - **Per-call:** `get_symbol_source` and other source-reading tools include a `tokens_saved` field in the response, showing the difference between reading the full file vs the targeted symbol. - **Session-level:** `graph_stats` returns a `token_savings` object with `calls_counted`, `tokens_returned`, `tokens_saved`, `efficiency_ratio`. -- **Cumulative (cross-session):** `graph_stats` also returns `cumulative_savings` when persistence is wired — includes `first_seen`, `last_updated`, and `cost_avoided_usd` per model (Claude Opus/Sonnet/Haiku, GPT-4o, GPT-4o-mini). Backed by `~/.cache/gortex/savings.json` (top-line totals + per-repo + per-language) and a sibling `~/.cache/gortex/savings.jsonl` event log (one line per call) used to render the windowed buckets and the per-tool breakdown. +- **Cumulative (cross-session):** `graph_stats` also returns `cumulative_savings` when persistence is wired — includes `first_seen`, `last_updated`, and `cost_avoided_usd` per model (Claude Opus/Sonnet/Haiku, GPT-4o, GPT-4o-mini). 
Backed by `~/.gortex/cache/savings.json` (top-line totals + per-repo + per-language) and a sibling `~/.gortex/cache/savings.jsonl` event log (one line per call) used to render the windowed buckets and the per-tool breakdown. `gortex savings` renders a three-bucket dashboard: diff --git a/docs/semantic-search.md b/docs/semantic-search.md index 15d00806..e98534d6 100644 --- a/docs/semantic-search.md +++ b/docs/semantic-search.md @@ -20,7 +20,7 @@ embedding: | Provider | Quality | Offline | Native deps | Notes | |---|---|---|---|---| | `static` (default) | Good for identifier-shaped queries | Yes | None | Baked GloVe-50d table, CPU-only, zero setup | -| `local` (Hugot MiniLM-L6-v2) | Better for NL queries | After first run | None | Auto-downloads ~90 MB to `~/.cache/gortex/models/` | +| `local` (Hugot MiniLM-L6-v2) | Better for NL queries | After first run | None | Auto-downloads ~90 MB to `~/.gortex/models/` | | `api` (Ollama / OpenAI) | Best | No | None | Bounded concurrent worker pool — tune via `api_concurrency` | ## AST sub-chunking diff --git a/go.mod b/go.mod index 7436767c..1e71ab9a 100644 --- a/go.mod +++ b/go.mod @@ -275,6 +275,7 @@ require ( golang.org/x/text v0.37.0 golang.org/x/tools v0.45.0 gopkg.in/yaml.v3 v3.0.1 + modernc.org/sqlite v1.51.0 pgregory.net/rapid v1.2.0 ) @@ -338,8 +339,10 @@ require ( github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect github.com/muesli/cancelreader v0.2.2 // indirect github.com/muesli/termenv v0.16.0 // indirect + github.com/ncruces/go-strftime v1.0.0 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/sagikazarmark/locafero v0.12.0 // indirect github.com/sahilm/fuzzy v0.1.2 // indirect @@ -354,7 +357,8 @@ require ( github.com/x448/float16 v0.8.4 // indirect github.com/xo/terminfo 
v0.0.0-20220910002029-abceb7e1c41e // indirect github.com/yosida95/uritemplate/v3 v3.0.2 // indirect - go.etcd.io/bbolt v1.4.3 // indirect + github.com/zeebo/assert v1.3.0 // indirect + go.etcd.io/bbolt v1.4.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/crypto v0.52.0 // indirect @@ -364,6 +368,9 @@ require ( golang.org/x/sync v0.20.0 // indirect google.golang.org/protobuf v1.36.11 // indirect k8s.io/klog/v2 v2.140.0 // indirect + modernc.org/libc v1.72.3 // indirect + modernc.org/mathutil v1.7.1 // indirect + modernc.org/memory v1.11.0 // indirect ) replace github.com/tree-sitter/tree-sitter-elixir => github.com/elixir-lang/tree-sitter-elixir v0.3.5 diff --git a/go.sum b/go.sum index df168aa3..c2924ec1 100644 --- a/go.sum +++ b/go.sum @@ -448,8 +448,6 @@ github.com/blevesearch/bleve_index_api v1.3.11 h1:x29vbV8OjWfLcrDVd7Lr1q+BkLNS0J github.com/blevesearch/bleve_index_api v1.3.11/go.mod h1:xvd48t5XMeeioWQ5/jZvgLrV98flT2rdvEJ3l/ki4Ko= github.com/blevesearch/geo v0.2.5 h1:yJg9FX1oRwLnjXSXF+ECHfXFTF4diF02Ca/qUGVjJhE= github.com/blevesearch/geo v0.2.5/go.mod h1:Jhq7WE2K6mJTx1xS44M2pUO6Io+wjCSHh1+co3YOgH4= -github.com/blevesearch/go-faiss v1.1.1 h1:oUignystYUkdYBrVh6PkTkBlfCNql2QcS+fc0fTjtVQ= -github.com/blevesearch/go-faiss v1.1.1/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= github.com/blevesearch/go-faiss v1.1.2 h1:ojv2S7ot3orbk8wMfJWryq37G4eIL8Y8PLLZYd8ZLHY= github.com/blevesearch/go-faiss v1.1.2/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo= @@ -500,8 +498,6 @@ github.com/charmbracelet/x/exp/golden v0.0.0-20241011142426-46044092ad91 h1:payR github.com/charmbracelet/x/exp/golden v0.0.0-20241011142426-46044092ad91/go.mod h1:wDlXFlCrmJ8J+swcL/MnGUuYnqgQdW9rhSD61oNMb6U= github.com/charmbracelet/x/term v0.2.2 h1:xVRT/S2ZcKdhhOuSP4t5cLi5o+JxklsoEObBSgfgZRk= github.com/charmbracelet/x/term 
v0.2.2/go.mod h1:kF8CY5RddLWrsgVwpw4kAa6TESp6EB5y3uxGLeCqzAI= -github.com/chewxy/math32 v1.11.1 h1:b7PGHlp8KjylDoU8RrcEsRuGZhJuz8haxnKfuMMRqy8= -github.com/chewxy/math32 v1.11.1/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs= github.com/chewxy/math32 v1.11.2 h1:IufN08Zwr1NKuWfY+4Tz55BcwKmyKKNdOP7KtumehnM= github.com/chewxy/math32 v1.11.2/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs= github.com/clipperhouse/displaywidth v0.11.0 h1:lBc6kY44VFw+TDx4I8opi/EtL9m20WSEFgwIwO+UVM8= @@ -558,6 +554,8 @@ github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/jsonschema-go v0.4.3 h1:/DBOLZTfDow7pe2GmaJNhltueGTtDKICi8V8p+DQPd0= github.com/google/jsonschema-go v0.4.3/go.mod h1:r5quNTdLOYEz95Ru18zA0ydNbBuYoo9tgaYcxEYhJVE= +github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs= +github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gortexhq/gcx-go v0.1.0 h1:yUemJwpe8Xqf8u5Q5ADIztHVrGsGc050iMnuSXMxp0k= @@ -576,6 +574,8 @@ github.com/gortexhq/tree-sitter-sql v0.1.0 h1:RlhO40jz8Iq8tX7OtkdWoatvsRcyGvQ/uZ github.com/gortexhq/tree-sitter-sql v0.1.0/go.mod h1:16mo0LajNOlE5CL5F9RvXKByD9mckgaEPPe/ZY8OXRE= github.com/gortexhq/tree-sitter-swift v0.1.1-0.20260424235305-8dde3a3327dd h1:82S6uDIeYXz7D9M3slSz8X/XOLeSeo4Vg05pyeB5mp8= github.com/gortexhq/tree-sitter-swift v0.1.1-0.20260424235305-8dde3a3327dd/go.mod h1:Bpuob78uHdoBdIicliHC7bu2o/FW6TffFe9Yw4J3P9E= +github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= +github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= 
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/janpfeifer/go-benchmarks v0.1.1 h1:gLLy07/JrOKSnMWeUxSnjTdhkglgmrNR2IBDnR4kRqw= @@ -586,12 +586,8 @@ github.com/jedib0t/go-pretty/v6 v6.7.10 h1:B/2qW2Bkv2L6n14PP8o1kx75kWzHOQ3YTluWz github.com/jedib0t/go-pretty/v6 v6.7.10/go.mod h1:YwC5CE4fJ1HFUDeivSV1r//AmANFHyqczZk+U6BDALU= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/klauspost/cpuid/v2 v2.0.12 h1:p9dKCg8i4gmOxtv35DvrYoWqYzQrvEVdjQ762Y0OqZE= -github.com/klauspost/cpuid/v2 v2.0.12/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuObKfj5c0PQa7c= github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y= github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= -github.com/knights-analytics/hugot v0.7.2 h1:zDXXAa7c1d4VOcKbqiIVvkLLpzeqjc9K8BApnAQKcVc= -github.com/knights-analytics/hugot v0.7.2/go.mod h1:BQ9lXqBv6g0ykhpDfyxJ8I7/is+GxLl15JKPKBvrVAQ= github.com/knights-analytics/hugot v0.7.3 h1:39UqU52s4nAmNIE4JG5ViASCvd8dhue7XGtt5RhK3T4= github.com/knights-analytics/hugot v0.7.3/go.mod h1:86tRz/GzyoNFHuUUzgiYnALQNZU8Vzd5F0pApYizwrs= github.com/knights-analytics/ortgenai v0.3.1 h1:0Awe43Zu+giDxzlpoNvx9ekbez/zxc8XMzKU++sOUB8= @@ -627,6 +623,8 @@ github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELU github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo= github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc= github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk= +github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= +github.com/ncruces/go-strftime v1.0.0/go.mod 
h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= github.com/pelletier/go-toml/v2 v2.3.1 h1:MYEvvGnQjeNkRF1qUuGolNtNExTDwct51yp7olPtrEc= github.com/pelletier/go-toml/v2 v2.3.1/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= @@ -638,6 +636,8 @@ github.com/pkoukk/tiktoken-go-loader v0.0.2/go.mod h1:4mIkYyZooFlnenDlormIo6cd5w github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= @@ -653,8 +653,6 @@ github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 h1:KRzFb2m7YtdldCEkzs6KqmJw4nqEV github.com/santhosh-tekuri/jsonschema/v6 v6.0.2/go.mod h1:JXeL+ps8p7/KNMjDQk3TCwPpBy0wYklyWTfbkIzdIFU= github.com/schollz/progressbar/v3 v3.19.0 h1:Ea18xuIRQXLAUidVDox3AbwfUhD0/1IvohyTutOIFoc= github.com/schollz/progressbar/v3 v3.19.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec= -github.com/sgtdi/fswatcher v1.2.0 h1:uSJuMc3/Eo/vaPnZWpJ42EFYb5j38cZENmkszOV0yhw= -github.com/sgtdi/fswatcher v1.2.0/go.mod h1:smzXnaqu0SYJQNIwGLLkvRkpH4RdEACB7avMSsSaqjQ= github.com/sgtdi/fswatcher v1.3.0 h1:2tFEnBml5EipRF4TvUP0x+T4ty2OSYlmvcnQ6dSTp04= github.com/sgtdi/fswatcher v1.3.0/go.mod h1:I4FUeG0e27WFw+ogs5OjZSgPKobnGrUa17EwjRjZQaY= github.com/spf13/afero v1.15.0 
h1:b/YBCLWAJdFWJTN9cLhiXXcD7mzKn9Dm86dNnfyQw1I= @@ -739,14 +737,14 @@ github.com/yalue/onnxruntime_go v1.30.1 h1:NaEng5lWbsHZ/8X1dtaw1mIj7eV1ozyjbFo// github.com/yalue/onnxruntime_go v1.30.1/go.mod h1:b4X26A8pekNb1ACJ58wAXgNKeUCGEAQ9dmACut9Sm/4= github.com/yosida95/uritemplate/v3 v3.0.2 h1:Ed3Oyj9yrmi9087+NczuL5BwkIc4wvTb5zIM+UJPGz4= github.com/yosida95/uritemplate/v3 v3.0.2/go.mod h1:ILOh0sOhIJR3+L/8afwt/kE++YT040gmv5BQTMR2HP4= -github.com/zeebo/assert v1.1.0 h1:hU1L1vLTHsnO8x8c9KAR5GmM5QscxHg5RNU5z5qbUWY= -github.com/zeebo/assert v1.1.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= +github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= +github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= github.com/zeebo/blake3 v0.2.4 h1:KYQPkhpRtcqh0ssGYcKLG1JYvddkEA8QwCM/yBqhaZI= github.com/zeebo/blake3 v0.2.4/go.mod h1:7eeQ6d2iXWRGF6npfaxl2CU+xy2Fjo2gxeyZGCRUjcE= github.com/zeebo/pcg v1.0.1 h1:lyqfGeWiv4ahac6ttHs+I5hwtH/+1mrhlCtVNQM2kHo= github.com/zeebo/pcg v1.0.1/go.mod h1:09F0S9iiKrwn9rlI5yjLkmrug154/YRW6KnnXVDM/l4= -go.etcd.io/bbolt v1.4.3 h1:dEadXpI6G79deX5prL3QRNP6JB8UxVkqo4UPnHaNXJo= -go.etcd.io/bbolt v1.4.3/go.mod h1:tKQlpPaYCVFctUIgFKFnAlvbmB3tpy1vkTnDWohtc0E= +go.etcd.io/bbolt v1.4.0 h1:TU77id3TnN/zKr7CO/uk+fBCwF2jGcMuw2B/FMAzYIk= +go.etcd.io/bbolt v1.4.0/go.mod h1:AsD+OCi/qPN1giOX1aiLAha3o1U8rAz65bvN4j0sRuk= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= @@ -755,14 +753,10 @@ go.uber.org/zap v1.28.0 h1:IZzaP1Fv73/T/pBMLk4VutPl36uNC+OSUh3JLG3FIjo= go.uber.org/zap v1.28.0/go.mod h1:rDLpOi171uODNm/mxFcuYWxDsqWSAVkFdX4XojSKg/Q= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= -golang.org/x/crypto v0.51.0 
h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI= -golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8= golang.org/x/crypto v0.52.0 h1:RMs7fP2rXdep0CftQlK8Uf+kibLm7qkCcradZWYz988= golang.org/x/crypto v0.52.0/go.mod h1:1QgfPxDqh0T2M/elOJtp9RvuR95kVjir0e6/BvEmGbc= golang.org/x/exp v0.0.0-20260508232706-74f9aab9d74a h1:+3jdDGGB8NGb1Zktc737jlt3/A5f6UlwSzmvqUuufxw= golang.org/x/exp v0.0.0-20260508232706-74f9aab9d74a/go.mod h1:d2fgXJLVs4dYDHUk5lwMIfzRzSrWCfGZb0ZqeLa/Vcw= -golang.org/x/image v0.40.0 h1:Tw4GyDXMo+daZN1znreBRC3VayR1aLFUyUEOLUdW1a8= -golang.org/x/image v0.40.0/go.mod h1:uIc348UZMSvS5Z65CVZ7iDPaNobNFEPeJ4kbqTOszmA= golang.org/x/image v0.41.0 h1:8wS72eGJMJaBxK6okTzd4WaXumUlTVlb753MlsSvTCo= golang.org/x/image v0.41.0/go.mod h1:uIc348UZMSvS5Z65CVZ7iDPaNobNFEPeJ4kbqTOszmA= golang.org/x/mod v0.36.0 h1:JJjpVx6myfUsUdAzZuOSTTmRE0PfZeNWzzvKrP7amb4= @@ -770,8 +764,6 @@ golang.org/x/mod v0.36.0/go.mod h1:moc6ELqsWcOw5Ef3xVprK5ul/MvtVvkIXLziUOICjUQ= golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ= -golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY= golang.org/x/sys v0.45.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4= @@ -792,5 +784,33 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= k8s.io/klog/v2 v2.140.0 h1:Tf+J3AH7xnUzZyVVXhTgGhEKnFqye14aadWv7bzXdzc= k8s.io/klog/v2 v2.140.0/go.mod h1:o+/RWfJ6PwpnFn7OyAG3QnO47BFsymfEfrz6XyYSSp0= +modernc.org/cc/v4 v4.28.2 
h1:3tQ0lf2ADtoby2EtSP+J7IE2SHwEJdP8ioR59wx7XpY= +modernc.org/cc/v4 v4.28.2/go.mod h1:OnovgIhbbMXMu1aISnJ0wvVD1KnW+cAUJkIrAWh+kVI= +modernc.org/ccgo/v4 v4.34.0 h1:yRLPFZieg532OT4rp4JFNIVcquwalMX26G95WQDqwCQ= +modernc.org/ccgo/v4 v4.34.0/go.mod h1:AS5WYMyBakQ+fhsHhtP8mWB82KTGPkNNJDGfGQCe0/A= +modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM= +modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU= +modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI= +modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito= +modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo= +modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY= +modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks= +modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI= +modernc.org/libc v1.72.3 h1:ZnDF4tXn4NBXFutMMQC4vtbTFSXhhKzR73fv0beZEAU= +modernc.org/libc v1.72.3/go.mod h1:dn0dZNnnn1clLyvRxLxYExxiKRZIRENOfqQ8XEeg4Qs= +modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU= +modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg= +modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI= +modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw= +modernc.org/opt v0.2.0 h1:tGyef5ApycA7FSEOMraay9SaTk5zmbx7Tu+cJs4QKZg= +modernc.org/opt v0.2.0/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns= +modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w= +modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE= +modernc.org/sqlite v1.51.0 h1:aH/MMSoayAIhozZ7uJbVTT9QO/VhzBf0J9tymmmuC/U= +modernc.org/sqlite v1.51.0/go.mod h1:tcNzv5p84E0skkmJn038y+hWJbLQXQqEnQfeh5r2JLM= +modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0= +modernc.org/strutil v1.2.1/go.mod 
h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A= +modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= +modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= pgregory.net/rapid v1.2.0 h1:keKAYRcjm+e1F0oAuU5F5+YPAWcyxNNRK2wud503Gnk= pgregory.net/rapid v1.2.0/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= diff --git a/internal/agents/instructions.go b/internal/agents/instructions.go index db8b9d12..3f92c4cc 100644 --- a/internal/agents/instructions.go +++ b/internal/agents/instructions.go @@ -61,7 +61,7 @@ A Gortex daemon is configured machine-wide via the ` + "`gortex` MCP server" + ` ### Optional: delegate research to a local agent -When ` + "`llm.provider`" + ` is configured (one of ` + "`local`" + ` / ` + "`anthropic`" + ` / ` + "`openai`" + ` / ` + "`ollama`" + ` / ` + "`claudecli`" + ` / ` + "`gemini`" + ` / ` + "`bedrock`" + ` / ` + "`deepseek`" + ` — pick one in ` + "`.gortex.yaml`" + ` or ` + "`~/.config/gortex/config.yaml`" + `, or via ` + "`GORTEX_LLM_PROVIDER`" + ` / ` + "`GORTEX_LLM_MODEL`" + `), the ` + "`ask`" + ` MCP tool is registered. It runs a grammar-constrained agent that uses gortex tools to research one question and returns a synthesized answer — useful when you'd otherwise issue many ` + "`search_symbols`" + ` / ` + "`get_callers`" + ` / ` + "`contracts`" + ` calls. Only the ` + "`local`" + ` provider requires a ` + "`-tags llama`" + ` build; the other seven are pure-Go HTTP / subprocess adapters available in every binary. +When ` + "`llm.provider`" + ` is configured (one of ` + "`local`" + ` / ` + "`anthropic`" + ` / ` + "`openai`" + ` / ` + "`ollama`" + ` / ` + "`claudecli`" + ` / ` + "`gemini`" + ` / ` + "`bedrock`" + ` / ` + "`deepseek`" + ` — pick one in ` + "`.gortex.yaml`" + ` or ` + "`~/.gortex/config.yaml`" + `, or via ` + "`GORTEX_LLM_PROVIDER`" + ` / ` + "`GORTEX_LLM_MODEL`" + `), the ` + "`ask`" + ` MCP tool is registered. 
It runs a grammar-constrained agent that uses gortex tools to research one question and returns a synthesized answer — useful when you'd otherwise issue many ` + "`search_symbols`" + ` / ` + "`get_callers`" + ` / ` + "`contracts`" + ` calls. Only the ` + "`local`" + ` provider requires a ` + "`-tags llama`" + ` build; the other seven are pure-Go HTTP / subprocess adapters available in every binary. | When you'd otherwise... | Consider... | |---------------------------------------|------------------------------------------| @@ -236,7 +236,7 @@ Gortex is running as an MCP server. You MUST use graph queries instead of file r ### Optional: delegate research to a local agent -When ` + "`llm.provider`" + ` is configured (one of ` + "`local`" + ` / ` + "`anthropic`" + ` / ` + "`openai`" + ` / ` + "`ollama`" + ` / ` + "`claudecli`" + ` / ` + "`gemini`" + ` / ` + "`bedrock`" + ` / ` + "`deepseek`" + ` — pick one in ` + "`.gortex.yaml`" + ` or ` + "`~/.config/gortex/config.yaml`" + `, or via ` + "`GORTEX_LLM_PROVIDER`" + ` / ` + "`GORTEX_LLM_MODEL`" + `), the ` + "`ask`" + ` MCP tool is registered. It runs a grammar-constrained agent that uses gortex tools to research one question and returns a synthesized answer — useful when you'd otherwise issue many ` + "`search_symbols`" + ` / ` + "`get_callers`" + ` / ` + "`contracts`" + ` calls. Only the ` + "`local`" + ` provider requires a ` + "`-tags llama`" + ` build; the other seven are pure-Go HTTP / subprocess adapters available in every binary. +When ` + "`llm.provider`" + ` is configured (one of ` + "`local`" + ` / ` + "`anthropic`" + ` / ` + "`openai`" + ` / ` + "`ollama`" + ` / ` + "`claudecli`" + ` / ` + "`gemini`" + ` / ` + "`bedrock`" + ` / ` + "`deepseek`" + ` — pick one in ` + "`.gortex.yaml`" + ` or ` + "`~/.gortex/config.yaml`" + `, or via ` + "`GORTEX_LLM_PROVIDER`" + ` / ` + "`GORTEX_LLM_MODEL`" + `), the ` + "`ask`" + ` MCP tool is registered. 
It runs a grammar-constrained agent that uses gortex tools to research one question and returns a synthesized answer — useful when you'd otherwise issue many ` + "`search_symbols`" + ` / ` + "`get_callers`" + ` / ` + "`contracts`" + ` calls. Only the ` + "`local`" + ` provider requires a ` + "`-tags llama`" + ` build; the other seven are pure-Go HTTP / subprocess adapters available in every binary. | When you'd otherwise... | Consider... | |---------------------------------------|------------------------------------------| diff --git a/internal/analysis/analysis_test.go b/internal/analysis/analysis_test.go index 7fffe5c8..9d648ac0 100644 --- a/internal/analysis/analysis_test.go +++ b/internal/analysis/analysis_test.go @@ -146,11 +146,11 @@ func TestAnalyzeImpact_DropsHeuristicNoiseAtTransitiveDepths(t *testing.T) { } func TestAnalyzeImpact_RiskLevels(t *testing.T) { - assert.Equal(t, RiskLow, assessRisk(0, 0, 0)) - assert.Equal(t, RiskLow, assessRisk(1, 1, 0)) - assert.Equal(t, RiskMedium, assessRisk(2, 3, 0)) - assert.Equal(t, RiskHigh, assessRisk(5, 5, 0)) - assert.Equal(t, RiskCritical, assessRisk(10, 10, 0)) + assert.Equal(t, RiskLow, assessRisk(0, 0)) + assert.Equal(t, RiskLow, assessRisk(1, 1)) + assert.Equal(t, RiskMedium, assessRisk(2, 3)) + assert.Equal(t, RiskHigh, assessRisk(5, 5)) + assert.Equal(t, RiskCritical, assessRisk(10, 10)) } func TestScoreEntryPoint(t *testing.T) { diff --git a/internal/analysis/architecture.go b/internal/analysis/architecture.go index d4beb662..0f2010e5 100644 --- a/internal/analysis/architecture.go +++ b/internal/analysis/architecture.go @@ -19,7 +19,7 @@ import ( // reports a violation when a cross-layer dependency breaks the source // layer's allow/deny rules. Symbols in no declared layer, and edges // to such symbols, are unconstrained. 
-func EvaluateArchitecture(g *graph.Graph, arch config.ArchitectureConfig, changedSymbolIDs []string) []GuardViolation { +func EvaluateArchitecture(g graph.Store, arch config.ArchitectureConfig, changedSymbolIDs []string) []GuardViolation { if g == nil || arch.IsEmpty() { return nil } @@ -76,7 +76,7 @@ func EvaluateArchitecture(g *graph.Graph, arch config.ArchitectureConfig, change // evaluateArchRules checks the per-layer / per-pattern dependency-cone // rules — fan-out caps and caller-boundary restrictions — for a set // of changed symbols. -func evaluateArchRules(g *graph.Graph, arch config.ArchitectureConfig, changedSymbolIDs, layerNames []string) []GuardViolation { +func evaluateArchRules(g graph.Store, arch config.ArchitectureConfig, changedSymbolIDs, layerNames []string) []GuardViolation { if len(arch.Rules) == 0 { return nil } @@ -169,7 +169,7 @@ func callerWithinBoundary(callerPath string, rule config.ArchRule, callerLayer s // distinctCallTargets counts the distinct symbols a node calls or // references — the dependency-cone size. -func distinctCallTargets(g *graph.Graph, id string) int { +func distinctCallTargets(g graph.Store, id string) int { seen := make(map[string]bool) for _, e := range g.GetOutEdges(id) { if e.Kind != graph.EdgeCalls && e.Kind != graph.EdgeReferences { diff --git a/internal/analysis/betweenness.go b/internal/analysis/betweenness.go index c07d207c..a67cabcd 100644 --- a/internal/analysis/betweenness.go +++ b/internal/analysis/betweenness.go @@ -72,33 +72,78 @@ const ( // // Pivot sampling is seeded with a fixed seed, so results are // reproducible run to run. 
-func ComputeBetweenness(g *graph.Graph) *BetweennessResult { +func ComputeBetweenness(g graph.Store) *BetweennessResult { if g == nil { return &BetweennessResult{Scores: map[string]float64{}} } - nodes := g.AllNodes() - n := len(nodes) + // Betweenness measures shortest-path centrality across the + // call / reference subgraph; only function and method nodes carry + // those edges. The scoring kernel only ever touches node IDs, so + // the unfiltered AllNodes() pull was wasted on the other 90% of + // the node table AND on the 9 unused columns of every retained + // row. NodeIDsByKinds returns just the id column from a single + // query; NodesByKindsScanner is the legacy fallback for + // backends that haven't shipped the id projection yet. + betweennessKinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} + bcNodeKinds := []graph.NodeKind{graph.KindFunction, graph.KindMethod} + var ids []string + if scan, ok := g.(graph.NodeIDsByKinds); ok { + ids = scan.NodeIDsByKinds(bcNodeKinds) + } else if scan, ok := g.(graph.NodesByKindsScanner); ok { + ns := scan.NodesByKinds(bcNodeKinds) + ids = make([]string, 0, len(ns)) + for _, nd := range ns { + ids = append(ids, nd.ID) + } + } else { + all := g.AllNodes() + ids = make([]string, 0, len(all)) + for _, nd := range all { + if nd.Kind == graph.KindFunction || nd.Kind == graph.KindMethod { + ids = append(ids, nd.ID) + } + } + } + n := len(ids) if n == 0 { return &BetweennessResult{Scores: map[string]float64{}} } // Stable node ordering: betweenness itself is order-independent, // but a deterministic order makes the sampled pivot pick - // reproducible regardless of the map-iteration order AllNodes - // happens to return. - ids := make([]string, n) - for i, nd := range nodes { - ids[i] = nd.ID - } + // reproducible regardless of the iteration order + // NodeIDsByKinds happens to return. sort.Strings(ids) // Forward adjacency over the call / reference subgraph. 
+ // EdgeAdjacencyForKinds returns only the (from, to) projection of + // function/method endpoints — the disk path collapses to one + // join with both endpoint kinds enforced in the store, so + // neither the cross-kind edges nor the ~10 unused columns are + // ever materialized. Falls back to EdgesByKinds (and then + // EdgesByKind per kind) on backends that don't implement the + // adjacency capability. adj := make(map[string][]string, n) - for _, e := range g.AllEdges() { - if e.Kind != graph.EdgeCalls && e.Kind != graph.EdgeReferences { - continue + if adjScan, ok := g.(graph.EdgeAdjacencyForKinds); ok { + for pair := range adjScan.EdgeAdjacencyForKinds(betweennessKinds, bcNodeKinds) { + adj[pair[0]] = append(adj[pair[0]], pair[1]) + } + } else if es, ok := g.(graph.EdgesByKindsScanner); ok { + for e := range es.EdgesByKinds(betweennessKinds) { + if e == nil { + continue + } + adj[e.From] = append(adj[e.From], e.To) + } + } else { + for _, kind := range betweennessKinds { + for e := range g.EdgesByKind(kind) { + if e == nil { + continue + } + adj[e.From] = append(adj[e.From], e.To) + } } - adj[e.From] = append(adj[e.From], e.To) } score := make(map[string]float64, n) @@ -156,7 +201,7 @@ func samplePivots(ids []string, k int) []string { rng := rand.New(rand.NewSource(betweennessSeed)) perm := rng.Perm(len(ids)) out := make([]string, k) - for i := 0; i < k; i++ { + for i := range k { out[i] = ids[perm[i]] } return out diff --git a/internal/analysis/communities.go b/internal/analysis/communities.go index df26ef9c..49f9fcba 100644 --- a/internal/analysis/communities.go +++ b/internal/analysis/communities.go @@ -5,6 +5,7 @@ import ( "math" "path/filepath" "sort" + "strconv" "strings" "github.com/zzet/gortex/internal/graph" @@ -50,13 +51,13 @@ type CommunityResult struct { // The Louvain implementation is preserved as // DetectCommunitiesLouvain so we can benchmark, A/B, or fall back // without re-deriving the algorithm. 
-func DetectCommunities(g *graph.Graph) *CommunityResult { +func DetectCommunities(g graph.Store) *CommunityResult { return DetectCommunitiesLeiden(g) } // DetectCommunitiesLouvain is the original Louvain implementation, // retained for benchmarking and as a known-good fallback. -func DetectCommunitiesLouvain(g *graph.Graph) *CommunityResult { +func DetectCommunitiesLouvain(g graph.Store) *CommunityResult { nodes := g.AllNodes() edges := g.AllEdges() @@ -123,105 +124,7 @@ func DetectCommunitiesLouvain(g *graph.Graph) *CommunityResult { } sort.Strings(commIDs) // deterministic visitation comm, commNodes := louvainLocalMoves(commIDs, neighbors, degree, totalWeight) - - // Build result - nodeMap := make(map[string]*graph.Node) - for _, n := range nodes { - nodeMap[n.ID] = n - } - - result := &CommunityResult{ - NodeToComm: make(map[string]string), - } - - // Renumber communities. We sort by old id so renumbering is - // stable across reruns (the underlying ids are member ids, which - // were sorted to drive the local-moves loop deterministically). 
- oldIDs := make([]string, 0, len(commNodes)) - for cid := range commNodes { - if len(commNodes[cid]) >= 2 { - oldIDs = append(oldIDs, cid) - } - } - sort.Strings(oldIDs) - commRemap := make(map[string]string, len(oldIDs)) - for i, cid := range oldIDs { - commRemap[cid] = fmt.Sprintf("community-%d", i) - } - - for nodeID, cid := range comm { - if newID, ok := commRemap[cid]; ok { - result.NodeToComm[nodeID] = newID - } - } - - // Build Community objects - for oldID, members := range commNodes { - newID, ok := commRemap[oldID] - if !ok { - continue - } - - fileSet := make(map[string]bool) - for _, mid := range members { - if n, ok := nodeMap[mid]; ok { - fileSet[n.FilePath] = true - } - } - - files := make([]string, 0, len(fileSet)) - for f := range fileSet { - files = append(files, f) - } - sort.Strings(files) - - label := inferCommunityLabel(members, nodeMap, files) - cohesion := computeCohesion(members, neighbors) - hub := findHub(members, nodeMap, neighbors) - - c := Community{ - ID: newID, - Label: label, - Members: members, - Files: files, - Size: len(members), - Cohesion: cohesion, - Hub: hub, - } - result.Communities = append(result.Communities, c) - } - - // Multi-pass label disambiguation: Louvain often splits a single - // directory into many call-density-based sub-clusters (e.g. 48 - // different clusters whose files all live in parser/languages/). - // The directory-based label is identical for all of them, which - // reads as duplicate cards in the UI. We tag colliding labels - // with the cluster's hub symbol — the function/type that - // everything else in the cluster connects through — which is the - // most semantically meaningful disambiguator. - disambiguateLabels(result.Communities) - - // Sibling grouping. Louvain genuinely produces dozens of peer - // communities under a single dominant directory (48 clusters all - // rooted at parser/languages/ in this codebase). 
Formally those - // peers are not sub-communities at the *modularity* level — we - // confirmed phase-2 Louvain doesn't merge them — but in - // navigation terms they obviously belong together. We surface - // that by computing ParentID from the cluster's directory head - // (the part of the label before " · sample" and " +N dirs"): - // any two clusters whose head matches get the same ParentID, so - // the UI can render them under a shared section header. - assignDirectoryParents(result.Communities) - - // Sort by size descending - sort.Slice(result.Communities, func(i, j int) bool { - return result.Communities[i].Size > result.Communities[j].Size - }) - - // Compute modularity - result.Modularity = computeModularity(comm, neighbors, degree, totalWeight) - - return result + return finaliseCommunityPartition(nodes, comm, commNodes, neighbors, degree, totalWeight) } // disambiguateLabels makes every cluster label unique. The @@ -785,3 +688,174 @@ func namePrefixLabel(members []string, nodeMap map[string]*graph.Node) string { } return bestPrefix } + +// finaliseCommunityPartition converts a (nodeID → community label) +// partition into a fully-shaped CommunityResult: renumbered IDs, +// per-cluster files / cohesion / hub, label disambiguation, and +// sibling-group parent assignment. Shared by the in-process Louvain +// path (which builds the partition itself) and the backend-delegated +// path (DetectCommunitiesLouvainBackend, which takes the partition +// from graph.CommunityDetector). +// +// commNodes can be nil; when it is, the function inverts comm to +// recover the per-community member list (one extra pass — only used +// on the backend path where commNodes isn't pre-built). 
+func finaliseCommunityPartition( + nodes []*graph.Node, + comm map[string]string, + commNodes map[string][]string, + neighbors map[string]map[string]float64, + degree map[string]float64, + totalWeight float64, +) *CommunityResult { + if commNodes == nil { + commNodes = make(map[string][]string, len(comm)) + for nid, cid := range comm { + commNodes[cid] = append(commNodes[cid], nid) + } + } + + nodeMap := make(map[string]*graph.Node, len(nodes)) + for _, n := range nodes { + nodeMap[n.ID] = n + } + + result := &CommunityResult{ + NodeToComm: make(map[string]string), + } + + // Renumber: keep clusters of size >= 2, sort old labels for + // determinism, mint sequential "community-N" names. + oldIDs := make([]string, 0, len(commNodes)) + for cid := range commNodes { + if len(commNodes[cid]) >= 2 { + oldIDs = append(oldIDs, cid) + } + } + sort.Strings(oldIDs) + commRemap := make(map[string]string, len(oldIDs)) + for i, cid := range oldIDs { + commRemap[cid] = fmt.Sprintf("community-%d", i) + } + + for nodeID, cid := range comm { + if newID, ok := commRemap[cid]; ok { + result.NodeToComm[nodeID] = newID + } + } + + for oldID, members := range commNodes { + newID, ok := commRemap[oldID] + if !ok { + continue + } + fileSet := make(map[string]bool) + for _, mid := range members { + if n, ok := nodeMap[mid]; ok { + fileSet[n.FilePath] = true + } + } + files := make([]string, 0, len(fileSet)) + for f := range fileSet { + files = append(files, f) + } + sort.Strings(files) + + c := Community{ + ID: newID, + Label: inferCommunityLabel(members, nodeMap, files), + Members: members, + Files: files, + Size: len(members), + Cohesion: computeCohesion(members, neighbors), + Hub: findHub(members, nodeMap, neighbors), + } + result.Communities = append(result.Communities, c) + } + + disambiguateLabels(result.Communities) + assignDirectoryParents(result.Communities) + sort.Slice(result.Communities, func(i, j int) bool { + return result.Communities[i].Size > result.Communities[j].Size + }) 
+ result.Modularity = computeModularity(comm, neighbors, degree, totalWeight) + return result +} + +// DetectCommunitiesLouvainBackend runs Louvain via the backend's +// engine-native implementation (graph.CommunityDetector) and threads +// the resulting partition through +// the same post-processing the in-process DetectCommunitiesLouvain +// uses. The output is shape-identical: every Community label, +// hub, cohesion, parent, and modularity field is populated from +// the partition, so downstream consumers (UI, rerank pipeline) +// can't tell which path produced it. +// +// Returns nil when g or cd is nil, the backend errors, or the +// partition holds no symbol nodes — callers should fall through to the +// in-process path rather than surface a half-done CommunityResult. +func DetectCommunitiesLouvainBackend(g graph.Store, cd graph.CommunityDetector) *CommunityResult { + if g == nil || cd == nil { + return nil + } + hits, err := cd.Louvain(graph.CommunityOpts{}) + if err != nil || len(hits) == 0 { + return nil + } + + nodes := g.AllNodes() + symbolNodes := make(map[string]bool, len(nodes)) + for _, n := range nodes { + if n.Kind != graph.KindFile && n.Kind != graph.KindImport { + symbolNodes[n.ID] = true + } + } + + // Rebuild the same weighted neighbor view DetectCommunitiesLouvain + // uses — needed for cohesion / hub / modularity. The work is + // O(V + E) per call; small relative to the engine-native + // partitioning savings. 
+ type edgeKey struct{ a, b string } + weights := make(map[edgeKey]float64) + for _, e := range g.AllEdges() { + if !symbolNodes[e.From] || !symbolNodes[e.To] { + continue + } + w := edgeWeight(e.Kind) + if w == 0 { + continue + } + weights[edgeKey{e.From, e.To}] += w + weights[edgeKey{e.To, e.From}] += w + } + neighbors := make(map[string]map[string]float64) + for k, w := range weights { + if neighbors[k.a] == nil { + neighbors[k.a] = make(map[string]float64) + } + neighbors[k.a][k.b] = w + } + var totalWeight float64 + for _, w := range weights { + totalWeight += w + } + totalWeight /= 2 + degree := make(map[string]float64, len(symbolNodes)) + for id := range symbolNodes { + for _, w := range neighbors[id] { + degree[id] += w + } + } + + comm := make(map[string]string, len(hits)) + for _, h := range hits { + if !symbolNodes[h.NodeID] { + continue + } + comm[h.NodeID] = strconv.FormatInt(h.CommunityID, 10) + } + if len(comm) == 0 { + return nil + } + return finaliseCommunityPartition(nodes, comm, nil, neighbors, degree, totalWeight) +} diff --git a/internal/analysis/components.go b/internal/analysis/components.go new file mode 100644 index 00000000..b11016aa --- /dev/null +++ b/internal/analysis/components.go @@ -0,0 +1,294 @@ +package analysis + +import ( + "sort" + + "github.com/zzet/gortex/internal/graph" +) + +// ComponentResult is one connected component returned by +// ComputeWCC / ComputeSCC. Members are sorted ascending so the +// output is deterministic across runs. +type ComponentResult struct { + ID int `json:"id"` + Members []string `json:"members"` + Size int `json:"size"` +} + +// ComponentOptions filters the working set the algorithm runs +// against. Empty NodeKinds / EdgeKinds means "all kinds". +type ComponentOptions struct { + NodeKinds []graph.NodeKind + EdgeKinds []graph.EdgeKind + // MinSize trims trivial singleton components from the + // response — common for SCC where every non-cyclic symbol + // is its own 1-element SCC. 
+ MinSize int +} + +// ComputeWCC returns the weakly connected components of g — maximal sets +// of nodes reachable from each other when every edge is treated +// as undirected. Components are sorted by size descending; ties +// broken by smallest member ID for determinism. +// +// O(V + E). Used as the fallback when the backing graph.Store +// does not implement graph.ComponentFinder. +func ComputeWCC(g graph.Store, opts ComponentOptions) []ComponentResult { + if g == nil { + return nil + } + nodeAllow := makeComponentKindAllow(opts.NodeKinds) + edgeAllow := makeComponentEdgeAllow(opts.EdgeKinds) + + // Build a dense int index over allowed nodes. + nodes := g.AllNodes() + idx := make(map[string]int, len(nodes)) + dense := make([]string, 0, len(nodes)) + for _, n := range nodes { + if n == nil || !nodeAllow(n.Kind) { + continue + } + idx[n.ID] = len(dense) + dense = append(dense, n.ID) + } + if len(dense) == 0 { + return nil + } + + // Undirected adjacency over allowed edges. + adj := make([][]int, len(dense)) + for _, e := range g.AllEdges() { + if e == nil || !edgeAllow(e.Kind) { + continue + } + i, ok1 := idx[e.From] + j, ok2 := idx[e.To] + if !ok1 || !ok2 || i == j { + continue + } + adj[i] = append(adj[i], j) + adj[j] = append(adj[j], i) + } + + // Flood fill: BFS from each unseen node, mark + // every reachable node with the same component label. + comp := make([]int, len(dense)) + for i := range comp { + comp[i] = -1 + } + next := 0 + queue := make([]int, 0, 64) + for i := range dense { + if comp[i] != -1 { + continue + } + label := next + next++ + comp[i] = label + queue = append(queue[:0], i) + for len(queue) > 0 { + cur := queue[0] + queue = queue[1:] + for _, nb := range adj[cur] { + if comp[nb] == -1 { + comp[nb] = label + queue = append(queue, nb) + } + } + } + } + + return collectComponents(dense, comp, opts.MinSize) +} + +// ComputeSCC returns the strongly connected components of g — +// maximal sets of nodes mutually reachable along directed edges. 
Uses +// an iterative Tarjan's algorithm to avoid blowing the recursion +// stack on a deep call graph. O(V + E). +func ComputeSCC(g graph.Store, opts ComponentOptions) []ComponentResult { + if g == nil { + return nil + } + nodeAllow := makeComponentKindAllow(opts.NodeKinds) + edgeAllow := makeComponentEdgeAllow(opts.EdgeKinds) + + nodes := g.AllNodes() + idx := make(map[string]int, len(nodes)) + dense := make([]string, 0, len(nodes)) + for _, n := range nodes { + if n == nil || !nodeAllow(n.Kind) { + continue + } + idx[n.ID] = len(dense) + dense = append(dense, n.ID) + } + if len(dense) == 0 { + return nil + } + + // Directed adjacency. Only out-edges — SCC walks one way. + adj := make([][]int, len(dense)) + for _, e := range g.AllEdges() { + if e == nil || !edgeAllow(e.Kind) { + continue + } + i, ok1 := idx[e.From] + j, ok2 := idx[e.To] + if !ok1 || !ok2 { + continue + } + adj[i] = append(adj[i], j) + } + + // Iterative Tarjan. State arrays sized to the dense node + // count; the call stack is replaced by an explicit (node, + // neighbour-iteration-index) stack. + n := len(dense) + const undefined = -1 + idxArr := make([]int, n) + lowlink := make([]int, n) + onStack := make([]bool, n) + for i := range idxArr { + idxArr[i] = undefined + } + stack := make([]int, 0, n) + type frame struct { + v int + ni int // next-neighbour index to visit + } + work := make([]frame, 0, n) + + var index int + comp := make([]int, n) + for i := range comp { + comp[i] = -1 + } + nextComp := 0 + + for start := 0; start < n; start++ { + if idxArr[start] != undefined { + continue + } + // Initialise the explicit DFS for this root. + idxArr[start] = index + lowlink[start] = index + index++ + stack = append(stack, start) + onStack[start] = true + work = append(work, frame{v: start, ni: 0}) + + for len(work) > 0 { + top := &work[len(work)-1] + v := top.v + neighbors := adj[v] + if top.ni < len(neighbors) { + w := neighbors[top.ni] + top.ni++ + if idxArr[w] == undefined { + // Descend into w. 
+ idxArr[w] = index + lowlink[w] = index + index++ + stack = append(stack, w) + onStack[w] = true + work = append(work, frame{v: w, ni: 0}) + } else if onStack[w] { + if idxArr[w] < lowlink[v] { + lowlink[v] = idxArr[w] + } + } + continue + } + // All neighbours consumed; pop the frame and propagate + // the lowlink upward. + work = work[:len(work)-1] + if len(work) > 0 { + parent := &work[len(work)-1] + if lowlink[v] < lowlink[parent.v] { + lowlink[parent.v] = lowlink[v] + } + } + // Emit an SCC if v is its lowlink root. + if lowlink[v] == idxArr[v] { + label := nextComp + nextComp++ + for { + w := stack[len(stack)-1] + stack = stack[:len(stack)-1] + onStack[w] = false + comp[w] = label + if w == v { + break + } + } + } + } + } + + return collectComponents(dense, comp, opts.MinSize) +} + +// collectComponents groups dense node IDs by component label, +// applies MinSize, sorts members for determinism, and returns +// the slice ordered by size descending. +func collectComponents(dense []string, comp []int, minSize int) []ComponentResult { + groups := make(map[int][]string) + for i, id := range dense { + c := comp[i] + if c < 0 { + continue + } + groups[c] = append(groups[c], id) + } + out := make([]ComponentResult, 0, len(groups)) + for c, members := range groups { + if minSize > 0 && len(members) < minSize { + continue + } + sort.Strings(members) + out = append(out, ComponentResult{ID: c, Members: members, Size: len(members)}) + } + sort.Slice(out, func(i, j int) bool { + if out[i].Size != out[j].Size { + return out[i].Size > out[j].Size + } + if len(out[i].Members) > 0 && len(out[j].Members) > 0 { + return out[i].Members[0] < out[j].Members[0] + } + return out[i].ID < out[j].ID + }) + // Renumber sequentially so the output IDs are 0..N-1 in + // size-descending order. Stable for snapshot tests. 
+ for i := range out { + out[i].ID = i + } + return out +} + +func makeComponentKindAllow(kinds []graph.NodeKind) func(graph.NodeKind) bool { + if len(kinds) == 0 { + return func(graph.NodeKind) bool { return true } + } + set := make(map[graph.NodeKind]struct{}, len(kinds)) + for _, k := range kinds { + set[k] = struct{}{} + } + return func(k graph.NodeKind) bool { + _, ok := set[k] + return ok + } +} + +func makeComponentEdgeAllow(kinds []graph.EdgeKind) func(graph.EdgeKind) bool { + if len(kinds) == 0 { + return func(graph.EdgeKind) bool { return true } + } + set := make(map[graph.EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + set[k] = struct{}{} + } + return func(k graph.EdgeKind) bool { + _, ok := set[k] + return ok + } +} diff --git a/internal/analysis/components_test.go b/internal/analysis/components_test.go new file mode 100644 index 00000000..9cdeab41 --- /dev/null +++ b/internal/analysis/components_test.go @@ -0,0 +1,107 @@ +package analysis + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +// seedComponentTestGraph builds a hub-and-spoke graph: two SCC +// triangles + one hub every node points at. Gives predictable +// WCC + SCC answers. 
+func seedComponentTestGraph() *graph.Graph { + g := graph.New() + for _, id := range []string{"a", "b", "c", "d", "e", "f", "hub"} { + g.AddNode(&graph.Node{ID: id, Kind: graph.KindFunction, Name: id, FilePath: id + ".go"}) + } + edges := [][2]string{ + {"a", "b"}, {"b", "c"}, {"c", "a"}, // triangle 1 + {"d", "e"}, {"e", "f"}, {"f", "d"}, // triangle 2 + {"c", "d"}, // bridge + {"a", "hub"}, {"b", "hub"}, {"c", "hub"}, + {"d", "hub"}, {"e", "hub"}, {"f", "hub"}, + } + for _, e := range edges { + g.AddEdge(&graph.Edge{From: e[0], To: e[1], Kind: graph.EdgeCalls, FilePath: "x.go"}) + } + return g +} + +func TestComputeWCC_OneComponent(t *testing.T) { + g := seedComponentTestGraph() + res := ComputeWCC(g, ComponentOptions{}) + require.Len(t, res, 1, "all 7 nodes form one WCC; got %v", res) + assert.Equal(t, 7, res[0].Size) +} + +func TestComputeWCC_HonoursEdgeFilter(t *testing.T) { + g := seedComponentTestGraph() + // Filter out the call edges entirely → no surviving edges → every node + // becomes its own singleton component. + res := ComputeWCC(g, ComponentOptions{ + EdgeKinds: []graph.EdgeKind{graph.EdgeReferences}, + }) + assert.Len(t, res, 7, + "with no surviving edges every node should be a singleton; got %v", res) +} + +func TestComputeSCC_ThreeComponents(t *testing.T) { + g := seedComponentTestGraph() + res := ComputeSCC(g, ComponentOptions{}) + // 3 SCCs: {a,b,c}, {d,e,f}, {hub} (singleton). The hub is trivial, + // but without MinSize it is still reported: expect sizes [3, 3, 1]. 
+ require.GreaterOrEqual(t, len(res), 3) + + bySize := map[int]int{} + for _, r := range res { + bySize[r.Size]++ + } + assert.Equal(t, 2, bySize[3], "should find two 3-node SCCs (the triangles); got %v", res) +} + +func TestComputeSCC_MinSize_DropsSingletons(t *testing.T) { + g := seedComponentTestGraph() + res := ComputeSCC(g, ComponentOptions{MinSize: 2}) + for _, r := range res { + assert.GreaterOrEqual(t, r.Size, 2, + "MinSize=2 should drop singleton SCCs; got %v", r) + } +} + +// TestComputeSCC_Iterative_NoStackOverflow constructs a deep +// straight-line graph (1 -> 2 -> 3 -> ... -> N) to make sure the +// iterative Tarjan keeps its DFS state on the heap rather than the goroutine +// call stack. N = 10k, so a recursive Tarjan would recurse N frames deep. +func TestComputeSCC_Iterative_NoStackOverflow(t *testing.T) { + const n = 10000 + g := graph.New() + for i := 0; i < n; i++ { + id := charID(i) + g.AddNode(&graph.Node{ID: id, Kind: graph.KindFunction, Name: id, FilePath: "x.go"}) + } + for i := 0; i < n-1; i++ { + g.AddEdge(&graph.Edge{ + From: charID(i), To: charID(i + 1), Kind: graph.EdgeCalls, FilePath: "x.go", + }) + } + res := ComputeSCC(g, ComponentOptions{}) + // A DAG of N nodes has N singleton SCCs. + assert.Equal(t, n, len(res)) +} + +func charID(i int) string { + // fmt.Sprintf would work just as well; this hand-rolled hex encoder just + // builds a deterministic string ID (the prepend below still allocates). + const hex = "0123456789abcdef" + out := make([]byte, 0, 8) + for x := i; ; x /= 16 { + out = append([]byte{hex[x%16]}, out...) 
+ if x < 16 { + break + } + } + return "n_" + string(out) +} diff --git a/internal/analysis/connectivity.go b/internal/analysis/connectivity.go index 51eddfc3..166b7f25 100644 --- a/internal/analysis/connectivity.go +++ b/internal/analysis/connectivity.go @@ -109,7 +109,14 @@ const connectivityNote = "Connectivity health is a graph-EXTRACTION diagnostic, // fileLimit caps how many files DeadWeightByFile carries — files are // ranked by dead-weight descending, ties broken by path; pass 0 or a // negative value for no cap. -func GraphConnectivity(g *graph.Graph, nodes []*graph.Node, fileLimit int) GraphConnectivityReport { +// +// Backends that implement graph.NodeDegreeAggregator serve every +// per-node count from one bulk pass; the fallback path runs +// per-node GetInEdges + GetOutEdges lookups and derives isolation +// from the degree. The arithmetic is identical either way — the capability +// inlines ClassifyZeroEdge's "no incoming usage edge" check into the +// same row. +func GraphConnectivity(g graph.Store, nodes []*graph.Node, fileLimit int) GraphConnectivityReport { report := GraphConnectivityReport{Note: connectivityNote} if g == nil { return report @@ -127,6 +134,14 @@ func GraphConnectivity(g *graph.Graph, nodes []*graph.Node, fileLimit int) Graph byKind := map[graph.NodeKind]*kindAgg{} byFile := map[string]*fileAgg{} + // Bulk per-node count fetch when the backend supports it; one + // bulk pair vs. 3N per-node round-trips for the legacy path + // (the killer on a disk backend — see the NodeDegreeAggregator doc-comment + // for the workspace-scale numbers). Returns a map keyed on node ID + // or nil when the capability isn't available; the fallback path + // re-queries per node in the else branch of the loop below. 
+ counts := collectConnectivityCounts(g, nodes) + for _, n := range nodes { if n == nil { continue @@ -140,8 +155,15 @@ func GraphConnectivity(g *graph.Graph, nodes []*graph.Node, fileLimit int) Graph } ka.total++ - inCount := len(g.GetInEdges(n.ID)) - outCount := len(g.GetOutEdges(n.ID)) + var inCount, outCount int + if counts != nil { + row := counts[n.ID] + inCount = row.InCount + outCount = row.OutCount + } else { + inCount = len(g.GetInEdges(n.ID)) + outCount = len(g.GetOutEdges(n.ID)) + } degree := inCount + outCount if degree > 0 { @@ -149,10 +171,12 @@ func GraphConnectivity(g *graph.Graph, nodes []*graph.Node, fileLimit int) Graph } // Isolated == zero edges of any kind. ClassifyZeroEdge returns - // ZeroEdgePossibleExtractionGap for exactly this case, so the - // "isolated" definition stays bound to the shared zero-edge - // classification used for per-symbol caveats. - isolated := graph.ClassifyZeroEdge(g, n.ID) == graph.ZeroEdgePossibleExtractionGap + // ZeroEdgePossibleExtractionGap for exactly this case (for a + // known node), so the "isolated" definition stays bound to the + // shared zero-edge classification used for per-symbol caveats. + // We derive it from the counts directly; the underlying + // classifier's check is in == 0 && out == 0 for a known id. + isolated := degree == 0 leaf := degree == 1 if isolated { @@ -230,3 +254,36 @@ func GraphConnectivity(g *graph.Graph, nodes []*graph.Node, fileLimit int) Graph return report } + +// collectConnectivityCounts returns per-node in/out/usage counts for +// the supplied node slice via the backend's NodeDegreeAggregator +// capability. Returns nil when the backend doesn't implement the +// capability — GraphConnectivity then falls back to the legacy +// per-node g.GetInEdges/g.GetOutEdges path so semantics never differ. 
+// +// We pass UsageInboundEdgeKinds so the server fills UsageInCount — +// today GraphConnectivity only consumes In/Out totals, but the usage +// count rides on the same row at no extra round-trip cost and makes +// the capability self-contained for callers that need it next. +func collectConnectivityCounts(g graph.Store, nodes []*graph.Node) map[string]graph.NodeDegreeRow { + agg, ok := g.(graph.NodeDegreeAggregator) + if !ok { + return nil + } + ids := make([]string, 0, len(nodes)) + for _, n := range nodes { + if n == nil || n.ID == "" { + continue + } + ids = append(ids, n.ID) + } + if len(ids) == 0 { + return map[string]graph.NodeDegreeRow{} + } + rows := agg.NodeDegreeCounts(ids, graph.UsageInboundEdgeKinds()) + out := make(map[string]graph.NodeDegreeRow, len(rows)) + for _, r := range rows { + out[r.NodeID] = r + } + return out +} diff --git a/internal/analysis/contracts.go b/internal/analysis/contracts.go index 593b09c0..c2854a04 100644 --- a/internal/analysis/contracts.go +++ b/internal/analysis/contracts.go @@ -43,7 +43,7 @@ type parsedSignature struct { // VerifyChanges checks proposed signature changes against all callers and interface // implementors, returning any contract violations found. -func VerifyChanges(g *graph.Graph, engine *query.Engine, changes []SignatureChange) *VerifyResult { +func VerifyChanges(g graph.Store, engine *query.Engine, changes []SignatureChange) *VerifyResult { result := &VerifyResult{} for _, change := range changes { @@ -151,7 +151,7 @@ func VerifyChanges(g *graph.Graph, engine *query.Engine, changes []SignatureChan // checkInterfaceViolations checks if the changed symbol is a method that belongs to // an interface, and if so, verifies all other implementors still conform. 
// Traversal: EdgeMemberOf → parent type → EdgeImplements → interface → all implementors -func checkInterfaceViolations(g *graph.Graph, engine *query.Engine, node *graph.Node, newSig *parsedSignature, result *VerifyResult) { +func checkInterfaceViolations(g graph.Store, engine *query.Engine, node *graph.Node, newSig *parsedSignature, result *VerifyResult) { if node.Kind != graph.KindMethod { return } @@ -232,7 +232,7 @@ func checkInterfaceViolations(g *graph.Graph, engine *query.Engine, node *graph. } // findMemberMethods returns all method nodes that are members of the given type. -func findMemberMethods(g *graph.Graph, typeID string) []*graph.Node { +func findMemberMethods(g graph.Store, typeID string) []*graph.Node { inEdges := g.GetInEdges(typeID) var methods []*graph.Node for _, edge := range inEdges { diff --git a/internal/analysis/cycles.go b/internal/analysis/cycles.go index b9573af2..d7b37f2a 100644 --- a/internal/analysis/cycles.go +++ b/internal/analysis/cycles.go @@ -20,9 +20,8 @@ type Cycle struct { // DetectCycles finds all dependency cycles in the graph using Tarjan's SCC algorithm. // If scope is non-empty, only nodes whose FilePath starts with scope are considered. // Cycles are classified by edge type and community membership, then sorted by severity descending. -func DetectCycles(g *graph.Graph, communities *CommunityResult, scope string) []Cycle { +func DetectCycles(g graph.Store, communities *CommunityResult, scope string) []Cycle { nodes := g.AllNodes() - edges := g.AllEdges() // Build set of in-scope node IDs inScope := make(map[string]bool, len(nodes)) @@ -36,24 +35,35 @@ func DetectCycles(g *graph.Graph, communities *CommunityResult, scope string) [] inScope[n.ID] = true } - // Build adjacency list and track edge kinds between pairs + // Build adjacency list and track edge kinds between pairs. + // + // Edge collection streams only EdgeImports + EdgeCalls via + // EdgesByKind (two MATCH (...)-[e:Edge {kind: $kind}]->(...) 
on + // disk backends) instead of materialising every edge in the graph + // just to filter for two kinds -- ~500k edge rows over cgo dropped + // to the import-and-call subset (a few tens of thousands on the + // gortex workspace). adj := make(map[string][]string) edgeKinds := make(map[edgePair][]graph.EdgeKind) - for _, e := range edges { - if e.Kind != graph.EdgeImports && e.Kind != graph.EdgeCalls { - continue - } - if !inScope[e.From] || !inScope[e.To] { - continue - } - pair := edgePair{e.From, e.To} - // Avoid duplicate adjacency entries - if _, exists := edgeKinds[pair]; !exists { - adj[e.From] = append(adj[e.From], e.To) + collect := func(kind graph.EdgeKind) { + for e := range g.EdgesByKind(kind) { + if e == nil { + continue + } + if !inScope[e.From] || !inScope[e.To] { + continue + } + pair := edgePair{e.From, e.To} + // Avoid duplicate adjacency entries + if _, exists := edgeKinds[pair]; !exists { + adj[e.From] = append(adj[e.From], e.To) + } + edgeKinds[pair] = append(edgeKinds[pair], kind) } - edgeKinds[pair] = append(edgeKinds[pair], e.Kind) } + collect(graph.EdgeImports) + collect(graph.EdgeCalls) // Run Tarjan's SCC sccs := tarjanSCC(inScope, adj) @@ -89,7 +99,7 @@ func DetectCycles(g *graph.Graph, communities *CommunityResult, scope string) [] // WouldCreateCycle checks if adding an edge from fromID to toID would create a cycle. // It performs DFS from toID to see if fromID is reachable. If so, adding fromID→toID // would close a cycle. Returns the cycle path from toID to fromID when found. 
-func WouldCreateCycle(g *graph.Graph, fromID, toID string) (bool, []string) { +func WouldCreateCycle(g graph.Store, fromID, toID string) (bool, []string) { edges := g.AllEdges() // Build adjacency from calls and imports edges diff --git a/internal/analysis/deadcode.go b/internal/analysis/deadcode.go index 2305212a..18cadef4 100644 --- a/internal/analysis/deadcode.go +++ b/internal/analysis/deadcode.go @@ -3,6 +3,7 @@ package analysis import ( "math" "path/filepath" + "slices" "sort" "strings" "unicode" @@ -21,14 +22,14 @@ type DeadCodeEntry struct { // HotspotEntry represents a symbol with disproportionately high complexity metrics. type HotspotEntry struct { - ID string `json:"id"` - Name string `json:"name"` - Kind string `json:"kind"` - FilePath string `json:"file_path"` - Line int `json:"start_line"` - FanIn int `json:"fan_in"` - FanOut int `json:"fan_out"` - CommunityCrossings int `json:"community_crossings"` + ID string `json:"id"` + Name string `json:"name"` + Kind string `json:"kind"` + FilePath string `json:"file_path"` + Line int `json:"start_line"` + FanIn int `json:"fan_in"` + FanOut int `json:"fan_out"` + CommunityCrossings int `json:"community_crossings"` // Betweenness is the node's betweenness-centrality score // normalized to 0-100 — how often it sits on a shortest path // between other symbols. A bottleneck the call graph routes @@ -210,23 +211,48 @@ func isEntryPointNode(n *graph.Node) bool { return v } +// candidateNodeKinds enumerates the node kinds FindDeadCode is willing +// to flag (modulo the opt-in switches for fields / variables / +// constants). Used both for the per-kind allowlist handed to the +// DeadCodeCandidator capability and as the source of truth for the +// Go-fallback loop. Kept in lockstep with neverDeadCodeKinds: a kind +// MUST appear in exactly one of the two lists. 
+var candidateNodeKinds = []graph.NodeKind{ + graph.KindFunction, + graph.KindMethod, + graph.KindType, + graph.KindInterface, + graph.KindField, + graph.KindVariable, + graph.KindConstant, +} + // FindDeadCode returns all symbols with zero incoming calls or references, // excluding entry points, test functions, exported symbols, and user-excluded patterns. // By default, variables are excluded (see FindDeadCodeOptions for rationale). -func FindDeadCode(g *graph.Graph, processes *ProcessResult, excludePatterns []string, opts ...FindDeadCodeOptions) []DeadCodeEntry { +func FindDeadCode(g graph.Store, processes *ProcessResult, excludePatterns []string, opts ...FindDeadCodeOptions) []DeadCodeEntry { var opt FindDeadCodeOptions if len(opts) > 0 { opt = opts[0] } - nodes := g.AllNodes() - allEdges := g.AllEdges() - // Build set of interface-required method names per type. // If a type implements an interface, all methods that the interface // requires are alive even if never called directly (they satisfy the // contract). We index: typeID → set of required method names. - ifaceRequiredMethods := buildIfaceRequiredMethods(g, nodes, allEdges) + // Backends that implement graph.IfaceImplementsScanner serve this + // from one join; the fallback walks NodesByKind + EdgesByKind + // just like before. + ifaceRequiredMethods := buildIfaceRequiredMethods(g) + + // Pick the candidate-set source. When the backend implements + // DeadCodeCandidator, the "no incoming usage edge" filter runs + // inside the store and only the surviving ~hundreds of true + // candidates are materialized — see graph.DeadCodeCandidator's + // doc-comment for the 1.3M-row-vs-hundreds rationale. Otherwise + // the legacy AllNodes + GetInEdgesByNodeIDs fallback runs, + // identical to the pre-capability path. 
+ candidates, incomingByID := collectDeadCodeCandidates(g, opt) // Build set of entry point node IDs from processes entryPoints := make(map[string]bool) @@ -242,23 +268,46 @@ func FindDeadCode(g *graph.Graph, processes *ProcessResult, excludePatterns []st // Files holding a framework entry point (Alembic migrations, // Next.js pages, ASP.NET host files) — every symbol inside is - // reachable from a runtime, not application-dead. + // reachable from a runtime, not application-dead. Computed via + // NodesByKind(KindFile) so on disk backends we don't have to + // materialise AllNodes() just to find the entry-point files. entryPointFiles := make(map[string]bool) - for _, n := range nodes { - if n.Kind == graph.KindFile && isEntryPointNode(n) { + for n := range g.NodesByKind(graph.KindFile) { + if n != nil && isEntryPointNode(n) { entryPointFiles[n.FilePath] = true } } var result []DeadCodeEntry - for _, n := range nodes { + for _, n := range candidates { // Skip kinds the analyzer never reports — structural, // extracted metadata, infra, function-shape, and value-only // nodes. See neverDeadCodeKinds for the full list and why. + // (The server-side candidator only ships nodes whose kind is + // in candidateNodeKinds, but the Go fallback path scans + // AllNodes so we keep the explicit gate.) if neverDeadCodeKinds[n.Kind] { continue } + // Synthetic external-symbol / stub nodes are NOT first-party + // code. The external-call attribution pass materialises imported + // stdlib / dependency / external symbols as KindFunction / + // KindMethod nodes (IDs like "stdlib::fmt::Sprintf", + // "dep::::Sym", "external::::Sym") stamped with + // Meta["external"]=true; the stub layer mints "::*" IDs for + // stdlib/external_call/builtin/module targets. By construction + // these carry only inbound import / member_of links — never a + // call/reference usage edge — so they ALWAYS look dead. 
Reporting + // them buried the real first-party signal under thousands of + // stdlib/dep entries. Drop them unconditionally. + if graph.IsStub(n.ID) { + continue + } + if ext, _ := n.Meta["external"].(bool); ext { + continue + } + // Framework entry points, and everything in an entry-point // file, are invoked by a runtime — never dead. if isEntryPointNode(n) || entryPointFiles[n.FilePath] { @@ -311,20 +360,19 @@ func FindDeadCode(g *graph.Graph, processes *ProcessResult, excludePatterns []st continue } - // Count incoming edges that indicate the symbol is used. - // The allowlist is per-kind: fields/variables/constants are - // exercised by Reads/Writes; functions/methods by Calls/ - // References; types by References/Instantiates/MemberOf/ - // Implements/Extends/Composes/TypedAs. See incomingUsageKinds - // for the rationale. - allowed := incomingUsageKinds(n.Kind) - inEdges := g.GetInEdges(n.ID) + // Re-check the per-kind incoming-edge allowlist when we still + // have the in-edge map from the Go fallback path. The + // server-side DeadCodeCandidator has already applied the + // equivalent filter, so incomingByID is nil for that path and + // the count check short-circuits to 0 (matching the + // candidator's contract). incomingCount := 0 - for _, e := range inEdges { - for _, k := range allowed { - if e.Kind == k { + if incomingByID != nil { + allowed := incomingUsageKinds(n.Kind) + inEdges := incomingByID[n.ID] + for _, e := range inEdges { + if slices.Contains(allowed, e.Kind) { incomingCount++ - break } } } @@ -413,35 +461,83 @@ func FindDeadCode(g *graph.Graph, processes *ProcessResult, excludePatterns []st return result } +// collectDeadCodeCandidates is the candidate-set splitter for +// FindDeadCode. When the backend implements DeadCodeCandidator the +// WHERE-NOT-EXISTS filter runs server-side and we never materialise +// the in-edge map (returned nil). 
Otherwise we fall back to today's +// AllNodes + batched-GetInEdgesByNodeIDs path, identical to pre-Part-2 +// behaviour. The post-filter loop in FindDeadCode handles both shapes +// uniformly — incomingByID==nil means "filter already applied". +func collectDeadCodeCandidates(g graph.Store, opt FindDeadCodeOptions) (candidates []*graph.Node, incomingByID map[string][]*graph.Edge) { + if dc, ok := g.(graph.DeadCodeCandidator); ok { + kinds := candidateNodeKinds[:0:0] + for _, k := range candidateNodeKinds { + // Honour the IncludeFields / IncludeVariables / IncludeConstants + // opt-in switches at the candidate-source: kinds the caller + // explicitly excluded never need to cross cgo. The post- + // filter loop still re-checks these for the fallback path + // (which sees every kind) so the contract holds either way. + switch k { + case graph.KindField: + if !opt.IncludeFields { + continue + } + case graph.KindVariable: + if !opt.IncludeVariables { + continue + } + case graph.KindConstant: + if !opt.IncludeConstants { + continue + } + } + kinds = append(kinds, k) + } + allowed := make(map[graph.NodeKind][]graph.EdgeKind, len(kinds)) + for _, k := range kinds { + allowed[k] = incomingUsageKinds(k) + } + return dc.DeadCodeCandidates(kinds, allowed), nil + } + + // Fallback: pull every node and the batched in-edge map up front. + // Same shape as before the DeadCodeCandidator capability landed. + nodes := g.AllNodes() + nodeIDs := make([]string, 0, len(nodes)) + for _, n := range nodes { + nodeIDs = append(nodeIDs, n.ID) + } + return nodes, g.GetInEdgesByNodeIDs(nodeIDs) +} + // buildIfaceRequiredMethods returns a map from type ID → set of method names // that the type must implement to satisfy its interfaces. This is computed by: // 1. Collecting all interfaces with their required method names (from Meta["methods"]). // 2. Collecting all EdgeImplements edges (type → interface). // 3. For each type that implements an interface, merging all required method names.
-func buildIfaceRequiredMethods(g *graph.Graph, nodes []*graph.Node, edges []*graph.Edge) map[string]map[string]bool { - // Step 1: interface ID → required method names +// +// On backends that implement graph.IfaceImplementsScanner this is a +// single join; otherwise the fallback iterates +// NodesByKind(KindInterface) + EdgesByKind(EdgeImplements). Both paths +// produce the same map. +func buildIfaceRequiredMethods(g graph.Store) map[string]map[string]bool { + if scanner, ok := g.(graph.IfaceImplementsScanner); ok { + return buildIfaceRequiredMethodsFromRows(scanner.IfaceImplementsRows()) + } + + // Fallback: walk interfaces + EdgeImplements edges Go-side. Uses + // NodesByKind(KindInterface) so disk backends still issue one + // scan per kind instead of pulling AllNodes. ifaceMethods := make(map[string]map[string]bool) - for _, n := range nodes { - if n.Kind != graph.KindInterface || n.Meta == nil { + for n := range g.NodesByKind(graph.KindInterface) { + if n == nil || n.Meta == nil { continue } raw, ok := n.Meta["methods"] if !ok { continue } - methods := make(map[string]bool) - switch v := raw.(type) { - case []string: - for _, m := range v { - methods[m] = true - } - case []any: - for _, m := range v { - if s, ok := m.(string); ok { - methods[s] = true - } - } - } + methods := decodeMethodNames(raw) if len(methods) > 0 { ifaceMethods[n.ID] = methods } @@ -451,12 +547,8 @@ func buildIfaceRequiredMethods(g *graph.Graph, nodes []*graph.Node, edges []*gra return nil } - // Step 2: type ID → set of required method names (from all implemented interfaces) result := make(map[string]map[string]bool) - for _, e := range edges { - if e.Kind != graph.EdgeImplements { - continue - } + for e := range g.EdgesByKind(graph.EdgeImplements) { // EdgeImplements: From=type, To=interface iface, ok := ifaceMethods[e.To] if !ok { @@ -473,6 +565,67 @@ func buildIfaceRequiredMethods(g *graph.Graph, nodes []*graph.Node, edges []*gra return result } +// 
buildIfaceRequiredMethodsFromRows reduces the server-side +// IfaceImplementsScanner row set to the typeID → method-name-set +// shape the rest of FindDeadCode consumes. Same join logic as the +// fallback path, just folded over rows that already carry the +// interface Meta. +func buildIfaceRequiredMethodsFromRows(rows []graph.IfaceImplementsRow) map[string]map[string]bool { + if len(rows) == 0 { + return nil + } + // Cache decoded method-name sets per interface so repeated rows + // (one per implementing type) don't re-decode the same Meta. + ifaceMethods := make(map[string]map[string]bool) + result := make(map[string]map[string]bool) + for _, r := range rows { + methods, ok := ifaceMethods[r.IfaceID] + if !ok { + raw, hasRaw := r.IfaceMeta["methods"] + if !hasRaw { + ifaceMethods[r.IfaceID] = nil + continue + } + methods = decodeMethodNames(raw) + ifaceMethods[r.IfaceID] = methods + } + if len(methods) == 0 { + continue + } + if result[r.TypeID] == nil { + result[r.TypeID] = make(map[string]bool) + } + for m := range methods { + result[r.TypeID][m] = true + } + } + if len(result) == 0 { + return nil + } + return result +} + +// decodeMethodNames normalises a Node.Meta["methods"] value into a +// set of method names. Accepts []string (in-memory backend) and +// []any (decoded payload from the disk backend); anything else is +// treated as "no methods declared". +func decodeMethodNames(raw any) map[string]bool { + methods := make(map[string]bool) + switch v := raw.(type) { + case []string: + for _, m := range v { + methods[m] = true + } + case []any: + for _, m := range v { + if s, ok := m.(string); ok { + methods[s] = true + } + } + } + return methods +} + // hotspotBetweennessWeight scales the betweenness component of a // hotspot's raw score. 
Betweenness arrives normalized to 0-100 (same // range as the fan-in/out/crossing terms after their own @@ -488,9 +641,34 @@ const hotspotBetweennessWeight = 0.4 // centrality component — how often the symbol lies on a shortest path between // other symbols — that augments the fan-in/out signals rather than replacing them. // If threshold <= 0, the default threshold is mean + 2*stddev. -func FindHotspots(g *graph.Graph, communities *CommunityResult, threshold float64) []HotspotEntry { - nodes := g.AllNodes() - edges := g.AllEdges() +func FindHotspots(g graph.Store, communities *CommunityResult, threshold float64) []HotspotEntry { + // Pull only function/method node IDs — the hotspots ranking is + // callable-only, and the scoring math doesn't touch any column + // beyond the id. NodeIDsByKinds returns the projection from a + // single query (one id per row instead of the ~10 + // columns NodesByKinds would ship). The full *Node rows are + // fetched in one batched GetNodesByIDs call AFTER the threshold + // filter, so a typical run materialises ~100 survivors rather + // than the whole ~4k function/method bucket. 
+ hotspotKinds := []graph.NodeKind{graph.KindFunction, graph.KindMethod} + var candidateIDs []string + if scan, ok := g.(graph.NodeIDsByKinds); ok { + candidateIDs = scan.NodeIDsByKinds(hotspotKinds) + } else if scan, ok := g.(graph.NodesByKindsScanner); ok { + ns := scan.NodesByKinds(hotspotKinds) + candidateIDs = make([]string, 0, len(ns)) + for _, n := range ns { + candidateIDs = append(candidateIDs, n.ID) + } + } else { + all := g.AllNodes() + candidateIDs = make([]string, 0, len(all)) + for _, n := range all { + if n.Kind == graph.KindFunction || n.Kind == graph.KindMethod { + candidateIDs = append(candidateIDs, n.ID) + } + } + } // Build lookup maps for community membership nodeToComm := make(map[string]string) @@ -498,31 +676,45 @@ func FindHotspots(g *graph.Graph, communities *CommunityResult, threshold float6 nodeToComm = communities.NodeToComm } - // Build edge maps for fan-in and fan-out computation - // fan_in: incoming calls + references - // fan_out: outgoing calls - fanIn := make(map[string]int) - fanOut := make(map[string]int) - - for _, e := range edges { - if e.Kind == graph.EdgeCalls || e.Kind == graph.EdgeReferences { - fanIn[e.To]++ - } - if e.Kind == graph.EdgeCalls { - fanOut[e.From]++ - } + // Restrict the fan-count pass to the kinds hotspots cares about + // (function + method). NodeFanAggregator expects the candidate id + // list -- it never returns rows for ids the caller didn't ask + // for, so the cgo payload stays bounded by the candidate count + // rather than the whole graph. + fanIn, fanOut := CollectFanCounts(g, candidateIDs, + []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences}, + []graph.EdgeKind{graph.EdgeCalls}, + ) + + // Community crossings per node: outgoing edges (Calls or + // References) whose target sits in a different community than + // the source. CommunityCrossingsByKind ships only the (from, to) + // projection from a single IN-list join — the disk path stops + // re-materialising the full edge row per kind. 
Backends that + // don't implement the capability fall back to the per-kind + // EdgesByKind walk that mirrors the in-memory reference. + crossingKinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} + var crossings map[string]int + if cc, ok := g.(graph.CommunityCrossingsByKind); ok { + crossings = cc.CommunityCrossingsByKind(crossingKinds, nodeToComm) } - - // Compute community crossings per node: outgoing edges to nodes in different communities - crossings := make(map[string]int) - for _, e := range edges { - if e.Kind == graph.EdgeCalls || e.Kind == graph.EdgeReferences { - fromComm := nodeToComm[e.From] - toComm := nodeToComm[e.To] - if fromComm != "" && toComm != "" && fromComm != toComm { - crossings[e.From]++ + if crossings == nil { + crossings = make(map[string]int) + countCrossings := func(kind graph.EdgeKind) { + for e := range g.EdgesByKind(kind) { + if e == nil { + continue + } + fromComm := nodeToComm[e.From] + toComm := nodeToComm[e.To] + if fromComm != "" && toComm != "" && fromComm != toComm { + crossings[e.From]++ + } } } + for _, k := range crossingKinds { + countCrossings(k) + } } // Betweenness centrality — exact on small graphs, sampled on @@ -536,9 +728,13 @@ func FindHotspots(g *graph.Graph, communities *CommunityResult, threshold float6 } } - // Compute raw scores for function/method nodes only + // Compute raw scores for function/method nodes only. Keyed by id + // so the full *Node fetch is deferred until after the threshold + // filter — on a ~4k candidate set the surviving share is the top + // few percent, so this materialises ~100 nodes instead of the + // whole bucket. 
type rawEntry struct { - node *graph.Node + id string fanIn int fanOut int crossing int @@ -546,20 +742,16 @@ func FindHotspots(g *graph.Graph, communities *CommunityResult, threshold float6 rawScore float64 } - var entries []rawEntry - for _, n := range nodes { - if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { - continue - } - - fi := fanIn[n.ID] - fo := fanOut[n.ID] - cc := crossings[n.ID] - bw := betweenness[n.ID] + entries := make([]rawEntry, 0, len(candidateIDs)) + for _, id := range candidateIDs { + fi := fanIn[id] + fo := fanOut[id] + cc := crossings[id] + bw := betweenness[id] raw := float64(fi)*2.0 + float64(fo)*1.5 + float64(cc)*3.0 + bw*hotspotBetweennessWeight entries = append(entries, rawEntry{ - node: n, + id: id, fanIn: fi, fanOut: fo, crossing: cc, @@ -607,25 +799,49 @@ func FindHotspots(g *graph.Graph, communities *CommunityResult, threshold float6 threshold = mean + 2.0*stddev } - // Filter and build result - var result []HotspotEntry - for i, e := range entries { + // Filter by threshold first to identify the surviving id set, so + // the full *Node materialisation is bounded by the result size, + // not the candidate count. 
+ type survivor struct { + entryIdx int + score float64 + } + survivors := make([]survivor, 0, len(entries)) + for i := range entries { score := math.Round(normalized[i]*100) / 100 // round to 2 decimal places if score < threshold { continue } + survivors = append(survivors, survivor{entryIdx: i, score: score}) + } + if len(survivors) == 0 { + return nil + } + survivorIDs := make([]string, 0, len(survivors)) + for _, s := range survivors { + survivorIDs = append(survivorIDs, entries[s.entryIdx].id) + } + nodesByID := g.GetNodesByIDs(survivorIDs) + + result := make([]HotspotEntry, 0, len(survivors)) + for _, s := range survivors { + e := entries[s.entryIdx] + n := nodesByID[e.id] + if n == nil { + continue + } result = append(result, HotspotEntry{ - ID: e.node.ID, - Name: e.node.Name, - Kind: string(e.node.Kind), - FilePath: e.node.FilePath, - Line: e.node.StartLine, + ID: n.ID, + Name: n.Name, + Kind: string(n.Kind), + FilePath: n.FilePath, + Line: n.StartLine, FanIn: e.fanIn, FanOut: e.fanOut, CommunityCrossings: e.crossing, Betweenness: math.Round(e.betweenness*100) / 100, - ComplexityScore: score, + ComplexityScore: s.score, }) } @@ -811,3 +1027,90 @@ func matchesExcludePattern(filePath, nodeID string, patterns []string) bool { } return false } + +// CollectFanCounts returns per-id fan-in / fan-out counts filtered by +// edge kind. Backends that implement graph.NodeFanAggregator serve +// both counts from one bulk pass per direction (~candidateCount +// rows instead of the full edge set); the fallback path +// streams the requested kinds via EdgesByKind, accumulating into the +// fan maps Go-side -- still no AllEdges materialisation, just an +// in-memory walk of the per-kind edge buckets. +// +// Used by FindHotspots and the health_score analyzer. Both pass the +// same fanInKinds / fanOutKinds pair today; the function signature +// keeps them per-call so a future analyzer with a different kind +// split can share the same plumbing. 
+func CollectFanCounts(g graph.Store, ids []string, fanInKinds []graph.EdgeKind, fanOutKinds []graph.EdgeKind) (fanIn, fanOut map[string]int) { + fanIn = make(map[string]int, len(ids)) + fanOut = make(map[string]int, len(ids)) + if len(ids) == 0 { + return fanIn, fanOut + } + if agg, ok := g.(graph.NodeFanAggregator); ok { + for _, r := range agg.NodeFanCounts(ids, fanInKinds, fanOutKinds) { + if r.FanIn != 0 { + fanIn[r.NodeID] = r.FanIn + } + if r.FanOut != 0 { + fanOut[r.NodeID] = r.FanOut + } + } + return fanIn, fanOut + } + + // Fallback path: stream the requested kinds via EdgesByKind and + // tally Go-side. ID-set membership keeps the maps bounded to + // candidate ids, matching the capability contract. + idSet := make(map[string]struct{}, len(ids)) + for _, id := range ids { + if id != "" { + idSet[id] = struct{}{} + } + } + streamed := make(map[graph.EdgeKind]struct{}, len(fanInKinds)+len(fanOutKinds)) + stream := func(kind graph.EdgeKind, toIn, toOut bool) { + if _, ok := streamed[kind]; ok { + return + } + streamed[kind] = struct{}{} + for e := range g.EdgesByKind(kind) { + if e == nil { + continue + } + if toIn { + if _, ok := idSet[e.To]; ok { + fanIn[e.To]++ + } + } + if toOut { + if _, ok := idSet[e.From]; ok { + fanOut[e.From]++ + } + } + } + } + inKinds := make(map[graph.EdgeKind]struct{}, len(fanInKinds)) + for _, k := range fanInKinds { + inKinds[k] = struct{}{} + } + outKinds := make(map[graph.EdgeKind]struct{}, len(fanOutKinds)) + for _, k := range fanOutKinds { + outKinds[k] = struct{}{} + } + allKinds := make([]graph.EdgeKind, 0, len(inKinds)+len(outKinds)) + for k := range inKinds { + allKinds = append(allKinds, k) + } + for k := range outKinds { + if _, dup := inKinds[k]; dup { + continue + } + allKinds = append(allKinds, k) + } + for _, k := range allKinds { + _, toIn := inKinds[k] + _, toOut := outKinds[k] + stream(k, toIn, toOut) + } + return fanIn, fanOut +} diff --git a/internal/analysis/deadcode_external_test.go 
b/internal/analysis/deadcode_external_test.go new file mode 100644 index 00000000..9b1d16b1 --- /dev/null +++ b/internal/analysis/deadcode_external_test.go @@ -0,0 +1,63 @@ +package analysis + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/zzet/gortex/internal/graph" +) + +// TestDeadCode_SyntheticExternalNodesExcluded verifies that the synthetic +// external-symbol / stub nodes the resolver materialises (stdlib::*, dep::*, +// external::* with Meta["external"]=true, and the "{kind}::*" stub ids) are +// never reported as dead code — they are imported third-party / stdlib +// symbols, not first-party code, and by construction have zero incoming +// usage edges. A real unexported function with no callers must STILL be +// reported, so the filter is specific rather than blanket. +func TestDeadCode_SyntheticExternalNodesExcluded(t *testing.T) { + g := graph.New() + + // Synthetic external-call attribution nodes: KindFunction, lowercase + // (unexported) names so the only thing that could exclude them is the + // new external/stub filter — not the exported-symbol skip. + g.AddNode(&graph.Node{ + ID: "stdlib::fmt::lowerStdlib", Kind: graph.KindFunction, + Name: "lowerStdlib", Language: "go", + Meta: map[string]any{"external": true}, + }) + g.AddNode(&graph.Node{ + ID: "dep::github.com/x/y::lowerDep", Kind: graph.KindFunction, + Name: "lowerDep", Language: "go", + Meta: map[string]any{"external": true}, + }) + g.AddNode(&graph.Node{ + ID: "external::os::lowerExternal", Kind: graph.KindFunction, + Name: "lowerExternal", Language: "go", + Meta: map[string]any{"external": true}, + }) + // A stub-id node WITHOUT the Meta flag — caught by graph.IsStub on the + // id prefix alone (the CGo / stub-layer form, e.g. stdlib::C::foo).
+ g.AddNode(&graph.Node{ + ID: "stdlib::C::lbug_thing", Kind: graph.KindFunction, + Name: "lbug_thing", Language: "go", + }) + g.AddNode(&graph.Node{ + ID: "gortex::stdlib::C::repo_prefixed_stub", Kind: graph.KindFunction, + Name: "repo_prefixed_stub", Language: "go", + }) + + // Control: a genuine first-party unexported function with no callers. + g.AddNode(&graph.Node{ + ID: "pkg/x.go::deadHelper", Kind: graph.KindFunction, + Name: "deadHelper", FilePath: "pkg/x.go", StartLine: 10, EndLine: 20, Language: "go", + }) + + result := FindDeadCode(g, nil, nil) + + if assert.Len(t, result, 1, "only the real first-party dead function should be reported") { + assert.Equal(t, "pkg/x.go::deadHelper", result[0].ID) + } + for _, e := range result { + assert.False(t, graph.IsStub(e.ID), "no stub id should appear: %s", e.ID) + } +} diff --git a/internal/analysis/diffmap.go b/internal/analysis/diffmap.go index e9662760..bcf6214b 100644 --- a/internal/analysis/diffmap.go +++ b/internal/analysis/diffmap.go @@ -38,7 +38,7 @@ type DiffResult struct { // scope: "unstaged", "staged", "all", "compare" // baseRef: used when scope is "compare" (e.g., "main") // repoRoot: absolute path to the repository root -func MapGitDiff(g *graph.Graph, repoRoot, scope, baseRef string) (*DiffResult, error) { +func MapGitDiff(g graph.Store, repoRoot, scope, baseRef string) (*DiffResult, error) { args := buildDiffArgs(scope, baseRef) cmd := exec.Command("git", args...) cmd.Dir = repoRoot diff --git a/internal/analysis/guards.go b/internal/analysis/guards.go index 721faabd..e2180c46 100644 --- a/internal/analysis/guards.go +++ b/internal/analysis/guards.go @@ -30,7 +30,7 @@ type GuardViolation struct { // For "boundary" rules: reports a violation when any changed symbol whose file path // matches the Source prefix has outgoing call or reference edges to symbols whose // file paths match the Target prefix. 
-func EvaluateGuards(g *graph.Graph, rules []config.GuardRule, changedSymbolIDs []string) []GuardViolation { +func EvaluateGuards(g graph.Store, rules []config.GuardRule, changedSymbolIDs []string) []GuardViolation { var violations []GuardViolation // Pre-resolve changed symbols to nodes for efficient lookup. @@ -88,7 +88,7 @@ func evaluateCoChange(rule config.GuardRule, changedNodes []*graph.Node) []Guard // evaluateBoundary checks whether any changed symbol in the source prefix has // outgoing call or reference edges targeting symbols in the target prefix. -func evaluateBoundary(g *graph.Graph, rule config.GuardRule, changedNodes []*graph.Node) []GuardViolation { +func evaluateBoundary(g graph.Store, rule config.GuardRule, changedNodes []*graph.Node) []GuardViolation { var violations []GuardViolation seen := make(map[string]bool) diff --git a/internal/analysis/hierarchy.go b/internal/analysis/hierarchy.go index 685826a5..a5af19da 100644 --- a/internal/analysis/hierarchy.go +++ b/internal/analysis/hierarchy.go @@ -129,7 +129,7 @@ func hierarchyLeafKinds(k graph.NodeKind) bool { // The base graph is read-only here — BuildHierarchy never mutates g // and never persists a second graph. An unknown level yields an empty // view carrying that level, so callers can surface a clean error. -func BuildHierarchy(g *graph.Graph, level ResolutionLevel, communities *CommunityResult) *HierarchyView { +func BuildHierarchy(g graph.Store, level ResolutionLevel, communities *CommunityResult) *HierarchyView { view := &HierarchyView{Level: level, SelfLoops: map[string]int{}} if g == nil || !ValidResolutionLevel(level) { return view diff --git a/internal/analysis/hits.go b/internal/analysis/hits.go index 36168573..40e62ddc 100644 --- a/internal/analysis/hits.go +++ b/internal/analysis/hits.go @@ -65,7 +65,7 @@ const hitsIterations = 40 // // then L2-normalises both vectors so the scores stay bounded. A nil // or empty graph yields an empty, safe-to-query result. 
-func ComputeHITS(g *graph.Graph) *HITSResult { +func ComputeHITS(g graph.Store) *HITSResult { if g == nil { return &HITSResult{Authorities: map[string]float64{}, Hubs: map[string]float64{}} } diff --git a/internal/analysis/impact.go b/internal/analysis/impact.go index d8f7dbb0..6f39974f 100644 --- a/internal/analysis/impact.go +++ b/internal/analysis/impact.go @@ -54,7 +54,7 @@ type ImpactResult struct { // edges, matching the live walk's behavior. Fall back to live BFS // when any seed lacks the index — the slow path is identical to the // pre-index implementation so consumer semantics never diverge. -func AnalyzeImpact(g *graph.Graph, symbolIDs []string, communities *CommunityResult, processes *ProcessResult) *ImpactResult { +func AnalyzeImpact(g graph.Store, symbolIDs []string, communities *CommunityResult, processes *ProcessResult) *ImpactResult { result := &ImpactResult{ ByDepth: make(map[int][]ImpactEntry), } @@ -95,7 +95,7 @@ func AnalyzeImpact(g *graph.Graph, symbolIDs []string, communities *CommunityRes // Determine risk level d1 := len(result.ByDepth[1]) d2 := len(result.ByDepth[2]) - result.Risk = assessRisk(d1, d2, len(result.TestFiles)) + result.Risk = assessRisk(d1, d2) // Find affected processes if processes != nil { @@ -174,7 +174,7 @@ func AnalyzeImpact(g *graph.Graph, symbolIDs []string, communities *CommunityRes // per discovered node, attributing the in-edge that introduced it to // EdgeConfidence / ConfidenceLabel. Kept as the always-correct // fallback for fillImpactFromReach. 
-func fillImpactLive(g *graph.Graph, result *ImpactResult, symbolIDs []string) { +func fillImpactLive(g graph.Store, result *ImpactResult, symbolIDs []string) { visited := make(map[string]bool) for _, id := range symbolIDs { visited[id] = true @@ -228,7 +228,7 @@ func fillImpactLive(g *graph.Graph, result *ImpactResult, symbolIDs []string) { // deterministic-by-shard-iteration choice closely enough for tests // that compare ByDepth ID sets, which is the contract consumers rely // on. EdgeConfidence is set from that representative edge. -func fillImpactFromReach(g *graph.Graph, result *ImpactResult, symbolIDs []string) bool { +func fillImpactFromReach(g graph.Store, result *ImpactResult, symbolIDs []string) bool { if len(symbolIDs) == 0 { return true } @@ -347,7 +347,7 @@ func filterHeuristicEntries(entries []ImpactEntry) []ImpactEntry { return kept } -func assessRisk(directDeps, transitiveDeps, testFiles int) RiskLevel { +func assessRisk(directDeps, transitiveDeps int) RiskLevel { if directDeps >= 10 || (directDeps >= 5 && transitiveDeps >= 20) { return RiskCritical } diff --git a/internal/analysis/impact_reach_test.go b/internal/analysis/impact_reach_test.go index 29c3a0fd..873235c2 100644 --- a/internal/analysis/impact_reach_test.go +++ b/internal/analysis/impact_reach_test.go @@ -215,12 +215,23 @@ func TestAnalyzeImpact_FastPathSubMillisecond(t *testing.T) { reach.BuildIndex(g) const absoluteCeiling = 15 * time.Millisecond - // Per BenchmarkAnalyzeImpact_FastPath vs LiveWalk the steady- - // state speedup on this fixture is ~1.8x. We gate at 1.3x to - // absorb wall-clock noise (short timed loops have more variance - // than the benchmark harness's adaptive sampling) while still - // catching a regression that drops in a live walk. - const minSpeedup = 1.3 + // The reach live walk (compute) now batches its whole-BFS-level + // edge + node fetches into GetInEdgesByNodeIDs / GetNodesByIDs + // instead of issuing one GetInEdges + one GetNode per node. 
On the + // in-memory backend those batched reads are nearly as cheap as the + // precomputed fast path (both are then dominated by the identical + // per-entry GetNode rendering in fillImpactFromReach), so the old + // ~1.8x relative speedup no longer holds here — it collapses to + // ~1.0x. The precompute's large win is now realised on disk + // backends (SQLite), where each per-node query the batching + // eliminates is a disk round-trip, not a map read. + // + // We therefore keep the absolute sub-ms guarantee (the user-facing + // contract: a blast-radius query stays interactive) and a loose + // regression guard that the fast path is not materially SLOWER than + // the batched live walk — without re-asserting the obsolete + // in-memory speedup premise. + const minSpeedup = 0.9 speedup := float64(avgLive) / float64(avgFast) t.Logf("AnalyzeImpact on 1000-caller fan-in: fast=%v live=%v speedup=%.2fx (over %d iters)", @@ -229,8 +240,11 @@ func TestAnalyzeImpact_FastPathSubMillisecond(t *testing.T) { if avgFast > absoluteCeiling { t.Errorf("fast-path AnalyzeImpact too slow: avg=%v (absolute ceiling=%v)", avgFast, absoluteCeiling) } + if avgLive > absoluteCeiling { + t.Errorf("live-walk AnalyzeImpact too slow: avg=%v (absolute ceiling=%v)", avgLive, absoluteCeiling) + } if speedup < minSpeedup { - t.Errorf("fast-path speedup regressed: %.2fx (want >= %.2fx)", speedup, minSpeedup) + t.Errorf("fast-path is materially slower than the live walk: %.2fx (want >= %.2fx)", speedup, minSpeedup) } } diff --git a/internal/analysis/incremental_communities.go b/internal/analysis/incremental_communities.go index c1bc4448..d7714518 100644 --- a/internal/analysis/incremental_communities.go +++ b/internal/analysis/incremental_communities.go @@ -76,7 +76,7 @@ type leidenGraph struct { // the resulting weighted graph. Returns nil when the graph has no // clustering-relevant edges — the caller then yields an empty // partition. 
-func buildLeidenGraph(g *graph.Graph) *leidenGraph { +func buildLeidenGraph(g graph.Store) *leidenGraph { nodes := g.AllNodes() edges := g.AllEdges() @@ -166,6 +166,18 @@ type LeidenPartitionCache struct { edgeIdentityRevisions int } +// PackageFingerprints returns the cached per-package fingerprint map. +// Callers MUST treat the returned value as read-only — it is the live +// map the cache reuses on the next call. Used by the MCP server to +// report total_packages from a cache hit without re-running the +// fingerprint pass. +func (c *LeidenPartitionCache) PackageFingerprints() map[string]uint64 { + if c == nil { + return nil + } + return c.pkgFingerprint +} + // IncrementalCommunityStats reports what the incremental path did on // a single call — useful for tests and for surfacing on the wire. type IncrementalCommunityStats struct { @@ -217,7 +229,7 @@ func packageKey(filePath string) string { // kind change, or edge added/removed/reweighted flips the // fingerprint of every package it touches and leaves all others // bit-identical. -func fingerprintPackages(g *graph.Graph) map[string]uint64 { +func fingerprintPackages(g graph.Store) map[string]uint64 { nodes := g.AllNodes() edges := g.AllEdges() @@ -315,7 +327,7 @@ func diffPackageFingerprints(old, cur map[string]uint64) map[string]bool { // - the graph's edge-provenance revision moved under the cache, or // - the changed-package fraction exceeds changedFractionFullRecompute. func DetectCommunitiesLeidenIncremental( - g *graph.Graph, + g graph.Store, cache *LeidenPartitionCache, ) (*CommunityResult, *LeidenPartitionCache, IncrementalCommunityStats) { curFP := fingerprintPackages(g) @@ -399,7 +411,7 @@ type incrementalResult struct { // community into the gain calculation but never move themselves, so // every unchanged package's assignment is preserved bit-for-bit. 
func incrementalLeiden( - g *graph.Graph, + g graph.Store, lg *leidenGraph, cache *LeidenPartitionCache, changedPkgs map[string]bool, diff --git a/internal/analysis/kcore.go b/internal/analysis/kcore.go new file mode 100644 index 00000000..60d9fab6 --- /dev/null +++ b/internal/analysis/kcore.go @@ -0,0 +1,155 @@ +package analysis + +import ( + "sort" + + "github.com/zzet/gortex/internal/graph" +) + +// KCoreHit is one row of the k-core decomposition output: a node +// plus its k-degree (the largest k for which it stays in the +// k-core after iterative degree-< k pruning). High k-degree +// signals a node sits inside a densely connected core; a chain of +// leaves all have k-degree 1, a triangle has k-degree 2, a +// 4-clique has k-degree 3. +type KCoreHit struct { + NodeID string + KDegree int +} + +// KCoreOptions filters the working set. Empty NodeKinds / +// EdgeKinds means "all kinds". Edges are treated as undirected +// (k-core is defined on undirected graphs). +type KCoreOptions struct { + NodeKinds []graph.NodeKind + EdgeKinds []graph.EdgeKind +} + +// ComputeKCore returns the k-core decomposition of g. Classic +// algorithm — Batagelj & Zaversnik 2003, O(V + E): +// +// 1. compute every node's undirected degree +// 2. process nodes in degree-ascending order +// 3. when a node is removed, decrement its still-present +// neighbours' degrees so they can be picked up at the right +// level +// +// Used as the fallback when the backing graph.Store does not +// implement graph.KCorer. +func ComputeKCore(g graph.Store, opts KCoreOptions) []KCoreHit { + if g == nil { + return nil + } + nodeAllow := makeComponentKindAllow(opts.NodeKinds) + edgeAllow := makeComponentEdgeAllow(opts.EdgeKinds) + + // Dense index over allowed nodes. 
+ nodes := g.AllNodes() + idx := make(map[string]int, len(nodes)) + dense := make([]string, 0, len(nodes)) + for _, n := range nodes { + if n == nil || !nodeAllow(n.Kind) { + continue + } + idx[n.ID] = len(dense) + dense = append(dense, n.ID) + } + if len(dense) == 0 { + return nil + } + + // Undirected adjacency; drop self-loops and dedupe parallel edges. + type edge struct{ a, b int } + seenEdge := make(map[edge]bool) + adj := make([][]int, len(dense)) + for _, e := range g.AllEdges() { + if e == nil || !edgeAllow(e.Kind) { + continue + } + i, ok1 := idx[e.From] + j, ok2 := idx[e.To] + if !ok1 || !ok2 || i == j { + continue + } + key := edge{i, j} + if i > j { + key = edge{j, i} + } + if seenEdge[key] { + continue + } + seenEdge[key] = true + adj[i] = append(adj[i], j) + adj[j] = append(adj[j], i) + } + + n := len(dense) + degree := make([]int, n) + maxDeg := 0 + for i := range dense { + degree[i] = len(adj[i]) + if degree[i] > maxDeg { + maxDeg = degree[i] + } + } + + // Bucket sort by degree (Batagelj & Zaversnik). bucket[d] + // holds dense-indices currently at degree d; pos[v] is v's + // position in its bucket. The buckets are drained in + // ascending degree order, which yields the peeling order. + bucket := make([][]int, maxDeg+1) + pos := make([]int, n) + for v, d := range degree { + pos[v] = len(bucket[d]) + bucket[d] = append(bucket[d], v) + } + + kdeg := make([]int, n) + processed := make([]bool, n) + for d := 0; d <= maxDeg; d++ { + for len(bucket[d]) > 0 { + // Pop the back of bucket[d] (O(1)). + v := bucket[d][len(bucket[d])-1] + bucket[d] = bucket[d][:len(bucket[d])-1] + if processed[v] { + continue + } + processed[v] = true + kdeg[v] = d + for _, w := range adj[v] { + if processed[w] { + continue + } + if degree[w] > d { + // Move w one bucket down. + old := degree[w] + // O(1) removal: swap with the back element + // of the old bucket and adjust its pos.
+ i := pos[w] + last := len(bucket[old]) - 1 + if i != last { + other := bucket[old][last] + bucket[old][i] = other + pos[other] = i + } + bucket[old] = bucket[old][:last] + degree[w] = old - 1 + pos[w] = len(bucket[degree[w]]) + bucket[degree[w]] = append(bucket[degree[w]], w) + } + } + } + } + + out := make([]KCoreHit, 0, n) + for v, id := range dense { + out = append(out, KCoreHit{NodeID: id, KDegree: kdeg[v]}) + } + sort.Slice(out, func(i, j int) bool { + if out[i].KDegree != out[j].KDegree { + return out[i].KDegree > out[j].KDegree + } + return out[i].NodeID < out[j].NodeID + }) + return out +} diff --git a/internal/analysis/kcore_test.go b/internal/analysis/kcore_test.go new file mode 100644 index 00000000..e341b761 --- /dev/null +++ b/internal/analysis/kcore_test.go @@ -0,0 +1,93 @@ +package analysis + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +func TestComputeKCore_KnownStructure(t *testing.T) { + // 4-clique + leaf attached to one of its members: + // a -- b + // | / | + // | / | + // c -- d + // | + // leaf + // Every clique node has k-degree 3 (the 4-clique is a 3-core); + // leaf has k-degree 1. 
+ g := graph.New() + for _, id := range []string{"a", "b", "c", "d", "leaf"} { + g.AddNode(&graph.Node{ID: id, Kind: graph.KindFunction, Name: id, FilePath: "x.go"}) + } + for _, e := range [][2]string{ + {"a", "b"}, {"a", "c"}, {"a", "d"}, + {"b", "c"}, {"b", "d"}, + {"c", "d"}, {"c", "leaf"}, + } { + g.AddEdge(&graph.Edge{From: e[0], To: e[1], Kind: graph.EdgeCalls, FilePath: "x.go"}) + } + + hits := ComputeKCore(g, KCoreOptions{}) + require.Len(t, hits, 5) + byID := map[string]int{} + for _, h := range hits { + byID[h.NodeID] = h.KDegree + } + for _, id := range []string{"a", "b", "c", "d"} { + assert.Equal(t, 3, byID[id], + "4-clique members should have k-degree 3; got %v", byID) + } + assert.Equal(t, 1, byID["leaf"], + "leaf should have k-degree 1; got %v", byID) +} + +func TestComputeKCore_LineGraph(t *testing.T) { + // 1 -- 2 -- 3 -- 4: every node has at most 2 neighbours, + // and after peeling the two endpoints the remaining pair + // drops below k=2, so k-degree is 1 across the board. 
+ g := graph.New() + for _, id := range []string{"1", "2", "3", "4"} { + g.AddNode(&graph.Node{ID: id, Kind: graph.KindFunction, Name: id, FilePath: "x.go"}) + } + for _, e := range [][2]string{ + {"1", "2"}, {"2", "3"}, {"3", "4"}, + } { + g.AddEdge(&graph.Edge{From: e[0], To: e[1], Kind: graph.EdgeCalls, FilePath: "x.go"}) + } + hits := ComputeKCore(g, KCoreOptions{}) + for _, h := range hits { + assert.Equal(t, 1, h.KDegree, + "line graph nodes all have k-degree 1; got %v", hits) + } +} + +func TestComputeKCore_EmptyGraph(t *testing.T) { + g := graph.New() + hits := ComputeKCore(g, KCoreOptions{}) + assert.Empty(t, hits) +} + +func TestComputeKCore_EdgeFilter(t *testing.T) { + g := graph.New() + for _, id := range []string{"a", "b", "c"} { + g.AddNode(&graph.Node{ID: id, Kind: graph.KindFunction, Name: id, FilePath: "x.go"}) + } + g.AddEdge(&graph.Edge{From: "a", To: "b", Kind: graph.EdgeCalls, FilePath: "x.go"}) + g.AddEdge(&graph.Edge{From: "b", To: "c", Kind: graph.EdgeReferences, FilePath: "x.go"}) + + // Only call edges survive — a-b stays, b-c drops. + hits := ComputeKCore(g, KCoreOptions{ + EdgeKinds: []graph.EdgeKind{graph.EdgeCalls}, + }) + byID := map[string]int{} + for _, h := range hits { + byID[h.NodeID] = h.KDegree + } + assert.Equal(t, 1, byID["a"]) + assert.Equal(t, 1, byID["b"]) + assert.Equal(t, 0, byID["c"], "c is isolated under the filter") +} diff --git a/internal/analysis/leiden.go b/internal/analysis/leiden.go index 425be412..55a64867 100644 --- a/internal/analysis/leiden.go +++ b/internal/analysis/leiden.go @@ -31,7 +31,7 @@ import ( // // Result has the same shape as DetectCommunities so the call site // can swap them out without other changes. 
-func DetectCommunitiesLeiden(g *graph.Graph) *CommunityResult { +func DetectCommunitiesLeiden(g graph.Store) *CommunityResult { result, _ := detectCommunitiesLeidenRaw(g) return result } @@ -45,7 +45,7 @@ func DetectCommunitiesLeiden(g *graph.Graph) *CommunityResult { // ids and drops singletons, neither of which can drive a restricted // re-optimization. The returned partition is nil when the graph has // no clustering-relevant edges (the result is then empty too). -func detectCommunitiesLeidenRaw(g *graph.Graph) (*CommunityResult, *leidenPartition) { +func detectCommunitiesLeidenRaw(g graph.Store) (*CommunityResult, *leidenPartition) { lg := buildLeidenGraph(g) if lg == nil { return &CommunityResult{NodeToComm: make(map[string]string)}, nil @@ -386,7 +386,7 @@ func leidenAggregate( // label / hub / disambiguation / parent-grouping pipeline so the UI // can render Leiden output identically. func buildCommunityResult( - g *graph.Graph, + g graph.Store, finalComm map[string]string, neighbors map[string]map[string]float64, totalWeight float64, diff --git a/internal/analysis/pagerank.go b/internal/analysis/pagerank.go index b39fdc24..afd65d4d 100644 --- a/internal/analysis/pagerank.go +++ b/internal/analysis/pagerank.go @@ -40,7 +40,7 @@ const ( // Dangling nodes (no outgoing call/reference edge — leaf utilities) // redistribute their mass uniformly each iteration so the scores stay // a proper probability distribution. -func ComputePageRank(g *graph.Graph) *PageRankResult { +func ComputePageRank(g graph.Store) *PageRankResult { if g == nil { return &PageRankResult{Scores: map[string]float64{}} } diff --git a/internal/analysis/processes.go b/internal/analysis/processes.go index 1f9463cf..468047b2 100644 --- a/internal/analysis/processes.go +++ b/internal/analysis/processes.go @@ -37,7 +37,7 @@ type ProcessResult struct { } // DiscoverProcesses finds execution flows by identifying entry points and tracing forward. 
-func DiscoverProcesses(g *graph.Graph) *ProcessResult { +func DiscoverProcesses(g graph.Store) *ProcessResult { nodes := g.AllNodes() edges := g.AllEdges() diff --git a/internal/analysis/scaffold.go b/internal/analysis/scaffold.go index 98211834..175bf892 100644 --- a/internal/analysis/scaffold.go +++ b/internal/analysis/scaffold.go @@ -20,7 +20,7 @@ import ( // // This interface avoids a circular dependency with the indexer package. type SourceReader interface { - Graph() *graph.Graph + Graph() graph.Store ResolveFilePath(graphPath string) string } @@ -152,7 +152,7 @@ func filterCallerNodes(sg *query.SubGraph, exampleID string) []*graph.Node { // generateRegistrationCode creates a registration/wiring edit by analyzing how // the example symbol is called by its depth-1 callers. -func generateRegistrationCode(g *graph.Graph, callers []*graph.Node, example *graph.Node, newName string) *ScaffoldEdit { +func generateRegistrationCode(g graph.Store, callers []*graph.Node, example *graph.Node, newName string) *ScaffoldEdit { if len(callers) == 0 { return nil } @@ -190,7 +190,7 @@ func generateRegistrationCode(g *graph.Graph, callers []*graph.Node, example *gr // generateTestStub creates a test stub edit by finding the test file and test // functions associated with the example symbol. -func generateTestStub(g *graph.Graph, reader SourceReader, example *graph.Node, newName string) *ScaffoldEdit { +func generateTestStub(g graph.Store, reader SourceReader, example *graph.Node, newName string) *ScaffoldEdit { testFilePath := deriveTestFilePath(example.FilePath) // Check if the test file exists on disk. 
Resolve abs path through diff --git a/internal/analysis/scaffold_test.go b/internal/analysis/scaffold_test.go index 46f7c855..1ddd0d1b 100644 --- a/internal/analysis/scaffold_test.go +++ b/internal/analysis/scaffold_test.go @@ -22,7 +22,7 @@ type mockSourceReader struct { rootPath string } -func (m *mockSourceReader) Graph() *graph.Graph { return m.g } +func (m *mockSourceReader) Graph() graph.Store { return m.g } func (m *mockSourceReader) ResolveFilePath(relPath string) string { if filepath.IsAbs(relPath) { return relPath diff --git a/internal/analysis/spectral.go b/internal/analysis/spectral.go index 65b60a6f..fdae9cdd 100644 --- a/internal/analysis/spectral.go +++ b/internal/analysis/spectral.go @@ -33,7 +33,7 @@ const ( // // The result has the same shape as DetectCommunities so analyze // kind=clusters can swap algorithms transparently. -func SpectralClusters(g *graph.Graph) *CommunityResult { +func SpectralClusters(g graph.Store) *CommunityResult { nodes := g.AllNodes() edges := g.AllEdges() diff --git a/internal/artifacts/artifacts.go b/internal/artifacts/artifacts.go index 07de87d7..46ef489f 100644 --- a/internal/artifacts/artifacts.go +++ b/internal/artifacts/artifacts.go @@ -56,7 +56,7 @@ type Artifact struct { // repoPrefix scopes node IDs / paths in a multi-repo graph; pass "" // for a single-repo graph. Best-effort — missing or unreadable files // are skipped rather than failing the whole pass. -func Materialize(g *graph.Graph, root string, entries []config.ArtifactEntry, repoPrefix string) []Artifact { +func Materialize(g graph.Store, root string, entries []config.ArtifactEntry, repoPrefix string) []Artifact { if g == nil || root == "" || len(entries) == 0 { return nil } @@ -81,7 +81,7 @@ func Materialize(g *graph.Graph, root string, entries []config.ArtifactEntry, re } // materializeOne reads one artifact file and projects it onto the graph. 
-func materializeOne(g *graph.Graph, root, rel string, entry config.ArtifactEntry, repoPrefix string, nameIndex map[string][]string) (Artifact, bool) { +func materializeOne(g graph.Store, root, rel string, entry config.ArtifactEntry, repoPrefix string, nameIndex map[string][]string) (Artifact, bool) { data, err := os.ReadFile(filepath.Join(root, rel)) if err != nil { return Artifact{}, false @@ -147,7 +147,7 @@ func materializeOne(g *graph.Graph, root, rel string, entry config.ArtifactEntry // buildSymbolIndex maps every sufficiently-long symbol name to the // node IDs that declare it, scoped to repoPrefix. -func buildSymbolIndex(g *graph.Graph, repoPrefix string) map[string][]string { +func buildSymbolIndex(g graph.Store, repoPrefix string) map[string][]string { index := make(map[string][]string) for _, n := range g.AllNodes() { switch n.Kind { diff --git a/internal/blame/blame.go b/internal/blame/blame.go index 5d2e28a8..c1bea744 100644 --- a/internal/blame/blame.go +++ b/internal/blame/blame.go @@ -46,13 +46,27 @@ type Author struct { Timestamp time.Time // author-time } -// Run executes `git blame -p` on the file and returns a map from -// 1-based line number to Author. errors include both git invocation -// failures (file not in repo, repo not initialised) and parse -// failures. Callers may treat any error as "skip this file" — the -// enrichment pass is best-effort. +// Run executes `git blame -p` on the file at the current worktree +// (HEAD) and returns a map from 1-based line number to Author. errors +// include both git invocation failures (file not in repo, repo not +// initialised) and parse failures. Callers may treat any error as +// "skip this file" — the enrichment pass is best-effort. func Run(repoRoot, relPath string) (map[int]Author, error) { - cmd := exec.Command("git", "-C", repoRoot, "blame", "-p", "--", relPath) + return RunAt(repoRoot, "", relPath) +} + +// RunAt is Run with an explicit revision (branch / tag / SHA). Pass +// "" for HEAD. 
Used by enrichments that must blame the default branch +// regardless of the user's current checkout — e.g. the churn enricher +// pinning to `origin/main` so feature-branch work-in-progress doesn't +// pollute the persisted data. +func RunAt(repoRoot, rev, relPath string) (map[int]Author, error) { + args := []string{"-C", repoRoot, "blame", "-p"} + if rev != "" { + args = append(args, rev) + } + args = append(args, "--", relPath) + cmd := exec.Command("git", args...) out, err := cmd.Output() if err != nil { return nil, fmt.Errorf("git blame %s: %w", relPath, err) @@ -189,7 +203,7 @@ func PersonNodeID(email string) string { return "team::" + strings.ToLower(strings.TrimSpace(email)) } -func EnrichGraph(g *graph.Graph, repoRoot string) (int, error) { +func EnrichGraph(g graph.Store, repoRoot string) (int, error) { if g == nil || repoRoot == "" { return 0, nil } @@ -212,6 +226,20 @@ func EnrichGraph(g *graph.Graph, repoRoot string) (int, error) { } enriched := 0 + // Symbol nodes we stamp meta.last_authored on. They must be + // round-tripped back through the store at the end: on the in-memory + // backend the in-place mutation already persists (n is canonical), + // but on disk backends (SQLite) n is a per-call AllNodes + // reconstruction, so without the write-back the last_authored stamp + // is silently discarded — leaving stale_code / ownership / + // health_score's recency axis empty on the disk backend even after + // a successful `gortex enrich blame`. (The person nodes and + // EdgeAuthored edges below already persist via AddNode/AddEdge; only + // the symbol-node Meta was being dropped.) Mirrors the reach index, + // coverage, and releases enrichers. + var stamped []*graph.Node + blameWriter, useBlameSidecar := g.(graph.BlameEnrichmentWriter) + var blameRows []graph.BlameEnrichment // Person nodes are deduplicated within this enrichment pass. 
// IDs are repo-scoped: in multi-repo mode the same email touching // two repos becomes two distinct KindTeam nodes so per-repo @@ -227,13 +255,22 @@ func EnrichGraph(g *graph.Graph, repoRoot string) (int, error) { if latest == nil { continue } - if n.Meta == nil { - n.Meta = map[string]any{} - } - n.Meta["last_authored"] = map[string]any{ - "commit": latest.Commit, - "email": latest.Email, - "timestamp": latest.Timestamp.Unix(), + if useBlameSidecar { + blameRows = append(blameRows, graph.BlameEnrichment{ + NodeID: n.ID, RepoPrefix: n.RepoPrefix, + Commit: latest.Commit, Email: latest.Email, + Timestamp: latest.Timestamp.Unix(), + }) + } else { + if n.Meta == nil { + n.Meta = map[string]any{} + } + n.Meta["last_authored"] = map[string]any{ + "commit": latest.Commit, + "email": latest.Email, + "timestamp": latest.Timestamp.Unix(), + } + stamped = append(stamped, n) } enriched++ @@ -277,6 +314,22 @@ func EnrichGraph(g *graph.Graph, repoRoot string) (int, error) { g.AddEdge(edge) } } + // Persist the symbol-node last_authored stamps in one batch (the + // durable write on disk backends; an idempotent re-insert on the + // in-memory backend). 
+ if useBlameSidecar && len(blameRows) > 0 { + byPrefix := map[string][]graph.BlameEnrichment{} + for _, r := range blameRows { + byPrefix[r.RepoPrefix] = append(byPrefix[r.RepoPrefix], r) + } + for prefix, rr := range byPrefix { + if err := blameWriter.BulkSetBlame(prefix, rr); err != nil { + return enriched, fmt.Errorf("blame: persist sidecar: %w", err) + } + } + } else if len(stamped) > 0 { + g.AddBatch(stamped, nil) + } return enriched, nil } diff --git a/internal/blame/blame_test.go b/internal/blame/blame_test.go index fea6f285..9a833583 100644 --- a/internal/blame/blame_test.go +++ b/internal/blame/blame_test.go @@ -156,19 +156,26 @@ func TestEnrichGraph_StampsLastAuthored(t *testing.T) { t.Errorf("expected 1 enriched node, got %d", count) } - n := g.GetNode("main.go::Hello") - la, ok := n.Meta["last_authored"].(map[string]any) + // last_authored now persists in the typed sidecar (change A), not Meta. + byID := map[string]graph.BlameEnrichment{} + for _, e := range g.BlameRows("") { + byID[e.NodeID] = e + } + la, ok := byID["main.go::Hello"] if !ok { - t.Fatalf("last_authored missing or wrong shape: %+v", n.Meta) + t.Fatalf("blame row for main.go::Hello missing from sidecar; rows=%+v", byID) + } + if la.Email != "test@example.com" { + t.Errorf("email = %v", la.Email) } - if la["email"] != "test@example.com" { - t.Errorf("email = %v", la["email"]) + if la.Commit == "" { + t.Errorf("commit empty") } - if _, ok := la["commit"].(string); !ok { - t.Errorf("commit not a string: %v", la["commit"]) + if la.Timestamp == 0 { + t.Errorf("timestamp zero") } - if _, ok := la["timestamp"].(int64); !ok { - t.Errorf("timestamp not int64: %T %v", la["timestamp"], la["timestamp"]) + if _, present := g.GetNode("main.go::Hello").Meta["last_authored"]; present { + t.Errorf("last_authored must not remain in Node.Meta after sidecar migration") } } diff --git a/internal/churn/churn.go b/internal/churn/churn.go new file mode 100644 index 00000000..3503dc59 --- /dev/null +++ 
b/internal/churn/churn.go @@ -0,0 +1,443 @@ +// Package churn computes per-symbol and per-file commit density from +// the git log of a chosen branch (typically the default branch) and +// persists the result on graph nodes, or in the store's typed churn +// sidecar when the store provides one. Once enriched, the MCP tool +// get_churn_rate is a pure graph scan — no `git` subprocess at read +// time. The graph store is the source of truth; the disk-backed +// SQLite backend keeps the data across daemon restarts, while +// in-memory backends recompute on demand. +// +// Design notes: +// +// - We blame at an explicit rev (the default branch) rather than +// HEAD. Feature-branch work-in-progress doesn't pollute the +// persisted churn signal — the data answers "what's churning on +// main" regardless of where the agent is checked out. +// +// - Per-file blame is invoked once and projected onto every symbol +// in the file. The repo walk inside `git blame` dominates the +// cost; per-symbol invocations would multiply it by the symbol +// count. +// +// - On stores without a churn sidecar (graph.ChurnEnrichmentWriter), +// after mutating n.Meta we re-call g.AddNode(n). The in-memory +// store treats this as a no-op (the pointer is already in the +// graph); the disk backend treats it as an UPSERT that +// re-serialises Meta to its on-disk row. This is the only path +// that persists Meta mutations to disk — without it the +// enrichment would be invisible on the next daemon restart. +package churn + +import ( + "bufio" + "bytes" + "context" + "fmt" + "os/exec" + "path/filepath" + "strconv" + "strings" + "time" + + "github.com/zzet/gortex/internal/blame" + "github.com/zzet/gortex/internal/graph" +) + +// Options controls how the enricher resolves and persists churn data. +type Options struct { + // Branch is the rev to blame and log. Required — call site is + // expected to resolve the repo's default branch (origin/main, + // origin/master, …) and pass it in. We do not default to HEAD + // because the whole point of pre-computation is to pin the + // signal to a stable branch.
+ Branch string + // Now lets tests fix the clock for deterministic age_days. When + // zero, time.Now() is used. + Now time.Time +} + +// Result summarises an enrichment pass. +type Result struct { + Files int // file nodes stamped with a churn summary + Symbols int // function/method nodes stamped with per-symbol churn + Branch string // the rev used (echoed back for the CLI) + HeadSHA string // the resolved SHA at enrich time (stored on each file) +} + +// EnrichGraph computes per-symbol and per-file churn and stamps the +// data on graph nodes. Returns counts plus the resolved SHA. Errors +// when a required argument is missing, the branch can't be resolved, +// ctx is cancelled, or the sidecar write fails; per-file git failures +// are best-effort and skip that file. +// +// Persistence: when the store implements graph.ChurnEnrichmentWriter +// the rows are written in bulk through BulkSetChurn and the churn keys +// are removed from Node.Meta; otherwise every mutated node is +// re-upserted via g.AddNode(n). On disk-backed stores this round-trips +// through the store's upsert path; on the in-memory store the pointer +// was already mutated in place, but the redundant AddNode call keeps +// the semantics uniform between backends and lets the enricher run +// against either. +func EnrichGraph(ctx context.Context, g graph.Store, repoRoot string, opts Options) (Result, error) { + if g == nil || repoRoot == "" { + return Result{}, fmt.Errorf("churn: graph and repoRoot are required") + } + if strings.TrimSpace(opts.Branch) == "" { + return Result{}, fmt.Errorf("churn: Options.Branch is required (default-branch resolution belongs to the caller)") + } + now := opts.Now + if now.IsZero() { + now = time.Now() + } + headSHA := runGit(repoRoot, "rev-parse", "--verify", "--quiet", opts.Branch) + if headSHA == "" { + return Result{}, fmt.Errorf("churn: branch %q does not resolve in %s", opts.Branch, repoRoot) + } + + // Group symbols by file path. We deliberately keep the file node in + // its own bucket field so we can stamp its summary even when no + // function/method is in scope (some files contain only types or + // constants).
+ type bucket struct { + file *graph.Node // optional — may be nil + symbols []*graph.Node + } + byPath := map[string]*bucket{} + for _, n := range g.AllNodes() { + if n.FilePath == "" { + continue + } + switch n.Kind { + case graph.KindFile: + b := byPath[n.FilePath] + if b == nil { + b = &bucket{} + byPath[n.FilePath] = b + } + b.file = n + case graph.KindFunction, graph.KindMethod: + if n.StartLine == 0 { + continue + } + b := byPath[n.FilePath] + if b == nil { + b = &bucket{} + byPath[n.FilePath] = b + } + b.symbols = append(b.symbols, n) + } + } + + res := Result{Branch: opts.Branch, HeadSHA: headSHA} + churnWriter, useChurnSidecar := g.(graph.ChurnEnrichmentWriter) + var churnRows []graph.ChurnEnrichment + for filePath, b := range byPath { + if err := ctx.Err(); err != nil { + return res, err + } + if len(b.symbols) == 0 && b.file == nil { + continue + } + rel := stripRepoPrefix(filePath, repoRoot) + commits, err := fileCommits(repoRoot, opts.Branch, rel) + if err != nil || len(commits) == 0 { + continue + } + var blameLines map[int]blame.Author + if len(b.symbols) > 0 { + blameLines, _ = blame.RunAt(repoRoot, opts.Branch, rel) + } + + // File summary: aggregate across all commits. + if b.file != nil { + stampFileChurn(b.file, commits, headSHA, opts.Branch, now) + if useChurnSidecar { + churnRows = append(churnRows, churnEnrichmentFromNode(b.file)) + delete(b.file.Meta, "churn") + delete(b.file.Meta, "churn_meta") + } else { + g.AddNode(b.file) + } + res.Files++ + } + + if len(blameLines) == 0 { + continue + } + // Per-symbol: project blame line range, then look up each + // commit's timestamp/author in the commits map. Falls back + // to blame timestamps when the commit isn't in the log + // (shallow clones, signed-off cherry-picks). 
+ for _, s := range b.symbols { + if stampSymbolChurn(s, blameLines, commits, now) { + if useChurnSidecar { + churnRows = append(churnRows, churnEnrichmentFromNode(s)) + delete(s.Meta, "churn") + } else { + g.AddNode(s) + } + res.Symbols++ + } + } + } + // Sidecar persist (change A): when the backend implements + // ChurnEnrichmentWriter, churn rides in the typed churn_enrichment + // table instead of nodes.meta, so the node hot path stops gob- + // encoding it and get_churn_rate reads via an index. Grouped by + // repo prefix since BulkSetChurn stamps one prefix per call. + if useChurnSidecar && len(churnRows) > 0 { + byPrefix := map[string][]graph.ChurnEnrichment{} + for _, r := range churnRows { + byPrefix[r.RepoPrefix] = append(byPrefix[r.RepoPrefix], r) + } + for prefix, rr := range byPrefix { + if err := churnWriter.BulkSetChurn(prefix, rr); err != nil { + return res, fmt.Errorf("churn: persist sidecar: %w", err) + } + } + } + + return res, nil +} + +// commitRecord is one row of `git log --format=%H|%ct|%ae`. +type commitRecord struct { + SHA string + When time.Time + Email string +} + +// fileCommits returns the commit history for relPath on branch. +// Ordered newest → oldest. Empty slice when the file has no history +// on that branch (untracked, or the rev predates the file). 
+func fileCommits(repoRoot, branch, relPath string) ([]commitRecord, error) { + cmd := exec.Command("git", "-C", repoRoot, "log", branch, + "--no-merges", "--follow", "--format=%H|%ct|%ae", "--", relPath) + out, err := cmd.Output() + if err != nil { + return nil, err + } + var records []commitRecord + scanner := bufio.NewScanner(bytes.NewReader(out)) + scanner.Buffer(make([]byte, 64*1024), 8*1024*1024) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" { + continue + } + parts := strings.SplitN(line, "|", 3) + if len(parts) != 3 { + continue + } + ts, err := strconv.ParseInt(parts[1], 10, 64) + if err != nil { + continue + } + records = append(records, commitRecord{ + SHA: parts[0], + When: time.Unix(ts, 0), + Email: parts[2], + }) + } + return records, scanner.Err() +} + +// churnEnrichmentFromNode projects the freshly-stamped Meta["churn"] / +// Meta["churn_meta"] payload into a typed ChurnEnrichment row for the +// sidecar. The stamp functions write int/float64 directly (no JSON +// widening at this point), so the type assertions are exact. +func churnEnrichmentFromNode(n *graph.Node) graph.ChurnEnrichment { + e := graph.ChurnEnrichment{NodeID: n.ID, RepoPrefix: n.RepoPrefix} + if m, ok := n.Meta["churn"].(map[string]any); ok { + if v, ok := m["commit_count"].(int); ok { + e.CommitCount = v + } + if v, ok := m["age_days"].(int); ok { + e.AgeDays = v + } + if v, ok := m["churn_rate"].(float64); ok { + e.ChurnRate = v + } + e.LastAuthor, _ = m["last_author"].(string) + e.LastCommitAt, _ = m["last_commit_at"].(string) + } + if m, ok := n.Meta["churn_meta"].(map[string]any); ok { + e.HeadSHA, _ = m["head_sha"].(string) + e.Branch, _ = m["branch"].(string) + e.ComputedAt, _ = m["computed_at"].(string) + } + return e +} + +// stampFileChurn writes the file-level summary onto n.Meta["churn"] +// and pins enrichment provenance under n.Meta["churn_meta"]. 
+func stampFileChurn(n *graph.Node, commits []commitRecord, headSHA, branch string, now time.Time) { + if n.Meta == nil { + n.Meta = map[string]any{} + } + commitCount := len(commits) + first := commits[len(commits)-1].When + last := commits[0].When + ageDays := int(now.Sub(first).Hours() / 24) + activeDays := ageDays + if activeDays < 1 { + activeDays = 1 + } + n.Meta["churn"] = map[string]any{ + "commit_count": commitCount, + "age_days": ageDays, + "churn_rate": roundTwo(float64(commitCount) / float64(activeDays)), + "last_author": commits[0].Email, + "last_commit_at": last.UTC().Format(time.RFC3339), + } + n.Meta["churn_meta"] = map[string]any{ + "head_sha": headSHA, + "branch": branch, + "computed_at": now.UTC().Format(time.RFC3339), + } +} + +// stampSymbolChurn projects the file's blame onto the symbol's line +// range and stamps n.Meta["churn"]. Returns true when the symbol's +// range had at least one blamed line — false when blame produced no +// coverage (uncommitted lines or the file is untracked at the rev). +func stampSymbolChurn(n *graph.Node, blameLines map[int]blame.Author, commits []commitRecord, now time.Time) bool { + endLine := n.EndLine + if endLine == 0 { + endLine = n.StartLine + } + commitsSeen := map[string]struct{}{} + var oldest, newest time.Time + latestEmail := "" + for line := n.StartLine; line <= endLine; line++ { + a, ok := blameLines[line] + if !ok { + continue + } + commitsSeen[a.Commit] = struct{}{} + if oldest.IsZero() || a.Timestamp.Before(oldest) { + oldest = a.Timestamp + } + if newest.IsZero() || a.Timestamp.After(newest) { + newest = a.Timestamp + latestEmail = a.Email + } + } + if len(commitsSeen) == 0 { + return false + } + // Prefer the canonical author email from the log over the blame + // author email when both exist — `git log` carries the merged-in + // author identity, while blame may show the original + // pre-rebase author. 
+ if email := latestAuthorFromCommits(commitsSeen, commits); email != "" { + latestEmail = email + } + ageDays := 0 + if !oldest.IsZero() { + ageDays = int(now.Sub(oldest).Hours() / 24) + } + activeDays := ageDays + if activeDays < 1 { + activeDays = 1 + } + if n.Meta == nil { + n.Meta = map[string]any{} + } + n.Meta["churn"] = map[string]any{ + "commit_count": len(commitsSeen), + "age_days": ageDays, + "churn_rate": roundTwo(float64(len(commitsSeen)) / float64(activeDays)), + "last_author": latestEmail, + "last_commit_at": newest.UTC().Format(time.RFC3339), + } + return true +} + +// latestAuthorFromCommits picks the email of the most-recent commit +// that touches the symbol's range, using the per-file log as the +// authority for author identity (blame can lag a rebase / cherry-pick). +func latestAuthorFromCommits(commitsSeen map[string]struct{}, commits []commitRecord) string { + for _, c := range commits { + if _, ok := commitsSeen[c.SHA]; ok { + return c.Email + } + } + return "" +} + +// roundTwo rounds to two decimals so the JSON output stays compact +// — single-digit precision swallows the difference between 0.03 and +// 0.04 churn-per-day, which matters for ranking. +func roundTwo(v float64) float64 { + return float64(int64(v*100+0.5)) / 100 +} + +// stripRepoPrefix removes a leading repo segment from multi-repo +// indexer paths so the path we hand to git is repo-relative. Mirrors +// the helper in internal/blame; duplicated rather than exported +// because the blame copy is unexported by design. 
+func stripRepoPrefix(filePath, repoRoot string) string { + if !strings.Contains(filePath, "/") { + return filePath + } + if _, err := exec.LookPath("git"); err != nil { + return filePath + } + abs := filepath.Join(repoRoot, filePath) + if fileExists(abs) { + return filePath + } + if idx := strings.Index(filePath, "/"); idx >= 0 { + trimmed := filePath[idx+1:] + if fileExists(filepath.Join(repoRoot, trimmed)) { + return trimmed + } + } + return filePath +} + +var fileExists = func(path string) bool { + cmd := exec.Command("test", "-f", path) + return cmd.Run() == nil +} + +// runGit shells out and returns trimmed stdout, or "" on error. Used +// only for the one-shot rev-parse; full enrichment calls go through +// fileCommits / blame.RunAt directly. +func runGit(repoRoot string, args ...string) string { + cmd := exec.Command("git", append([]string{"-C", repoRoot}, args...)...) + out, err := cmd.Output() + if err != nil { + return "" + } + return strings.TrimSpace(string(out)) +} + +// DefaultBranch returns the repository's default branch as a +// rev-parseable reference (preferring "origin/" when an upstream +// is configured, falling back to a local branch when not). Returns "" +// when none of the candidates resolve — the caller is then expected +// to surface a clear error rather than silently picking the current +// branch (feature branches must not pollute the persisted data). +// +// Exposed so MCP-side enrich handlers can resolve the same branch +// the CLI does without duplicating the probe order across packages. +func DefaultBranch(repoRoot string) string { + probe := func(args ...string) (string, bool) { + cmd := exec.Command("git", append([]string{"-C", repoRoot}, args...)...) 
+ out, err := cmd.Output() + if err != nil { + return "", false + } + return strings.TrimSpace(string(out)), true + } + if ref, ok := probe("symbolic-ref", "--short", "refs/remotes/origin/HEAD"); ok && ref != "" { + return ref + } + for _, candidate := range []string{"origin/main", "origin/master", "origin/trunk"} { + if _, ok := probe("rev-parse", "--verify", "--quiet", candidate); ok { + return candidate + } + } + for _, candidate := range []string{"main", "master", "trunk"} { + if _, ok := probe("rev-parse", "--verify", "--quiet", candidate); ok { + return candidate + } + } + return "" +} diff --git a/internal/churn/churn_test.go b/internal/churn/churn_test.go new file mode 100644 index 00000000..6accacbf --- /dev/null +++ b/internal/churn/churn_test.go @@ -0,0 +1,205 @@ +package churn + +import ( + "context" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/zzet/gortex/internal/graph" +) + +func TestEnrichGraph_StampsSymbolAndFile(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not available") + } + + repoDir := initRepo(t) + writeAndCommit(t, repoDir, "main.go", "package main\n\nfunc Hello() {}\n", "initial") + // Touch the file twice more so churn_rate is non-trivial. 
+ writeAndCommit(t, repoDir, "main.go", "package main\n\nfunc Hello() { _ = 1 }\n", "second") + writeAndCommit(t, repoDir, "main.go", "package main\n\nfunc Hello() { _ = 2 }\n", "third") + + g := graph.New() + g.AddNode(&graph.Node{ + ID: "main.go", Kind: graph.KindFile, Name: "main.go", FilePath: "main.go", + }) + g.AddNode(&graph.Node{ + ID: "main.go::Hello", + Kind: graph.KindFunction, + Name: "Hello", + FilePath: "main.go", + StartLine: 3, EndLine: 3, + }) + + res, err := EnrichGraph(context.Background(), g, repoDir, Options{ + Branch: currentBranch(t, repoDir), + Now: time.Now(), + }) + if err != nil { + t.Fatalf("enrich: %v", err) + } + if res.Files != 1 || res.Symbols != 1 { + t.Errorf("res = %+v, want Files=1 Symbols=1", res) + } + if res.HeadSHA == "" { + t.Error("HeadSHA should be set") + } + + // Churn now persists in the typed sidecar (change A), not Node.Meta. + byID := map[string]graph.ChurnEnrichment{} + for _, e := range g.ChurnRows("") { + byID[e.NodeID] = e + } + + fileChurn, ok := byID["main.go"] + if !ok { + t.Fatalf("file churn row missing from sidecar; rows=%+v", byID) + } + if fileChurn.CommitCount != 3 { + t.Errorf("file commit_count = %d, want 3", fileChurn.CommitCount) + } + if fileChurn.ChurnRate == 0 { + t.Errorf("file churn_rate missing") + } + if fileChurn.HeadSHA == "" || fileChurn.Branch == "" { + t.Errorf("file churn provenance (head_sha/branch) missing: %+v", fileChurn) + } + // Meta must NOT carry churn anymore — it moved to the sidecar. 
+ if _, present := g.GetNode("main.go").Meta["churn"]; present { + t.Errorf("churn must not remain in Node.Meta after sidecar migration") + } + + symChurn, ok := byID["main.go::Hello"] + if !ok { + t.Fatalf("symbol churn row missing from sidecar") + } + if symChurn.CommitCount < 1 { + t.Errorf("symbol commit_count = %d, want >= 1", symChurn.CommitCount) + } + if symChurn.LastAuthor == "" { + t.Errorf("symbol last_author missing: %+v", symChurn) + } +} + +func TestEnrichGraph_SkipsFilesWithNoHistory(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not available") + } + + repoDir := initRepo(t) + writeAndCommit(t, repoDir, "main.go", "package main\n\nfunc Hello() {}\n", "initial") + + g := graph.New() + // Refer to a file that exists on disk but isn't tracked by git. + if err := os.WriteFile(filepath.Join(repoDir, "untracked.go"), []byte("package main\n"), 0o644); err != nil { + t.Fatal(err) + } + g.AddNode(&graph.Node{ID: "untracked.go", Kind: graph.KindFile, FilePath: "untracked.go"}) + + res, err := EnrichGraph(context.Background(), g, repoDir, Options{ + Branch: currentBranch(t, repoDir), + }) + if err != nil { + t.Fatalf("enrich: %v", err) + } + if res.Files != 0 || res.Symbols != 0 { + t.Errorf("untracked file should yield no stamps, got %+v", res) + } +} + +func TestEnrichGraph_RequiresBranch(t *testing.T) { + g := graph.New() + _, err := EnrichGraph(context.Background(), g, "/tmp/anywhere", Options{}) + if err == nil { + t.Fatal("expected error when Branch is empty") + } + if !strings.Contains(err.Error(), "Branch is required") { + t.Errorf("unexpected error: %v", err) + } +} + +func TestEnrichGraph_RejectsUnresolvableBranch(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not available") + } + repoDir := initRepo(t) + writeAndCommit(t, repoDir, "main.go", "package main\n", "initial") + + g := graph.New() + _, err := EnrichGraph(context.Background(), g, repoDir, Options{Branch: "does-not-exist"}) + 
if err == nil { + t.Fatal("expected error when branch does not resolve") + } +} + +func TestRoundTwo(t *testing.T) { + cases := []struct { + in float64 + want float64 + }{ + {0.0, 0.0}, + {0.125, 0.13}, + {1.0 / 3.0, 0.33}, + {99.999, 100.0}, + } + for _, c := range cases { + if got := roundTwo(c.in); got != c.want { + t.Errorf("roundTwo(%v) = %v, want %v", c.in, got, c.want) + } + } +} + +// --- helpers --- + +func initRepo(t *testing.T) string { + t.Helper() + dir := t.TempDir() + for _, args := range [][]string{ + {"init", "-q", "-b", "main"}, + {"config", "user.email", "test@example.com"}, + {"config", "user.name", "Tester"}, + {"config", "commit.gpgsign", "false"}, + } { + cmd := exec.Command("git", args...) + cmd.Dir = dir + if out, err := cmd.CombinedOutput(); err != nil { + t.Fatalf("git %v: %v\n%s", args, err, out) + } + } + return dir +} + +func writeAndCommit(t *testing.T, dir, rel, body, msg string) { + t.Helper() + if err := os.WriteFile(filepath.Join(dir, rel), []byte(body), 0o644); err != nil { + t.Fatal(err) + } + add := exec.Command("git", "add", rel) + add.Dir = dir + if out, err := add.CombinedOutput(); err != nil { + t.Fatalf("git add: %v\n%s", err, out) + } + commit := exec.Command("git", "commit", "-q", "-m", msg) + commit.Dir = dir + commit.Env = append(commit.Environ(), + "GIT_AUTHOR_NAME=Tester", "GIT_AUTHOR_EMAIL=test@example.com", + "GIT_COMMITTER_NAME=Tester", "GIT_COMMITTER_EMAIL=test@example.com") + if out, err := commit.CombinedOutput(); err != nil { + t.Fatalf("git commit: %v\n%s", err, out) + } +} + +func currentBranch(t *testing.T, dir string) string { + t.Helper() + cmd := exec.Command("git", "rev-parse", "--abbrev-ref", "HEAD") + cmd.Dir = dir + out, err := cmd.Output() + if err != nil { + t.Fatalf("rev-parse: %v", err) + } + return strings.TrimSpace(string(out)) +} diff --git a/internal/clones/cms.go b/internal/clones/cms.go index 3e258b18..fd5d9d4a 100644 --- a/internal/clones/cms.go +++ b/internal/clones/cms.go @@ -89,6 +89,27 
@@ func (c *CMS) Add(x uint64) { } } +// Decrement decreases the counters for x by one across every hash row, +// flooring each at 0: a counter already at 0 is left untouched. It is +// the inverse of Add for incremental maintenance — when a body leaves +// the corpus its shingle hashes are decremented so the boilerplate +// estimate tracks the live set instead of growing monotonically. +// +// Decrement is only sound for keys that were previously Added. A +// never-added key whose rows all sit at 0 is a no-op, but if any of its +// rows is shared with another key, that counter is decremented too and +// the other key's Count can fall below its true frequency. When every +// Decrement matches an earlier Add of the same key, Count stays an +// upper bound on the true count, just as it is after Add. +func (c *CMS) Decrement(x uint64) { + for i := 0; i < c.depth; i++ { + idx := cmsHash(x, c.seeds[i]) & c.mask + if c.counts[i][idx] > 0 { + c.counts[i][idx]-- + } + } +} + // Count returns the minimum across all hash rows — the canonical CMS // frequency estimate. The result is an upper bound on the true count. func (c *CMS) Count(x uint64) uint32 { diff --git a/internal/clones/lsh.go b/internal/clones/lsh.go index eea6ab28..a5e681d5 100644 --- a/internal/clones/lsh.go +++ b/internal/clones/lsh.go @@ -91,6 +91,86 @@ func (ix *Index) Add(id string, sig Signature) { } } +// Remove deletes an item from the index, undoing a prior Add of the +// same ID. If the ID was never added (no signature recorded) the call +// is a no-op. For each band it recomputes the bucket key from the +// stored signature, drops the ID from that bucket's member slice, and +// removes the bucket entry entirely once it is empty so the band map +// does not accumulate dead keys. The signature is then forgotten. 
+// +// Add(id, sig) followed by Remove(id) returns the index to a state in +// which id sits in no band bucket and contributes no candidate — the +// invariant the incremental maintenance path relies on when a body is +// re-shingled or deleted. +func (ix *Index) Remove(id string) { + sig, ok := ix.sigs[id] + if !ok { + return + } + for b := range Bands { + key := bandKey(b, sig) + ids := ix.bands[b][key] + // Drop the first occurrence of id; Add banks each ID once per + // band, so a single removal clears the membership. + for i, v := range ids { + if v == id { + ids = append(ids[:i], ids[i+1:]...) + break + } + } + if len(ids) == 0 { + delete(ix.bands[b], key) + } else { + ix.bands[b][key] = ids + } + } + delete(ix.sigs, id) +} + +// QueryCandidates returns the candidate set for a single item: every +// other ID that shares at least one band bucket with id, in canonical +// sorted order. It is the per-item analogue of EmitCandidatesTo — the +// pairs (id, c) for every c in the result are exactly the candidate +// pairs EmitCandidatesTo would emit that touch id. +// +// id itself is excluded, results are deduplicated across bands, and +// buckets larger than maxBucketSize are skipped using the identical cap +// EmitCandidatesTo applies — so a candidate dropped by the batch fan-out +// cap is also dropped here, keeping the maintained query and the batch +// walk in lock-step. An id with no recorded signature yields nil. 
+func (ix *Index) QueryCandidates(id string) []string { + sig, ok := ix.sigs[id] + if !ok { + return nil + } + seen := make(map[string]struct{}) + for b := range Bands { + key := bandKey(b, sig) + ids := ix.bands[b][key] + if len(ids) < 2 { + continue + } + if len(ids) > maxBucketSize { + continue + } + for _, v := range ids { + if v == id { + continue + } + seen[v] = struct{}{} + } + } + if len(seen) == 0 { + return nil + } + out := make([]string, 0, len(seen)) + for v := range seen { + out = append(out, v) + } + sort.Strings(out) + return out +} + // bandKey hashes the Rows MinHash slots of band b into a bucket key. // The band index is folded into the hash so identical row values in // different bands cannot collide into the same logical bucket. diff --git a/internal/clones/maintained.go b/internal/clones/maintained.go new file mode 100644 index 00000000..11565927 --- /dev/null +++ b/internal/clones/maintained.go @@ -0,0 +1,135 @@ +package clones + +import "sort" + +// StratifiedIndex is the incrementally maintained counterpart of +// DetectPairsStratifiedWithStats. Where the batch path re-partitions +// every item into length classes and rebuilds a fresh per-class LSH +// index on each run, StratifiedIndex keeps one live Index per length +// class (one per entry in lengthBucketBounds) so a single edited body +// can be re-banked in O(its classes) — typically one or two Add/Remove +// calls — instead of rebuilding over the whole corpus. +// +// Stratification mirrors the batch path exactly: an item is banked into +// every class lengthClassesOf(TokenCount) returns, so an item in the +// overlap region of two adjacent classes lives in both. tokens records +// each id's TokenCount so Remove can recompute the same class set the +// item was added under without the caller re-supplying it. +// +// StratifiedIndex is NOT goroutine-safe by design: the maps and the +// per-class Index state are mutated without locking. 
The intended caller +// (the indexer's incremental clone-edge maintainer) serialises Add / +// Remove / QueryPairs under its own lock, the same way the batch Index +// is driven from a single goroutine. +type StratifiedIndex struct { + // classes[i] is the live LSH index for length class i; len matches + // lengthBucketBounds so a class index aligns with lengthClassesOf. + classes []*Index + // tokens maps an added id to the TokenCount it was banked under, so + // Remove can recompute lengthClassesOf(tokens[id]) — the exact class + // set the id occupies — and drop it from each of those class indexes. + tokens map[string]int +} + +// NewStratifiedIndex returns an empty StratifiedIndex with one live +// per-class Index for every entry in lengthBucketBounds. +func NewStratifiedIndex() *StratifiedIndex { + classes := make([]*Index, len(lengthBucketBounds)) + for i := range classes { + classes[i] = NewIndex() + } + return &StratifiedIndex{ + classes: classes, + tokens: make(map[string]int), + } +} + +// Add banks an item into every length class its TokenCount falls in +// (lengthClassesOf), recording the TokenCount so a later Remove can +// recover the same class set. Adding an id that is already present +// follows Index.Add's contract — callers should add each id once, and +// re-banking an edited body should Remove it first. +func (s *StratifiedIndex) Add(it Item) { + for _, c := range lengthClassesOf(it.TokenCount) { + s.classes[c].Add(it.ID, it.Sig) + } + s.tokens[it.ID] = it.TokenCount +} + +// Remove undoes a prior Add: it drops the id from every length class it +// was banked under — recomputed from the recorded TokenCount via +// lengthClassesOf — and forgets the recorded count. An id that was +// never added is a no-op. 
+func (s *StratifiedIndex) Remove(id string) { + tc, ok := s.tokens[id] + if !ok { + return + } + for _, c := range lengthClassesOf(tc) { + s.classes[c].Remove(id) + } + delete(s.tokens, id) +} + +// QueryPairs returns every clone pair touching it whose estimated +// Jaccard similarity is at or above threshold (DefaultThreshold when +// threshold ≤ 0), in canonical (A < B) form. It is the per-item query +// that the maintained index exposes in place of the batch +// DetectPairsStratifiedWithStats walk: unioning QueryPairs over every +// item reproduces the batch pair set exactly. +// +// For each class lengthClassesOf(it.TokenCount) places it in, the class +// index's QueryCandidates(it.ID) yields the candidate IDs sharing a band +// bucket; each candidate's stored signature is scored against it.Sig and +// kept when it clears threshold. A candidate that surfaces from more +// than one class (the overlap region) is deduplicated by canonical pair +// key, matching the batch merge. +// +// it must already have been Added: candidates come from each class +// index's QueryCandidates(it.ID), which returns nil for an ID with no +// recorded signature, so an absent item yields no pairs. Scoring uses +// it.Sig from the Item. For the union over all items to equal the batch +// set every item must have been Added first. 
+func (s *StratifiedIndex) QueryPairs(it Item, threshold float64) []Pair { + if threshold <= 0 { + threshold = DefaultThreshold + } + seen := make(map[[2]string]struct{}) + var out []Pair + for _, c := range lengthClassesOf(it.TokenCount) { + idx := s.classes[c] + for _, cand := range idx.QueryCandidates(it.ID) { + if cand == it.ID { + continue + } + candSig, ok := idx.sigs[cand] + if !ok { + continue + } + sim := EstimateJaccard(it.Sig, candSig) + if sim < threshold { + continue + } + a, b := it.ID, cand + if a > b { + a, b = b, a + } + key := [2]string{a, b} + if _, dup := seen[key]; dup { + continue + } + seen[key] = struct{}{} + out = append(out, Pair{A: a, B: b, Similarity: sim}) + } + } + sort.Slice(out, func(i, j int) bool { + if out[i].Similarity != out[j].Similarity { + return out[i].Similarity > out[j].Similarity + } + if out[i].A != out[j].A { + return out[i].A < out[j].A + } + return out[i].B < out[j].B + }) + return out +} diff --git a/internal/clones/maintained_test.go b/internal/clones/maintained_test.go new file mode 100644 index 00000000..d03e7758 --- /dev/null +++ b/internal/clones/maintained_test.go @@ -0,0 +1,267 @@ +package clones + +import ( + "reflect" + "sort" + "testing" +) + +// shinglesFrom builds a deterministic shingle-hash set from a slice of +// integer shingle ids. Using small distinct integers as the raw shingle +// hashes lets a test author dial in an exact Jaccard overlap between two +// items: |A ∩ B| / |A ∪ B| over the integer sets is what MinHash +// estimates, so near-duplicates and distinct items are constructed by +// choosing how many shingle ids two sets share. +func shinglesFrom(ids ...uint64) []uint64 { + out := make([]uint64, len(ids)) + copy(out, ids) + sort.Slice(out, func(i, j int) bool { return out[i] < out[j] }) + return out +} + +// sigFromShingles is a test helper: SignatureFromShingles with no +// minimum-shingle floor, failing the test if the set is degenerate. 
+func sigFromShingles(t *testing.T, shingles []uint64) Signature { + t.Helper() + sig, ok := SignatureFromShingles(shingles, 0) + if !ok { + t.Fatalf("SignatureFromShingles failed for %v", shingles) + } + return sig +} + +// makeShingleRange returns the shingle ids base, base+1, …, base+n-1 — +// a contiguous block, so two blocks overlap by a controllable amount. +func makeShingleRange(base, n uint64) []uint64 { + out := make([]uint64, 0, n) + for i := uint64(0); i < n; i++ { + out = append(out, base+i) + } + return out +} + +// fixtureItems builds the deterministic correctness fixture: +// - a / b: a high-overlap near-duplicate pair in the small length class +// - c: distinct from a/b, same small length class (a non-clone neighbour) +// - d / e: a second high-overlap near-duplicate pair, sized so they sit +// in a different (larger) length class than a/b — exercising >1 class +// - f: distinct, in the large class (a non-clone neighbour for d/e) +// +// Overlaps are tuned so EstimateJaccard clears DefaultThreshold for the +// (a,b) and (d,e) pairs and stays well below it for everything else. +func fixtureItems(t *testing.T) []Item { + t.Helper() + + // Small length class (TokenCount 60 → class 0 only, [0,80)). + // a and b share 116 of 120 shingles → exact Jaccard ≈ 0.967. + aSh := makeShingleRange(1000, 120) + bSh := makeShingleRange(1004, 120) // shifted by 4 → 116 shared + // c shares almost nothing with a/b. + cSh := makeShingleRange(9000, 120) + + // Large length class (TokenCount 250 → class 3 only, [200,640)). + // d and e share 116 of 120 shingles → exact Jaccard ≈ 0.967. + dSh := makeShingleRange(2000, 120) + eSh := makeShingleRange(2004, 120) + // f shares almost nothing with d/e. 
+ fSh := makeShingleRange(7000, 120) + + return []Item{ + {ID: "a", Sig: sigFromShingles(t, shinglesFrom(aSh...)), TokenCount: 60}, + {ID: "b", Sig: sigFromShingles(t, shinglesFrom(bSh...)), TokenCount: 60}, + {ID: "c", Sig: sigFromShingles(t, shinglesFrom(cSh...)), TokenCount: 60}, + {ID: "d", Sig: sigFromShingles(t, shinglesFrom(dSh...)), TokenCount: 250}, + {ID: "e", Sig: sigFromShingles(t, shinglesFrom(eSh...)), TokenCount: 250}, + {ID: "f", Sig: sigFromShingles(t, shinglesFrom(fSh...)), TokenCount: 250}, + } +} + +// canonicalPairSet reduces a slice of Pairs to the set of canonical +// (A < B) ID pairs. +func canonicalPairSet(pairs []Pair) map[[2]string]struct{} { + set := make(map[[2]string]struct{}, len(pairs)) + for _, p := range pairs { + a, b := p.A, p.B + if a > b { + a, b = b, a + } + set[[2]string{a, b}] = struct{}{} + } + return set +} + +// populatedLengthClasses counts how many length classes hold ≥1 item +// from the fixture — used to assert the equivalence test is non-vacuous +// (more than one class actually exercised). +func populatedLengthClasses(items []Item) int { + hit := make(map[int]struct{}) + for _, it := range items { + for _, c := range lengthClassesOf(it.TokenCount) { + hit[c] = struct{}{} + } + } + return len(hit) +} + +// TestStratifiedIndexEquivalence proves the incrementally maintained +// per-item query reproduces the batch detection exactly: the union of +// QueryPairs over every item equals the canonical pair set the batch +// DetectPairsStratifiedWithStats produces over the same corpus. +func TestStratifiedIndexEquivalence(t *testing.T) { + items := fixtureItems(t) + const threshold = DefaultThreshold + + batchPairs, _, _ := DetectPairsStratifiedWithStats(items, threshold) + batchSet := canonicalPairSet(batchPairs) + + // Non-vacuous fixture: the batch must find at least one pair and the + // items must span more than one length class, else the equivalence + // is trivially satisfied by an empty set in a single bucket. 
+ if len(batchSet) < 1 { + t.Fatalf("fixture vacuous: batch found no pairs") + } + if n := populatedLengthClasses(items); n <= 1 { + t.Fatalf("fixture vacuous: only %d length class populated, want >1", n) + } + + s := NewStratifiedIndex() + for _, it := range items { + s.Add(it) + } + + maintained := make(map[[2]string]struct{}) + for _, it := range items { + for _, p := range s.QueryPairs(it, threshold) { + a, b := p.A, p.B + if a > b { + a, b = b, a + } + maintained[[2]string{a, b}] = struct{}{} + } + } + + if !reflect.DeepEqual(batchSet, maintained) { + t.Fatalf("maintained query set != batch set\n batch=%v\n maintained=%v", batchSet, maintained) + } +} + +// TestStratifiedIndexRemoveAndReadd proves Remove pulls a +// clone-participating id out of every candidate set, and that re-Adding +// it restores the original equivalence set. +func TestStratifiedIndexRemoveAndReadd(t *testing.T) { + items := fixtureItems(t) + const threshold = DefaultThreshold + + batchPairs, _, _ := DetectPairsStratifiedWithStats(items, threshold) + batchSet := canonicalPairSet(batchPairs) + if len(batchSet) < 1 { + t.Fatalf("fixture vacuous: batch found no pairs") + } + + s := NewStratifiedIndex() + for _, it := range items { + s.Add(it) + } + + // "a" participates in the (a,b) clone pair. + const removed = "a" + var removedItem Item + for _, it := range items { + if it.ID == removed { + removedItem = it + } + } + + s.Remove(removed) + + // After removal no QueryPairs over the remaining items may yield a + // pair touching the removed id. + for _, it := range items { + if it.ID == removed { + continue + } + for _, p := range s.QueryPairs(it, threshold) { + if p.A == removed || p.B == removed { + t.Fatalf("pair %+v still references removed id %q", p, removed) + } + } + } + // The removed item must also produce no surviving pairs of its own, + // since its former partner can no longer be a live candidate for it. 
+ if pairs := s.QueryPairs(removedItem, threshold); len(pairs) != 0 { + t.Fatalf("removed item still produced pairs: %+v", pairs) + } + + // Re-Add restores the full equivalence set. + s.Add(removedItem) + restored := make(map[[2]string]struct{}) + for _, it := range items { + for _, p := range s.QueryPairs(it, threshold) { + a, b := p.A, p.B + if a > b { + a, b = b, a + } + restored[[2]string{a, b}] = struct{}{} + } + } + if !reflect.DeepEqual(batchSet, restored) { + t.Fatalf("re-add did not restore equivalence set\n batch=%v\n restored=%v", batchSet, restored) + } +} + +// TestCMSDecrementRoundTrip proves Decrement floors at 0 and that Count +// reflects the live multiset remainder after a subset is decremented: +// it stays an upper bound on the surviving true count and returns to the +// 0 floor for keys decremented down to nothing. +func TestCMSDecrementRoundTrip(t *testing.T) { + cms := NewCMS(4096, 4) + + // A multiset of keys with known multiplicities. + multiset := map[uint64]int{ + 11: 3, + 22: 5, + 33: 1, + 44: 2, + } + for key, n := range multiset { + for i := 0; i < n; i++ { + cms.Add(key) + } + } + + // Decrement a subset: drop 33 entirely (1→0), drop two of 22 (5→3). + decrements := map[uint64]int{ + 33: 1, + 22: 2, + } + remaining := make(map[uint64]int, len(multiset)) + for key, n := range multiset { + remaining[key] = n - decrements[key] + } + for key, n := range decrements { + for i := 0; i < n; i++ { + cms.Decrement(key) + } + } + + // Count is an upper bound on the live true count, and exactly the + // floor (0) for the fully-removed key. 
+ for key, want := range remaining { + got := cms.Count(key) + if got < uint32(want) { + t.Fatalf("Count(%d)=%d below true remaining count %d (CMS must stay an upper bound)", key, got, want) + } + if want == 0 && got != 0 { + t.Fatalf("Count(%d)=%d, want 0 after full removal", key, got) + } + } + + // Decrementing a never-added key is a no-op and never drives any + // counter negative — Count stays at the 0 floor. + const neverAdded = uint64(999) + cms.Decrement(neverAdded) + if got := cms.Count(neverAdded); got != 0 { + t.Fatalf("Count(neverAdded)=%d after no-op Decrement, want 0", got) + } +} diff --git a/internal/cochange/cochange.go b/internal/cochange/cochange.go index 0fb53dc4..2c8b4e2a 100644 --- a/internal/cochange/cochange.go +++ b/internal/cochange/cochange.go @@ -196,12 +196,12 @@ func orderedPair(a, b string) [2]string { // // Best-effort: returns (0, nil) when root is not a git repository. // Idempotent — graph.AddEdge dedupes, so repeated runs converge. -func EnrichGraph(g *graph.Graph, root, repoPrefix string) (int, error) { +func EnrichGraph(g graph.Store, root, repoPrefix string) (int, error) { return EnrichGraphWith(g, root, repoPrefix, Options{}) } // EnrichGraphWith is EnrichGraph with explicit scan tuning. -func EnrichGraphWith(g *graph.Graph, root, repoPrefix string, opts Options) (int, error) { +func EnrichGraphWith(g graph.Store, root, repoPrefix string, opts Options) (int, error) { if g == nil || root == "" { return 0, nil } @@ -217,7 +217,7 @@ func EnrichGraphWith(g *graph.Graph, root, repoPrefix string, opts Options) (int // carrying that RepoPrefix are matched, against the prefix-stripped // node path (the pairs hold git-relative paths). Pass "" for a // single-repo graph. Idempotent — graph.AddEdge dedupes. 
-func AddEdges(g *graph.Graph, pairs []Pair, repoPrefix string) int { +func AddEdges(g graph.Store, pairs []Pair, repoPrefix string) int { if g == nil || len(pairs) == 0 { return 0 } diff --git a/internal/codeowners/parser.go b/internal/codeowners/parser.go index 5d449071..a014aa86 100644 --- a/internal/codeowners/parser.go +++ b/internal/codeowners/parser.go @@ -28,14 +28,18 @@ type Rule struct { matcher *gitignore.GitIgnore } -// matchPattern compiles the rule's pattern as a single-line gitignore -// matcher. We compile lazily so the rule list is cheap to construct -// for repos that never call MatchFile. +// matchPattern returns the rule's gitignore matcher. Parse precompiles +// it, so for any Parse-built Rule the field is non-nil and MatchFile's +// concurrent hot path only reads it — no data race on a shared rule list +// (applyCoverageDomains matches files across goroutines against one +// list). For a Rule hand-constructed outside Parse the field is nil; +// compile a throwaway matcher rather than caching into r.matcher, so +// concurrent callers still can't race on the field. func (r *Rule) matchPattern() *gitignore.GitIgnore { - if r.matcher == nil { - r.matcher = gitignore.CompileIgnoreLines(r.Pattern) + if r.matcher != nil { + return r.matcher } - return r.matcher + return gitignore.CompileIgnoreLines(r.Pattern) } // Parse reads a CODEOWNERS file's bytes and returns the rule list in @@ -67,6 +71,9 @@ func Parse(source []byte) []Rule { continue } rule := Rule{Pattern: fields[0]} + // Precompile the matcher in this single-goroutine parse so the + // concurrent MatchFile hot path only reads rule.matcher. + rule.matcher = gitignore.CompileIgnoreLines(rule.Pattern) if len(fields) > 1 { rule.Owners = append(rule.Owners, fields[1:]...) 
} diff --git a/internal/codeowners/parser_race_test.go b/internal/codeowners/parser_race_test.go new file mode 100644 index 00000000..6ec5c6ea --- /dev/null +++ b/internal/codeowners/parser_race_test.go @@ -0,0 +1,34 @@ +package codeowners_test + +import ( + "sync" + "testing" + + "github.com/zzet/gortex/internal/codeowners" +) + +// TestMatchFile_ConcurrentNoRace exercises MatchFile from many goroutines over +// a single shared rule list — the way the indexer's per-file coverage +// goroutines (applyCoverageDomains) call it. Pre-fix, matchPattern lazily +// compiled and cached r.matcher without synchronisation, so concurrent first +// calls raced on the shared *Rule (and on the half-published GitIgnore). Run +// under -race; it must be clean. +func TestMatchFile_ConcurrentNoRace(t *testing.T) { + rules := codeowners.Parse([]byte( + "*.go @gophers\n" + + "/docs/ @writers\n" + + "src/**/*.ts @frontend @core\n" + + "*.md @docs\n", + )) + paths := []string{"main.go", "docs/readme.md", "src/a/b/c.ts", "x/y/z.py", "pkg/foo.go", "README.md"} + + var wg sync.WaitGroup + for range 64 { + wg.Go(func() { + for i := range 200 { + _ = codeowners.MatchFile(paths[i%len(paths)], rules) + } + }) + } + wg.Wait() +} diff --git a/internal/config/global.go b/internal/config/global.go index d05f1574..5fce7f6f 100644 --- a/internal/config/global.go +++ b/internal/config/global.go @@ -48,7 +48,7 @@ type ProjectConfig struct { Repos []RepoEntry `mapstructure:"repos" yaml:"repos"` } -// GlobalConfig is the user-level config at ~/.config/gortex/config.yaml. +// GlobalConfig is the user-level config at ~/.gortex/config.yaml. 
type GlobalConfig struct { Projects map[string]ProjectConfig `mapstructure:"projects" yaml:"projects,omitempty"` Repos []RepoEntry `mapstructure:"repos" yaml:"repos,omitempty"` @@ -106,7 +106,7 @@ func expandHome(p string) string { return p } -// DefaultGlobalConfigPath returns the default path: ~/.config/gortex/config.yaml, +// DefaultGlobalConfigPath returns the default path: ~/.gortex/config.yaml, // or the $XDG_CONFIG_HOME equivalent when that variable is set. // // Resolved fresh on every call so HOME / XDG_CONFIG_HOME changes (notably @@ -118,7 +118,7 @@ func DefaultGlobalConfigPath() string { return filepath.Join(platform.ConfigDir(), "config.yaml") } -// LoadGlobal reads the global config from ~/.config/gortex/config.yaml. +// LoadGlobal reads the global config from ~/.gortex/config.yaml. // If the file does not exist, it returns an empty GlobalConfig (no error). // If configPath is empty, the default path is used. func LoadGlobal(configPath ...string) (*GlobalConfig, error) { diff --git a/internal/config/global_test.go b/internal/config/global_test.go index ef3b3fd3..b5434192 100644 --- a/internal/config/global_test.go +++ b/internal/config/global_test.go @@ -17,7 +17,7 @@ import ( // where DefaultGlobalConfigPath cached its result with sync.Once. Whichever // caller fired first pinned the path for the rest of the process — any // test that later set HOME via t.Setenv silently kept writing into the -// developer's real ~/.config/gortex/config.yaml. The function must +// developer's real ~/.gortex/config.yaml. The function must // re-resolve HOME on every call. 
func TestDefaultGlobalConfigPath_HonorsHomeChange(t *testing.T) { // Pin XDG_CONFIG_HOME empty: when it is set in the ambient @@ -28,7 +28,7 @@ func TestDefaultGlobalConfigPath_HonorsHomeChange(t *testing.T) { homeA := t.TempDir() t.Setenv("HOME", homeA) gotA := DefaultGlobalConfigPath() - wantA := filepath.Join(homeA, ".config", "gortex", "config.yaml") + wantA := filepath.Join(homeA, ".gortex", "config.yaml") if gotA != wantA { t.Fatalf("first call: got %s, want %s", gotA, wantA) } @@ -36,7 +36,7 @@ func TestDefaultGlobalConfigPath_HonorsHomeChange(t *testing.T) { homeB := t.TempDir() t.Setenv("HOME", homeB) gotB := DefaultGlobalConfigPath() - wantB := filepath.Join(homeB, ".config", "gortex", "config.yaml") + wantB := filepath.Join(homeB, ".gortex", "config.yaml") if gotB != wantB { t.Fatalf("after HOME change: got %s, want %s — path appears cached", gotB, wantB) } @@ -44,16 +44,15 @@ func TestDefaultGlobalConfigPath_HonorsHomeChange(t *testing.T) { // TestDefaultGlobalConfigPath_HonorsXDGConfigHome verifies the global // config path is routed through the XDG resolver: an absolute -// $XDG_CONFIG_HOME relocates it, while an unset variable keeps the -// historical $HOME/.config/gortex location so existing installs are -// not orphaned. +// $XDG_CONFIG_HOME relocates it, while an unset variable uses the +// unified $HOME/.gortex location. func TestDefaultGlobalConfigPath_HonorsXDGConfigHome(t *testing.T) { home := t.TempDir() t.Setenv("HOME", home) - // Unset: historical default. + // Unset: unified $HOME/.gortex default.
t.Setenv("XDG_CONFIG_HOME", "") - wantUnset := filepath.Join(home, ".config", "gortex", "config.yaml") + wantUnset := filepath.Join(home, ".gortex", "config.yaml") if got := DefaultGlobalConfigPath(); got != wantUnset { t.Fatalf("XDG_CONFIG_HOME unset: got %s, want %s", got, wantUnset) } diff --git a/internal/config/manager.go b/internal/config/manager.go index 07714419..0896fc37 100644 --- a/internal/config/manager.go +++ b/internal/config/manager.go @@ -191,7 +191,7 @@ func (cm *ConfigManager) GetRepoConfig(repoPrefix string) *Config { // 1. Builtin baseline (excludes.Builtin) // 2. Repo's own `.gitignore` (read from disk; opt out with // `respect_gitignore: false` in `.gortex.yaml`) -// 3. Global Exclude from ~/.config/gortex/config.yaml +// 3. Global Exclude from ~/.gortex/config.yaml // 4. Matching RepoEntry.Exclude (first match in Repos, then Projects) // 5. Workspace .gortex.yaml top-level Exclude // 6. Legacy workspace Index.Exclude / Watch.Exclude (deprecated) diff --git a/internal/contracts/bind.go b/internal/contracts/bind.go index bfa2e483..d6e43cd2 100644 --- a/internal/contracts/bind.go +++ b/internal/contracts/bind.go @@ -31,7 +31,7 @@ import ( // 4. Tiebreak: prefer candidates in files that mention a registration // call like `pb.Register{Service}Server(` or `r.{HTTPVerb}(`. // 5. Uniquely bind or skip (never guess among multiple). -func BindProviderSymbols(reg *Registry, g *graph.Graph) int { +func BindProviderSymbols(reg *Registry, g graph.Store) int { if reg == nil || g == nil { return 0 } @@ -83,7 +83,7 @@ func BindProviderSymbols(reg *Registry, g *graph.Graph) int { // `Register{Service}Server(` call. // 4. Same method name, any receiver — only if there's exactly one // candidate in the repo. 
-func bindGRPCProvider(c Contract, g *graph.Graph) string { +func bindGRPCProvider(c Contract, g graph.Store) string { method, _ := c.Meta["method"].(string) service, _ := c.Meta["service"].(string) if method == "" || service == "" { @@ -123,7 +123,7 @@ func bindGRPCProvider(c Contract, g *graph.Graph) string { // widely, this is lower-confidence than gRPC binding; a stricter // implementation would also check the Gin/Echo route registration // file, but v1 just name-matches. Returns "" if no unambiguous bind. -func bindOpenAPIProvider(c Contract, g *graph.Graph) string { +func bindOpenAPIProvider(c Contract, g graph.Store) string { op, _ := c.Meta["operationId"].(string) if op == "" { // Fall back to the last path segment; OpenAPI specs diff --git a/internal/contracts/bind_test.go b/internal/contracts/bind_test.go index 84b62fb7..5435b41c 100644 --- a/internal/contracts/bind_test.go +++ b/internal/contracts/bind_test.go @@ -11,7 +11,7 @@ import ( // bindGRPCProvider. func newBindTestGraph(repoPrefix string, methods []struct { id, name, recv string -}) *graph.Graph { +}) graph.Store { g := graph.New() for _, m := range methods { n := &graph.Node{ diff --git a/internal/contracts/load_from_graph.go b/internal/contracts/load_from_graph.go new file mode 100644 index 00000000..e5ee7d20 --- /dev/null +++ b/internal/contracts/load_from_graph.go @@ -0,0 +1,82 @@ +package contracts + +import ( + "github.com/zzet/gortex/internal/graph" +) + +// LoadRegistryFromGraph rebuilds a Registry by scanning every +// KindContract node under repoPrefix and reconstructing the Contract +// struct from Node.Meta. The reverse of the AddNode stamping the +// indexer's commitContracts (and contracts/wrapper.go's +// commitInlinedContractToGraph) do — both write the full record onto +// Meta so a daemon restart can rehydrate without replaying the gob +// snapshot. 
+// +// Empty repoPrefix loads every contract — useful for ad-hoc probes, +// not a path the daemon normally takes (the warmup rehydrates the +// per-repo registries one prefix at a time so a stale repo's +// contracts don't bleed into a fresh sibling). Returns nil when no +// contracts are recorded for the prefix. +func LoadRegistryFromGraph(g graph.Store, repoPrefix string) *Registry { + if g == nil { + return nil + } + all := g.GetRepoNodes(repoPrefix) + if len(all) == 0 { + return nil + } + reg := NewRegistry() + for _, n := range all { + if n == nil || n.Kind != graph.KindContract { + continue + } + c := contractFromNode(n) + if c.ID == "" { + continue + } + reg.Add(c) + } + if len(reg.All()) == 0 { + return nil + } + return reg +} + +// contractFromNode decodes a Contract from a KindContract graph node's +// Meta payload. Inverse of the AddNode stamping the indexer does. +// Missing fields are left at their zero value — preserves forward +// compatibility if the indexer adds new Meta keys before this loader +// learns about them. 
+func contractFromNode(n *graph.Node) Contract { + c := Contract{ + ID: n.ID, + FilePath: n.FilePath, + RepoPrefix: n.RepoPrefix, + } + if n.Meta == nil { + return c + } + if v, ok := n.Meta["type"].(string); ok { + c.Type = ContractType(v) + } + if v, ok := n.Meta["role"].(string); ok { + c.Role = Role(v) + } + if v, ok := n.Meta["symbol_id"].(string); ok { + c.SymbolID = v + } + if v, ok := n.Meta["line"].(int); ok { + c.Line = v + } else if v, ok := n.Meta["line"].(int64); ok { + c.Line = int(v) + } + if v, ok := n.Meta["confidence"].(float64); ok { + c.Confidence = v + } + c.WorkspaceID = n.WorkspaceID + c.ProjectID = n.ProjectID + if v, ok := n.Meta["contract_meta"].(map[string]any); ok && len(v) > 0 { + c.Meta = v + } + return c +} diff --git a/internal/contracts/wrapper.go b/internal/contracts/wrapper.go index af38080c..97068f10 100644 --- a/internal/contracts/wrapper.go +++ b/internal/contracts/wrapper.go @@ -38,7 +38,7 @@ type SourceReader func(n *graph.Node) ([]byte, bool) // their per-repo registries — the transient merged registry MultiIndexer // hands in is rebuilt on every ReconcileContractEdges call, so mutations // to it don't survive between invocations). -func InlineWrappers(reg *Registry, g *graph.Graph, read SourceReader) []Contract { +func InlineWrappers(reg *Registry, g graph.Store, read SourceReader) []Contract { if reg == nil || g == nil || read == nil { return nil } @@ -145,7 +145,7 @@ type wrapperInfo struct { // matching a regex pattern: lines + fileNodes + lang + tree feed // EnrichHTTPContractWithTree, which dispatches to the per-language // schema_enrich_*.go detectors and (for Go) the AST overlay. 
-func enrichInlinedWrapperContract(c *Contract, g *graph.Graph, caller *graph.Node, src []byte) { +func enrichInlinedWrapperContract(c *Contract, g graph.Store, caller *graph.Node, src []byte) { if c == nil || caller == nil || len(src) == 0 { return } @@ -195,19 +195,28 @@ func isWrapperPath(path string) bool { // contracts list output and in the matcher's graph view. Idempotency // matters because ReconcileContractEdges runs on every repo change — // without it each track/index would duplicate edges. -func commitInlinedContractToGraph(g *graph.Graph, c Contract) { +func commitInlinedContractToGraph(g graph.Store, c Contract) { if g == nil { return } if g.GetNode(c.ID) == nil { g.AddNode(&graph.Node{ - ID: c.ID, - Kind: graph.KindContract, - Name: c.ID, - FilePath: c.FilePath, - Language: "contract", - RepoPrefix: c.RepoPrefix, - Meta: map[string]any{"type": string(c.Type), "role": string(c.Role)}, + ID: c.ID, + Kind: graph.KindContract, + Name: c.ID, + FilePath: c.FilePath, + Language: "contract", + RepoPrefix: c.RepoPrefix, + WorkspaceID: c.EffectiveWorkspace(), + ProjectID: c.EffectiveProject(), + Meta: map[string]any{ + "type": string(c.Type), + "role": string(c.Role), + "symbol_id": c.SymbolID, + "line": c.Line, + "confidence": c.Confidence, + "contract_meta": c.Meta, + }, }) } if c.SymbolID == "" { diff --git a/internal/coverage/coverage.go b/internal/coverage/coverage.go index 82c4f8f1..37b59d50 100644 --- a/internal/coverage/coverage.go +++ b/internal/coverage/coverage.go @@ -168,7 +168,7 @@ func (s CoverageStats) Percent() float64 { // file paths are repo-relative (`pkg/file.go`). Pass "" to skip // the prefix-strip, useful when the profile was generated against // raw paths. 
-func EnrichGraph(g *graph.Graph, segments []Segment, modulePath string) int { +func EnrichGraph(g graph.Store, segments []Segment, modulePath string) int { if g == nil || len(segments) == 0 { return 0 } @@ -182,6 +182,19 @@ func EnrichGraph(g *graph.Graph, segments []Segment, modulePath string) int { } enriched := 0 + // Collect every node whose Meta we stamp so we can round-trip it + // back through the store at the end. On the in-memory backend the + // in-place mutation already persists (n is the canonical node); on + // disk backends (SQLite) n is a per-call GetNode/AllNodes + // reconstruction, so without the write-back the coverage_pct stamp + // is silently discarded the moment AllNodes' slice goes out of + // scope — leaving analyze:coverage_gaps / health_score's coverage + // axis empty on the disk backend. Mirrors releases.EnrichGraph and + // the reach index, which already round-trip Meta through + // AddNode/AddBatch. + var stamped []*graph.Node + covWriter, useCovSidecar := g.(graph.CoverageEnrichmentWriter) + var covRows []graph.CoverageEnrichment for _, n := range g.AllNodes() { if !shouldEnrichCoverage(n.Kind) { continue @@ -206,6 +219,13 @@ func EnrichGraph(g *graph.Graph, segments []Segment, modulePath string) int { "num_stmt": stats.NumStmt, "hit": stats.Hit, } + stamped = append(stamped, n) + if useCovSidecar { + covRows = append(covRows, graph.CoverageEnrichment{ + NodeID: n.ID, RepoPrefix: n.RepoPrefix, + CoveragePct: pct, NumStmt: stats.NumStmt, Hit: stats.Hit, + }) + } enriched++ // EdgeCoveredBy: invert each EdgeTests pointing at this @@ -240,6 +260,38 @@ func EnrichGraph(g *graph.Graph, segments []Segment, modulePath string) int { }) } } + // Write-back happens once, after the loop, via one of two paths. + // Stores without a typed coverage sidecar get the stamped nodes + // re-inserted in a single AddBatch (a no-op-ish re-insert on the + // in-memory backend, the durable write on disk backends). + // Persist coverage.
Prefer the typed sidecar (change A); on success + // strip the Meta stamps so the node blob stays lean and skip the + // AddBatch. On sidecar write failure, fall back to persisting Meta via + // AddBatch so coverage is never lost (the readers' Meta fallback then + // serves it). + if useCovSidecar && len(covRows) > 0 { + persisted := true + byPrefix := map[string][]graph.CoverageEnrichment{} + for _, r := range covRows { + byPrefix[r.RepoPrefix] = append(byPrefix[r.RepoPrefix], r) + } + for prefix, rr := range byPrefix { + if err := covWriter.BulkSetCoverage(prefix, rr); err != nil { + persisted = false + break + } + } + if persisted { + for _, n := range stamped { + delete(n.Meta, "coverage_pct") + delete(n.Meta, "coverage") + } + } else if len(stamped) > 0 { + g.AddBatch(stamped, nil) + } + } else if len(stamped) > 0 { + g.AddBatch(stamped, nil) + } return enriched } diff --git a/internal/coverage/coverage_test.go b/internal/coverage/coverage_test.go index f784d796..a61aa396 100644 --- a/internal/coverage/coverage_test.go +++ b/internal/coverage/coverage_test.go @@ -48,7 +48,7 @@ github.com/x/y/pkg/b.go:1.13,3.2 1 1 func TestProjectStats(t *testing.T) { segments := []Segment{ - {StartLine: 5, EndLine: 8, NumStmt: 2, Count: 1}, // covered + {StartLine: 5, EndLine: 8, NumStmt: 2, Count: 1}, // covered {StartLine: 10, EndLine: 15, NumStmt: 4, Count: 0}, // uncovered {StartLine: 20, EndLine: 22, NumStmt: 1, Count: 1}, // outside range } @@ -101,16 +101,20 @@ func TestEnrichGraph_StampsMetaCoveragePct(t *testing.T) { t.Errorf("expected 2 enriched, got %d", enriched) } - foo := g.GetNode("pkg/a.go::Foo") - pct, _ := foo.Meta["coverage_pct"].(float64) - if pct < 33.32 || pct > 33.34 { + // Coverage now persists in the typed sidecar (change A), not Node.Meta. 
+ byID := map[string]graph.CoverageEnrichment{} + for _, e := range g.CoverageRows("") { + byID[e.NodeID] = e + } + if pct := byID["pkg/a.go::Foo"].CoveragePct; pct < 33.32 || pct > 33.34 { t.Errorf("Foo pct = %v, want ~33.33", pct) } - bar := g.GetNode("pkg/a.go::Bar") - pct, _ = bar.Meta["coverage_pct"].(float64) - if pct != 100 { + if pct := byID["pkg/a.go::Bar"].CoveragePct; pct != 100 { t.Errorf("Bar pct = %v, want 100", pct) } + if _, present := g.GetNode("pkg/a.go::Foo").Meta["coverage_pct"]; present { + t.Errorf("coverage_pct must not remain in Node.Meta after sidecar migration") + } } func TestEnrichGraph_EmitsCoveredByForExistingTestEdges(t *testing.T) { diff --git a/internal/daemon/paths.go b/internal/daemon/paths.go index c32c6557..0ae1a256 100644 --- a/internal/daemon/paths.go +++ b/internal/daemon/paths.go @@ -15,8 +15,8 @@ import ( // unset the location stays at the historical default so an existing // daemon state directory is not orphaned: // -// - Windows: %LocalAppData%\gortex (via os.UserCacheDir). -// - macOS / Linux: $HOME/.cache/gortex. +// - Windows: %USERPROFILE%\.gortex\cache (via os.UserCacheDir). +// - macOS / Linux: $HOME/.gortex/cache. // // The boolean is false when the home / cache directory can't be // resolved at all, in which case callers fall back to the temp dir. @@ -45,8 +45,8 @@ func stateDir() (string, bool) { // 1. $GORTEX_DAEMON_SOCKET — explicit override (tests, custom deployments). // 2. $XDG_RUNTIME_DIR/gortex.sock — Linux standard for user runtime files. // This path is cleaned automatically on logout and has sensible perms. -// 3. The per-user state dir — $HOME/.cache/gortex on macOS/Linux, -// %LocalAppData%\gortex on Windows. +// 3. The per-user state dir — $HOME/.gortex/cache on macOS/Linux, +// %USERPROFILE%\.gortex\cache on Windows. // // AF_UNIX socket paths have a length limit (~104 bytes on macOS, 108 on // Linux and Windows). 
We don't enforce that here — the listener fails @@ -93,8 +93,12 @@ func LogFilePath() string { return filepath.Join(os.TempDir(), "gortex-daemon.log") } -// SnapshotPath returns the path the daemon saves graph snapshots to on -// periodic saves and clean shutdown. Loaded on startup for fast cold starts. +// SnapshotPath returns the legacy backend-agnostic snapshot path — +// `daemon.gob.gz` under the state dir. Kept for callers that haven't +// moved to backend-tagged storage yet (cloud indexer worker, ad-hoc +// `gortex index --snapshot` runs). The daemon itself routes through +// BackendSnapshotPath so a memory ↔ disk-backend switch can't read the +// other backend's snapshot — see that function's doc. func SnapshotPath() string { if override := os.Getenv("GORTEX_DAEMON_SNAPSHOT"); override != "" { return override @@ -105,10 +109,55 @@ func SnapshotPath() string { return filepath.Join(os.TempDir(), "gortex-daemon.gob.gz") } +// BackendSnapshotPath returns a backend-tagged snapshot path so the +// memory and disk backends use distinct files. The memory backend +// snapshot is a full gob+gzip of the in-memory graph; the disk +// backend snapshot is metadata-only (FileMtimes, contracts, vector +// index) because the graph itself lives in the on-disk store. Loading +// the memory backend's snapshot into a disk-backed daemon (or vice +// versa) silently produced wrong state — empty graph after disk→memory +// switch, decode-and-discard nodes after memory→disk — so a fresh +// daemon now picks the right file by backend tag. +// +// An empty or unrecognized backend tag falls back to SnapshotPath() so +// embedded callers that don't know the backend (the cloud indexer worker) keep working. +// +// GORTEX_DAEMON_SNAPSHOT overrides every backend tag — the override +// is an explicit "use exactly this path" signal.
+func BackendSnapshotPath(backend string) string { + if override := os.Getenv("GORTEX_DAEMON_SNAPSHOT"); override != "" { + return override + } + tag := normalizeBackendTag(backend) + if tag == "" { + return SnapshotPath() + } + filename := "daemon-" + tag + ".gob.gz" + if dir, ok := stateDir(); ok { + return filepath.Join(dir, filename) + } + return filepath.Join(os.TempDir(), "gortex-"+filename) +} + +// normalizeBackendTag canonicalizes a backend identifier into the +// short tag used in the snapshot filename — "memory" / "sqlite" / +// etc. Empty / unknown input returns the empty string so the caller +// can fall back to the legacy unsuffixed path. +func normalizeBackendTag(backend string) string { + switch backend { + case "memory", "mem", "in-memory": + return "memory" + case "sqlite", "sqlite3": + return "sqlite" + default: + return "" + } +} + // EnsureParentDir creates the parent directory of path with permissions // 0o700 (user only). Daemon state files live under the user's cache dir // and should not be world-readable. The mode is advisory on Windows, -// where filesystem ACLs already scope %LocalAppData% to the user. +// where filesystem ACLs already scope %USERPROFILE% to the user. func EnsureParentDir(path string) error { dir := filepath.Dir(path) return os.MkdirAll(dir, 0o700) diff --git a/internal/daemon/pidfile_test.go b/internal/daemon/pidfile_test.go new file mode 100644 index 00000000..9182ecb2 --- /dev/null +++ b/internal/daemon/pidfile_test.go @@ -0,0 +1,67 @@ +package daemon + +import ( + "os" + "path/filepath" + "strconv" + "testing" +) + +// TestRunningPID covers the four states RunningPID must distinguish: no PID +// file, a live owner, a stale owner (process gone), and a corrupt file. The +// stale case is the load-bearing one — misreading a crashed daemon's leftover +// PID file as "running" would block every subsequent start. 
+func TestRunningPID(t *testing.T) { + pidPath := filepath.Join(t.TempDir(), "daemon.pid") + t.Setenv("GORTEX_DAEMON_PIDFILE", pidPath) + + t.Run("no pid file", func(t *testing.T) { + if pid, ok := RunningPID(); ok { + t.Fatalf("want (0,false), got (%d,%v)", pid, ok) + } + }) + + t.Run("live owner", func(t *testing.T) { + writePID(t, pidPath, os.Getpid()) + pid, ok := RunningPID() + if !ok || pid != os.Getpid() { + t.Fatalf("want (%d,true), got (%d,%v)", os.Getpid(), pid, ok) + } + }) + + t.Run("live owner with trailing newline", func(t *testing.T) { + // A pidfile written by `echo`/a process manager ends in "\n". The + // guard must still detect the live owner — otherwise a restart + // silently races the store lock again. + if err := os.WriteFile(pidPath, []byte(strconv.Itoa(os.Getpid())+"\n"), 0o600); err != nil { + t.Fatal(err) + } + if pid, ok := RunningPID(); !ok || pid != os.Getpid() { + t.Fatalf("want (%d,true), got (%d,%v)", os.Getpid(), pid, ok) + } + }) + + t.Run("stale owner", func(t *testing.T) { + // A PID well above any platform's pid_max — guaranteed not live. 
+ writePID(t, pidPath, 1<<30) + if pid, ok := RunningPID(); ok { + t.Fatalf("stale pid must read as not running, got (%d,%v)", pid, ok) + } + }) + + t.Run("corrupt file", func(t *testing.T) { + if err := os.WriteFile(pidPath, []byte("not-a-pid"), 0o600); err != nil { + t.Fatal(err) + } + if pid, ok := RunningPID(); ok { + t.Fatalf("corrupt pid file must read as not running, got (%d,%v)", pid, ok) + } + }) +} + +func writePID(t *testing.T, path string, pid int) { + t.Helper() + if err := os.WriteFile(path, []byte(strconv.Itoa(pid)), 0o600); err != nil { + t.Fatal(err) + } +} diff --git a/internal/daemon/proto.go b/internal/daemon/proto.go index 5a7d4db8..47beac1f 100644 --- a/internal/daemon/proto.go +++ b/internal/daemon/proto.go @@ -91,6 +91,28 @@ const ( ControlStatus = "status" ControlShutdown = "shutdown" ControlSearchSymbols = "search_symbols" + // ControlEnrichChurn dispatches to Controller.EnrichChurn — the daemon + // runs the churn enricher against its in-process graph so the CLI + // (and the post-commit / post-merge git hooks) don't have to fight + // the on-disk store's write lock the daemon holds. + ControlEnrichChurn = "enrich_churn" + // ControlEnrichReleases dispatches to Controller.EnrichReleases. + // Same routing rationale as ControlEnrichChurn — the CLI hands the + // enrichment to the daemon when one is up so the write lock stays + // uncontested. + ControlEnrichReleases = "enrich_releases" + // ControlEnrichBlame dispatches to Controller.EnrichBlame — git-blame + // authorship stamping against the daemon's in-process graph. Same + // routing rationale as ControlEnrichChurn. + ControlEnrichBlame = "enrich_blame" + // ControlEnrichCoverage dispatches to Controller.EnrichCoverage — + // Go cover-profile projection onto the daemon's in-process graph. + // The CLI parses the profile and hands the raw segments to the + // daemon so the daemon never has to read the caller's filesystem. 
+ ControlEnrichCoverage = "enrich_coverage" + // ControlEnrichCochange dispatches to Controller.EnrichCochange — + // co-change edge mining against the daemon's in-process graph. + ControlEnrichCochange = "enrich_cochange" ) // TrackParams is the payload for ControlTrack. @@ -239,13 +261,125 @@ type SearchSymbolsResult struct { Hits []SymbolHit `json:"hits"` } +// EnrichChurnParams is the payload for ControlEnrichChurn. +// +// Path scopes the enrichment to a single tracked repo (matched by +// prefix, abs path, or "" for "every tracked repo"). Branch overrides +// the default-branch resolution — pass "origin/main" / "main" / a tag +// / a SHA. Empty Branch means the daemon picks the default branch +// from each repo's working tree. +type EnrichChurnParams struct { + Path string `json:"path,omitempty"` + Branch string `json:"branch,omitempty"` +} + +// EnrichChurnResult is the payload returned under Result for a +// successful ControlEnrichChurn call. Counts are summed across every +// repo that participated (typically one). +type EnrichChurnResult struct { + Files int `json:"files"` + Symbols int `json:"symbols"` + Branch string `json:"branch"` + HeadSHA string `json:"head_sha"` + DurationMS int64 `json:"duration_ms"` +} + +// EnrichReleasesParams is the payload for ControlEnrichReleases. +// +// Path scopes the enrichment to a single tracked repo (prefix or +// absolute root, "" for "every tracked repo"). Branch restricts the +// considered tags to those reachable from that branch; empty Branch +// means "every tag in the repo" — matches the legacy `analyze +// kind=releases` behaviour. +type EnrichReleasesParams struct { + Path string `json:"path,omitempty"` + Branch string `json:"branch,omitempty"` +} + +// EnrichReleasesResult is the payload returned under Result for a +// successful ControlEnrichReleases call. Files is the count of file +// nodes stamped with meta.added_in across every repo that +// participated. 
+type EnrichReleasesResult struct { + Files int `json:"files"` + Branch string `json:"branch,omitempty"` + DurationMS int64 `json:"duration_ms"` +} + +// EnrichBlameParams is the payload for ControlEnrichBlame. +// +// Path scopes the enrichment to a single tracked repo (matched by +// prefix, abs path, or "" for "every tracked repo"). +type EnrichBlameParams struct { + Path string `json:"path,omitempty"` +} + +// EnrichBlameResult is the payload returned under Result for a +// successful ControlEnrichBlame call. Nodes is the count of symbol / +// file nodes stamped with meta.last_authored across every repo that +// participated. +type EnrichBlameResult struct { + Nodes int `json:"nodes"` + DurationMS int64 `json:"duration_ms"` +} + +// EnrichCoverageSegment mirrors coverage.Segment on the wire so the +// CLI can parse the cover profile against its own filesystem (the +// profile path is relative to the caller, not the daemon) and hand the +// parsed segments to the daemon. Field shape matches coverage.Segment +// exactly. +type EnrichCoverageSegment struct { + File string `json:"file"` + StartLine int `json:"start_line"` + EndLine int `json:"end_line"` + NumStmt int `json:"num_stmt"` + Count int `json:"count"` +} + +// EnrichCoverageParams is the payload for ControlEnrichCoverage. +// +// Path scopes the enrichment to a single tracked repo (matched by +// prefix, abs path, or "" for "every tracked repo"). Segments are the +// pre-parsed cover-profile entries; the CLI parses the profile so the +// daemon never has to read the caller's filesystem. +type EnrichCoverageParams struct { + Path string `json:"path,omitempty"` + Segments []EnrichCoverageSegment `json:"segments"` +} + +// EnrichCoverageResult is the payload returned under Result for a +// successful ControlEnrichCoverage call. Symbols is the count of nodes +// stamped with meta.coverage_pct across every repo that participated; +// Segments echoes how many profile segments were supplied. 
+type EnrichCoverageResult struct { + Symbols int `json:"symbols"` + Segments int `json:"segments"` + DurationMS int64 `json:"duration_ms"` +} + +// EnrichCochangeParams is the payload for ControlEnrichCochange. +// +// Path scopes the enrichment to a single tracked repo (matched by +// prefix, abs path, or "" for "every tracked repo"). +type EnrichCochangeParams struct { + Path string `json:"path,omitempty"` +} + +// EnrichCochangeResult is the payload returned under Result for a +// successful ControlEnrichCochange call. Edges is the count of +// co_change edges added across every repo that participated. +type EnrichCochangeResult struct { + Edges int `json:"edges"` + DurationMS int64 `json:"duration_ms"` +} + // TrackedRepoStatus is one row in StatusResponse.TrackedRepos. type TrackedRepoStatus struct { Prefix string `json:"prefix"` Path string `json:"path"` Name string `json:"name,omitempty"` // Project is the GlobalConfig active-project slug — a named - // grouping of repos in `~/.config/gortex/config.yaml::projects`. + // grouping of repos in `~/.gortex/config.yaml::projects`. // Distinct from `WorkspaceProject` below, which is the project // slug from `.gortex.yaml::project`. Kept here for backwards // compatibility with older daemon clients that read the field. diff --git a/internal/daemon/server.go b/internal/daemon/server.go index 6a19e483..527b6eee 100644 --- a/internal/daemon/server.go +++ b/internal/daemon/server.go @@ -13,6 +13,7 @@ import ( "os/signal" "runtime" "strconv" + "strings" "sync" "time" @@ -97,6 +98,25 @@ type Controller interface { // (Claude Code's Grep-redirect hook) that need a single short answer // without setting up a full MCP session. SearchSymbols(ctx context.Context, params SearchSymbolsParams) (SearchSymbolsResult, error) + // EnrichChurn runs the per-symbol / per-file churn enricher against + // the daemon's in-process graph. 
Exposed over the control surface so + // CLI invocations (and the post-commit / post-merge git hook) can + // trigger it without taking the on-disk store's write lock the daemon owns. + EnrichChurn(ctx context.Context, params EnrichChurnParams) (EnrichChurnResult, error) + // EnrichReleases runs the per-file release enricher against the + // daemon's in-process graph. Same routing rationale as + // EnrichChurn — keeps the on-disk store's write lock with the daemon. + EnrichReleases(ctx context.Context, params EnrichReleasesParams) (EnrichReleasesResult, error) + // EnrichBlame runs the git-blame authorship enricher against the + // daemon's in-process graph. Same routing rationale as EnrichChurn. + EnrichBlame(ctx context.Context, params EnrichBlameParams) (EnrichBlameResult, error) + // EnrichCoverage projects pre-parsed Go cover-profile segments onto + // the daemon's in-process graph. The CLI parses the profile so the + // daemon never reads the caller's filesystem. + EnrichCoverage(ctx context.Context, params EnrichCoverageParams) (EnrichCoverageResult, error) + // EnrichCochange mines co-change edges against the daemon's + // in-process graph. Same routing rationale as EnrichChurn. + EnrichCochange(ctx context.Context, params EnrichCochangeParams) (EnrichCochangeResult, error) // Shutdown is invoked via the control surface and should return // quickly; the daemon's actual shutdown work happens after the // response is written. @@ -121,7 +141,7 @@ func New(socketPath, version string, logger *zap.Logger) *Server { // Listen creates the socket, writes the PID file, and installs the // shutdown-signal handlers for graceful shutdown. The socket permissions // are 0o600 on Unix — the daemon is user-local and nothing else on the -// machine should reach it; on Windows, %LocalAppData% ACLs scope it to +// machine should reach it; on Windows, %USERPROFILE% ACLs scope it to // the user instead. 
func (s *Server) Listen() error { if err := EnsureParentDir(s.SocketPath); err != nil { @@ -142,7 +162,7 @@ func (s *Server) Listen() error { return fmt.Errorf("listen: %w", err) } // chmod the socket to user-only on Unix. Windows has no POSIX mode - // bits — the socket inherits the ACLs of %LocalAppData%, which is + // bits — the socket inherits the ACLs of %USERPROFILE%, which is // already user-scoped — so skip it there. if runtime.GOOS != "windows" { if err := os.Chmod(s.SocketPath, 0o600); err != nil { @@ -517,6 +537,81 @@ func (s *Server) handleControl(_ *Session, req ControlRequest) ControlResponse { return controlErr(ErrInternal, err.Error()) } return ControlResponse{OK: true} + + case ControlEnrichChurn: + var p EnrichChurnParams + if err := unmarshalParams(req.Params, &p); err != nil { + return controlErr(ErrInternal, err.Error()) + } + result, err := s.Controller.EnrichChurn(ctx, p) + if err != nil { + return controlErr(ErrInternal, err.Error()) + } + buf, err := json.Marshal(result) + if err != nil { + return controlErr(ErrInternal, "marshal enrich_churn result: "+err.Error()) + } + return ControlResponse{OK: true, Result: buf} + + case ControlEnrichReleases: + var p EnrichReleasesParams + if err := unmarshalParams(req.Params, &p); err != nil { + return controlErr(ErrInternal, err.Error()) + } + result, err := s.Controller.EnrichReleases(ctx, p) + if err != nil { + return controlErr(ErrInternal, err.Error()) + } + buf, err := json.Marshal(result) + if err != nil { + return controlErr(ErrInternal, "marshal enrich_releases result: "+err.Error()) + } + return ControlResponse{OK: true, Result: buf} + + case ControlEnrichBlame: + var p EnrichBlameParams + if err := unmarshalParams(req.Params, &p); err != nil { + return controlErr(ErrInternal, err.Error()) + } + result, err := s.Controller.EnrichBlame(ctx, p) + if err != nil { + return controlErr(ErrInternal, err.Error()) + } + buf, err := json.Marshal(result) + if err != nil { + return 
controlErr(ErrInternal, "marshal enrich_blame result: "+err.Error()) + } + return ControlResponse{OK: true, Result: buf} + + case ControlEnrichCoverage: + var p EnrichCoverageParams + if err := unmarshalParams(req.Params, &p); err != nil { + return controlErr(ErrInternal, err.Error()) + } + result, err := s.Controller.EnrichCoverage(ctx, p) + if err != nil { + return controlErr(ErrInternal, err.Error()) + } + buf, err := json.Marshal(result) + if err != nil { + return controlErr(ErrInternal, "marshal enrich_coverage result: "+err.Error()) + } + return ControlResponse{OK: true, Result: buf} + + case ControlEnrichCochange: + var p EnrichCochangeParams + if err := unmarshalParams(req.Params, &p); err != nil { + return controlErr(ErrInternal, err.Error()) + } + result, err := s.Controller.EnrichCochange(ctx, p) + if err != nil { + return controlErr(ErrInternal, err.Error()) + } + buf, err := json.Marshal(result) + if err != nil { + return controlErr(ErrInternal, "marshal enrich_cochange result: "+err.Error()) + } + return ControlResponse{OK: true, Result: buf} } return controlErr(ErrInternal, "unknown control kind: "+req.Kind) } @@ -573,6 +668,39 @@ func (s *Server) writePIDFile() error { return os.WriteFile(path, []byte(strconv.Itoa(os.Getpid())), 0o600) } +// RunningPID reports the PID of a live daemon recorded in the PID file, or +// (0, false) when none is. Unlike IsRunning — which only probes the control +// socket — this still reports a daemon that is *mid-shutdown*: the +// ControlShutdown handler tears the listener down ~100ms after acking, but +// the process stays alive while it flushes and closes the store, and it +// holds the store's on-disk lock until it exits. That window is exactly what +// turned a quick restart into a "failed to open database" lock conflict, so +// callers that must not start a second daemon over the top of a dying one — +// or that need to wait for it to exit — consult this, not the socket. 
+// +// A PID file whose process is dead is stale (the owner crashed without +// cleanup) and reported as not-running, mirroring writePIDFile's own +// staleness handling. +func RunningPID() (int, bool) { + b, err := os.ReadFile(PIDFilePath()) + if err != nil { + return 0, false + } + // TrimSpace so a PID file written with a trailing newline — by a shell + // `echo`, a process manager, or a hand edit — still parses. The daemon + // writes it without one, but tolerating both is free and the silent + // failure mode (guard never fires, restart races the lock again) is + // exactly the bug this helper exists to prevent. + pid, err := strconv.Atoi(strings.TrimSpace(string(b))) + if err != nil || pid <= 0 { + return 0, false + } + if !platform.ProcessAlive(pid) { + return 0, false + } + return pid, true +} + func (s *Server) trackConn(c net.Conn) { s.connsMu.Lock() s.conns[c] = struct{}{} diff --git a/internal/daemon/server_test.go b/internal/daemon/server_test.go index cf8dfdf3..8a2a5737 100644 --- a/internal/daemon/server_test.go +++ b/internal/daemon/server_test.go @@ -33,6 +33,12 @@ type fakeController struct { searchCalls []SearchSymbolsParams searchHits []SymbolHit searchErr error + + enrichChurnCalls []EnrichChurnParams + enrichReleasesCalls []EnrichReleasesParams + enrichBlameCalls []EnrichBlameParams + enrichCoverageCalls []EnrichCoverageParams + enrichCochangeCalls []EnrichCochangeParams } func (f *fakeController) Track(_ context.Context, p TrackParams) (json.RawMessage, error) { @@ -84,6 +90,41 @@ func (f *fakeController) SearchSymbols(_ context.Context, p SearchSymbolsParams) return SearchSymbolsResult{Hits: f.searchHits}, nil } +func (f *fakeController) EnrichChurn(_ context.Context, p EnrichChurnParams) (EnrichChurnResult, error) { + f.mu.Lock() + defer f.mu.Unlock() + f.enrichChurnCalls = append(f.enrichChurnCalls, p) + return EnrichChurnResult{Files: 1, Symbols: 2, Branch: p.Branch}, nil +} + +func (f *fakeController) EnrichReleases(_ context.Context, p 
EnrichReleasesParams) (EnrichReleasesResult, error) { + f.mu.Lock() + defer f.mu.Unlock() + f.enrichReleasesCalls = append(f.enrichReleasesCalls, p) + return EnrichReleasesResult{Files: 3, Branch: p.Branch}, nil +} + +func (f *fakeController) EnrichBlame(_ context.Context, p EnrichBlameParams) (EnrichBlameResult, error) { + f.mu.Lock() + defer f.mu.Unlock() + f.enrichBlameCalls = append(f.enrichBlameCalls, p) + return EnrichBlameResult{Nodes: 5}, nil +} + +func (f *fakeController) EnrichCoverage(_ context.Context, p EnrichCoverageParams) (EnrichCoverageResult, error) { + f.mu.Lock() + defer f.mu.Unlock() + f.enrichCoverageCalls = append(f.enrichCoverageCalls, p) + return EnrichCoverageResult{Symbols: 7, Segments: len(p.Segments)}, nil +} + +func (f *fakeController) EnrichCochange(_ context.Context, p EnrichCochangeParams) (EnrichCochangeResult, error) { + f.mu.Lock() + defer f.mu.Unlock() + f.enrichCochangeCalls = append(f.enrichCochangeCalls, p) + return EnrichCochangeResult{Edges: 11}, nil +} + // newDaemon spins up a Server on a short socket path + Fake controller. // macOS limits Unix socket paths to ~104 chars (sizeof(sun_path)), and // Go's t.TempDir() path can exceed that for long test names, so we mint @@ -162,6 +203,80 @@ func TestDaemon_ControlTrackUntrack(t *testing.T) { assert.Equal(t, "myapp", ctrl.untrackCalls[0].PathOrPrefix) } +// TestDaemon_ControlEnrichDispatch exercises the control dispatch for +// every enrich verb — confirming each routes to the matching Controller +// method, round-trips its Params, and decodes the typed Result. This is +// the contract the `gortex enrich` CLI relies on when it forwards to a +// running daemon. 
+func TestDaemon_ControlEnrichDispatch(t *testing.T) { + ctrl := &fakeController{} + _, socket := newDaemon(t, ctrl) + + c, err := DialTo(socket, Handshake{Mode: ModeControl, ClientName: "cli"}) + require.NoError(t, err) + defer func() { _ = c.Close() }() + + // churn + churnResp, err := c.Control(ControlEnrichChurn, EnrichChurnParams{Path: "/r", Branch: "main"}) + require.NoError(t, err) + require.True(t, churnResp.OK, "churn: %+v", churnResp) + var churnOut EnrichChurnResult + require.NoError(t, json.Unmarshal(churnResp.Result, &churnOut)) + assert.Equal(t, 1, churnOut.Files) + assert.Equal(t, 2, churnOut.Symbols) + assert.Equal(t, "main", churnOut.Branch) + + // releases + relResp, err := c.Control(ControlEnrichReleases, EnrichReleasesParams{Path: "/r", Branch: "main"}) + require.NoError(t, err) + require.True(t, relResp.OK, "releases: %+v", relResp) + var relOut EnrichReleasesResult + require.NoError(t, json.Unmarshal(relResp.Result, &relOut)) + assert.Equal(t, 3, relOut.Files) + + // blame + blameResp, err := c.Control(ControlEnrichBlame, EnrichBlameParams{Path: "/r"}) + require.NoError(t, err) + require.True(t, blameResp.OK, "blame: %+v", blameResp) + var blameOut EnrichBlameResult + require.NoError(t, json.Unmarshal(blameResp.Result, &blameOut)) + assert.Equal(t, 5, blameOut.Nodes) + + // coverage + covResp, err := c.Control(ControlEnrichCoverage, EnrichCoverageParams{ + Path: "/r", + Segments: []EnrichCoverageSegment{ + {File: "a.go", StartLine: 1, EndLine: 3, NumStmt: 2, Count: 1}, + {File: "a.go", StartLine: 4, EndLine: 6, NumStmt: 1, Count: 0}, + }, + }) + require.NoError(t, err) + require.True(t, covResp.OK, "coverage: %+v", covResp) + var covOut EnrichCoverageResult + require.NoError(t, json.Unmarshal(covResp.Result, &covOut)) + assert.Equal(t, 7, covOut.Symbols) + assert.Equal(t, 2, covOut.Segments) + + // cochange + coResp, err := c.Control(ControlEnrichCochange, EnrichCochangeParams{Path: "/r"}) + require.NoError(t, err) + require.True(t, coResp.OK, 
"cochange: %+v", coResp) + var coOut EnrichCochangeResult + require.NoError(t, json.Unmarshal(coResp.Result, &coOut)) + assert.Equal(t, 11, coOut.Edges) + + ctrl.mu.Lock() + defer ctrl.mu.Unlock() + require.Len(t, ctrl.enrichChurnCalls, 1) + assert.Equal(t, "/r", ctrl.enrichChurnCalls[0].Path) + require.Len(t, ctrl.enrichReleasesCalls, 1) + require.Len(t, ctrl.enrichBlameCalls, 1) + assert.Equal(t, "/r", ctrl.enrichBlameCalls[0].Path) + require.Len(t, ctrl.enrichCoverageCalls, 1) + assert.Len(t, ctrl.enrichCoverageCalls[0].Segments, 2) + require.Len(t, ctrl.enrichCochangeCalls, 1) +} + func TestDaemon_ProtocolMismatchRejected(t *testing.T) { _, socket := newDaemon(t, &fakeController{}) // Bump the version so the daemon rejects us. diff --git a/internal/daemon/servers.go b/internal/daemon/servers.go index 8a386db3..9e598376 100644 --- a/internal/daemon/servers.go +++ b/internal/daemon/servers.go @@ -30,7 +30,7 @@ import ( // Auth: prefer AuthTokenEnv (an env-var name the daemon resolves at // connect time) over AuthToken (a literal value). Putting raw // secrets in `servers.toml` is allowed for parity with how -// `~/.config/gortex/config.yaml` already gets written by `gortex +// `~/.gortex/config.yaml` already gets written by `gortex // track`, but the env-var form is the recommended path. // // Workspaces is the optional pre-declared roster: when set, the @@ -61,11 +61,10 @@ type ServersConfig struct { // 1. $GORTEX_DAEMON_SERVERS — explicit override (tests, custom // deployments). // 2. $HOME/.gortex/servers.toml — the canonical user-level file. -// Note this is NOT under `~/.config/gortex/` (where global.yaml -// lives) — `~/.gortex/` is the daemon-control directory and is -// the same place tracking scripts and `gortex daemon` already -// write to. An absolute $XDG_CONFIG_HOME relocates this to -// /gortex/servers.toml. 
+// It lives in the unified `~/.gortex/` tree alongside the global +// `config.yaml`, the same place tracking scripts and `gortex +// daemon` already write to. An absolute $XDG_CONFIG_HOME relocates +// this to /gortex/servers.toml. // 3. $TEMPDIR/gortex-servers.toml — last-resort fallback so the // daemon can still come up in an environment with no $HOME. func ServersConfigPath() string { @@ -77,7 +76,7 @@ func ServersConfigPath() string { return filepath.Join(os.TempDir(), "gortex-servers.toml") } } - return filepath.Join(platform.LegacyConfigDir(), "servers.toml") + return filepath.Join(platform.ConfigDir(), "servers.toml") } // LoadServersConfig reads and validates ~/.gortex/servers.toml. A diff --git a/internal/dataflow/dataflow.go b/internal/dataflow/dataflow.go index 390c29cc..1c0d5368 100644 --- a/internal/dataflow/dataflow.go +++ b/internal/dataflow/dataflow.go @@ -79,13 +79,13 @@ func (p Path) Length() int { return len(p.Edges) } // Engine is the dataflow query backend. It holds a reference to // the graph and exposes the two MCP-ready primitives. Concurrency- -// safe by virtue of relying only on graph.Graph's read methods. +// safe by virtue of relying only on graph.Store's read methods. type Engine struct { - g *graph.Graph + g graph.Store } // New returns an engine backed by the given graph. -func New(g *graph.Graph) *Engine { return &Engine{g: g} } +func New(g graph.Store) *Engine { return &Engine{g: g} } // IsDataflowKind returns true for the three edge kinds the BFS // traverses. @@ -372,6 +372,17 @@ func (p TaintPattern) matches(n *graph.Node) bool { // distinct symbol IDs whose nodes match the pattern. Returns the // caller-friendly nodes themselves so MCP responses can include // names + paths without a second lookup. +// +// The seed set is bounded by taintEligibleKinds — the fixed 8-kind +// allowlist (function/method/param/field/variable/constant/type/ +// interface) that taintEligible enforces. 
Iterating the per-kind +// NodesByKind bucket of each lets the backend stream only those +// kinds instead of materialising the full node table; +// on a disk backend AllNodes() pulls ~70k rows per request just to land +// at a handful of taint candidates. Pattern post-filters (name / +// path / pattern-supplied kind) still run Go-side — they compose +// AND, can't be projected onto the bucket index efficiently, and +// the per-bucket population is already small. func (e *Engine) ResolveCandidates(p TaintPattern, limit int) []*graph.Node { if e == nil || e.g == nil || p.Empty() { return nil @@ -380,37 +391,37 @@ func (e *Engine) ResolveCandidates(p TaintPattern, limit int) []*graph.Node { limit = 100 } out := make([]*graph.Node, 0, 16) - for _, n := range e.g.AllNodes() { - if !taintEligible(n) { - continue - } - if !p.matches(n) { - continue - } - out = append(out, n) + for _, k := range taintEligibleKinds { if len(out) >= limit { break } + for n := range e.g.NodesByKind(k) { + if n == nil { + continue + } + if !p.matches(n) { + continue + } + out = append(out, n) + if len(out) >= limit { + break + } + } } sort.SliceStable(out, func(i, j int) bool { return out[i].ID < out[j].ID }) return out } -// taintEligible filters the node universe to symbols that could -// plausibly be a dataflow source or sink. Files / imports / pkg +// taintEligibleKinds is the seed-bucket allowlist of node kinds that +// could plausibly be a dataflow source or sink. Files / imports / pkg // markers don't carry value semantics, so excluding them up front -// keeps the candidate set focused. -func taintEligible(n *graph.Node) bool { - if n == nil { - return false - } - switch n.Kind { - case graph.KindFunction, graph.KindMethod, graph.KindParam, - graph.KindField, graph.KindVariable, graph.KindConstant, - graph.KindType, graph.KindInterface: - return true - } - return false +// keeps the candidate set focused. 
Kept as a slice (not a set) so +// callers can iterate the NodesByKind bucket of each kind in a stable +// order. +var taintEligibleKinds = []graph.NodeKind{ + graph.KindFunction, graph.KindMethod, graph.KindParam, + graph.KindField, graph.KindVariable, graph.KindConstant, + graph.KindType, graph.KindInterface, } // TaintFinding is one (source, sink) hit produced by TaintPaths. diff --git a/internal/docs/docs.go b/internal/docs/docs.go index a5a8876e..cc333797 100644 --- a/internal/docs/docs.go +++ b/internal/docs/docs.go @@ -105,7 +105,7 @@ type BlameSummary struct { // Deps bundles the runtime dependencies injected by the MCP/CLI layer. type Deps struct { - Graph *graph.Graph + Graph graph.Store History HistoryProvider Blame BlameRunner } @@ -189,7 +189,7 @@ func Generate(deps Deps, opts Options) (*Bundle, error) { // walkNodes does a single pass over symbol nodes and emits the // ownership and stale-code tables in a single pass. -func walkNodes(g *graph.Graph, opts Options, now time.Time) ([]OwnershipRow, []StaleCodeRow) { +func walkNodes(g graph.Store, opts Options, now time.Time) ([]OwnershipRow, []StaleCodeRow) { type ownerStats struct { row OwnershipRow fileSet map[string]struct{} diff --git a/internal/embedding/gomlx.go b/internal/embedding/gomlx.go index 1237d7b3..2d4c42b6 100644 --- a/internal/embedding/gomlx.go +++ b/internal/embedding/gomlx.go @@ -91,7 +91,7 @@ func (p *GoMLXProvider) Close() error { } func ensureGoMLXModel() (string, error) { - dest := filepath.Join(platform.CacheDir(), "models") + dest := platform.ModelsDir() modelDir := filepath.Join(dest, "sentence-transformers_all-MiniLM-L6-v2") if _, err := os.Stat(filepath.Join(modelDir, "tokenizer.json")); err == nil { diff --git a/internal/embedding/hugot.go b/internal/embedding/hugot.go index 14c71dd1..9208fbf6 100644 --- a/internal/embedding/hugot.go +++ b/internal/embedding/hugot.go @@ -171,7 +171,7 @@ func (p *HugotProvider) Close() error { // variants and the downloader refuses to guess. 
The cache layout // mirrors Hugot's own convention: `/_/…`. func ensureHugotModel(spec HugotVariant) (string, error) { - dest := filepath.Join(platform.CacheDir(), "models") + dest := platform.ModelsDir() modelDir := filepath.Join(dest, hfCacheDirName(spec.RepoID)) tokenizerReady := false diff --git a/internal/embedding/onnx.go b/internal/embedding/onnx.go index f49081a3..02cefe72 100644 --- a/internal/embedding/onnx.go +++ b/internal/embedding/onnx.go @@ -46,7 +46,7 @@ type ONNXProvider struct { func newONNXProvider() (Provider, error) { modelDir := findONNXModelDir() if modelDir == "" { - return nil, fmt.Errorf("ONNX model not found; place model.onnx + vocab.txt in ~/.cache/gortex/models/gte-small/") + return nil, fmt.Errorf("ONNX model not found; place model.onnx + vocab.txt in ~/.gortex/models/gte-small/") } modelPath := filepath.Join(modelDir, "model.onnx") @@ -282,10 +282,8 @@ func (p *ONNXProvider) wordPieceTokenize(word string) []int64 { // --- helpers --- func findONNXModelDir() string { - home, _ := os.UserHomeDir() candidates := []string{ - filepath.Join(platform.CacheDir(), "models", "gte-small"), - filepath.Join(home, ".gortex", "models", "gte-small"), + filepath.Join(platform.ModelsDir(), "gte-small"), "/tmp/gte-small", } for _, dir := range candidates { diff --git a/internal/embedding/provider.go b/internal/embedding/provider.go index 573f66c8..740dab57 100644 --- a/internal/embedding/provider.go +++ b/internal/embedding/provider.go @@ -100,7 +100,7 @@ func NewProviderFromConfig(cfg ProviderConfig) (Provider, error) { func NewLocalProvider() (Provider, error) { // Opt-in transformer backends (compiled in via build tags), then the // default Hugot pure-Go ONNX runtime which auto-downloads MiniLM-L6-v2 - // to ~/.cache/gortex/models/ on first use. + // to ~/.gortex/models/ on first use. 
factories := []func() (Provider, error){ newONNXProvider, newGoMLXProvider, diff --git a/internal/exporter/cypher.go b/internal/exporter/cypher.go index b278818b..34985c52 100644 --- a/internal/exporter/cypher.go +++ b/internal/exporter/cypher.go @@ -25,7 +25,7 @@ import ( // // CREATE INDEX ON :GortexNode(id); // Memgraph // CREATE INDEX FOR (n:GortexNode) ON (n.id); // Neo4j 5.x -func WriteCypher(w io.Writer, g *graph.Graph, opts Options) (Stats, error) { +func WriteCypher(w io.Writer, g graph.Store, opts Options) (Stats, error) { cw := &countingWriter{w: w} nodes, edges, _ := snapshot(g, opts) diff --git a/internal/exporter/exporter.go b/internal/exporter/exporter.go index 61ac6a60..2b2d474e 100644 --- a/internal/exporter/exporter.go +++ b/internal/exporter/exporter.go @@ -1,6 +1,6 @@ // Package exporter writes the in-memory graph to portable formats so users -// can load it into external visualization and query tools (Neo4j, Memgraph, -// Kuzu via Cypher; yEd, Gephi, Cytoscape via GraphML). +// can load it into external visualization and query tools (Neo4j, Memgraph +// via Cypher; yEd, Gephi, Cytoscape via GraphML). // // The exporter is read-only and operates on a snapshot — it never mutates // the graph. Filters (repo, kinds) are applied during emission. @@ -42,11 +42,11 @@ type Options struct { // Stats reports what was emitted. Returned by every exporter Write call. type Stats struct { - NodesWritten int - EdgesWritten int - NodesSkipped int - EdgesSkipped int - BytesWritten int64 + NodesWritten int + EdgesWritten int + NodesSkipped int + EdgesSkipped int + BytesWritten int64 } // nodeFilter returns true for nodes that pass the option filters. @@ -69,7 +69,7 @@ func (o *Options) nodeFilter(n *graph.Node) bool { // When opts.DropSynthetic is false (default), edges pointing at IDs that are // not real graph nodes (`unresolved::*`, `external::*`, `annotation::*`) get // synthesized stub nodes added to the result so the call topology is preserved. 
-func snapshot(g *graph.Graph, opts Options) ([]*graph.Node, []*graph.Edge, map[string]bool) { +func snapshot(g graph.Store, opts Options) ([]*graph.Node, []*graph.Edge, map[string]bool) { allNodes := g.AllNodes() allEdges := g.AllEdges() diff --git a/internal/exporter/graphml.go b/internal/exporter/graphml.go index 913fabf0..a265d601 100644 --- a/internal/exporter/graphml.go +++ b/internal/exporter/graphml.go @@ -15,7 +15,7 @@ import ( // All Gortex node properties are projected to GraphML attributes. // Free-form Meta is JSON-encoded into a single `meta_json` attribute so no // information is lost — viewers that don't care about it ignore it. -func WriteGraphML(w io.Writer, g *graph.Graph, opts Options) (Stats, error) { +func WriteGraphML(w io.Writer, g graph.Store, opts Options) (Stats, error) { cw := &countingWriter{w: w} nodes, edges, _ := snapshot(g, opts) diff --git a/internal/exporter/mermaid.go b/internal/exporter/mermaid.go index fbb8f13d..c68072ca 100644 --- a/internal/exporter/mermaid.go +++ b/internal/exporter/mermaid.go @@ -44,7 +44,7 @@ func (o MermaidOpts) withDefaults() MermaidOpts { // WriteMermaid emits a single Mermaid diagram for the chosen scope. // Use this when the caller asks for one file. For multi-file output // the CLI calls WriteMermaid once per scope into separate files. -func WriteMermaid(w io.Writer, g *graph.Graph, opts MermaidOpts) (Stats, error) { +func WriteMermaid(w io.Writer, g graph.Store, opts MermaidOpts) (Stats, error) { opts = opts.withDefaults() cw := &countingWriter{w: w} @@ -66,7 +66,7 @@ func WriteMermaid(w io.Writer, g *graph.Graph, opts MermaidOpts) (Stats, error) // renderForScope dispatches the Scope to the right diagram builder and // returns the rendered Mermaid plus a (nodes, edges) count that the // caller surfaces in Stats. 
-func renderForScope(g *graph.Graph, opts MermaidOpts) (body string, nodes, edges int, err error) { +func renderForScope(g graph.Store, opts MermaidOpts) (body string, nodes, edges int, err error) { switch strings.ToLower(opts.Scope) { case "architecture": body, nodes, edges = renderArchitecture(g, opts) @@ -101,7 +101,7 @@ func renderForScope(g *graph.Graph, opts MermaidOpts) (body string, nodes, edges // renderArchitecture builds a top-level community map with hub // annotations. Mirrors the layout used by the wiki page. -func renderArchitecture(g *graph.Graph, opts MermaidOpts) (string, int, int) { +func renderArchitecture(g graph.Store, opts MermaidOpts) (string, int, int) { comms := analysis.DetectCommunities(g) var sb strings.Builder sb.WriteString("graph TB\n") @@ -147,7 +147,7 @@ func renderArchitecture(g *graph.Graph, opts MermaidOpts) (string, int, int) { // renderCommunities is identical to architecture today but exposes // `graph LR` for a wider canvas. Caller picks via Scope. -func renderCommunities(g *graph.Graph, opts MermaidOpts) (string, int, int) { +func renderCommunities(g graph.Store, opts MermaidOpts) (string, int, int) { comms := analysis.DetectCommunities(g) var sb strings.Builder sb.WriteString("graph LR\n") @@ -187,7 +187,7 @@ func renderCommunities(g *graph.Graph, opts MermaidOpts) (string, int, int) { // renderProcesses lists every process as a small flowchart of // caller→callee pairs, capped to keep the rendering responsive. -func renderProcesses(g *graph.Graph, _ MermaidOpts) (string, int, int) { +func renderProcesses(g graph.Store, _ MermaidOpts) (string, int, int) { procs := analysis.DiscoverProcesses(g) var sb strings.Builder sb.WriteString("graph LR\n") @@ -244,7 +244,7 @@ func renderProcesses(g *graph.Graph, _ MermaidOpts) (string, int, int) { // emitCrossCommEdges writes EdgeCalls between communities (filtered // to the kept set) and returns the edge count. 
-func emitCrossCommEdges(sb *strings.Builder, g *graph.Graph, comms *analysis.CommunityResult, keep map[string]bool) int { +func emitCrossCommEdges(sb *strings.Builder, g graph.Store, comms *analysis.CommunityResult, keep map[string]bool) int { type edge struct { from, to string count int diff --git a/internal/githooks/install.go b/internal/githooks/install.go index bc32776a..ce02cb5b 100644 --- a/internal/githooks/install.go +++ b/internal/githooks/install.go @@ -18,11 +18,33 @@ import ( // Begin and end markers wrap the gortex-managed block inside a hook // file. The MARKER_BEGIN / MARKER_END convention is checked by every // install/uninstall pass and never re-written verbatim by the user. +// +// These exported constants preserve the post-commit form for callers +// that pre-date multi-hook support; new code goes through markerBegin +// / markerEnd which derive the strings from the hook name (so +// post-merge gets its own pair). const ( MarkerBegin = "# gortex-managed:post-commit:begin" MarkerEnd = "# gortex-managed:post-commit:end" ) +// SupportedHooks enumerates the hook names that InstallHook accepts. +// Anything else returns an error so we don't silently scatter our +// markers into hooks we haven't audited. +var SupportedHooks = []string{"post-commit", "post-merge"} + +func isSupportedHook(name string) bool { + for _, h := range SupportedHooks { + if h == name { + return true + } + } + return false +} + +func markerBegin(hook string) string { return "# gortex-managed:" + hook + ":begin" } +func markerEnd(hook string) string { return "# gortex-managed:" + hook + ":end" } + // InstallOpts controls what the installed hook runs. type InstallOpts struct { // Binary is the gortex executable path. Defaults to "gortex" @@ -42,6 +64,23 @@ type InstallOpts struct { // DocsOutPath is the docs bundle output path. Defaults to // "CHANGELOG_AUTO.md". DocsOutPath string + // RegenChurn toggles a `gortex enrich churn` run. 
The companion + // MCP tool get_churn_rate reads the data this enrich pass writes, + // so wiring this into post-commit / post-merge keeps the signal + // fresh without the agent paying the recompute cost at read time. + RegenChurn bool + // ChurnBranch overrides the branch the enricher pins to. Empty + // means "let `gortex enrich churn` resolve the default branch + // at run time" — the right default for shared repos where the + // branch name varies per checkout. + ChurnBranch string + // RegenReleases toggles a `gortex enrich releases` run. Same + // motivation as RegenChurn: keeps `analyze kind=releases` answers + // fresh without paying the per-call tag walk. + RegenReleases bool + // ReleasesBranch is the rev whose reachable tags bound the + // timeline. Empty means "resolve at hook run time". + ReleasesBranch string } func (o InstallOpts) withDefaults() InstallOpts { @@ -62,12 +101,11 @@ func (o InstallOpts) withDefaults() InstallOpts { // hookCommands builds the body the installer writes inside the // marker block. The body is a `#!/bin/sh` snippet that runs every -// enabled action and tolerates failures so the commit still -// completes when gortex isn't on PATH. -func hookCommands(opts InstallOpts) []string { +// enabled action and tolerates failures so the hook always completes. 
+func hookCommands(hook string, opts InstallOpts) []string { var cmds []string - cmds = append(cmds, "# Auto-regenerate gortex artefacts after each commit.") - cmds = append(cmds, "# Failures are tolerated so the commit always completes.") + cmds = append(cmds, fmt.Sprintf("# Auto-regenerate gortex artefacts on %s.", hook)) + cmds = append(cmds, "# Failures are tolerated so the hook always completes.") if opts.RegenMermaid { cmds = append(cmds, fmt.Sprintf("(%s export --format mermaid --scope all --out-dir %q --on-commit) >/dev/null 2>&1 || true", opts.Binary, opts.MermaidOutDir)) @@ -80,6 +118,22 @@ func hookCommands(opts InstallOpts) []string { cmds = append(cmds, fmt.Sprintf("(%s docs . --out %q) >/dev/null 2>&1 || true", opts.Binary, opts.DocsOutPath)) } + if opts.RegenChurn { + if strings.TrimSpace(opts.ChurnBranch) == "" { + cmds = append(cmds, fmt.Sprintf("(%s enrich churn) >/dev/null 2>&1 || true", opts.Binary)) + } else { + cmds = append(cmds, fmt.Sprintf("(%s enrich churn --branch=%q) >/dev/null 2>&1 || true", + opts.Binary, opts.ChurnBranch)) + } + } + if opts.RegenReleases { + if strings.TrimSpace(opts.ReleasesBranch) == "" { + cmds = append(cmds, fmt.Sprintf("(%s enrich releases) >/dev/null 2>&1 || true", opts.Binary)) + } else { + cmds = append(cmds, fmt.Sprintf("(%s enrich releases --branch=%q) >/dev/null 2>&1 || true", + opts.Binary, opts.ReleasesBranch)) + } + } if len(cmds) == 2 { // No actions selected — note it explicitly. cmds = append(cmds, "# (no regeneration actions enabled)") @@ -89,10 +143,22 @@ func hookCommands(opts InstallOpts) []string { // HookPath resolves the absolute path of the post-commit hook for the // repository rooted at repoRoot. Honours core.hooksPath when set. +// Thin wrapper over HookPathFor — preserved for backwards compatibility. 
func HookPath(repoRoot string) (string, error) { + return HookPathFor(repoRoot, "post-commit") +} + +// HookPathFor resolves the absolute path of the named hook file in +// the repository rooted at repoRoot. Honours core.hooksPath when set. +// hook is a bare hook name from SupportedHooks ("post-commit", +// "post-merge", …). +func HookPathFor(repoRoot, hook string) (string, error) { if repoRoot == "" { return "", fmt.Errorf("githooks: repoRoot is empty") } + if !isSupportedHook(hook) { + return "", fmt.Errorf("githooks: unsupported hook %q (supported: %s)", hook, strings.Join(SupportedHooks, ", ")) + } gitDir, err := runGit(repoRoot, "rev-parse", "--git-dir") if err != nil { return "", fmt.Errorf("githooks: not a git repository at %q: %w", repoRoot, err) @@ -112,15 +178,15 @@ func HookPath(repoRoot string) (string, error) { if err := os.MkdirAll(hooksDir, 0o755); err != nil { return "", fmt.Errorf("githooks: create hooks dir %q: %w", hooksDir, err) } - return filepath.Join(hooksDir, "post-commit"), nil + return filepath.Join(hooksDir, hook), nil } // StatusReport describes the current state of the post-commit hook. type StatusReport struct { - HookPath string `json:"hook_path"` - Exists bool `json:"exists"` - Managed bool `json:"managed"` // true iff our marker block is present - Body string `json:"body,omitempty"` + HookPath string `json:"hook_path"` + Exists bool `json:"exists"` + Managed bool `json:"managed"` // true iff our marker block is present + Body string `json:"body,omitempty"` } // Status reports the current state of the post-commit hook. Never @@ -148,36 +214,45 @@ func Status(repoRoot string) (StatusReport, error) { return rep, nil } -// InstallPostCommit writes a post-commit hook with the configured -// commands inside our marker block. Idempotent: re-running replaces +// InstallPostCommit is a backwards-compatible wrapper over InstallHook +// that installs the post-commit hook. 
New callers should reach for +// InstallHook directly so they can install post-merge too. +func InstallPostCommit(repoRoot string, opts InstallOpts) (string, error) { + return InstallHook(repoRoot, "post-commit", opts) +} + +// InstallHook writes the named hook with the configured commands +// inside a hook-specific marker block. Idempotent: re-running replaces // just the gortex block, leaving any other content intact. // // Returns the absolute path of the hook so callers can show it to the -// user. -func InstallPostCommit(repoRoot string, opts InstallOpts) (string, error) { +// user. `hook` must be one of SupportedHooks. +func InstallHook(repoRoot, hook string, opts InstallOpts) (string, error) { opts = opts.withDefaults() - hookPath, err := HookPath(repoRoot) + hookPath, err := HookPathFor(repoRoot, hook) if err != nil { return "", err } - cmds := hookCommands(opts) + cmds := hookCommands(hook, opts) + mBegin := markerBegin(hook) + mEnd := markerEnd(hook) var newBlock bytes.Buffer - newBlock.WriteString(MarkerBegin) + newBlock.WriteString(mBegin) newBlock.WriteString("\n") for _, line := range cmds { newBlock.WriteString(line) newBlock.WriteString("\n") } - newBlock.WriteString(MarkerEnd) + newBlock.WriteString(mEnd) newBlock.WriteString("\n") existing, _ := os.ReadFile(hookPath) // nil bytes when file doesn't exist var out bytes.Buffer if len(existing) == 0 { out.WriteString("#!/bin/sh\n") - out.WriteString("# Installed by `gortex githook install post-commit`.\n") + fmt.Fprintf(&out, "# Installed by `gortex githook install %s`.\n", hook) out.WriteString("# Marker block below is regenerated on each install/uninstall;\n") out.WriteString("# add your own commands outside the markers and they will be preserved.\n\n") out.Write(newBlock.Bytes()) @@ -187,10 +262,10 @@ func InstallPostCommit(repoRoot string, opts InstallOpts) (string, error) { if !strings.HasPrefix(body, "#!") { out.WriteString("#!/bin/sh\n") } - if strings.Contains(body, MarkerBegin) && 
strings.Contains(body, MarkerEnd) { + if strings.Contains(body, mBegin) && strings.Contains(body, mEnd) { // Replace existing block. - before, rest, _ := strings.Cut(body, MarkerBegin) - _, after, _ := strings.Cut(rest, MarkerEnd) + before, rest, _ := strings.Cut(body, mBegin) + _, after, _ := strings.Cut(rest, mEnd) after = strings.TrimLeft(after, "\n") out.WriteString(before) out.Write(newBlock.Bytes()) @@ -214,18 +289,25 @@ func InstallPostCommit(repoRoot string, opts InstallOpts) (string, error) { return hookPath, nil } -// UninstallPostCommit removes the gortex-managed block. If the file -// then contains nothing but the shebang and our installer comment, -// the file is deleted entirely. Otherwise we leave the residual -// (user-authored) content in place. +// UninstallPostCommit is a backwards-compatible wrapper. +func UninstallPostCommit(repoRoot string) (string, bool, error) { + return UninstallHook(repoRoot, "post-commit") +} + +// UninstallHook removes the gortex-managed block from the named hook. +// If the file then contains nothing but the shebang and our installer +// comment, the file is deleted entirely. Otherwise we leave the +// residual (user-authored) content in place. // // Returns the path of the hook (whether it now exists or was deleted) // and a bool indicating "block was found and removed". 
-func UninstallPostCommit(repoRoot string) (string, bool, error) { - hookPath, err := HookPath(repoRoot) +func UninstallHook(repoRoot, hook string) (string, bool, error) { + hookPath, err := HookPathFor(repoRoot, hook) if err != nil { return "", false, err } + mBegin := markerBegin(hook) + mEnd := markerEnd(hook) body, err := os.ReadFile(hookPath) if err != nil { if os.IsNotExist(err) { @@ -234,11 +316,11 @@ func UninstallPostCommit(repoRoot string) (string, bool, error) { return "", false, err } b := string(body) - if !strings.Contains(b, MarkerBegin) || !strings.Contains(b, MarkerEnd) { + if !strings.Contains(b, mBegin) || !strings.Contains(b, mEnd) { return hookPath, false, nil } - before, rest, _ := strings.Cut(b, MarkerBegin) - _, after, _ := strings.Cut(rest, MarkerEnd) + before, rest, _ := strings.Cut(b, mBegin) + _, after, _ := strings.Cut(rest, mEnd) after = strings.TrimLeft(after, "\n") cleaned := strings.TrimRight(before, "\n") + "\n" + after cleaned = strings.TrimSpace(cleaned) diff --git a/internal/githooks/install_test.go b/internal/githooks/install_test.go index 8a61810d..0de5217f 100644 --- a/internal/githooks/install_test.go +++ b/internal/githooks/install_test.go @@ -192,6 +192,78 @@ func TestStatus_NewRepo(t *testing.T) { } } +func TestInstallHook_PostMergeAndChurn(t *testing.T) { + repo := initRepo(t) + path, err := InstallHook(repo, "post-merge", InstallOpts{RegenChurn: true, ChurnBranch: "origin/main"}) + if err != nil { + t.Fatalf("InstallHook post-merge: %v", err) + } + if filepath.Base(path) != "post-merge" { + t.Errorf("expected post-merge hook file, got %s", path) + } + body, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read hook: %v", err) + } + got := string(body) + for _, want := range []string{ + "# gortex-managed:post-merge:begin", + "# gortex-managed:post-merge:end", + "gortex enrich churn", + `--branch="origin/main"`, + } { + if !strings.Contains(got, want) { + t.Errorf("hook missing %q. 
Body:\n%s", want, got) + } + } + // Post-commit and post-merge should be independently managed. + if _, err := InstallHook(repo, "post-commit", InstallOpts{RegenChurn: true}); err != nil { + t.Fatalf("InstallHook post-commit: %v", err) + } + if _, removed, err := UninstallHook(repo, "post-merge"); err != nil || !removed { + t.Fatalf("UninstallHook post-merge removed=%v err=%v", removed, err) + } + // Post-commit hook should still exist after we uninstalled post-merge. + postCommitPath, err := HookPathFor(repo, "post-commit") + if err != nil { + t.Fatalf("HookPathFor: %v", err) + } + if _, err := os.Stat(postCommitPath); err != nil { + t.Errorf("post-commit hook should survive post-merge uninstall: %v", err) + } +} + +func TestInstallHook_RegenReleases(t *testing.T) { + repo := initRepo(t) + path, err := InstallHook(repo, "post-merge", InstallOpts{ + RegenReleases: true, + ReleasesBranch: "origin/main", + }) + if err != nil { + t.Fatalf("InstallHook post-merge: %v", err) + } + body, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read hook: %v", err) + } + got := string(body) + for _, want := range []string{ + "gortex enrich releases", + `--branch="origin/main"`, + } { + if !strings.Contains(got, want) { + t.Errorf("hook missing %q. 
Body:\n%s", want, got) + } + } +} + +func TestInstallHook_RejectsUnsupportedHook(t *testing.T) { + repo := initRepo(t) + if _, err := InstallHook(repo, "pre-push", InstallOpts{RegenMermaid: true}); err == nil { + t.Fatal("expected error for unsupported hook pre-push") + } +} + func TestHookPath_HonoursCoreHooksPath(t *testing.T) { repo := initRepo(t) customHooks := filepath.Join(repo, "custom-hooks") diff --git a/internal/graph/edge.go b/internal/graph/edge.go index 50046b0f..bf697d30 100644 --- a/internal/graph/edge.go +++ b/internal/graph/edge.go @@ -3,7 +3,18 @@ package graph type EdgeKind string const ( - EdgeImports EdgeKind = "imports" + EdgeImports EdgeKind = "imports" + // EdgeContains links a file node to its non-symbol children — import + // nodes today, and a natural home for future side-band kinds + // (todos, fixtures) that "belong to" a file without being defined + // by it. EdgeDefines is the wrong fit for these because the file + // does not semantically *define* an import; it *contains* the + // import statement. Splitting the kinds lets walkers that want + // "real definitions" follow EdgeDefines and walkers that want the + // full file neighbourhood union both. The disk-backed + // GetFileSubGraph relies on this union to fetch every file + // neighbour in one pass. + EdgeContains EdgeKind = "contains" EdgeDefines EdgeKind = "defines" EdgeCalls EdgeKind = "calls" EdgeInstantiates EdgeKind = "instantiates" @@ -228,9 +239,17 @@ const ( // dataflow without materialising a graph node per local variable, // edges target a synthetic ID of the form: // - // <ownerID>#local:<name>@<line> + // <ownerID>#local:<name>@+<offset> // - // where ownerID is the enclosing function/method/closure node. + // where ownerID is the enclosing function/method/closure node + // and the offset is the local's 1-based line minus the owner's + // declaration line (leading `+` flags the value as a relative + // offset). 
The offset-based ID keeps locals stable across edits + // that shift the function as a whole — only edits inside the + // function above a binding shift that binding's ID. Each ID is + // also materialised as a KindLocal node linked to the owner + // via EdgeMemberOf; the search index excludes KindLocal so + // these per-binding nodes don't pollute name lookups. // These IDs are valid edge endpoints — BFS traverses them — but // no graph node is created, keeping search results free of // every transient binding in every function body. @@ -472,6 +491,15 @@ func BaseKindForCrossRepo(cr EdgeKind) (EdgeKind, bool) { return "", false } +// BaseKindsForCrossRepo returns the set of base edge kinds that have a +// parallel cross_repo_* variant. The slice is the single source of +// truth for callers (DetectCrossRepoEdges, the CrossRepoCandidates +// storage capability) that need the kind list without iterating +// CrossRepoKindFor over every edge. +func BaseKindsForCrossRepo() []EdgeKind { + return []EdgeKind{EdgeCalls, EdgeImplements, EdgeExtends} +} + type Edge struct { From string `json:"from"` To string `json:"to"` @@ -605,7 +633,7 @@ func DefaultOriginFor(kind EdgeKind, confidence float64, semanticSource string) } // Structural AST edges are unambiguous by construction. switch kind { - case EdgeDefines, EdgeImports, EdgeExtends, EdgeMemberOf, + case EdgeDefines, EdgeImports, EdgeContains, EdgeExtends, EdgeMemberOf, EdgeImplements, EdgeProvides, EdgeConsumes, EdgeMatches, // Coverage structural edges: the extractor produces an // unambiguous source→target binding for each, so they share @@ -656,7 +684,7 @@ func DefaultOriginFor(kind EdgeKind, confidence float64, semanticSource string) func ConfidenceLabelFor(kind EdgeKind, confidence float64) string { // Structural edges from AST are always extracted. 
switch kind { - case EdgeDefines, EdgeImports, EdgeExtends, EdgeMemberOf, EdgeImplements, + case EdgeDefines, EdgeImports, EdgeContains, EdgeExtends, EdgeMemberOf, EdgeImplements, EdgeProvides, EdgeConsumes, EdgeMatches, EdgeParamOf, EdgeAliases, EdgeComposes, EdgeOverrides, EdgeLicensedAs, EdgeOwns, EdgeAuthored, EdgeGeneratedBy, EdgeDependsOnModule, diff --git a/internal/graph/extraction_gap.go b/internal/graph/extraction_gap.go index a8d69162..2a4ac054 100644 --- a/internal/graph/extraction_gap.go +++ b/internal/graph/extraction_gap.go @@ -61,6 +61,24 @@ var usageEdgeKinds = map[EdgeKind]bool{ EdgeTests: true, } +// UsageInboundEdgeKinds returns the canonical list of incoming edge +// kinds that classify a symbol as "used" by ClassifyZeroEdge. Exposed +// for capability callers (NodeDegreeAggregator) that need to mirror +// the in-graph usage filter server-side. Order is stable so the slice +// is safe to pass directly to a query parameter binding. +func UsageInboundEdgeKinds() []EdgeKind { + return []EdgeKind{ + EdgeCalls, + EdgeReferences, + EdgeInstantiates, + EdgeImplements, + EdgeExtends, + EdgeReads, + EdgeWrites, + EdgeTests, + } +} + // ClassifyZeroEdge inspects a symbol's incoming and outgoing edges and // returns how an empty usage/caller/impact query for it should be read. // @@ -75,7 +93,7 @@ var usageEdgeKinds = map[EdgeKind]bool{ // An unknown symbol ID is reported as an extraction gap: a query whose // target is not even in the graph is exactly as untrustworthy as one // whose target was never wired up. -func ClassifyZeroEdge(g *Graph, symbolID string) ZeroEdgeClass { +func ClassifyZeroEdge(g Store, symbolID string) ZeroEdgeClass { if g == nil || symbolID == "" { return ZeroEdgePossibleExtractionGap } @@ -113,7 +131,7 @@ var zeroEdgeMessages = map[ZeroEdgeClass]string{ // query result on symbolID. 
It returns nil when the symbol has // incoming usage edges (ZeroEdgeNone) — a non-empty result carries no // caveat — so callers can attach the return value unconditionally. -func CaveatForZeroEdge(g *Graph, symbolID string) *ZeroEdgeCaveat { +func CaveatForZeroEdge(g Store, symbolID string) *ZeroEdgeCaveat { class := ClassifyZeroEdge(g, symbolID) if class == ZeroEdgeNone { return nil diff --git a/internal/graph/graph.go b/internal/graph/graph.go index 849aef5e..3383168b 100644 --- a/internal/graph/graph.go +++ b/internal/graph/graph.go @@ -1,6 +1,9 @@ package graph import ( + "iter" + "slices" + "strings" "sync" "sync/atomic" ) @@ -465,8 +468,59 @@ type Graph struct { allEdgesCacheMu sync.Mutex allEdgesCache []*Edge allEdgesCacheGen uint64 + + // cloneShingles is the in-memory implementation of the + // CloneShingle* capability: per-symbol MinHash shingle sets keyed by + // node id, alongside the repo prefix that owns each row so per-repo + // reseeds isolate correctly. Guarded by cloneShinglesMu. Slices are + // deep-copied on set and on read so callers can't mutate the stored + // state. The on-disk backend persists the same shape; the in-memory + // store keeps it live so the conformance suite exercises both. + cloneShinglesMu sync.Mutex + cloneShingles map[string]cloneShingleEntry + + // churnEnrich is the in-memory churn-enrichment sidecar (change A). + churnEnrichMu sync.Mutex + churnEnrich map[string]ChurnEnrichment + + // coverageEnrich is the in-memory coverage-enrichment sidecar. + coverageEnrichMu sync.Mutex + coverageEnrich map[string]CoverageEnrichment + + // releaseEnrich is the in-memory release-enrichment sidecar. + releaseEnrichMu sync.Mutex + releaseEnrich map[string]ReleaseEnrichment + + // blameEnrich is the in-memory blame-enrichment sidecar. + blameEnrichMu sync.Mutex + blameEnrich map[string]BlameEnrichment +} + +// cloneShingleEntry is one in-memory clone_shingles row: the owning +// repo prefix plus the (already deep-copied) shingle set. 
+type cloneShingleEntry struct { + repoPrefix string + shingles []uint64 } +// Compile-time assertions that the in-memory *Graph satisfies the +// optional clone-shingle and churn/coverage/release/blame enrichment +// capabilities, so the conformance suite runs the same path on both backends. +var ( + _ CloneShingleWriter = (*Graph)(nil) + _ CloneShingleReader = (*Graph)(nil) + _ ChurnEnrichmentWriter = (*Graph)(nil) + _ ChurnEnrichmentReader = (*Graph)(nil) + _ CoverageEnrichmentWriter = (*Graph)(nil) + _ CoverageEnrichmentReader = (*Graph)(nil) + _ ReleaseEnrichmentWriter = (*Graph)(nil) + _ ReleaseEnrichmentReader = (*Graph)(nil) + _ BlameEnrichmentWriter = (*Graph)(nil) + _ BlameEnrichmentReader = (*Graph)(nil) + _ ReleaseEnrichmentWriter = (*Graph)(nil) + _ ReleaseEnrichmentReader = (*Graph)(nil) +) + // New creates an empty graph. func New() *Graph { g := &Graph{} @@ -484,6 +538,1035 @@ func (g *Graph) ResolveMutex() *sync.Mutex { return &g.resolveMu } +// ReindexEdges is the batched sibling of ReindexEdge. The in-memory +// store has no per-call commit overhead so the implementation is a +// straight loop; the value of the batch API lives in the disk +// backends, where it collapses N transaction commits into one. +func (g *Graph) ReindexEdges(batch []EdgeReindex) { + for _, r := range batch { + if r.Edge == nil { + continue + } + g.ReindexEdge(r.Edge, r.OldTo) + } +} + +// BulkSetCloneShingles is the in-memory implementation of +// CloneShingleWriter. It records every (nodeID -> shingles) entry for +// one repo prefix, replacing any prior value in place. Slices are +// deep-copied on the way in so a later mutation of the caller's slice +// can't corrupt the stored state. Empty input is a no-op. 
+func (g *Graph) BulkSetCloneShingles(repoPrefix string, rows map[string][]uint64) error { + if len(rows) == 0 { + return nil + } + g.cloneShinglesMu.Lock() + defer g.cloneShinglesMu.Unlock() + if g.cloneShingles == nil { + g.cloneShingles = make(map[string]cloneShingleEntry, len(rows)) + } + for id, sh := range rows { + cp := make([]uint64, len(sh)) + copy(cp, sh) + g.cloneShingles[id] = cloneShingleEntry{repoPrefix: repoPrefix, shingles: cp} + } + return nil +} + +// DeleteCloneShingles is the in-memory implementation of the +// CloneShingleWriter delete side. It drops the rows for the supplied +// node ids. Empty input is a no-op; missing ids are ignored. +func (g *Graph) DeleteCloneShingles(nodeIDs []string) error { + if len(nodeIDs) == 0 { + return nil + } + g.cloneShinglesMu.Lock() + defer g.cloneShinglesMu.Unlock() + for _, id := range nodeIDs { + if id == "" { + continue + } + delete(g.cloneShingles, id) + } + return nil +} + +// LoadCloneShingles is the in-memory implementation of +// CloneShingleReader. It returns a fresh map of the shingle sets owned +// by one repo prefix, deep-copying each slice so callers can't mutate +// the stored state. Always returns a non-nil (possibly empty) map and +// never an error. +func (g *Graph) LoadCloneShingles(repoPrefix string) (map[string][]uint64, error) { + g.cloneShinglesMu.Lock() + defer g.cloneShinglesMu.Unlock() + out := make(map[string][]uint64) + for id, entry := range g.cloneShingles { + if entry.repoPrefix != repoPrefix { + continue + } + cp := make([]uint64, len(entry.shingles)) + copy(cp, entry.shingles) + out[id] = cp + } + return out, nil +} + +// BulkSetChurn is the in-memory ChurnEnrichmentWriter. ChurnEnrichment +// is a flat value type, so a map store needs no deep copy. 
+func (g *Graph) BulkSetChurn(repoPrefix string, rows []ChurnEnrichment) error { + if len(rows) == 0 { + return nil + } + g.churnEnrichMu.Lock() + defer g.churnEnrichMu.Unlock() + if g.churnEnrich == nil { + g.churnEnrich = make(map[string]ChurnEnrichment, len(rows)) + } + for _, r := range rows { + r.RepoPrefix = repoPrefix + g.churnEnrich[r.NodeID] = r + } + return nil +} + +// DeleteChurn is the in-memory ChurnEnrichmentWriter delete side. +func (g *Graph) DeleteChurn(nodeIDs []string) error { + if len(nodeIDs) == 0 { + return nil + } + g.churnEnrichMu.Lock() + defer g.churnEnrichMu.Unlock() + for _, id := range nodeIDs { + if id != "" { + delete(g.churnEnrich, id) + } + } + return nil +} + +// ChurnRows is the in-memory ChurnEnrichmentReader. An empty repoPrefix +// returns all rows across repos. +func (g *Graph) ChurnRows(repoPrefix string) []ChurnEnrichment { + g.churnEnrichMu.Lock() + defer g.churnEnrichMu.Unlock() + out := make([]ChurnEnrichment, 0, len(g.churnEnrich)) + for _, r := range g.churnEnrich { + if repoPrefix != "" && r.RepoPrefix != repoPrefix { + continue + } + out = append(out, r) + } + return out +} + +// BulkSetCoverage is the in-memory CoverageEnrichmentWriter. +func (g *Graph) BulkSetCoverage(repoPrefix string, rows []CoverageEnrichment) error { + if len(rows) == 0 { + return nil + } + g.coverageEnrichMu.Lock() + defer g.coverageEnrichMu.Unlock() + if g.coverageEnrich == nil { + g.coverageEnrich = make(map[string]CoverageEnrichment, len(rows)) + } + for _, r := range rows { + r.RepoPrefix = repoPrefix + g.coverageEnrich[r.NodeID] = r + } + return nil +} + +// DeleteCoverage is the in-memory CoverageEnrichmentWriter delete side. 
+func (g *Graph) DeleteCoverage(nodeIDs []string) error { + if len(nodeIDs) == 0 { + return nil + } + g.coverageEnrichMu.Lock() + defer g.coverageEnrichMu.Unlock() + for _, id := range nodeIDs { + if id != "" { + delete(g.coverageEnrich, id) + } + } + return nil +} + +// CoverageRows reads coverage rows; empty repoPrefix returns all. +func (g *Graph) CoverageRows(repoPrefix string) []CoverageEnrichment { + g.coverageEnrichMu.Lock() + defer g.coverageEnrichMu.Unlock() + out := make([]CoverageEnrichment, 0, len(g.coverageEnrich)) + for _, r := range g.coverageEnrich { + if repoPrefix != "" && r.RepoPrefix != repoPrefix { + continue + } + out = append(out, r) + } + return out +} + +// BulkSetReleases is the in-memory ReleaseEnrichmentWriter. +func (g *Graph) BulkSetReleases(repoPrefix string, rows []ReleaseEnrichment) error { + if len(rows) == 0 { + return nil + } + g.releaseEnrichMu.Lock() + defer g.releaseEnrichMu.Unlock() + if g.releaseEnrich == nil { + g.releaseEnrich = make(map[string]ReleaseEnrichment, len(rows)) + } + for _, r := range rows { + r.RepoPrefix = repoPrefix + g.releaseEnrich[r.NodeID] = r + } + return nil +} + +// DeleteReleases is the in-memory ReleaseEnrichmentWriter delete side. +func (g *Graph) DeleteReleases(nodeIDs []string) error { + if len(nodeIDs) == 0 { + return nil + } + g.releaseEnrichMu.Lock() + defer g.releaseEnrichMu.Unlock() + for _, id := range nodeIDs { + if id != "" { + delete(g.releaseEnrich, id) + } + } + return nil +} + +// ReleaseRows reads release rows; empty repoPrefix returns all. +func (g *Graph) ReleaseRows(repoPrefix string) []ReleaseEnrichment { + g.releaseEnrichMu.Lock() + defer g.releaseEnrichMu.Unlock() + out := make([]ReleaseEnrichment, 0, len(g.releaseEnrich)) + for _, r := range g.releaseEnrich { + if repoPrefix != "" && r.RepoPrefix != repoPrefix { + continue + } + out = append(out, r) + } + return out +} + +// BulkSetBlame is the in-memory BlameEnrichmentWriter. 
+func (g *Graph) BulkSetBlame(repoPrefix string, rows []BlameEnrichment) error { + if len(rows) == 0 { + return nil + } + g.blameEnrichMu.Lock() + defer g.blameEnrichMu.Unlock() + if g.blameEnrich == nil { + g.blameEnrich = make(map[string]BlameEnrichment, len(rows)) + } + for _, r := range rows { + r.RepoPrefix = repoPrefix + g.blameEnrich[r.NodeID] = r + } + return nil +} + +// DeleteBlame is the in-memory BlameEnrichmentWriter delete side. +func (g *Graph) DeleteBlame(nodeIDs []string) error { + if len(nodeIDs) == 0 { + return nil + } + g.blameEnrichMu.Lock() + defer g.blameEnrichMu.Unlock() + for _, id := range nodeIDs { + if id != "" { + delete(g.blameEnrich, id) + } + } + return nil +} + +// BlameRows reads blame rows; empty repoPrefix returns all. +func (g *Graph) BlameRows(repoPrefix string) []BlameEnrichment { + g.blameEnrichMu.Lock() + defer g.blameEnrichMu.Unlock() + out := make([]BlameEnrichment, 0, len(g.blameEnrich)) + for _, r := range g.blameEnrich { + if repoPrefix != "" && r.RepoPrefix != repoPrefix { + continue + } + out = append(out, r) + } + return out +} + +// EdgesByKind yields every edge whose Kind matches. In-memory +// implementation iterates the materialised AllEdges() slice and +// filters; the algorithmic cost is identical to a hand-written +// "for _, e := range g.AllEdges() { if e.Kind == kind }" loop, which +// is what most call sites used before the predicate API existed. +// Disk backends override this with an index-backed scan. +func (g *Graph) EdgesByKind(kind EdgeKind) iter.Seq[*Edge] { + return func(yield func(*Edge) bool) { + for _, e := range g.AllEdges() { + if e == nil || e.Kind != kind { + continue + } + if !yield(e) { + return + } + } + } +} + +// EdgesByKinds is the in-memory reference implementation of +// EdgesByKindsScanner. 
Single pass over AllEdges with a small +// pre-built kind set — same algorithmic cost as the legacy `for _, e +// := range g.AllEdges() { if e.Kind == X || e.Kind == Y }` loop the +// edge-driven analyzers used before this capability existed. Disk +// backends override with a single `WHERE kind IN $kinds` query so the +// edge-driven analyzers stop firing one EdgesByKind per kind (or +// worse, scanning AllEdges and filtering Go-side). +// +// Empty kinds yields nothing — matches the disk contract. +func (g *Graph) EdgesByKinds(kinds []EdgeKind) iter.Seq[*Edge] { + if len(kinds) == 0 { + return func(yield func(*Edge) bool) {} + } + set := make(map[EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + set[k] = struct{}{} + } + if len(set) == 0 { + return func(yield func(*Edge) bool) {} + } + return func(yield func(*Edge) bool) { + for _, e := range g.AllEdges() { + if e == nil { + continue + } + if _, ok := set[e.Kind]; !ok { + continue + } + if !yield(e) { + return + } + } + } +} + +// NodesByKind yields every node whose Kind matches. Same semantics +// and same in-memory cost story as EdgesByKind. +func (g *Graph) NodesByKind(kind NodeKind) iter.Seq[*Node] { + return func(yield func(*Node) bool) { + for _, n := range g.AllNodes() { + if n == nil || n.Kind != kind { + continue + } + if !yield(n) { + return + } + } + } +} + +// GetNodesByIDs returns a map id→*Node for every input ID that +// exists in the store. The in-memory implementation loops the +// existing GetNode — algorithmic cost identical to a hand-written +// loop in the caller, no concurrency win here. The value of the +// batched API lives in the disk backends, where it collapses N +// per-id SQL/bolt queries into one. 
+func (g *Graph) GetNodesByIDs(ids []string) map[string]*Node { + if len(ids) == 0 { + return nil + } + out := make(map[string]*Node, len(ids)) + for _, id := range ids { + if id == "" { + continue + } + if _, ok := out[id]; ok { + continue + } + if n := g.GetNode(id); n != nil { + out[id] = n + } + } + return out +} + +// FindNodesByNames is the batched sibling of FindNodesByName. +func (g *Graph) FindNodesByNames(names []string) map[string][]*Node { + if len(names) == 0 { + return nil + } + out := make(map[string][]*Node, len(names)) + for _, name := range names { + if name == "" { + continue + } + if _, ok := out[name]; ok { + continue + } + matches := g.FindNodesByName(name) + if len(matches) > 0 { + out[name] = matches + } + } + return out +} + +// EdgesWithUnresolvedTarget yields every edge whose To is an unresolved +// target per IsUnresolvedTarget — the resolver's main pending-edge filter. +// In-memory iterates all edges and checks each; disk backends back +// it with a range scan on a to-keyed index. +func (g *Graph) EdgesWithUnresolvedTarget() iter.Seq[*Edge] { + return func(yield func(*Edge) bool) { + for _, e := range g.AllEdges() { + if e == nil { + continue + } + // IsUnresolvedTarget matches both the bare `unresolved::<name>` + // form and the multi-repo `<repo>::unresolved::<name>` + // form that the disk backend's bulk-load rewrite produces. A bare + // HasPrefix check silently skipped every prefixed stub, so the + // Go resolver never got a second pass at multi-repo edges. + if !IsUnresolvedTarget(e.To) { + continue + } + if !yield(e) { + return + } + } + } +} + +// DeadCodeCandidates is the in-memory reference implementation of +// DeadCodeCandidator. Iterates the requested node kinds and filters +// out anything whose incoming-edge bucket contains an allowlist match +// — same algorithm the analysis.FindDeadCode loop runs, just exposed +// as a single capability the disk backend can short-circuit with +// one query per kind. 
Pure map / slice walks here; the win lives +// in the disk backend where the equivalent path materialises the full +// in-edge map. +func (g *Graph) DeadCodeCandidates(allowedNodeKinds []NodeKind, allowedInEdgeKinds map[NodeKind][]EdgeKind) []*Node { + if len(allowedNodeKinds) == 0 { + return nil + } + // Build a per-kind set so the inner loop can match against a map + // instead of re-scanning the allowlist slice for every edge. + allowedSet := make(map[NodeKind]map[EdgeKind]struct{}, len(allowedNodeKinds)) + for _, k := range allowedNodeKinds { + set := make(map[EdgeKind]struct{}, len(allowedInEdgeKinds[k])) + for _, ek := range allowedInEdgeKinds[k] { + set[ek] = struct{}{} + } + allowedSet[k] = set + } + + var out []*Node + for _, k := range allowedNodeKinds { + allowed, hasAllow := allowedSet[k] + anyKindCounts := !hasAllow || len(allowed) == 0 + for n := range g.NodesByKind(k) { + if n == nil { + continue + } + incoming := g.GetInEdges(n.ID) + dead := true + for _, e := range incoming { + if e == nil { + continue + } + if anyKindCounts { + dead = false + break + } + if _, ok := allowed[e.Kind]; ok { + dead = false + break + } + } + if dead { + out = append(out, n) + } + } + } + return out +} + +// IfaceImplementsRows is the in-memory reference implementation of +// IfaceImplementsScanner. Joins KindInterface nodes carrying +// Meta["methods"] with their EdgeImplements predecessors and returns +// one row per (typeID, ifaceID, ifaceMeta) tuple. +func (g *Graph) IfaceImplementsRows() []IfaceImplementsRow { + // Index interfaces with methods by ID so the edge walk is O(edges) + // rather than O(edges × interfaces). 
+ ifaceMeta := make(map[string]map[string]any) + for n := range g.NodesByKind(KindInterface) { + if n == nil || n.Meta == nil { + continue + } + if _, ok := n.Meta["methods"]; !ok { + continue + } + ifaceMeta[n.ID] = n.Meta + } + if len(ifaceMeta) == 0 { + return nil + } + var out []IfaceImplementsRow + for e := range g.EdgesByKind(EdgeImplements) { + if e == nil { + continue + } + meta, ok := ifaceMeta[e.To] + if !ok { + continue + } + out = append(out, IfaceImplementsRow{ + TypeID: e.From, + IfaceID: e.To, + IfaceMeta: meta, + }) + } + return out +} + +// NodeDegreeCounts is the in-memory reference implementation of +// NodeDegreeAggregator. Walks the per-node in/out edge buckets the +// in-memory backend already maintains — same cost as the per-node +// loop GraphConnectivity ran before this capability landed, just +// folded into one method call so the analyzer can pick the disk +// backend's bulk implementation transparently. Missing ids are +// elided from the result (matching the disk contract). +func (g *Graph) NodeDegreeCounts(ids []string, usageKinds []EdgeKind) []NodeDegreeRow { + if len(ids) == 0 { + return nil + } + usage := make(map[EdgeKind]struct{}, len(usageKinds)) + for _, k := range usageKinds { + usage[k] = struct{}{} + } + seen := make(map[string]struct{}, len(ids)) + out := make([]NodeDegreeRow, 0, len(ids)) + for _, id := range ids { + if id == "" { + continue + } + if _, dup := seen[id]; dup { + continue + } + seen[id] = struct{}{} + // Skip unknown ids — the disk backend's WHERE n.id IN $ids + // clause naturally drops them; mirror that here so both + // backends return the same row count. 
+ if g.GetNode(id) == nil { + continue + } + in := g.GetInEdges(id) + row := NodeDegreeRow{ + NodeID: id, + InCount: len(in), + OutCount: len(g.GetOutEdges(id)), + } + if len(usage) > 0 { + for _, e := range in { + if e == nil { + continue + } + if _, ok := usage[e.Kind]; ok { + row.UsageInCount++ + } + } + } + out = append(out, row) + } + return out +} + +// FileImporters is the in-memory reference implementation of the +// FileImporters capability. Iterates EdgeImports edges via +// EdgesByKind — same cost as the legacy AllEdges()+filter loop in +// handleCheckReferences, but exposes the predicate as a single call +// the disk backend can short-circuit with one query. +// +// Matches edges whose To node satisfies filePath == n.FilePath OR +// filePath == n.ID. The dual match keeps parity with the indexer's +// two import shapes: file-targeted imports point at the file node +// (n.ID == filePath), while symbol-targeted imports land on a symbol +// whose FilePath equals filePath. +func (g *Graph) FileImporters(filePath string) []FileImporterRow { + if filePath == "" { + return nil + } + var out []FileImporterRow + for e := range g.EdgesByKind(EdgeImports) { + if e == nil { + continue + } + to := g.GetNode(e.To) + if to == nil { + continue + } + if to.FilePath != filePath && to.ID != filePath { + continue + } + from := g.GetNode(e.From) + if from == nil { + continue + } + out = append(out, FileImporterRow{ + FromFile: from.FilePath, + FromID: from.ID, + FromName: from.Name, + FromKind: from.Kind, + }) + } + return out +} + +// NodeFanCounts is the in-memory reference implementation of +// NodeFanAggregator. Two passes over the per-node in/out edge buckets +// the in-memory backend already maintains, filtered by the caller's +// kind sets. The disk backend overrides with one query per direction +// to drop the AllEdges() materialisation FindHotspots / health_score +// were running every call. 
+func (g *Graph) NodeFanCounts(ids []string, fanInKinds []EdgeKind, fanOutKinds []EdgeKind) []NodeFanRow { + if len(ids) == 0 { + return nil + } + inSet := make(map[EdgeKind]struct{}, len(fanInKinds)) + for _, k := range fanInKinds { + inSet[k] = struct{}{} + } + outSet := make(map[EdgeKind]struct{}, len(fanOutKinds)) + for _, k := range fanOutKinds { + outSet[k] = struct{}{} + } + seen := make(map[string]struct{}, len(ids)) + out := make([]NodeFanRow, 0, len(ids)) + for _, id := range ids { + if id == "" { + continue + } + if _, dup := seen[id]; dup { + continue + } + seen[id] = struct{}{} + if g.GetNode(id) == nil { + continue + } + row := NodeFanRow{NodeID: id} + if len(inSet) > 0 { + for _, e := range g.GetInEdges(id) { + if e == nil { + continue + } + if _, ok := inSet[e.Kind]; ok { + row.FanIn++ + } + } + } + if len(outSet) > 0 { + for _, e := range g.GetOutEdges(id) { + if e == nil { + continue + } + if _, ok := outSet[e.Kind]; ok { + row.FanOut++ + } + } + } + out = append(out, row) + } + return out +} + +// InEdgeCountsByKind is the in-memory reference implementation of +// the InEdgeCounter capability. Walks each requested EdgeKind via +// the byKind bucket and increments a per-To counter. Same algorithm +// the AllEdges-bucketing fallback in handleGetUntestedSymbols runs; +// the win lives in the disk backend where AllEdges() materialises every +// edge just to bucket by target. +// +// Dedupes the kind set up front so a sloppy caller passing the same +// kind twice doesn't double-count — matches the disk backend's +// IN-list dedup. 
+func (g *Graph) InEdgeCountsByKind(kinds []EdgeKind) map[string]int { + if len(kinds) == 0 { + return nil + } + seen := make(map[EdgeKind]struct{}, len(kinds)) + out := make(map[string]int) + for _, k := range kinds { + if _, ok := seen[k]; ok { + continue + } + seen[k] = struct{}{} + for e := range g.EdgesByKind(k) { + if e == nil { + continue + } + out[e.To]++ + } + } + return out +} + +// NodesInFilesByKind is the in-memory reference implementation of +// the NodesInFilesByKindFinder capability. Filters NodesByKind for +// each requested kind down to the file set. Same algorithm as the +// Go-side loop in find_declaration's buildDeclFileIndex; the win +// lives in disk backends where AllNodes() over cgo dwarfs the few +// hundred surviving rows. +func (g *Graph) NodesInFilesByKind(files []string, kinds []NodeKind) []*Node { + if len(files) == 0 || len(kinds) == 0 { + return nil + } + wanted := make(map[string]struct{}, len(files)) + for _, f := range files { + if f == "" { + continue + } + wanted[f] = struct{}{} + } + if len(wanted) == 0 { + return nil + } + // Dedup the kinds so a sloppy caller doesn't double-scan. + seenKind := make(map[NodeKind]struct{}, len(kinds)) + var out []*Node + for _, k := range kinds { + if _, ok := seenKind[k]; ok { + continue + } + seenKind[k] = struct{}{} + for n := range g.NodesByKind(k) { + if n == nil { + continue + } + if _, ok := wanted[n.FilePath]; !ok { + continue + } + out = append(out, n) + } + } + return out +} + +// NodesByKinds is the in-memory reference implementation of the +// NodesByKindsScanner capability. Loops the existing NodesByKind +// iterator per requested kind — algorithmic cost identical to the +// hand-written `for _, n := range AllNodes() if n.Kind == K` pattern +// the metadata analyzers used before. The win lives in the disk +// backend, where one IN-list query replaces the AllNodes() pull. 
+// +// Dedupes the kind set up front so a sloppy caller passing the same +// kind twice doesn't double-yield — matches the disk backend's +// IN-list dedup. Empty kinds returns nil without touching the store. +func (g *Graph) NodesByKinds(kinds []NodeKind) []*Node { + if len(kinds) == 0 { + return nil + } + seen := make(map[NodeKind]struct{}, len(kinds)) + var out []*Node + for _, k := range kinds { + if _, ok := seen[k]; ok { + continue + } + seen[k] = struct{}{} + for n := range g.NodesByKind(k) { + if n == nil { + continue + } + out = append(out, n) + } + } + return out +} + +// EdgeAdjacencyForKinds is the in-memory reference implementation of +// the EdgeAdjacencyForKinds capability. One AllEdges scan that yields +// (from, to) pairs whose Kind is in the supplied edge-kind set AND +// whose endpoints both have a Kind in the node-kind set — identical +// shape to the join the disk backend folds into a single +// query. +// +// Empty edgeKinds or empty nodeKinds yields nothing — matches the +// disk contract. 
+func (g *Graph) EdgeAdjacencyForKinds(edgeKinds []EdgeKind, nodeKinds []NodeKind) iter.Seq[[2]string] { + if len(edgeKinds) == 0 || len(nodeKinds) == 0 { + return func(yield func([2]string) bool) {} + } + eset := make(map[EdgeKind]struct{}, len(edgeKinds)) + for _, k := range edgeKinds { + if k == "" { + continue + } + eset[k] = struct{}{} + } + nset := make(map[NodeKind]struct{}, len(nodeKinds)) + for _, k := range nodeKinds { + if k == "" { + continue + } + nset[k] = struct{}{} + } + if len(eset) == 0 || len(nset) == 0 { + return func(yield func([2]string) bool) {} + } + return func(yield func([2]string) bool) { + for _, e := range g.AllEdges() { + if e == nil { + continue + } + if _, ok := eset[e.Kind]; !ok { + continue + } + from := g.GetNode(e.From) + to := g.GetNode(e.To) + if from == nil || to == nil { + continue + } + if _, ok := nset[from.Kind]; !ok { + continue + } + if _, ok := nset[to.Kind]; !ok { + continue + } + if !yield([2]string{e.From, e.To}) { + return + } + } + } +} + +// CommunityCrossingsByKind is the in-memory reference implementation +// of the CommunityCrossingsByKind capability. AllEdges scan with the +// kind-set filter, then a Go-side community comparison per edge — +// the exact loop FindHotspots.countCrossings ran before this +// capability existed. +// +// Empty kinds or empty nodeToComm returns nil. Zero-count sources +// never surface (matches the disk contract — callers probe by +// existence). 
+func (g *Graph) CommunityCrossingsByKind(kinds []EdgeKind, nodeToComm map[string]string) map[string]int { + if len(kinds) == 0 || len(nodeToComm) == 0 { + return nil + } + set := make(map[EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + set[k] = struct{}{} + } + if len(set) == 0 { + return nil + } + out := make(map[string]int) + for _, e := range g.AllEdges() { + if e == nil { + continue + } + if _, ok := set[e.Kind]; !ok { + continue + } + from := nodeToComm[e.From] + to := nodeToComm[e.To] + if from == "" || to == "" || from == to { + continue + } + out[e.From]++ + } + if len(out) == 0 { + return nil + } + return out +} + +// NodeIDsByKinds is the in-memory reference implementation of the +// NodeIDsByKinds capability. Single AllNodes pass with a kind-set +// filter, deduped on input — same algorithm as NodesByKinds but +// returns only the ID column. The disk-backend win is the projection +// drop, not the algorithmic shape. +func (g *Graph) NodeIDsByKinds(kinds []NodeKind) []string { + if len(kinds) == 0 { + return nil + } + seen := make(map[NodeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + seen[k] = struct{}{} + } + if len(seen) == 0 { + return nil + } + var out []string + for _, n := range g.AllNodes() { + if n == nil { + continue + } + if _, ok := seen[n.Kind]; !ok { + continue + } + out = append(out, n.ID) + } + return out +} + +// EdgeKindCounts is the in-memory reference implementation of the +// EdgeKindCounter capability. One AllEdges scan with a per-kind +// tally — the exact loop the get_surprising_connections Go fallback +// already runs today, just exposed as a single method call so the +// the disk backend can short-circuit with a server-side GROUP BY. +// +// Empty graph returns nil so callers can short-circuit a downstream +// "kindCounts != nil" gate. 
+func (g *Graph) EdgeKindCounts() map[EdgeKind]int { + out := map[EdgeKind]int{} + for _, e := range g.AllEdges() { + if e == nil { + continue + } + out[e.Kind]++ + } + if len(out) == 0 { + return nil + } + return out +} + +// CrossRepoEdgeCounts is the in-memory reference implementation of +// CrossRepoEdgeAggregator. Iterates the four cross_repo_* byKind +// buckets and groups by (kind, fromRepoPrefix, toRepoPrefix). Same +// algorithm as the architecture handler's AllEdges loop but exposes +// it as a single capability so the disk backend can fold the join into +// one query. +// +// Returns nil when the graph carries no cross-repo edges (single- +// repo mode) so the caller's empty-list rendering kicks in without +// allocating. +func (g *Graph) CrossRepoEdgeCounts() []CrossRepoEdgeRow { + type key struct { + kind EdgeKind + fromRepo string + toRepo string + } + counts := map[key]int{} + for _, k := range []EdgeKind{ + EdgeCrossRepoCalls, + EdgeCrossRepoImplements, + EdgeCrossRepoExtends, + } { + for e := range g.EdgesByKind(k) { + if e == nil { + continue + } + from := g.GetNode(e.From) + to := g.GetNode(e.To) + if from == nil || to == nil { + continue + } + counts[key{kind: e.Kind, fromRepo: from.RepoPrefix, toRepo: to.RepoPrefix}]++ + } + } + if len(counts) == 0 { + return nil + } + out := make([]CrossRepoEdgeRow, 0, len(counts)) + for k, c := range counts { + out = append(out, CrossRepoEdgeRow{ + Kind: k.kind, FromRepo: k.fromRepo, ToRepo: k.toRepo, Count: c, + }) + } + return out +} + +// FileImportCounts is the in-memory reference implementation of +// FileImportAggregator. Iterates the EdgeImports byKind bucket and +// groups by the target file path — coalescing to To-node FilePath +// or, when the indexer pointed the import edge at the file node +// directly, the target ID. 
Same algorithm as the AllEdges loop in +// mostImportedFiles; the win lives in disk backends where AllEdges +// + per-edge GetNode round-trips over cgo dwarf the few hundred +// surviving rows. +// +// scope, when non-nil, bounds the result to edges whose target ID +// lies in the slice (session-workspace clamp). A nil scope counts +// every imports edge. An empty (non-nil) scope returns nil — never +// a whole-graph scan. +func (g *Graph) FileImportCounts(scope []string) []FileImportCountRow { + if scope != nil && len(scope) == 0 { + return nil + } + var allowed map[string]struct{} + if scope != nil { + allowed = make(map[string]struct{}, len(scope)) + for _, id := range scope { + if id == "" { + continue + } + allowed[id] = struct{}{} + } + if len(allowed) == 0 { + return nil + } + } + counts := map[string]int{} + for e := range g.EdgesByKind(EdgeImports) { + if e == nil { + continue + } + target := g.GetNode(e.To) + if target == nil { + continue + } + if allowed != nil { + if _, ok := allowed[target.ID]; !ok { + continue + } + } + path := target.FilePath + if path == "" { + path = target.ID + } + if path == "" { + continue + } + counts[path]++ + } + if len(counts) == 0 { + return nil + } + out := make([]FileImportCountRow, 0, len(counts)) + for p, c := range counts { + out = append(out, FileImportCountRow{FilePath: p, Count: c}) + } + return out +} + +// SetEdgeProvenanceBatch is the batched sibling of SetEdgeProvenance. +// Same story as ReindexEdges: per-call in memory, one transaction in +// the disk backends. Returns the number of edges whose Origin +// actually changed (matches the sum of per-edge SetEdgeProvenance +// boolean returns). +func (g *Graph) SetEdgeProvenanceBatch(batch []EdgeProvenanceUpdate) int { + changed := 0 + for _, u := range batch { + if u.Edge == nil { + continue + } + if g.SetEdgeProvenance(u.Edge, u.NewOrigin) { + changed++ + } + } + return changed +} + // shardIdx picks the shard index for an ID using FNV-1a. 
Inlined to // avoid the per-call hash-object allocation that the stdlib's // fnv.New32a() incurs — shardIdx is on the hottest path in the graph @@ -708,7 +1791,7 @@ func (g *Graph) AddBatch(nodes []*Node, edges []*Edge) { inEdgesByShard[shardIdx(e.To)] = append(inEdgesByShard[shardIdx(e.To)], e) } - for i := 0; i < numShards; i++ { + for i := range numShards { if len(nodesByShard[i]) == 0 && len(outEdgesByShard[i]) == 0 && len(inEdgesByShard[i]) == 0 { continue } @@ -965,6 +2048,33 @@ func (g *Graph) GetNodeByQualName(qualName string) *Node { return nil } +// GetNodesByQualNames is the batch form of GetNodeByQualName — returns +// only the qual_names that have a node (an absent key means "no node"). +// The in-memory byQual index makes each lookup O(1); the method exists +// for Store-interface parity with the disk backend, where it collapses +// N per-edge qual_name scans into a single IN-scan. +func (g *Graph) GetNodesByQualNames(qualNames []string) map[string]*Node { + out := make(map[string]*Node, len(qualNames)) + for _, q := range qualNames { + if q == "" { + continue + } + if _, done := out[q]; done { + continue + } + for _, s := range g.shards { + s.mu.RLock() + n, ok := s.byQual[q] + s.mu.RUnlock() + if ok { + out[q] = n + break + } + } + } + return out +} + // FindNodesByName returns all nodes matching the short name. // // Implementation walks every shard's byName bucket. The two-pass shape @@ -1036,6 +2146,40 @@ func (g *Graph) FindNodesByNameInRepo(name, repoPrefix string) []*Node { return out } +// FindNodesByNameContaining returns nodes whose Name (case-insensitive) +// contains substr. The in-memory backend has no name-substring index, +// so this is a single pass over the byName buckets (which already group +// nodes by exact name — the same allocation we'd pay for one FindNodesByName +// call per distinct name). limit caps the slice; 0 means "no limit". 
+// +// Stable order is the caller's responsibility — bucket iteration is +// deterministic per shard but cross-shard order isn't fixed. +func (g *Graph) FindNodesByNameContaining(substr string, limit int) []*Node { + if substr == "" { + return nil + } + needle := strings.ToLower(substr) + var out []*Node + for _, s := range g.shards { + s.mu.RLock() + for name, bucket := range s.byName { + if !strings.Contains(strings.ToLower(name), needle) { + continue + } + out = append(out, bucket...) + if limit > 0 && len(out) >= limit { + s.mu.RUnlock() + return out[:limit] + } + } + s.mu.RUnlock() + } + if limit > 0 && len(out) > limit { + out = out[:limit] + } + return out +} + // GetFileNodes returns all nodes defined in the given file. func (g *Graph) GetFileNodes(filePath string) []*Node { var out []*Node @@ -1071,6 +2215,48 @@ func (g *Graph) GetInEdges(nodeID string) []*Edge { return out } +// GetOutEdgesByNodeIDs returns a map id→outgoing edges for every input +// id. The in-memory backend loops the existing GetOutEdges — cost +// matches a hand-written loop in the caller. The value of the batched +// API lives in the disk backend, where it collapses N point lookups into +// one bulk query. Empty input returns nil; duplicate ids are +// deduped naturally. Missing ids are absent from the returned map. +func (g *Graph) GetOutEdgesByNodeIDs(ids []string) map[string][]*Edge { + if len(ids) == 0 { + return nil + } + out := make(map[string][]*Edge, len(ids)) + for _, id := range ids { + if id == "" { + continue + } + if _, ok := out[id]; ok { + continue + } + out[id] = g.GetOutEdges(id) + } + return out +} + +// GetInEdgesByNodeIDs is the inbound sibling of GetOutEdgesByNodeIDs. +// See that doc-comment for the contract. 
+func (g *Graph) GetInEdgesByNodeIDs(ids []string) map[string][]*Edge { + if len(ids) == 0 { + return nil + } + out := make(map[string][]*Edge, len(ids)) + for _, id := range ids { + if id == "" { + continue + } + if _, ok := out[id]; ok { + continue + } + out[id] = g.GetInEdges(id) + } + return out +} + // EvictFile removes all nodes and edges belonging to the given file // path. Nodes for one file can span many shards (different IDs hash // differently), so we lock all shards for this multi-shard operation. @@ -1316,6 +2502,81 @@ func (g *Graph) AllEdges() []*Edge { return out } +// DrainNodes yields every node and FREES the graph's internal node +// storage shard-by-shard as it goes. After Drain finishes the graph +// holds zero nodes. Intended for the one-shot persist path where the +// shadow is about to be discarded: AllNodes would pin the full 11 GB +// graph for the entire persist phase; Drain releases each shard's +// node map (and the per-name / per-file / per-repo indexes) as soon +// as that shard's iteration completes, so GC can reclaim ~700 MB at +// a time on a Linux-scale graph instead of waiting for the indexer's +// defer to return. +// +// The graph remains structurally consistent during Drain — edges and +// other indexes are untouched, only the node maps are emptied. If +// you also need DrainEdges, call them in either order; both are +// destructive and idempotent (a second call yields nothing). +func (g *Graph) DrainNodes() iter.Seq[*Node] { + return func(yield func(*Node) bool) { + for _, s := range g.shards { + s.mu.Lock() + nodes := s.nodes + // Replace with an empty map so the shard's read methods + // keep working (return zero) instead of nil-panicking. 
+ s.nodes = map[string]*Node{} + s.byFile = map[string][]*Node{} + s.byName = map[string][]*Node{} + s.byQual = map[string]*Node{} + s.byRepo = map[string][]*Node{} + s.byFileIdx = map[string]map[string]int{} + s.byNameIdx = map[string]map[string]int{} + s.byRepoIdx = map[string]map[string]int{} + s.mu.Unlock() + for _, n := range nodes { + if !yield(n) { + return + } + } + // nodes goes out of scope here — the shard's old map plus + // every *Node it referenced is now GC-eligible (assuming + // the caller has dropped any remaining reference). + } + } +} + +// DrainEdges yields every edge and FREES the graph's internal edge +// storage shard-by-shard. Same semantics as DrainNodes — meant for +// the persist hand-off, not for general queries. +func (g *Graph) DrainEdges() iter.Seq[*Edge] { + // Invalidate the AllEdges cache so any subsequent caller doesn't + // see drained-shard zombies. The cache holds direct *Edge slice + // references that DrainEdges is about to start freeing. + g.allEdgesCacheMu.Lock() + g.allEdgesCache = nil + g.allEdgesCacheGen = 0 + g.allEdgesCacheMu.Unlock() + return func(yield func(*Edge) bool) { + for _, s := range g.shards { + s.mu.Lock() + outEdges := s.outEdges + s.outEdges = map[string][]*Edge{} + s.inEdges = map[string][]*Edge{} + s.outEdgeIdx = map[string]map[edgeHash]int{} + s.inEdgeIdx = map[string]map[edgeHash]int{} + s.outEdgeKeys = map[string][]edgeHash{} + s.inEdgeKeys = map[string][]edgeHash{} + s.mu.Unlock() + for _, edges := range outEdges { + for _, e := range edges { + if !yield(e) { + return + } + } + } + } + } +} + // Stats returns summary counts by kind and language. func (g *Graph) Stats() GraphStats { g.lockAllRead() @@ -1364,6 +2625,32 @@ func (g *Graph) GetRepoNodes(repoPrefix string) []*Node { return out } +// GetRepoEdges returns every edge whose source node has the given +// RepoPrefix — the in-memory reference implementation of the +// Store-interface method. 
Walks each shard's byRepo bucket and +// concatenates that node's outEdges in place (no per-node +// GetOutEdges call, so no per-call slice copy). Equivalent in +// observable behaviour to the GetRepoNodes(r) × GetOutEdges loop +// callers used before this method existed; meant to give disk +// backends a single-query hook without changing in-memory cost. +// Empty repoPrefix returns nil (callers use AllEdges() instead). +func (g *Graph) GetRepoEdges(repoPrefix string) []*Edge { + if repoPrefix == "" { + return nil + } + var out []*Edge + for _, s := range g.shards { + s.mu.RLock() + for _, n := range s.byRepo[repoPrefix] { + if src := s.outEdges[n.ID]; len(src) > 0 { + out = append(out, src...) + } + } + s.mu.RUnlock() + } + return out +} + // EvictRepo removes all nodes with matching RepoPrefix and all edges // referencing those nodes. Returns counts of removed nodes and edges. func (g *Graph) EvictRepo(repoPrefix string) (nodesRemoved, edgesRemoved int) { @@ -1604,3 +2891,667 @@ func (g *Graph) RepoPrefixes() []string { } return prefixes } + +// InDegreeForNodes is the in-memory reference implementation of the +// InDegreeForNodes capability. Walks the per-target in-edge buckets +// directly — the same arithmetic the disk backend pushes into a single +// server-side COUNT. +func (g *Graph) InDegreeForNodes(ids []string) map[string]int { + if len(ids) == 0 { + return nil + } + out := make(map[string]int, len(ids)) + for _, id := range ids { + if id == "" { + continue + } + c := len(g.GetInEdges(id)) + if c == 0 { + continue + } + out[id] = c + } + return out +} + +// ReachableForwardByKinds is the in-memory reference implementation +// of the ReachableForwardByKinds capability. Layer-by-layer BFS from +// the seed frontier, following only edges whose Kind is in the +// supplied set. Pure map / slice walks here — the win is the disk +// backend folds the BFS into one variable-length match. 
+func (g *Graph) ReachableForwardByKinds(seeds []string, kinds []EdgeKind) map[string]bool { + if len(seeds) == 0 { + return nil + } + covered := make(map[string]bool, len(seeds)) + frontier := make([]string, 0, len(seeds)) + for _, id := range seeds { + if id == "" || covered[id] { + continue + } + covered[id] = true + frontier = append(frontier, id) + } + if len(kinds) == 0 { + return covered + } + allowed := make(map[EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + allowed[k] = struct{}{} + } + for len(frontier) > 0 { + next := frontier[:0:0] + for _, id := range frontier { + for _, e := range g.GetOutEdges(id) { + if e == nil { + continue + } + if _, ok := allowed[e.Kind]; !ok { + continue + } + if !covered[e.To] { + covered[e.To] = true + next = append(next, e.To) + } + } + } + frontier = next + } + return covered +} + +// ThrowerErrorSurface is the in-memory reference implementation of +// the ThrowerErrorSurfacer capability. Walks EdgeThrows once for the +// per-thrower target dedup, then walks each thrower's out-edges for +// the EdgeEmits → KindString(context=error_msg) attachment. The disk +// backend collapses both passes into two server-side GROUP BYs. 
+func (g *Graph) ThrowerErrorSurface(pathPrefix string) []ThrowerErrorRow { + byThrower := map[string]*ThrowerErrorRow{} + addUnique := func(set []string, v string) []string { + if slices.Contains(set, v) { + return set + } + return append(set, v) + } + for e := range g.EdgesByKind(EdgeThrows) { + if e == nil { + continue + } + if pathPrefix != "" && !strings.HasPrefix(e.FilePath, pathPrefix) { + continue + } + row, ok := byThrower[e.From] + if !ok { + file := e.FilePath + line := e.Line + n := g.GetNode(e.From) + if n != nil { + if file == "" { + file = n.FilePath + } + if line == 0 { + line = n.StartLine + } + } + row = &ThrowerErrorRow{ThrowerID: e.From, FilePath: file, Line: line} + byThrower[e.From] = row + } + row.Throws++ + row.ErrorTargets = addUnique(row.ErrorTargets, e.To) + } + for thrower, row := range byThrower { + for _, e := range g.GetOutEdges(thrower) { + if e == nil || e.Kind != EdgeEmits { + continue + } + n := g.GetNode(e.To) + if n == nil || n.Kind != KindString { + continue + } + ctxLabel, _ := n.Meta["context"].(string) + if ctxLabel != "error_msg" { + continue + } + row.ErrorMsgs = addUnique(row.ErrorMsgs, n.Name) + } + } + out := make([]ThrowerErrorRow, 0, len(byThrower)) + for _, r := range byThrower { + out = append(out, *r) + } + return out +} + +// MemberMethodsByType is the in-memory reference implementation of the +// MemberMethodsByType capability. One EdgesByKind(EdgeMemberOf) walk +// joined with the in-memory node table to filter Kind == KindMethod +// and project the four columns the resolver consumes — the exact +// loop the resolver runs today, just exposed as a single method call +// so the disk backend can fold the join into one query. +// +// Empty graph returns nil. Per-type method lists are deduplicated by +// MethodID so a method that appears twice in the EdgeMemberOf bucket +// (defensive against double-insertion) yields a single row. 
+func (g *Graph) MemberMethodsByType() map[string][]MemberMethodInfo { + out := map[string][]MemberMethodInfo{} + seen := map[string]map[string]struct{}{} + for e := range g.EdgesByKind(EdgeMemberOf) { + if e == nil { + continue + } + m := g.GetNode(e.From) + if m == nil || m.Kind != KindMethod { + continue + } + typeID := e.To + dedup := seen[typeID] + if dedup == nil { + dedup = make(map[string]struct{}) + seen[typeID] = dedup + } + if _, ok := dedup[m.ID]; ok { + continue + } + dedup[m.ID] = struct{}{} + out[typeID] = append(out[typeID], MemberMethodInfo{ + MethodID: m.ID, + Name: m.Name, + FilePath: m.FilePath, + StartLine: m.StartLine, + RepoPrefix: m.RepoPrefix, + }) + } + if len(out) == 0 { + return nil + } + return out +} + +// StructuralParentEdges is the in-memory reference implementation of +// the StructuralParentEdges capability. Single AllEdges scan with the +// (Extends | Implements | Composes) kind gate and the +// (Type | Interface) endpoint-kind gate applied per edge. +// +// Empty graph or no matching edges returns nil. +func (g *Graph) StructuralParentEdges() []StructuralParentEdgeRow { + var out []StructuralParentEdgeRow + for _, e := range g.AllEdges() { + if e == nil { + continue + } + switch e.Kind { + case EdgeExtends, EdgeImplements, EdgeComposes: + default: + continue + } + from := g.GetNode(e.From) + to := g.GetNode(e.To) + if from == nil || to == nil { + continue + } + if from.Kind != KindType && from.Kind != KindInterface { + continue + } + if to.Kind != KindType && to.Kind != KindInterface { + continue + } + out = append(out, StructuralParentEdgeRow{ + FromID: from.ID, + ToID: to.ID, + FromKind: from.Kind, + ToKind: to.Kind, + Origin: e.Origin, + }) + } + return out +} + +// CrossRepoCandidates is the in-memory reference implementation of the +// CrossRepoCandidates capability. Single AllEdges scan with the +// edge-kind gate + the (non-empty, distinct) repo-prefix gate. 
Returns +// one row per surviving edge carrying the underlying Edge pointer plus +// the two RepoPrefix values projected from the endpoints. +// +// Empty baseKinds returns nil — matches the disk-backend contract. +// Single-repo graphs (or graphs whose nodes carry no RepoPrefix) +// return no rows because the prefix gate filters them out. +func (g *Graph) CrossRepoCandidates(baseKinds []EdgeKind) []CrossRepoCandidateRow { + if len(baseKinds) == 0 { + return nil + } + kset := make(map[EdgeKind]struct{}, len(baseKinds)) + for _, k := range baseKinds { + if k == "" { + continue + } + kset[k] = struct{}{} + } + if len(kset) == 0 { + return nil + } + var out []CrossRepoCandidateRow + for _, e := range g.AllEdges() { + if e == nil { + continue + } + if _, ok := kset[e.Kind]; !ok { + continue + } + from := g.GetNode(e.From) + to := g.GetNode(e.To) + if from == nil || to == nil { + continue + } + if from.RepoPrefix == "" || to.RepoPrefix == "" { + continue + } + if from.RepoPrefix == to.RepoPrefix { + continue + } + out = append(out, CrossRepoCandidateRow{ + Edge: e, + FromRepo: from.RepoPrefix, + ToRepo: to.RepoPrefix, + }) + } + return out +} + +// ExtractCandidates is the in-memory reference implementation of +// ExtractCandidatesScanner. Walks NodesByKind for function + method, +// applies the threshold gates locally, and counts distinct in-edge +// From / out-edge To values restricted to the requested edge kinds. 
+func (g *Graph) ExtractCandidates( + kinds []EdgeKind, + minLines, minCallers, minFanOut int, + pathPrefix string, +) []ExtractCandidateRow { + if len(kinds) == 0 { + return nil + } + kset := make(map[EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + kset[k] = struct{}{} + } + if len(kset) == 0 { + return nil + } + var out []ExtractCandidateRow + for _, n := range g.NodesByKinds([]NodeKind{KindFunction, KindMethod}) { + if n == nil { + continue + } + if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { + continue + } + if n.StartLine == 0 || n.EndLine == 0 { + continue + } + lineCount := n.EndLine - n.StartLine + 1 + if lineCount < minLines { + continue + } + callerSet := make(map[string]struct{}) + for _, e := range g.GetInEdges(n.ID) { + if e == nil { + continue + } + if _, ok := kset[e.Kind]; !ok { + continue + } + callerSet[e.From] = struct{}{} + } + if len(callerSet) < minCallers { + continue + } + calleeSet := make(map[string]struct{}) + for _, e := range g.GetOutEdges(n.ID) { + if e == nil { + continue + } + if _, ok := kset[e.Kind]; !ok { + continue + } + calleeSet[e.To] = struct{}{} + } + if len(calleeSet) < minFanOut { + continue + } + out = append(out, ExtractCandidateRow{ + NodeID: n.ID, + Name: n.Name, + FilePath: n.FilePath, + StartLine: n.StartLine, + EndLine: n.EndLine, + LineCount: lineCount, + CallerCount: len(callerSet), + FanOut: len(calleeSet), + }) + } + return out +} + +// FileSymbolNamesByPaths is the in-memory reference implementation of +// the FileSymbolNamesByPaths capability. Walks GetFileNodes for every +// input path, keeps the requested kinds, and emits one row per +// (path, name) pair. Duplicates within a file collapse to a single +// row (a method declared once per file emits once regardless of how +// many times the indexer touched it). 
+func (g *Graph) FileSymbolNamesByPaths(paths []string, kinds []NodeKind) []FileSymbolNameRow { + if len(paths) == 0 { + return nil + } + kset := make(map[NodeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + kset[k] = struct{}{} + } + seen := make(map[string]struct{}) + dedupKey := func(p, name string) string { return p + "\x00" + name } + var out []FileSymbolNameRow + for _, p := range paths { + if p == "" { + continue + } + for _, n := range g.GetFileNodes(p) { + if n == nil || n.Name == "" { + continue + } + if len(kset) > 0 { + if _, ok := kset[n.Kind]; !ok { + continue + } + } + k := dedupKey(p, n.Name) + if _, ok := seen[k]; ok { + continue + } + seen[k] = struct{}{} + out = append(out, FileSymbolNameRow{FilePath: p, Name: n.Name}) + } + } + return out +} + +// ClassHierarchyTraverse is the in-memory reference implementation of +// ClassHierarchyTraverser. Performs the same BFS as +// query.ClassHierarchy, but stops at the kind/depth gates and returns +// the full Path + EdgeKinds for each terminal node reached so the +// disk backend's variable-length match can be a drop-in +// replacement. Direction "up" follows out-edges; "down" follows +// in-edges. 
+func (g *Graph) ClassHierarchyTraverse( + seedID string, + direction string, + kinds []EdgeKind, + depth int, +) []ClassHierarchyRow { + if seedID == "" || depth <= 0 || len(kinds) == 0 { + return nil + } + kset := make(map[EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + kset[k] = struct{}{} + } + if len(kset) == 0 { + return nil + } + if g.GetNode(seedID) == nil { + return nil + } + walkUp := direction == "up" + walkDown := direction == "down" + if !walkUp && !walkDown { + return nil + } + type queued struct { + id string + path []string + edgeKinds []EdgeKind + hops int + } + visited := map[string]struct{}{seedID: {}} + queue := []queued{{id: seedID, path: nil, edgeKinds: nil, hops: 0}} + var out []ClassHierarchyRow + for len(queue) > 0 { + cur := queue[0] + queue = queue[1:] + if cur.hops >= depth { + continue + } + var edges []*Edge + if walkUp { + edges = g.GetOutEdges(cur.id) + } else { + edges = g.GetInEdges(cur.id) + } + for _, e := range edges { + if e == nil { + continue + } + if _, ok := kset[e.Kind]; !ok { + continue + } + var nb string + if walkUp { + nb = e.To + } else { + nb = e.From + } + if nb == "" { + continue + } + if _, ok := visited[nb]; ok { + continue + } + visited[nb] = struct{}{} + newPath := append([]string(nil), cur.path...) + newPath = append(newPath, nb) + newKinds := append([]EdgeKind(nil), cur.edgeKinds...) + newKinds = append(newKinds, e.Kind) + out = append(out, ClassHierarchyRow{ + Path: newPath, + EdgeKinds: newKinds, + }) + queue = append(queue, queued{id: nb, path: newPath, edgeKinds: newKinds, hops: cur.hops + 1}) + } + } + return out +} + +// FileEditingContext is the in-memory reference implementation of the +// FileEditingContext capability. Performs the equivalent of +// GetFileSymbols + per-function GetCallers/GetCallChain but bounded +// to the call/method node set, so the disk backend's batched query +// returns the same projection. 
The kinds parameter is the set of +// kinds treated as call targets (function + method). +func (g *Graph) FileEditingContext(filePath string, kinds []NodeKind) *FileEditingContextResult { + if filePath == "" { + return nil + } + nodes := g.GetFileNodes(filePath) + if len(nodes) == 0 { + return nil + } + kset := make(map[NodeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + kset[k] = struct{}{} + } + res := &FileEditingContextResult{} + var fileNodeID string + var defNodeIDs []string + for _, n := range nodes { + if n == nil { + continue + } + if n.Kind == KindFile { + res.FileNode = n + fileNodeID = n.ID + continue + } + res.Defines = append(res.Defines, n) + if _, ok := kset[n.Kind]; ok { + defNodeIDs = append(defNodeIDs, n.ID) + } + } + if fileNodeID != "" { + for _, e := range g.GetOutEdges(fileNodeID) { + if e == nil { + continue + } + if e.Kind == EdgeImports { + res.Imports = append(res.Imports, e) + } + } + } + if len(defNodeIDs) == 0 { + return res + } + inEdges := g.GetInEdgesByNodeIDs(defNodeIDs) + outEdges := g.GetOutEdgesByNodeIDs(defNodeIDs) + callerIDSet := make(map[string]struct{}) + calleeIDSet := make(map[string]struct{}) + for _, id := range defNodeIDs { + for _, e := range inEdges[id] { + if e == nil || e.Kind != EdgeCalls { + continue + } + if e.From == "" { + continue + } + callerIDSet[e.From] = struct{}{} + } + for _, e := range outEdges[id] { + if e == nil || e.Kind != EdgeCalls { + continue + } + if e.To == "" { + continue + } + calleeIDSet[e.To] = struct{}{} + } + } + callerIDs := make([]string, 0, len(callerIDSet)) + for id := range callerIDSet { + callerIDs = append(callerIDs, id) + } + calleeIDs := make([]string, 0, len(calleeIDSet)) + for id := range calleeIDSet { + calleeIDs = append(calleeIDs, id) + } + callerNodes := g.GetNodesByIDs(callerIDs) + calleeNodes := g.GetNodesByIDs(calleeIDs) + for _, id := range callerIDs { + n := callerNodes[id] + if n == nil || n.FilePath == filePath { + continue + } 
+ res.CalledBy = append(res.CalledBy, n) + } + for _, id := range calleeIDs { + n := calleeNodes[id] + if n == nil || n.FilePath == filePath { + continue + } + res.Calls = append(res.Calls, n) + } + return res +} + +// GetFileSubGraph is the in-memory reference implementation of the +// FileSubGraphReader capability. Iterates the existing per-file +// byFile bucket and the per-node outEdges / inEdges shards — the +// same lookups Engine.GetFileSymbols' fallback path already runs, +// just collapsed behind one method so the disk backend can push the +// whole walk into a single query. +func (g *Graph) GetFileSubGraph(filePath string) ([]*Node, []*Edge) { + if filePath == "" { + return nil, nil + } + nodes := g.GetFileNodes(filePath) + if len(nodes) == 0 { + return nil, nil + } + ids := make([]string, 0, len(nodes)) + for _, n := range nodes { + if n != nil && n.ID != "" { + ids = append(ids, n.ID) + } + } + outByID := g.GetOutEdgesByNodeIDs(ids) + inByID := g.GetInEdgesByNodeIDs(ids) + type edgeKey struct { + from string + to string + kind EdgeKind + } + seen := make(map[edgeKey]struct{}, 2*len(ids)) + edges := make([]*Edge, 0, 2*len(ids)) + add := func(e *Edge) { + if e == nil { + return + } + k := edgeKey{from: e.From, to: e.To, kind: e.Kind} + if _, ok := seen[k]; ok { + return + } + seen[k] = struct{}{} + edges = append(edges, e) + } + for _, id := range ids { + for _, e := range outByID[id] { + add(e) + } + for _, e := range inByID[id] { + add(e) + } + } + return nodes, edges +} + +// GetFileSubGraphCounts is the in-memory reference implementation of +// FileSubGraphCountReader. The per-node bucket reads are already +// O(1) so it just walks GetFileSubGraph and reports len(edges); the +// row-materialisation win belongs to disk backends. 
+func (g *Graph) GetFileSubGraphCounts(filePath string) ([]*Node, int) { + nodes, edges := g.GetFileSubGraph(filePath) + return nodes, len(edges) +} + +// NodeDegreeByKinds is the in-memory reference implementation of the +// NodeDegreeByKinds capability. Walks NodesByKinds and reads each +// node's in/out edge buckets — the disk backend overrides with one +// kind-filtered aggregation per direction so the IN-list of node IDs +// the legacy NodeDegreeCounts path needed is avoided altogether. +func (g *Graph) NodeDegreeByKinds(kinds []NodeKind, pathPrefix string) []NodeDegreeRow { + if len(kinds) == 0 { + return nil + } + pool := g.NodesByKinds(kinds) + out := make([]NodeDegreeRow, 0, len(pool)) + for _, n := range pool { + if n == nil { + continue + } + if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { + continue + } + out = append(out, NodeDegreeRow{ + NodeID: n.ID, + InCount: len(g.GetInEdges(n.ID)), + OutCount: len(g.GetOutEdges(n.ID)), + }) + } + return out +} diff --git a/internal/graph/node.go b/internal/graph/node.go index d2c9c00e..18c3aa38 100644 --- a/internal/graph/node.go +++ b/internal/graph/node.go @@ -40,6 +40,35 @@ const ( // node, not its enclosing function. EdgeMemberOf links to the // enclosing function. EdgeCaptures lists outer bindings closed over. KindClosure NodeKind = "closure" + // KindLocal represents an intra-function binding — a variable + // declared inside a function body via `x := …` / `var x = …` / a + // range clause / a type-switch / a for-init clause. ID convention: + // `#local:@+` (the + // leading `+` flags the value as a relative offset so the IDs + // stay stable when the enclosing function moves as a whole). + // EdgeMemberOf links each binding to its enclosing function or + // method. KindLocal is excluded from the BM25 search index by + // shouldIndexForSearch — surfacing `err` / `data` / `n` / `i` + // from every function would flood every name lookup. 
The data- + // flow analysis (flow_between, taint_paths, ...) traverses the + // EdgeValueFlow / EdgeArgOf / EdgeReturnsTo edges that target + // these nodes; consumers that want the locals can ask for them + // by kind explicitly. + KindLocal NodeKind = "local" + // KindBuiltin represents a language intrinsic — a function / + // type / constant that's part of the language itself, not + // declared in any indexed source file. ID convention: + // `builtin::::` for functions (`builtin::go::append`, + // `builtin::py::len`) and `builtin::::type::` for + // types (`builtin::go::type::string`). Meta.builtin_kind ∈ + // "func" | "type" | "const". KindBuiltin is excluded from the + // BM25 search index — surfacing `string` / `int` / `append` + // would flood every name lookup. They participate in normal + // graph queries: `find_usages(builtin::go::type::float64)` + // answers "every variable typed as float64 in this codebase", + // which is the load-bearing query for type-drift / dataflow + // analyses. + KindBuiltin NodeKind = "builtin" // KindConstant peels off `const`, `iota`, top-level immutable // bindings, and language-specific constant declarations from // KindVariable. Existing variable-kind nodes are re-classified on diff --git a/internal/graph/node_id_parity_test.go b/internal/graph/node_id_parity_test.go index 35cc2034..8a74a7ad 100644 --- a/internal/graph/node_id_parity_test.go +++ b/internal/graph/node_id_parity_test.go @@ -231,10 +231,18 @@ func indexFixture(t *testing.T, checkoutName string) fixtureResult { for _, n := range g.AllNodes() { // This test is about source-symbol IDs (functions, methods, // types, files) — the things overlay merging keys on. - // Contract-kind nodes (kind=contract) don't currently carry a - // RepoPrefix field; skip them here so the parity gate is - // precise about what it gates. 
- if n.Kind == graph.KindContract { + // Contract / Module / Builtin nodes are deliberately + // cross-repo singletons (one `dep::foo`, `module::pypi:requests`, + // `builtin::go::len` shared across every repo that uses them) + // and don't carry RepoPrefix; skip them so the parity gate + // stays precise about what it gates. KindFunction nodes + // with meta.external=true are the per-symbol stubs the + // external-call attribution materialises for stdlib/dep + // targets — same rule. + if n.Kind == graph.KindContract || n.Kind == graph.KindModule || n.Kind == graph.KindBuiltin { + continue + } + if ext, _ := n.Meta["external"].(bool); ext { continue } if n.RepoPrefix == "" { diff --git a/internal/graph/overlay.go b/internal/graph/overlay.go index 1518f33a..dbb15864 100644 --- a/internal/graph/overlay.go +++ b/internal/graph/overlay.go @@ -331,6 +331,45 @@ func (v *OverlaidView) GetNode(id string) *Node { return v.base.GetNode(id) } +// GetNodesByIDs returns the overlay-aware *Node for each input ID. +// Overlay-owned IDs short-circuit to the per-session layer (and may +// resolve to nil when the overlay deleted the node); the remainder +// fans out as a single batched lookup against the base store. Missing +// IDs are simply absent from the returned map. +func (v *OverlaidView) GetNodesByIDs(ids []string) map[string]*Node { + if len(ids) == 0 { + return nil + } + out := make(map[string]*Node, len(ids)) + baseIDs := ids[:0:0] // fresh backing array — never aliases caller's slice + for _, id := range ids { + if id == "" { + continue + } + if _, dup := out[id]; dup { + continue + } + if v.layer != nil && v.nodeBelongsToOverlay(id) { + if n := v.layer.nodeByID[id]; n != nil { + out[id] = n + } + // Overlay tombstone — ID is hidden, do not fall back to base. + continue + } + // Track for the single base round-trip; reserve a slot in `out` + // only after the batched lookup returns. 
+ baseIDs = append(baseIDs, id) + } + if len(baseIDs) > 0 && v.base != nil { + for id, n := range v.base.GetNodesByIDs(baseIDs) { + if n != nil { + out[id] = n + } + } + } + return out +} + // GetNodeByQualName: overlay first, then base. Base hits are filtered // to drop entries whose file is overlaid (the overlay's view wins). func (v *OverlaidView) GetNodeByQualName(qualName string) *Node { @@ -351,6 +390,27 @@ func (v *OverlaidView) GetNodeByQualName(qualName string) *Node { return n } +// GetNodesByQualNames resolves each name through GetNodeByQualName so the +// overlay's layer-first / shadowed-file filtering applies — an inherited +// base batch would bypass the overlay. Per-name is fine: an interactive +// overlay's working set is small (the batch form exists for the +// cold-warmup scale on the base store, not here). Returns only hits. +func (v *OverlaidView) GetNodesByQualNames(qualNames []string) map[string]*Node { + out := make(map[string]*Node, len(qualNames)) + for _, q := range qualNames { + if q == "" { + continue + } + if _, done := out[q]; done { + continue + } + if n := v.GetNodeByQualName(q); n != nil { + out[q] = n + } + } + return out +} + // FindNodesByName merges base hits (filtered to drop nodes in // overlaid files unless the overlay re-emitted them) with overlay // hits. Order is overlay-first, then base — callers that picked @@ -383,6 +443,60 @@ func (v *OverlaidView) FindNodesByName(name string) []*Node { return out } +// FindNodesByNameContaining merges overlay-touched name hits with the +// base result, then re-applies the per-overlay-file masking the same +// way FindNodesByName does. Order is overlay-first, then base; the +// limit caps the merged total. Empty substr or both layers nil +// returns nil. 
+func (v *OverlaidView) FindNodesByNameContaining(substr string, limit int) []*Node { + if substr == "" { + return nil + } + needle := strings.ToLower(substr) + var out []*Node + // Overlay-side: walk the layer's nodesByName index — the same + // bucket FindNodesByName reads from — and accept any name whose + // lowercase form contains the needle. + if v.layer != nil { + for name, bucket := range v.layer.nodesByName { + if strings.Contains(strings.ToLower(name), needle) { + out = append(out, bucket...) + if limit > 0 && len(out) >= limit { + return out[:limit] + } + } + } + } + if v.base == nil { + return out + } + // Base-side: fetch with an inflated limit so overlay-mask drops + // don't leave a short page. Then re-apply the same overlaid-file + // + name-removed mask FindNodesByName uses. + fetch := limit + if fetch > 0 { + fetch *= 2 + } + for _, n := range v.base.FindNodesByNameContaining(substr, fetch) { + if v.layer != nil { + if v.layer.HasFile(IDFile(n.ID)) { + continue + } + if v.layer.nameRemoved[n.Name] != nil && v.layer.nameRemoved[n.Name][n.ID] { + continue + } + } + out = append(out, n) + if limit > 0 && len(out) >= limit { + return out[:limit] + } + } + if limit > 0 && len(out) > limit { + out = out[:limit] + } + return out +} + // GetFileNodes: if the path is overlaid, return overlay's nodes // (empty for tombstones). Otherwise pass through to base. func (v *OverlaidView) GetFileNodes(filePath string) []*Node { @@ -486,6 +600,113 @@ func (v *OverlaidView) GetInEdges(nodeID string) []*Edge { return out } +// GetOutEdgesByNodeIDs returns the overlay-aware outgoing-edge map for +// every input id. Overlay-owned ids short-circuit to the per-session +// layer; the remainder fans out as a single batched lookup against +// the base store. Output mirrors GetOutEdges's per-id semantics +// (target-side overlay deletions filtered out), but in one cgo +// round-trip per direction instead of N. 
+func (v *OverlaidView) GetOutEdgesByNodeIDs(ids []string) map[string][]*Edge { + if len(ids) == 0 { + return nil + } + out := make(map[string][]*Edge, len(ids)) + baseIDs := ids[:0:0] + seen := make(map[string]struct{}, len(ids)) + for _, id := range ids { + if id == "" { + continue + } + if _, dup := seen[id]; dup { + continue + } + seen[id] = struct{}{} + if v.layer != nil && v.nodeBelongsToOverlay(id) { + src := v.layer.outEdges[id] + cp := make([]*Edge, len(src)) + copy(cp, src) + out[id] = cp + continue + } + baseIDs = append(baseIDs, id) + } + if len(baseIDs) > 0 && v.base != nil { + base := v.base.GetOutEdgesByNodeIDs(baseIDs) + for id, edges := range base { + if v.layer == nil { + out[id] = edges + continue + } + filtered := edges[:0:0] + for _, e := range edges { + if v.layer.HasFile(IDFile(e.To)) { + if v.layer.nodeByID[e.To] == nil { + continue // target deleted in overlay + } + } + filtered = append(filtered, e) + } + out[id] = filtered + } + } + return out +} + +// GetInEdgesByNodeIDs is the inbound sibling of GetOutEdgesByNodeIDs. +// Merges base in-edges (filtered to drop edges sourced in overlaid +// files) with overlay-introduced in-edges for each input id, all in a +// single batched base round-trip. 
+func (v *OverlaidView) GetInEdgesByNodeIDs(ids []string) map[string][]*Edge { + if len(ids) == 0 { + return nil + } + out := make(map[string][]*Edge, len(ids)) + seen := make(map[string]struct{}, len(ids)) + uniq := ids[:0:0] + for _, id := range ids { + if id == "" { + continue + } + if _, dup := seen[id]; dup { + continue + } + seen[id] = struct{}{} + uniq = append(uniq, id) + } + if len(uniq) == 0 { + return out + } + if v.base != nil { + base := v.base.GetInEdgesByNodeIDs(uniq) + for _, id := range uniq { + edges := base[id] + if v.layer == nil { + out[id] = edges + continue + } + filtered := edges[:0:0] + for _, e := range edges { + if v.layer.HasFile(IDFile(e.From)) { + continue // source is overlaid — overlay's version wins + } + if v.layer.HasFile(IDFile(e.To)) && v.layer.nodeByID[e.To] == nil { + continue // target was deleted by overlay + } + filtered = append(filtered, e) + } + out[id] = filtered + } + } + if v.layer != nil { + for _, id := range uniq { + if extras := v.layer.inEdges[id]; len(extras) > 0 { + out[id] = append(out[id], extras...) + } + } + } + return out +} + // AllNodes returns base's nodes minus nodes in overlaid files, plus // every node the overlay introduced. Bulk-read consumers (analyzers, // search reindex, snapshot export) get an overlay-consistent view diff --git a/internal/graph/reader.go b/internal/graph/reader.go index 10936e0c..e9273417 100644 --- a/internal/graph/reader.go +++ b/internal/graph/reader.go @@ -21,6 +21,23 @@ type Reader interface { GetNode(id string) *Node GetNodeByQualName(qualName string) *Node FindNodesByName(name string) []*Node + // FindNodesByNameContaining returns nodes whose Name (case- + // insensitive) contains substr. The filter is pushed into the + // backend so only matching rows cross the boundary on a disk backend; + // the search hot path's substring fallback uses this instead of + // the old AllNodes()-then-filter pattern (which materialised the + // whole node set per call and didn't scale). 
limit caps the + // result; 0 means "no limit". + FindNodesByNameContaining(substr string, limit int) []*Node + + // GetNodesByIDs is the batched sibling of GetNode. The disk-backed + // store collapses N individual point lookups into a + // single bulk query — critical on the search hot path where one + // query materialises 60+ candidate IDs. The in-memory backend + // forwards to per-id GetNode, so the cost matches an inline loop + // there. Missing IDs are simply absent from the map (no nil + // values); duplicates dedupe naturally. + GetNodesByIDs(ids []string) map[string]*Node // File / repo scopes. GetFileNodes(filePath string) []*Node @@ -30,6 +47,19 @@ type Reader interface { GetOutEdges(nodeID string) []*Edge GetInEdges(nodeID string) []*Edge + // GetInEdgesByNodeIDs / GetOutEdgesByNodeIDs are the batched + // siblings of GetInEdges / GetOutEdges. The disk-backed store collapses + // N per-id queries into one bulk query over an `id IN $ids` + // filter; the in-memory backend forwards to per-id walks (no + // concurrency win — same algorithmic cost as an inline loop). On + // the rerank hot path this drops ~150 round-trips per + // search_symbols call down to ~4 (prepare collects every + // candidate's ids and fans them out in one inbound + one outbound + // batch). Missing nodes get nil slices in the returned map so + // callers can `for _, e := range m[id]` without an ok-check. + GetInEdgesByNodeIDs(ids []string) map[string][]*Edge + GetOutEdgesByNodeIDs(ids []string) map[string][]*Edge + // Bulk reads — used by analyzers (hotspots, cycles, dead code, // communities, …) and by the embedded query engine's whole-graph // passes. diff --git a/internal/graph/store.go b/internal/graph/store.go new file mode 100644 index 00000000..8e17b4ba --- /dev/null +++ b/internal/graph/store.go @@ -0,0 +1,1700 @@ +package graph + +import ( + "iter" + "sync" +) + +// EdgeReindex is the per-edge payload for ReindexEdges. 
Edge points +// at the (already mutated) Edge value the caller wants the store to +// re-bind; OldTo is the To target the edge had BEFORE the mutation, +// so the store can drop the stale in-edge index entry for OldTo +// while writing the new one for Edge.To. +type EdgeReindex struct { + Edge *Edge + OldTo string +} + +// EdgeProvenanceUpdate is the per-edge payload for +// SetEdgeProvenanceBatch. Edge points at the stored Edge whose +// origin should be promoted; NewOrigin is the target tier. The store +// only persists the change (and bumps EdgeIdentityRevisions) when +// NewOrigin differs from the currently stored Origin. +type EdgeProvenanceUpdate struct { + Edge *Edge + NewOrigin string +} + +// Store is the persistence-and-query backend the rest of gortex sees +// behind the *Graph type. The only implementation today is the +// in-memory *Graph; future implementations will include an on-disk +// embedded-DB backend (local single-binary) and a remote network +// client. The interface is the seam that lets the rest of the +// codebase be backend-agnostic. +// +// The method set deliberately mirrors *Graph's current public API so +// the codebase compiles unchanged the day this interface lands. A few +// notes on shape: +// +// - Slice-shaped reads (AllNodes / AllEdges / FindNodesByName / …) +// materialise their result in memory — fine for the in-memory +// store, but disk / remote backends will want iterator-shaped +// variants added alongside as those implementations come online. +// +// - Memory-estimate methods (RepoMemoryEstimate / +// AllRepoMemoryEstimates) are inherently in-memory specific; disk +// and remote backends return whatever they can compute and callers +// treat the result as advisory. +// +// - ResolveMutex() returns a backend-owned mutex that resolver +// instances (cross-repo, temporal, external) share to serialise +// their edge-mutation passes against each other and against the +// indexer's incremental rewrites. 
Every backend needs equivalent +// coordination; the in-memory store uses its existing +// graph-wide resolveMu, disk backends keep a dedicated mutex +// alongside their own write serialisation. The returned pointer +// is owned by the store and must not be Unlocked when not held. +type Store interface { + // --- Writes ----------------------------------------------------- + + AddNode(n *Node) + AddBatch(nodes []*Node, edges []*Edge) + AddEdge(e *Edge) + SetEdgeProvenance(e *Edge, newOrigin string) bool + ReindexEdge(e *Edge, oldTo string) + // Batched siblings of the per-edge mutators. Same semantics, but + // disk backends amortise the per-call transaction overhead by + // committing in implementation-chosen chunks (the in-memory + // backend just loops). The resolver fans out per-edge mutations + // across thousands of edges in a single ResolveAll pass, so the + // per-call form was unusable on disk backends without these. + // Callers MUST first mutate the *Edge fields they want persisted + // (To / Kind / Origin / …) before handing the entry over — these + // methods read the post-mutation Edge state and update the + // backend's indexes accordingly. + ReindexEdges(batch []EdgeReindex) + SetEdgeProvenanceBatch(batch []EdgeProvenanceUpdate) (changed int) + RemoveEdge(from, to string, kind EdgeKind) bool + EvictFile(filePath string) (nodesRemoved, edgesRemoved int) + EvictRepo(repoPrefix string) (nodesRemoved, edgesRemoved int) + + // --- Point lookups --------------------------------------------- + + GetNode(id string) *Node + GetNodeByQualName(qualName string) *Node + + // GetNodesByQualNames returns a map qualName→*Node (first match per + // qual_name) for the whole batch — the qual-name twin of + // FindNodesByNames. 
It pre-warms the resolver's import resolution: + // qual_name is unindexed on the disk backend, so the per-edge + // GetNodeByQualName in resolveImport is a full node scan per import + // edge; one batched IN-scan collapses that to a single query. + GetNodesByQualNames(qualNames []string) map[string]*Node + + // --- Name + scope queries -------------------------------------- + + FindNodesByName(name string) []*Node + FindNodesByNameInRepo(name, repoPrefix string) []*Node + // FindNodesByNameContaining returns nodes whose Name (case- + // insensitive) contains the given substring. The implementation + // pushes the filter into the backend so only matching rows cross + // the cgo boundary — the old search-substring fallback's + // AllNodes()-then-filter pattern materialised the whole node set + // per query and breaks at Linux-kernel scale (10M+ symbols). + // limit caps the result set so a very common substring can't blow + // up memory; pass 0 for "no limit" (caller's responsibility to + // handle). The order is implementation-defined — callers that + // need deterministic output sort the result. + FindNodesByNameContaining(substr string, limit int) []*Node + GetFileNodes(filePath string) []*Node + GetRepoNodes(repoPrefix string) []*Node + + // --- Edge adjacency -------------------------------------------- + + GetOutEdges(nodeID string) []*Edge + GetInEdges(nodeID string) []*Edge + + // GetInEdgesByNodeIDs / GetOutEdgesByNodeIDs batch the per-node + // edge fan-out into a single backend round-trip. The rerank + // pipeline calls these once per Rerank() to materialise every + // candidate's incoming + outgoing edges in two cgo round-trips + // instead of 6N per-candidate calls. Missing IDs are absent from + // the returned map (callers can index without an ok-check via the + // nil-slice semantics of map[k][]*Edge — range over nil is a no-op). 
+ GetInEdgesByNodeIDs(ids []string) map[string][]*Edge + GetOutEdgesByNodeIDs(ids []string) map[string][]*Edge + + // GetRepoEdges returns every edge whose source node has the given + // RepoPrefix. Equivalent to GetRepoNodes(r) followed by + // GetOutEdges(n.ID) for every n, but executes as a single backend + // query — critical on the disk backend (SQLite) + // where the per-node loop is O(repo_nodes) round-trips. The + // in-memory backend forwards to that same nested walk; the disk + // backends push the join into one server-side query. + // + // Empty repoPrefix returns nothing — use AllEdges() for the + // global view. Nodes with an empty RepoPrefix are unreachable + // through this method by design (they don't belong to any repo). + GetRepoEdges(repoPrefix string) []*Edge + + // --- Bulk reads ------------------------------------------------ + + AllNodes() []*Node + AllEdges() []*Edge + + // --- Predicate-shaped reads (push filters into the store) ------ + // + // These methods replace the pre-Store idiom of `for _, e := range + // AllEdges() { if cond { ... } }`. On the in-memory backend they + // iterate the existing internal byKind / byPrefix buckets — same + // algorithmic cost as the inline filter. On disk backends they + // fan out to dedicated indexes (idx_edge_kind / idx_node_kind / + // the to_id LIKE prefix scan, etc.) so the row count actually + // materialised is proportional to the predicate match, not the + // whole table. + // + // The resolver alone calls AllEdges/AllNodes 34× per pass and + // throws away >99% of each scan; using these predicate methods + // instead cut a 503-second disk-backed resolver pass on a 122k-node + // graph down to seconds. + // + // Iterators stop when the consumer's yield returns false. + // Implementations MUST honour early-stop so callers can break + // out of a search. + + // EdgesByKind yields every edge whose Kind matches. 
+ EdgesByKind(kind EdgeKind) iter.Seq[*Edge] + + // NodesByKind yields every node whose Kind matches. + NodesByKind(kind NodeKind) iter.Seq[*Node] + + // EdgesWithUnresolvedTarget yields every edge whose To has the + // "unresolved::" prefix. The resolver's main loop calls this + // once per pass; on disk backends it should range-scan a + // to-keyed index over the single contiguous "unresolved::" slice + // rather than materialise the whole edges table. + EdgesWithUnresolvedTarget() iter.Seq[*Edge] + + // --- Batched point lookups ------------------------------------- + // + // The resolver fires ~3-10 GetNode / FindNodesByName calls per + // unresolved edge across its workers. With 10-30k pending edges + // that's 100k-300k individual queries. On in-memory that's + // fine (map lookups, nanoseconds). On a disk backend each point + // lookup is ~ms — at 100k+ calls the per-pass cost is hundreds + // of seconds, dominating the resolver. The batched variants + // collapse those into one (or chunked) bulk query. + + // GetNodesByIDs returns a map id→*Node for every input ID present + // in the store. IDs not in the store are simply absent from the + // returned map (no nil values). Callers may pass duplicates; the + // returned map dedupes naturally. + GetNodesByIDs(ids []string) map[string]*Node + + // FindNodesByNames returns a map name→[]*Node where each slot + // holds every node whose Name field matches. Names that match no + // node are absent. Used by the resolver to pre-warm its name-only + // fallback lookup across the whole pending-edge slice in one + // batched call instead of one query per edge. 
+ FindNodesByNames(names []string) map[string][]*Node + + // --- Counts and stats ------------------------------------------ + + NodeCount() int + EdgeCount() int + Stats() GraphStats + RepoStats() map[string]GraphStats + RepoPrefixes() []string + + // --- Provenance verification ----------------------------------- + + EdgeIdentityRevisions() int + VerifyEdgeIdentities() error + + // --- Memory estimation (advisory; in-memory-specific) ---------- + + RepoMemoryEstimate(repoPrefix string) RepoMemoryEstimate + AllRepoMemoryEstimates() map[string]RepoMemoryEstimate + + // --- Coordination ---------------------------------------------- + + // ResolveMutex returns a backend-owned mutex resolver instances + // share to serialise edge-mutation passes. See the package doc + // above for the full contract. + ResolveMutex() *sync.Mutex +} + +// Compile-time assertion: *Graph satisfies the Store interface. If a +// *Graph method's signature ever drifts from the interface, the build +// fails fast here instead of at runtime when a different Store +// implementation gets swapped in. +var _ Store = (*Graph)(nil) + +// BackendResolver is an optional interface backends MAY implement to +// drain the bulk-tractable subset of the resolver's work entirely +// inside the backend engine (a single server-side bulk UPDATE on the +// disk backend) instead of round-tripping every +// resolution decision back to Go. +// +// Sequencing matters: earlier rules are higher-precision than later +// ones. The orchestrator (ResolveAllBulk) runs them in the order +// listed below so that, e.g., an intra-file call binds to its same- +// file declaration before the unique-name pass would have bound it +// to a same-named symbol elsewhere in the repo. +// +// Each method returns the number of pending edges it drained. +// Unimplemented methods return (0, nil) and the orchestrator skips +// to the next. 
Errors surface as non-fatal — the orchestrator logs +// and continues with subsequent rules; the Go-side Resolver then +// picks up whatever the bulk pass didn't drain. +type BackendResolver interface { + // ResolveSameFile: unresolved::Name where target is in the + // caller's same source file. Strongest precision — a same-file + // declaration is almost never ambiguous. + ResolveSameFile() (resolved int, err error) + + // ResolveSamePackage: unresolved::Name where target is in the + // caller's same directory (Go package). Repo_prefix must match + // to keep the rule within one source tree. + ResolveSamePackage() (resolved int, err error) + + // ResolveImportAware: caller's file imports F, target is a + // symbol in F. Joins against the EdgeImports adjacency. + ResolveImportAware() (resolved int, err error) + + // ResolveRelativeImports: unresolved::pyrel:: / Dart + // relative-URI stubs rewritten to the matching KindFile node + // (e.g. .py or /__init__.py for Python). + // `lang` selects the dialect; empty string runs all supported + // dialects in turn. + ResolveRelativeImports(lang string) (resolved int, err error) + + // ResolveCrossRepo: unresolved::Name where exactly one + // cross-repo Node carries that name. Lower precision than the + // same-repo rules; sets cross_repo = true on the resulting edge. + ResolveCrossRepo() (resolved int, err error) + + // ResolveUniqueNames: unresolved::Name where exactly one Node + // in the entire graph carries that name. Lowest-precision + // "fallback" — runs after the same-file / same-package / + // import-aware passes have drained anything they could resolve + // more precisely. + ResolveUniqueNames() (resolved int, err error) + + // ResolveExternalCallStubs: ensures every external::* edge + // target has a corresponding Node row (the existing + // SynthesizeExternalCalls pass on the Go side). Promotes + // origin to ast_resolved for edges that now point at a real + // stub. 
+ ResolveExternalCallStubs() (resolved int, err error) + + // ResolveAllBulk runs the bulk-tractable methods in + // precision-descending order and returns the cumulative count + // of edges resolved across all rules. The default backend + // implementation should chain the methods above; callers use + // ResolveAllBulk as the single Resolver-side hook. + ResolveAllBulk() (totalResolved int, err error) +} + +// BulkLoader is an optional interface backends MAY implement to expose +// a high-throughput cold-load fast path that bypasses per-call query +// overhead. The cold-start indexer fires ~2000 small AddBatch calls +// during its parse phase; on backends where every AddBatch round-trips +// through a query parser that per-call cost +// dominates wall time. BulkLoader lets the indexer bracket the parse +// loop with BeginBulkLoad / FlushBulk: AddBatch calls inside the +// bracket buffer rows in memory, and FlushBulk commits them through +// the backend's native bulk primitive. +// +// Contract: +// +// - BeginBulkLoad may be called on a non-empty store. The cold-start +// parse phase calls it on an empty store, but later passes (notably +// the contracts pass, which appends a few hundred contract nodes / +// edges after resolve) re-enter the bracket against a populated +// backend. FlushBulk commits via the backend's native bulk +// primitive in MERGE-on-primary-key mode, so re-appending rows +// that share an ID with existing data does not duplicate them. +// +// - Between BeginBulkLoad and FlushBulk, AddBatch is the only mutator +// the caller may invoke. Reads (GetNode, AllEdges, EdgesByKind, …) +// return whatever the backend can see — typically nothing buffered. +// The resolver MUST NOT run until after FlushBulk. +// +// - FlushBulk commits everything buffered since BeginBulkLoad and +// returns the backend to normal per-call write mode. An error +// leaves the store in an implementation-defined state. 
+// +// - Calling BeginBulkLoad twice without an intervening FlushBulk, +// or calling FlushBulk without a prior BeginBulkLoad, is a +// programmer error; backends are free to panic. +// +// The in-memory *Graph deliberately does NOT implement BulkLoader — +// it's already optimal at the per-call path. bbolt and SQLite likewise +// skip it: their per-call overhead is already amortised by their own +// internal batching (chunked transactions, prepared statements). The +// interface is intentionally opt-in so the indexer can probe with a +// type assertion and fall through to today's per-batch path uniformly. +type BulkLoader interface { + BeginBulkLoad() + FlushBulk() error +} + +// SymbolHit is a single full-text-search result: the matched node ID +// plus its relevance score from the backend's scorer (BM25 in +// the disk backend's FTS). Higher score = more relevant. +type SymbolHit struct { + NodeID string + Score float64 +} + +// SymbolFTSItem is the payload BulkUpsertSymbolFTS takes per node: +// the node's ID and its pre-tokenised text. Reused so the indexer +// can preallocate one slice and the backend can iterate without +// per-element wrapper allocs. +type SymbolFTSItem struct { + NodeID string + Tokens string +} + +// SymbolSearcher is an optional interface backends MAY implement to +// expose engine-native full-text search over the graph's symbol +// names. When the backing store implements it, the daemon's +// search_symbols path routes through the backend FTS instead of +// building a parallel in-process Bleve/BM25 index — saving ~100MB +// of heap on a vscode-scale repo and putting the search latency in +// the same address space as the rest of the graph. +// +// Contract: +// +// - UpsertSymbolFTS is the per-call write path used by incremental +// reindex. The store decides how to persist the pre-tokenised +// text (a sidecar table, an FTS column, an in-engine index — +// backend choice). 
Tokens are produced by +// internal/search.Tokenize so camelCase / snake_case / path- +// separator semantics match the existing BM25 corpus contract. +// +// - BulkUpsertSymbolFTS is the cold-start fast path used by the +// indexer's shadow-swap drain. Implementations SHOULD use the +// backend's native bulk primitive +// so a 600k-node repo doesn't pay per-row query parse cost. +// Idempotent on NodeID like UpsertSymbolFTS — re-running with +// an overlapping set replaces in place. +// +// repoPrefix is the per-repo namespace; the store wipes only +// rows owned by that prefix before COPYing the new items, so +// multiple repos sharing one store don't clobber each other's +// FTS corpus. Empty prefix means "single-repo mode" — the +// store wipes everything (the legacy behaviour). +// +// - BuildSymbolIndex finalises the index after the bulk parse +// phase. For backends whose FTS index updates automatically on +// row writes, this is a one-shot cold-start call; +// for backends that need an explicit build pass, it's where +// the work happens. Idempotent — safe to call multiple times. +// +// - SearchSymbols runs a query and returns hits ordered by score +// descending. The query string is the user's raw input; the +// backend is expected to tokenise it the same way it tokenised +// the indexed text (typically by passing it through +// internal/search.Tokenize before invoking the FTS). +// +// - Close is implied by graph.Store.Close — no separate +// teardown method here. +type SymbolSearcher interface { + UpsertSymbolFTS(nodeID, tokens string) error + BulkUpsertSymbolFTS(repoPrefix string, items []SymbolFTSItem) error + BuildSymbolIndex() error + SearchSymbols(query string, limit int) ([]SymbolHit, error) +} + +// SymbolBundle is the rerank-shaped result of one search call: the +// matched node, its BM25 score, AND the in/out edges the rerank +// pipeline reads from. 
Backends that can compose this in a single +// engine round-trip implement SymbolBundleSearcher; callers can fall +// through to SymbolSearcher + GetNodesByIDs + GetIn/OutEdgesByNodeIDs +// when the backend doesn't. +// +// The same node may appear in successive bundles when a multi-call +// retrieval path (primary + expansion) returns it more than once; the +// caller's dedup-by-ID step keeps the per-call shape simple and the +// engine can merge across calls into a single rerank candidate set +// without paying for the duplicate edge fetch — the second occurrence +// already carries the same edges. +type SymbolBundle struct { + Node *Node + Score float64 + InEdges []*Edge + OutEdges []*Edge +} + +// SymbolBundleSearcher is an optional capability backends MAY +// implement to fold the symbol-search hot path's three +// per-BM25-call cgo round-trips (FTS + GetNodesByIDs + the rerank +// prepare's batched in/out edge fetch) into one bundled +// engine-side call: +// +// - FTS yields (id, score) +// - One batched node materialise + one in-edge fan-in + one +// out-edge fan-out, all keyed on the same id list, return the +// bundle. +// +// Backends that do NOT implement this interface still serve the +// search path through SymbolSearcher; callers fall back to +// SymbolSearcher.SearchSymbols + GetNodesByIDs + +// GetIn/OutEdgesByNodeIDs and pay the per-call cgo cost the +// bundled form avoids. The contract is intentionally read-only — +// writes still go through UpsertSymbolFTS / BulkUpsertSymbolFTS on +// the SymbolSearcher. +type SymbolBundleSearcher interface { + SearchSymbolBundles(query string, limit int) ([]SymbolBundle, error) +} + +// VectorItem is the payload BulkUpsertEmbeddings takes per node: +// the node's ID and its embedding vector. Length of Vec must +// match the dim the corresponding BuildVectorIndex call declared +// — backends with fixed-width vector columns reject inserts that +// don't match. 
+type VectorItem struct { + NodeID string + Vec []float32 +} + +// VectorHit is a single ANN search result: the matched node ID +// plus its distance to the query vector under the backend's +// metric (cosine by default). LOWER distance = more +// similar. Callers that need a similarity score in [0,1] should +// translate via `1 - distance` for cosine. +type VectorHit struct { + NodeID string + Distance float64 +} + +// VectorSearcher is an optional interface backends MAY implement to +// expose engine-native HNSW vector indexing over per-symbol +// embedding vectors. When the backing store implements it, the +// daemon's semantic-search path routes through the backend's +// native ANN index instead of holding a parallel in-process +// HNSW — saving roughly `dim × 4 × N` bytes of heap (≈ 1 GB for +// 384-dim × 663k symbols on a Vscode-scale repo). +// +// The bigger win is that vector neighbours and graph traversal can +// be combined in a single server-side round-trip: an ANN seed +// lookup feeding straight into an adjacency match (e.g. "callers +// of the nearest symbols, scoped to one repo and excluding tests"). +// +// Today this is three round-trips on the in-process HNSW +// path (ANN → IDs → graph fetch → Go-side filter); with +// VectorSearcher it's one engine-side pipeline. +// +// Contract: +// +// - UpsertEmbedding is the per-call write path used by +// incremental reindex when one file's embeddings change. +// +// - BulkUpsertEmbeddings is the cold-start fast path used by +// the indexer's embedding pass. Implementations SHOULD use +// the backend's native bulk primitive so a 600k-node corpus +// doesn't pay per-row query parse cost. Idempotent on NodeID +// — re-running with an overlapping set replaces in place. +// +// - BuildVectorIndex finalises the HNSW index after the bulk +// populate. The dim parameter declares the embedding +// width; backends with fixed-width columns lazily create +// the storage schema on the first BuildVectorIndex call. 
+// Idempotent — safe to call multiple times with the same dim. +// +// - SimilarTo runs an ANN query: given a vector, return the k +// closest stored vectors ordered by ascending distance. +// +// - Close is implied by graph.Store.Close — no separate +// teardown method here. +type VectorSearcher interface { + UpsertEmbedding(nodeID string, vec []float32) error + BulkUpsertEmbeddings(items []VectorItem) error + BuildVectorIndex(dims int) error + SimilarTo(vec []float32, limit int) ([]VectorHit, error) +} + +// PageRankOpts tunes the PageRank computation. Zero values request +// the backend default — only set fields you genuinely want to +// override so backends can pick their own parallel-tuned defaults +// without the caller second-guessing the constants. +// +// NodeKinds / EdgeKinds restrict the projected subgraph the +// algorithm runs over. Empty means "all kinds" — the algo sees the +// full graph. A non-empty filter is rewritten into a projected- +// graph predicate (e.g. n.kind = "function"). +type PageRankOpts struct { + NodeKinds []NodeKind + EdgeKinds []EdgeKind + DampingFactor float64 + MaxIterations int + Tolerance float64 + Limit int // 0 = return every ranked node +} + +// PageRankHit is one row of the PageRank output: the node ID plus +// its rank score. Hits come back sorted by rank descending. +type PageRankHit struct { + NodeID string + Rank float64 +} + +// PageRanker is an optional interface backends MAY implement to +// expose engine-native PageRank centrality. When the store +// implements it, the daemon's hotspot / authority-ranking path +// routes through the backend's parallel implementation instead of +// computing degree-centrality in-process. 
+// +// Engine-native PageRank is qualitatively different from the +// degree-based hotspot analyzer: random-walk authority weights +// rare-but-influential nodes the degree count would miss +// (a low-fan-in API that's called from every domain layer ranks +// higher than a high-fan-in test helper). +// +// Contract: +// +// - PageRank runs the algorithm against a projected subgraph and +// returns hits sorted by rank descending. The projection is +// declared and torn down per call — callers don't manage +// PROJECT_GRAPH lifecycle directly. +// +// - The score is normalized so the full corpus sums to 1. +// Relative ordering — not the absolute value — is what callers +// should consume. +// +// - Close is implied by graph.Store.Close. +type PageRanker interface { + PageRank(opts PageRankOpts) ([]PageRankHit, error) +} + +// CommunityOpts tunes Louvain community detection over a projected +// subgraph. Zero values request the backend default +// (maxPhases=20, maxIterations=20). NodeKinds / EdgeKinds +// restrict the projection; an empty filter runs over the full graph. +type CommunityOpts struct { + NodeKinds []NodeKind + EdgeKinds []EdgeKind + MaxPhases int + MaxIterations int +} + +// CommunityHit is one row of the Louvain output: the node ID plus +// the integer community label the algorithm assigned. Two nodes +// with the same CommunityID are in the same community; the actual +// integer is opaque and promises no stability across runs. +type CommunityHit struct { + NodeID string + CommunityID int64 +} + +// CommunityDetector is an optional interface backends MAY +// implement to expose engine-native Louvain community detection. +// When the store implements it, the daemon's +// analysis.DetectCommunitiesLouvain +// path can delegate the partitioning step and keep the existing +// post-processing (label disambiguation, hub detection, cohesion, +// parent assignment). 
+// +// Contract: +// +// - Louvain runs the algorithm against a projected subgraph and +// returns one hit per node assigning it to a community. The +// projection is declared and torn down per call. +// +// - The engine-native implementation treats edges as undirected (the +// modularity score is computed on the undirected graph even +// though the projected Edge table is directed). Callers that +// care about directed modularity should consult the in-process +// fallback. +// +// - Close is implied by graph.Store.Close. +type CommunityDetector interface { + Louvain(opts CommunityOpts) ([]CommunityHit, error) +} + +// ComponentOpts tunes connected-component computation over a +// projected subgraph. Zero values request the backend default +// (maxIterations=100). NodeKinds / EdgeKinds restrict +// the projection. +type ComponentOpts struct { + NodeKinds []NodeKind + EdgeKinds []EdgeKind + MaxIterations int +} + +// ComponentHit is one row of a connected-component output: the +// node ID plus the integer component label the algorithm assigned. +// Two nodes with the same ComponentID are in the same component. +// The integer is opaque. +type ComponentHit struct { + NodeID string + ComponentID int64 +} + +// ComponentFinder is an optional interface backends MAY implement +// to expose engine-native weakly- and strongly-connected-component +// algorithms. Two methods because the algorithms answer different +// questions: +// +// - WeaklyConnectedComponents treats edges as undirected — every +// pair of nodes reachable from each other (ignoring direction) +// lands in one component. Useful for "is this symbol part of +// the connected core?" diagnostics. +// +// - StronglyConnectedComponents respects edge direction — only +// nodes mutually reachable end up in the same component. The +// SCC of a call graph is the cycle structure: every non- +// trivial SCC (size > 1) is a mutual-recursion ring. 
+// +// When the store implements ComponentFinder, the daemon's +// connectivity diagnostics and circular-dependency detection +// (`analyze kind=wcc` / `analyze kind=scc`) route through it; +// otherwise the in-process analysis.ComputeWCC / analysis.ComputeSCC +// fallbacks run. +type ComponentFinder interface { + WeaklyConnectedComponents(opts ComponentOpts) ([]ComponentHit, error) + StronglyConnectedComponents(opts ComponentOpts) ([]ComponentHit, error) +} + +// KCoreOpts tunes k-core decomposition. NodeKinds / EdgeKinds +// restrict the projection. The algorithm itself takes no +// per-call parameters — it always computes the full +// decomposition (every node gets its k-degree). +type KCoreOpts struct { + NodeKinds []NodeKind + EdgeKinds []EdgeKind +} + +// KCoreHit is one row of the k-core output: the node ID plus the +// largest k for which the node remains in the k-core after +// iteratively pruning nodes with degree < k. A node's KDegree is +// its position in the core hierarchy — high values mean the node +// sits inside a densely connected centre. +type KCoreHit struct { + NodeID string + KDegree int64 +} + +// KCorer is an optional interface backends MAY implement to +// expose engine-native k-core decomposition. When the store +// implements it, the daemon's `analyze kind=kcore` path delegates +// to the engine-native implementation; otherwise +// analysis.ComputeKCore runs in-process. +// +// k-core finds the densest subgraph: the k-core of a graph is +// the largest subgraph where every node has at least k +// neighbours. The k-degree of a node is the largest k for which +// it stays in the k-core — useful for "find the hub-of-hubs", or +// "what's the core infrastructure code that everything depends +// on". +type KCorer interface { + KCoreDecomposition(opts KCoreOpts) ([]KCoreHit, error) +} + +// DeadCodeCandidator is an optional capability backends MAY implement +// to compute the dead-code candidate set server-side. 
The default Go +// path in analysis.FindDeadCode pulls every node + a batched in-edge +// map and filters in Go; on a disk backend that's +// ~1.3M edge rows per call. A backend that implements +// DeadCodeCandidator runs the equivalent WHERE-NOT-EXISTS filter +// inside the query engine and returns ~hundreds of true candidates, +// skipping the materialise-then-filter loop entirely. +// +// The opts mirror analysis.FindDeadCodeOptions to keep the surface +// in sync — only the fields the backend can act on (kinds + the +// per-kind in-edge allowlist) are honoured. File-path / build-tag +// / well-known-name exclusions stay in Go because they need +// string parsing the backend can't do efficiently. +type DeadCodeCandidator interface { + // DeadCodeCandidates returns nodes matching the allowed node + // kinds that have NO incoming edges of the corresponding + // allowed in-edge kinds. The map keys the in-edge allowlist by + // node kind — backends evaluate the right allowlist per row. + // Empty allowedInEdgeKinds for a kind means "any incoming edge + // counts as usage". + DeadCodeCandidates(allowedNodeKinds []NodeKind, allowedInEdgeKinds map[NodeKind][]EdgeKind) []*Node +} + +// IfaceImplementsRow is the per-row payload returned by +// IfaceImplementsScanner — one tuple per EdgeImplements edge whose +// target is a KindInterface node carrying Meta["methods"]. TypeID +// is the implementing type (the edge's source); IfaceID is the +// interface (the edge's target); IfaceMeta is the interface +// node's decoded Meta map, from which the caller pulls the +// "methods" field. Rows where the interface had no Meta are +// elided server-side. +type IfaceImplementsRow struct { + TypeID string + IfaceID string + IfaceMeta map[string]any +} + +// IfaceImplementsScanner returns the set of (typeID, interfaceID, +// interfaceMeta) tuples for every EdgeImplements edge where the +// target is a KindInterface node carrying Meta["methods"]. 
Used by +// analysis.FindDeadCode to compute "type implements interface, so +// these methods are alive even if never called directly". The +// server-side join is one query; the Go-side equivalent fetched +// every interface node then every implements edge separately. +// +// Optional capability — analysis.FindDeadCode falls back to the +// Go-side scan when the backend doesn't implement it. +type IfaceImplementsScanner interface { + IfaceImplementsRows() []IfaceImplementsRow +} + +// NodeDegreeRow is one tuple returned by NodeDegreeAggregator. InCount +// counts EVERY incoming edge (any kind); OutCount counts EVERY outgoing +// edge; UsageInCount counts only the subset whose kind is in the +// "usage" set (Calls, References, Instantiates, Implements, Extends, +// Reads, Writes, Tests). The split exists because connectivity_health +// needs the totals (for isolated / leaf classification) AND the +// usage-edge presence (to fold ClassifyZeroEdge's logic in +// server-side); pulling them in one row saves a second cgo trip per +// node. +type NodeDegreeRow struct { + NodeID string + InCount int + OutCount int + UsageInCount int +} + +// NodeDegreeAggregator is an optional capability backends MAY +// implement to return per-node in/out edge counts plus a usage-edge +// count, server-side. Used by analysis.GraphConnectivity to replace +// the per-node g.GetInEdges(id) + g.GetOutEdges(id) + +// graph.ClassifyZeroEdge(id) trio — three full edge materialisations +// per node on a disk backend. +// One round-trip returns all three counts and lets the analyzer +// classify isolated / leaf / source-only / sink-only / extraction-gap +// without ever materialising the underlying edge structs. +// +// The usageKinds slice MUST mirror graph.usageEdgeKinds (the set +// ClassifyZeroEdge consults). Empty usageKinds means UsageInCount is +// always 0; an empty input ids slice returns nil. 
+// +// Optional capability — GraphConnectivity falls back to the per-node +// GetInEdges/GetOutEdges path when the backend doesn't implement it. +type NodeDegreeAggregator interface { + NodeDegreeCounts(ids []string, usageKinds []EdgeKind) []NodeDegreeRow +} + +// NodeFanRow is one tuple returned by NodeFanAggregator. FanIn counts +// incoming edges whose kind is in the fanInKinds set; FanOut counts +// outgoing edges whose kind is in the fanOutKinds set. The two kind +// sets are passed by the caller so the same capability serves both +// FindHotspots (fanIn = Calls+References, fanOut = Calls) and any +// future analyzer with a different kind split. +type NodeFanRow struct { + NodeID string + FanIn int + FanOut int +} + +// NodeFanAggregator is an optional capability backends MAY implement +// to compute per-node fan-in / fan-out counts filtered by edge kind, +// server-side. Used by analysis.FindHotspots and +// handleAnalyzeHealthScore to replace the AllEdges() materialisation +// they both ran every call (~500k edges on the gortex +// workspace, the bulk of the wall-clock cost on a disk backend). The Go-side +// crossing computation still needs per-edge (from, to) for the +// Calls/References kinds — that runs through EdgesByKind, which +// streams without materialising the full edge set. +// +// Empty ids => nil; empty fanInKinds / fanOutKinds means that side +// is always 0. Output order is unspecified. +// +// Optional capability — both analyzers fall back to the AllEdges scan +// when the backend doesn't implement it. +type NodeFanAggregator interface { + NodeFanCounts(ids []string, fanInKinds []EdgeKind, fanOutKinds []EdgeKind) []NodeFanRow +} + +// FileImporterRow is the per-row payload returned by FileImporters. +// FromFile is the importing file's path (the result the caller cares +// about); FromID / FromName / FromKind describe the node that owns +// the EdgeImports edge, in case the caller needs more than just the +// file list. 
+type FileImporterRow struct { + FromFile string + FromID string + FromName string + FromKind NodeKind +} + +// FileImporters is an optional capability backends MAY implement to +// answer "which files import filePath?" with a single backend round- +// trip instead of a Go-side AllEdges() scan. The MCP check_references +// tool's importing-files block hammered AllEdges() per call: ~286k +// edges materialised on the gortex workspace, then a per- +// edge GetNode(e.To) + GetNode(e.From) — multiple thousand backend +// round-trips for a single check_references call. A backend that implements +// FileImporters runs the equivalent join inside the query engine and +// only surfaces the rows that match. +// +// Match semantics mirror the original handler: an EdgeImports edge +// counts when its To node's FilePath equals filePath OR when the To +// node's ID equals filePath (the file's own node id, used by the +// indexer for file-level import bindings). The same-file dedup the +// caller applies stays in Go — backends just stream the candidate +// rows. +// +// Optional capability — handleCheckReferences falls back to the +// AllEdges-driven loop when the backend doesn't implement it. +type FileImporters interface { + FileImporters(filePath string) []FileImporterRow +} + +// InEdgeCounter is an optional capability backends MAY implement to +// compute incoming-edge fan-in counts per target node for a fixed +// set of edge kinds in one backend round-trip. The fallback iterates +// AllEdges() Go-side; on a disk backend that materialises every edge +// (~286k rows on the gortex workspace) just to bucket by To. +// The capability instead runs a single server-side GROUP BY filtered +// by edge kind and ships back only the per-target +// counts — a fraction of the rows and zero per-row Go object alloc. +// +// Used by handleGetUntestedSymbols to compute the calls+references +// fan-in ranking. 
The map keys are node IDs; values are the integer +// count of matching incoming edges. Targets with zero matching in- +// edges are absent from the map (callers index with `m[id]` and rely +// on the zero-value default). +// +// Optional capability — the handler falls back to AllEdges-driven +// bucketing when the backend doesn't implement it. +type InEdgeCounter interface { + InEdgeCountsByKind(kinds []EdgeKind) map[string]int +} + +// NodesInFilesByKindFinder is an optional capability backends MAY +// implement to answer "which nodes of kinds K live in files F?" +// with a single backend round-trip. The fallback iterates AllNodes() +// Go-side; on a disk backend that materialises the full node table +// per call. The capability instead runs a single server-side query +// filtering by file path and kind, and ships only the matching rows. +// +// Used by handleFindDeclaration to build the per-file enclosing- +// symbol index off the small set of trigram-match file paths. The +// Go fallback's AllNodes pull was ~70k rows on the gortex workspace +// to land at ~hundreds of relevant rows. +// +// Empty files / empty kinds returns nil — never a whole-graph scan. +// +// Optional capability — the handler falls back to AllNodes when the +// backend doesn't implement it. +type NodesInFilesByKindFinder interface { + NodesInFilesByKind(files []string, kinds []NodeKind) []*Node +} + +// FileMtimeWriter is an optional capability backends MAY implement to +// persist the per-file modification time the indexer uses for its +// incremental-reindex decisions. Lifting this state off the daemon's +// gob+gzip snapshot makes warm restarts read it through the same +// backend the graph already lives in (no second persistence surface +// to keep coherent). +// +// repoPrefix is the indexer's own prefix tag; mtimes is keyed on the +// repo-relative file path (the same key the in-memory Indexer's +// fileMtimes map uses). 
Empty input is a no-op; empty repoPrefix is +// allowed for single-repo daemons. +type FileMtimeWriter interface { + BulkSetFileMtimes(repoPrefix string, mtimes map[string]int64) error +} + +// FileMtimeReader is the read side of FileMtimeWriter. Returns the +// recorded mtimes for one repo prefix as a fresh map (nil for "no +// data"). Used by warmup to seed ReconcileRepoCtx with the per-file +// mtimes it would otherwise have read from the gob snapshot. +type FileMtimeReader interface { + LoadFileMtimes(repoPrefix string) map[string]int64 +} + +// FileMtimeReplacer is an optional capability: persist the AUTHORITATIVE +// full mtime set for a repo prefix, dropping any previously-stored rows for +// files no longer present. The full-index persist path calls this so files +// deleted since the last index are pruned. A backend that only implements +// the upsert-only FileMtimeWriter leaves deleted-file rows behind, and +// warm-restart reconcile then detects them as phantom deletions on every +// restart — forcing a full re-track that never converges. Empty input is a +// no-op (it must never wipe a repo's mtimes from an empty snapshot). +type FileMtimeReplacer interface { + ReplaceFileMtimes(repoPrefix string, mtimes map[string]int64) error +} + +// FileMtimeDeleter is an optional capability: drop the persisted mtime rows +// for a set of repo-relative file paths. The incremental-reindex / watcher +// path calls it when a file is deleted so the persisted set stays in step +// with the live graph (the per-file sibling of FileMtimeReplacer). Empty +// input is a no-op. +type FileMtimeDeleter interface { + DeleteFileMtimes(repoPrefix string, paths []string) error +} + +// CloneShingleWriter is an optional capability backends MAY implement +// to persist each function/method node's MinHash shingle set (a +// []uint64) keyed by node id. 
Lifting this state into the same backend +// the graph already lives in lets the maintained clone-detection +// count-min sketch (CMS) be rebuilt after a warm restart from the +// persisted snapshot — no re-parse, no second persistence surface to +// keep coherent. It is the shingle-set sibling of FileMtimeWriter. +// +// repoPrefix is the indexer's own prefix tag; rows is keyed on the +// node id whose shingle set the value carries. Empty input is a +// no-op; empty repoPrefix is allowed for single-repo daemons. +// DeleteCloneShingles drops the rows for a set of node ids (evicted +// or rebuilt symbols) so the persisted snapshot stays in step with +// the live graph; empty input is a no-op. +type CloneShingleWriter interface { + BulkSetCloneShingles(repoPrefix string, rows map[string][]uint64) error + DeleteCloneShingles(nodeIDs []string) error +} + +// CloneShingleReader is the read side of CloneShingleWriter. Returns +// the recorded shingle sets for one repo prefix as a fresh map (nil +// for "no data"). Used by warmup to reseed the clone-detection CMS +// from the persisted snapshot instead of re-shingling every body. +type CloneShingleReader interface { + LoadCloneShingles(repoPrefix string) (map[string][]uint64, error) +} + +// ChurnEnrichment is one node's git-churn enrichment, moved out of +// nodes.meta into a typed sidecar (change A). Maps 1:1 to the payload +// internal/churn.EnrichGraph used to stamp on Meta["churn"]/["churn_meta"]. +// HeadSHA/Branch/ComputedAt are file-level only (empty for symbols). +type ChurnEnrichment struct { + NodeID string + RepoPrefix string + CommitCount int + AgeDays int + ChurnRate float64 + LastAuthor string + LastCommitAt string // RFC3339 + HeadSHA string + Branch string + ComputedAt string // RFC3339 +} + +// ChurnEnrichmentWriter is an optional capability backends MAY implement +// to persist git-churn enrichment in a typed sidecar instead of the +// node meta blob. 
When absent the enricher falls back to stamping +// Node.Meta (legacy path). +type ChurnEnrichmentWriter interface { + BulkSetChurn(repoPrefix string, rows []ChurnEnrichment) error + DeleteChurn(nodeIDs []string) error +} + +// ChurnEnrichmentReader is the read side. ChurnRows returns every churn +// row for repoPrefix; an EMPTY repoPrefix returns ALL rows across repos +// (the cross-repo read get_churn_rate uses, then scope-filters per node). +type ChurnEnrichmentReader interface { + ChurnRows(repoPrefix string) []ChurnEnrichment +} + +// CoverageEnrichment is one node's coverage enrichment (change A), +// moved out of nodes.meta into a typed sidecar. +type CoverageEnrichment struct { + NodeID string + RepoPrefix string + CoveragePct float64 + NumStmt int + Hit int +} + +// CoverageEnrichmentWriter persists coverage enrichment in a typed +// sidecar. Optional capability; absent → enricher falls back to Meta. +type CoverageEnrichmentWriter interface { + BulkSetCoverage(repoPrefix string, rows []CoverageEnrichment) error + DeleteCoverage(nodeIDs []string) error +} + +// CoverageEnrichmentReader reads coverage rows; empty repoPrefix returns +// ALL rows across repos. +type CoverageEnrichmentReader interface { + CoverageRows(repoPrefix string) []CoverageEnrichment +} + +// ReleaseEnrichment is one file node's "first appeared in <release>" +// enrichment (change A), moved out of nodes.meta. +type ReleaseEnrichment struct { + NodeID string + RepoPrefix string + AddedIn string +} + +// ReleaseEnrichmentWriter persists release enrichment in a typed sidecar. +type ReleaseEnrichmentWriter interface { + BulkSetReleases(repoPrefix string, rows []ReleaseEnrichment) error + DeleteReleases(nodeIDs []string) error +} + +// ReleaseEnrichmentReader reads release rows; empty repoPrefix → all. +type ReleaseEnrichmentReader interface { + ReleaseRows(repoPrefix string) []ReleaseEnrichment +} + +// BlameEnrichment is one node's latest-author enrichment (change A), +// moved out of nodes.meta.
Timestamp is unix seconds. +type BlameEnrichment struct { + NodeID string + RepoPrefix string + Commit string + Email string + Timestamp int64 +} + +// BlameEnrichmentWriter persists blame enrichment in a typed sidecar. +type BlameEnrichmentWriter interface { + BulkSetBlame(repoPrefix string, rows []BlameEnrichment) error + DeleteBlame(nodeIDs []string) error +} + +// BlameEnrichmentReader reads blame rows; empty repoPrefix → all. +type BlameEnrichmentReader interface { + BlameRows(repoPrefix string) []BlameEnrichment +} + +// EdgesByKindsScanner is an optional capability backends MAY +// implement to stream every edge whose Kind is in the supplied set, +// in a single backend round-trip. The fallback iterates AllEdges() +// Go-side and filters in process — on a disk backend AllEdges +// materialises every edge (~286k rows on the gortex workspace) for the +// edge-driven analyzers (channel_ops, pubsub, k8s_resources, +// kustomize, error_surface, …) that only care about a handful of +// kinds. The capability runs a single server-side query filtering +// by edge kind and ships back only the matching rows. +// +// The single-kind variant EdgesByKind already exists, but the +// analyzers in question typically need 2-5 kinds in one pass; firing +// EdgesByKind once per kind would issue N independent backend queries +// when the planner can naturally batch them with an IN-list. Calling +// EdgesByKinds with one kind is equivalent to EdgesByKind for that +// kind — backends should still prefer the IN-list path so the call +// site never branches on len(kinds). +// +// Empty kinds yields nothing — never a whole-table scan. Iterators +// stop when the consumer's yield returns false; implementations MUST +// honour early-stop so callers can break out of a search. +// +// Optional capability — analyzers fall back to per-kind EdgesByKind +// iteration when the backend doesn't implement it. 
+type EdgesByKindsScanner interface { + EdgesByKinds(kinds []EdgeKind) iter.Seq[*Edge] +} + +// NodesByKindsScanner is an optional capability backends MAY implement +// to fetch every node whose Kind is in the supplied set in a single +// backend round-trip. Replaces the AllNodes() + Go-side `if n.Kind != +// allowed` filter used by the metadata-oriented analyze handlers +// (todos, stale_code, stale_flags, ownership, coverage_gaps, +// coverage_summary, cgo_users, wasm_users, orphan_tables, +// unreferenced_tables). Each of those scans the entire node table just +// to keep one or two kinds — on a disk backend that's ~70k rows on +// the gortex workspace per call. The capability runs a single +// server-side query filtering by node kind and ships only the +// matching rows. +// +// Why a separate kinds-IN scanner instead of looping the existing +// NodesByKind iterator per kind: on a disk backend NodesByKind is one +// query per call. Looping it for {function, method} doubles the round-trip +// count and rebuilds the row decoder for each pass. One IN-list query +// returns the union directly. The dedup is intentional — duplicated +// kinds in the input never reach the IN-list, matching the in-memory +// reference's behaviour. +// +// Optional capability — handlers fall back to AllNodes-driven scanning +// when the backend doesn't implement it. Empty kinds returns nil +// without touching the backend. +type NodesByKindsScanner interface { + NodesByKinds(kinds []NodeKind) []*Node +} + +// EdgeAdjacencyForKinds is an optional capability backends MAY +// implement to stream (from, to) id pairs for every edge whose Kind +// is in the supplied edge-kind set AND whose endpoints both belong +// to the supplied node-kind set. 
The shape covers the betweenness / +// centrality adjacency build that today calls EdgesByKinds and +// filters Go-side: on a disk backend the per-edge row carries ~10 string +// columns, multiplied by ~286k edges on the gortex +// workspace, just for a build that uses only From/To. The +// capability returns a 2-column projection from a single server-side +// join — every endpoint kind is enforced by the planner, so neither +// the cross-kind edges nor the irrelevant columns ever leave the backend. +// +// Empty edgeKinds or empty nodeKinds yields nothing — never a +// whole-table scan. Iterators stop when the consumer's yield +// returns false; implementations MUST honour early-stop. +// +// Optional capability — analyzers fall back to EdgesByKinds when +// the backend doesn't implement it. +type EdgeAdjacencyForKinds interface { + EdgeAdjacencyForKinds(edgeKinds []EdgeKind, nodeKinds []NodeKind) iter.Seq[[2]string] +} + +// CommunityCrossingsByKind is an optional capability backends MAY +// implement to return per-source crossing counts for edges whose +// Kind is in the supplied set, given a node→community membership +// map. A "crossing" is an edge whose source community differs from +// its target community; the count is keyed by source id. +// +// Replaces the FindHotspots.countCrossings loop that today iterates +// EdgesByKind twice and tallies per-source Go-side: on the gortex +// workspace the two EdgesByKind passes materialised the full call / +// reference bucket (~286k rows × ~10 columns) just to +// derive a thousand-row aggregate. The capability ships only the +// (from, to) projection — the community comparison runs Go-side +// because the community map isn't a Node column today. +// +// Empty kinds or an empty community map returns nil. The map keys +// in the result MUST be source ids whose count is non-zero — +// implementations MUST drop zero-count rows so callers can probe +// existence without a >0 check. 
+// +// Optional capability — analyzers fall back to EdgesByKind iteration +// when the backend doesn't implement it. +type CommunityCrossingsByKind interface { + CommunityCrossingsByKind(kinds []EdgeKind, nodeToComm map[string]string) map[string]int +} + +// NodeIDsByKinds is an optional capability backends MAY implement +// to return just the IDs of nodes whose Kind is in the supplied +// set. Replaces NodesByKinds in ranking paths (betweenness, +// hotspots) that only need to iterate ids — the full *Node carries +// ~10 string columns over cgo per row, and the candidate set is +// thousands of function/method rows, so the projection drops the +// per-call cgo allocation count by an order of magnitude. +// +// Empty kinds returns nil without touching the backend. Duplicated +// input kinds must NOT duplicate the output — backends MUST dedup +// the kind set in the IN-list. +// +// Optional capability — callers fall back to NodesByKinds when the +// backend doesn't implement it. +type NodeIDsByKinds interface { + NodeIDsByKinds(kinds []NodeKind) []string +} + +// EdgeKindCounter is an optional capability backends MAY implement +// to return one row per distinct edge kind with its occurrence +// count, server-side. Used by handleGetSurprisingConnections to +// derive the "rare kinds" set (kinds whose share of all edges is at +// or below the rare_kind_pct threshold) without materialising every +// edge over cgo just to bucket by Kind. On the gortex workspace the +// AllEdges() bucket pass was ~286k edges over cgo per call; the +// aggregator returns ~30 rows. +// +// The map's key is the EdgeKind; the value is the integer occurrence +// count. Empty graph returns nil (or an empty map — callers MUST +// treat both as "no rare kinds detected"). +// +// Optional capability — handleGetSurprisingConnections falls back +// to the AllEdges-driven kind bucketing when the backend doesn't +// implement it. 
+type EdgeKindCounter interface { + EdgeKindCounts() map[EdgeKind]int +} + +// CrossRepoEdgeRow is one tuple returned by CrossRepoEdgeAggregator. +// Kind is the cross_repo_* edge kind verbatim. FromRepo / ToRepo +// are the source / target node's RepoPrefix; Count is the number of +// underlying edges that share the triple. +type CrossRepoEdgeRow struct { + Kind EdgeKind + FromRepo string + ToRepo string + Count int +} + +// CrossRepoEdgeAggregator is an optional capability backends MAY +// implement to return pre-grouped cross-repo edge counts. Used by +// the get_architecture handler's cross_repo rollup, which previously +// scanned AllEdges() + per-edge GetNode(from)+GetNode(to) just to +// emit one row per (kind, from_repo, to_repo). On the gortex +// workspace that meant ~286k edge rows + ~thousands of GetNode +// round-trips for typically <100 cross-repo rows. The +// aggregator runs one server-side GROUP BY and ships only the surviving +// per-triple counts. +// +// Cross-repo edges are identified by graph.BaseKindForCrossRepo — +// the disk implementation MUST use the same kind list (so single- +// repo graphs return an empty slice, not a whole-graph scan). +// +// Optional capability — handleGetArchitecture falls back to the +// AllEdges-driven loop when the backend doesn't implement it. +type CrossRepoEdgeAggregator interface { + CrossRepoEdgeCounts() []CrossRepoEdgeRow +} + +// FileImportCountRow is one tuple returned by FileImportAggregator. +// FilePath is the imported file path (the target node's FilePath, or +// the target node's ID when the indexer pointed the import edge at +// the file node directly). Count is the number of distinct EdgeImports +// edges whose To resolves to that path. +type FileImportCountRow struct { + FilePath string + Count int +} + +// FileImportAggregator is an optional capability backends MAY +// implement to return per-target-file incoming-imports counts in +// one backend round-trip. 
Used by mostImportedFiles (shared between +// get_repo_outline and suggest_queries) which previously scanned +// AllEdges() + per-edge GetNode(to) just to bucket counts by path. +// On the gortex workspace that loop materialised ~286k edges + per- +// edge GetNode round-trips to produce a top-10 list. The +// aggregator GROUPs server-side and ships the per-file counts only. +// +// scope, when non-nil, bounds the counted edges to those whose target +// node ID lies in the slice (session-workspace clamp). An empty (but +// non-nil) scope returns nil — never a whole-graph scan. A nil scope +// means "no clamp" and counts every imports edge. +// +// Optional capability — mostImportedFiles falls back to the +// AllEdges-driven loop when the backend doesn't implement it. +type FileImportAggregator interface { + FileImportCounts(scope []string) []FileImportCountRow +} + +// InDegreeForNodes is an optional capability backends MAY implement to +// return the per-target incoming-edge count for the given node id set +// in one backend round-trip. Unlike InEdgeCounter (which filters by +// edge kind across the WHOLE graph), this counter is scoped to a +// caller-supplied id set and counts EVERY incoming edge regardless of +// kind. handleGetSurprisingConnections needs both the hub heuristic +// and the per-edge anomaly walk, but the hub check only cares about +// nodes already inside the session-scoped working set; counting every +// edge across the table just to bucket by `To` materialises the entire +// edge column (~286k rows on a disk backend). +// +// Empty ids returns nil — never a whole-table scan. Targets with zero +// matching in-edges may be absent from the returned map (callers index +// with `m[id]` and treat zero as the default). +// +// Optional capability — handleGetSurprisingConnections falls back to +// the AllEdges-driven bucketing when the backend doesn't implement it. 
+type InDegreeForNodes interface { + InDegreeForNodes(ids []string) map[string]int +} + +// ReachableForwardByKinds is an optional capability backends MAY +// implement to compute the set of node IDs reachable from the seed +// frontier via outgoing edges whose Kind is in the supplied set, in +// one backend round-trip. The Go fallback runs a layer-by-layer BFS +// firing GetOutEdges per node — on a disk backend that's N+1 round-trips +// where N is the transitive frontier size; on a 100k-symbol repo with +// a few thousand test functions the BFS easily issues tens of +// thousands of edge fetches. +// +// reachableFromTests in handleGetUntestedSymbols is the primary +// caller: seeds are every function/method in a test file, kinds are +// {calls, references}, and the result is the closed set of symbols +// covered transitively by the test surface. The capability runs one +// variable-length match expression and ships the closure back as a +// single id list. +// +// Empty seeds returns nil; an empty kinds set returns the seed set +// unchanged (no edges to traverse). The returned map keys are the +// reachable node IDs (including the seeds); the bool value is always +// true — the shape mirrors the in-memory implementation's covered set +// so the caller's index expression stays identical. +// +// Optional capability — reachableFromTests falls back to the +// per-layer GetOutEdges BFS when the backend doesn't implement it. +type ReachableForwardByKinds interface { + ReachableForwardByKinds(seeds []string, kinds []EdgeKind) map[string]bool +} + +// ThrowerErrorRow is one tuple returned by ThrowerErrorSurfacer. ThrowerID +// is the symbol that originates the EdgeThrows edges; ErrorTargets is the +// distinct set of error-type node IDs the thrower reaches via EdgeThrows; +// ErrorMsgs is the distinct set of literal error-message strings the +// thrower emits (KindString nodes with meta.context = "error_msg", linked +// by EdgeEmits). 
Throws is the count of underlying EdgeThrows edges (one +// thrower may raise the same target multiple times from different sites). +// FilePath / Line are the row metadata the legacy handler propagated from +// the first edge / falling back to the thrower node — they ride here so +// the analyzer never has to issue a follow-up GetNode lookup. +type ThrowerErrorRow struct { + ThrowerID string + FilePath string + Line int + Throws int + ErrorTargets []string + ErrorMsgs []string +} + +// ThrowerErrorSurfacer is an optional capability backends MAY implement +// to evaluate the analyze(error_surface) rollup entirely inside the +// storage layer. The Go fallback walks EdgeThrows once for the per- +// thrower aggregation, then issues GetOutEdges per surviving thrower +// to attach the literal error-message strings. On a disk backend that's +// two scans of the edge table plus an N+1 loop for the per-thrower +// emit walk; the capability runs two server-side GROUP BYs and ships the +// pre-shaped rows back. +// +// pathPrefix narrows the EdgeThrows rows by their stored FilePath +// prefix; an empty prefix means "every thrower". Returned rows are +// already deduplicated per (thrower, error_target) and per (thrower, +// error_msg) — callers feed them directly into the analyzer's sort / +// truncate path without further bucketing. +// +// Optional capability — handleAnalyzeErrorSurface falls back to the +// AllEdges-driven loop when the backend doesn't implement it. +type ThrowerErrorSurfacer interface { + ThrowerErrorSurface(pathPrefix string) []ThrowerErrorRow +} + +// MemberMethodInfo is one row of the MemberMethodsByType projection. 
+// MethodID is the method node's id; Name is its name (the key the +// InferImplements method-set check compares against); FilePath / +// StartLine are the source coordinates InferOverrides stamps on the +// EdgeOverrides edge it emits; RepoPrefix lets consumers +// (ResolveGRPCStubCalls' pickGRPCHandler) tie-break on same-repo +// without a follow-up GetNode. +type MemberMethodInfo struct { + MethodID string + Name string + FilePath string + StartLine int + RepoPrefix string +} + +// MemberMethodsByType is an optional capability backends MAY implement +// to return the typeID → []MemberMethodInfo projection of every +// EdgeMemberOf edge whose source is a KindMethod node, in one backend +// round-trip. Replaces the InferImplements / InferOverrides Pass 1 +// pattern of EdgesByKind(EdgeMemberOf) followed by per-edge +// GetNode(e.From) to filter on Kind == KindMethod and read the +// method's columns. On a disk backend that loop is N+1 round-trips: +// each method GetNode pulls ~10 string columns + the Meta blob just to +// read four scalar fields. The capability runs a single server-side +// join and ships only the four method columns the resolver +// actually consumes. +// +// Empty graph returns nil; types with no method members are absent +// from the result. The returned slice's elements are unique per +// MethodID — duplicated (typeID, methodID) pairs (a method +// member-of'd twice) collapse to one row. +// +// Optional capability — InferImplements / InferOverrides fall back to +// the per-edge GetNode walk when the backend doesn't implement it. +type MemberMethodsByType interface { + MemberMethodsByType() map[string][]MemberMethodInfo +} + +// StructuralParentEdgeRow is one tuple returned by StructuralParentEdges. +// FromID / ToID are the child / parent node IDs verbatim. FromKind / +// ToKind let the consumer apply the (Type | Interface) gate without a +// follow-up GetNode. 
Origin is the edge's resolution-tier label, which +// drives override-edge origin selection in InferOverrides. +type StructuralParentEdgeRow struct { + FromID string + ToID string + FromKind NodeKind + ToKind NodeKind + Origin string +} + +// StructuralParentEdges is an optional capability backends MAY +// implement to return every EdgeExtends / EdgeImplements / EdgeComposes +// edge whose endpoints are both KindType / KindInterface, projected as +// (FromID, ToID, FromKind, ToKind, Origin) in one backend round-trip. +// Replaces the InferOverrides Pass 2 pattern of g.AllEdges() followed +// by per-edge GetNode(e.From) + GetNode(e.To) to apply the kind gate. +// On a disk backend the AllEdges scan materialises every edge (~286k +// on the gortex workspace) plus issues two per-edge node lookups; the +// capability runs one server-side join with kind filters on both sides +// and ships only the surviving rows back (typically a small fraction of +// the edge table). +// +// Empty graph returns nil. Rows from extends/implements/composes edges +// whose endpoints aren't both type/interface are filtered server-side +// — the consumer never has to gate them again. +// +// Optional capability — InferOverrides falls back to the AllEdges + +// per-edge GetNode walk when the backend doesn't implement it. +type StructuralParentEdges interface { + StructuralParentEdges() []StructuralParentEdgeRow +} + +// CrossRepoCandidateRow is one tuple returned by CrossRepoCandidates. +// Edge is the underlying base-kind edge verbatim — the consumer +// rewrites Edge.CrossRepo on it and emits a parallel cross_repo_* edge. +// FromRepo / ToRepo are the (already-distinct) source and target +// RepoPrefix values projected from the endpoint nodes. 
+type CrossRepoCandidateRow struct { + Edge *Edge + FromRepo string + ToRepo string +} + +// CrossRepoCandidates is an optional capability backends MAY implement +// to return every edge whose Kind has a parallel cross_repo_* kind AND +// whose endpoints carry two different non-empty RepoPrefix values, in +// one backend round-trip. Replaces the DetectCrossRepoEdges pattern of +// g.AllEdges() + per-edge GetNode(e.From) + GetNode(e.To) to extract +// the RepoPrefix pair. On a disk backend the AllEdges scan ships every +// edge in the graph plus issues two GetNode lookups per surviving +// row; the capability filters by edge kind + the repo-prefix mismatch +// server-side and ships only the surviving rows (typically a small +// fraction of the edge table on a multi-repo workspace). +// +// baseKinds is the set of edge kinds for which a CrossRepoKindFor +// mapping exists — the caller passes the list and the implementation +// MUST use exactly that set in the IN-list, so a single-repo graph +// (or a graph whose nodes carry no RepoPrefix) returns no rows. +// +// Optional capability — DetectCrossRepoEdges falls back to the +// AllEdges + per-edge GetNode loop when the backend doesn't implement +// it. +type CrossRepoCandidates interface { + CrossRepoCandidates(baseKinds []EdgeKind) []CrossRepoCandidateRow +} + +// ExtractCandidateRow is one tuple returned by ExtractCandidatesScanner. +// Caller / FanOut counts are distinct-by-endpoint (one caller counted +// once per (From, kind) pair, one callee counted once per (To, kind) +// pair) restricted to the call-like edge kinds the consumer cares +// about. LineCount is EndLine - StartLine + 1; rows whose StartLine or +// EndLine is zero are filtered server-side. 
+type ExtractCandidateRow struct { + NodeID string + Name string + FilePath string + StartLine int + EndLine int + LineCount int + CallerCount int + FanOut int +} + +// ExtractCandidatesScanner is an optional capability backends MAY +// implement to compute the get_extraction_candidates ranking in two +// server-side round-trips (per-node caller-count and fan-out aggregation +// joined to the node table). Replaces the AllNodes() scan + per-node +// GetInEdges / GetOutEdges loop the handler used previously — on the +// gortex workspace that was ~30k node × 2 trips per call, where +// each trip materialised the full edge bucket just to count +// distinct endpoints. The capability instead runs the count +// (DISTINCT-by-endpoint) inside the engine and ships only the rows +// that satisfy the three threshold gates. +// +// Empty kinds yields nothing — the handler always passes a non-empty +// set (EdgeCalls + EdgeCrossRepoCalls). pathPrefix narrows the scan to +// nodes under that file-path prefix; empty matches every path. The +// returned rows mirror the result of the Go-side loop verbatim: +// thresholds applied, line_count = EndLine - StartLine + 1. +// +// Optional capability — handleGetExtractionCandidates falls back to +// the AllNodes scan when the backend doesn't implement it. +type ExtractCandidatesScanner interface { + ExtractCandidates( + kinds []EdgeKind, + minLines, minCallers, minFanOut int, + pathPrefix string, + ) []ExtractCandidateRow +} + +// FileSymbolNameRow is one tuple returned by FileSymbolNamesByPaths. +// FilePath echoes the input slot; Name is one symbol name observed in +// the file (function / method / type / interface kinds only, matching +// symbolNamesInFile's Go-side filter). One file may produce many rows. 
+type FileSymbolNameRow struct { + FilePath string + Name string +} + +// FileSymbolNamesByPaths is an optional capability backends MAY +// implement to fetch the sorted distinct (file → function/method/type +// names) projection for a slice of file paths in one backend round- +// trip. Replaces the per-file GetFileNodes loop find_co_changing_symbols +// runs after a positive cochange match: 20 result rows × one +// per-file query each on a disk backend. The capability runs a single +// query filtering by file path and kind with an IN-list, and ships +// one row per (file, name). +// +// Empty paths returns nil — never a whole-table scan. Rows for paths +// with no qualifying symbols are absent from the result; callers +// always index by file path and treat missing keys as "no names". +// +// Optional capability — symbolNamesInFile and its callers fall back to +// the per-file GetFileNodes loop when the backend doesn't implement +// it. +type FileSymbolNamesByPaths interface { + FileSymbolNamesByPaths(paths []string, kinds []NodeKind) []FileSymbolNameRow +} + +// ClassHierarchyRow is one tuple returned by ClassHierarchyTraverser. +// Path carries the node IDs visited from the seed (exclusive of the +// seed) out to the terminal node, in BFS order. EdgeKinds carries the +// per-hop edge kind so the caller can reconstruct the *Edge values. +// For a single hop Path has one element and EdgeKinds has one element; +// for a depth-N walk both slices have length N. +type ClassHierarchyRow struct { + Path []string + EdgeKinds []EdgeKind +} + +// ClassHierarchyTraverser is an optional capability backends MAY +// implement to compute the inheritance subgraph rooted at a seed in +// one (or two — up + down) variable-length traversals, server- +// side. 
Replaces the BFS in query.ClassHierarchy: each frontier node +// fired GetNode + GetInEdges or GetOutEdges per visit on a disk +// backend, so a depth-5 walk over an interface with a wide implementer +// set burned hundreds of round-trips just to discover ~50 edges. +// +// kinds is the edge-kind set the walk consumes (EdgeExtends + +// EdgeImplements + EdgeComposes + EdgeOverrides). depth caps the hop +// budget. direction: +// - "up" — follow outgoing edges from each frontier node. +// - "down" — follow incoming edges into each frontier node. +// +// Empty kinds / depth <= 0 / unknown seed returns nil. The returned +// rows are deduplicated by (Path[-1], last EdgeKind) — the consumer +// reconstructs the visited node set and the edge list from them. +// +// Optional capability — query.ClassHierarchy falls back to the BFS +// when the backend doesn't implement it. +type ClassHierarchyTraverser interface { + ClassHierarchyTraverse( + seedID string, + direction string, + kinds []EdgeKind, + depth int, + ) []ClassHierarchyRow +} + +// FileEditingContextResult is the get_editing_context payload (defines + +// imports + 1-hop callers + 1-hop callees, all for one file) returned by +// FileEditingContext, an optional capability backends MAY implement to +// assemble it in a small fixed number of server-side round-trips. Replaces +// the handler's per-symbol GetCallers / GetCallChain loop — for a file with +// 30 functions that fired 60 query-engine entry points on a disk backend. +// +// kinds is the set of node kinds the caller treats as call-targets +// (KindFunction + KindMethod). The capability returns FileNode (the +// file row), Defines (every non-file node anchored to the path, +// signature carried through Meta), Imports (the EdgeImports out-edges +// of the file node), CalledBy (one-hop callers of any defines node, +// filtered to symbols outside the file), and Calls (one-hop callees of +// any defines node, filtered to symbols outside the file). 
All five +// projections are scoped to the input file in one round-trip each. +// +// Optional capability — handleGetEditingContext falls back to the +// per-symbol loop when the backend doesn't implement it. +type FileEditingContextResult struct { + FileNode *Node + Defines []*Node + Imports []*Edge + CalledBy []*Node + Calls []*Node +} + +type FileEditingContext interface { + FileEditingContext(filePath string, kinds []NodeKind) *FileEditingContextResult +} + +// FileSubGraphReader is an optional capability backends MAY implement +// to return the full file neighbourhood — the file node, every node +// defined in or contained by it, and every adjacent edge — in a +// single backend round-trip. +// +// On the in-memory backend the per-id GetOutEdges / GetInEdges loop +// is already O(1) per node, so the query.Engine.GetFileSymbols +// fallback wraps it. On a disk backend the same loop is +// O(file_symbols) round-trips — ~547 symbols on a real file fanned +// out into ~5 000 round-trips just to dedup edges in Go. The +// capability lets the backend express the walk as a single server-side +// query over the node and edge indexes. +// +// Returned slices are deduplicated by the implementation. Missing +// file returns (nil, nil); empty file (file node only, no symbols) +// returns ([file], nil). Callers that need the symbols-only view +// strip KindFile + KindImport on top (see +// internal/mcp/tools_core.go::stripFileAndImportNodes). +// +// Optional capability — query.Engine.GetFileSymbols falls back to +// GetFileNodes + GetOut/InEdgesByNodeIDs when the backend doesn't +// implement it. +type FileSubGraphReader interface { + GetFileSubGraph(filePath string) (nodes []*Node, edges []*Edge) +} + +// FrontierHop is one (edge, neighbour) pair from a FrontierExpander: an +// edge adjacent to a queried source node plus the node at its far end, +// with the neighbour's columns populated and Meta left nil (traversal +// callers don't read it). 
It lets a BFS record the edge and +// scope-check / materialise the neighbour without a GetNode per edge. +type FrontierHop struct { + Edge *Edge + Neighbor *Node +} + +// FrontierExpander is an optional backend capability: given a set of +// source node IDs it returns, in a single round-trip, their adjacent +// edges of the requested kinds plus the neighbour nodes — the +// node-edge-node projection a BFS frontier needs. forward=true follows +// outgoing edges (neighbour = edge target); forward=false follows +// incoming (neighbour = edge source). kinds must be non-empty (the +// directed-traversal contract). limit derives a deterministic per-call +// row cap so a hub node's fan-out can no longer be dragged across the +// boundary in full. +// +// query.Engine.bfs uses it when the reader implements it (the disk +// store) and falls back to per-node GetOutEdges/GetInEdges + GetNode +// otherwise — the in-memory graph needs no batching (its reads are O(1)). +type FrontierExpander interface { + ExpandFrontier(ids []string, forward bool, kinds []EdgeKind, limit int) []FrontierHop +} + +// FileSubGraphCountReader is the count-only sibling of +// FileSubGraphReader: returns the file's nodes plus the number of +// distinct edges adjacent to any of them, without materialising the +// edges themselves. +// +// The disk-backend headline cost for get_file_summary on a 500-symbol +// file was the ~4 000-row crossing to ship every adjacent edge back to +// Go. The gcx and compact output paths emit a total_edges scalar in +// their meta headers and never per-edge rows, but only gcx needs nothing +// else from the edges — so handleGetFileSummary routes gcx through this +// method and skips the row materialisation entirely. The json path keeps +// the full GetFileSubGraph call because it serialises every edge in the +// body; the compact path keeps it to summarise edges per confidence label. 
+// +// On the in-memory backend the per-node edge bucket lookups are +// already O(1), so its implementation just counts via the same path +// GetFileSubGraph walks; the win is on disk backends. +// +// Optional capability — query.Engine.GetFileSymbolsCounts falls back +// to len(GetFileSubGraph().edges) when the backend doesn't implement +// it. +type FileSubGraphCountReader interface { + GetFileSubGraphCounts(filePath string) (nodes []*Node, edgeCount int) +} + +// NodeDegreeByKinds is an optional capability backends MAY implement +// to return per-node total in/out edge counts for every node whose +// kind is in the supplied set, server-side. Replaces the +// get_knowledge_gaps pattern of "give me all functions, then ask for +// their in/out degree" — on a disk backend that fed an IN-list of ~30k +// node IDs to the NodeDegreeCounts query, which has to compare every +// node against the list. The capability instead matches kinds at the +// source and groups by node — one query per direction with a kind +// predicate the planner can index. +// +// pathPrefix narrows the scan to nodes under that file-path prefix; +// empty matches every path. Empty kinds returns nil (never a whole- +// graph scan). +// +// The returned rows mirror NodeDegreeRow's shape but UsageInCount is +// always 0 — knowledge_gaps does not need the usage subset, only the +// total degree. Adding the usage filter back would re-tie the +// capability to ClassifyZeroEdge's notion of "alive" without buying +// any other call site. +// +// Optional capability — handleGetKnowledgeGaps falls back to the +// NodeDegreeCounts IN-list when the backend doesn't implement it. 
+type NodeDegreeByKinds interface { + NodeDegreeByKinds(kinds []NodeKind, pathPrefix string) []NodeDegreeRow +} diff --git a/internal/graph/store_sqlite/schema.go b/internal/graph/store_sqlite/schema.go new file mode 100644 index 00000000..afb06fc8 --- /dev/null +++ b/internal/graph/store_sqlite/schema.go @@ -0,0 +1,166 @@ +package store_sqlite + +// schemaSQL is the canonical DDL applied on Open. Statements are +// idempotent (IF NOT EXISTS) so they run cleanly against a fresh DB +// and against an existing one. +// +// Schema choices +// +// - nodes.id is the primary key; INSERT OR REPLACE on the id column +// gives idempotent re-adds with last-write-wins on every other +// column, matching the in-memory store's behaviour. +// +// - edges has a synthetic INTEGER PRIMARY KEY plus a UNIQUE +// constraint over (from_id, to_id, kind, file_path, line) -- the +// logical edge key the in-memory store uses for dedup. INSERT OR +// IGNORE on that constraint matches the in-memory "second AddEdge +// for the same key is a no-op" semantics. +// +// - meta is a gob-encoded blob. nil / empty Meta is stored as NULL. 
+// +// - Secondary indexes mirror the in-memory store's hot lookup paths: +// nodes_by_name -- FindNodesByName / FindNodesByNameInRepo +// nodes_by_kind -- Stats (group-by-kind) +// nodes_by_file -- GetFileNodes, EvictFile +// nodes_by_repo -- GetRepoNodes, RepoStats, EvictRepo +// (partial index -- empty repo_prefix is +// the common case and indexing it would +// be pure overhead) +// nodes_by_qual -- GetNodeByQualName, unique so duplicate +// qual_names conflict. NOTE(review): INSERT OR REPLACE resolves that conflict by deleting the older row, not by erroring — confirm +// edges_by_from -- GetOutEdges (kind included so RemoveEdge +// can probe by (from, kind) without a +// second hop) +// edges_by_to -- GetInEdges +const schemaSQL = ` +CREATE TABLE IF NOT EXISTS nodes ( + id TEXT PRIMARY KEY, + kind TEXT NOT NULL, + name TEXT NOT NULL, + qual_name TEXT NOT NULL DEFAULT '', + file_path TEXT NOT NULL, + start_line INTEGER NOT NULL DEFAULT 0, + end_line INTEGER NOT NULL DEFAULT 0, + language TEXT NOT NULL DEFAULT '', + repo_prefix TEXT NOT NULL DEFAULT '', + workspace_id TEXT NOT NULL DEFAULT '', + project_id TEXT NOT NULL DEFAULT '', + meta BLOB +) WITHOUT ROWID; + +CREATE INDEX IF NOT EXISTS nodes_by_name ON nodes(name); +CREATE INDEX IF NOT EXISTS nodes_by_kind ON nodes(kind); +CREATE INDEX IF NOT EXISTS nodes_by_file ON nodes(file_path); +CREATE INDEX IF NOT EXISTS nodes_by_repo ON nodes(repo_prefix) WHERE repo_prefix <> ''; +CREATE UNIQUE INDEX IF NOT EXISTS nodes_by_qual ON nodes(qual_name) WHERE qual_name <> ''; + +CREATE TABLE IF NOT EXISTS edges ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + from_id TEXT NOT NULL, + to_id TEXT NOT NULL, + kind TEXT NOT NULL, + file_path TEXT NOT NULL DEFAULT '', + line INTEGER NOT NULL DEFAULT 0, + confidence REAL NOT NULL DEFAULT 1.0, + confidence_label TEXT NOT NULL DEFAULT '', + origin TEXT NOT NULL DEFAULT '', + tier TEXT NOT NULL DEFAULT '', + cross_repo INTEGER NOT NULL DEFAULT 0, + meta BLOB, + UNIQUE(from_id, to_id, kind, file_path, line) +); + +CREATE INDEX IF NOT EXISTS edges_by_from ON edges(from_id, 
kind); +CREATE INDEX IF NOT EXISTS edges_by_to ON edges(to_id, kind); + +CREATE TABLE IF NOT EXISTS file_mtimes ( + repo_prefix TEXT NOT NULL, + file_path TEXT NOT NULL, + mtime_ns INTEGER NOT NULL, + PRIMARY KEY (repo_prefix, file_path) +) WITHOUT ROWID; + +-- clone_shingles is the per-symbol MinHash shingle-set sidecar. Each +-- function/method node's []uint64 shingle set is stored as a little- +-- endian BLOB (8 bytes/elem) keyed by node_id so the maintained clone- +-- detection count-min sketch can be rebuilt after a warm restart from +-- the snapshot instead of re-parsing every body. repo_prefix carries +-- the owning repo so per-repo reseeds (SELECT … WHERE repo_prefix = ?) +-- and per-repo wipes don't clobber other repos' shingle sets. node_id +-- is the PK (the join key back to nodes.id); like file_mtimes this is a +-- WITHOUT ROWID sidecar so the PK index IS the table. +CREATE TABLE IF NOT EXISTS clone_shingles ( + node_id TEXT PRIMARY KEY, + repo_prefix TEXT NOT NULL DEFAULT '', + shingles BLOB +) WITHOUT ROWID; + +CREATE TABLE IF NOT EXISTS vectors ( + node_id TEXT PRIMARY KEY, + dims INTEGER NOT NULL, + vec BLOB NOT NULL +) WITHOUT ROWID; + +-- churn_enrichment is the per-node git-churn sidecar (change A: move +-- enrichment OUT of nodes.meta so the node hot path stops gob-encoding +-- rarely-read data and get_churn_rate does an indexed read instead of an +-- AllNodes+gob scan). One typed row per enriched file/function/method +-- node, keyed by node_id (join key back to nodes.id); repo_prefix scopes +-- per-repo reseeds/wipes. head_sha/branch/computed_at are file-level only +-- (empty for symbols). WITHOUT ROWID: the PK index IS the table. 
+CREATE TABLE IF NOT EXISTS churn_enrichment ( + node_id TEXT PRIMARY KEY, + repo_prefix TEXT NOT NULL DEFAULT '', + commit_count INTEGER NOT NULL DEFAULT 0, + age_days INTEGER NOT NULL DEFAULT 0, + churn_rate REAL NOT NULL DEFAULT 0, + last_author TEXT NOT NULL DEFAULT '', + last_commit_at TEXT NOT NULL DEFAULT '', + head_sha TEXT NOT NULL DEFAULT '', + branch TEXT NOT NULL DEFAULT '', + computed_at TEXT NOT NULL DEFAULT '' +) WITHOUT ROWID; +CREATE INDEX IF NOT EXISTS churn_by_repo ON churn_enrichment(repo_prefix) WHERE repo_prefix <> ''; + +-- coverage_enrichment: per-symbol coverage sidecar (change A). Typed +-- columns keyed by node_id; repo_prefix scopes per-repo wipes. +CREATE TABLE IF NOT EXISTS coverage_enrichment ( + node_id TEXT PRIMARY KEY, + repo_prefix TEXT NOT NULL DEFAULT '', + coverage_pct REAL NOT NULL DEFAULT 0, + num_stmt INTEGER NOT NULL DEFAULT 0, + hit INTEGER NOT NULL DEFAULT 0 +) WITHOUT ROWID; +CREATE INDEX IF NOT EXISTS coverage_by_repo ON coverage_enrichment(repo_prefix) WHERE repo_prefix <> ''; + +-- release_enrichment: per-file "added_in " sidecar (change A). +CREATE TABLE IF NOT EXISTS release_enrichment ( + node_id TEXT PRIMARY KEY, + repo_prefix TEXT NOT NULL DEFAULT '', + added_in TEXT NOT NULL DEFAULT '' +) WITHOUT ROWID; +CREATE INDEX IF NOT EXISTS release_by_repo ON release_enrichment(repo_prefix) WHERE repo_prefix <> ''; + +-- blame_enrichment: per-symbol latest-author sidecar (change A). +CREATE TABLE IF NOT EXISTS blame_enrichment ( + node_id TEXT PRIMARY KEY, + repo_prefix TEXT NOT NULL DEFAULT '', + commit_sha TEXT NOT NULL DEFAULT '', + email TEXT NOT NULL DEFAULT '', + ts INTEGER NOT NULL DEFAULT 0 +) WITHOUT ROWID; +CREATE INDEX IF NOT EXISTS blame_by_repo ON blame_enrichment(repo_prefix) WHERE repo_prefix <> ''; + +-- symbol_fts is the FTS5 full-text index over pre-tokenised symbol +-- names. 
It replaces the multi-GB in-heap Bleve/BM25 index with an +-- on-disk inverted index the SymbolSearcher / SymbolBundleSearcher +-- query through. A standard (NOT contentless) FTS5 table so we can +-- DELETE individual rows by node_id without an external content +-- shadow. node_id is the join key back to nodes.id; repo_prefix is +-- carried UNINDEXED so per-repo staleness wipes (DELETE … WHERE +-- repo_prefix = ?) hit a literal column without a separate b-tree. +-- Only "tokens" is indexed for matching. IF NOT EXISTS makes this +-- idempotent on every Open, so an existing .sqlite gains the vtable +-- on its next open + reindex. +CREATE VIRTUAL TABLE IF NOT EXISTS symbol_fts USING fts5(node_id UNINDEXED, repo_prefix UNINDEXED, tokens); +` diff --git a/internal/graph/store_sqlite/store.go b/internal/graph/store_sqlite/store.go new file mode 100644 index 00000000..ee8ccc9d --- /dev/null +++ b/internal/graph/store_sqlite/store.go @@ -0,0 +1,1341 @@ +// Package store_sqlite is the on-disk, SQLite-backed implementation of +// graph.Store. It uses the pure-Go modernc.org/sqlite driver so the +// binary stays CGO-free on this code path, and satisfies the same +// conformance suite as the in-memory store (see +// internal/graph/storetest). +// +// Hot queries are precompiled as prepared statements in Open and +// closed in Close. Writes serialize through a single Go-side mutex +// because SQLite already serialises writers internally and an explicit +// mutex sidesteps SQLITE_BUSY contention when the conformance suite +// fans out 8 concurrent writers; reads still run concurrently under +// WAL mode. +// +// Meta maps are encoded with gob; an empty / nil Meta is stored as +// NULL so the common case adds no row weight beyond the column header. 
+// +// EdgeIdentityRevisions is tracked in memory (atomic counter) -- it +// mirrors the in-memory store's monotonic "provenance churn" signal +// and does not need to survive process restarts (the in-memory store +// resets it on every New(), so the contract is per-process). +package store_sqlite + +import ( + "bytes" + "database/sql" + "encoding/gob" + "errors" + "fmt" + "iter" + "runtime" + "strings" + "sync" + "sync/atomic" + + "github.com/zzet/gortex/internal/graph" + + _ "modernc.org/sqlite" +) + +// Store is the SQLite-backed graph.Store implementation. +type Store struct { + db *sql.DB + + // writeMu serialises every mutation. SQLite serialises writers + // internally; doing the same on the Go side turns SQLITE_BUSY + // contention into clean lock-wait and keeps the conformance + // concurrency test predictable. + writeMu sync.Mutex + + // resolveMu is the resolver-coordination mutex returned by + // ResolveMutex. Held by cross-repo / temporal / external resolver + // passes to keep their edge mutations from interleaving. Separate + // from writeMu so the resolver can hold it across multiple writes + // without blocking unrelated steady-state mutations. + resolveMu sync.Mutex + + edgeIdentityRevs atomic.Int64 + + // Prepared statements (compiled once in Open, closed in Close). 
+ stmtInsertNode *sql.Stmt + stmtGetNode *sql.Stmt + stmtGetNodeByQual *sql.Stmt + stmtFindByName *sql.Stmt + stmtFindByNameInRepo *sql.Stmt + stmtFileNodes *sql.Stmt + stmtRepoNodes *sql.Stmt + stmtAllNodes *sql.Stmt + stmtNodeCount *sql.Stmt + stmtRepoPrefixes *sql.Stmt + stmtRepoStatsNodes *sql.Stmt + stmtRepoStatsEdges *sql.Stmt + stmtRepoNodeCount *sql.Stmt + stmtRepoEdgeCount *sql.Stmt + stmtAllRepoCountsNodes *sql.Stmt + stmtAllRepoCountsEdges *sql.Stmt + stmtStatsByKind *sql.Stmt + stmtStatsByLanguage *sql.Stmt + + stmtInsertEdge *sql.Stmt + stmtOutEdges *sql.Stmt + stmtInEdges *sql.Stmt + stmtRepoEdges *sql.Stmt + stmtAllEdges *sql.Stmt + stmtEdgeCount *sql.Stmt + stmtRemoveEdge *sql.Stmt + stmtUpdateEdgeOrigin *sql.Stmt + stmtSelectEdgeOrigin *sql.Stmt + stmtDeleteEdgeByKey *sql.Stmt + + stmtSelectFileNodeIDs *sql.Stmt + stmtSelectRepoNodeIDs *sql.Stmt + stmtDeleteNodeByFile *sql.Stmt + stmtDeleteNodeByRepo *sql.Stmt +} + +// Compile-time assertion: *Store satisfies graph.Store. +var _ graph.Store = (*Store)(nil) + +// ResolveMutex returns the resolver-coordination mutex. Held by +// cross-repo / temporal / external resolver passes to serialise edge +// mutations. Separate from writeMu (which protects per-statement +// write serialisation against SQLITE_BUSY) so the resolver can hold +// it across multi-write batches without blocking unrelated steady- +// state mutations on the same store. +func (s *Store) ResolveMutex() *sync.Mutex { return &s.resolveMu } + +// Open opens (or creates) the SQLite database at path, runs the schema +// migration, and prepares hot statements. The DB is opened with WAL +// journaling and synchronous=NORMAL -- the same durability/throughput +// tradeoff every embedded-SQLite app uses for write-heavy workloads. +// +// Pass ":memory:" for an ephemeral in-process database (handy for +// tests when you don't need on-disk persistence). 
+func Open(path string) (*Store, error) { + // Pragmas: WAL + synchronous=NORMAL is the standard write-heavy + // embedded tradeoff. cache_size(-32768) gives each pooled connection a + // 32 MiB page cache; temp_store(MEMORY) keeps GROUP BY / ORDER BY scratch + // off disk; mmap_size(256 MiB) lets reads fault pages straight from the + // OS page cache instead of copying through SQLite's. These materially + // speed the resolver/query phases on a large graph. + dsn := path + "?_pragma=journal_mode(WAL)&_pragma=synchronous(NORMAL)&_pragma=busy_timeout(5000)&_pragma=foreign_keys(OFF)&_pragma=cache_size(-32768)&_pragma=temp_store(MEMORY)&_pragma=mmap_size(268435456)" + db, err := sql.Open("sqlite", dsn) + if err != nil { + return nil, fmt.Errorf("sqlite open: %w", err) + } + // Pool up to NumCPU connections so the resolver's parallel + // worker fan-out (NumCPU goroutines doing FindNodesByName / + // GetNode / GetOutEdges concurrently) doesn't serialise through + // a single connection — the dominant gap between the SQLite and + // bbolt backends on the bench's resolver stage was exactly that. + // SQLite's WAL mode allows concurrent readers across multiple + // connections; writes still serialise via writeMu on the Go + // side, then via SQLite's internal write lock. Every connection + // the pool opens picks up the journal-mode / synchronous / + // busy-timeout pragmas from the DSN above, so we don't need to + // pin one connection to "remember" them. + db.SetMaxOpenConns(runtime.NumCPU()) + + if _, err := db.Exec(schemaSQL); err != nil { + _ = db.Close() + return nil, fmt.Errorf("sqlite schema: %w", err) + } + + s := &Store{db: db} + if err := s.prepare(); err != nil { + _ = db.Close() + return nil, fmt.Errorf("sqlite prepare: %w", err) + } + return s, nil +} + +// Close closes every prepared statement and the underlying *sql.DB. 
+func (s *Store) Close() error { + stmts := []*sql.Stmt{ + s.stmtInsertNode, s.stmtGetNode, s.stmtGetNodeByQual, + s.stmtFindByName, s.stmtFindByNameInRepo, + s.stmtFileNodes, s.stmtRepoNodes, + s.stmtAllNodes, s.stmtNodeCount, s.stmtRepoPrefixes, + s.stmtRepoStatsNodes, s.stmtRepoStatsEdges, + s.stmtRepoNodeCount, s.stmtRepoEdgeCount, + s.stmtAllRepoCountsNodes, s.stmtAllRepoCountsEdges, + s.stmtStatsByKind, s.stmtStatsByLanguage, + s.stmtInsertEdge, s.stmtOutEdges, s.stmtInEdges, + s.stmtRepoEdges, + s.stmtAllEdges, s.stmtEdgeCount, s.stmtRemoveEdge, + s.stmtUpdateEdgeOrigin, s.stmtSelectEdgeOrigin, s.stmtDeleteEdgeByKey, + s.stmtSelectFileNodeIDs, s.stmtSelectRepoNodeIDs, + s.stmtDeleteNodeByFile, s.stmtDeleteNodeByRepo, + } + for _, st := range stmts { + if st != nil { + _ = st.Close() + } + } + return s.db.Close() +} + +func (s *Store) prepare() error { + var err error + prep := func(out **sql.Stmt, q string) { + if err != nil { + return + } + var st *sql.Stmt + st, err = s.db.Prepare(q) + if err != nil { + err = fmt.Errorf("prepare %q: %w", q, err) + return + } + *out = st + } + + const nodeCols = `id, kind, name, qual_name, file_path, start_line, end_line, language, repo_prefix, workspace_id, project_id, meta` + + prep(&s.stmtInsertNode, + `INSERT OR REPLACE INTO nodes (`+nodeCols+`) VALUES (?,?,?,?,?,?,?,?,?,?,?,?)`) + prep(&s.stmtGetNode, + `SELECT `+nodeCols+` FROM nodes WHERE id = ?`) + prep(&s.stmtGetNodeByQual, + `SELECT `+nodeCols+` FROM nodes WHERE qual_name = ? LIMIT 1`) + prep(&s.stmtFindByName, + `SELECT `+nodeCols+` FROM nodes WHERE name = ?`) + prep(&s.stmtFindByNameInRepo, + `SELECT `+nodeCols+` FROM nodes WHERE name = ? 
AND repo_prefix = ?`) + prep(&s.stmtFileNodes, + `SELECT `+nodeCols+` FROM nodes WHERE file_path = ?`) + prep(&s.stmtRepoNodes, + `SELECT `+nodeCols+` FROM nodes WHERE repo_prefix = ?`) + prep(&s.stmtAllNodes, + `SELECT `+nodeCols+` FROM nodes`) + prep(&s.stmtNodeCount, + `SELECT COUNT(*) FROM nodes`) + prep(&s.stmtRepoPrefixes, + `SELECT DISTINCT repo_prefix FROM nodes WHERE repo_prefix <> ''`) + + prep(&s.stmtRepoStatsNodes, + `SELECT repo_prefix, kind, language, COUNT(*) FROM nodes WHERE repo_prefix <> '' GROUP BY repo_prefix, kind, language`) + prep(&s.stmtRepoStatsEdges, + `SELECT n.repo_prefix, COUNT(*) + FROM edges e + JOIN nodes n ON n.id = e.from_id + WHERE n.repo_prefix <> '' + GROUP BY n.repo_prefix`) + prep(&s.stmtRepoNodeCount, + `SELECT COUNT(*) FROM nodes WHERE repo_prefix = ?`) + prep(&s.stmtRepoEdgeCount, + `SELECT COUNT(*) + FROM edges e + JOIN nodes n ON n.id = e.from_id + WHERE n.repo_prefix = ?`) + prep(&s.stmtAllRepoCountsNodes, + `SELECT repo_prefix, COUNT(*) FROM nodes WHERE repo_prefix <> '' GROUP BY repo_prefix`) + prep(&s.stmtAllRepoCountsEdges, + `SELECT n.repo_prefix, COUNT(*) + FROM edges e + JOIN nodes n ON n.id = e.from_id + WHERE n.repo_prefix <> '' + GROUP BY n.repo_prefix`) + + prep(&s.stmtStatsByKind, + `SELECT kind, COUNT(*) FROM nodes GROUP BY kind`) + prep(&s.stmtStatsByLanguage, + `SELECT language, COUNT(*) FROM nodes GROUP BY language`) + + const edgeCols = `from_id, to_id, kind, file_path, line, confidence, confidence_label, origin, tier, cross_repo, meta` + + prep(&s.stmtInsertEdge, + `INSERT OR IGNORE INTO edges (`+edgeCols+`) VALUES (?,?,?,?,?,?,?,?,?,?,?)`) + prep(&s.stmtOutEdges, + `SELECT `+edgeCols+` FROM edges WHERE from_id = ?`) + prep(&s.stmtInEdges, + `SELECT `+edgeCols+` FROM edges WHERE to_id = ?`) + prep(&s.stmtRepoEdges, + `SELECT e.from_id, e.to_id, e.kind, e.file_path, e.line, + e.confidence, e.confidence_label, e.origin, e.tier, + e.cross_repo, e.meta + FROM edges e + JOIN nodes n ON n.id = e.from_id + 
WHERE n.repo_prefix = ?`) + prep(&s.stmtAllEdges, + `SELECT `+edgeCols+` FROM edges`) + prep(&s.stmtEdgeCount, + `SELECT COUNT(*) FROM edges`) + prep(&s.stmtRemoveEdge, + `DELETE FROM edges WHERE from_id = ? AND to_id = ? AND kind = ?`) + + prep(&s.stmtSelectEdgeOrigin, + `SELECT origin FROM edges WHERE from_id = ? AND to_id = ? AND kind = ? AND file_path = ? AND line = ?`) + prep(&s.stmtUpdateEdgeOrigin, + `UPDATE edges SET origin = ?, tier = ? WHERE from_id = ? AND to_id = ? AND kind = ? AND file_path = ? AND line = ?`) + prep(&s.stmtDeleteEdgeByKey, + `DELETE FROM edges WHERE from_id = ? AND to_id = ? AND kind = ? AND file_path = ? AND line = ?`) + + prep(&s.stmtSelectFileNodeIDs, + `SELECT id FROM nodes WHERE file_path = ?`) + prep(&s.stmtSelectRepoNodeIDs, + `SELECT id FROM nodes WHERE repo_prefix = ?`) + prep(&s.stmtDeleteNodeByFile, + `DELETE FROM nodes WHERE file_path = ?`) + prep(&s.stmtDeleteNodeByRepo, + `DELETE FROM nodes WHERE repo_prefix = ?`) + + return err +} + +// -- meta encode/decode ---------------------------------------------------- + +func encodeMeta(m map[string]any) ([]byte, error) { + if len(m) == 0 { + return nil, nil + } + var buf bytes.Buffer + if err := gob.NewEncoder(&buf).Encode(m); err != nil { + return nil, err + } + return buf.Bytes(), nil +} + +func decodeMeta(b []byte) (map[string]any, error) { + if len(b) == 0 { + return nil, nil + } + var m map[string]any + if err := gob.NewDecoder(bytes.NewReader(b)).Decode(&m); err != nil { + return nil, err + } + return m, nil +} + +// -- row scanners --------------------------------------------------------- + +func scanNode(scanner interface { + Scan(...any) error +}) (*graph.Node, error) { + var ( + n graph.Node + metaBlob []byte + ) + err := scanner.Scan( + &n.ID, &n.Kind, &n.Name, &n.QualName, &n.FilePath, + &n.StartLine, &n.EndLine, &n.Language, + &n.RepoPrefix, &n.WorkspaceID, &n.ProjectID, &metaBlob, + ) + if err != nil { + return nil, err + } + if len(metaBlob) > 0 { + m, derr := 
decodeMeta(metaBlob) + if derr != nil { + return nil, derr + } + n.Meta = m + } + return &n, nil +} + +func scanEdge(scanner interface { + Scan(...any) error +}) (*graph.Edge, error) { + var ( + e graph.Edge + metaBlob []byte + crossRepo int64 + ) + err := scanner.Scan( + &e.From, &e.To, &e.Kind, &e.FilePath, &e.Line, + &e.Confidence, &e.ConfidenceLabel, &e.Origin, &e.Tier, + &crossRepo, &metaBlob, + ) + if err != nil { + return nil, err + } + e.CrossRepo = crossRepo != 0 + if len(metaBlob) > 0 { + m, derr := decodeMeta(metaBlob) + if derr != nil { + return nil, derr + } + e.Meta = m + } + return &e, nil +} + +// -- writes --------------------------------------------------------------- + +// AddNode inserts or replaces a node. Idempotent on the id column -- +// re-adding the same id with new content does a last-write-wins +// update, matching the in-memory store's behaviour. +func (s *Store) AddNode(n *graph.Node) { + if n == nil || n.ID == "" { + return + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + if err := s.insertNodeLocked(s.stmtInsertNode, n); err != nil { + // graph.Store.AddNode has no error channel; the in-memory + // store can't fail either. We swallow the error here for API + // parity; surface as a panic only on a clearly catastrophic + // failure (closed DB), not on a transient busy. + panicOnFatal(err) + } +} + +func (s *Store) insertNodeLocked(stmt *sql.Stmt, n *graph.Node) error { + metaBlob, err := encodeMeta(n.Meta) + if err != nil { + return err + } + _, err = stmt.Exec( + n.ID, string(n.Kind), n.Name, n.QualName, n.FilePath, + n.StartLine, n.EndLine, n.Language, + n.RepoPrefix, n.WorkspaceID, n.ProjectID, metaBlob, + ) + return err +} + +// AddEdge inserts an edge. Idempotent on the logical edge key (from, +// to, kind, file_path, line) -- a second AddEdge with the same key is +// a no-op (INSERT OR IGNORE), matching the in-memory store's "stored +// pointer replaced in place" semantics. 
Origin upgrades on a re-add +// are NOT applied through this path; use SetEdgeProvenance for that +// (matches the in-memory store: AddEdge replaces the *Edge pointer, +// but the conformance suite only verifies dedup-by-key, not pointer +// replacement, and the in-memory store also routes provenance +// upgrades through SetEdgeProvenance). +func (s *Store) AddEdge(e *graph.Edge) { + if e == nil { + return + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + if err := s.insertEdgeLocked(s.stmtInsertEdge, e); err != nil { + panicOnFatal(err) + } +} + +func (s *Store) insertEdgeLocked(stmt *sql.Stmt, e *graph.Edge) error { + metaBlob, err := encodeMeta(e.Meta) + if err != nil { + return err + } + var crossRepo int64 + if e.CrossRepo { + crossRepo = 1 + } + _, err = stmt.Exec( + e.From, e.To, string(e.Kind), e.FilePath, e.Line, + e.Confidence, e.ConfidenceLabel, e.Origin, e.Tier, + crossRepo, metaBlob, + ) + return err +} + +// AddBatch inserts nodes and edges in a single transaction -- the +// 10-100x speedup vs per-statement commits at indexing scale. 
+func (s *Store) AddBatch(nodes []*graph.Node, edges []*graph.Edge) { + if len(nodes) == 0 && len(edges) == 0 { + return + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + + tx, err := s.db.Begin() + if err != nil { + panicOnFatal(err) + return + } + commit := false + defer func() { + if !commit { + _ = tx.Rollback() + } + }() + + insertNode := tx.Stmt(s.stmtInsertNode) + defer func() { _ = insertNode.Close() }() + insertEdge := tx.Stmt(s.stmtInsertEdge) + defer func() { _ = insertEdge.Close() }() + + for _, n := range nodes { + if n == nil || n.ID == "" { + continue + } + if err := s.insertNodeLocked(insertNode, n); err != nil { + panicOnFatal(err) + return + } + } + for _, e := range edges { + if e == nil { + continue + } + if err := s.insertEdgeLocked(insertEdge, e); err != nil { + panicOnFatal(err) + return + } + } + + if err := tx.Commit(); err != nil { + panicOnFatal(err) + return + } + commit = true +} + +// SetEdgeProvenance mutates an existing edge's origin in-place and +// bumps the identity-revision counter when the origin actually +// changes. Returns true iff a change was applied. Mirrors the +// in-memory store's "delete-then-insert of identity" semantics. +func (s *Store) SetEdgeProvenance(e *graph.Edge, newOrigin string) bool { + if e == nil { + return false + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + + // Look up the stored origin -- the caller-supplied *Edge may be a + // detached copy whose Origin already matches newOrigin even though + // the row still has the old value. 
+ var storedOrigin string + row := s.stmtSelectEdgeOrigin.QueryRow(e.From, e.To, string(e.Kind), e.FilePath, e.Line) + if err := row.Scan(&storedOrigin); err != nil { + if errors.Is(err, sql.ErrNoRows) { + return false + } + panicOnFatal(err) + return false + } + if storedOrigin == newOrigin { + return false + } + newTier := e.Tier + if newTier != "" { + newTier = graph.ResolvedBy(newOrigin) + } + if _, err := s.stmtUpdateEdgeOrigin.Exec(newOrigin, newTier, e.From, e.To, string(e.Kind), e.FilePath, e.Line); err != nil { + panicOnFatal(err) + return false + } + // Reflect the change on the caller's struct, mirroring the + // in-memory store which mutates the in-graph *Edge in place. + e.Origin = newOrigin + if e.Tier != "" { + e.Tier = newTier + } + s.edgeIdentityRevs.Add(1) + return true +} + +// ReindexEdge updates the stored row after e.To has been mutated from +// oldTo to e.To. Implemented as delete-old + insert-new under the +// same write lock (SQLite's UNIQUE constraint on (from,to,kind,file, +// line) makes "UPDATE to_id" a one-shot, but the delete+insert form +// keeps semantics identical when the new (from,to,...) key happens to +// already exist -- the INSERT OR IGNORE drops the dup, just like the +// in-memory store's bucket-replace). +func (s *Store) ReindexEdge(e *graph.Edge, oldTo string) { + if e == nil || oldTo == e.To { + return + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + + if _, err := s.stmtDeleteEdgeByKey.Exec(e.From, oldTo, string(e.Kind), e.FilePath, e.Line); err != nil { + panicOnFatal(err) + return + } + if err := s.insertEdgeLocked(s.stmtInsertEdge, e); err != nil { + panicOnFatal(err) + return + } +} + +// reindexChunkSize bounds the number of edge re-binds per BEGIN/COMMIT. +// Same shape as the bbolt sibling: large enough to amortise the +// per-tx overhead (BEGIN+COMMIT plus WAL fsync) but small enough that +// the WAL doesn't balloon and a crash mid-batch only loses ≤chunk +// mutations. 
+const reindexChunkSize = 5000 + +// ReindexEdges chunks the batch into reindexChunkSize-mutation +// transactions and runs each through prepared statements re-used +// across the chunk. Per-edge ReindexEdge was the resolver hot path +// (10k+ calls = 10k+ BEGIN/COMMIT pairs); this collapses them to two. +func (s *Store) ReindexEdges(batch []graph.EdgeReindex) { + if len(batch) == 0 { + return + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + for i := 0; i < len(batch); i += reindexChunkSize { + end := minInt(i+reindexChunkSize, len(batch)) + chunk := batch[i:end] + tx, err := s.db.Begin() + if err != nil { + panicOnFatal(err) + return + } + delStmt := tx.Stmt(s.stmtDeleteEdgeByKey) + insStmt := tx.Stmt(s.stmtInsertEdge) + for _, r := range chunk { + if r.Edge == nil || r.OldTo == r.Edge.To { + continue + } + if _, err := delStmt.Exec(r.Edge.From, r.OldTo, string(r.Edge.Kind), r.Edge.FilePath, r.Edge.Line); err != nil { + _ = tx.Rollback() + panicOnFatal(err) + return + } + if err := s.insertEdgeLocked(insStmt, r.Edge); err != nil { + _ = tx.Rollback() + panicOnFatal(err) + return + } + } + if err := tx.Commit(); err != nil { + panicOnFatal(err) + return + } + } +} + +// SetEdgeProvenanceBatch chunks origin promotions into one BEGIN/ +// COMMIT per chunk and bumps the in-process revision counter once +// per actual change, matching the per-edge SetEdgeProvenance's +// semantics. Returns the total number of edges whose Origin changed. 
+func (s *Store) SetEdgeProvenanceBatch(batch []graph.EdgeProvenanceUpdate) int { + if len(batch) == 0 { + return 0 + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + totalChanged := 0 + for i := 0; i < len(batch); i += reindexChunkSize { + end := minInt(i+reindexChunkSize, len(batch)) + chunk := batch[i:end] + tx, err := s.db.Begin() + if err != nil { + panicOnFatal(err) + return totalChanged + } + selStmt := tx.Stmt(s.stmtSelectEdgeOrigin) + updStmt := tx.Stmt(s.stmtUpdateEdgeOrigin) + chunkChanged := 0 + for _, u := range chunk { + if u.Edge == nil { + continue + } + var storedOrigin string + row := selStmt.QueryRow(u.Edge.From, u.Edge.To, string(u.Edge.Kind), u.Edge.FilePath, u.Edge.Line) + if err := row.Scan(&storedOrigin); err != nil { + if errors.Is(err, sql.ErrNoRows) { + continue + } + _ = tx.Rollback() + panicOnFatal(err) + return totalChanged + } + if storedOrigin == u.NewOrigin { + continue + } + newTier := u.Edge.Tier + if newTier != "" { + newTier = graph.ResolvedBy(u.NewOrigin) + } + if _, err := updStmt.Exec(u.NewOrigin, newTier, u.Edge.From, u.Edge.To, string(u.Edge.Kind), u.Edge.FilePath, u.Edge.Line); err != nil { + _ = tx.Rollback() + panicOnFatal(err) + return totalChanged + } + u.Edge.Origin = u.NewOrigin + if u.Edge.Tier != "" { + u.Edge.Tier = newTier + } + chunkChanged++ + } + if err := tx.Commit(); err != nil { + panicOnFatal(err) + return totalChanged + } + if chunkChanged > 0 { + s.edgeIdentityRevs.Add(int64(chunkChanged)) + } + totalChanged += chunkChanged + } + return totalChanged +} + +func minInt(a, b int) int { + if a < b { + return a + } + return b +} + +// RemoveEdge deletes every edge between (from, to) with the given +// kind. Returns true iff at least one row was deleted. 
+func (s *Store) RemoveEdge(from, to string, kind graph.EdgeKind) bool { + s.writeMu.Lock() + defer s.writeMu.Unlock() + res, err := s.stmtRemoveEdge.Exec(from, to, string(kind)) + if err != nil { + panicOnFatal(err) + return false + } + n, err := res.RowsAffected() + if err != nil { + panicOnFatal(err) + return false + } + return n > 0 +} + +// EvictFile removes every node anchored to filePath and every edge +// that touches one of those nodes. Returns (nodesRemoved, +// edgesRemoved). +func (s *Store) EvictFile(filePath string) (nodesRemoved, edgesRemoved int) { + s.writeMu.Lock() + defer s.writeMu.Unlock() + return s.evictByScopeLocked(s.stmtSelectFileNodeIDs, s.stmtDeleteNodeByFile, filePath) +} + +// EvictRepo removes every node in repoPrefix and every edge that +// touches one. Returns (nodesRemoved, edgesRemoved). +func (s *Store) EvictRepo(repoPrefix string) (nodesRemoved, edgesRemoved int) { + s.writeMu.Lock() + defer s.writeMu.Unlock() + return s.evictByScopeLocked(s.stmtSelectRepoNodeIDs, s.stmtDeleteNodeByRepo, repoPrefix) +} + +// evictByScopeLocked is the shared body of EvictFile / EvictRepo -- +// collect the affected node IDs, delete every edge touching one of +// them, then delete the nodes themselves. +func (s *Store) evictByScopeLocked(selectIDs, deleteNodes *sql.Stmt, scope string) (int, int) { + rows, err := selectIDs.Query(scope) + if err != nil { + panicOnFatal(err) + return 0, 0 + } + var ids []string + for rows.Next() { + var id string + if err := rows.Scan(&id); err != nil { + _ = rows.Close() + panicOnFatal(err) + return 0, 0 + } + ids = append(ids, id) + } + if err := rows.Err(); err != nil { + _ = rows.Close() + panicOnFatal(err) + return 0, 0 + } + _ = rows.Close() + if len(ids) == 0 { + return 0, 0 + } + + // Delete every edge touching one of these nodes. We run a single + // DELETE per node id to avoid bumping into SQLite's bound-variable + // limit on big batches; under the write lock this is a + // straight-line walk. 
+ var edgesRemoved int + for _, id := range ids { + res, err := s.db.Exec(`DELETE FROM edges WHERE from_id = ? OR to_id = ?`, id, id) + if err != nil { + panicOnFatal(err) + return 0, edgesRemoved + } + if n, err := res.RowsAffected(); err == nil { + edgesRemoved += int(n) + } + } + + res, err := deleteNodes.Exec(scope) + if err != nil { + panicOnFatal(err) + return 0, edgesRemoved + } + n, err := res.RowsAffected() + if err != nil { + panicOnFatal(err) + return 0, edgesRemoved + } + return int(n), edgesRemoved +} + +// -- reads --------------------------------------------------------------- + +func (s *Store) GetNode(id string) *graph.Node { + row := s.stmtGetNode.QueryRow(id) + n, err := scanNode(row) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil + } + panicOnFatal(err) + return nil + } + return n +} + +func (s *Store) GetNodeByQualName(qualName string) *graph.Node { + if qualName == "" { + return nil + } + row := s.stmtGetNodeByQual.QueryRow(qualName) + n, err := scanNode(row) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil + } + panicOnFatal(err) + return nil + } + return n +} + +func (s *Store) FindNodesByName(name string) []*graph.Node { + return s.queryNodes(s.stmtFindByName, name) +} + +func (s *Store) FindNodesByNameInRepo(name, repoPrefix string) []*graph.Node { + return s.queryNodes(s.stmtFindByNameInRepo, name, repoPrefix) +} + +func (s *Store) GetFileNodes(filePath string) []*graph.Node { + return s.queryNodes(s.stmtFileNodes, filePath) +} + +func (s *Store) GetRepoNodes(repoPrefix string) []*graph.Node { + return s.queryNodes(s.stmtRepoNodes, repoPrefix) +} + +func (s *Store) AllNodes() []*graph.Node { + return s.queryNodes(s.stmtAllNodes) +} + +func (s *Store) queryNodes(stmt *sql.Stmt, args ...any) []*graph.Node { + rows, err := stmt.Query(args...) 
+ if err != nil { + panicOnFatal(err) + return nil + } + defer func() { _ = rows.Close() }() + var out []*graph.Node + for rows.Next() { + n, err := scanNode(rows) + if err != nil { + panicOnFatal(err) + return out + } + out = append(out, n) + } + return out +} + +func (s *Store) GetOutEdges(nodeID string) []*graph.Edge { + return s.queryEdges(s.stmtOutEdges, nodeID) +} + +func (s *Store) GetInEdges(nodeID string) []*graph.Edge { + return s.queryEdges(s.stmtInEdges, nodeID) +} + +func (s *Store) AllEdges() []*graph.Edge { + return s.queryEdges(s.stmtAllEdges) +} + +// GetRepoEdges returns every edge whose source node has the given +// RepoPrefix. The pre-Store idiom — GetRepoNodes(r) followed by +// GetOutEdges(n.ID) per node — was O(repo_nodes) prepared-statement +// invocations, which on a multi-repo workspace dominated the +// per-repo extractor passes. A single JOIN over edges/nodes keyed +// on n.repo_prefix runs as one prepared statement and hits the +// existing repo_prefix index. +func (s *Store) GetRepoEdges(repoPrefix string) []*graph.Edge { + if repoPrefix == "" { + return nil + } + return s.queryEdges(s.stmtRepoEdges, repoPrefix) +} + +func (s *Store) queryEdges(stmt *sql.Stmt, args ...any) []*graph.Edge { + rows, err := stmt.Query(args...) 
+ if err != nil { + panicOnFatal(err) + return nil + } + defer func() { _ = rows.Close() }() + var out []*graph.Edge + for rows.Next() { + e, err := scanEdge(rows) + if err != nil { + panicOnFatal(err) + return out + } + out = append(out, e) + } + return out +} + +// -- counts and stats ----------------------------------------------------- + +func (s *Store) NodeCount() int { + var n int + if err := s.stmtNodeCount.QueryRow().Scan(&n); err != nil { + panicOnFatal(err) + return 0 + } + return n +} + +func (s *Store) EdgeCount() int { + var n int + if err := s.stmtEdgeCount.QueryRow().Scan(&n); err != nil { + panicOnFatal(err) + return 0 + } + return n +} + +func (s *Store) Stats() graph.GraphStats { + st := graph.GraphStats{ + ByKind: map[string]int{}, + ByLanguage: map[string]int{}, + } + st.TotalNodes = s.NodeCount() + st.TotalEdges = s.EdgeCount() + + rows, err := s.stmtStatsByKind.Query() + if err != nil { + panicOnFatal(err) + return st + } + for rows.Next() { + var kind string + var n int + if err := rows.Scan(&kind, &n); err != nil { + _ = rows.Close() + panicOnFatal(err) + return st + } + st.ByKind[kind] = n + } + _ = rows.Close() + + rows, err = s.stmtStatsByLanguage.Query() + if err != nil { + panicOnFatal(err) + return st + } + for rows.Next() { + var lang string + var n int + if err := rows.Scan(&lang, &n); err != nil { + _ = rows.Close() + panicOnFatal(err) + return st + } + st.ByLanguage[lang] = n + } + _ = rows.Close() + return st +} + +func (s *Store) RepoStats() map[string]graph.GraphStats { + out := map[string]graph.GraphStats{} + rows, err := s.stmtRepoStatsNodes.Query() + if err != nil { + panicOnFatal(err) + return out + } + for rows.Next() { + var repo, kind, lang string + var n int + if err := rows.Scan(&repo, &kind, &lang, &n); err != nil { + _ = rows.Close() + panicOnFatal(err) + return out + } + st, ok := out[repo] + if !ok { + st = graph.GraphStats{ByKind: map[string]int{}, ByLanguage: map[string]int{}} + } + st.TotalNodes += n + 
st.ByKind[kind] += n + st.ByLanguage[lang] += n + out[repo] = st + } + _ = rows.Close() + + rows, err = s.stmtRepoStatsEdges.Query() + if err != nil { + panicOnFatal(err) + return out + } + for rows.Next() { + var repo string + var n int + if err := rows.Scan(&repo, &n); err != nil { + _ = rows.Close() + panicOnFatal(err) + return out + } + st, ok := out[repo] + if !ok { + st = graph.GraphStats{ByKind: map[string]int{}, ByLanguage: map[string]int{}} + } + st.TotalEdges = n + out[repo] = st + } + _ = rows.Close() + return out +} + +func (s *Store) RepoPrefixes() []string { + rows, err := s.stmtRepoPrefixes.Query() + if err != nil { + panicOnFatal(err) + return nil + } + defer func() { _ = rows.Close() }() + var out []string + for rows.Next() { + var p string + if err := rows.Scan(&p); err != nil { + panicOnFatal(err) + return out + } + out = append(out, p) + } + return out +} + +// -- provenance verification --------------------------------------------- + +func (s *Store) EdgeIdentityRevisions() int { + return int(s.edgeIdentityRevs.Load()) +} + +// VerifyEdgeIdentities is a no-op for the SQL backend: the in-memory +// store's invariant is "the same *Edge pointer lives in both +// adjacency views". The SQL store has a single row per edge, so the +// invariant is trivially satisfied -- no walk can find a divergence +// to report. +func (s *Store) VerifyEdgeIdentities() error { return nil } + +// -- memory estimation (advisory) ---------------------------------------- + +// perRowByteEstimate is a deliberately rough per-row byte cost -- +// the disk backend doesn't have an in-memory footprint to report, so +// the contract (per Store interface comment) is "return what you can +// compute and callers treat the result as advisory". The conformance +// test only checks NodeCount. 
+const (
+	perNodeByteEstimate = 256
+	perEdgeByteEstimate = 128
+)
+
+// RepoMemoryEstimate returns the advisory footprint of one repo: the
+// node and edge row counts for repoPrefix, each multiplied by the flat
+// per-row byte estimate above. A failed count query is routed through
+// panicOnFatal; if that returns, the zero estimate is handed back.
+func (s *Store) RepoMemoryEstimate(repoPrefix string) graph.RepoMemoryEstimate {
+	var est graph.RepoMemoryEstimate
+	var n, e int
+	if err := s.stmtRepoNodeCount.QueryRow(repoPrefix).Scan(&n); err != nil {
+		panicOnFatal(err)
+		return est
+	}
+	if err := s.stmtRepoEdgeCount.QueryRow(repoPrefix).Scan(&e); err != nil {
+		panicOnFatal(err)
+		return est
+	}
+	est.NodeCount = n
+	est.EdgeCount = e
+	est.NodeBytes = uint64(n) * perNodeByteEstimate
+	est.EdgeBytes = uint64(e) * perEdgeByteEstimate
+	return est
+}
+
+// AllRepoMemoryEstimates returns the advisory estimate for every repo
+// that has at least one node or edge row, keyed by repo prefix. It runs
+// the per-repo node-count statement, then the per-repo edge-count
+// statement, folding both into the same map entry.
+func (s *Store) AllRepoMemoryEstimates() map[string]graph.RepoMemoryEstimate {
+	out := map[string]graph.RepoMemoryEstimate{}
+
+	// fold runs one (repo, count) statement and applies every row to the
+	// accumulator. The cursor is closed by the defer on every exit path
+	// (including a panicOnFatal panic), and always before the next
+	// statement runs. It reports false when the pass was cut short.
+	fold := func(stmt *sql.Stmt, apply func(est *graph.RepoMemoryEstimate, n int)) bool {
+		rows, err := stmt.Query()
+		if err != nil {
+			panicOnFatal(err)
+			return false
+		}
+		defer func() { _ = rows.Close() }()
+		for rows.Next() {
+			var repo string
+			var n int
+			if err := rows.Scan(&repo, &n); err != nil {
+				panicOnFatal(err)
+				return false
+			}
+			est := out[repo]
+			apply(&est, n)
+			out[repo] = est
+		}
+		// rows.Next() returning false can mean "iteration failed", not
+		// just "no more rows"; surface that instead of returning a
+		// silently truncated map.
+		if err := rows.Err(); err != nil {
+			panicOnFatal(err)
+			return false
+		}
+		return true
+	}
+
+	ok := fold(s.stmtAllRepoCountsNodes, func(est *graph.RepoMemoryEstimate, n int) {
+		est.NodeCount = n
+		est.NodeBytes = uint64(n) * perNodeByteEstimate
+	})
+	if !ok {
+		return out
+	}
+	fold(s.stmtAllRepoCountsEdges, func(est *graph.RepoMemoryEstimate, n int) {
+		est.EdgeCount = n
+		est.EdgeBytes = uint64(n) * perEdgeByteEstimate
+	})
+	return out
+}
+
+// -- helpers --------------------------------------------------------------
+
+// panicOnFatal turns a storage error into a panic so callers see it.
+// Exactly two inputs are quiet: nil, and any error matching
+// sql.ErrNoRows (an expected "not found"). Every other error -- closed
+// DB, schema mismatch, disk-full at insert time, busy / locked --
+// panics, wrapped with a "store_sqlite:" prefix. The graph.Store
+// interface deliberately does not surface errors -- it mirrors the
+// in-memory store's "everything succeeds" contract -- so a fatal
+// storage failure cannot be ignored.
+func panicOnFatal(err error) {
+	if err == nil {
+		return
+	}
+	if errors.Is(err, sql.ErrNoRows) {
+		return
+	}
+	panic(fmt.Errorf("store_sqlite: %w", err))
+}
+
+// -- predicate-shaped reads ---------------------------------------------
+//
+// Each method runs one indexed SELECT and hands the matching rows to
+// the iter.Seq[T] yield callback. Stops cleanly when yield returns false.
+// Heavier than the equivalent bolt path (sql parsing + driver row
+// materialisation) but cuts the resolver's wasted full-table scans
+// down to "match-only" cardinality, which is the whole point.
+
+// All three predicate iterators here MATERIALISE the query result
+// into a slice before yielding, then iterate the slice. This avoids
+// a deadlock peculiar to the SQLite backend's single-connection
+// pool: a streaming rows-cursor holds THE connection, and any
+// callback in the yield body that re-enters the store (e.g. GetNode
+// to resolve an edge's caller) blocks forever waiting on the same
+// connection. Materialise-then-yield releases the connection before
+// the body runs, so re-entrant store calls work.
+//
+// The "predicate-shaped" win still holds: the indexed SELECT only
+// fetches matching rows, not the whole table. We give up streaming
+// memory savings (we still build a Go slice of *Edge / *Node) but
+// keep the structural advantage that the row count flowing through
+// scanEdge is proportional to the result, not the table.
+
+// EdgesByKind: indexed SELECT on the (kind) column.
+func (s *Store) EdgesByKind(kind graph.EdgeKind) iter.Seq[*graph.Edge] {
+	return func(yield func(*graph.Edge) bool) {
+		// The query runs lazily, on first iteration, and is fully
+		// materialised before the first yield.
+		out := s.queryEdgesSQL(`
+SELECT from_id, to_id, kind, file_path, line, confidence, confidence_label, origin, tier, cross_repo, meta
+FROM edges WHERE kind = ?`, string(kind))
+		for _, e := range out {
+			if !yield(e) {
+				return
+			}
+		}
+	}
+}
+
+// NodesByKind: indexed SELECT on the (kind) column.
+func (s *Store) NodesByKind(kind graph.NodeKind) iter.Seq[*graph.Node] {
+	return func(yield func(*graph.Node) bool) {
+		out := s.queryNodesSQL(`
+SELECT id, kind, name, qual_name, file_path, start_line, end_line, language,
+       repo_prefix, workspace_id, project_id, meta
+FROM nodes WHERE kind = ?`, string(kind))
+		for _, n := range out {
+			if !yield(n) {
+				return
+			}
+		}
+	}
+}
+
+// EdgesWithUnresolvedTarget yields edges whose target is an unresolved
+// stub in EITHER form: the bare `unresolved::X` (a half-open range scan
+// that seeks directly to the contiguous slice via the to_id b-tree) or
+// the multi-repo `::unresolved::X` rewrite (an infix LIKE — the
+// unresolved set is small, so the scan is cheap). Mirrors
+// graph.IsUnresolvedTarget over both shapes.
+func (s *Store) EdgesWithUnresolvedTarget() iter.Seq[*graph.Edge] {
+	return func(yield func(*graph.Edge) bool) {
+		// ';' is the byte after ':', so the upper bound closes the
+		// half-open range over every id prefixed with "unresolved::".
+		out := s.queryEdgesSQL(`
+SELECT from_id, to_id, kind, file_path, line, confidence, confidence_label, origin, tier, cross_repo, meta
+FROM edges WHERE (to_id >= 'unresolved::' AND to_id < 'unresolved:;') OR to_id LIKE '%::unresolved::%'`)
+		for _, e := range out {
+			if !yield(e) {
+				return
+			}
+		}
+	}
+}
+
+// queryEdgesSQL runs an edge-shaped SELECT, materialises the rows
+// into a slice, and closes the rows-cursor before returning —
+// releasing the underlying sql.Conn so the predicate-iterator's
+// callback body is free to make re-entrant store calls without
+// deadlocking on the MaxOpenConns=1 pool. Companion to the existing
+// queryEdges helper that takes a *sql.Stmt; this one takes a raw
+// SQL string so the predicate iterators can pass inline queries.
+//
+// Error handling matches queryEdges: query, scan and iteration errors
+// all go through panicOnFatal, so a storage failure is never reported
+// as an empty or silently truncated result. The deferred Close still
+// runs if panicOnFatal panics.
+func (s *Store) queryEdgesSQL(q string, args ...any) []*graph.Edge {
+	rows, err := s.db.Query(q, args...)
+	if err != nil {
+		panicOnFatal(err)
+		return nil
+	}
+	defer func() { _ = rows.Close() }()
+	var out []*graph.Edge
+	for rows.Next() {
+		e, err := scanEdge(rows)
+		if err != nil {
+			panicOnFatal(err)
+			return out
+		}
+		if e == nil {
+			continue
+		}
+		out = append(out, e)
+	}
+	// rows.Next() also returns false when iteration fails part-way.
+	if err := rows.Err(); err != nil {
+		panicOnFatal(err)
+	}
+	return out
+}
+
+// queryNodesSQL is the node-shaped sibling of queryEdgesSQL, with the
+// same materialise-then-close behaviour and the same error handling.
+func (s *Store) queryNodesSQL(q string, args ...any) []*graph.Node {
+	rows, err := s.db.Query(q, args...)
+	if err != nil {
+		panicOnFatal(err)
+		return nil
+	}
+	defer func() { _ = rows.Close() }()
+	var out []*graph.Node
+	for rows.Next() {
+		n, err := scanNode(rows)
+		if err != nil {
+			panicOnFatal(err)
+			return out
+		}
+		if n == nil {
+			continue
+		}
+		out = append(out, n)
+	}
+	if err := rows.Err(); err != nil {
+		panicOnFatal(err)
+	}
+	return out
+}
+
+// lookupChunkSize bounds the IN-list parameter count per SQL query.
+// SQLite's default SQLITE_MAX_VARIABLE_NUMBER is 32766 in modern
+// builds, but staying well under that keeps query plans stable and
+// avoids surprising the parser on monster lists.
+const lookupChunkSize = 5000
+
+// GetNodesByIDs collapses N per-id SELECTs into ⌈N/chunk⌉ queries
+// of the form `SELECT … FROM nodes WHERE id IN (?, ?, …)`. The
+// resolver fires hundreds of thousands of these on a large pass;
+// chunking turns hundreds of seconds into single-digit seconds.
+//
+// Empty input returns nil. Empty and duplicate ids are ignored, and
+// ids with no matching row are simply absent from the returned map.
+func (s *Store) GetNodesByIDs(ids []string) map[string]*graph.Node {
+	if len(ids) == 0 {
+		return nil
+	}
+	// Dedupe + skip empty up front to keep the chunk loop honest.
+	seen := make(map[string]struct{}, len(ids))
+	uniq := make([]string, 0, len(ids))
+	for _, id := range ids {
+		if id == "" {
+			continue
+		}
+		if _, ok := seen[id]; ok {
+			continue
+		}
+		seen[id] = struct{}{}
+		uniq = append(uniq, id)
+	}
+	out := make(map[string]*graph.Node, len(uniq))
+	const nodeCols = `id, kind, name, qual_name, file_path, start_line, end_line, language, repo_prefix, workspace_id, project_id, meta`
+	for i := 0; i < len(uniq); i += lookupChunkSize {
+		end := minInt(i+lookupChunkSize, len(uniq))
+		chunk := uniq[i:end]
+		// ",?" repeated, minus the leading comma => "?,?,…,?".
+		placeholders := strings.Repeat(",?", len(chunk))[1:]
+		q := `SELECT ` + nodeCols + ` FROM nodes WHERE id IN (` + placeholders + `)`
+		args := make([]any, len(chunk))
+		for j, id := range chunk {
+			args[j] = id
+		}
+		for _, n := range s.queryNodesSQL(q, args...) {
+			if n != nil {
+				out[n.ID] = n
+			}
+		}
+	}
+	return out
+}
+
+// FindNodesByNames collapses N per-name FindNodesByName queries into
+// one `SELECT … FROM nodes WHERE name IN (…)` plus an in-Go bucket
+// by name. The (name) index makes the SELECT seek-driven, and the
+// caller sees the same map[name][]*Node it would have built by
+// calling FindNodesByName N times.
+func (s *Store) FindNodesByNames(names []string) map[string][]*graph.Node { + if len(names) == 0 { + return nil + } + seen := make(map[string]struct{}, len(names)) + uniq := make([]string, 0, len(names)) + for _, name := range names { + if name == "" { + continue + } + if _, ok := seen[name]; ok { + continue + } + seen[name] = struct{}{} + uniq = append(uniq, name) + } + out := make(map[string][]*graph.Node, len(uniq)) + const nodeCols = `id, kind, name, qual_name, file_path, start_line, end_line, language, repo_prefix, workspace_id, project_id, meta` + for i := 0; i < len(uniq); i += lookupChunkSize { + end := minInt(i+lookupChunkSize, len(uniq)) + chunk := uniq[i:end] + placeholders := strings.Repeat(",?", len(chunk))[1:] + q := `SELECT ` + nodeCols + ` FROM nodes WHERE name IN (` + placeholders + `)` + args := make([]any, len(chunk)) + for j, name := range chunk { + args[j] = name + } + for _, n := range s.queryNodesSQL(q, args...) { + if n == nil { + continue + } + out[n.Name] = append(out[n.Name], n) + } + } + return out +} + +// -- BulkLoader implementation ------------------------------------------- + +// Compile-time assertion: *Store satisfies graph.BulkLoader. The +// sqlite AddBatch path already runs inside one transaction per +// chunk and the resolver's batched mutators (ReindexEdges, +// SetEdgeProvenanceBatch) are already amortised. The BulkLoad +// bracket is marker-only here: it exists so the indexer's +// in-memory shadow swap activates — the resolver and its +// post-resolve passes then run against an in-memory *Graph at +// nanosecond latency, and the final AddBatch dumps the resolved +// graph to sqlite in one shot. +var _ graph.BulkLoader = (*Store)(nil) + +// BeginBulkLoad enters bulk mode. No-op for sqlite. +func (s *Store) BeginBulkLoad() {} + +// FlushBulk exits bulk mode. No-op for sqlite. 
+func (s *Store) FlushBulk() error {
+	// Nothing is buffered by this backend, so there is nothing to flush.
+	return nil
+}
diff --git a/internal/graph/store_sqlite/store_aggregators.go b/internal/graph/store_sqlite/store_aggregators.go
new file mode 100644
index 00000000..c1e81174
--- /dev/null
+++ b/internal/graph/store_sqlite/store_aggregators.go
@@ -0,0 +1,567 @@
+package store_sqlite
+
+import (
+	"iter"
+	"sort"
+
+	"github.com/zzet/gortex/internal/graph"
+)
+
+// This file implements the trivial SQL aggregator / scanner optional
+// capability interfaces from graph.Store. Each method pushes its
+// GROUP BY / WHERE / COUNT into SQLite so the planner drives it through
+// the schema's secondary indexes, returning only the aggregate rows
+// instead of materialising the whole node / edge table Go-side.
+//
+// Conventions shared across these methods:
+//   - Empty / nil input returns nil (parity with the in-memory store).
+//   - Input id / kind slices are deduped before they reach the IN-list.
+//   - Large IN-lists are chunked by lookupChunkSize.
+//   - agg-prefixed helpers are local to this file.
+
+var (
+	_ graph.InEdgeCounter            = (*Store)(nil)
+	_ graph.NodeIDsByKinds           = (*Store)(nil)
+	_ graph.EdgeKindCounter          = (*Store)(nil)
+	_ graph.NodeDegreeByKinds        = (*Store)(nil)
+	_ graph.NodesInFilesByKindFinder = (*Store)(nil)
+	_ graph.FileImportAggregator     = (*Store)(nil)
+	_ graph.InDegreeForNodes         = (*Store)(nil)
+	_ graph.CrossRepoEdgeAggregator  = (*Store)(nil)
+	_ graph.FileImporters            = (*Store)(nil)
+	_ graph.FileSymbolNamesByPaths   = (*Store)(nil)
+	_ graph.EdgesByKindsScanner      = (*Store)(nil)
+	_ graph.NodesByKindsScanner      = (*Store)(nil)
+	_ graph.EdgeAdjacencyForKinds    = (*Store)(nil)
+	_ graph.NodeDegreeAggregator     = (*Store)(nil)
+	_ graph.NodeFanAggregator        = (*Store)(nil)
+)
+
+// aggDedupeEdgeKinds drops empties and duplicates from an edge-kind
+// slice, preserving first-seen order; returns the kinds widened to the
+// []any an IN-list binds.
+func aggDedupeEdgeKinds(kinds []graph.EdgeKind) (uniq []graph.EdgeKind, args []any) {
+	return aggDedupeKinds(kinds)
+}
+
+// aggDedupeNodeKinds is the node-kind twin of aggDedupeEdgeKinds.
+func aggDedupeNodeKinds(kinds []graph.NodeKind) (uniq []graph.NodeKind, args []any) {
+	return aggDedupeKinds(kinds)
+}
+
+// aggDedupeKinds is the shared body of the two dedupe helpers: it walks
+// kinds once, skipping "" and anything already seen, and returns the
+// survivors both as typed kinds and as the string-valued []any bind
+// list. Both results stay nil when nothing survives.
+func aggDedupeKinds[K ~string](kinds []K) (uniq []K, args []any) {
+	visited := make(map[K]struct{}, len(kinds))
+	for _, kind := range kinds {
+		if kind == "" {
+			continue
+		}
+		if _, dup := visited[kind]; dup {
+			continue
+		}
+		visited[kind] = struct{}{}
+		uniq = append(uniq, kind)
+		args = append(args, string(kind))
+	}
+	return uniq, args
+}
+
+// InEdgeCountsByKind maps each edge target id to the number of incoming
+// edges whose kind is in kinds; the GROUP BY runs inside SQLite. Returns
+// nil when kinds dedupes to nothing.
+func (s *Store) InEdgeCountsByKind(kinds []graph.EdgeKind) map[string]int {
+	_, binds := aggDedupeEdgeKinds(kinds)
+	if len(binds) == 0 {
+		return nil
+	}
+	query := `SELECT to_id, COUNT(*) FROM edges WHERE kind IN (` + inPlaceholders(len(binds)) + `) GROUP BY to_id`
+	rows, err := s.db.Query(query, binds...)
+	panicOnFatal(err)
+	defer func() { _ = rows.Close() }()
+
+	counts := make(map[string]int)
+	for rows.Next() {
+		var (
+			target string
+			total  int
+		)
+		panicOnFatal(rows.Scan(&target, &total))
+		counts[target] = total
+	}
+	panicOnFatal(rows.Err())
+	return counts
+}
+
+// NodeIDsByKinds lists the ids of all nodes whose kind is in kinds,
+// ordered by id. Returns nil when kinds dedupes to nothing.
+func (s *Store) NodeIDsByKinds(kinds []graph.NodeKind) []string {
+	_, binds := aggDedupeNodeKinds(kinds)
+	if len(binds) == 0 {
+		return nil
+	}
+	query := `SELECT id FROM nodes WHERE kind IN (` + inPlaceholders(len(binds)) + `) ORDER BY id`
+	rows, err := s.db.Query(query, binds...)
+	panicOnFatal(err)
+	defer func() { _ = rows.Close() }()
+
+	var ids []string
+	for rows.Next() {
+		var nodeID string
+		panicOnFatal(rows.Scan(&nodeID))
+		ids = append(ids, nodeID)
+	}
+	panicOnFatal(rows.Err())
+	return ids
+}
+
+// EdgeKindCounts maps every distinct edge kind present in the edges
+// table to the number of edges of that kind.
+func (s *Store) EdgeKindCounts() map[graph.EdgeKind]int {
+	rows, err := s.db.Query(`SELECT kind, COUNT(*) FROM edges GROUP BY kind`)
+	panicOnFatal(err)
+	defer func() { _ = rows.Close() }()
+
+	counts := make(map[graph.EdgeKind]int)
+	for rows.Next() {
+		var (
+			rawKind string
+			total   int
+		)
+		panicOnFatal(rows.Scan(&rawKind, &total))
+		counts[graph.EdgeKind(rawKind)] = total
+	}
+	panicOnFatal(rows.Err())
+	return counts
+}
+
+// NodeDegreeByKinds reports total in/out degree for each node whose kind
+// is in kinds, ordered by node id. A non-empty pathPrefix additionally
+// restricts the nodes to file paths starting with that prefix (LIKE
+// metacharacters in the prefix are escaped). UsageInCount is never
+// populated by this method. Returns nil when kinds dedupes to nothing.
+func (s *Store) NodeDegreeByKinds(kinds []graph.NodeKind, pathPrefix string) []graph.NodeDegreeRow {
+	_, kindBinds := aggDedupeNodeKinds(kinds)
+	if len(kindBinds) == 0 {
+		return nil
+	}
+	// Copy so appending the path bind never writes into kindBinds.
+	binds := append([]any(nil), kindBinds...)
+	query := `SELECT n.id,
+		(SELECT COUNT(*) FROM edges e WHERE e.to_id = n.id) AS in_count,
+		(SELECT COUNT(*) FROM edges e WHERE e.from_id = n.id) AS out_count
+	FROM nodes n
+	WHERE n.kind IN (` + inPlaceholders(len(kindBinds)) + `)`
+	if pathPrefix != "" {
+		query += ` AND n.file_path LIKE ? ESCAPE '\'`
+		binds = append(binds, escapeLikePattern(pathPrefix)+"%")
+	}
+	query += ` ORDER BY n.id`
+
+	rows, err := s.db.Query(query, binds...)
+	panicOnFatal(err)
+	defer func() { _ = rows.Close() }()
+
+	var degrees []graph.NodeDegreeRow
+	for rows.Next() {
+		var row graph.NodeDegreeRow
+		panicOnFatal(rows.Scan(&row.NodeID, &row.InCount, &row.OutCount))
+		degrees = append(degrees, row)
+	}
+	panicOnFatal(rows.Err())
+	return degrees
+}
+
+// NodesInFilesByKind returns every node living in one of the supplied
+// files whose kind is in the supplied set.
+func (s *Store) NodesInFilesByKind(files []string, kinds []graph.NodeKind) []*graph.Node { + uniqFiles := dedupeNonEmpty(files) + _, kindArgs := aggDedupeNodeKinds(kinds) + if len(uniqFiles) == 0 || len(kindArgs) == 0 { + return nil + } + var out []*graph.Node + for i := 0; i < len(uniqFiles); i += lookupChunkSize { + end := minInt(i+lookupChunkSize, len(uniqFiles)) + chunk := uniqFiles[i:end] + args := append(toAnyArgs(chunk), kindArgs...) + q := `SELECT ` + lookupNodeCols + ` FROM nodes WHERE file_path IN (` + + inPlaceholders(len(chunk)) + `) AND kind IN (` + inPlaceholders(len(kindArgs)) + `) ORDER BY id` + out = append(out, s.queryNodesSQL(q, args...)...) + } + return out +} + +// FileImportCounts returns per-target-file incoming-import counts. A +// nil scope counts every import edge; a non-nil scope bounds counts to +// edges whose target node ID lies in the slice (empty non-nil => nil). +func (s *Store) FileImportCounts(scope []string) []graph.FileImportCountRow { + if scope != nil && len(scope) == 0 { + return nil + } + base := `SELECT COALESCE(NULLIF(n.file_path, ''), n.id) AS path, COUNT(*) AS cnt + FROM edges e JOIN nodes n ON e.to_id = n.id + WHERE e.kind = ?` + args := []any{string(graph.EdgeImports)} + fileToCount := make(map[string]int) + if scope == nil { + q := base + ` GROUP BY path` + aggScanImportCounts(s, q, args, fileToCount) + } else { + uniq := dedupeNonEmpty(scope) + if len(uniq) == 0 { + return nil + } + for i := 0; i < len(uniq); i += lookupChunkSize { + end := minInt(i+lookupChunkSize, len(uniq)) + chunk := uniq[i:end] + q := base + ` AND e.to_id IN (` + inPlaceholders(len(chunk)) + `) GROUP BY path` + aggScanImportCounts(s, q, append(append([]any(nil), args...), toAnyArgs(chunk)...), fileToCount) + } + } + if len(fileToCount) == 0 { + return nil + } + out := make([]graph.FileImportCountRow, 0, len(fileToCount)) + for path, cnt := range fileToCount { + out = append(out, graph.FileImportCountRow{FilePath: path, Count: cnt}) + } + return 
out +} + +// aggScanImportCounts runs an import-count query and folds the (path, +// count) rows into the accumulator (chunked scopes can revisit a path). +func aggScanImportCounts(s *Store, q string, args []any, acc map[string]int) { + rows, err := s.db.Query(q, args...) + panicOnFatal(err) + defer func() { _ = rows.Close() }() + for rows.Next() { + var path string + var cnt int + panicOnFatal(rows.Scan(&path, &cnt)) + acc[path] += cnt + } + panicOnFatal(rows.Err()) +} + +// InDegreeForNodes returns total incoming-edge counts (any kind) for +// the supplied node id set. +func (s *Store) InDegreeForNodes(ids []string) map[string]int { + uniq := dedupeNonEmpty(ids) + if len(uniq) == 0 { + return nil + } + out := make(map[string]int) + for i := 0; i < len(uniq); i += lookupChunkSize { + end := minInt(i+lookupChunkSize, len(uniq)) + chunk := uniq[i:end] + q := `SELECT to_id, COUNT(*) FROM edges WHERE to_id IN (` + + inPlaceholders(len(chunk)) + `) GROUP BY to_id` + rows, err := s.db.Query(q, toAnyArgs(chunk)...) + panicOnFatal(err) + for rows.Next() { + var id string + var n int + panicOnFatal(rows.Scan(&id, &n)) + out[id] = n + } + panicOnFatal(rows.Err()) + _ = rows.Close() + } + return out +} + +// CrossRepoEdgeCounts returns pre-grouped cross-repo edge counts keyed +// by (base kind, from-repo, to-repo). Cross-repo kinds are those +// graph.BaseKindForCrossRepo recognises; the count is reported under +// the base kind. +func (s *Store) CrossRepoEdgeCounts() []graph.CrossRepoEdgeRow { + q := `SELECT e.kind, nf.repo_prefix, nt.repo_prefix, COUNT(*) + FROM edges e + JOIN nodes nf ON e.from_id = nf.id + JOIN nodes nt ON e.to_id = nt.id + WHERE nf.repo_prefix <> nt.repo_prefix + GROUP BY e.kind, nf.repo_prefix, nt.repo_prefix` + rows, err := s.db.Query(q) + panicOnFatal(err) + defer func() { _ = rows.Close() }() + // Aggregate keyed by the edge's OWN kind (cross_repo_*), NOT the base. 
+ // BaseKindForCrossRepo is used only as the recogniser that decides + // whether an edge participates — parity with the in-memory store. + type key struct { + kind graph.EdgeKind + from string + to string + } + acc := make(map[key]int) + for rows.Next() { + var kind, from, to string + var n int + panicOnFatal(rows.Scan(&kind, &from, &to, &n)) + ek := graph.EdgeKind(kind) + if _, ok := graph.BaseKindForCrossRepo(ek); !ok { + continue + } + acc[key{kind: ek, from: from, to: to}] += n + } + panicOnFatal(rows.Err()) + if len(acc) == 0 { + return nil + } + out := make([]graph.CrossRepoEdgeRow, 0, len(acc)) + for k, n := range acc { + out = append(out, graph.CrossRepoEdgeRow{Kind: k.kind, FromRepo: k.from, ToRepo: k.to, Count: n}) + } + return out +} + +// FileImporters returns the importing-node rows for every EdgeImports +// edge whose target's FilePath OR ID equals filePath. +func (s *Store) FileImporters(filePath string) []graph.FileImporterRow { + if filePath == "" { + return nil + } + q := `SELECT nf.file_path, nf.id, nf.name, nf.kind + FROM edges e + JOIN nodes nt ON e.to_id = nt.id + JOIN nodes nf ON e.from_id = nf.id + WHERE e.kind = ? AND (nt.file_path = ? OR nt.id = ?) + ORDER BY nf.file_path` + rows, err := s.db.Query(q, string(graph.EdgeImports), filePath, filePath) + panicOnFatal(err) + defer func() { _ = rows.Close() }() + var out []graph.FileImporterRow + for rows.Next() { + var r graph.FileImporterRow + var kind string + panicOnFatal(rows.Scan(&r.FromFile, &r.FromID, &r.FromName, &kind)) + r.FromKind = graph.NodeKind(kind) + out = append(out, r) + } + panicOnFatal(rows.Err()) + return out +} + +// FileSymbolNamesByPaths returns the distinct (file, name) pairs for +// nodes in the supplied paths whose kind is in the set, sorted by +// (file, name). 
+func (s *Store) FileSymbolNamesByPaths(paths []string, kinds []graph.NodeKind) []graph.FileSymbolNameRow {
+	uniqPaths := dedupeNonEmpty(paths)
+	_, kindArgs := aggDedupeNodeKinds(kinds)
+	if len(uniqPaths) == 0 || len(kindArgs) == 0 {
+		return nil
+	}
+	var out []graph.FileSymbolNameRow
+	// scanChunk handles one chunk of paths. It is a separate function so
+	// the cursor is closed by defer even when panicOnFatal panics
+	// mid-scan, instead of leaking the pool's connection.
+	scanChunk := func(chunk []string) {
+		args := append(toAnyArgs(chunk), kindArgs...)
+		q := `SELECT DISTINCT file_path, name FROM nodes WHERE file_path IN (` +
+			inPlaceholders(len(chunk)) + `) AND kind IN (` + inPlaceholders(len(kindArgs)) + `)`
+		rows, err := s.db.Query(q, args...)
+		panicOnFatal(err)
+		defer func() { _ = rows.Close() }()
+		for rows.Next() {
+			var r graph.FileSymbolNameRow
+			panicOnFatal(rows.Scan(&r.FilePath, &r.Name))
+			out = append(out, r)
+		}
+		panicOnFatal(rows.Err())
+	}
+	for i := 0; i < len(uniqPaths); i += lookupChunkSize {
+		scanChunk(uniqPaths[i:minInt(i+lookupChunkSize, len(uniqPaths))])
+	}
+	// Chunks arrive in input order, so sort once at the end.
+	sort.Slice(out, func(i, j int) bool {
+		if out[i].FilePath != out[j].FilePath {
+			return out[i].FilePath < out[j].FilePath
+		}
+		return out[i].Name < out[j].Name
+	})
+	return out
+}
+
+// EdgesByKinds yields every edge whose kind is in the supplied set;
+// honours early-stop. Empty kinds yields nothing. The result is fully
+// materialised by queryEdgesSQL before the first yield, so the yield
+// body may re-enter the store.
+func (s *Store) EdgesByKinds(kinds []graph.EdgeKind) iter.Seq[*graph.Edge] {
+	_, args := aggDedupeEdgeKinds(kinds)
+	return func(yield func(*graph.Edge) bool) {
+		if len(args) == 0 {
+			return
+		}
+		q := `SELECT ` + lookupEdgeCols + ` FROM edges WHERE kind IN (` +
+			inPlaceholders(len(args)) + `) ORDER BY id`
+		for _, e := range s.queryEdgesSQL(q, args...) {
+			if e == nil {
+				continue
+			}
+			if !yield(e) {
+				return
+			}
+		}
+	}
+}
+
+// NodesByKinds returns every node whose kind is in the supplied set,
+// ordered by id. Empty kinds returns nil.
+func (s *Store) NodesByKinds(kinds []graph.NodeKind) []*graph.Node {
+	_, args := aggDedupeNodeKinds(kinds)
+	if len(args) == 0 {
+		return nil
+	}
+	q := `SELECT ` + lookupNodeCols + ` FROM nodes WHERE kind IN (` +
+		inPlaceholders(len(args)) + `) ORDER BY id`
+	return s.queryNodesSQL(q, args...)
+}
+
+// EdgeAdjacencyForKinds yields (from, to) id pairs for edges whose
+// kind is in edgeKinds and whose endpoints both have a kind in
+// nodeKinds; honours early-stop. Empty kinds yields nothing.
+//
+// The pairs are materialised and the cursor closed BEFORE the first
+// yield. Yielding while the cursor is open would hold the connection
+// for the whole iteration, so a yield body that re-enters the store
+// would block on it.
+func (s *Store) EdgeAdjacencyForKinds(edgeKinds []graph.EdgeKind, nodeKinds []graph.NodeKind) iter.Seq[[2]string] {
+	_, eArgs := aggDedupeEdgeKinds(edgeKinds)
+	_, nArgs := aggDedupeNodeKinds(nodeKinds)
+	return func(yield func([2]string) bool) {
+		if len(eArgs) == 0 || len(nArgs) == 0 {
+			return
+		}
+		// Bind order follows placeholder order: edge kinds, then the
+		// node kinds twice (source endpoint, target endpoint).
+		args := append([]any(nil), eArgs...)
+		args = append(args, nArgs...)
+		args = append(args, nArgs...)
+		q := `SELECT e.from_id, e.to_id
+			FROM edges e
+			JOIN nodes nf ON e.from_id = nf.id
+			JOIN nodes nt ON e.to_id = nt.id
+			WHERE e.kind IN (` + inPlaceholders(len(eArgs)) + `)
+			  AND nf.kind IN (` + inPlaceholders(len(nArgs)) + `)
+			  AND nt.kind IN (` + inPlaceholders(len(nArgs)) + `)`
+		pairs := func() [][2]string {
+			rows, err := s.db.Query(q, args...)
+			panicOnFatal(err)
+			defer func() { _ = rows.Close() }()
+			var acc [][2]string
+			for rows.Next() {
+				var from, to string
+				panicOnFatal(rows.Scan(&from, &to))
+				acc = append(acc, [2]string{from, to})
+			}
+			panicOnFatal(rows.Err())
+			return acc
+		}()
+		for _, p := range pairs {
+			if !yield(p) {
+				return
+			}
+		}
+	}
+}
+
+// NodeDegreeCounts returns per-node in/out/usage-in edge counts for the
+// supplied id set. Unknown ids produce no row; duplicates collapse.
+// UsageInCount counts incoming edges whose kind is in usageKinds and is
+// 0 when usageKinds dedupes to nothing.
+func (s *Store) NodeDegreeCounts(ids []string, usageKinds []graph.EdgeKind) []graph.NodeDegreeRow {
+	uniq := dedupeNonEmpty(ids)
+	if len(uniq) == 0 {
+		return nil
+	}
+	_, usageArgs := aggDedupeEdgeKinds(usageKinds)
+	// Usage-in subquery: a literal 0 when no usage kinds are given.
+	// It does not depend on the chunk, so build it once.
+	usageExpr := `0`
+	if len(usageArgs) > 0 {
+		usageExpr = `(SELECT COUNT(*) FROM edges e WHERE e.to_id = n.id AND e.kind IN (` +
+			inPlaceholders(len(usageArgs)) + `))`
+	}
+	out := make([]graph.NodeDegreeRow, 0, len(uniq))
+	// scanChunk handles one chunk of ids; the deferred Close runs even
+	// when panicOnFatal panics mid-scan, so the cursor never leaks.
+	scanChunk := func(chunk []string) {
+		q := `SELECT n.id,
+			(SELECT COUNT(*) FROM edges e WHERE e.to_id = n.id) AS in_count,
+			(SELECT COUNT(*) FROM edges e WHERE e.from_id = n.id) AS out_count,
+			` + usageExpr + ` AS usage_in
+		FROM nodes n
+		WHERE n.id IN (` + inPlaceholders(len(chunk)) + `)`
+		// Bind order matches placeholder order: usage subquery first
+		// (it appears earlier in the SELECT list), then the id IN-list.
+		args := append(append([]any(nil), usageArgs...), toAnyArgs(chunk)...)
+		rows, err := s.db.Query(q, args...)
+		panicOnFatal(err)
+		defer func() { _ = rows.Close() }()
+		for rows.Next() {
+			var r graph.NodeDegreeRow
+			panicOnFatal(rows.Scan(&r.NodeID, &r.InCount, &r.OutCount, &r.UsageInCount))
+			out = append(out, r)
+		}
+		panicOnFatal(rows.Err())
+	}
+	for i := 0; i < len(uniq); i += lookupChunkSize {
+		scanChunk(uniq[i:minInt(i+lookupChunkSize, len(uniq))])
+	}
+	return out
+}
+
+// NodeFanCounts returns per-node fan-in (incoming edges in fanInKinds)
+// and fan-out (outgoing edges in fanOutKinds) for the supplied id set.
+// Unknown ids produce no row; duplicates collapse.
+func (s *Store) NodeFanCounts(ids []string, fanInKinds, fanOutKinds []graph.EdgeKind) []graph.NodeFanRow { + uniq := dedupeNonEmpty(ids) + if len(uniq) == 0 { + return nil + } + _, inArgs := aggDedupeEdgeKinds(fanInKinds) + _, outArgs := aggDedupeEdgeKinds(fanOutKinds) + out := make([]graph.NodeFanRow, 0, len(uniq)) + for i := 0; i < len(uniq); i += lookupChunkSize { + end := minInt(i+lookupChunkSize, len(uniq)) + chunk := uniq[i:end] + + fanInExpr := `0` + var inInline []any + if len(inArgs) > 0 { + fanInExpr = `(SELECT COUNT(*) FROM edges e WHERE e.to_id = n.id AND e.kind IN (` + + inPlaceholders(len(inArgs)) + `))` + inInline = inArgs + } + fanOutExpr := `0` + var outInline []any + if len(outArgs) > 0 { + fanOutExpr = `(SELECT COUNT(*) FROM edges e WHERE e.from_id = n.id AND e.kind IN (` + + inPlaceholders(len(outArgs)) + `))` + outInline = outArgs + } + q := `SELECT n.id, ` + fanInExpr + ` AS fan_in, ` + fanOutExpr + ` AS fan_out + FROM nodes n + WHERE n.id IN (` + inPlaceholders(len(chunk)) + `)` + // Bind order matches placeholder order in the SELECT list: + // fan-in subquery, fan-out subquery, then the id IN-list. + args := append([]any(nil), inInline...) + args = append(args, outInline...) + args = append(args, toAnyArgs(chunk)...) + rows, err := s.db.Query(q, args...) + panicOnFatal(err) + for rows.Next() { + var r graph.NodeFanRow + panicOnFatal(rows.Scan(&r.NodeID, &r.FanIn, &r.FanOut)) + out = append(out, r) + } + panicOnFatal(rows.Err()) + _ = rows.Close() + } + return out +} + +// CommunityCrossingsByKind returns per-source crossing counts for edges +// whose kind is in the supplied set, given a node→community map. A +// crossing is an edge whose source community differs from its target +// community; zero-count sources are dropped. Empty kinds or empty +// community map returns nil. The community comparison runs Go-side +// because community membership is not a node column. 
+func (s *Store) CommunityCrossingsByKind(kinds []graph.EdgeKind, nodeToComm map[string]string) map[string]int {
+	_, kindBinds := aggDedupeEdgeKinds(kinds)
+	if len(kindBinds) == 0 || len(nodeToComm) == 0 {
+		return nil
+	}
+	query := `SELECT from_id, to_id FROM edges WHERE kind IN (` + inPlaceholders(len(kindBinds)) + `)`
+	rows, err := s.db.Query(query, kindBinds...)
+	panicOnFatal(err)
+	defer func() { _ = rows.Close() }()
+
+	crossings := make(map[string]int)
+	for rows.Next() {
+		var src, dst string
+		panicOnFatal(rows.Scan(&src, &dst))
+		// An edge counts only when both endpoints have a community and
+		// the two communities differ.
+		srcComm, srcKnown := nodeToComm[src]
+		dstComm, dstKnown := nodeToComm[dst]
+		if srcKnown && dstKnown && srcComm != dstComm {
+			crossings[src]++
+		}
+	}
+	panicOnFatal(rows.Err())
+
+	if len(crossings) == 0 {
+		return nil
+	}
+	return crossings
+}
diff --git a/internal/graph/store_sqlite/store_analysis.go b/internal/graph/store_sqlite/store_analysis.go
new file mode 100644
index 00000000..38be53f7
--- /dev/null
+++ b/internal/graph/store_sqlite/store_analysis.go
@@ -0,0 +1,500 @@
+package store_sqlite
+
+// This file implements the moderate-SQL analysis capability interfaces
+// for the SQLite graph.Store backend. Each method mirrors the in-memory
+// reference implementation in internal/graph/graph.go and is verified
+// against the same conformance suite (internal/graph/storetest).
+//
+// Shape: push the structural filter into one indexed SELECT via the raw-
+// SQL helpers (queryNodesSQL / s.db.Query), then do any Meta-dependent
+// (gob-decoded) or distinct-counting filtering in Go. No new prepared
+// statements are added — every query rides the secondary indexes already
+// created in schema.go (edges_by_from / edges_by_to / nodes_by_kind).
+
+import (
+	"github.com/zzet/gortex/internal/graph"
+)
+
+// Compile-time assertions: *Store satisfies each analysis capability.
+var _ graph.DeadCodeCandidator = (*Store)(nil) +var _ graph.IfaceImplementsScanner = (*Store)(nil) +var _ graph.MemberMethodsByType = (*Store)(nil) +var _ graph.StructuralParentEdges = (*Store)(nil) +var _ graph.ExtractCandidatesScanner = (*Store)(nil) +var _ graph.CrossRepoCandidates = (*Store)(nil) +var _ graph.ThrowerErrorSurfacer = (*Store)(nil) + +// anaDedupeEdgeKinds drops empty / duplicate edge kinds, preserving +// first-seen order — the EdgeKind twin of dedupeNonEmpty. +func anaDedupeEdgeKinds(in []graph.EdgeKind) []graph.EdgeKind { + seen := make(map[graph.EdgeKind]struct{}, len(in)) + out := make([]graph.EdgeKind, 0, len(in)) + for _, k := range in { + if k == "" { + continue + } + if _, ok := seen[k]; ok { + continue + } + seen[k] = struct{}{} + out = append(out, k) + } + return out +} + +// --- DeadCodeCandidator ------------------------------------------------- + +// DeadCodeCandidates returns nodes of the allowed kinds that have no +// incoming edge of the corresponding allowed in-edge kinds. An empty +// per-kind allowlist (or one that dedupes to nothing) means "any incoming +// edge counts as usage". Mirrors graph.(*Graph).DeadCodeCandidates: the +// candidate set is purely structural (the analysis layer applies the +// exported / test / entry-point / synthetic post-filters in Go), so no +// node-id exclusion happens here. The NOT-EXISTS filter runs server-side +// per node kind. +func (s *Store) DeadCodeCandidates(allowedNodeKinds []graph.NodeKind, allowedInEdgeKinds map[graph.NodeKind][]graph.EdgeKind) []*graph.Node { + if len(allowedNodeKinds) == 0 { + return nil + } + var out []*graph.Node + for _, nk := range allowedNodeKinds { + allowed := anaDedupeEdgeKinds(allowedInEdgeKinds[nk]) + anyKindCounts := len(allowed) == 0 + + var q string + var args []any + if anyKindCounts { + // Any incoming edge disqualifies the node. + q = `SELECT ` + lookupNodeCols + ` FROM nodes n +WHERE n.kind = ? 
+ AND NOT EXISTS (SELECT 1 FROM edges e WHERE e.to_id = n.id) +ORDER BY n.id` + args = []any{string(nk)} + } else { + // Only an incoming edge of one of the allowed kinds counts. + q = `SELECT ` + lookupNodeCols + ` FROM nodes n +WHERE n.kind = ? + AND NOT EXISTS (SELECT 1 FROM edges e WHERE e.to_id = n.id AND e.kind IN (` + inPlaceholders(len(allowed)) + `)) +ORDER BY n.id` + args = make([]any, 0, 1+len(allowed)) + args = append(args, string(nk)) + for _, ek := range allowed { + args = append(args, string(ek)) + } + } + + for _, n := range s.queryNodesSQL(q, args...) { + if n != nil { + out = append(out, n) + } + } + } + return out +} + +// --- IfaceImplementsScanner --------------------------------------------- + +// IfaceImplementsRows returns one row per EdgeImplements edge whose +// target is a KindInterface carrying Meta["methods"]. The interface's +// decoded Meta rides on the row (callers pull the "methods" field, which +// gob round-trips as []string or []any). Interfaces with no Meta or no +// "methods" key are elided server-side. +func (s *Store) IfaceImplementsRows() []graph.IfaceImplementsRow { + q := `SELECT e.from_id, n.id, n.meta +FROM edges e +JOIN nodes n ON n.id = e.to_id +WHERE e.kind = ? AND n.kind = ? 
AND n.meta IS NOT NULL` + rows, err := s.db.Query(q, string(graph.EdgeImplements), string(graph.KindInterface)) + if err != nil { + return nil + } + defer func() { _ = rows.Close() }() + + var out []graph.IfaceImplementsRow + for rows.Next() { + var fromID, ifaceID string + var metaBlob []byte + if err := rows.Scan(&fromID, &ifaceID, &metaBlob); err != nil { + continue + } + meta, derr := decodeMeta(metaBlob) + if derr != nil || meta == nil { + continue + } + if _, ok := meta["methods"]; !ok { + continue + } + out = append(out, graph.IfaceImplementsRow{ + TypeID: fromID, + IfaceID: ifaceID, + IfaceMeta: meta, + }) + } + return out +} + +// --- MemberMethodsByType ------------------------------------------------ + +// MemberMethodsByType returns typeID → []MemberMethodInfo for every +// EdgeMemberOf edge whose source is a KindMethod. The columns come from +// the METHOD NODE (FilePath / StartLine / RepoPrefix), matching the +// in-memory reference. Per-type lists are deduplicated by MethodID; the +// scan is ordered by the edge PK so the first-seen winner is stable. An +// empty graph (no qualifying rows) returns nil. +func (s *Store) MemberMethodsByType() map[string][]graph.MemberMethodInfo { + q := `SELECT e.to_id, n.id, n.name, n.file_path, n.start_line, n.repo_prefix +FROM edges e +JOIN nodes n ON n.id = e.from_id +WHERE e.kind = ? AND n.kind = ? 
+ORDER BY e.id` + rows, err := s.db.Query(q, string(graph.EdgeMemberOf), string(graph.KindMethod)) + if err != nil { + return nil + } + defer func() { _ = rows.Close() }() + + out := make(map[string][]graph.MemberMethodInfo) + seen := make(map[string]map[string]struct{}) + for rows.Next() { + var typeID, methodID, name, filePath, repoPrefix string + var startLine int + if err := rows.Scan(&typeID, &methodID, &name, &filePath, &startLine, &repoPrefix); err != nil { + continue + } + if seen[typeID] == nil { + seen[typeID] = make(map[string]struct{}) + } + if _, ok := seen[typeID][methodID]; ok { + continue + } + seen[typeID][methodID] = struct{}{} + out[typeID] = append(out[typeID], graph.MemberMethodInfo{ + MethodID: methodID, + Name: name, + FilePath: filePath, + StartLine: startLine, + RepoPrefix: repoPrefix, + }) + } + if len(out) == 0 { + // Match the in-memory reference: empty graph returns nil. + return nil + } + return out +} + +// --- StructuralParentEdges ---------------------------------------------- + +// StructuralParentEdges returns every Extends / Implements / Composes +// edge whose endpoints are both Type / Interface, projected as (FromID, +// ToID, FromKind, ToKind, Origin). Endpoints that aren't both type / +// interface are filtered server-side. Empty graph or no matching edges +// returns nil. +func (s *Store) StructuralParentEdges() []graph.StructuralParentEdgeRow { + q := `SELECT e.from_id, e.to_id, nf.kind, nt.kind, e.origin +FROM edges e +JOIN nodes nf ON nf.id = e.from_id +JOIN nodes nt ON nt.id = e.to_id +WHERE e.kind IN (?,?,?) + AND nf.kind IN (?,?) AND nt.kind IN (?,?) 
+ORDER BY e.id` + rows, err := s.db.Query(q, + string(graph.EdgeExtends), string(graph.EdgeImplements), string(graph.EdgeComposes), + string(graph.KindType), string(graph.KindInterface), + string(graph.KindType), string(graph.KindInterface), + ) + if err != nil { + return nil + } + defer func() { _ = rows.Close() }() + + var out []graph.StructuralParentEdgeRow + for rows.Next() { + var fromID, toID, fromKind, toKind, origin string + if err := rows.Scan(&fromID, &toID, &fromKind, &toKind, &origin); err != nil { + continue + } + out = append(out, graph.StructuralParentEdgeRow{ + FromID: fromID, + ToID: toID, + FromKind: graph.NodeKind(fromKind), + ToKind: graph.NodeKind(toKind), + Origin: origin, + }) + } + return out +} + +// --- ExtractCandidatesScanner ------------------------------------------- + +// ExtractCandidates ranks function / method nodes by extractability: line +// span (EndLine - StartLine + 1), distinct caller fan-in, and distinct +// callee fan-out, counting only edges whose kind is in the supplied set. +// Rows must clear all three thresholds. Nodes with a zero StartLine / +// EndLine are dropped; pathPrefix narrows by file-path prefix. Mirrors +// graph.(*Graph).ExtractCandidates exactly: only KindFunction + +// KindMethod nodes are considered, and the distinct-by-endpoint counting +// runs Go-side over GetInEdges / GetOutEdges. +func (s *Store) ExtractCandidates(kinds []graph.EdgeKind, minLines, minCallers, minFanOut int, pathPrefix string) []graph.ExtractCandidateRow { + if len(kinds) == 0 { + return nil + } + kindSet := make(map[graph.EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + kindSet[k] = struct{}{} + } + if len(kindSet) == 0 { + return nil + } + + // Candidate nodes: function / method only, non-zero line span, + // optional path-prefix gate. + q := `SELECT ` + lookupNodeCols + ` FROM nodes +WHERE kind IN (?,?) 
AND start_line > 0 AND end_line > 0` + args := []any{string(graph.KindFunction), string(graph.KindMethod)} + if pathPrefix != "" { + q += ` AND file_path LIKE ? ESCAPE '\'` + args = append(args, escapeLikePattern(pathPrefix)+"%") + } + q += ` ORDER BY id` + nodes := s.queryNodesSQL(q, args...) + + var out []graph.ExtractCandidateRow + for _, n := range nodes { + if n == nil { + continue + } + lineCount := n.EndLine - n.StartLine + 1 + if lineCount < minLines { + continue + } + + callerSet := make(map[string]struct{}) + for _, e := range s.GetInEdges(n.ID) { + if e == nil { + continue + } + if _, ok := kindSet[e.Kind]; !ok { + continue + } + callerSet[e.From] = struct{}{} + } + if len(callerSet) < minCallers { + continue + } + + calleeSet := make(map[string]struct{}) + for _, e := range s.GetOutEdges(n.ID) { + if e == nil { + continue + } + if _, ok := kindSet[e.Kind]; !ok { + continue + } + calleeSet[e.To] = struct{}{} + } + if len(calleeSet) < minFanOut { + continue + } + + out = append(out, graph.ExtractCandidateRow{ + NodeID: n.ID, + Name: n.Name, + FilePath: n.FilePath, + StartLine: n.StartLine, + EndLine: n.EndLine, + LineCount: lineCount, + CallerCount: len(callerSet), + FanOut: len(calleeSet), + }) + } + return out +} + +// --- CrossRepoCandidates ------------------------------------------------ + +// CrossRepoCandidates returns every edge whose kind is in baseKinds and +// whose endpoints carry two different non-empty RepoPrefix values. The +// edge is returned verbatim (callers rewrite Edge.CrossRepo); FromRepo / +// ToRepo are the endpoint prefixes. Empty baseKinds returns nil; single- +// repo graphs (or graphs whose nodes carry no RepoPrefix) yield nothing. 
+func (s *Store) CrossRepoCandidates(baseKinds []graph.EdgeKind) []graph.CrossRepoCandidateRow { + uniq := anaDedupeEdgeKinds(baseKinds) + if len(uniq) == 0 { + return nil + } + args := make([]any, 0, len(uniq)) + for _, k := range uniq { + args = append(args, string(k)) + } + q := `SELECT e.from_id, e.to_id, e.kind, e.file_path, e.line, + e.confidence, e.confidence_label, e.origin, e.tier, e.cross_repo, e.meta, + nf.repo_prefix, nt.repo_prefix +FROM edges e +JOIN nodes nf ON nf.id = e.from_id +JOIN nodes nt ON nt.id = e.to_id +WHERE e.kind IN (` + inPlaceholders(len(uniq)) + `) + AND nf.repo_prefix <> '' AND nt.repo_prefix <> '' + AND nf.repo_prefix <> nt.repo_prefix +ORDER BY e.id` + rows, err := s.db.Query(q, args...) + if err != nil { + return nil + } + defer func() { _ = rows.Close() }() + + var out []graph.CrossRepoCandidateRow + for rows.Next() { + var ( + fromRepo, toRepo string + e graph.Edge + metaBlob []byte + crossRepo int64 + ) + if err := rows.Scan( + &e.From, &e.To, &e.Kind, &e.FilePath, &e.Line, + &e.Confidence, &e.ConfidenceLabel, &e.Origin, &e.Tier, + &crossRepo, &metaBlob, + &fromRepo, &toRepo, + ); err != nil { + continue + } + e.CrossRepo = crossRepo != 0 + if len(metaBlob) > 0 { + if m, derr := decodeMeta(metaBlob); derr == nil { + e.Meta = m + } + } + edge := e + out = append(out, graph.CrossRepoCandidateRow{ + Edge: &edge, + FromRepo: fromRepo, + ToRepo: toRepo, + }) + } + return out +} + +// --- ThrowerErrorSurfacer ----------------------------------------------- + +// ThrowerErrorSurface returns one row per thrower (a node with outgoing +// EdgeThrows edges), aggregating the distinct error targets and the +// distinct literal error-message strings it emits (KindString nodes with +// Meta["context"] == "error_msg", linked by EdgeEmits). pathPrefix gates +// the EdgeThrows rows by their stored FilePath prefix. 
Throws counts the +// underlying EdgeThrows edges; FilePath / Line seed from the first throws +// edge, falling back to the thrower node's own coordinates when the edge +// carries none — matching the in-memory reference. +func (s *Store) ThrowerErrorSurface(pathPrefix string) []graph.ThrowerErrorRow { + type rowAccum struct { + row graph.ThrowerErrorRow + targetSeen map[string]struct{} + msgSeen map[string]struct{} + } + accums := make(map[string]*rowAccum) + var order []string + + // Pass 1: EdgeThrows aggregation (count + distinct targets), keyed by + // thrower. The first edge (by PK insertion order) seeds FilePath / + // Line; an empty edge file/line falls back to the thrower node. + tq := `SELECT from_id, to_id, file_path, line FROM edges WHERE kind = ?` + targs := []any{string(graph.EdgeThrows)} + if pathPrefix != "" { + tq += ` AND file_path LIKE ? ESCAPE '\'` + targs = append(targs, escapeLikePattern(pathPrefix)+"%") + } + tq += ` ORDER BY id` + trows, err := s.db.Query(tq, targs...) + if err != nil { + return nil + } + for trows.Next() { + var from, to, filePath string + var line int + if err := trows.Scan(&from, &to, &filePath, &line); err != nil { + continue + } + acc := accums[from] + if acc == nil { + file := filePath + ln := line + if file == "" || ln == 0 { + if n := s.GetNode(from); n != nil { + if file == "" { + file = n.FilePath + } + if ln == 0 { + ln = n.StartLine + } + } + } + acc = &rowAccum{ + row: graph.ThrowerErrorRow{ + ThrowerID: from, + FilePath: file, + Line: ln, + }, + targetSeen: make(map[string]struct{}), + msgSeen: make(map[string]struct{}), + } + accums[from] = acc + order = append(order, from) + } + acc.row.Throws++ + if _, ok := acc.targetSeen[to]; !ok { + acc.targetSeen[to] = struct{}{} + acc.row.ErrorTargets = append(acc.row.ErrorTargets, to) + } + } + _ = trows.Close() + if len(accums) == 0 { + return nil + } + + // Pass 2: attach the literal error messages each thrower emits. 
Join + // each thrower's EdgeEmits out-edges to KindString targets and filter + // Meta["context"] == "error_msg" Go-side (the context lives in the + // gob-encoded Meta blob). + for _, id := range order { + acc := accums[id] + mq := `SELECT n.name, n.meta +FROM edges e +JOIN nodes n ON n.id = e.to_id +WHERE e.from_id = ? AND e.kind = ? AND n.kind = ? AND n.meta IS NOT NULL +ORDER BY e.id` + mrows, err := s.db.Query(mq, id, string(graph.EdgeEmits), string(graph.KindString)) + if err != nil { + continue + } + for mrows.Next() { + var name string + var metaBlob []byte + if err := mrows.Scan(&name, &metaBlob); err != nil { + continue + } + meta, derr := decodeMeta(metaBlob) + if derr != nil || meta == nil { + continue + } + ctxLabel, _ := meta["context"].(string) + if ctxLabel != "error_msg" { + continue + } + if _, ok := acc.msgSeen[name]; ok { + continue + } + acc.msgSeen[name] = struct{}{} + acc.row.ErrorMsgs = append(acc.row.ErrorMsgs, name) + } + _ = mrows.Close() + } + + out := make([]graph.ThrowerErrorRow, 0, len(order)) + for _, id := range order { + out = append(out, accums[id].row) + } + return out +} diff --git a/internal/graph/store_sqlite/store_blame_enrichment.go b/internal/graph/store_sqlite/store_blame_enrichment.go new file mode 100644 index 00000000..5828f279 --- /dev/null +++ b/internal/graph/store_sqlite/store_blame_enrichment.go @@ -0,0 +1,130 @@ +package store_sqlite + +import ( + "database/sql" + + "github.com/zzet/gortex/internal/graph" +) + +var ( + _ graph.BlameEnrichmentWriter = (*Store)(nil) + _ graph.BlameEnrichmentReader = (*Store)(nil) +) + +const blameChunk = 180 + +const blameCols = `node_id, repo_prefix, commit_sha, email, ts` + +func (s *Store) BulkSetBlame(repoPrefix string, rows []graph.BlameEnrichment) error { + if len(rows) == 0 { + return nil + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + tx, err := s.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() //nolint:errcheck + for start := 0; start < len(rows); 
start += blameChunk { + end := start + blameChunk + if end > len(rows) { + end = len(rows) + } + batch := rows[start:end] + args := make([]any, 0, len(batch)*5) + stmt := make([]byte, 0, 96+len(batch)*16) + stmt = append(stmt, "INSERT OR REPLACE INTO blame_enrichment ("...) + stmt = append(stmt, blameCols...) + stmt = append(stmt, ") VALUES "...) + for i, e := range batch { + if i > 0 { + stmt = append(stmt, ',') + } + stmt = append(stmt, "(?,?,?,?,?)"...) + args = append(args, e.NodeID, repoPrefix, e.Commit, e.Email, e.Timestamp) + } + if _, err := tx.Exec(string(stmt), args...); err != nil { + return err + } + } + return tx.Commit() +} + +func (s *Store) DeleteBlame(nodeIDs []string) error { + if len(nodeIDs) == 0 { + return nil + } + seen := make(map[string]struct{}, len(nodeIDs)) + uniq := make([]string, 0, len(nodeIDs)) + for _, id := range nodeIDs { + if id == "" { + continue + } + if _, ok := seen[id]; ok { + continue + } + seen[id] = struct{}{} + uniq = append(uniq, id) + } + if len(uniq) == 0 { + return nil + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + tx, err := s.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() //nolint:errcheck + for start := 0; start < len(uniq); start += blameChunk { + end := start + blameChunk + if end > len(uniq) { + end = len(uniq) + } + chunk := uniq[start:end] + args := make([]any, len(chunk)) + stmt := make([]byte, 0, 56+len(chunk)*2) + stmt = append(stmt, "DELETE FROM blame_enrichment WHERE node_id IN ("...) 
+ for i, id := range chunk { + if i > 0 { + stmt = append(stmt, ',') + } + stmt = append(stmt, '?') + args[i] = id + } + stmt = append(stmt, ')') + if _, err := tx.Exec(string(stmt), args...); err != nil { + return err + } + } + return tx.Commit() +} + +func (s *Store) BlameRows(repoPrefix string) []graph.BlameEnrichment { + var ( + rows *sql.Rows + err error + ) + if repoPrefix == "" { + rows, err = s.db.Query(`SELECT ` + blameCols + ` FROM blame_enrichment`) + } else { + rows, err = s.db.Query(`SELECT `+blameCols+` FROM blame_enrichment WHERE repo_prefix = ?`, repoPrefix) + } + if err != nil { + return nil + } + defer func() { _ = rows.Close() }() + var out []graph.BlameEnrichment + for rows.Next() { + var e graph.BlameEnrichment + if err := rows.Scan(&e.NodeID, &e.RepoPrefix, &e.Commit, &e.Email, &e.Timestamp); err != nil { + return out + } + out = append(out, e) + } + if err := rows.Err(); err != nil { + return out + } + return out +} diff --git a/internal/graph/store_sqlite/store_churn_enrichment.go b/internal/graph/store_sqlite/store_churn_enrichment.go new file mode 100644 index 00000000..72ccd0ea --- /dev/null +++ b/internal/graph/store_sqlite/store_churn_enrichment.go @@ -0,0 +1,155 @@ +package store_sqlite + +import ( + "database/sql" + + "github.com/zzet/gortex/internal/graph" +) + +// Compile-time assertions that the SQLite Store satisfies the optional +// git-churn enrichment sidecar capabilities (change A: enrichment moved +// out of nodes.meta into a typed table so the node hot path stops +// gob-encoding rarely-read data and get_churn_rate reads via an index +// instead of an AllNodes scan). +var ( + _ graph.ChurnEnrichmentWriter = (*Store)(nil) + _ graph.ChurnEnrichmentReader = (*Store)(nil) +) + +// churnChunk bounds rows per multi-row INSERT. churn_enrichment has 10 +// columns, so at 10 params/row the 999 host-param limit caps a statement +// at 99 rows; 90 leaves headroom. Mirrors shingleChunk / mtimeChunk. 
+const churnChunk = 90 + +const churnCols = `node_id, repo_prefix, commit_count, age_days, churn_rate, last_author, last_commit_at, head_sha, branch, computed_at` + +// BulkSetChurn persists every churn row for one repo prefix in a single +// transaction, chunked under the host-parameter limit. Idempotent on +// node_id (INSERT OR REPLACE). Empty input is a no-op. +func (s *Store) BulkSetChurn(repoPrefix string, rows []graph.ChurnEnrichment) error { + if len(rows) == 0 { + return nil + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + + tx, err := s.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() //nolint:errcheck // rollback after Commit is a no-op + + for start := 0; start < len(rows); start += churnChunk { + end := start + churnChunk + if end > len(rows) { + end = len(rows) + } + batch := rows[start:end] + args := make([]any, 0, len(batch)*10) + stmt := make([]byte, 0, 128+len(batch)*24) + stmt = append(stmt, "INSERT OR REPLACE INTO churn_enrichment ("...) + stmt = append(stmt, churnCols...) + stmt = append(stmt, ") VALUES "...) + for i, e := range batch { + if i > 0 { + stmt = append(stmt, ',') + } + stmt = append(stmt, "(?,?,?,?,?,?,?,?,?,?)"...) + args = append(args, e.NodeID, repoPrefix, e.CommitCount, e.AgeDays, + e.ChurnRate, e.LastAuthor, e.LastCommitAt, e.HeadSHA, e.Branch, e.ComputedAt) + } + if _, err := tx.Exec(string(stmt), args...); err != nil { + return err + } + } + return tx.Commit() +} + +// DeleteChurn drops churn rows for the supplied node ids, chunked into +// `node_id IN (?, …)` DELETEs. Empty input is a no-op. 
+func (s *Store) DeleteChurn(nodeIDs []string) error { + if len(nodeIDs) == 0 { + return nil + } + seen := make(map[string]struct{}, len(nodeIDs)) + uniq := make([]string, 0, len(nodeIDs)) + for _, id := range nodeIDs { + if id == "" { + continue + } + if _, ok := seen[id]; ok { + continue + } + seen[id] = struct{}{} + uniq = append(uniq, id) + } + if len(uniq) == 0 { + return nil + } + + s.writeMu.Lock() + defer s.writeMu.Unlock() + + tx, err := s.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() //nolint:errcheck + + for start := 0; start < len(uniq); start += churnChunk { + end := start + churnChunk + if end > len(uniq) { + end = len(uniq) + } + chunk := uniq[start:end] + args := make([]any, len(chunk)) + stmt := make([]byte, 0, 48+len(chunk)*2) + stmt = append(stmt, "DELETE FROM churn_enrichment WHERE node_id IN ("...) + for i, id := range chunk { + if i > 0 { + stmt = append(stmt, ',') + } + stmt = append(stmt, '?') + args[i] = id + } + stmt = append(stmt, ')') + if _, err := tx.Exec(string(stmt), args...); err != nil { + return err + } + } + return tx.Commit() +} + +// ChurnRows returns every churn row for repoPrefix; an EMPTY repoPrefix +// returns ALL rows across repos. This is an index-only read over the +// (small) enriched set — the whole point of the sidecar, replacing the +// AllNodes()+gob-decode scan get_churn_rate used to do. 
+func (s *Store) ChurnRows(repoPrefix string) []graph.ChurnEnrichment { + var ( + rows *sql.Rows + err error + ) + if repoPrefix == "" { + rows, err = s.db.Query(`SELECT ` + churnCols + ` FROM churn_enrichment`) + } else { + rows, err = s.db.Query(`SELECT `+churnCols+` FROM churn_enrichment WHERE repo_prefix = ?`, repoPrefix) + } + if err != nil { + return nil + } + defer func() { _ = rows.Close() }() + + var out []graph.ChurnEnrichment + for rows.Next() { + var e graph.ChurnEnrichment + if err := rows.Scan(&e.NodeID, &e.RepoPrefix, &e.CommitCount, &e.AgeDays, + &e.ChurnRate, &e.LastAuthor, &e.LastCommitAt, &e.HeadSHA, &e.Branch, &e.ComputedAt); err != nil { + return out + } + out = append(out, e) + } + if err := rows.Err(); err != nil { + return out + } + return out +} diff --git a/internal/graph/store_sqlite/store_clone_shingles.go b/internal/graph/store_sqlite/store_clone_shingles.go new file mode 100644 index 00000000..e19c2588 --- /dev/null +++ b/internal/graph/store_sqlite/store_clone_shingles.go @@ -0,0 +1,192 @@ +package store_sqlite + +import ( + "encoding/binary" + + "github.com/zzet/gortex/internal/graph" +) + +// Compile-time assertions that the SQLite Store satisfies the optional +// per-symbol clone-shingle persistence capabilities. Lifting this state +// into the same backend the graph lives in means warm restarts rebuild +// the clone-detection CMS through one persistence surface instead of a +// second gob snapshot. +var ( + _ graph.CloneShingleWriter = (*Store)(nil) + _ graph.CloneShingleReader = (*Store)(nil) +) + +// shingleChunk bounds how many (node_id, repo_prefix, shingles) tuples +// ride in a single multi-row INSERT. SQLite's default compiled-in host +// parameter limit is 999; at 3 params per row that caps a statement at +// 333 rows, so 300 leaves headroom. Mirrors mtimeChunk. +const shingleChunk = 300 + +// encodeShingles serialises a uint64 slice to a little-endian BLOB +// (8 bytes per element). 
// decodeShingles reverses encodeShingles: it reads b as consecutive
// little-endian 8-byte words and returns them as uint64 values. A length
// that is not a multiple of 8 marks the row as corrupt and yields nil,
// which callers skip. An empty (or nil) input yields an empty, non-nil
// slice.
func decodeShingles(b []byte) []uint64 {
	const wordSize = 8
	if len(b)%wordSize != 0 {
		return nil
	}
	decoded := make([]uint64, 0, len(b)/wordSize)
	for rest := b; len(rest) > 0; rest = rest[wordSize:] {
		decoded = append(decoded, binary.LittleEndian.Uint64(rest))
	}
	return decoded
}
+ for i, e := range batch { + if i > 0 { + stmt = append(stmt, ',') + } + stmt = append(stmt, "(?, ?, ?)"...) + args = append(args, e.id, repoPrefix, e.blob) + } + if _, err := tx.Exec(string(stmt), args...); err != nil { + return err + } + } + + return tx.Commit() +} + +// DeleteCloneShingles drops the rows for the supplied node ids, chunked +// into `node_id IN (?, ?, …)` DELETEs so no statement exceeds SQLite's +// host-parameter limit. Empty input is a no-op; missing ids are simply +// not deleted. +func (s *Store) DeleteCloneShingles(nodeIDs []string) error { + if len(nodeIDs) == 0 { + return nil + } + + // Dedupe + skip empty up front to keep the chunk loop honest. + seen := make(map[string]struct{}, len(nodeIDs)) + uniq := make([]string, 0, len(nodeIDs)) + for _, id := range nodeIDs { + if id == "" { + continue + } + if _, ok := seen[id]; ok { + continue + } + seen[id] = struct{}{} + uniq = append(uniq, id) + } + if len(uniq) == 0 { + return nil + } + + s.writeMu.Lock() + defer s.writeMu.Unlock() + + tx, err := s.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() //nolint:errcheck // rollback after Commit is a no-op + + for start := 0; start < len(uniq); start += shingleChunk { + end := start + shingleChunk + if end > len(uniq) { + end = len(uniq) + } + chunk := uniq[start:end] + args := make([]any, len(chunk)) + stmt := make([]byte, 0, 48+len(chunk)*2) + stmt = append(stmt, "DELETE FROM clone_shingles WHERE node_id IN ("...) + for i, id := range chunk { + if i > 0 { + stmt = append(stmt, ',') + } + stmt = append(stmt, '?') + args[i] = id + } + stmt = append(stmt, ')') + if _, err := tx.Exec(string(stmt), args...); err != nil { + return err + } + } + + return tx.Commit() +} + +// LoadCloneShingles returns the recorded shingle sets for one repo +// prefix as a fresh map. It always returns a non-nil (possibly empty) +// map and surfaces any query error. An empty/absent prefix yields an +// empty map, not an error. 
+func (s *Store) LoadCloneShingles(repoPrefix string) (map[string][]uint64, error) { + rows, err := s.db.Query( + `SELECT node_id, shingles FROM clone_shingles WHERE repo_prefix = ?`, + repoPrefix, + ) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + out := make(map[string][]uint64) + for rows.Next() { + var id string + var blob []byte + if err := rows.Scan(&id, &blob); err != nil { + return nil, err + } + out[id] = decodeShingles(blob) + } + if err := rows.Err(); err != nil { + return nil, err + } + return out, nil +} diff --git a/internal/graph/store_sqlite/store_coverage_enrichment.go b/internal/graph/store_sqlite/store_coverage_enrichment.go new file mode 100644 index 00000000..74edd7d1 --- /dev/null +++ b/internal/graph/store_sqlite/store_coverage_enrichment.go @@ -0,0 +1,143 @@ +package store_sqlite + +import ( + "database/sql" + + "github.com/zzet/gortex/internal/graph" +) + +var ( + _ graph.CoverageEnrichmentWriter = (*Store)(nil) + _ graph.CoverageEnrichmentReader = (*Store)(nil) +) + +// coverageChunk bounds rows per multi-row INSERT (5 cols → 5 params/row; +// 999/5 ≈ 199 max, 180 leaves headroom). +const coverageChunk = 180 + +const coverageCols = `node_id, repo_prefix, coverage_pct, num_stmt, hit` + +// BulkSetCoverage persists coverage rows for one repo prefix in a single +// chunked transaction. Idempotent on node_id. Empty input is a no-op. 
+func (s *Store) BulkSetCoverage(repoPrefix string, rows []graph.CoverageEnrichment) error { + if len(rows) == 0 { + return nil + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + + tx, err := s.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() //nolint:errcheck + + for start := 0; start < len(rows); start += coverageChunk { + end := start + coverageChunk + if end > len(rows) { + end = len(rows) + } + batch := rows[start:end] + args := make([]any, 0, len(batch)*5) + stmt := make([]byte, 0, 96+len(batch)*16) + stmt = append(stmt, "INSERT OR REPLACE INTO coverage_enrichment ("...) + stmt = append(stmt, coverageCols...) + stmt = append(stmt, ") VALUES "...) + for i, e := range batch { + if i > 0 { + stmt = append(stmt, ',') + } + stmt = append(stmt, "(?,?,?,?,?)"...) + args = append(args, e.NodeID, repoPrefix, e.CoveragePct, e.NumStmt, e.Hit) + } + if _, err := tx.Exec(string(stmt), args...); err != nil { + return err + } + } + return tx.Commit() +} + +// DeleteCoverage drops coverage rows for the supplied node ids, chunked. +func (s *Store) DeleteCoverage(nodeIDs []string) error { + if len(nodeIDs) == 0 { + return nil + } + seen := make(map[string]struct{}, len(nodeIDs)) + uniq := make([]string, 0, len(nodeIDs)) + for _, id := range nodeIDs { + if id == "" { + continue + } + if _, ok := seen[id]; ok { + continue + } + seen[id] = struct{}{} + uniq = append(uniq, id) + } + if len(uniq) == 0 { + return nil + } + + s.writeMu.Lock() + defer s.writeMu.Unlock() + + tx, err := s.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() //nolint:errcheck + + for start := 0; start < len(uniq); start += coverageChunk { + end := start + coverageChunk + if end > len(uniq) { + end = len(uniq) + } + chunk := uniq[start:end] + args := make([]any, len(chunk)) + stmt := make([]byte, 0, 56+len(chunk)*2) + stmt = append(stmt, "DELETE FROM coverage_enrichment WHERE node_id IN ("...) 
+ for i, id := range chunk { + if i > 0 { + stmt = append(stmt, ',') + } + stmt = append(stmt, '?') + args[i] = id + } + stmt = append(stmt, ')') + if _, err := tx.Exec(string(stmt), args...); err != nil { + return err + } + } + return tx.Commit() +} + +// CoverageRows returns coverage rows for repoPrefix; empty repoPrefix +// returns ALL rows across repos. Index-only read over the enriched set. +func (s *Store) CoverageRows(repoPrefix string) []graph.CoverageEnrichment { + var ( + rows *sql.Rows + err error + ) + if repoPrefix == "" { + rows, err = s.db.Query(`SELECT ` + coverageCols + ` FROM coverage_enrichment`) + } else { + rows, err = s.db.Query(`SELECT `+coverageCols+` FROM coverage_enrichment WHERE repo_prefix = ?`, repoPrefix) + } + if err != nil { + return nil + } + defer func() { _ = rows.Close() }() + + var out []graph.CoverageEnrichment + for rows.Next() { + var e graph.CoverageEnrichment + if err := rows.Scan(&e.NodeID, &e.RepoPrefix, &e.CoveragePct, &e.NumStmt, &e.Hit); err != nil { + return out + } + out = append(out, e) + } + if err := rows.Err(); err != nil { + return out + } + return out +} diff --git a/internal/graph/store_sqlite/store_fts.go b/internal/graph/store_sqlite/store_fts.go new file mode 100644 index 00000000..6048b4e7 --- /dev/null +++ b/internal/graph/store_sqlite/store_fts.go @@ -0,0 +1,369 @@ +package store_sqlite + +import ( + "strings" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/search" +) + +// This file implements graph.SymbolSearcher + graph.SymbolBundleSearcher +// on the SQLite backend using the FTS5 virtual table declared in +// schema.go (symbol_fts). It is the on-disk replacement for the +// multi-GB in-heap Bleve/BM25 index: the FTS5 inverted index lives in +// the same .sqlite file as the graph, and a tier-0 exact-name boost +// short-circuits identifier queries so +// search quality holds or improves while the heap shrinks. 
+// +// Semantics: +// +// - BulkUpsertSymbolFTS wipes only the rows owned by repoPrefix +// before re-inserting, so sibling repos sharing one store don't +// clobber each other's corpus. Empty prefix wipes the whole table +// (single-repo / conformance behaviour). +// +// - SearchSymbols tier 0: an identifier query (no whitespace / path +// separators) that resolves to one or more nodes by exact name is +// returned directly with a fixed dominant score, skipping FTS. +// Misses fall through to the FTS5 MATCH path. +// +// - SearchSymbolBundles composes the same hit list with batched +// node + in/out edge fetches the rerank pipeline reads from. +// +// FTS5 maintains its index incrementally on every insert, so the +// Store struct needs no extra state and BuildSymbolIndex is a no-op +// (it only opportunistically merges segments). + +// Compile-time assertions: *Store satisfies the symbol-search +// capabilities. The indexer auto-engages these when the active backend +// implements them, routing search_symbols through on-disk FTS5 instead +// of the in-process BM25 index. +var ( + _ graph.SymbolSearcher = (*Store)(nil) + _ graph.SymbolBundleSearcher = (*Store)(nil) +) + +// ftsInsertChunkRows bounds the rows per multi-row INSERT. Each row +// binds 3 host params (node_id, repo_prefix, tokens); 300 rows is 900 +// params, comfortably under SQLite's default 999-variable limit so the +// statement stays portable across builds. +const ftsInsertChunkRows = 300 + +// UpsertSymbolFTS records (or replaces) the pre-tokenised text for +// nodeID. FTS5 offers no UPSERT on a table with UNINDEXED columns, so +// the write is delete-then-insert: drop any prior row for nodeID, then +// insert the new tokens. The repo_prefix is derived from the owning +// node (nodes.repo_prefix) so the per-repo staleness wipe in +// BulkUpsertSymbolFTS can scope by prefix; if the node is absent the +// prefix defaults to "". 
+func (s *Store) UpsertSymbolFTS(nodeID, tokens string) error { + if nodeID == "" { + return nil + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + + var repoPrefix string + row := s.db.QueryRow(`SELECT repo_prefix FROM nodes WHERE id = ?`, nodeID) + // A missing node (or a scan error) leaves repoPrefix == "" — the + // row is still indexable, it just won't be reachable by a per-repo + // prefix wipe. The graph.Store contract has no error channel for + // the indexer's incremental writes, so we don't surface this. + _ = row.Scan(&repoPrefix) + + if _, err := s.db.Exec(`DELETE FROM symbol_fts WHERE node_id = ?`, nodeID); err != nil { + return err + } + if _, err := s.db.Exec( + `INSERT INTO symbol_fts (node_id, repo_prefix, tokens) VALUES (?, ?, ?)`, + nodeID, repoPrefix, tokens, + ); err != nil { + return err + } + return nil +} + +// BulkUpsertSymbolFTS is the cold-start fast path: wipe this repo's +// stale rows, then chunked multi-row INSERT of the deduped items. The +// whole thing runs in one transaction under writeMu so a concurrent +// reader never observes the table mid-wipe. +// +// repoPrefix scopes the pre-insert wipe: a non-empty prefix deletes +// only rows owned by that repo, +// leaving siblings untouched; an empty prefix wipes the whole table +// (single-repo / conformance behaviour — the conformance suite calls +// this with ""). Items are deduped by NodeID with last-write-wins, +// matching UpsertSymbolFTS's replace semantics. +func (s *Store) BulkUpsertSymbolFTS(repoPrefix string, items []graph.SymbolFTSItem) error { + if len(items) == 0 { + return nil + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + + // Dedup by ID — last write wins, mirroring UpsertSymbolFTS's + // delete-then-insert. Guards the edge case where a re-parse of a + // file emitted the same ID twice. 
+ pos := make(map[string]int, len(items)) + deduped := items[:0] + for _, it := range items { + if it.NodeID == "" { + continue + } + if p, ok := pos[it.NodeID]; ok { + deduped[p] = it + } else { + pos[it.NodeID] = len(deduped) + deduped = append(deduped, it) + } + } + items = deduped + if len(items) == 0 { + return nil + } + + tx, err := s.db.Begin() + if err != nil { + return err + } + commit := false + defer func() { + if !commit { + _ = tx.Rollback() + } + }() + + // Wipe this repo's prior rows so a clean rebuild of repo A doesn't + // leave phantom hits, while sibling repo B's corpus survives. The + // repo_prefix column is UNINDEXED but still stored, so the equality + // filter is a literal compare over the row set. Empty repoPrefix + // clears the whole table — the legacy single-repo wipe. + if _, err := tx.Exec(`DELETE FROM symbol_fts WHERE repo_prefix = ?`, repoPrefix); err != nil { + return err + } + + for start := 0; start < len(items); start += ftsInsertChunkRows { + end := minInt(start+ftsInsertChunkRows, len(items)) + chunk := items[start:end] + + var b strings.Builder + b.WriteString(`INSERT INTO symbol_fts (node_id, repo_prefix, tokens) VALUES `) + args := make([]any, 0, len(chunk)*3) + for i, it := range chunk { + if i > 0 { + b.WriteByte(',') + } + b.WriteString(`(?,?,?)`) + args = append(args, it.NodeID, repoPrefix, it.Tokens) + } + if _, err := tx.Exec(b.String(), args...); err != nil { + return err + } + } + + if err := tx.Commit(); err != nil { + return err + } + commit = true + return nil +} + +// BuildSymbolIndex is a no-op for FTS5: the index is maintained +// incrementally on every insert, so there is nothing to build after the +// bulk parse phase. We opportunistically run the FTS5 'optimize' +// command to merge segments (purely a read-latency improvement); any +// error is ignored because the index is already correct without it. +// Idempotent — safe to call any number of times. 
+func (s *Store) BuildSymbolIndex() error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + _, _ = s.db.Exec(`INSERT INTO symbol_fts(symbol_fts) VALUES('optimize')`) + return nil +} + +// SearchSymbols runs a symbol query and returns hits ordered by +// descending relevance (higher Score = more relevant). +// +// Tier 0 (exact-name boost): when the +// query looks like a literal identifier and resolves to one or more +// nodes by exact name, return those directly with a fixed dominant +// score (100.0) — an O(1)-ish index seek that beats FTS ranking for +// the common "type the symbol name" case. Misses fall through to FTS5. +// +// Otherwise tokenise on the read side with the SAME splitter as the +// write side (search.Tokenize) so a camelCase query lands on the +// split corpus, build a prefix-OR MATCH expression, and rank by BM25. +// SQLite's bm25() returns lower-is-better, so the stored Score is its +// negation (higher-is-better, matching the SymbolHit contract). +func (s *Store) SearchSymbols(query string, limit int) ([]graph.SymbolHit, error) { + if query == "" { + return nil, nil + } + if limit <= 0 { + limit = 20 + } + + // Tier 0: exact-name lookup. Only engage for identifier-shaped + // queries (no whitespace / path separators); multi-word queries are + // concept searches that need BM25 ranking. We only short-circuit + // when the lookup hits at least one node — misses fall through so a + // partial-identifier query still reaches FTS. 
+ if isIdentifierQuery(query) { + ns := s.FindNodesByName(query) + if len(ns) > 0 { + out := make([]graph.SymbolHit, 0, minInt(len(ns), limit)) + for _, n := range ns { + if n == nil || n.ID == "" { + continue + } + out = append(out, graph.SymbolHit{NodeID: n.ID, Score: 100.0}) + if len(out) >= limit { + break + } + } + if len(out) > 0 { + return out, nil + } + } + } + + match := s.buildFTSMatch(query) + if match == "" { + return nil, nil + } + + const q = `SELECT node_id, bm25(symbol_fts) FROM symbol_fts WHERE symbol_fts MATCH ? ORDER BY bm25(symbol_fts) LIMIT ?` + rows, err := s.db.Query(q, match, limit) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + var hits []graph.SymbolHit + for rows.Next() { + var ( + id string + score float64 + ) + if err := rows.Scan(&id, &score); err != nil { + return nil, err + } + if id == "" { + continue + } + // bm25() is negative-better in SQLite; negate so higher = better, + // matching the SymbolHit contract. Rows already arrive in bm25 + // (best-first) order from the ORDER BY. + hits = append(hits, graph.SymbolHit{NodeID: id, Score: -score}) + } + if err := rows.Err(); err != nil { + return nil, err + } + return hits, nil +} + +// buildFTSMatch tokenises the query with the write-side splitter and +// builds an FTS5 MATCH expression: each token becomes a quoted prefix +// term ("tok"*) and the terms are OR-joined so any token match counts. +// Returns "" when the query degenerates to no tokens. +func (s *Store) buildFTSMatch(query string) string { + tokens := search.Tokenize(query) + if len(tokens) == 0 { + // Fallback: when Tokenize drops everything (e.g. a single + // sub-2-char token like "go"), use the looser query tokeniser so + // the search still reaches the engine instead of returning empty. 
+ tokens = search.TokenizeQuery(query) + if len(tokens) == 0 { + return "" + } + } + parts := make([]string, 0, len(tokens)) + for _, t := range tokens { + if t == "" { + continue + } + parts = append(parts, `"`+escapeFTSQuote(t)+`"*`) + } + if len(parts) == 0 { + return "" + } + return strings.Join(parts, " OR ") +} + +// escapeFTSQuote escapes a token for use inside an FTS5 double-quoted +// string literal: a literal double quote is doubled ("" inside "..."). +func escapeFTSQuote(t string) string { + return strings.ReplaceAll(t, `"`, `""`) +} + +// SearchSymbolBundles is the rerank-shaped fast path: it runs +// SearchSymbols to get the ranked id list (preserving order) plus a +// score-by-id map, then materialises the nodes and their in/out edges +// in batched fetches the rerank pipeline reads from. The engine routes +// through this when the backend implements SymbolBundleSearcher, +// pre-seeding rerank.Context's edge caches. +func (s *Store) SearchSymbolBundles(query string, limit int) ([]graph.SymbolBundle, error) { + hits, err := s.SearchSymbols(query, limit) + if err != nil { + return nil, err + } + if len(hits) == 0 { + return nil, nil + } + + ids := make([]string, 0, len(hits)) + scoreByID := make(map[string]float64, len(hits)) + for _, h := range hits { + if h.NodeID == "" { + continue + } + if _, dup := scoreByID[h.NodeID]; dup { + // First hit keeps the score / position; defend against a + // future ranker that returns an id more than once. + continue + } + scoreByID[h.NodeID] = h.Score + ids = append(ids, h.NodeID) + } + if len(ids) == 0 { + return nil, nil + } + + nodes := s.GetNodesByIDs(ids) + out := s.GetOutEdgesByNodeIDs(ids) + in := s.GetInEdgesByNodeIDs(ids) + + bundles := make([]graph.SymbolBundle, 0, len(ids)) + for _, id := range ids { + n := nodes[id] + if n == nil { + // Hit references a node evicted between the search and the + // node fetch — skip; the caller does its own dedup / filter. 
+ continue + } + bundles = append(bundles, graph.SymbolBundle{ + Node: n, + Score: scoreByID[id], + OutEdges: out[id], + InEdges: in[id], + }) + } + return bundles, nil +} + +// isIdentifierQuery reports whether a query looks like a literal symbol +// name (no whitespace, no path separators, no dots, no colons, no +// commas). The tier-0 exact-name fast path engages only on such +// queries; multi-token / path / qualified queries always go to FTS. +func isIdentifierQuery(q string) bool { + if q == "" { + return false + } + for _, r := range q { + switch r { + case ' ', '\t', '\n', '/', '.', ':', ',': + return false + } + } + return true +} diff --git a/internal/graph/store_sqlite/store_lookups.go b/internal/graph/store_sqlite/store_lookups.go new file mode 100644 index 00000000..06122c3d --- /dev/null +++ b/internal/graph/store_sqlite/store_lookups.go @@ -0,0 +1,134 @@ +package store_sqlite + +import ( + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// These methods were added to graph.Store after the sqlite backend was +// first removed; they are restored here so *Store satisfies the current +// interface. All reuse the chunked IN-list / raw-SQL helpers in store.go +// (queryNodesSQL / queryEdgesSQL / lookupChunkSize / minInt). SQLite's +// planner drives every one through the existing secondary indexes. + +const lookupNodeCols = `id, kind, name, qual_name, file_path, start_line, end_line, language, repo_prefix, workspace_id, project_id, meta` +const lookupEdgeCols = `from_id, to_id, kind, file_path, line, confidence, confidence_label, origin, tier, cross_repo, meta` + +// FindNodesByNameContaining returns nodes whose Name contains substr, +// case-insensitively (SQLite's LIKE is ASCII case-insensitive). An empty +// substring matches nothing (parity with the in-memory store); a limit > 0 +// caps the result set. 
The leading-wildcard LIKE is a deliberate full scan — +// no index accelerates an unanchored substring — matching the in-memory +// strings.Contains fallback. % and _ in substr are escaped so they match +// literally. +func (s *Store) FindNodesByNameContaining(substr string, limit int) []*graph.Node { + if substr == "" { + return nil + } + pattern := "%" + escapeLikePattern(substr) + "%" + q := `SELECT ` + lookupNodeCols + ` FROM nodes WHERE name LIKE ? ESCAPE '\' ORDER BY id` + if limit > 0 { + return s.queryNodesSQL(q+` LIMIT ?`, pattern, limit) + } + return s.queryNodesSQL(q, pattern) +} + +// GetNodesByQualNames returns a map qualName→*Node (first match per +// qual_name) for the batch — the qual-name twin of FindNodesByNames, used to +// pre-warm import resolution. Driven by the unique nodes_by_qual index. +func (s *Store) GetNodesByQualNames(qualNames []string) map[string]*graph.Node { + uniq := dedupeNonEmpty(qualNames) + if len(uniq) == 0 { + return nil + } + out := make(map[string]*graph.Node, len(uniq)) + for i := 0; i < len(uniq); i += lookupChunkSize { + end := minInt(i+lookupChunkSize, len(uniq)) + chunk := uniq[i:end] + q := `SELECT ` + lookupNodeCols + ` FROM nodes WHERE qual_name IN (` + inPlaceholders(len(chunk)) + `)` + for _, n := range s.queryNodesSQL(q, toAnyArgs(chunk)...) { + if n == nil { + continue + } + if _, ok := out[n.QualName]; !ok { + out[n.QualName] = n + } + } + } + return out +} + +// GetOutEdgesByNodeIDs batches per-node out-edge fan-out into one query per +// chunk. Missing IDs are simply absent from the returned map. +func (s *Store) GetOutEdgesByNodeIDs(ids []string) map[string][]*graph.Edge { + return s.edgesByNodeIDs(ids, "from_id", func(e *graph.Edge) string { return e.From }) +} + +// GetInEdgesByNodeIDs is the incoming-edge twin of GetOutEdgesByNodeIDs. 
+func (s *Store) GetInEdgesByNodeIDs(ids []string) map[string][]*graph.Edge { + return s.edgesByNodeIDs(ids, "to_id", func(e *graph.Edge) string { return e.To }) +} + +// edgesByNodeIDs runs the chunked IN-list edge fetch keyed on the given +// column (from_id or to_id), grouping results by the supplied key extractor. +func (s *Store) edgesByNodeIDs(ids []string, col string, key func(*graph.Edge) string) map[string][]*graph.Edge { + uniq := dedupeNonEmpty(ids) + if len(uniq) == 0 { + return nil + } + out := make(map[string][]*graph.Edge, len(uniq)) + for i := 0; i < len(uniq); i += lookupChunkSize { + end := minInt(i+lookupChunkSize, len(uniq)) + chunk := uniq[i:end] + q := `SELECT ` + lookupEdgeCols + ` FROM edges WHERE ` + col + ` IN (` + inPlaceholders(len(chunk)) + `)` + for _, e := range s.queryEdgesSQL(q, toAnyArgs(chunk)...) { + if e == nil { + continue + } + k := key(e) + out[k] = append(out[k], e) + } + } + return out +} + +// dedupeNonEmpty drops empties and duplicates, preserving first-seen order. +func dedupeNonEmpty(in []string) []string { + seen := make(map[string]struct{}, len(in)) + out := make([]string, 0, len(in)) + for _, v := range in { + if v == "" { + continue + } + if _, ok := seen[v]; ok { + continue + } + seen[v] = struct{}{} + out = append(out, v) + } + return out +} + +// inPlaceholders returns "?,?,?" for n bound parameters. +func inPlaceholders(n int) string { + if n <= 0 { + return "" + } + return strings.Repeat(",?", n)[1:] +} + +// toAnyArgs widens a string slice for variadic Query/Exec args. +func toAnyArgs(ss []string) []any { + args := make([]any, len(ss)) + for i, v := range ss { + args[i] = v + } + return args +} + +// escapeLikePattern escapes the LIKE metacharacters so the substring matches +// literally under `... LIKE ? ESCAPE '\'`. 
+func escapeLikePattern(s string) string { + return strings.NewReplacer(`\`, `\\`, `%`, `\%`, `_`, `\_`).Replace(s) +} diff --git a/internal/graph/store_sqlite/store_mtime.go b/internal/graph/store_sqlite/store_mtime.go new file mode 100644 index 00000000..7cf79251 --- /dev/null +++ b/internal/graph/store_sqlite/store_mtime.go @@ -0,0 +1,243 @@ +package store_sqlite + +import ( + "database/sql" + + "github.com/zzet/gortex/internal/graph" +) + +// Compile-time assertions that the SQLite Store satisfies the optional +// per-file mtime persistence capabilities. Lifting this state into the +// same backend the graph lives in means warm restarts read it through +// one persistence surface instead of a second gob snapshot. +var ( + _ graph.FileMtimeWriter = (*Store)(nil) + _ graph.FileMtimeReader = (*Store)(nil) + _ graph.FileMtimeReplacer = (*Store)(nil) + _ graph.FileMtimeDeleter = (*Store)(nil) +) + +// mtimeChunk bounds how many (repo_prefix, file_path, mtime_ns) tuples +// ride in a single multi-row INSERT. SQLite's default compiled-in host +// parameter limit is 999; at 3 params per row that caps a statement at +// 333 rows, so 300 leaves headroom. +const mtimeChunk = 300 + +// SetFileMtime records one file's modification time (nanoseconds since +// the epoch) for a repo prefix, replacing any prior value. It is a +// convenience single-row form of BulkSetFileMtimes. +func (s *Store) SetFileMtime(repoPrefix, filePath string, mtimeNs int64) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + _, err := s.db.Exec( + `INSERT OR REPLACE INTO file_mtimes (repo_prefix, file_path, mtime_ns) VALUES (?, ?, ?)`, + repoPrefix, filePath, mtimeNs, + ) + return err +} + +// BulkSetFileMtimes persists every (filePath -> mtimeNs) entry for one +// repo prefix in a single transaction, chunked so no statement exceeds +// SQLite's host-parameter limit. Idempotent on (repoPrefix, filePath): +// re-running with overlapping keys replaces in place. Empty input is a +// no-op. 
+func (s *Store) BulkSetFileMtimes(repoPrefix string, mtimes map[string]int64) error { + if len(mtimes) == 0 { + return nil + } + + s.writeMu.Lock() + defer s.writeMu.Unlock() + + tx, err := s.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() //nolint:errcheck // rollback after Commit is a no-op + + if err := insertMtimesTx(tx, repoPrefix, mtimes); err != nil { + return err + } + + return tx.Commit() +} + +// ReplaceFileMtimes persists the AUTHORITATIVE full mtime set for one repo +// prefix: every prior row for the prefix is dropped and the supplied set is +// written, all in one transaction. The full-index persist path uses this so +// files deleted since the last index are pruned — BulkSetFileMtimes (upsert) +// would leave their rows behind, and warm-restart reconcile would then +// detect them as phantom deletions on every restart, forcing a full +// re-track that never converges. +// +// Empty input is a deliberate no-op: it never wipes a repo's mtimes from an +// empty snapshot (the indexer guards the call with len(snapshot) > 0). +func (s *Store) ReplaceFileMtimes(repoPrefix string, mtimes map[string]int64) error { + if len(mtimes) == 0 { + return nil + } + + s.writeMu.Lock() + defer s.writeMu.Unlock() + + tx, err := s.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() //nolint:errcheck // rollback after Commit is a no-op + + if _, err := tx.Exec(`DELETE FROM file_mtimes WHERE repo_prefix = ?`, repoPrefix); err != nil { + return err + } + if err := insertMtimesTx(tx, repoPrefix, mtimes); err != nil { + return err + } + + return tx.Commit() +} + +// DeleteFileMtimes drops the rows for a set of repo-relative file paths +// under one repo prefix — the incremental-reindex sibling of +// ReplaceFileMtimes. The watcher / incremental path calls it when a file is +// deleted so the persisted set stays in step with the live graph and the +// next warm restart does not see the path as a phantom deletion. Empty +// input is a no-op. 
+func (s *Store) DeleteFileMtimes(repoPrefix string, paths []string) error { + if len(paths) == 0 { + return nil + } + + s.writeMu.Lock() + defer s.writeMu.Unlock() + + tx, err := s.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() //nolint:errcheck // rollback after Commit is a no-op + + // Chunk so the IN-list never exceeds SQLite's host-parameter limit: + // one leading repo_prefix arg + up to mtimeChunk path args per stmt. + for start := 0; start < len(paths); start += mtimeChunk { + end := min(start+mtimeChunk, len(paths)) + batch := paths[start:end] + + args := make([]any, 0, len(batch)+1) + args = append(args, repoPrefix) + stmt := make([]byte, 0, 64+len(batch)*2) + stmt = append(stmt, "DELETE FROM file_mtimes WHERE repo_prefix = ? AND file_path IN ("...) + for i := range batch { + if i > 0 { + stmt = append(stmt, ',') + } + stmt = append(stmt, '?') + args = append(args, batch[i]) + } + stmt = append(stmt, ')') + if _, err := tx.Exec(string(stmt), args...); err != nil { + return err + } + } + + return tx.Commit() +} + +// insertMtimesTx writes every (path -> ns) entry for repoPrefix into the +// given transaction with chunked multi-row INSERT OR REPLACE statements, +// each kept under SQLite's host-parameter limit. The caller owns the tx +// lifecycle (Begin/Commit/Rollback) and the write lock. +func insertMtimesTx(tx *sql.Tx, repoPrefix string, mtimes map[string]int64) error { + // Stable ordering is not required for correctness, but iterating the + // map directly is fine — we only chunk by count. + type kv struct { + path string + ns int64 + } + pending := make([]kv, 0, len(mtimes)) + for p, ns := range mtimes { + pending = append(pending, kv{path: p, ns: ns}) + } + + for start := 0; start < len(pending); start += mtimeChunk { + end := min(start+mtimeChunk, len(pending)) + batch := pending[start:end] + + // Build a multi-row INSERT OR REPLACE: (?, ?, ?), (?, ?, ?), ... 
+ args := make([]any, 0, len(batch)*3) + stmt := make([]byte, 0, 64+len(batch)*16) + stmt = append(stmt, "INSERT OR REPLACE INTO file_mtimes (repo_prefix, file_path, mtime_ns) VALUES "...) + for i, e := range batch { + if i > 0 { + stmt = append(stmt, ',') + } + stmt = append(stmt, "(?, ?, ?)"...) + args = append(args, repoPrefix, e.path, e.ns) + } + if _, err := tx.Exec(string(stmt), args...); err != nil { + return err + } + } + + return nil +} + +// LoadFileMtimes returns the recorded mtimes for one repo prefix as a +// fresh map. Returns nil when there is no data for the prefix (the +// "no recorded state" signal warmup expects). +func (s *Store) LoadFileMtimes(repoPrefix string) map[string]int64 { + rows, err := s.db.Query( + `SELECT file_path, mtime_ns FROM file_mtimes WHERE repo_prefix = ?`, + repoPrefix, + ) + if err != nil { + return nil + } + defer func() { _ = rows.Close() }() + + var out map[string]int64 + for rows.Next() { + var path string + var ns int64 + if err := rows.Scan(&path, &ns); err != nil { + return nil + } + if out == nil { + out = make(map[string]int64) + } + out[path] = ns + } + if err := rows.Err(); err != nil { + return nil + } + return out +} + +// FileMtimes is a fallible read form of LoadFileMtimes. It always +// returns a non-nil (possibly empty) map for a known/unknown prefix and +// surfaces any query error. The interface method LoadFileMtimes is the +// daemon's entry point; this variant exists for callers (and tests) +// that want the error and an always-materialised map. 
+func (s *Store) FileMtimes(repoPrefix string) (map[string]int64, error) { + rows, err := s.db.Query( + `SELECT file_path, mtime_ns FROM file_mtimes WHERE repo_prefix = ?`, + repoPrefix, + ) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + out := make(map[string]int64) + for rows.Next() { + var path string + var ns int64 + if err := rows.Scan(&path, &ns); err != nil { + return nil, err + } + out[path] = ns + } + if err := rows.Err(); err != nil { + return nil, err + } + return out, nil +} diff --git a/internal/graph/store_sqlite/store_mtime_prune_test.go b/internal/graph/store_sqlite/store_mtime_prune_test.go new file mode 100644 index 00000000..f4efe3c9 --- /dev/null +++ b/internal/graph/store_sqlite/store_mtime_prune_test.go @@ -0,0 +1,112 @@ +package store_sqlite_test + +import ( + "reflect" + "testing" + + "github.com/zzet/gortex/internal/graph" +) + +// TestReplaceFileMtimesPrunesDeleted is the regression for the warm-restart +// "nothing changed but full re-track" bug: the full-index persist path must +// REPLACE a repo's mtime set, not union into it. An upsert-only persist +// leaves rows for files deleted since the last index behind, and warm-restart +// reconcile then detects them as phantom deletions on every restart — forcing +// a full re-track that never converges. +func TestReplaceFileMtimesPrunesDeleted(t *testing.T) { + s := openTestStore(t) + + // Assert the store advertises the capability the indexer probes for. + var _ graph.FileMtimeReplacer = s + var _ graph.FileMtimeDeleter = s + + // First index: three files persisted. + require := func(err error, what string) { + t.Helper() + if err != nil { + t.Fatalf("%s: %v", what, err) + } + } + require(s.BulkSetFileMtimes("repoA", map[string]int64{ + "a/one.go": 100, + "a/two.go": 200, + "a/three.go": 300, + }), "seed BulkSetFileMtimes") + + // A different repo whose rows must never be touched by repoA writes. 
+ require(s.BulkSetFileMtimes("repoB", map[string]int64{"b/x.go": 999}), "seed repoB") + + // Second index: two.go was deleted on disk, four.go is new, three.go + // changed. The authoritative snapshot is {one, three', four}. + require(s.ReplaceFileMtimes("repoA", map[string]int64{ + "a/one.go": 100, + "a/three.go": 350, // changed + "a/four.go": 400, // new + }), "ReplaceFileMtimes") + + want := map[string]int64{ + "a/one.go": 100, + "a/three.go": 350, + "a/four.go": 400, + } + got := s.LoadFileMtimes("repoA") + if !reflect.DeepEqual(got, want) { + t.Fatalf("after ReplaceFileMtimes = %v, want %v (a/two.go must be pruned)", got, want) + } + if _, stillThere := got["a/two.go"]; stillThere { + t.Fatal("a/two.go was deleted on disk but its mtime row survived the replace — phantom deletion bug") + } + + // Repo isolation. + if b := s.LoadFileMtimes("repoB"); !reflect.DeepEqual(b, map[string]int64{"b/x.go": 999}) { + t.Fatalf("repoB rows disturbed by repoA replace: %v", b) + } + + // Empty input is a deliberate no-op: it must NEVER wipe a repo's set. + require(s.ReplaceFileMtimes("repoA", nil), "ReplaceFileMtimes(nil)") + if got := s.LoadFileMtimes("repoA"); !reflect.DeepEqual(got, want) { + t.Fatalf("ReplaceFileMtimes(nil) wiped the repo: %v", got) + } +} + +// TestDeleteFileMtimes covers the incremental-reindex sibling: the watcher / +// incremental path drops just the deleted paths so the persisted set stays in +// step with the live graph without a full replace. +func TestDeleteFileMtimes(t *testing.T) { + s := openTestStore(t) + + if err := s.BulkSetFileMtimes("repoA", map[string]int64{ + "a/one.go": 100, + "a/two.go": 200, + "a/three.go": 300, + "a/four.go": 400, + }); err != nil { + t.Fatalf("seed: %v", err) + } + if err := s.BulkSetFileMtimes("repoB", map[string]int64{"b/keep.go": 7}); err != nil { + t.Fatalf("seed repoB: %v", err) + } + + // Delete two existing paths and one that was never recorded (harmless). 
+ if err := s.DeleteFileMtimes("repoA", []string{"a/two.go", "a/four.go", "a/never.go"}); err != nil { + t.Fatalf("DeleteFileMtimes: %v", err) + } + + want := map[string]int64{"a/one.go": 100, "a/three.go": 300} + if got := s.LoadFileMtimes("repoA"); !reflect.DeepEqual(got, want) { + t.Fatalf("after delete = %v, want %v", got, want) + } + + // Repo isolation: same-named delete on repoA must not touch repoB. + if b := s.LoadFileMtimes("repoB"); !reflect.DeepEqual(b, map[string]int64{"b/keep.go": 7}) { + t.Fatalf("repoB disturbed: %v", b) + } + + // Empty input is a no-op. + if err := s.DeleteFileMtimes("repoA", nil); err != nil { + t.Fatalf("DeleteFileMtimes(nil): %v", err) + } + if got := s.LoadFileMtimes("repoA"); !reflect.DeepEqual(got, want) { + t.Fatalf("DeleteFileMtimes(nil) changed the set: %v", got) + } +} diff --git a/internal/graph/store_sqlite/store_release_enrichment.go b/internal/graph/store_sqlite/store_release_enrichment.go new file mode 100644 index 00000000..d1f08a2a --- /dev/null +++ b/internal/graph/store_sqlite/store_release_enrichment.go @@ -0,0 +1,140 @@ +package store_sqlite + +import ( + "database/sql" + + "github.com/zzet/gortex/internal/graph" +) + +var ( + _ graph.ReleaseEnrichmentWriter = (*Store)(nil) + _ graph.ReleaseEnrichmentReader = (*Store)(nil) +) + +// releaseChunk bounds rows per multi-row INSERT (3 cols → 3 params/row). +const releaseChunk = 300 + +const releaseCols = `node_id, repo_prefix, added_in` + +// BulkSetReleases persists release rows for one repo prefix, chunked. 
+func (s *Store) BulkSetReleases(repoPrefix string, rows []graph.ReleaseEnrichment) error { + if len(rows) == 0 { + return nil + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + + tx, err := s.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() //nolint:errcheck + + for start := 0; start < len(rows); start += releaseChunk { + end := start + releaseChunk + if end > len(rows) { + end = len(rows) + } + batch := rows[start:end] + args := make([]any, 0, len(batch)*3) + stmt := make([]byte, 0, 96+len(batch)*12) + stmt = append(stmt, "INSERT OR REPLACE INTO release_enrichment ("...) + stmt = append(stmt, releaseCols...) + stmt = append(stmt, ") VALUES "...) + for i, e := range batch { + if i > 0 { + stmt = append(stmt, ',') + } + stmt = append(stmt, "(?,?,?)"...) + args = append(args, e.NodeID, repoPrefix, e.AddedIn) + } + if _, err := tx.Exec(string(stmt), args...); err != nil { + return err + } + } + return tx.Commit() +} + +// DeleteReleases drops release rows for the supplied node ids, chunked. +func (s *Store) DeleteReleases(nodeIDs []string) error { + if len(nodeIDs) == 0 { + return nil + } + seen := make(map[string]struct{}, len(nodeIDs)) + uniq := make([]string, 0, len(nodeIDs)) + for _, id := range nodeIDs { + if id == "" { + continue + } + if _, ok := seen[id]; ok { + continue + } + seen[id] = struct{}{} + uniq = append(uniq, id) + } + if len(uniq) == 0 { + return nil + } + + s.writeMu.Lock() + defer s.writeMu.Unlock() + + tx, err := s.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() //nolint:errcheck + + for start := 0; start < len(uniq); start += releaseChunk { + end := start + releaseChunk + if end > len(uniq) { + end = len(uniq) + } + chunk := uniq[start:end] + args := make([]any, len(chunk)) + stmt := make([]byte, 0, 56+len(chunk)*2) + stmt = append(stmt, "DELETE FROM release_enrichment WHERE node_id IN ("...) 
+ for i, id := range chunk { + if i > 0 { + stmt = append(stmt, ',') + } + stmt = append(stmt, '?') + args[i] = id + } + stmt = append(stmt, ')') + if _, err := tx.Exec(string(stmt), args...); err != nil { + return err + } + } + return tx.Commit() +} + +// ReleaseRows returns release rows for repoPrefix; empty → all repos. +func (s *Store) ReleaseRows(repoPrefix string) []graph.ReleaseEnrichment { + var ( + rows *sql.Rows + err error + ) + if repoPrefix == "" { + rows, err = s.db.Query(`SELECT ` + releaseCols + ` FROM release_enrichment`) + } else { + rows, err = s.db.Query(`SELECT `+releaseCols+` FROM release_enrichment WHERE repo_prefix = ?`, repoPrefix) + } + if err != nil { + return nil + } + defer func() { _ = rows.Close() }() + + var out []graph.ReleaseEnrichment + for rows.Next() { + var e graph.ReleaseEnrichment + if err := rows.Scan(&e.NodeID, &e.RepoPrefix, &e.AddedIn); err != nil { + return out + } + out = append(out, e) + } + if err := rows.Err(); err != nil { + return out + } + return out +} diff --git a/internal/graph/store_sqlite/store_test.go b/internal/graph/store_sqlite/store_test.go new file mode 100644 index 00000000..3b294c3f --- /dev/null +++ b/internal/graph/store_sqlite/store_test.go @@ -0,0 +1,22 @@ +package store_sqlite_test + +import ( + "path/filepath" + "testing" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/graph/store_sqlite" + "github.com/zzet/gortex/internal/graph/storetest" +) + +func TestSQLiteStoreConformance(t *testing.T) { + storetest.RunConformance(t, func(t *testing.T) graph.Store { + dir := t.TempDir() + s, err := store_sqlite.Open(filepath.Join(dir, "test.sqlite")) + if err != nil { + t.Fatalf("Open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + return s + }) +} diff --git a/internal/graph/store_sqlite/store_traversal.go b/internal/graph/store_sqlite/store_traversal.go new file mode 100644 index 00000000..de735c7c --- /dev/null +++ b/internal/graph/store_sqlite/store_traversal.go @@ 
-0,0 +1,362 @@ +package store_sqlite + +import ( + "github.com/zzet/gortex/internal/graph" +) + +// The graph-traversal and subgraph-reader optional capabilities for the +// SQLite backend. Each method mirrors the in-memory *graph.Graph +// reference implementation exactly so both satisfy the same conformance +// suite (internal/graph/storetest). The walks use the same per-node / +// batched edge readers the in-memory store uses (GetOutEdges / +// GetInEdges / GetFileNodes / GetNodesByIDs / GetIn|OutEdgesByNodeIDs), +// which on SQLite hit the (from_id,kind) / (to_id,kind) / file_path +// indexes — no new prepared statements needed. + +var ( + _ graph.ReachableForwardByKinds = (*Store)(nil) + _ graph.ClassHierarchyTraverser = (*Store)(nil) + _ graph.FrontierExpander = (*Store)(nil) + _ graph.FileEditingContext = (*Store)(nil) + _ graph.FileSubGraphReader = (*Store)(nil) + _ graph.FileSubGraphCountReader = (*Store)(nil) +) + +// ReachableForwardByKinds computes the set of node IDs reachable from +// the seed frontier via outgoing edges whose Kind is in kinds, via a +// layer-by-layer forward BFS. Empty seeds returns nil; empty kinds +// returns the seed set unchanged. The returned map keys are the +// reachable IDs (seeds included); every value is true. 
+func (s *Store) ReachableForwardByKinds(seeds []string, kinds []graph.EdgeKind) map[string]bool { + if len(seeds) == 0 { + return nil + } + covered := make(map[string]bool, len(seeds)) + frontier := make([]string, 0, len(seeds)) + for _, id := range seeds { + if id == "" || covered[id] { + continue + } + covered[id] = true + frontier = append(frontier, id) + } + if len(kinds) == 0 { + return covered + } + allowed := make(map[graph.EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + allowed[k] = struct{}{} + } + for len(frontier) > 0 { + next := frontier[:0:0] + for _, id := range frontier { + for _, e := range s.GetOutEdges(id) { + if e == nil { + continue + } + if _, ok := allowed[e.Kind]; !ok { + continue + } + if !covered[e.To] { + covered[e.To] = true + next = append(next, e.To) + } + } + } + frontier = next + } + return covered +} + +// ClassHierarchyTraverse walks the inheritance subgraph rooted at +// seedID, following only edges whose Kind is in kinds, up to depth hops. +// direction "up" follows outgoing edges; "down" follows incoming. Empty +// kinds, depth <= 0, an unknown direction, or an unknown seed return +// nil. Each returned row carries the full Path (node IDs from the seed, +// exclusive) and per-hop EdgeKinds for one terminal node. 
+func (s *Store) ClassHierarchyTraverse( + seedID string, + direction string, + kinds []graph.EdgeKind, + depth int, +) []graph.ClassHierarchyRow { + if seedID == "" || depth <= 0 || len(kinds) == 0 { + return nil + } + kset := make(map[graph.EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + kset[k] = struct{}{} + } + if len(kset) == 0 { + return nil + } + if s.GetNode(seedID) == nil { + return nil + } + walkUp := direction == "up" + walkDown := direction == "down" + if !walkUp && !walkDown { + return nil + } + type travQueued struct { + id string + path []string + edgeKinds []graph.EdgeKind + hops int + } + visited := map[string]struct{}{seedID: {}} + queue := []travQueued{{id: seedID, path: nil, edgeKinds: nil, hops: 0}} + var out []graph.ClassHierarchyRow + for len(queue) > 0 { + cur := queue[0] + queue = queue[1:] + if cur.hops >= depth { + continue + } + var edges []*graph.Edge + if walkUp { + edges = s.GetOutEdges(cur.id) + } else { + edges = s.GetInEdges(cur.id) + } + for _, e := range edges { + if e == nil { + continue + } + if _, ok := kset[e.Kind]; !ok { + continue + } + var nb string + if walkUp { + nb = e.To + } else { + nb = e.From + } + if nb == "" { + continue + } + if _, ok := visited[nb]; ok { + continue + } + visited[nb] = struct{}{} + newPath := append([]string(nil), cur.path...) + newPath = append(newPath, nb) + newKinds := append([]graph.EdgeKind(nil), cur.edgeKinds...) + newKinds = append(newKinds, e.Kind) + out = append(out, graph.ClassHierarchyRow{ + Path: newPath, + EdgeKinds: newKinds, + }) + queue = append(queue, travQueued{id: nb, path: newPath, edgeKinds: newKinds, hops: cur.hops + 1}) + } + } + return out +} + +// ExpandFrontier returns, for the given source IDs, their adjacent edges +// of the requested kinds plus the neighbour node at each edge's far end. +// forward=true follows outgoing edges (neighbour = edge target); +// forward=false follows incoming (neighbour = edge source). 
Empty ids or +// empty kinds return nil; limit > 0 caps the total number of hops. +func (s *Store) ExpandFrontier(ids []string, forward bool, kinds []graph.EdgeKind, limit int) []graph.FrontierHop { + if len(ids) == 0 || len(kinds) == 0 { + return nil + } + kset := make(map[graph.EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + kset[k] = struct{}{} + } + var out []graph.FrontierHop + for _, id := range ids { + var edges []*graph.Edge + if forward { + edges = s.GetOutEdges(id) + } else { + edges = s.GetInEdges(id) + } + for _, e := range edges { + if e == nil { + continue + } + if _, ok := kset[e.Kind]; !ok { + continue + } + var nbID string + if forward { + nbID = e.To + } else { + nbID = e.From + } + nb := s.GetNode(nbID) + if nb == nil { + continue + } + out = append(out, graph.FrontierHop{Edge: e, Neighbor: nb}) + if limit > 0 && len(out) >= limit { + return out + } + } + } + return out +} + +// FileEditingContext returns the get_editing_context payload for +// filePath: the file node, the symbols defined in it, the file node's +// import out-edges, and the 1-hop callers / callees (via EdgeCalls) of +// the defined call-target symbols, filtered to symbols outside the file. +// kinds is the set of node kinds treated as call targets (function + +// method). Empty path or a file with no nodes returns nil. 
+func (s *Store) FileEditingContext(filePath string, kinds []graph.NodeKind) *graph.FileEditingContextResult { + if filePath == "" { + return nil + } + nodes := s.GetFileNodes(filePath) + if len(nodes) == 0 { + return nil + } + kset := make(map[graph.NodeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + kset[k] = struct{}{} + } + res := &graph.FileEditingContextResult{} + var fileNodeID string + var defNodeIDs []string + for _, n := range nodes { + if n == nil { + continue + } + if n.Kind == graph.KindFile { + res.FileNode = n + fileNodeID = n.ID + continue + } + res.Defines = append(res.Defines, n) + if _, ok := kset[n.Kind]; ok { + defNodeIDs = append(defNodeIDs, n.ID) + } + } + if fileNodeID != "" { + for _, e := range s.GetOutEdges(fileNodeID) { + if e == nil { + continue + } + if e.Kind == graph.EdgeImports { + res.Imports = append(res.Imports, e) + } + } + } + if len(defNodeIDs) == 0 { + return res + } + inEdges := s.GetInEdgesByNodeIDs(defNodeIDs) + outEdges := s.GetOutEdgesByNodeIDs(defNodeIDs) + callerIDSet := make(map[string]struct{}) + calleeIDSet := make(map[string]struct{}) + for _, id := range defNodeIDs { + for _, e := range inEdges[id] { + if e == nil || e.Kind != graph.EdgeCalls { + continue + } + if e.From == "" { + continue + } + callerIDSet[e.From] = struct{}{} + } + for _, e := range outEdges[id] { + if e == nil || e.Kind != graph.EdgeCalls { + continue + } + if e.To == "" { + continue + } + calleeIDSet[e.To] = struct{}{} + } + } + callerIDs := make([]string, 0, len(callerIDSet)) + for id := range callerIDSet { + callerIDs = append(callerIDs, id) + } + calleeIDs := make([]string, 0, len(calleeIDSet)) + for id := range calleeIDSet { + calleeIDs = append(calleeIDs, id) + } + callerNodes := s.GetNodesByIDs(callerIDs) + calleeNodes := s.GetNodesByIDs(calleeIDs) + for _, id := range callerIDs { + n := callerNodes[id] + if n == nil || n.FilePath == filePath { + continue + } + res.CalledBy = append(res.CalledBy, n) 
+ } + for _, id := range calleeIDs { + n := calleeNodes[id] + if n == nil || n.FilePath == filePath { + continue + } + res.Calls = append(res.Calls, n) + } + return res +} + +// GetFileSubGraph returns every node anchored to filePath plus every +// edge adjacent to one of those nodes, deduplicated by (from, to, kind). +// A missing / empty file returns (nil, nil). +func (s *Store) GetFileSubGraph(filePath string) ([]*graph.Node, []*graph.Edge) { + if filePath == "" { + return nil, nil + } + nodes := s.GetFileNodes(filePath) + if len(nodes) == 0 { + return nil, nil + } + ids := make([]string, 0, len(nodes)) + for _, n := range nodes { + if n != nil && n.ID != "" { + ids = append(ids, n.ID) + } + } + outByID := s.GetOutEdgesByNodeIDs(ids) + inByID := s.GetInEdgesByNodeIDs(ids) + type travEdgeKey struct { + from string + to string + kind graph.EdgeKind + } + seen := make(map[travEdgeKey]struct{}, 2*len(ids)) + edges := make([]*graph.Edge, 0, 2*len(ids)) + add := func(e *graph.Edge) { + if e == nil { + return + } + k := travEdgeKey{from: e.From, to: e.To, kind: e.Kind} + if _, ok := seen[k]; ok { + return + } + seen[k] = struct{}{} + edges = append(edges, e) + } + for _, id := range ids { + for _, e := range outByID[id] { + add(e) + } + for _, e := range inByID[id] { + add(e) + } + } + return nodes, edges +} + +// GetFileSubGraphCounts is the count-only sibling of GetFileSubGraph: +// it returns the file's nodes plus the number of distinct adjacent +// edges, without materialising the edge slice for the caller. 
+func (s *Store) GetFileSubGraphCounts(filePath string) ([]*graph.Node, int) { + nodes, edges := s.GetFileSubGraph(filePath) + return nodes, len(edges) +} diff --git a/internal/graph/store_sqlite/store_vector.go b/internal/graph/store_sqlite/store_vector.go new file mode 100644 index 00000000..2bb60e07 --- /dev/null +++ b/internal/graph/store_sqlite/store_vector.go @@ -0,0 +1,235 @@ +package store_sqlite + +import ( + "container/heap" + "encoding/binary" + "errors" + "math" + + "github.com/zzet/gortex/internal/graph" +) + +// Compile-time assertion that the SQLite Store satisfies the optional +// engine-native vector-search capability. +var _ graph.VectorSearcher = (*Store)(nil) + +// errInvalidDims is returned by BuildVectorIndex for a negative width. +var errInvalidDims = errors.New("store_sqlite: invalid vector dims") + +// Vector design (pure-Go, zero CGo) +// +// modernc.org/sqlite is a pure-Go SQLite that cannot load C extensions, +// so sqlite-vec / sqlite-vector are off the table — and staying CGo-free +// is the whole point of this backend. Embeddings are persisted as a +// little-endian float32 BLOB in the `vectors` table; the win over the +// daemon's in-process HNSW fallback is durability: vectors survive a +// restart instead of being recomputed. +// +// Queries use an exact brute-force cosine top-k: SimilarTo streams every +// stored vector, scores it against the query, and keeps the best `limit` +// in a bounded max-heap. This is O(N) per query but fully correct, +// deterministic, and holds no extra Store state (the Store struct lives +// in store.go and cannot be edited here). An on-Store HNSW cache is a +// future optimisation; for the corpus sizes this backend targets the +// exact path is the simplest thing that is verifiably right. +// +// BuildVectorIndex only validates/records intent — there is no separate +// index structure to build, since SimilarTo computes over the table +// directly. 
+ +// vectorChunk bounds rows per multi-row INSERT in BulkUpsertEmbeddings. +// 3 host params per row, SQLite's default limit is 999 → 333 max; 300 +// leaves headroom. +const vectorChunk = 300 + +// encodeVec serialises a float32 slice to a little-endian BLOB +// (4 bytes per element). +func encodeVec(vec []float32) []byte { + b := make([]byte, len(vec)*4) + for i, f := range vec { + binary.LittleEndian.PutUint32(b[i*4:], math.Float32bits(f)) + } + return b +} + +// decodeVec is the inverse of encodeVec. A BLOB whose length is not a +// multiple of 4 yields nil (corrupt row); callers skip nil vectors. +func decodeVec(b []byte) []float32 { + if len(b)%4 != 0 { + return nil + } + out := make([]float32, len(b)/4) + for i := range out { + out[i] = math.Float32frombits(binary.LittleEndian.Uint32(b[i*4:])) + } + return out +} + +// UpsertEmbedding persists one node's embedding, replacing any prior +// vector for that node ID. +func (s *Store) UpsertEmbedding(nodeID string, vec []float32) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + _, err := s.db.Exec( + `INSERT OR REPLACE INTO vectors (node_id, dims, vec) VALUES (?, ?, ?)`, + nodeID, len(vec), encodeVec(vec), + ) + return err +} + +// BulkUpsertEmbeddings persists many embeddings in a single transaction, +// chunked under SQLite's host-parameter limit. Idempotent on NodeID. +// Empty input is a no-op. 
+func (s *Store) BulkUpsertEmbeddings(items []graph.VectorItem) error { + if len(items) == 0 { + return nil + } + + s.writeMu.Lock() + defer s.writeMu.Unlock() + + tx, err := s.db.Begin() + if err != nil { + return err + } + defer tx.Rollback() //nolint:errcheck // rollback after Commit is a no-op + + for start := 0; start < len(items); start += vectorChunk { + end := start + vectorChunk + if end > len(items) { + end = len(items) + } + batch := items[start:end] + + args := make([]any, 0, len(batch)*3) + stmt := make([]byte, 0, 64+len(batch)*16) + stmt = append(stmt, "INSERT OR REPLACE INTO vectors (node_id, dims, vec) VALUES "...) + for i, it := range batch { + if i > 0 { + stmt = append(stmt, ',') + } + stmt = append(stmt, "(?, ?, ?)"...) + args = append(args, it.NodeID, len(it.Vec), encodeVec(it.Vec)) + } + if _, err := tx.Exec(string(stmt), args...); err != nil { + return err + } + } + + return tx.Commit() +} + +// BuildVectorIndex finalises the vector index. Because SimilarTo scores +// over the `vectors` table directly there is no separate structure to +// populate; this validates the declared width is positive and is +// otherwise a no-op (idempotent, safe to call repeatedly). +func (s *Store) BuildVectorIndex(dims int) error { + if dims < 0 { + return errInvalidDims + } + return nil +} + +// SimilarTo returns up to `limit` stored vectors closest to the query +// under cosine distance, ordered by ascending distance (most similar +// first). Vectors whose length differs from the query are skipped — a +// dimension mismatch can't be meaningfully scored. 
+func (s *Store) SimilarTo(vec []float32, limit int) ([]graph.VectorHit, error) { + if limit <= 0 || len(vec) == 0 { + return nil, nil + } + + qNorm := norm(vec) + if qNorm == 0 { + return nil, nil + } + + rows, err := s.db.Query(`SELECT node_id, vec FROM vectors`) + if err != nil { + return nil, err + } + defer func() { _ = rows.Close() }() + + // Max-heap keyed on distance: the root is the *worst* kept hit, so a + // candidate better than the root evicts it. This keeps the heap at + // `limit` and yields an exact top-k. + h := &hitHeap{} + for rows.Next() { + var id string + var blob []byte + if err := rows.Scan(&id, &blob); err != nil { + return nil, err + } + cand := decodeVec(blob) + if len(cand) != len(vec) { + continue + } + cNorm := norm(cand) + if cNorm == 0 { + continue + } + dist := cosineDistance(vec, cand, qNorm, cNorm) + + if h.Len() < limit { + heap.Push(h, graph.VectorHit{NodeID: id, Distance: dist}) + } else if dist < (*h)[0].Distance { + (*h)[0] = graph.VectorHit{NodeID: id, Distance: dist} + heap.Fix(h, 0) + } + } + if err := rows.Err(); err != nil { + return nil, err + } + + // Drain the max-heap (largest distance first) then reverse so the + // result is ascending by distance (most similar first). + out := make([]graph.VectorHit, h.Len()) + for i := len(out) - 1; i >= 0; i-- { + out[i] = heap.Pop(h).(graph.VectorHit) + } + return out, nil +} + +// norm returns the Euclidean norm (L2) of v as a float64. +func norm(v []float32) float64 { + var sum float64 + for _, f := range v { + d := float64(f) + sum += d * d + } + return math.Sqrt(sum) +} + +// cosineDistance returns 1 - cosine_similarity(a, b), given precomputed +// norms. Lower = more similar; identical direction → ~0, orthogonal → 1, +// opposite → 2. a and b are assumed equal length and non-zero norm. 
+func cosineDistance(a, b []float32, aNorm, bNorm float64) float64 { + var dot float64 + for i := range a { + dot += float64(a[i]) * float64(b[i]) + } + sim := dot / (aNorm * bNorm) + // Guard against tiny floating-point overshoot past ±1. + if sim > 1 { + sim = 1 + } else if sim < -1 { + sim = -1 + } + return 1 - sim +} + +// hitHeap is a max-heap of VectorHit ordered by Distance: Less reports +// the *larger* distance as "less" so the root is the worst-kept hit. +type hitHeap []graph.VectorHit + +func (h hitHeap) Len() int { return len(h) } +func (h hitHeap) Less(i, j int) bool { return h[i].Distance > h[j].Distance } +func (h hitHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } +func (h *hitHeap) Push(x any) { *h = append(*h, x.(graph.VectorHit)) } +func (h *hitHeap) Pop() any { + old := *h + n := len(old) + it := old[n-1] + *h = old[:n-1] + return it +} diff --git a/internal/graph/store_sqlite/store_vector_mtime_test.go b/internal/graph/store_sqlite/store_vector_mtime_test.go new file mode 100644 index 00000000..97e1a8f8 --- /dev/null +++ b/internal/graph/store_sqlite/store_vector_mtime_test.go @@ -0,0 +1,302 @@ +package store_sqlite_test + +import ( + "math" + "math/rand" + "path/filepath" + "reflect" + "sort" + "testing" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/graph/store_sqlite" +) + +// openTestStore opens a fresh on-disk SQLite store in a temp dir and +// registers Close as cleanup. (modernc.org/sqlite's ":memory:" gives +// each pooled connection its OWN private database, so the conformance +// suite — and these tests — use an on-disk file shared across the pool.) 
+func openTestStore(t *testing.T) *store_sqlite.Store { + t.Helper() + s, err := store_sqlite.Open(filepath.Join(t.TempDir(), "test.sqlite")) + if err != nil { + t.Fatalf("Open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + return s +} + +// --- FileMtime persistence ------------------------------------------- + +func TestSQLiteFileMtimeRoundTrip(t *testing.T) { + s := openTestStore(t) + + // Single-row writes. + if err := s.SetFileMtime("repoA", "a/one.go", 100); err != nil { + t.Fatalf("SetFileMtime: %v", err) + } + if err := s.SetFileMtime("repoA", "a/two.go", 200); err != nil { + t.Fatalf("SetFileMtime: %v", err) + } + + // Batch write (includes an overwrite of an existing key). + batch := map[string]int64{ + "a/two.go": 250, // overwrite + "a/three.go": 300, + "a/four.go": 400, + } + if err := s.BulkSetFileMtimes("repoA", batch); err != nil { + t.Fatalf("BulkSetFileMtimes: %v", err) + } + + want := map[string]int64{ + "a/one.go": 100, + "a/two.go": 250, + "a/three.go": 300, + "a/four.go": 400, + } + + got, err := s.FileMtimes("repoA") + if err != nil { + t.Fatalf("FileMtimes: %v", err) + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("FileMtimes(repoA) = %v, want %v", got, want) + } + + // LoadFileMtimes (the interface method) must agree. + if loaded := s.LoadFileMtimes("repoA"); !reflect.DeepEqual(loaded, want) { + t.Fatalf("LoadFileMtimes(repoA) = %v, want %v", loaded, want) + } + + // Repo isolation: a different prefix is unaffected. + if err := s.SetFileMtime("repoB", "b/x.go", 999); err != nil { + t.Fatalf("SetFileMtime repoB: %v", err) + } + if got, _ := s.FileMtimes("repoA"); !reflect.DeepEqual(got, want) { + t.Fatalf("repoA changed after repoB write: %v", got) + } + + // Unknown repo: FileMtimes returns an empty (non-nil) map; + // LoadFileMtimes returns nil (the "no data" signal). 
+ empty, err := s.FileMtimes("nope") + if err != nil { + t.Fatalf("FileMtimes(unknown): %v", err) + } + if len(empty) != 0 { + t.Fatalf("FileMtimes(unknown) = %v, want empty", empty) + } + if loaded := s.LoadFileMtimes("nope"); loaded != nil { + t.Fatalf("LoadFileMtimes(unknown) = %v, want nil", loaded) + } + + // Empty batch is a no-op. + if err := s.BulkSetFileMtimes("repoA", nil); err != nil { + t.Fatalf("BulkSetFileMtimes(nil): %v", err) + } +} + +// --- Vector search --------------------------------------------------- + +// bruteForceCosine ranks corpus against query the long way (exact cosine +// distance, ascending) so the test verifies SimilarTo independently of +// the implementation under test. +func bruteForceCosine(query []float32, corpus map[string][]float32, k int) []string { + type sc struct { + id string + dist float64 + } + scored := make([]sc, 0, len(corpus)) + qn := l2(query) + for id, v := range corpus { + vn := l2(v) + if qn == 0 || vn == 0 { + continue + } + var dot float64 + for i := range query { + dot += float64(query[i]) * float64(v[i]) + } + scored = append(scored, sc{id: id, dist: 1 - dot/(qn*vn)}) + } + sort.Slice(scored, func(i, j int) bool { + if scored[i].dist == scored[j].dist { + return scored[i].id < scored[j].id // stable tie-break + } + return scored[i].dist < scored[j].dist + }) + out := make([]string, 0, k) + for i := 0; i < k && i < len(scored); i++ { + out = append(out, scored[i].id) + } + return out +} + +func l2(v []float32) float64 { + var s float64 + for _, f := range v { + s += float64(f) * float64(f) + } + return math.Sqrt(s) +} + +func TestSQLiteVectorSimilarTo(t *testing.T) { + s := openTestStore(t) + + const ( + n = 50 + dims = 16 + ) + rng := rand.New(rand.NewSource(42)) + + corpus := make(map[string][]float32, n) + items := make([]graph.VectorItem, 0, n) + var ids []string + for i := 0; i < n; i++ { + id := nodeID(i) + ids = append(ids, id) + v := make([]float32, dims) + for d := 0; d < dims; d++ { + v[d] = 
float32(rng.NormFloat64()) + } + corpus[id] = v + items = append(items, graph.VectorItem{NodeID: id, Vec: v}) + } + + if err := s.BulkUpsertEmbeddings(items); err != nil { + t.Fatalf("BulkUpsertEmbeddings: %v", err) + } + if err := s.BuildVectorIndex(dims); err != nil { + t.Fatalf("BuildVectorIndex: %v", err) + } + + // Query == a stored vector → it must rank first at distance ~0. + queryID := ids[7] + query := corpus[queryID] + + hits, err := s.SimilarTo(query, 5) + if err != nil { + t.Fatalf("SimilarTo: %v", err) + } + if len(hits) != 5 { + t.Fatalf("SimilarTo returned %d hits, want 5", len(hits)) + } + if hits[0].NodeID != queryID { + t.Fatalf("top hit = %q, want the query vector %q", hits[0].NodeID, queryID) + } + if hits[0].Distance > 1e-6 { + t.Fatalf("top hit distance = %g, want ~0", hits[0].Distance) + } + + // Distances must be ascending. + for i := 1; i < len(hits); i++ { + if hits[i].Distance < hits[i-1].Distance { + t.Fatalf("hits not ascending by distance: %v", hits) + } + } + + // Independent brute-force ranking must match the returned top-5 ids. + want := bruteForceCosine(query, corpus, 5) + gotIDs := make([]string, len(hits)) + for i, h := range hits { + gotIDs[i] = h.NodeID + } + if !reflect.DeepEqual(gotIDs, want) { + t.Fatalf("SimilarTo top-5 = %v, brute-force = %v", gotIDs, want) + } + + // Single-add path: a new vector identical to ids[3]'s should be + // retrievable and rank at distance ~0 for its own query. + extra := make([]float32, dims) + copy(extra, corpus[ids[3]]) + if err := s.UpsertEmbedding("extra::node", extra); err != nil { + t.Fatalf("UpsertEmbedding: %v", err) + } + exHits, err := s.SimilarTo(extra, 3) + if err != nil { + t.Fatalf("SimilarTo (extra): %v", err) + } + if len(exHits) == 0 { + t.Fatalf("SimilarTo(extra) returned nothing") + } + // Either the original ids[3] or the new extra::node (both identical + // vectors, distance ~0) may sort first; the new one must be present + // at distance ~0. 
+ foundExtra := false + for _, h := range exHits { + if h.NodeID == "extra::node" { + foundExtra = true + if h.Distance > 1e-6 { + t.Fatalf("extra::node distance = %g, want ~0", h.Distance) + } + } + } + if !foundExtra { + t.Fatalf("UpsertEmbedding'd vector not found in SimilarTo results: %v", exHits) + } +} + +func TestSQLiteVectorPersistence(t *testing.T) { + path := filepath.Join(t.TempDir(), "v.sqlite") + + corpus := map[string][]float32{ + "n::1": {1, 0, 0, 0, 0, 0, 0, 0}, + "n::2": {0, 1, 0, 0, 0, 0, 0, 0}, + "n::3": {0, 0, 1, 0, 0, 0, 0, 0}, + } + + // First session: write and close. + { + s, err := store_sqlite.Open(path) + if err != nil { + t.Fatalf("open: %v", err) + } + items := make([]graph.VectorItem, 0, len(corpus)) + for id, v := range corpus { + items = append(items, graph.VectorItem{NodeID: id, Vec: v}) + } + if err := s.BulkUpsertEmbeddings(items); err != nil { + t.Fatalf("BulkUpsertEmbeddings: %v", err) + } + if err := s.Close(); err != nil { + t.Fatalf("close: %v", err) + } + } + + // Second session: reopen, vectors must still be queryable. + { + s, err := store_sqlite.Open(path) + if err != nil { + t.Fatalf("reopen: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + + query := []float32{1, 0, 0, 0, 0, 0, 0, 0} + hits, err := s.SimilarTo(query, 3) + if err != nil { + t.Fatalf("SimilarTo after reopen: %v", err) + } + if len(hits) != 3 { + t.Fatalf("after reopen got %d hits, want 3 (persistence failed)", len(hits)) + } + if hits[0].NodeID != "n::1" { + t.Fatalf("after reopen top hit = %q, want n::1", hits[0].NodeID) + } + if hits[0].Distance > 1e-6 { + t.Fatalf("after reopen top distance = %g, want ~0", hits[0].Distance) + } + } +} + +func nodeID(i int) string { + const digits = "0123456789" + if i == 0 { + return "node::0" + } + var b []byte + for i > 0 { + b = append([]byte{digits[i%10]}, b...) 
+ i /= 10 + } + return "node::" + string(b) +} diff --git a/internal/graph/storetest/backend_resolver.go b/internal/graph/storetest/backend_resolver.go new file mode 100644 index 00000000..2400de99 --- /dev/null +++ b/internal/graph/storetest/backend_resolver.go @@ -0,0 +1,272 @@ +package storetest + +import ( + "testing" + + "github.com/zzet/gortex/internal/graph" +) + +// RunBackendResolverConformance exercises every method of the +// graph.BackendResolver interface against a Factory that produces a +// store implementing both graph.Store and graph.BackendResolver. The +// shape mirrors RunConformance (the main Store contract): a known +// fixture graph, run the rule, assert the post-state matches the +// expected resolution. +// +// Backends that haven't implemented a rule yet ship the Phase 1 stub +// that returns (0, nil); those subtests pass trivially because the +// fixture also asserts zero-progress doesn't break correctness. +func RunBackendResolverConformance(t *testing.T, factory Factory) { + t.Helper() + t.Run("BackendResolver_SameFile", func(t *testing.T) { testBRSameFile(t, factory) }) + t.Run("BackendResolver_SamePackage", func(t *testing.T) { testBRSamePackage(t, factory) }) + t.Run("BackendResolver_ImportAware", func(t *testing.T) { testBRImportAware(t, factory) }) + t.Run("BackendResolver_RelativeImports", func(t *testing.T) { testBRRelativeImports(t, factory) }) + t.Run("BackendResolver_CrossRepo", func(t *testing.T) { testBRCrossRepo(t, factory) }) + t.Run("BackendResolver_UniqueNames", func(t *testing.T) { testBRUniqueNames(t, factory) }) + t.Run("BackendResolver_ExternalCallStubs", func(t *testing.T) { testBRExternalCallStubs(t, factory) }) + t.Run("BackendResolver_AllBulk", func(t *testing.T) { testBRAllBulk(t, factory) }) +} + +func asBackendResolver(t *testing.T, s graph.Store) graph.BackendResolver { + t.Helper() + br, ok := s.(graph.BackendResolver) + if !ok { + t.Skip("store does not implement graph.BackendResolver") + } + return br +} + 
+func testBRSameFile(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + br := asBackendResolver(t, s) + // caller and target in same file — unambiguous match + s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)) + s.AddNode(mkNode("a.go::Bar", "Bar", "a.go", graph.KindFunction)) + s.AddEdge(&graph.Edge{ + From: "a.go::Foo", To: "unresolved::Bar", Kind: graph.EdgeCalls, + FilePath: "a.go", Line: 1, Origin: "", + }) + n, err := br.ResolveSameFile() + if err != nil { + t.Fatalf("ResolveSameFile: %v", err) + } + if n == 0 { + // stub backend — skip the post-state assertions + return + } + if n != 1 { + t.Fatalf("ResolveSameFile resolved %d, want 1", n) + } + // edge should now point at a.go::Bar with origin ast_resolved + got := s.GetOutEdges("a.go::Foo") + if len(got) != 1 || got[0].To != "a.go::Bar" || got[0].Origin != graph.OriginASTResolved { + t.Fatalf("ResolveSameFile post-state: edges=%+v", got) + } +} + +func testBRSamePackage(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + br := asBackendResolver(t, s) + // caller in pkg/a.go, target in pkg/b.go — same directory + s.AddNode(mkRepoNode("pkg/a.go::Caller", "Caller", "pkg/a.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("pkg/b.go::Target", "Target", "pkg/b.go", "r1", graph.KindFunction)) + s.AddEdge(&graph.Edge{ + From: "pkg/a.go::Caller", To: "unresolved::Target", Kind: graph.EdgeCalls, + FilePath: "pkg/a.go", Line: 1, Origin: "", + }) + n, err := br.ResolveSamePackage() + if err != nil { + t.Fatalf("ResolveSamePackage: %v", err) + } + if n == 0 { + return + } + if n != 1 { + t.Fatalf("ResolveSamePackage resolved %d, want 1", n) + } + got := s.GetOutEdges("pkg/a.go::Caller") + if len(got) != 1 || got[0].To != "pkg/b.go::Target" || got[0].Origin != graph.OriginASTResolved { + t.Fatalf("ResolveSamePackage post-state: edges=%+v", got) + } +} + +func testBRImportAware(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + br := asBackendResolver(t, s) 
+ // caller.go imports lib.go which exports Target + s.AddNode(mkNode("caller.go", "caller.go", "caller.go", graph.KindFile)) + s.AddNode(mkNode("lib.go", "lib.go", "lib.go", graph.KindFile)) + s.AddNode(mkNode("caller.go::Caller", "Caller", "caller.go", graph.KindFunction)) + s.AddNode(mkNode("lib.go::Target", "Target", "lib.go", graph.KindFunction)) + // the imports edge + s.AddEdge(&graph.Edge{ + From: "caller.go", To: "lib.go", Kind: graph.EdgeImports, + FilePath: "caller.go", Line: 1, Origin: graph.OriginASTResolved, + }) + // the unresolved call + s.AddEdge(&graph.Edge{ + From: "caller.go::Caller", To: "unresolved::Target", Kind: graph.EdgeCalls, + FilePath: "caller.go", Line: 5, Origin: "", + }) + n, err := br.ResolveImportAware() + if err != nil { + t.Fatalf("ResolveImportAware: %v", err) + } + if n == 0 { + return + } + if n != 1 { + t.Fatalf("ResolveImportAware resolved %d, want 1", n) + } + got := s.GetOutEdges("caller.go::Caller") + var found bool + for _, e := range got { + if e.To == "lib.go::Target" { + found = true + } + } + if !found { + t.Fatalf("ResolveImportAware post-state: edges=%+v, want one to lib.go::Target", got) + } +} + +func testBRRelativeImports(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + br := asBackendResolver(t, s) + // python relative-import stub + s.AddNode(mkNode("app/util.py", "app/util.py", "app/util.py", graph.KindFile)) + s.AddNode(mkNode("app/main.py", "app/main.py", "app/main.py", graph.KindFile)) + s.AddEdge(&graph.Edge{ + From: "app/main.py", To: "unresolved::pyrel::app/util", Kind: graph.EdgeImports, + FilePath: "app/main.py", Line: 1, Origin: "", + }) + n, err := br.ResolveRelativeImports("python") + if err != nil { + t.Fatalf("ResolveRelativeImports: %v", err) + } + if n == 0 { + return + } + if n != 1 { + t.Fatalf("ResolveRelativeImports resolved %d, want 1", n) + } + got := s.GetOutEdges("app/main.py") + var found bool + for _, e := range got { + if e.To == "app/util.py" { + found = true + } + } 
+ if !found { + t.Fatalf("ResolveRelativeImports post-state: edges=%+v, want one to app/util.py", got) + } +} + +func testBRCrossRepo(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + br := asBackendResolver(t, s) + s.AddNode(mkRepoNode("r1/a.go::Caller", "Caller", "r1/a.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r2/x.go::Target", "Target", "r2/x.go", "r2", graph.KindFunction)) + s.AddEdge(&graph.Edge{ + From: "r1/a.go::Caller", To: "unresolved::Target", Kind: graph.EdgeCalls, + FilePath: "r1/a.go", Line: 1, Origin: "", + }) + n, err := br.ResolveCrossRepo() + if err != nil { + t.Fatalf("ResolveCrossRepo: %v", err) + } + if n == 0 { + return + } + if n != 1 { + t.Fatalf("ResolveCrossRepo resolved %d, want 1", n) + } + got := s.GetOutEdges("r1/a.go::Caller") + if len(got) != 1 || got[0].To != "r2/x.go::Target" || !got[0].CrossRepo { + t.Fatalf("ResolveCrossRepo post-state: edges=%+v", got) + } +} + +func testBRUniqueNames(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + br := asBackendResolver(t, s) + // One unique-name candidate in the graph. 
+ s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)) + s.AddNode(mkNode("b.go::Target", "Target", "b.go", graph.KindFunction)) + s.AddEdge(&graph.Edge{ + From: "a.go::Foo", To: "unresolved::Target", Kind: graph.EdgeCalls, + FilePath: "a.go", Line: 1, Origin: "", + }) + n, err := br.ResolveUniqueNames() + if err != nil { + t.Fatalf("ResolveUniqueNames: %v", err) + } + if n == 0 { + return + } + if n != 1 { + t.Fatalf("ResolveUniqueNames resolved %d, want 1", n) + } + got := s.GetOutEdges("a.go::Foo") + if len(got) != 1 || got[0].To != "b.go::Target" { + t.Fatalf("ResolveUniqueNames post-state: edges=%+v", got) + } +} + +func testBRExternalCallStubs(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + br := asBackendResolver(t, s) + s.AddNode(mkNode("a.go::Caller", "Caller", "a.go", graph.KindFunction)) + // edge to external::npm/foo::bar with no stub node + s.AddEdge(&graph.Edge{ + From: "a.go::Caller", To: "external::npm/foo::bar", Kind: graph.EdgeCalls, + FilePath: "a.go", Line: 1, Origin: "", + }) + n, err := br.ResolveExternalCallStubs() + if err != nil { + t.Fatalf("ResolveExternalCallStubs: %v", err) + } + if n == 0 { + return + } + if n < 1 { + t.Fatalf("ResolveExternalCallStubs resolved %d, want >= 1", n) + } + // stub node must now exist + if s.GetNode("external::npm/foo::bar") == nil { + t.Fatalf("external stub node not created") + } +} + +func testBRAllBulk(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + br := asBackendResolver(t, s) + // Mix of resolvable + stub cases. 
+ s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)) + s.AddNode(mkNode("a.go::Bar", "Bar", "a.go", graph.KindFunction)) + s.AddNode(mkNode("b.go::Unique", "Unique", "b.go", graph.KindFunction)) + // same-file + s.AddEdge(&graph.Edge{ + From: "a.go::Foo", To: "unresolved::Bar", Kind: graph.EdgeCalls, + FilePath: "a.go", Line: 1, Origin: "", + }) + // unique-name + s.AddEdge(&graph.Edge{ + From: "a.go::Foo", To: "unresolved::Unique", Kind: graph.EdgeCalls, + FilePath: "a.go", Line: 2, Origin: "", + }) + n, err := br.ResolveAllBulk() + if err != nil { + t.Fatalf("ResolveAllBulk: %v", err) + } + _ = n // 0 on stub backends, >0 on real +} diff --git a/internal/graph/storetest/memory_conformance_test.go b/internal/graph/storetest/memory_conformance_test.go new file mode 100644 index 00000000..29537241 --- /dev/null +++ b/internal/graph/storetest/memory_conformance_test.go @@ -0,0 +1,18 @@ +package storetest_test + +import ( + "testing" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/graph/storetest" +) + +// TestMemoryStoreConformance proves the in-memory *graph.Graph (the +// only Store impl that exists today) satisfies the conformance suite. +// This is the canonical baseline; new backends must pass the same +// battery. +func TestMemoryStoreConformance(t *testing.T) { + storetest.RunConformance(t, func(t *testing.T) graph.Store { + return graph.New() + }) +} diff --git a/internal/graph/storetest/storetest.go b/internal/graph/storetest/storetest.go new file mode 100644 index 00000000..2d10e112 --- /dev/null +++ b/internal/graph/storetest/storetest.go @@ -0,0 +1,3647 @@ +// Package storetest provides a conformance test suite that every +// graph.Store implementation MUST pass. Each backend (in-memory, +// bbolt-on-disk, SQLite-on-disk, remote-network-client) has a thin +// _test.go that calls RunConformance(t, factory) and inherits the +// full battery. 
+// +// The contract this package encodes is the union of behaviour the +// rest of gortex depends on from *graph.Graph today. New Store +// implementations are expected to satisfy every test before they can +// be considered a drop-in replacement. +package storetest + +import ( + "fmt" + "sort" + "sync" + "testing" + + "github.com/zzet/gortex/internal/graph" +) + +// Factory constructs a fresh, empty Store. RunConformance calls it +// many times across subtests; each invocation must yield an +// independent store with no leakage from previous runs. Backends with +// on-disk state should use t.TempDir() internally to isolate. +type Factory func(t *testing.T) graph.Store + +// RunConformance runs the full conformance suite against the Store +// produced by factory. Backends invoke it from a _test.go in their +// own package. +func RunConformance(t *testing.T, factory Factory) { + t.Helper() + t.Run("AddGetNode", func(t *testing.T) { testAddGetNode(t, factory) }) + t.Run("AddGetEdge", func(t *testing.T) { testAddGetEdge(t, factory) }) + t.Run("AddNodeIdempotent", func(t *testing.T) { testAddNodeIdempotent(t, factory) }) + t.Run("AddEdgeIdempotent", func(t *testing.T) { testAddEdgeIdempotent(t, factory) }) + t.Run("AddEdgeLineDisambiguates", func(t *testing.T) { testAddEdgeLineDisambiguates(t, factory) }) + t.Run("AddBatch", func(t *testing.T) { testAddBatch(t, factory) }) + t.Run("RemoveEdge", func(t *testing.T) { testRemoveEdge(t, factory) }) + t.Run("EvictFile", func(t *testing.T) { testEvictFile(t, factory) }) + t.Run("EvictFile_NoNodes", func(t *testing.T) { testEvictFileNoNodes(t, factory) }) + t.Run("EvictRepo", func(t *testing.T) { testEvictRepo(t, factory) }) + t.Run("EvictRepo_NoNodes", func(t *testing.T) { testEvictRepoNoNodes(t, factory) }) + t.Run("NodeAndEdgeCount", func(t *testing.T) { testNodeAndEdgeCount(t, factory) }) + t.Run("AllNodesAndEdges", func(t *testing.T) { testAllNodesAndEdges(t, factory) }) + t.Run("FindNodesByName", func(t *testing.T) { 
testFindNodesByName(t, factory) }) + t.Run("FindNodesByNameInRepo", func(t *testing.T) { testFindNodesByNameInRepo(t, factory) }) + t.Run("FindNodesByNameContaining", func(t *testing.T) { testFindNodesByNameContaining(t, factory) }) + t.Run("GetFileNodes", func(t *testing.T) { testGetFileNodes(t, factory) }) + t.Run("GetRepoNodes", func(t *testing.T) { testGetRepoNodes(t, factory) }) + t.Run("GetRepoEdges", func(t *testing.T) { testGetRepoEdges(t, factory) }) + t.Run("GetNodeByQualName", func(t *testing.T) { testGetNodeByQualName(t, factory) }) + t.Run("Stats", func(t *testing.T) { testStats(t, factory) }) + t.Run("RepoStats", func(t *testing.T) { testRepoStats(t, factory) }) + t.Run("RepoPrefixes", func(t *testing.T) { testRepoPrefixes(t, factory) }) + t.Run("SetEdgeProvenance", func(t *testing.T) { testSetEdgeProvenance(t, factory) }) + t.Run("SetEdgeProvenanceBatch", func(t *testing.T) { testSetEdgeProvenanceBatch(t, factory) }) + t.Run("ReindexEdge", func(t *testing.T) { testReindexEdge(t, factory) }) + t.Run("ReindexEdges", func(t *testing.T) { testReindexEdges(t, factory) }) + t.Run("Concurrency", func(t *testing.T) { testConcurrency(t, factory) }) + t.Run("EdgeIdentityRevisions", func(t *testing.T) { testEdgeIdentityRevisions(t, factory) }) + t.Run("VerifyEdgeIdentities", func(t *testing.T) { testVerifyEdgeIdentities(t, factory) }) + t.Run("RepoMemoryEstimate", func(t *testing.T) { testRepoMemoryEstimate(t, factory) }) + t.Run("AllRepoMemoryEstimates", func(t *testing.T) { testAllRepoMemoryEstimates(t, factory) }) + t.Run("MetaPreserved", func(t *testing.T) { testMetaPreserved(t, factory) }) + t.Run("EmptyStore", func(t *testing.T) { testEmptyStore(t, factory) }) + t.Run("EdgesByKind", func(t *testing.T) { testEdgesByKind(t, factory) }) + t.Run("NodesByKind", func(t *testing.T) { testNodesByKind(t, factory) }) + t.Run("EdgesWithUnresolvedTarget", func(t *testing.T) { testEdgesWithUnresolvedTarget(t, factory) }) + t.Run("GetNodesByIDs", func(t *testing.T) { 
testGetNodesByIDs(t, factory) }) + t.Run("FindNodesByNames", func(t *testing.T) { testFindNodesByNames(t, factory) }) + t.Run("GetEdgesByNodeIDs", func(t *testing.T) { testGetEdgesByNodeIDs(t, factory) }) + t.Run("SymbolBundleSearcher", func(t *testing.T) { testSymbolBundleSearcher(t, factory) }) + t.Run("DeadCodeCandidator", func(t *testing.T) { testDeadCodeCandidator(t, factory) }) + t.Run("IfaceImplementsScanner", func(t *testing.T) { testIfaceImplementsScanner(t, factory) }) + t.Run("NodeDegreeAggregator", func(t *testing.T) { testNodeDegreeAggregator(t, factory) }) + t.Run("NodeFanAggregator", func(t *testing.T) { testNodeFanAggregator(t, factory) }) + t.Run("FileImporters", func(t *testing.T) { testFileImporters(t, factory) }) + t.Run("InEdgeCounter", func(t *testing.T) { testInEdgeCounter(t, factory) }) + t.Run("NodesInFilesByKindFinder", func(t *testing.T) { testNodesInFilesByKindFinder(t, factory) }) + t.Run("EdgesByKindsScanner", func(t *testing.T) { testEdgesByKindsScanner(t, factory) }) + t.Run("NodesByKindsScanner", func(t *testing.T) { testNodesByKindsScanner(t, factory) }) + t.Run("EdgeKindCounter", func(t *testing.T) { testEdgeKindCounter(t, factory) }) + t.Run("CrossRepoEdgeAggregator", func(t *testing.T) { testCrossRepoEdgeAggregator(t, factory) }) + t.Run("FileImportAggregator", func(t *testing.T) { testFileImportAggregator(t, factory) }) + t.Run("InDegreeForNodes", func(t *testing.T) { testInDegreeForNodes(t, factory) }) + t.Run("ReachableForwardByKinds", func(t *testing.T) { testReachableForwardByKinds(t, factory) }) + t.Run("ThrowerErrorSurfacer", func(t *testing.T) { testThrowerErrorSurfacer(t, factory) }) + t.Run("EdgeAdjacencyForKinds", func(t *testing.T) { testEdgeAdjacencyForKinds(t, factory) }) + t.Run("CommunityCrossingsByKind", func(t *testing.T) { testCommunityCrossingsByKind(t, factory) }) + t.Run("NodeIDsByKinds", func(t *testing.T) { testNodeIDsByKinds(t, factory) }) + t.Run("MemberMethodsByType", func(t *testing.T) { 
testMemberMethodsByType(t, factory) })
+	t.Run("StructuralParentEdges", func(t *testing.T) { testStructuralParentEdges(t, factory) })
+	t.Run("CrossRepoCandidates", func(t *testing.T) { testCrossRepoCandidates(t, factory) })
+	t.Run("ExtractCandidates", func(t *testing.T) { testExtractCandidates(t, factory) })
+	t.Run("FileSymbolNamesByPaths", func(t *testing.T) { testFileSymbolNamesByPaths(t, factory) })
+	t.Run("ClassHierarchyTraverser", func(t *testing.T) { testClassHierarchyTraverser(t, factory) })
+	t.Run("FileEditingContext", func(t *testing.T) { testFileEditingContext(t, factory) })
+	t.Run("NodeDegreeByKinds", func(t *testing.T) { testNodeDegreeByKinds(t, factory) })
+	t.Run("CloneShingleSidecar", func(t *testing.T) { testCloneShingleSidecar(t, factory) })
+	t.Run("ChurnEnrichmentSidecar", func(t *testing.T) { testChurnEnrichmentSidecar(t, factory) })
+	t.Run("CoverageEnrichmentSidecar", func(t *testing.T) { testCoverageEnrichmentSidecar(t, factory) })
+	t.Run("ReleaseEnrichmentSidecar", func(t *testing.T) { testReleaseEnrichmentSidecar(t, factory) })
+	t.Run("BlameEnrichmentSidecar", func(t *testing.T) { testBlameEnrichmentSidecar(t, factory) })
+}
+
+// -- fixture helpers ---------------------------------------------------
+
+// mkNode builds a Go-language node with the given identity, spanning
+// lines 1-10 of file.
+func mkNode(id, name, file string, kind graph.NodeKind) *graph.Node {
+	node := new(graph.Node)
+	node.ID, node.Kind, node.Name = id, kind, name
+	node.FilePath = file
+	node.StartLine, node.EndLine = 1, 10
+	node.Language = "go"
+	return node
+}
+
+// mkRepoNode is mkNode with RepoPrefix set to repo.
+func mkRepoNode(id, name, file, repo string, kind graph.NodeKind) *graph.Node {
+	node := mkNode(id, name, file, kind)
+	node.RepoPrefix = repo
+	return node
+}
+
+// mkEdge builds an AST-resolved edge from -> to of the given kind, located
+// at test.go line 1 with confidence 1.0.
+func mkEdge(from, to string, kind graph.EdgeKind) *graph.Edge {
+	edge := new(graph.Edge)
+	edge.From, edge.To, edge.Kind = from, to, kind
+	edge.FilePath, edge.Line = "test.go", 1
+	edge.Confidence = 1.0
+	edge.Origin = graph.OriginASTResolved
+	return edge
+}
+
+// sortNodeIDs returns the IDs of the non-nil nodes in ascending order.
+func sortNodeIDs(nodes []*graph.Node) []string {
+	ids := make([]string, 0, len(nodes))
+	for _, node := range nodes {
+		if node == nil {
+			continue
+		}
+		ids = append(ids, node.ID)
+	}
+	sort.Strings(ids)
+	return ids
+}
+
+// sortEdgeKeys renders every non-nil edge as "from|to|kind|line" and
+// returns the keys in ascending order.
+func sortEdgeKeys(edges []*graph.Edge) []string {
+	keys := make([]string, 0, len(edges))
+	for _, edge := range edges {
+		if edge == nil {
+			continue
+		}
+		key := fmt.Sprintf("%s|%s|%s|%d", edge.From, edge.To, edge.Kind, edge.Line)
+		keys = append(keys, key)
+	}
+	sort.Strings(keys)
+	return keys
+}
+
+// -- individual subtests ----------------------------------------------
+
+// testAddGetNode checks that an added node can be read back with its
+// name, file path and kind intact, and that an unknown ID yields nil.
+func testAddGetNode(t *testing.T, factory Factory) {
+	t.Helper()
+	s := factory(t)
+	s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction))
+
+	got := s.GetNode("a.go::Foo")
+	if got == nil {
+		t.Fatalf("GetNode returned nil for inserted node")
+	}
+	intact := got.Name == "Foo" && got.FilePath == "a.go" && got.Kind == graph.KindFunction
+	if !intact {
+		t.Fatalf("round-trip mismatch: %+v", got)
+	}
+	if missing := s.GetNode("missing"); missing != nil {
+		t.Fatalf("GetNode should return nil for missing key")
+	}
+}
+
+// testAddGetEdge checks that one added edge a -> b is visible both as an
+// out-edge of a and as an in-edge of b.
+func testAddGetEdge(t *testing.T, factory Factory) {
+	t.Helper()
+	s := factory(t)
+	s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction))
+	s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction))
+	s.AddEdge(mkEdge("a", "b", graph.EdgeCalls))
+
+	if out := s.GetOutEdges("a"); len(out) != 1 || out[0].To != "b" {
+		t.Fatalf("GetOutEdges(a) = %+v, want one edge to b", out)
+	}
+	if in := s.GetInEdges("b"); len(in) != 1 || in[0].From != "a" {
+		t.Fatalf("GetInEdges(b) = %+v, want one edge from a", in)
+	}
+}
+
+// testAddNodeIdempotent checks that adding the same node three times
+// leaves exactly one node in the store.
+func testAddNodeIdempotent(t *testing.T, factory Factory) {
+	t.Helper()
+	s := factory(t)
+	dup := mkNode("dup", "Dup", "x.go", graph.KindFunction)
+	for range 3 {
+		s.AddNode(dup)
+	}
+	if s.NodeCount() != 1 {
+		t.Fatalf("NodeCount after 3x add = %d, want 1", s.NodeCount())
+	}
+}
+
+func testAddEdgeIdempotent(t *testing.T, factory Factory) {
+	t.Helper()
+	s := factory(t)
+	s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction))
+	s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction))
+	e := mkEdge("a", "b", graph.EdgeCalls)
+	s.AddEdge(e)
+	s.AddEdge(e)
+	s.AddEdge(e)
+	if got :=
len(s.GetOutEdges("a")); got != 1 { + t.Fatalf("OutEdges after 3x add = %d, want 1", got) + } +} + +func testAddEdgeLineDisambiguates(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + e1 := mkEdge("a", "b", graph.EdgeCalls) + e1.Line = 1 + e2 := mkEdge("a", "b", graph.EdgeCalls) + e2.Line = 5 + s.AddEdge(e1) + s.AddEdge(e2) + if got := len(s.GetOutEdges("a")); got != 2 { + t.Fatalf("OutEdges with different lines = %d, want 2", got) + } +} + +func testAddBatch(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + nodes := []*graph.Node{ + mkNode("a", "A", "x.go", graph.KindFunction), + mkNode("b", "B", "x.go", graph.KindFunction), + mkNode("c", "C", "y.go", graph.KindType), + } + edges := []*graph.Edge{ + mkEdge("a", "b", graph.EdgeCalls), + mkEdge("b", "c", graph.EdgeReferences), + } + s.AddBatch(nodes, edges) + if s.NodeCount() != 3 { + t.Fatalf("NodeCount after AddBatch = %d, want 3", s.NodeCount()) + } + if s.EdgeCount() != 2 { + t.Fatalf("EdgeCount after AddBatch = %d, want 2", s.EdgeCount()) + } +} + +func testRemoveEdge(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + e := mkEdge("a", "b", graph.EdgeCalls) + s.AddEdge(e) + if !s.RemoveEdge("a", "b", graph.EdgeCalls) { + t.Fatalf("RemoveEdge returned false for existing edge") + } + if len(s.GetOutEdges("a")) != 0 { + t.Fatalf("OutEdges after RemoveEdge = nonzero") + } + if len(s.GetInEdges("b")) != 0 { + t.Fatalf("InEdges after RemoveEdge = nonzero") + } + // Removing non-existent should report false but not panic. 
+ if s.RemoveEdge("a", "b", graph.EdgeCalls) { + t.Fatalf("RemoveEdge returned true for missing edge") + } +} + +func testEvictFile(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)) + s.AddNode(mkNode("a.go::Bar", "Bar", "a.go", graph.KindFunction)) + s.AddNode(mkNode("b.go::Baz", "Baz", "b.go", graph.KindFunction)) + s.AddEdge(mkEdge("a.go::Foo", "a.go::Bar", graph.EdgeCalls)) + s.AddEdge(mkEdge("a.go::Bar", "b.go::Baz", graph.EdgeCalls)) + + nodesRemoved, edgesRemoved := s.EvictFile("a.go") + if nodesRemoved != 2 { + t.Fatalf("EvictFile nodesRemoved = %d, want 2", nodesRemoved) + } + if edgesRemoved == 0 { + t.Fatalf("EvictFile edgesRemoved should be > 0") + } + if s.GetNode("a.go::Foo") != nil { + t.Fatalf("evicted node still present") + } + if s.GetNode("b.go::Baz") == nil { + t.Fatalf("unrelated node was evicted") + } +} + +func testEvictFileNoNodes(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + n, e := s.EvictFile("nonexistent.go") + if n != 0 || e != 0 { + t.Fatalf("EvictFile on empty file returned (%d, %d), want (0, 0)", n, e) + } +} + +func testEvictRepo(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkRepoNode("r1/a.go::Foo", "Foo", "r1/a.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r1/b.go::Bar", "Bar", "r1/b.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r2/x.go::Baz", "Baz", "r2/x.go", "r2", graph.KindFunction)) + s.AddEdge(mkEdge("r1/a.go::Foo", "r1/b.go::Bar", graph.EdgeCalls)) + + nodesRemoved, edgesRemoved := s.EvictRepo("r1") + if nodesRemoved != 2 { + t.Fatalf("EvictRepo nodesRemoved = %d, want 2", nodesRemoved) + } + if edgesRemoved == 0 { + t.Fatalf("EvictRepo edgesRemoved should be > 0") + } + if s.GetNode("r1/a.go::Foo") != nil { + t.Fatalf("r1 node still present") + } + if s.GetNode("r2/x.go::Baz") == nil { + t.Fatalf("r2 node was evicted") + } +} + +func testEvictRepoNoNodes(t *testing.T, 
factory Factory) { + t.Helper() + s := factory(t) + n, e := s.EvictRepo("nonexistent-repo") + if n != 0 || e != 0 { + t.Fatalf("EvictRepo on missing repo returned (%d, %d), want (0, 0)", n, e) + } +} + +func testNodeAndEdgeCount(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + if s.NodeCount() != 0 || s.EdgeCount() != 0 { + t.Fatalf("empty store reports nonzero counts") + } + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + s.AddEdge(mkEdge("a", "b", graph.EdgeCalls)) + if s.NodeCount() != 2 { + t.Fatalf("NodeCount = %d, want 2", s.NodeCount()) + } + if s.EdgeCount() != 1 { + t.Fatalf("EdgeCount = %d, want 1", s.EdgeCount()) + } +} + +func testAllNodesAndEdges(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "y.go", graph.KindType)) + s.AddEdge(mkEdge("a", "b", graph.EdgeReferences)) + + gotN := sortNodeIDs(s.AllNodes()) + wantN := []string{"a", "b"} + if fmt.Sprint(gotN) != fmt.Sprint(wantN) { + t.Fatalf("AllNodes = %v, want %v", gotN, wantN) + } + gotE := sortEdgeKeys(s.AllEdges()) + if len(gotE) != 1 { + t.Fatalf("AllEdges = %v, want one entry", gotE) + } +} + +func testFindNodesByName(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)) + s.AddNode(mkNode("b.go::Foo", "Foo", "b.go", graph.KindFunction)) + s.AddNode(mkNode("c.go::Bar", "Bar", "c.go", graph.KindFunction)) + got := sortNodeIDs(s.FindNodesByName("Foo")) + want := []string{"a.go::Foo", "b.go::Foo"} + if fmt.Sprint(got) != fmt.Sprint(want) { + t.Fatalf("FindNodesByName(Foo) = %v, want %v", got, want) + } + if len(s.FindNodesByName("MissingName")) != 0 { + t.Fatalf("FindNodesByName for missing name should be empty") + } +} + +func testFindNodesByNameInRepo(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + 
s.AddNode(mkRepoNode("r1/a.go::Foo", "Foo", "r1/a.go", "r1", graph.KindFunction))
+	s.AddNode(mkRepoNode("r2/a.go::Foo", "Foo", "r2/a.go", "r2", graph.KindFunction))
+	got := sortNodeIDs(s.FindNodesByNameInRepo("Foo", "r1"))
+	want := []string{"r1/a.go::Foo"}
+	if fmt.Sprint(got) != fmt.Sprint(want) {
+		t.Fatalf("FindNodesByNameInRepo(Foo, r1) = %v, want %v", got, want)
+	}
+}
+
+// testFindNodesByNameContaining checks substring search over node names:
+// matching ignores case, the limit caps the result, and an empty or
+// non-matching needle yields no nodes.
+func testFindNodesByNameContaining(t *testing.T, factory Factory) {
+	t.Helper()
+	s := factory(t)
+	// Three "log"-containing names + one unrelated.
+	seed := []*graph.Node{
+		mkNode("a.go::Login", "Login", "a.go", graph.KindFunction),
+		mkNode("b.go::LoginHandler", "LoginHandler", "b.go", graph.KindFunction),
+		mkNode("c.go::Logout", "Logout", "c.go", graph.KindFunction),
+		mkNode("d.go::Unrelated", "Unrelated", "d.go", graph.KindFunction),
+	}
+	for _, node := range seed {
+		s.AddNode(node)
+	}
+	want := []string{"a.go::Login", "b.go::LoginHandler", "c.go::Logout"}
+
+	// A lower-case needle must return exactly the three "log"-bearing
+	// nodes even though every stored name is capitalised.
+	gotLower := sortNodeIDs(s.FindNodesByNameContaining("log", 10))
+	if fmt.Sprint(gotLower) != fmt.Sprint(want) {
+		t.Fatalf("FindNodesByNameContaining(log, 10) = %v, want %v", gotLower, want)
+	}
+
+	// An upper-case needle must match the same set.
+	gotUpper := sortNodeIDs(s.FindNodesByNameContaining("LOG", 10))
+	if fmt.Sprint(gotUpper) != fmt.Sprint(want) {
+		t.Fatalf("FindNodesByNameContaining(LOG, 10) = %v, want %v", gotUpper, want)
+	}
+
+	// Limit is honoured: three nodes match, so a limit of 2 must return
+	// exactly 2.
+	if limited := s.FindNodesByNameContaining("log", 2); len(limited) != 2 {
+		t.Fatalf("FindNodesByNameContaining(log, 2) returned %d, want 2", len(limited))
+	}
+
+	// Empty needle returns nothing — never the whole graph.
+	if all := s.FindNodesByNameContaining("", 10); len(all) != 0 {
+		t.Fatalf("FindNodesByNameContaining(\"\") returned %d, want 0", len(all))
+	}
+
+	// No match — empty slice.
+	if none := s.FindNodesByNameContaining("nonexistent_substring_xyz", 10); len(none) != 0 {
+		t.Fatalf("FindNodesByNameContaining(no-match) returned %d, want 0", len(none))
+	}
+}
+
+// testGetFileNodes checks that GetFileNodes returns the nodes of the
+// requested file and none from another file.
+func testGetFileNodes(t *testing.T, factory Factory) {
+	t.Helper()
+	s := factory(t)
+	for _, node := range []*graph.Node{
+		mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction),
+		mkNode("a.go::Bar", "Bar", "a.go", graph.KindFunction),
+		mkNode("b.go::Baz", "Baz", "b.go", graph.KindFunction),
+	} {
+		s.AddNode(node)
+	}
+	want := []string{"a.go::Bar", "a.go::Foo"}
+	if got := sortNodeIDs(s.GetFileNodes("a.go")); fmt.Sprint(got) != fmt.Sprint(want) {
+		t.Fatalf("GetFileNodes(a.go) = %v, want %v", got, want)
+	}
+}
+
+// testGetRepoNodes checks that GetRepoNodes returns the nodes carrying
+// the requested repo prefix and none from another repo.
+func testGetRepoNodes(t *testing.T, factory Factory) {
+	t.Helper()
+	s := factory(t)
+	for _, node := range []*graph.Node{
+		mkRepoNode("r1/a.go::Foo", "Foo", "r1/a.go", "r1", graph.KindFunction),
+		mkRepoNode("r1/b.go::Bar", "Bar", "r1/b.go", "r1", graph.KindFunction),
+		mkRepoNode("r2/x.go::Baz", "Baz", "r2/x.go", "r2", graph.KindFunction),
+	} {
+		s.AddNode(node)
+	}
+	want := []string{"r1/a.go::Foo", "r1/b.go::Bar"}
+	if got := sortNodeIDs(s.GetRepoNodes("r1")); fmt.Sprint(got) != fmt.Sprint(want) {
+		t.Fatalf("GetRepoNodes(r1) = %v, want %v", got, want)
+	}
+}
+
+// testGetRepoEdges asserts that GetRepoEdges returns every edge whose
+// SOURCE node carries the requested RepoPrefix, regardless of where
+// the target lives — same-repo intra edges, cross-repo edges (source
+// in r1 → target in r2), AND unresolved::* targets all count. Edges
+// whose source is in a different repo (or unscoped) MUST NOT appear.
+// Empty prefix returns nil so callers don't accidentally fall through
+// to a full-graph scan.
+func testGetRepoEdges(t *testing.T, factory Factory) {
+	t.Helper()
+	s := factory(t)
+	// r1 has two nodes that originate outgoing edges; r2 has a target
+	// node and one of its own source nodes.
+ s.AddNode(mkRepoNode("r1/a.go::Foo", "Foo", "r1/a.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r1/b.go::Bar", "Bar", "r1/b.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r2/x.go::Baz", "Baz", "r2/x.go", "r2", graph.KindFunction)) + s.AddNode(mkRepoNode("r2/y.go::Qux", "Qux", "r2/y.go", "r2", graph.KindFunction)) + + // r1-intra (Foo → Bar) — same repo. + s.AddEdge(mkEdge("r1/a.go::Foo", "r1/b.go::Bar", graph.EdgeCalls)) + // r1 → r2 cross-repo (Foo → Baz). + s.AddEdge(mkEdge("r1/a.go::Foo", "r2/x.go::Baz", graph.EdgeCalls)) + // r1 → unresolved (Bar → unresolved::Missing) — counts because + // source is in r1. + s.AddEdge(mkEdge("r1/b.go::Bar", "unresolved::Missing", graph.EdgeCalls)) + // r2-intra (Qux → Baz) — MUST NOT appear in r1's slice. + s.AddEdge(mkEdge("r2/y.go::Qux", "r2/x.go::Baz", graph.EdgeCalls)) + // r2 → r1 cross-repo (Qux → Foo) — MUST NOT appear in r1's slice + // because the source is in r2. + s.AddEdge(mkEdge("r2/y.go::Qux", "r1/a.go::Foo", graph.EdgeCalls)) + + gotR1 := sortEdgeKeys(s.GetRepoEdges("r1")) + wantR1 := sortEdgeKeys([]*graph.Edge{ + mkEdge("r1/a.go::Foo", "r1/b.go::Bar", graph.EdgeCalls), + mkEdge("r1/a.go::Foo", "r2/x.go::Baz", graph.EdgeCalls), + mkEdge("r1/b.go::Bar", "unresolved::Missing", graph.EdgeCalls), + }) + if fmt.Sprint(gotR1) != fmt.Sprint(wantR1) { + t.Fatalf("GetRepoEdges(r1) =\n %v\nwant\n %v", gotR1, wantR1) + } + + gotR2 := sortEdgeKeys(s.GetRepoEdges("r2")) + wantR2 := sortEdgeKeys([]*graph.Edge{ + mkEdge("r2/y.go::Qux", "r2/x.go::Baz", graph.EdgeCalls), + mkEdge("r2/y.go::Qux", "r1/a.go::Foo", graph.EdgeCalls), + }) + if fmt.Sprint(gotR2) != fmt.Sprint(wantR2) { + t.Fatalf("GetRepoEdges(r2) =\n %v\nwant\n %v", gotR2, wantR2) + } + + // Empty prefix MUST return nothing (use AllEdges for the global + // view). Disk backends must not fall through to a full scan. 
+ if got := s.GetRepoEdges(""); len(got) != 0 { + t.Fatalf("GetRepoEdges(\"\") = %d edges, want 0", len(got)) + } + + // Unknown prefix MUST return empty (no panic, no fallthrough). + if got := s.GetRepoEdges("nope"); len(got) != 0 { + t.Fatalf("GetRepoEdges(nope) = %d edges, want 0", len(got)) + } +} + +func testGetNodeByQualName(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + n := mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction) + n.QualName = "pkg.Foo" + s.AddNode(n) + got := s.GetNodeByQualName("pkg.Foo") + if got == nil || got.ID != "a.go::Foo" { + t.Fatalf("GetNodeByQualName(pkg.Foo) = %v, want a.go::Foo", got) + } + if s.GetNodeByQualName("missing.Qual") != nil { + t.Fatalf("GetNodeByQualName missing should be nil") + } +} + +func testStats(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "y.go", graph.KindType)) + s.AddEdge(mkEdge("a", "b", graph.EdgeReferences)) + st := s.Stats() + if st.TotalNodes != 2 || st.TotalEdges != 1 { + t.Fatalf("Stats = %+v, want TotalNodes=2, TotalEdges=1", st) + } + if st.ByKind[string(graph.KindFunction)] != 1 || st.ByKind[string(graph.KindType)] != 1 { + t.Fatalf("Stats.ByKind = %v, want one each", st.ByKind) + } +} + +func testRepoStats(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkRepoNode("r1/a.go::Foo", "Foo", "r1/a.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r2/x.go::Baz", "Baz", "r2/x.go", "r2", graph.KindType)) + st := s.RepoStats() + if len(st) != 2 { + t.Fatalf("RepoStats has %d entries, want 2", len(st)) + } + if st["r1"].TotalNodes != 1 { + t.Fatalf("RepoStats[r1].TotalNodes = %d, want 1", st["r1"].TotalNodes) + } +} + +func testRepoPrefixes(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkRepoNode("r1/a.go::Foo", "Foo", "r1/a.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r2/x.go::Baz", "Baz", "r2/x.go", "r2", 
graph.KindType)) + got := s.RepoPrefixes() + sort.Strings(got) + want := []string{"r1", "r2"} + if fmt.Sprint(got) != fmt.Sprint(want) { + t.Fatalf("RepoPrefixes = %v, want %v", got, want) + } +} + +func testSetEdgeProvenance(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + e := mkEdge("a", "b", graph.EdgeCalls) + e.Origin = graph.OriginTextMatched + s.AddEdge(e) + + bumped := s.SetEdgeProvenance(e, graph.OriginLSPResolved) + if !bumped { + t.Fatalf("SetEdgeProvenance returned false for real upgrade") + } + out := s.GetOutEdges("a") + if len(out) != 1 || out[0].Origin != graph.OriginLSPResolved { + t.Fatalf("Origin did not propagate: %+v", out) + } +} + +func testReindexEdges(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + // Build a small graph with three out-edges from "a" pointing at + // three different targets, then re-bind all three to a fourth + // target in one batched call. + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + s.AddNode(mkNode("c", "C", "x.go", graph.KindFunction)) + s.AddNode(mkNode("d", "D", "x.go", graph.KindFunction)) + s.AddNode(mkNode("z", "Z", "x.go", graph.KindFunction)) + + e1 := mkEdge("a", "b", graph.EdgeCalls) + e1.Line = 1 + e2 := mkEdge("a", "c", graph.EdgeCalls) + e2.Line = 2 + e3 := mkEdge("a", "d", graph.EdgeCalls) + e3.Line = 3 + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + + // Mutate each edge's To, then hand the batch over. After the + // call, all three edges must show as in-edges of z; none of the + // originals must remain. 
+ e1.To, e2.To, e3.To = "z", "z", "z" + s.ReindexEdges([]graph.EdgeReindex{ + {Edge: e1, OldTo: "b"}, + {Edge: e2, OldTo: "c"}, + {Edge: e3, OldTo: "d"}, + }) + + for _, oldID := range []string{"b", "c", "d"} { + if got := len(s.GetInEdges(oldID)); got != 0 { + t.Fatalf("GetInEdges(%q) after batch reindex = %d, want 0", oldID, got) + } + } + if got := len(s.GetInEdges("z")); got != 3 { + t.Fatalf("GetInEdges(z) after batch reindex = %d, want 3", got) + } + if got := len(s.GetOutEdges("a")); got != 3 { + t.Fatalf("GetOutEdges(a) after batch reindex = %d, want 3", got) + } + + // Empty batch is a no-op. + s.ReindexEdges(nil) + s.ReindexEdges([]graph.EdgeReindex{}) +} + +func testSetEdgeProvenanceBatch(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + + e1 := mkEdge("a", "b", graph.EdgeCalls) + e1.Line = 1 + e1.Origin = graph.OriginTextMatched + e2 := mkEdge("a", "b", graph.EdgeCalls) + e2.Line = 2 + e2.Origin = graph.OriginTextMatched + e3 := mkEdge("a", "b", graph.EdgeCalls) + e3.Line = 3 + e3.Origin = graph.OriginLSPResolved // already at target tier — should be no-op + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + + changed := s.SetEdgeProvenanceBatch([]graph.EdgeProvenanceUpdate{ + {Edge: e1, NewOrigin: graph.OriginLSPResolved}, + {Edge: e2, NewOrigin: graph.OriginLSPResolved}, + {Edge: e3, NewOrigin: graph.OriginLSPResolved}, + }) + if changed != 2 { + t.Fatalf("SetEdgeProvenanceBatch reported %d changed, want 2 (one was already at target tier)", changed) + } + // Verify both promotions landed in the persisted edges. + out := s.GetOutEdges("a") + if len(out) != 3 { + t.Fatalf("GetOutEdges(a) = %d, want 3", len(out)) + } + for _, e := range out { + if e.Origin != graph.OriginLSPResolved { + t.Fatalf("edge %s->%s Origin = %q, want lsp_resolved", e.From, e.To, e.Origin) + } + } + + // Empty batch is a no-op and returns 0. 
+	if got := s.SetEdgeProvenanceBatch(nil); got != 0 {
+		t.Fatalf("empty batch returned %d, want 0", got)
+	}
+	if got := s.SetEdgeProvenanceBatch([]graph.EdgeProvenanceUpdate{}); got != 0 {
+		t.Fatalf("empty batch returned %d, want 0", got)
+	}
+}
+
+// testReindexEdge re-targets a single edge from "old" to "new" and checks
+// that, after ReindexEdge, the edge is no longer an in-edge of "old" and
+// is the only in-edge of "new".
+func testReindexEdge(t *testing.T, factory Factory) {
+	t.Helper()
+	s := factory(t)
+	for _, node := range []*graph.Node{
+		mkNode("a", "A", "x.go", graph.KindFunction),
+		mkNode("old", "Old", "x.go", graph.KindFunction),
+		mkNode("new", "New", "x.go", graph.KindFunction),
+	} {
+		s.AddNode(node)
+	}
+	edge := mkEdge("a", "old", graph.EdgeCalls)
+	s.AddEdge(edge)
+
+	// Mutate the target first, then tell the store what the old one was.
+	edge.To = "new"
+	s.ReindexEdge(edge, "old")
+
+	if stale := len(s.GetInEdges("old")); stale != 0 {
+		t.Fatalf("InEdges(old) after reindex = %d, want 0", stale)
+	}
+	if in := s.GetInEdges("new"); len(in) != 1 || in[0].From != "a" {
+		t.Fatalf("InEdges(new) = %+v, want one edge from a", in)
+	}
+}
+
+// testConcurrency has 8 goroutines each add 50 distinct nodes to one
+// store and checks that all 400 are counted once the goroutines finish.
+func testConcurrency(t *testing.T, factory Factory) {
+	t.Helper()
+	s := factory(t)
+	const workers = 8
+	const perWorker = 50
+	var wg sync.WaitGroup
+	wg.Add(workers)
+	for w := range workers {
+		go func() {
+			defer wg.Done()
+			file := fmt.Sprintf("f%d.go", w)
+			for i := range perWorker {
+				id := fmt.Sprintf("w%d/n%d", w, i)
+				name := fmt.Sprintf("N%d", i)
+				s.AddNode(mkNode(id, name, file, graph.KindFunction))
+			}
+		}()
+	}
+	wg.Wait()
+	if got, want := s.NodeCount(), workers*perWorker; got != want {
+		t.Fatalf("concurrent NodeCount = %d, want %d", got, want)
+	}
+}
+
+// testEdgeIdentityRevisions is a smoke test: on a fresh store the counter
+// must not be negative.
+func testEdgeIdentityRevisions(t *testing.T, factory Factory) {
+	t.Helper()
+	s := factory(t)
+	// Just ensure the method exists and returns a non-negative int.
+	// The semantic invariant ("bumps on origin change") is
+	// implementation-defined; backends may return 0 if they don't
+	// track this.
+	revisions := s.EdgeIdentityRevisions()
+	if revisions < 0 {
+		t.Fatalf("EdgeIdentityRevisions negative: %d", revisions)
+	}
+}
+
+// testVerifyEdgeIdentities checks that a store holding two nodes and one
+// edge between them passes VerifyEdgeIdentities.
+func testVerifyEdgeIdentities(t *testing.T, factory Factory) {
+	t.Helper()
+	s := factory(t)
+	s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction))
+	s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction))
+	s.AddEdge(mkEdge("a", "b", graph.EdgeCalls))
+
+	err := s.VerifyEdgeIdentities()
+	if err != nil {
+		t.Fatalf("VerifyEdgeIdentities on consistent store: %v", err)
+	}
+}
+
+// testRepoMemoryEstimate checks that the estimate for a repo with one
+// node reports NodeCount 1.
+func testRepoMemoryEstimate(t *testing.T, factory Factory) {
+	t.Helper()
+	s := factory(t)
+	s.AddNode(mkRepoNode("r1/a.go::Foo", "Foo", "r1/a.go", "r1", graph.KindFunction))
+	// Backends may return zero (disk/remote) or a real estimate
+	// (in-memory). The contract is that the call succeeds and
+	// NodeCount matches what we inserted.
+	if est := s.RepoMemoryEstimate("r1"); est.NodeCount != 1 {
+		t.Fatalf("RepoMemoryEstimate NodeCount = %d, want 1", est.NodeCount)
+	}
+}
+
+// testAllRepoMemoryEstimates checks that two repos with one node each
+// produce two estimate entries.
+func testAllRepoMemoryEstimates(t *testing.T, factory Factory) {
+	t.Helper()
+	s := factory(t)
+	s.AddNode(mkRepoNode("r1/a.go::Foo", "Foo", "r1/a.go", "r1", graph.KindFunction))
+	s.AddNode(mkRepoNode("r2/x.go::Baz", "Baz", "r2/x.go", "r2", graph.KindFunction))
+	if all := s.AllRepoMemoryEstimates(); len(all) != 2 {
+		t.Fatalf("AllRepoMemoryEstimates len = %d, want 2", len(all))
+	}
+}
+
+func testMetaPreserved(t *testing.T, factory Factory) {
+	t.Helper()
+	s := factory(t)
+	n := mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)
+	n.Meta = map[string]any{
+		"signature":  "func Foo(x int) error",
+		"visibility": "public",
+	}
+	s.AddNode(n)
+	got := s.GetNode("a.go::Foo")
+	if got == nil {
+		t.Fatalf("GetNode returned nil")
+	}
+	if got.Meta == nil {
+		t.Fatalf("Meta not preserved")
+	}
+	if got.Meta["signature"] != "func Foo(x int) error" {
+		t.Fatalf("Meta[signature] = %v", got.Meta["signature"])
+	}
+	if got.Meta["visibility"] != "public" {
+		t.Fatalf("Meta[visibility] = %v",
got.Meta["visibility"]) + } +} + +func testEdgesByKind(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + s.AddNode(mkNode("c", "C", "y.go", graph.KindType)) + + e1 := mkEdge("a", "b", graph.EdgeCalls) + e1.Line = 1 + e2 := mkEdge("a", "b", graph.EdgeCalls) + e2.Line = 2 + e3 := mkEdge("a", "c", graph.EdgeReferences) + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + + var calls []*graph.Edge + for e := range s.EdgesByKind(graph.EdgeCalls) { + calls = append(calls, e) + } + if len(calls) != 2 { + t.Fatalf("EdgesByKind(EdgeCalls) yielded %d, want 2", len(calls)) + } + for _, e := range calls { + if e.Kind != graph.EdgeCalls { + t.Fatalf("yielded edge has wrong kind: %s", e.Kind) + } + } + + var refs []*graph.Edge + for e := range s.EdgesByKind(graph.EdgeReferences) { + refs = append(refs, e) + } + if len(refs) != 1 { + t.Fatalf("EdgesByKind(EdgeReferences) yielded %d, want 1", len(refs)) + } + + // Unknown kind yields nothing. + count := 0 + for range s.EdgesByKind(graph.EdgeKind("nonexistent")) { + count++ + } + if count != 0 { + t.Fatalf("EdgesByKind(nonexistent) yielded %d, want 0", count) + } + + // Early stop honours the contract. 
+ stopped := 0 + for range s.EdgesByKind(graph.EdgeCalls) { + stopped++ + if stopped == 1 { + break + } + } + if stopped != 1 { + t.Fatalf("early stop yielded %d before break, want 1", stopped) + } +} + +func testNodesByKind(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + s.AddNode(mkNode("c", "C", "y.go", graph.KindType)) + + var fns []*graph.Node + for n := range s.NodesByKind(graph.KindFunction) { + fns = append(fns, n) + } + if len(fns) != 2 { + t.Fatalf("NodesByKind(KindFunction) yielded %d, want 2", len(fns)) + } + for _, n := range fns { + if n.Kind != graph.KindFunction { + t.Fatalf("yielded node has wrong kind: %s", n.Kind) + } + } + + var types []*graph.Node + for n := range s.NodesByKind(graph.KindType) { + types = append(types, n) + } + if len(types) != 1 { + t.Fatalf("NodesByKind(KindType) yielded %d, want 1", len(types)) + } + + // Early stop honours the contract. + stopped := 0 + for range s.NodesByKind(graph.KindFunction) { + stopped++ + if stopped == 1 { + break + } + } + if stopped != 1 { + t.Fatalf("early stop yielded %d before break, want 1", stopped) + } +} + +func testEdgesWithUnresolvedTarget(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + + e1 := mkEdge("a", "b", graph.EdgeCalls) + e1.Line = 1 + e2 := mkEdge("a", "unresolved::Foo", graph.EdgeCalls) + e2.Line = 2 + e3 := mkEdge("a", "unresolved::Bar", graph.EdgeCalls) + e3.Line = 3 + e4 := mkEdge("a", "resolved", graph.EdgeCalls) + e4.Line = 4 + // Multi-repo COPY rewrite form: copyBulkLocked rewrites a bare + // `unresolved::` stub to `::unresolved::` + // so per-repo stubs can't collide on the COPY primary key. 
The + // pending-edge scan MUST yield this form too, or the Go resolver + // never gets a second pass at multi-repo stubs (the whole-repo + // "every function looks dead" bug). graph.IsUnresolvedTarget is + // the canonical matcher for both encodings. + e5 := mkEdge("a", "gortex::unresolved::Baz", graph.EdgeCalls) + e5.Line = 5 + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + s.AddEdge(e4) + s.AddEdge(e5) + + var unres []*graph.Edge + for e := range s.EdgesWithUnresolvedTarget() { + unres = append(unres, e) + } + if len(unres) != 3 { + t.Fatalf("EdgesWithUnresolvedTarget yielded %d, want 3 (unresolved::Foo, unresolved::Bar, gortex::unresolved::Baz)", len(unres)) + } + gotPrefixed := false + for _, e := range unres { + if !graph.IsUnresolvedTarget(e.To) { + t.Fatalf("yielded edge has non-unresolved To: %s", e.To) + } + if e.To == "gortex::unresolved::Baz" { + gotPrefixed = true + } + } + if !gotPrefixed { + t.Fatalf("EdgesWithUnresolvedTarget did not yield the multi-repo prefixed stub gortex::unresolved::Baz") + } +} + +func testEmptyStore(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + if s.NodeCount() != 0 { + t.Fatalf("empty NodeCount = %d, want 0", s.NodeCount()) + } + if s.EdgeCount() != 0 { + t.Fatalf("empty EdgeCount = %d, want 0", s.EdgeCount()) + } + if len(s.AllNodes()) != 0 { + t.Fatalf("empty AllNodes nonzero") + } + if len(s.AllEdges()) != 0 { + t.Fatalf("empty AllEdges nonzero") + } + if len(s.RepoPrefixes()) != 0 { + t.Fatalf("empty RepoPrefixes nonzero") + } +} + +func testGetNodesByIDs(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)) + s.AddNode(mkNode("a.go::Bar", "Bar", "a.go", graph.KindFunction)) + s.AddNode(mkNode("b.go::Baz", "Baz", "b.go", graph.KindType)) + + got := s.GetNodesByIDs([]string{"a.go::Foo", "b.go::Baz", "missing", "a.go::Bar", "a.go::Foo"}) + if len(got) != 3 { + t.Fatalf("GetNodesByIDs len = %d, want 3 (3 present, 1 missing, 1 
duplicate)", len(got)) + } + if got["a.go::Foo"] == nil || got["a.go::Foo"].Name != "Foo" { + t.Fatalf("missing or wrong Foo: %v", got["a.go::Foo"]) + } + if got["b.go::Baz"] == nil || got["b.go::Baz"].Kind != graph.KindType { + t.Fatalf("missing or wrong Baz: %v", got["b.go::Baz"]) + } + if _, present := got["missing"]; present { + t.Fatalf("missing ID should not be in map, got %v", got["missing"]) + } + + // Empty / nil input is a no-op. + if got := s.GetNodesByIDs(nil); len(got) != 0 { + t.Fatalf("nil input returned %d entries", len(got)) + } + if got := s.GetNodesByIDs([]string{}); len(got) != 0 { + t.Fatalf("empty input returned %d entries", len(got)) + } + if got := s.GetNodesByIDs([]string{""}); len(got) != 0 { + t.Fatalf("empty-string ID returned %d entries", len(got)) + } +} + +func testFindNodesByNames(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)) + s.AddNode(mkNode("b.go::Foo", "Foo", "b.go", graph.KindFunction)) + s.AddNode(mkNode("c.go::Bar", "Bar", "c.go", graph.KindFunction)) + + got := s.FindNodesByNames([]string{"Foo", "Missing", "Bar", "Foo"}) + if len(got) != 2 { + t.Fatalf("FindNodesByNames len = %d, want 2 (2 present, 1 missing, 1 duplicate)", len(got)) + } + foos := got["Foo"] + if len(foos) != 2 { + t.Fatalf("Foo matches = %d, want 2", len(foos)) + } + for _, n := range foos { + if n.Name != "Foo" { + t.Fatalf("matched node has wrong Name: %s", n.Name) + } + } + bars := got["Bar"] + if len(bars) != 1 || bars[0].Name != "Bar" { + t.Fatalf("Bar matches wrong: %v", bars) + } + if _, present := got["Missing"]; present { + t.Fatalf("missing name should not be in map") + } + + // Empty / nil input. 
+ if got := s.FindNodesByNames(nil); len(got) != 0 { + t.Fatalf("nil input returned %d entries", len(got)) + } + if got := s.FindNodesByNames([]string{}); len(got) != 0 { + t.Fatalf("empty input returned %d entries", len(got)) + } +} + +// testGetEdgesByNodeIDs covers the batched fan-in / fan-out edge +// lookups. Builds a small graph with mixed fan-in/out, calls both +// methods with a mix of present and missing ids (plus an empty +// string), and asserts the per-id slices match what GetInEdges / +// GetOutEdges would return individually. +func testGetEdgesByNodeIDs(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + // Nodes + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + s.AddNode(mkNode("c", "C", "y.go", graph.KindFunction)) + s.AddNode(mkNode("d", "D", "y.go", graph.KindFunction)) + // Edges: a→b, a→c, b→c, d→c (so c has 3 in-edges, a has 2 out-edges). + s.AddEdge(mkEdge("a", "b", graph.EdgeCalls)) + s.AddEdge(mkEdge("a", "c", graph.EdgeCalls)) + s.AddEdge(mkEdge("b", "c", graph.EdgeCalls)) + s.AddEdge(mkEdge("d", "c", graph.EdgeReferences)) + + // --- GetOutEdgesByNodeIDs --- + outMap := s.GetOutEdgesByNodeIDs([]string{"a", "b", "d", "missing", "a"}) + // a has 2 out-edges (a→b, a→c). + if got := sortEdgeKeys(outMap["a"]); len(got) != 2 { + t.Fatalf("GetOutEdgesByNodeIDs[a] = %v, want 2 edges", got) + } + // b has 1 out-edge (b→c). + if got := outMap["b"]; len(got) != 1 || got[0].To != "c" { + t.Fatalf("GetOutEdgesByNodeIDs[b] = %v, want one edge to c", got) + } + // d has 1 out-edge (d→c). + if got := outMap["d"]; len(got) != 1 || got[0].To != "c" { + t.Fatalf("GetOutEdgesByNodeIDs[d] = %v, want one edge to c", got) + } + // missing key — range over nil is a no-op, so callers can index + // without an ok-check. 
+ if got := outMap["missing"]; len(got) != 0 { + t.Fatalf("GetOutEdgesByNodeIDs[missing] = %v, want empty", got) + } + + // --- GetInEdgesByNodeIDs --- + inMap := s.GetInEdgesByNodeIDs([]string{"a", "b", "c", "missing"}) + // a has 0 in-edges. + if got := inMap["a"]; len(got) != 0 { + t.Fatalf("GetInEdgesByNodeIDs[a] = %v, want empty", got) + } + // b has 1 in-edge (a→b). + if got := inMap["b"]; len(got) != 1 || got[0].From != "a" { + t.Fatalf("GetInEdgesByNodeIDs[b] = %v, want one edge from a", got) + } + // c has 3 in-edges (a→c, b→c, d→c). + if got := inMap["c"]; len(got) != 3 { + t.Fatalf("GetInEdgesByNodeIDs[c] = %v, want 3 edges", got) + } + froms := map[string]bool{} + for _, e := range inMap["c"] { + froms[e.From] = true + } + for _, want := range []string{"a", "b", "d"} { + if !froms[want] { + t.Fatalf("GetInEdgesByNodeIDs[c] missing edge from %q", want) + } + } + if got := inMap["missing"]; len(got) != 0 { + t.Fatalf("GetInEdgesByNodeIDs[missing] = %v, want empty", got) + } + + // Empty / nil / empty-string inputs are no-ops. + if got := s.GetOutEdgesByNodeIDs(nil); len(got) != 0 { + t.Fatalf("GetOutEdgesByNodeIDs(nil) returned %d entries", len(got)) + } + if got := s.GetInEdgesByNodeIDs(nil); len(got) != 0 { + t.Fatalf("GetInEdgesByNodeIDs(nil) returned %d entries", len(got)) + } + if got := s.GetOutEdgesByNodeIDs([]string{}); len(got) != 0 { + t.Fatalf("GetOutEdgesByNodeIDs([]) returned %d entries", len(got)) + } + if got := s.GetInEdgesByNodeIDs([]string{""}); len(got) != 0 { + t.Fatalf("GetInEdgesByNodeIDs([\"\"]) returned %d entries", len(got)) + } +} + +// testSymbolBundleSearcher exercises the optional +// graph.SymbolBundleSearcher capability. The interface is opt-in +// (today only the disk backend implements it; the in-memory +// *Graph deliberately leaves it unimplemented so the engine's +// fallback path stays exercised) — backends without the capability +// skip the subtest cleanly. 
+// +// Coverage: +// - SymbolSearcher.BulkUpsertSymbolFTS + BuildSymbolIndex must be +// called first so the FTS index is populated. +// - SearchSymbolBundles returns a bundle per matched id with the +// correct in/out edges attached. +// - Empty / no-match query returns an empty bundle slice. +func testSymbolBundleSearcher(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + bs, ok := s.(graph.SymbolBundleSearcher) + if !ok { + t.Skip("backend does not implement graph.SymbolBundleSearcher") + } + ss, ok := s.(graph.SymbolSearcher) + if !ok { + t.Skip("backend implements SymbolBundleSearcher but not SymbolSearcher — cannot populate FTS") + } + + // Build a small graph: A → B → C, plus an unrelated isolated D. + // FTS-searchable name tokens that should land on the same hit. + s.AddNode(mkNode("a", "AlphaWidget", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "BetaWidget", "x.go", graph.KindFunction)) + s.AddNode(mkNode("c", "GammaWidget", "y.go", graph.KindFunction)) + s.AddNode(mkNode("d", "Delta", "y.go", graph.KindFunction)) + s.AddEdge(mkEdge("a", "b", graph.EdgeCalls)) + s.AddEdge(mkEdge("b", "c", graph.EdgeCalls)) + s.AddEdge(mkEdge("a", "c", graph.EdgeCalls)) + + // Populate the FTS sidecar — every searchable node carries its + // tokenised name as the FTS text. + items := []graph.SymbolFTSItem{ + {NodeID: "a", Tokens: "alpha widget"}, + {NodeID: "b", Tokens: "beta widget"}, + {NodeID: "c", Tokens: "gamma widget"}, + {NodeID: "d", Tokens: "delta"}, + } + if err := ss.BulkUpsertSymbolFTS("", items); err != nil { + t.Fatalf("BulkUpsertSymbolFTS: %v", err) + } + if err := ss.BuildSymbolIndex(); err != nil { + t.Fatalf("BuildSymbolIndex: %v", err) + } + + // Querying for "widget" should match a/b/c and not d. Each bundle + // must carry the correct in/out edges off the graph. 
+ bundles, err := bs.SearchSymbolBundles("widget", 10) + if err != nil { + t.Fatalf("SearchSymbolBundles: %v", err) + } + if len(bundles) == 0 { + t.Fatalf("SearchSymbolBundles returned no bundles — expected matches for a/b/c") + } + gotIDs := make(map[string]graph.SymbolBundle, len(bundles)) + for _, b := range bundles { + if b.Node == nil { + t.Fatalf("bundle has nil node: %+v", b) + } + gotIDs[b.Node.ID] = b + } + for _, want := range []string{"a", "b", "c"} { + if _, ok := gotIDs[want]; !ok { + t.Fatalf("missing bundle for id %q; got ids=%v", want, idsOf(bundles)) + } + } + if _, ok := gotIDs["d"]; ok { + t.Fatalf("unexpected bundle for id %q (no 'widget' token in its FTS row)", "d") + } + + // Edge verification: per-bundle in/out edges must match the + // in-memory truth surfaced via the existing GetIn/Out edges. + for id, b := range gotIDs { + wantOut := s.GetOutEdges(id) + if !edgeSlicesMatch(wantOut, b.OutEdges) { + t.Fatalf("bundle[%s].OutEdges mismatch: want=%v got=%v", id, edgeKeys(wantOut), edgeKeys(b.OutEdges)) + } + wantIn := s.GetInEdges(id) + if !edgeSlicesMatch(wantIn, b.InEdges) { + t.Fatalf("bundle[%s].InEdges mismatch: want=%v got=%v", id, edgeKeys(wantIn), edgeKeys(b.InEdges)) + } + } + + // Empty query is a clean no-op. + if empty, err := bs.SearchSymbolBundles("", 10); err != nil || len(empty) != 0 { + t.Fatalf("SearchSymbolBundles(\"\"): err=%v len=%d, want empty", err, len(empty)) + } + // No-match query — backend MAY return nil or empty slice; both + // are valid. + if no, err := bs.SearchSymbolBundles("nomatchforanything", 10); err != nil { + t.Fatalf("SearchSymbolBundles(nomatch): err=%v", err) + } else if len(no) != 0 { + t.Fatalf("SearchSymbolBundles(nomatch) returned %d bundles, want 0", len(no)) + } +} + +// idsOf is a small helper for the bundle assertions above. 
+func idsOf(bs []graph.SymbolBundle) []string { + out := make([]string, 0, len(bs)) + for _, b := range bs { + if b.Node != nil { + out = append(out, b.Node.ID) + } + } + sort.Strings(out) + return out +} + +// edgeSlicesMatch reports whether two edge slices contain the same +// (from, to, kind) tuples regardless of order. Used by the bundle +// assertions to ignore back-end-imposed ordering differences. +func edgeSlicesMatch(want, got []*graph.Edge) bool { + if len(want) != len(got) { + return false + } + wantKeys := edgeKeys(want) + gotKeys := edgeKeys(got) + sort.Strings(wantKeys) + sort.Strings(gotKeys) + for i := range wantKeys { + if wantKeys[i] != gotKeys[i] { + return false + } + } + return true +} + +// edgeKeys flattens a slice of edges into deterministic (from→to:kind) +// strings for ordered diffing. +func edgeKeys(es []*graph.Edge) []string { + out := make([]string, 0, len(es)) + for _, e := range es { + if e == nil { + continue + } + out = append(out, fmt.Sprintf("%s->%s:%s", e.From, e.To, e.Kind)) + } + return out +} + +// testDeadCodeCandidator exercises the optional +// graph.DeadCodeCandidator capability. Builds a small graph with +// nodes that fall into each filter case the analyzer cares about: +// +// - zero in-edges (dead). +// - in-edges of disallowed kind only (dead). +// - in-edges of allowed kind (alive). +// - mixed kinds across the candidate set (per-row allowlist must apply). +// +// The in-memory *graph.Graph implements this; the disk backend overrides +// with a server-side query. Both must return the same candidate set. +func testDeadCodeCandidator(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + dc, ok := s.(graph.DeadCodeCandidator) + if !ok { + t.Skip("backend does not implement graph.DeadCodeCandidator") + } + + // Functions: AliveFunc (called), DeadFunc (no in-edges), + // ReadOnlyFunc (only EdgeReads — disallowed for KindFunction). 
+ s.AddNode(mkNode("AliveFunc", "AliveFunc", "a.go", graph.KindFunction)) + s.AddNode(mkNode("DeadFunc", "DeadFunc", "a.go", graph.KindFunction)) + s.AddNode(mkNode("ReadOnlyFunc", "ReadOnlyFunc", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Caller", "Caller", "a.go", graph.KindFunction)) + // Types: AliveType (referenced), DeadType (no in-edges). + s.AddNode(mkNode("AliveType", "AliveType", "b.go", graph.KindType)) + s.AddNode(mkNode("DeadType", "DeadType", "b.go", graph.KindType)) + // Methods: AliveMethod (called), DeadMethod (no in-edges). + s.AddNode(mkNode("AliveMethod", "AliveMethod", "c.go", graph.KindMethod)) + s.AddNode(mkNode("DeadMethod", "DeadMethod", "c.go", graph.KindMethod)) + + // Edges that exercise the per-kind allowlist. + e1 := mkEdge("Caller", "AliveFunc", graph.EdgeCalls) + e1.Line = 1 + e2 := mkEdge("Caller", "ReadOnlyFunc", graph.EdgeReads) + e2.Line = 2 + e3 := mkEdge("Caller", "AliveMethod", graph.EdgeCalls) + e3.Line = 3 + e4 := mkEdge("Caller", "AliveType", graph.EdgeReferences) + e4.Line = 4 + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + s.AddEdge(e4) + + // Per-kind allowlist mirrors analysis.incomingUsageKinds for the + // three kinds under test. Functions are alive on Calls/References; + // methods on Calls/Implements; types on References/Instantiates. + allowedKinds := []graph.NodeKind{ + graph.KindFunction, + graph.KindMethod, + graph.KindType, + } + allowedInEdges := map[graph.NodeKind][]graph.EdgeKind{ + graph.KindFunction: {graph.EdgeCalls, graph.EdgeReferences}, + graph.KindMethod: {graph.EdgeCalls, graph.EdgeImplements}, + graph.KindType: {graph.EdgeReferences, graph.EdgeInstantiates}, + } + + got := dc.DeadCodeCandidates(allowedKinds, allowedInEdges) + gotIDs := sortNodeIDs(got) + // Caller has zero in-edges of any kind, so it surfaces too — the + // analyzer's per-kind allowlist would also flag it as a candidate + // here. 
The backend's job is just the candidate set; post-filters + // (exported / test / entry-point) run in Go. + want := []string{"Caller", "DeadFunc", "DeadMethod", "DeadType", "ReadOnlyFunc"} + if fmt.Sprint(gotIDs) != fmt.Sprint(want) { + t.Fatalf("DeadCodeCandidates = %v\nwant %v", gotIDs, want) + } + + // Empty kind list returns nothing — never the whole graph. + if got := dc.DeadCodeCandidates(nil, allowedInEdges); len(got) != 0 { + t.Fatalf("DeadCodeCandidates(nil) = %d, want 0", len(got)) + } + + // Empty per-kind allowlist means "any incoming edge counts as + // usage" — AliveFunc and ReadOnlyFunc (both have *some* in-edge) + // drop out; only DeadFunc + Caller remain among functions. + anyKind := map[graph.NodeKind][]graph.EdgeKind{ + graph.KindFunction: nil, + } + gotAny := dc.DeadCodeCandidates([]graph.NodeKind{graph.KindFunction}, anyKind) + gotAnyIDs := sortNodeIDs(gotAny) + wantAny := []string{"Caller", "DeadFunc"} + if fmt.Sprint(gotAnyIDs) != fmt.Sprint(wantAny) { + t.Fatalf("DeadCodeCandidates(any-kind) = %v\nwant %v", gotAnyIDs, wantAny) + } +} + +// testIfaceImplementsScanner exercises the optional +// graph.IfaceImplementsScanner capability. Seeds two interfaces (one +// with methods Meta, one without) plus a type that implements each; +// the row set must include only the (type, iface) tuple whose target +// has a Meta["methods"] payload — the no-meta interface drops out. +func testIfaceImplementsScanner(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scanner, ok := s.(graph.IfaceImplementsScanner) + if !ok { + t.Skip("backend does not implement graph.IfaceImplementsScanner") + } + + // Interface with required methods. + ifaceA := mkNode("iface_A", "Reader", "a.go", graph.KindInterface) + ifaceA.Meta = map[string]any{"methods": []string{"Read", "Close"}} + s.AddNode(ifaceA) + // Interface with no Meta — must not appear in the row set. 
+ ifaceB := mkNode("iface_B", "Empty", "a.go", graph.KindInterface) + s.AddNode(ifaceB) + // Implementing type for each. + s.AddNode(mkNode("type_A", "ReaderImpl", "a.go", graph.KindType)) + s.AddNode(mkNode("type_B", "EmptyImpl", "a.go", graph.KindType)) + s.AddEdge(mkEdge("type_A", "iface_A", graph.EdgeImplements)) + s.AddEdge(mkEdge("type_B", "iface_B", graph.EdgeImplements)) + + rows := scanner.IfaceImplementsRows() + if len(rows) != 1 { + t.Fatalf("IfaceImplementsRows len = %d, want 1 (iface_B has no Meta)", len(rows)) + } + r := rows[0] + if r.TypeID != "type_A" || r.IfaceID != "iface_A" { + t.Fatalf("row = %+v, want type_A → iface_A", r) + } + if r.IfaceMeta == nil { + t.Fatalf("IfaceMeta is nil") + } + raw, ok := r.IfaceMeta["methods"] + if !ok { + t.Fatalf("IfaceMeta missing methods key: %+v", r.IfaceMeta) + } + // Meta encoding round-trips lists differently between backends + // (in-memory keeps []string; gob-encoded comes back as []any). + // Accept either. + var methods []string + switch v := raw.(type) { + case []string: + methods = v + case []any: + for _, m := range v { + if str, ok := m.(string); ok { + methods = append(methods, str) + } + } + default: + t.Fatalf("unexpected methods type %T: %v", raw, raw) + } + sort.Strings(methods) + if fmt.Sprint(methods) != fmt.Sprint([]string{"Close", "Read"}) { + t.Fatalf("methods = %v, want [Close Read]", methods) + } +} + +// testNodeDegreeAggregator exercises the optional +// graph.NodeDegreeAggregator capability. Builds a small graph with +// nodes that cover every classification branch +// graph.GraphConnectivity / graph.ClassifyZeroEdge care about: +// +// - isolated (zero edges). +// - leaf (exactly one edge in either direction). +// - usage-edge in-bound only (alive — at least one EdgeCalls in). +// - non-usage-edge in-bound only (no EdgeCalls / EdgeReferences / +// etc — counts as "likely unused"). +// - usage-edge mixed with non-usage in-edges (still alive). +// - unknown id (must be elided). 
+func testNodeDegreeAggregator(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + dc, ok := s.(graph.NodeDegreeAggregator) + if !ok { + t.Skip("backend does not implement graph.NodeDegreeAggregator") + } + + s.AddNode(mkNode("Isolated", "Isolated", "a.go", graph.KindFunction)) + s.AddNode(mkNode("LeafSink", "LeafSink", "a.go", graph.KindFunction)) + s.AddNode(mkNode("LeafSource", "LeafSource", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Alive", "Alive", "a.go", graph.KindFunction)) + s.AddNode(mkNode("StructuralOnly", "StructuralOnly", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Mixed", "Mixed", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Caller", "Caller", "a.go", graph.KindFunction)) + s.AddNode(mkNode("FileNode", "FileNode", "a.go", graph.KindFile)) + + // One incoming call into LeafSink → leaf (in_count=1, out_count=0). + e1 := mkEdge("Caller", "LeafSink", graph.EdgeCalls) + e1.Line = 1 + s.AddEdge(e1) + // One outgoing reference from LeafSource → leaf (in=0, out=1). + e2 := mkEdge("LeafSource", "Caller", graph.EdgeReferences) + e2.Line = 2 + s.AddEdge(e2) + // Alive: incoming call → alive (in=1 usage). + e3 := mkEdge("Caller", "Alive", graph.EdgeCalls) + e3.Line = 3 + s.AddEdge(e3) + // StructuralOnly: incoming EdgeDefines (NOT a usage kind) → + // classified as "likely unused" but not isolated. + e4 := mkEdge("FileNode", "StructuralOnly", graph.EdgeDefines) + e4.Line = 4 + s.AddEdge(e4) + // Mixed: incoming EdgeDefines (non-usage) + incoming EdgeCalls + // (usage). UsageInCount must reflect ONLY the usage edge. 
+ e5 := mkEdge("FileNode", "Mixed", graph.EdgeDefines) + e5.Line = 5 + s.AddEdge(e5) + e6 := mkEdge("Caller", "Mixed", graph.EdgeCalls) + e6.Line = 6 + s.AddEdge(e6) + + ids := []string{ + "Isolated", + "LeafSink", + "LeafSource", + "Alive", + "StructuralOnly", + "Mixed", + "unknown::id", + } + usage := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} + rows := dc.NodeDegreeCounts(ids, usage) + + byID := make(map[string]graph.NodeDegreeRow, len(rows)) + for _, r := range rows { + byID[r.NodeID] = r + } + // Unknown id MUST be elided. + if _, ok := byID["unknown::id"]; ok { + t.Fatalf("NodeDegreeCounts must elide unknown ids, got row") + } + + type want struct{ in, out, usageIn int } + cases := map[string]want{ + "Isolated": {0, 0, 0}, + "LeafSink": {1, 0, 1}, + "LeafSource": {0, 1, 0}, + "Alive": {1, 0, 1}, + "StructuralOnly": {1, 0, 0}, + "Mixed": {2, 0, 1}, + } + for id, w := range cases { + got, ok := byID[id] + if !ok { + t.Errorf("missing row for %s", id) + continue + } + if got.InCount != w.in || got.OutCount != w.out || got.UsageInCount != w.usageIn { + t.Errorf("row %s = in=%d out=%d usage=%d, want in=%d out=%d usage=%d", + id, got.InCount, got.OutCount, got.UsageInCount, + w.in, w.out, w.usageIn) + } + } + + // Empty ids returns nil — never the whole graph. + if got := dc.NodeDegreeCounts(nil, usage); len(got) != 0 { + t.Fatalf("NodeDegreeCounts(nil) = %d, want 0", len(got)) + } + + // Empty usage kinds means UsageInCount is always 0 (totals + // still populated). + noUsage := dc.NodeDegreeCounts([]string{"Mixed"}, nil) + if len(noUsage) != 1 { + t.Fatalf("NodeDegreeCounts(Mixed, nil) = %d rows, want 1", len(noUsage)) + } + if noUsage[0].InCount != 2 || noUsage[0].UsageInCount != 0 { + t.Fatalf("NodeDegreeCounts(Mixed, nil) = in=%d usage=%d, want in=2 usage=0", + noUsage[0].InCount, noUsage[0].UsageInCount) + } +} + +// testNodeFanAggregator exercises the optional +// graph.NodeFanAggregator capability. 
Builds a small graph that +// exercises the per-direction kind filter independently: +// +// - Hub: high fan-in (Calls + References) AND high fan-out (Calls). +// - Leaf: zero fan in either direction. +// - ReadHeavy: incoming Reads only — fan-in must be 0 when the +// filter is Calls+References. +// - CallerOnly: outgoing Calls only — fan-out non-zero, fan-in 0. +// - Unknown id elided. +func testNodeFanAggregator(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + fa, ok := s.(graph.NodeFanAggregator) + if !ok { + t.Skip("backend does not implement graph.NodeFanAggregator") + } + + s.AddNode(mkNode("Hub", "Hub", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Leaf", "Leaf", "a.go", graph.KindFunction)) + s.AddNode(mkNode("ReadHeavy", "ReadHeavy", "a.go", graph.KindFunction)) + s.AddNode(mkNode("CallerOnly", "CallerOnly", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Target1", "Target1", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Target2", "Target2", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Src1", "Src1", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Src2", "Src2", "a.go", graph.KindFunction)) + + // Hub: 2 incoming Calls + 1 incoming Reference + 2 outgoing + // Calls + 1 outgoing Reference. With fan-in=Calls+Refs and + // fan-out=Calls: fan_in=3, fan_out=2. + add := func(from, to string, kind graph.EdgeKind, line int) { + e := mkEdge(from, to, kind) + e.Line = line + s.AddEdge(e) + } + add("Src1", "Hub", graph.EdgeCalls, 1) + add("Src2", "Hub", graph.EdgeCalls, 2) + add("Src1", "Hub", graph.EdgeReferences, 3) + add("Hub", "Target1", graph.EdgeCalls, 4) + add("Hub", "Target2", graph.EdgeCalls, 5) + add("Hub", "Target1", graph.EdgeReferences, 6) + + // ReadHeavy: incoming Reads only. + add("Src1", "ReadHeavy", graph.EdgeReads, 7) + add("Src2", "ReadHeavy", graph.EdgeReads, 8) + + // CallerOnly: outgoing Calls only. 
+ add("CallerOnly", "Target1", graph.EdgeCalls, 9) + + ids := []string{"Hub", "Leaf", "ReadHeavy", "CallerOnly", "unknown::id"} + rows := fa.NodeFanCounts(ids, + []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences}, + []graph.EdgeKind{graph.EdgeCalls}, + ) + + byID := make(map[string]graph.NodeFanRow, len(rows)) + for _, r := range rows { + byID[r.NodeID] = r + } + if _, ok := byID["unknown::id"]; ok { + t.Fatalf("NodeFanCounts must elide unknown ids, got row") + } + + type want struct{ in, out int } + cases := map[string]want{ + "Hub": {3, 2}, + "Leaf": {0, 0}, + "ReadHeavy": {0, 0}, + "CallerOnly": {0, 1}, + } + for id, w := range cases { + got, ok := byID[id] + if !ok { + t.Errorf("missing row for %s", id) + continue + } + if got.FanIn != w.in || got.FanOut != w.out { + t.Errorf("row %s = in=%d out=%d, want in=%d out=%d", + id, got.FanIn, got.FanOut, w.in, w.out) + } + } + + // Empty ids returns nil. + if got := fa.NodeFanCounts(nil, []graph.EdgeKind{graph.EdgeCalls}, nil); len(got) != 0 { + t.Fatalf("NodeFanCounts(nil) = %d, want 0", len(got)) + } + + // Empty kind sets → all-zero rows for known ids only. + zeros := fa.NodeFanCounts([]string{"Hub", "unknown::id"}, nil, nil) + if len(zeros) != 1 { + t.Fatalf("NodeFanCounts(empty kinds) = %d rows, want 1 (Hub only)", len(zeros)) + } + if zeros[0].NodeID != "Hub" || zeros[0].FanIn != 0 || zeros[0].FanOut != 0 { + t.Fatalf("NodeFanCounts(empty kinds) = %+v, want Hub/0/0", zeros[0]) + } +} + +// testFileImporters exercises the optional graph.FileImporters +// capability. Seeds two importing files (one production, one test) +// plus an unrelated import edge that targets a different file. The +// returned rows must include exactly the importers of the target +// file — both via the file-node ID and via the FilePath-on-symbol +// shape — and must not surface the unrelated edge. 
+func testFileImporters(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + fi, ok := s.(graph.FileImporters) + if !ok { + t.Skip("backend does not implement graph.FileImporters") + } + + // target file node + a symbol inside it. + s.AddNode(mkNode("pkg/target.go", "target.go", "pkg/target.go", graph.KindFile)) + s.AddNode(mkNode("TargetFunc", "TargetFunc", "pkg/target.go", graph.KindFunction)) + + // Two importing files: one production, one test. Each has an + // import edge — one targets the file node by id, the other + // targets a symbol inside the file (FilePath match path). + s.AddNode(mkNode("pkg/prod.go", "prod.go", "pkg/prod.go", graph.KindFile)) + s.AddNode(mkNode("pkg/test_test.go", "test_test.go", "pkg/test_test.go", graph.KindFile)) + + // And an unrelated importer that points elsewhere — must NOT + // surface in the results. + s.AddNode(mkNode("pkg/other.go", "other.go", "pkg/other.go", graph.KindFile)) + s.AddNode(mkNode("pkg/elsewhere.go", "elsewhere.go", "pkg/elsewhere.go", graph.KindFile)) + + s.AddEdge(mkEdge("pkg/prod.go", "pkg/target.go", graph.EdgeImports)) + s.AddEdge(mkEdge("pkg/test_test.go", "TargetFunc", graph.EdgeImports)) + s.AddEdge(mkEdge("pkg/other.go", "pkg/elsewhere.go", graph.EdgeImports)) + // A non-imports edge to the target file must also drop out. 
+ s.AddEdge(mkEdge("pkg/prod.go", "TargetFunc", graph.EdgeCalls)) + + rows := fi.FileImporters("pkg/target.go") + got := make([]string, 0, len(rows)) + for _, r := range rows { + got = append(got, r.FromFile) + } + sort.Strings(got) + want := []string{"pkg/prod.go", "pkg/test_test.go"} + if fmt.Sprint(got) != fmt.Sprint(want) { + t.Fatalf("FileImporters = %v, want %v", got, want) + } + + if got := fi.FileImporters(""); len(got) != 0 { + t.Fatalf("FileImporters(empty) = %d rows, want 0", len(got)) + } + if got := fi.FileImporters("pkg/no_such.go"); len(got) != 0 { + t.Fatalf("FileImporters(unknown) = %d rows, want 0", len(got)) + } +} + +// testInEdgeCounter exercises the optional graph.InEdgeCounter +// capability. Seeds a small graph and asserts the per-To fan-in +// count matches what an AllEdges-bucketing loop would compute for +// the same edge-kind set. +func testInEdgeCounter(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + ic, ok := s.(graph.InEdgeCounter) + if !ok { + t.Skip("backend does not implement graph.InEdgeCounter") + } + + s.AddNode(mkNode("A", "A", "a.go", graph.KindFunction)) + s.AddNode(mkNode("B", "B", "a.go", graph.KindFunction)) + s.AddNode(mkNode("C", "C", "a.go", graph.KindFunction)) + s.AddNode(mkNode("T", "T", "a.go", graph.KindType)) + + // B is called twice (from A and C), referenced once (from A). + e1 := mkEdge("A", "B", graph.EdgeCalls) + e1.Line = 1 + e2 := mkEdge("C", "B", graph.EdgeCalls) + e2.Line = 2 + e3 := mkEdge("A", "B", graph.EdgeReferences) + e3.Line = 3 + // T is referenced once and held by an import edge that should + // not be counted under {calls,references}. 
+ e4 := mkEdge("A", "T", graph.EdgeReferences) + e4.Line = 4 + e5 := mkEdge("A", "T", graph.EdgeImports) + e5.Line = 5 + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + s.AddEdge(e4) + s.AddEdge(e5) + + got := ic.InEdgeCountsByKind([]graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences}) + if got["B"] != 3 { + t.Fatalf("count[B] = %d, want 3", got["B"]) + } + if got["T"] != 1 { + t.Fatalf("count[T] = %d, want 1", got["T"]) + } + if _, ok := got["A"]; ok { + t.Fatalf("A should have zero matching incoming edges, got %d", got["A"]) + } + + // Empty kind list must return nil — never the whole graph. + if got := ic.InEdgeCountsByKind(nil); got != nil { + t.Fatalf("InEdgeCountsByKind(nil) = %v, want nil", got) + } + + // Single-kind filter dedups when callers pass duplicates. + got2 := ic.InEdgeCountsByKind([]graph.EdgeKind{graph.EdgeCalls, graph.EdgeCalls}) + if got2["B"] != 2 { + t.Fatalf("count[B] (calls only, deduped) = %d, want 2", got2["B"]) + } +} + +// testNodesInFilesByKindFinder exercises the optional +// graph.NodesInFilesByKindFinder capability. Seeds a graph spanning +// three files and three kinds; the result must include only the +// requested-kind nodes whose FilePath sits in the requested file +// set. +func testNodesInFilesByKindFinder(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + fn, ok := s.(graph.NodesInFilesByKindFinder) + if !ok { + t.Skip("backend does not implement graph.NodesInFilesByKindFinder") + } + + // f1.go: function + method + type. + s.AddNode(mkNode("f1::F1", "F1", "f1.go", graph.KindFunction)) + s.AddNode(mkNode("f1::M1", "M1", "f1.go", graph.KindMethod)) + s.AddNode(mkNode("f1::T1", "T1", "f1.go", graph.KindType)) + // f2.go: function only. + s.AddNode(mkNode("f2::F2", "F2", "f2.go", graph.KindFunction)) + // f3.go: drops out of every result — not in the requested files. 
+ s.AddNode(mkNode("f3::F3", "F3", "f3.go", graph.KindFunction)) + + got := fn.NodesInFilesByKind( + []string{"f1.go", "f2.go"}, + []graph.NodeKind{graph.KindFunction, graph.KindMethod}, + ) + gotIDs := sortNodeIDs(got) + want := []string{"f1::F1", "f1::M1", "f2::F2"} + if fmt.Sprint(gotIDs) != fmt.Sprint(want) { + t.Fatalf("NodesInFilesByKind = %v, want %v", gotIDs, want) + } + + // Empty files / kinds must return nil — never a whole-graph scan. + if got := fn.NodesInFilesByKind(nil, []graph.NodeKind{graph.KindFunction}); got != nil { + t.Fatalf("NodesInFilesByKind(nil files) = %v, want nil", got) + } + if got := fn.NodesInFilesByKind([]string{"f1.go"}, nil); got != nil { + t.Fatalf("NodesInFilesByKind(nil kinds) = %v, want nil", got) + } + + // Dedup: passing the same file / kind twice must not double-yield. + gotDup := fn.NodesInFilesByKind( + []string{"f1.go", "f1.go"}, + []graph.NodeKind{graph.KindType, graph.KindType}, + ) + if len(gotDup) != 1 || gotDup[0].ID != "f1::T1" { + t.Fatalf("NodesInFilesByKind(dup) = %v, want [f1::T1]", sortNodeIDs(gotDup)) + } +} + +// testEdgesByKindsScanner exercises the optional +// graph.EdgesByKindsScanner capability. Builds a small graph with a +// mix of edge kinds, then verifies the streaming filter returns +// exactly the union of the requested kinds in any order. Covers the +// edge cases that the edge-driven analyzers rely on: zero-match (no +// edge matches the requested kinds), empty filter (yields nothing — +// never a whole-table scan), and early stop honouring the iterator +// contract. 
+func testEdgesByKindsScanner(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + s.AddNode(mkNode("c", "C", "y.go", graph.KindType)) + s.AddNode(mkNode("d", "D", "y.go", graph.KindField)) + + calls1 := mkEdge("a", "b", graph.EdgeCalls) + calls1.Line = 1 + calls2 := mkEdge("a", "b", graph.EdgeCalls) + calls2.Line = 2 + refs := mkEdge("a", "c", graph.EdgeReferences) + writes := mkEdge("a", "d", graph.EdgeWrites) + throws := mkEdge("a", "c", graph.EdgeThrows) + s.AddEdge(calls1) + s.AddEdge(calls2) + s.AddEdge(refs) + s.AddEdge(writes) + s.AddEdge(throws) + + es, ok := s.(graph.EdgesByKindsScanner) + if !ok { + t.Skip("backend does not implement graph.EdgesByKindsScanner") + } + + // Multi-kind: union of Calls + References must surface all three + // calls/refs edges; counts (not pointers) compared so the in-memory + // and disk backends agree without relying on edge identity. + counts := map[graph.EdgeKind]int{} + for e := range es.EdgesByKinds([]graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences}) { + counts[e.Kind]++ + } + if counts[graph.EdgeCalls] != 2 || counts[graph.EdgeReferences] != 1 { + t.Fatalf("EdgesByKinds(Calls,References) = %+v, want Calls:2 References:1", counts) + } + if got := len(counts); got != 2 { + t.Fatalf("EdgesByKinds(Calls,References) yielded %d distinct kinds, want 2", got) + } + + // Single-kind via the multi-kind path must match EdgesByKind. + single := 0 + for e := range es.EdgesByKinds([]graph.EdgeKind{graph.EdgeWrites}) { + if e.Kind != graph.EdgeWrites { + t.Fatalf("EdgesByKinds(Writes) yielded kind=%s, want Writes", e.Kind) + } + single++ + } + if single != 1 { + t.Fatalf("EdgesByKinds(Writes) yielded %d, want 1", single) + } + + // Dedupe: repeating a kind must not double-yield. The backend's + // IN-list MUST collapse duplicates. 
+ dup := 0 + for range es.EdgesByKinds([]graph.EdgeKind{graph.EdgeCalls, graph.EdgeCalls}) { + dup++ + } + if dup != 2 { + t.Fatalf("EdgesByKinds(Calls,Calls) yielded %d, want 2 (no double-yield)", dup) + } + + // Empty kinds yields nothing — never a whole-table scan. + empty := 0 + for range es.EdgesByKinds(nil) { + empty++ + } + if empty != 0 { + t.Fatalf("EdgesByKinds(nil) yielded %d, want 0", empty) + } + emptySlice := 0 + for range es.EdgesByKinds([]graph.EdgeKind{}) { + emptySlice++ + } + if emptySlice != 0 { + t.Fatalf("EdgesByKinds([]) yielded %d, want 0", emptySlice) + } + + // Empty string kinds get elided (matches dedupeEdgeKinds contract). + blank := 0 + for range es.EdgesByKinds([]graph.EdgeKind{"", "", ""}) { + blank++ + } + if blank != 0 { + t.Fatalf("EdgesByKinds(blank) yielded %d, want 0", blank) + } + + // Zero-match: a kind nothing in the graph uses yields nothing. + zero := 0 + for range es.EdgesByKinds([]graph.EdgeKind{graph.EdgeKind("nonexistent")}) { + zero++ + } + if zero != 0 { + t.Fatalf("EdgesByKinds(nonexistent) yielded %d, want 0", zero) + } + + // Early stop honours the iterator contract. + stopped := 0 + for range es.EdgesByKinds([]graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences}) { + stopped++ + if stopped == 1 { + break + } + } + if stopped != 1 { + t.Fatalf("early stop yielded %d before break, want 1", stopped) + } +} + +// testNodesByKindsScanner exercises the optional graph.NodesByKindsScanner +// capability. Seeds nodes of several kinds, including ones whose Meta +// holds the keys the metadata analyzers read, and asserts: +// - the IN-list returns exactly the union of the requested kinds +// (with nodes' Meta intact so post-filtering still works); +// - kinds the caller did not request never surface; +// - empty / nil kinds returns nil without scanning; +// - duplicate kinds in the input never duplicate the output. 
+// +// The Meta-preservation assertion is the load-bearing one: every +// downstream handler still runs its meta gate in Go after the kind +// pushdown, so the capability is worthless if Meta doesn't round-trip +// through the backend. +func testNodesByKindsScanner(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.NodesByKindsScanner) + if !ok { + t.Skip("backend does not implement graph.NodesByKindsScanner") + } + + // Two functions (one with coverage meta), one method, one type, + // one file (with cgo meta), one todo (with assignee meta), one + // table. Mix of meta-bearing and meta-bare nodes so the + // round-trip assertion covers both shapes. Meta values stay + // scalar — testMetaPreserved already covers flat round-trip, and + // the disk backend's gob encoder needs gob.Register for nested + // map shapes (out of scope for a kind-pushdown capability test). + fn1 := mkNode("pkg/a.go::Fn1", "Fn1", "pkg/a.go", graph.KindFunction) + fn1.Meta = map[string]any{ + "coverage_pct": 42.5, + "author_email": "alice@example.com", + } + fn2 := mkNode("pkg/a.go::Fn2", "Fn2", "pkg/a.go", graph.KindFunction) + method := mkNode("pkg/a.go::T.M", "M", "pkg/a.go", graph.KindMethod) + typ := mkNode("pkg/a.go::T", "T", "pkg/a.go", graph.KindType) + file := mkNode("pkg/a.go", "a.go", "pkg/a.go", graph.KindFile) + file.Meta = map[string]any{"uses_cgo": true} + todo := mkNode("pkg/a.go::TODO:7", "TODO", "pkg/a.go", graph.KindTodo) + todo.Meta = map[string]any{ + "tag": "TODO", + "assignee": "alice", + "text": "wire this up", + } + tbl := mkNode("table::users", "users", "schema/001.sql", graph.KindTable) + tbl.Meta = map[string]any{"table": "users", "dialect": "postgres"} + + for _, n := range []*graph.Node{fn1, fn2, method, typ, file, todo, tbl} { + s.AddNode(n) + } + + // Function + method — the stale_code/ownership/coverage default. 
+ gotFnM := scan.NodesByKinds([]graph.NodeKind{graph.KindFunction, graph.KindMethod}) + wantFnM := []string{"pkg/a.go::Fn1", "pkg/a.go::Fn2", "pkg/a.go::T.M"} + if got := sortNodeIDs(gotFnM); fmt.Sprint(got) != fmt.Sprint(wantFnM) { + t.Fatalf("NodesByKinds(function,method) = %v, want %v", got, wantFnM) + } + + // Meta round-trip: pick up Fn1 and assert flat scalar meta survived. + var fn1Got *graph.Node + for _, n := range gotFnM { + if n.ID == "pkg/a.go::Fn1" { + fn1Got = n + break + } + } + if fn1Got == nil { + t.Fatalf("Fn1 missing from result") + } + if pct, _ := fn1Got.Meta["coverage_pct"].(float64); pct != 42.5 { + t.Fatalf("Fn1.Meta.coverage_pct = %v, want 42.5", fn1Got.Meta["coverage_pct"]) + } + if email, _ := fn1Got.Meta["author_email"].(string); email != "alice@example.com" { + t.Fatalf("Fn1.Meta.author_email = %q, want alice@example.com", email) + } + + // Single kind on a kind with meta — todo/file. + gotTodo := scan.NodesByKinds([]graph.NodeKind{graph.KindTodo}) + if len(gotTodo) != 1 || gotTodo[0].ID != "pkg/a.go::TODO:7" { + t.Fatalf("NodesByKinds(todo) = %v, want [pkg/a.go::TODO:7]", sortNodeIDs(gotTodo)) + } + if tag, _ := gotTodo[0].Meta["tag"].(string); tag != "TODO" { + t.Fatalf("Todo.Meta.tag = %q, want TODO", tag) + } + + gotFile := scan.NodesByKinds([]graph.NodeKind{graph.KindFile}) + if len(gotFile) != 1 || gotFile[0].ID != "pkg/a.go" { + t.Fatalf("NodesByKinds(file) = %v, want [pkg/a.go]", sortNodeIDs(gotFile)) + } + if cgo, _ := gotFile[0].Meta["uses_cgo"].(bool); !cgo { + t.Fatalf("File.Meta.uses_cgo = false, want true") + } + + // Table kind — for orphan/unreferenced analyzers. + gotTbl := scan.NodesByKinds([]graph.NodeKind{graph.KindTable}) + if len(gotTbl) != 1 || gotTbl[0].ID != "table::users" { + t.Fatalf("NodesByKinds(table) = %v, want [table::users]", sortNodeIDs(gotTbl)) + } + + // Empty / nil kinds — nil result, no scan. 
+ if got := scan.NodesByKinds(nil); got != nil { + t.Fatalf("NodesByKinds(nil) = %v, want nil", got) + } + if got := scan.NodesByKinds([]graph.NodeKind{}); got != nil { + t.Fatalf("NodesByKinds([]) = %v, want nil", got) + } + + // Unknown kind — no rows, but still nil/empty, never the full table. + if got := scan.NodesByKinds([]graph.NodeKind{graph.NodeKind("no_such_kind")}); len(got) != 0 { + t.Fatalf("NodesByKinds(unknown) = %v, want 0 rows", got) + } + + // Dedup: passing the same kind twice must not double-yield. + gotDup := scan.NodesByKinds([]graph.NodeKind{graph.KindFunction, graph.KindFunction}) + wantDup := []string{"pkg/a.go::Fn1", "pkg/a.go::Fn2"} + if got := sortNodeIDs(gotDup); fmt.Sprint(got) != fmt.Sprint(wantDup) { + t.Fatalf("NodesByKinds(dup function) = %v, want %v", got, wantDup) + } +} + +// testEdgeKindCounter exercises the optional graph.EdgeKindCounter +// capability. Seeds a graph with several kinds in different +// frequencies and asserts the per-kind tally matches what an +// AllEdges()+map[kind]++ loop would compute. +func testEdgeKindCounter(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + ek, ok := s.(graph.EdgeKindCounter) + if !ok { + t.Skip("backend does not implement graph.EdgeKindCounter") + } + + // Empty graph returns nil or empty — both are valid per the + // contract; callers must treat them the same. + if got := ek.EdgeKindCounts(); len(got) != 0 { + t.Fatalf("EdgeKindCounts(empty) = %v, want empty", got) + } + + s.AddNode(mkNode("A", "A", "a.go", graph.KindFunction)) + s.AddNode(mkNode("B", "B", "a.go", graph.KindFunction)) + s.AddNode(mkNode("C", "C", "a.go", graph.KindFunction)) + s.AddNode(mkNode("f1", "a.go", "a.go", graph.KindFile)) + + // 3 calls, 2 references, 1 imports. 
+ e1 := mkEdge("A", "B", graph.EdgeCalls) + e2 := mkEdge("A", "C", graph.EdgeCalls) + e2.Line = 2 + e3 := mkEdge("B", "C", graph.EdgeCalls) + e3.Line = 3 + e4 := mkEdge("A", "C", graph.EdgeReferences) + e4.Line = 4 + e5 := mkEdge("B", "C", graph.EdgeReferences) + e5.Line = 5 + e6 := mkEdge("A", "f1", graph.EdgeImports) + e6.Line = 6 + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + s.AddEdge(e4) + s.AddEdge(e5) + s.AddEdge(e6) + + got := ek.EdgeKindCounts() + if got[graph.EdgeCalls] != 3 { + t.Fatalf("EdgeKindCounts[calls] = %d, want 3", got[graph.EdgeCalls]) + } + if got[graph.EdgeReferences] != 2 { + t.Fatalf("EdgeKindCounts[references] = %d, want 2", got[graph.EdgeReferences]) + } + if got[graph.EdgeImports] != 1 { + t.Fatalf("EdgeKindCounts[imports] = %d, want 1", got[graph.EdgeImports]) + } + // No extends edge was added; absence must produce 0 via the + // zero value (callers index with `m[k]`). + if got[graph.EdgeExtends] != 0 { + t.Fatalf("EdgeKindCounts[extends] = %d, want 0", got[graph.EdgeExtends]) + } +} + +// testCrossRepoEdgeAggregator exercises the optional +// graph.CrossRepoEdgeAggregator capability. Seeds a two-repo graph +// with one cross_repo_calls + one cross_repo_implements and two +// same-repo edges of other kinds. Asserts the per-triple counts and +// that single-repo edges drop out. +func testCrossRepoEdgeAggregator(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + ag, ok := s.(graph.CrossRepoEdgeAggregator) + if !ok { + t.Skip("backend does not implement graph.CrossRepoEdgeAggregator") + } + + // Empty graph -> nil. 
+ if got := ag.CrossRepoEdgeCounts(); got != nil { + t.Fatalf("CrossRepoEdgeCounts(empty) = %v, want nil", got) + } + + s.AddNode(mkRepoNode("repoA::Caller", "Caller", "a/c.go", "repoA", graph.KindFunction)) + s.AddNode(mkRepoNode("repoA::Callee2", "Callee2", "a/d.go", "repoA", graph.KindFunction)) + s.AddNode(mkRepoNode("repoB::Callee", "Callee", "b/d.go", "repoB", graph.KindFunction)) + s.AddNode(mkRepoNode("repoB::Iface", "Iface", "b/i.go", "repoB", graph.KindType)) + s.AddNode(mkRepoNode("repoA::Impl", "Impl", "a/i.go", "repoA", graph.KindType)) + + // Two cross-repo edges to the same (kind, fromRepo, toRepo) + + // one cross-repo implements + one non-cross edge. + e1 := mkEdge("repoA::Caller", "repoB::Callee", graph.EdgeCrossRepoCalls) + e2 := mkEdge("repoA::Caller", "repoB::Callee", graph.EdgeCrossRepoCalls) + e2.Line = 2 + e3 := mkEdge("repoA::Impl", "repoB::Iface", graph.EdgeCrossRepoImplements) + e3.Line = 3 + e4 := mkEdge("repoA::Caller", "repoA::Callee2", graph.EdgeCalls) + e4.Line = 4 + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + s.AddEdge(e4) + + rows := ag.CrossRepoEdgeCounts() + // Sort for stable assertions — capability output order is + // unspecified. 
+ sort.Slice(rows, func(i, j int) bool { + if rows[i].Kind != rows[j].Kind { + return rows[i].Kind < rows[j].Kind + } + if rows[i].FromRepo != rows[j].FromRepo { + return rows[i].FromRepo < rows[j].FromRepo + } + return rows[i].ToRepo < rows[j].ToRepo + }) + if len(rows) != 2 { + t.Fatalf("CrossRepoEdgeCounts: got %d rows, want 2 (rows=%v)", len(rows), rows) + } + if rows[0].Kind != graph.EdgeCrossRepoCalls || rows[0].FromRepo != "repoA" || rows[0].ToRepo != "repoB" || rows[0].Count != 2 { + t.Fatalf("CrossRepoEdgeCounts[0] = %+v, want {cross_repo_calls,repoA,repoB,2}", rows[0]) + } + if rows[1].Kind != graph.EdgeCrossRepoImplements || rows[1].FromRepo != "repoA" || rows[1].ToRepo != "repoB" || rows[1].Count != 1 { + t.Fatalf("CrossRepoEdgeCounts[1] = %+v, want {cross_repo_implements,repoA,repoB,1}", rows[1]) + } +} + +// testFileImportAggregator exercises the optional +// graph.FileImportAggregator capability. Seeds a graph with several +// import edges and asserts the per-target-file counts. Covers both +// the unscoped and the scope-bound paths plus the file-node-by-ID +// vs symbol-FilePath import shapes. +func testFileImportAggregator(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + ag, ok := s.(graph.FileImportAggregator) + if !ok { + t.Skip("backend does not implement graph.FileImportAggregator") + } + + if got := ag.FileImportCounts(nil); got != nil { + t.Fatalf("FileImportCounts(empty graph) = %v, want nil", got) + } + + // Two targets, three importing files, mixed shapes. 
+ s.AddNode(mkNode("pkg/popular.go", "popular.go", "pkg/popular.go", graph.KindFile)) + s.AddNode(mkNode("PopularFn", "PopularFn", "pkg/popular.go", graph.KindFunction)) + s.AddNode(mkNode("pkg/lonely.go", "lonely.go", "pkg/lonely.go", graph.KindFile)) + s.AddNode(mkNode("pkg/a.go", "a.go", "pkg/a.go", graph.KindFile)) + s.AddNode(mkNode("pkg/b.go", "b.go", "pkg/b.go", graph.KindFile)) + s.AddNode(mkNode("pkg/c.go", "c.go", "pkg/c.go", graph.KindFile)) + + // pkg/popular.go imported by 3 files (two via file-id, one via symbol-FilePath). + s.AddEdge(mkEdge("pkg/a.go", "pkg/popular.go", graph.EdgeImports)) + s.AddEdge(mkEdge("pkg/b.go", "pkg/popular.go", graph.EdgeImports)) + s.AddEdge(mkEdge("pkg/c.go", "PopularFn", graph.EdgeImports)) + // pkg/lonely.go imported once. + s.AddEdge(mkEdge("pkg/a.go", "pkg/lonely.go", graph.EdgeImports)) + // A calls edge — must drop out of imports counts. + s.AddEdge(mkEdge("pkg/a.go", "PopularFn", graph.EdgeCalls)) + + rows := ag.FileImportCounts(nil) + got := map[string]int{} + for _, r := range rows { + got[r.FilePath] = r.Count + } + if got["pkg/popular.go"] != 3 { + t.Fatalf("FileImportCounts[popular.go] = %d, want 3", got["pkg/popular.go"]) + } + if got["pkg/lonely.go"] != 1 { + t.Fatalf("FileImportCounts[lonely.go] = %d, want 1", got["pkg/lonely.go"]) + } + + // Scope-bound: only count edges whose target is in the allow set. + scoped := ag.FileImportCounts([]string{"pkg/lonely.go"}) + if len(scoped) != 1 || scoped[0].FilePath != "pkg/lonely.go" || scoped[0].Count != 1 { + t.Fatalf("FileImportCounts(scope=lonely) = %v, want [lonely.go:1]", scoped) + } + + // Scope-bound with file-id + symbol shape both targeting popular. 
+ scopedPop := ag.FileImportCounts([]string{"pkg/popular.go", "PopularFn"}) + gotPop := map[string]int{} + for _, r := range scopedPop { + gotPop[r.FilePath] = r.Count + } + if gotPop["pkg/popular.go"] != 3 { + t.Fatalf("FileImportCounts(scope=popular+sym) = %v, want popular.go:3", scopedPop) + } + + // Empty (non-nil) scope MUST return nil — never a whole-graph scan. + if got := ag.FileImportCounts([]string{}); got != nil { + t.Fatalf("FileImportCounts(empty scope) = %v, want nil", got) + } +} + +// testInDegreeForNodes exercises the optional graph.InDegreeForNodes +// capability. Seeds a tiny graph with three targets carrying 0 / 1 / 3 +// incoming edges (of mixed kinds) and asserts the counter returns the +// per-target count restricted to the caller's id set. +func testInDegreeForNodes(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + ic, ok := s.(graph.InDegreeForNodes) + if !ok { + t.Skip("backend does not implement graph.InDegreeForNodes") + } + + s.AddNode(mkNode("Hub", "Hub", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Lonely", "Lonely", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Isolated", "Isolated", "a.go", graph.KindFunction)) + s.AddNode(mkNode("C1", "C1", "a.go", graph.KindFunction)) + s.AddNode(mkNode("C2", "C2", "a.go", graph.KindFunction)) + s.AddNode(mkNode("C3", "C3", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Outside", "Outside", "a.go", graph.KindFunction)) + + e1 := mkEdge("C1", "Hub", graph.EdgeCalls) + e1.Line = 1 + e2 := mkEdge("C2", "Hub", graph.EdgeReferences) + e2.Line = 2 + e3 := mkEdge("C3", "Hub", graph.EdgeReads) + e3.Line = 3 + e4 := mkEdge("C1", "Lonely", graph.EdgeCalls) + e4.Line = 4 + // One incoming edge that targets Outside — must NOT surface when + // Outside is absent from the caller's id list. 
+ e5 := mkEdge("C2", "Outside", graph.EdgeCalls) + e5.Line = 5 + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + s.AddEdge(e4) + s.AddEdge(e5) + + got := ic.InDegreeForNodes([]string{"Hub", "Lonely", "Isolated"}) + if got["Hub"] != 3 { + t.Fatalf("InDegreeForNodes[Hub] = %d, want 3", got["Hub"]) + } + if got["Lonely"] != 1 { + t.Fatalf("InDegreeForNodes[Lonely] = %d, want 1", got["Lonely"]) + } + // Isolated and Outside are absent — the contract drops zero-count + // targets from the map. + if _, ok := got["Isolated"]; ok { + t.Fatalf("InDegreeForNodes[Isolated] surfaced with value %d, want absent", got["Isolated"]) + } + if _, ok := got["Outside"]; ok { + t.Fatalf("InDegreeForNodes[Outside] surfaced — caller didn't ask for it") + } + + // Empty ids => nil (never a whole-table scan). + if got := ic.InDegreeForNodes(nil); got != nil { + t.Fatalf("InDegreeForNodes(nil) = %v, want nil", got) + } + if got := ic.InDegreeForNodes([]string{}); got != nil { + t.Fatalf("InDegreeForNodes(empty) = %v, want nil", got) + } + // Duplicated ids dedup naturally. + dup := ic.InDegreeForNodes([]string{"Hub", "Hub", "Hub"}) + if dup["Hub"] != 3 { + t.Fatalf("InDegreeForNodes(dup Hub) = %d, want 3", dup["Hub"]) + } +} + +// testReachableForwardByKinds exercises the optional +// graph.ReachableForwardByKinds capability. Seeds a small directed +// graph mixing allowed and disallowed edge kinds, then asserts the +// closure from the seed set is the transitive subset reachable +// through only the allowed kinds. 
+func testReachableForwardByKinds(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + rf, ok := s.(graph.ReachableForwardByKinds) + if !ok { + t.Skip("backend does not implement graph.ReachableForwardByKinds") + } + + // Layout: + // Test -> A (calls) + // A -> B (calls) + // B -> C (references) + // C -> D (reads) <-- disallowed kind: D unreachable + // X -> Y (calls) <-- disjoint subgraph: neither in closure + for _, id := range []string{"Test", "A", "B", "C", "D", "X", "Y"} { + s.AddNode(mkNode(id, id, "a.go", graph.KindFunction)) + } + e1 := mkEdge("Test", "A", graph.EdgeCalls) + e1.Line = 1 + e2 := mkEdge("A", "B", graph.EdgeCalls) + e2.Line = 2 + e3 := mkEdge("B", "C", graph.EdgeReferences) + e3.Line = 3 + e4 := mkEdge("C", "D", graph.EdgeReads) + e4.Line = 4 + e5 := mkEdge("X", "Y", graph.EdgeCalls) + e5.Line = 5 + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + s.AddEdge(e4) + s.AddEdge(e5) + + kinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} + got := rf.ReachableForwardByKinds([]string{"Test"}, kinds) + want := map[string]bool{"Test": true, "A": true, "B": true, "C": true} + for id := range want { + if !got[id] { + t.Fatalf("ReachableForwardByKinds: missing %q in closure %v", id, got) + } + } + if got["D"] { + t.Fatalf("ReachableForwardByKinds: D should not be reachable (reads is disallowed)") + } + if got["X"] || got["Y"] { + t.Fatalf("ReachableForwardByKinds: disjoint subgraph leaked: %v", got) + } + + // Empty seeds => nil. + if got := rf.ReachableForwardByKinds(nil, kinds); got != nil { + t.Fatalf("ReachableForwardByKinds(nil) = %v, want nil", got) + } + // Empty kinds => seed set only. + zero := rf.ReachableForwardByKinds([]string{"Test"}, nil) + if !zero["Test"] || zero["A"] { + t.Fatalf("ReachableForwardByKinds(no kinds) = %v, want {Test:true}", zero) + } + // Duplicate seeds dedup naturally. 
+ dup := rf.ReachableForwardByKinds([]string{"Test", "Test"}, kinds) + if !dup["Test"] || !dup["A"] || !dup["B"] || !dup["C"] { + t.Fatalf("ReachableForwardByKinds(dup seeds) = %v, want full closure", dup) + } +} + +// testThrowerErrorSurfacer exercises the optional +// graph.ThrowerErrorSurfacer capability. Seeds throwers with mixed +// error targets and EdgeEmits→KindString attachments, asserts the +// per-thrower row dedup + path-prefix filter both fire. +func testThrowerErrorSurfacer(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + ts, ok := s.(graph.ThrowerErrorSurfacer) + if !ok { + t.Skip("backend does not implement graph.ThrowerErrorSurfacer") + } + + // Throwers ThrowA (in pkg/keep/), ThrowB (in pkg/drop/). Targets + // ErrIO + ErrTimeout. ThrowA also emits two literal error_msg + // strings; one EdgeEmits goes to a non-error_msg context that + // must NOT surface in ErrorMsgs. + s.AddNode(mkNode("ThrowA", "ThrowA", "pkg/keep/a.go", graph.KindFunction)) + s.AddNode(mkNode("ThrowB", "ThrowB", "pkg/drop/b.go", graph.KindFunction)) + s.AddNode(mkNode("ErrIO", "ErrIO", "errors/io.go", graph.KindType)) + s.AddNode(mkNode("ErrTimeout", "ErrTimeout", "errors/io.go", graph.KindType)) + + msgOK1 := mkNode("msg1", "open failed", "pkg/keep/a.go", graph.KindString) + msgOK1.Meta = map[string]any{"context": "error_msg"} + s.AddNode(msgOK1) + msgOK2 := mkNode("msg2", "timeout", "pkg/keep/a.go", graph.KindString) + msgOK2.Meta = map[string]any{"context": "error_msg"} + s.AddNode(msgOK2) + // Wrong context — must be filtered out. + msgWrong := mkNode("msg3", "log line", "pkg/keep/a.go", graph.KindString) + msgWrong.Meta = map[string]any{"context": "log_msg"} + s.AddNode(msgWrong) + + // ThrowA throws ErrIO twice (dedup to one target) + ErrTimeout once. 
+ e1 := mkEdge("ThrowA", "ErrIO", graph.EdgeThrows) + e1.FilePath = "pkg/keep/a.go" + e1.Line = 10 + e2 := mkEdge("ThrowA", "ErrIO", graph.EdgeThrows) + e2.FilePath = "pkg/keep/a.go" + e2.Line = 12 + e3 := mkEdge("ThrowA", "ErrTimeout", graph.EdgeThrows) + e3.FilePath = "pkg/keep/a.go" + e3.Line = 14 + // ThrowB throws ErrIO once. + e4 := mkEdge("ThrowB", "ErrIO", graph.EdgeThrows) + e4.FilePath = "pkg/drop/b.go" + e4.Line = 4 + // EdgeEmits attachments for ThrowA. + e5 := mkEdge("ThrowA", "msg1", graph.EdgeEmits) + e5.Line = 11 + e6 := mkEdge("ThrowA", "msg2", graph.EdgeEmits) + e6.Line = 13 + e7 := mkEdge("ThrowA", "msg3", graph.EdgeEmits) + e7.Line = 15 + for _, e := range []*graph.Edge{e1, e2, e3, e4, e5, e6, e7} { + s.AddEdge(e) + } + + rows := ts.ThrowerErrorSurface("") + byID := map[string]graph.ThrowerErrorRow{} + for _, r := range rows { + byID[r.ThrowerID] = r + } + + a, ok := byID["ThrowA"] + if !ok { + t.Fatalf("ThrowerErrorSurface: ThrowA missing from rows %v", rows) + } + if a.Throws != 3 { + t.Fatalf("ThrowA.Throws = %d, want 3", a.Throws) + } + gotTargets := append([]string(nil), a.ErrorTargets...) + sort.Strings(gotTargets) + if fmt.Sprint(gotTargets) != fmt.Sprint([]string{"ErrIO", "ErrTimeout"}) { + t.Fatalf("ThrowA.ErrorTargets = %v, want [ErrIO ErrTimeout]", gotTargets) + } + gotMsgs := append([]string(nil), a.ErrorMsgs...) + sort.Strings(gotMsgs) + if fmt.Sprint(gotMsgs) != fmt.Sprint([]string{"open failed", "timeout"}) { + t.Fatalf("ThrowA.ErrorMsgs = %v, want [open failed timeout]", gotMsgs) + } + + b, ok := byID["ThrowB"] + if !ok || b.Throws != 1 || len(b.ErrorTargets) != 1 || b.ErrorTargets[0] != "ErrIO" { + t.Fatalf("ThrowB row = %+v, want Throws=1 ErrorTargets=[ErrIO]", b) + } + if len(b.ErrorMsgs) != 0 { + t.Fatalf("ThrowB.ErrorMsgs = %v, want empty", b.ErrorMsgs) + } + + // Path-prefix filter drops ThrowB (under pkg/drop/) and keeps ThrowA. 
+ keep := ts.ThrowerErrorSurface("pkg/keep/") + if len(keep) != 1 || keep[0].ThrowerID != "ThrowA" { + t.Fatalf("ThrowerErrorSurface(pkg/keep/) = %v, want only ThrowA", keep) + } + drop := ts.ThrowerErrorSurface("pkg/missing/") + if len(drop) != 0 { + t.Fatalf("ThrowerErrorSurface(pkg/missing/) = %v, want empty", drop) + } +} + +// testEdgeAdjacencyForKinds exercises the optional +// graph.EdgeAdjacencyForKinds capability. Seeds a graph mixing +// function/method/type nodes joined by Calls / References / Writes +// edges and asserts the iterator yields only (from, to) pairs whose +// edge kind is in the allowed set AND whose endpoints both fall in +// the allowed node-kind set. +func testEdgeAdjacencyForKinds(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.EdgeAdjacencyForKinds) + if !ok { + t.Skip("backend does not implement graph.EdgeAdjacencyForKinds") + } + + s.AddNode(mkNode("F1", "F1", "x.go", graph.KindFunction)) + s.AddNode(mkNode("F2", "F2", "x.go", graph.KindFunction)) + s.AddNode(mkNode("M1", "M1", "x.go", graph.KindMethod)) + s.AddNode(mkNode("T1", "T1", "y.go", graph.KindType)) + s.AddNode(mkNode("V1", "V1", "y.go", graph.KindVariable)) + + // F1 → F2 Calls (function→function, in-set) + e1 := mkEdge("F1", "F2", graph.EdgeCalls) + e1.Line = 1 + // F2 → M1 References (function→method, in-set) + e2 := mkEdge("F2", "M1", graph.EdgeReferences) + e2.Line = 2 + // F1 → T1 References (function→type, NOT in-set: T1 excluded) + e3 := mkEdge("F1", "T1", graph.EdgeReferences) + e3.Line = 3 + // T1 → F2 References (type→function, NOT in-set: T1 excluded) + e4 := mkEdge("T1", "F2", graph.EdgeReferences) + e4.Line = 4 + // M1 → F1 Writes (method→function, edge kind excluded) + e5 := mkEdge("M1", "F1", graph.EdgeWrites) + e5.Line = 5 + // F1 → V1 References (function→variable, NOT in-set: V1 excluded) + e6 := mkEdge("F1", "V1", graph.EdgeReferences) + e6.Line = 6 + for _, e := range []*graph.Edge{e1, e2, e3, e4, e5, e6} { + 
s.AddEdge(e) + } + + eKinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} + nKinds := []graph.NodeKind{graph.KindFunction, graph.KindMethod} + + got := make(map[[2]string]int) + for pair := range scan.EdgeAdjacencyForKinds(eKinds, nKinds) { + got[pair]++ + } + want := map[[2]string]int{ + {"F1", "F2"}: 1, + {"F2", "M1"}: 1, + } + if fmt.Sprint(got) != fmt.Sprint(want) { + t.Fatalf("EdgeAdjacencyForKinds = %v, want %v", got, want) + } + + // Empty edge kinds yields nothing — never a whole-table scan. + empty := 0 + for range scan.EdgeAdjacencyForKinds(nil, nKinds) { + empty++ + } + if empty != 0 { + t.Fatalf("EdgeAdjacencyForKinds(nil edges) yielded %d, want 0", empty) + } + // Empty node kinds yields nothing. + for range scan.EdgeAdjacencyForKinds(eKinds, nil) { + empty++ + } + if empty != 0 { + t.Fatalf("EdgeAdjacencyForKinds(nil nodes) yielded %d, want 0", empty) + } + // Zero-match: edge kind absent from graph yields nothing. + zero := 0 + for range scan.EdgeAdjacencyForKinds([]graph.EdgeKind{graph.EdgeKind("nonexistent")}, nKinds) { + zero++ + } + if zero != 0 { + t.Fatalf("EdgeAdjacencyForKinds(nonexistent edge) yielded %d, want 0", zero) + } + // Node-kind filter actually narrows: asking only for {Type} drops every pair. + narrowed := 0 + for range scan.EdgeAdjacencyForKinds(eKinds, []graph.NodeKind{graph.KindType}) { + narrowed++ + } + if narrowed != 0 { + t.Fatalf("EdgeAdjacencyForKinds(Type only) yielded %d, want 0", narrowed) + } + // Early stop honours the iterator contract. + stopped := 0 + for range scan.EdgeAdjacencyForKinds(eKinds, nKinds) { + stopped++ + if stopped == 1 { + break + } + } + if stopped != 1 { + t.Fatalf("early stop yielded %d before break, want 1", stopped) + } +} + +// testCommunityCrossingsByKind exercises the optional +// graph.CommunityCrossingsByKind capability. 
Seeds a small graph +// with a known community partition and asserts per-source crossing +// counts match for: no edges, all-same-community, all-cross, mixed. +func testCommunityCrossingsByKind(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.CommunityCrossingsByKind) + if !ok { + t.Skip("backend does not implement graph.CommunityCrossingsByKind") + } + + s.AddNode(mkNode("A1", "A1", "x.go", graph.KindFunction)) + s.AddNode(mkNode("A2", "A2", "x.go", graph.KindFunction)) + s.AddNode(mkNode("B1", "B1", "y.go", graph.KindFunction)) + s.AddNode(mkNode("B2", "B2", "y.go", graph.KindFunction)) + s.AddNode(mkNode("C1", "C1", "z.go", graph.KindFunction)) + + // A1 → A2 Calls (same community A — NOT a crossing) + e1 := mkEdge("A1", "A2", graph.EdgeCalls) + e1.Line = 1 + // A1 → B1 Calls (A→B — crossing) + e2 := mkEdge("A1", "B1", graph.EdgeCalls) + e2.Line = 2 + // A1 → C1 References (A→C — crossing, second from A1) + e3 := mkEdge("A1", "C1", graph.EdgeReferences) + e3.Line = 3 + // B1 → B2 References (same community B — NOT a crossing) + e4 := mkEdge("B1", "B2", graph.EdgeReferences) + e4.Line = 4 + // B2 → C1 Calls (B→C — crossing) + e5 := mkEdge("B2", "C1", graph.EdgeCalls) + e5.Line = 5 + // A2 → B2 Writes (different community but edge kind excluded) + e6 := mkEdge("A2", "B2", graph.EdgeWrites) + e6.Line = 6 + for _, e := range []*graph.Edge{e1, e2, e3, e4, e5, e6} { + s.AddEdge(e) + } + + communities := map[string]string{ + "A1": "A", "A2": "A", + "B1": "B", "B2": "B", + "C1": "C", + } + kinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} + + got := scan.CommunityCrossingsByKind(kinds, communities) + want := map[string]int{ + "A1": 2, // → B1 + → C1 + "B2": 1, // → C1 + } + if fmt.Sprint(got) != fmt.Sprint(want) { + t.Fatalf("CommunityCrossingsByKind(mixed) = %v, want %v", got, want) + } + + // All-same-community partition: no crossings at all. 
+ same := map[string]string{ + "A1": "A", "A2": "A", "B1": "A", "B2": "A", "C1": "A", + } + if r := scan.CommunityCrossingsByKind(kinds, same); len(r) != 0 { + t.Fatalf("CommunityCrossingsByKind(all-same) = %v, want empty", r) + } + + // All-cross-community partition: every edge in scope is a crossing. + allCross := map[string]string{ + "A1": "1", "A2": "2", "B1": "3", "B2": "4", "C1": "5", + } + allGot := scan.CommunityCrossingsByKind(kinds, allCross) + allWant := map[string]int{ + "A1": 3, // A1 has 3 in-scope out-edges + "B1": 1, // B1 → B2 (now also a crossing) + "B2": 1, // B2 → C1 + } + if fmt.Sprint(allGot) != fmt.Sprint(allWant) { + t.Fatalf("CommunityCrossingsByKind(all-cross) = %v, want %v", allGot, allWant) + } + + // Empty kinds returns nil — never a whole-table scan. + if r := scan.CommunityCrossingsByKind(nil, communities); r != nil { + t.Fatalf("CommunityCrossingsByKind(nil kinds) = %v, want nil", r) + } + // Empty community map returns nil. + if r := scan.CommunityCrossingsByKind(kinds, nil); r != nil { + t.Fatalf("CommunityCrossingsByKind(nil comm) = %v, want nil", r) + } + // Kind absent from graph yields nil. + if r := scan.CommunityCrossingsByKind([]graph.EdgeKind{graph.EdgeKind("nonexistent")}, communities); r != nil { + t.Fatalf("CommunityCrossingsByKind(nonexistent) = %v, want nil", r) + } +} + +// testNodeIDsByKinds exercises the optional graph.NodeIDsByKinds +// capability. Seeds nodes of several kinds and asserts the +// projection returns just the IDs of the requested kinds, with +// duplicates collapsed and empty input returning nil. 
+func testNodeIDsByKinds(t *testing.T, factory Factory) {
+	t.Helper()
+	store := factory(t)
+	lister, supported := store.(graph.NodeIDsByKinds)
+	if !supported {
+		t.Skip("backend does not implement graph.NodeIDsByKinds")
+	}
+
+	// Fixture: two functions and a method in x.go, a type and a
+	// variable in y.go. Every node's name equals its ID.
+	fixture := []struct {
+		id, file string
+		kind     graph.NodeKind
+	}{
+		{"F1", "x.go", graph.KindFunction},
+		{"F2", "x.go", graph.KindFunction},
+		{"M1", "x.go", graph.KindMethod},
+		{"T1", "y.go", graph.KindType},
+		{"V1", "y.go", graph.KindVariable},
+	}
+	for _, n := range fixture {
+		store.AddNode(mkNode(n.id, n.id, n.file, n.kind))
+	}
+
+	// Asking for functions and methods returns exactly their IDs.
+	callables := lister.NodeIDsByKinds([]graph.NodeKind{graph.KindFunction, graph.KindMethod})
+	sort.Strings(callables)
+	wantCallables := []string{"F1", "F2", "M1"}
+	if fmt.Sprint(callables) != fmt.Sprint(wantCallables) {
+		t.Fatalf("NodeIDsByKinds(Function,Method) = %v, want %v", callables, wantCallables)
+	}
+
+	// A nil or zero-length kind list must come back as nil.
+	ids := lister.NodeIDsByKinds(nil)
+	if ids != nil {
+		t.Fatalf("NodeIDsByKinds(nil) = %v, want nil", ids)
+	}
+	ids = lister.NodeIDsByKinds([]graph.NodeKind{})
+	if ids != nil {
+		t.Fatalf("NodeIDsByKinds(empty) = %v, want nil", ids)
+	}
+
+	// A list holding only blank kinds must also come back as nil.
+	ids = lister.NodeIDsByKinds([]graph.NodeKind{"", ""})
+	if ids != nil {
+		t.Fatalf("NodeIDsByKinds(blank) = %v, want nil", ids)
+	}
+
+	// Repeating a kind in the request must not repeat IDs in the result.
+	deduped := lister.NodeIDsByKinds([]graph.NodeKind{graph.KindFunction, graph.KindFunction})
+	sort.Strings(deduped)
+	wantDeduped := []string{"F1", "F2"}
+	if fmt.Sprint(deduped) != fmt.Sprint(wantDeduped) {
+		t.Fatalf("NodeIDsByKinds(Function,Function) = %v, want %v", deduped, wantDeduped)
+	}
+
+	// A kind with no nodes in the graph yields a zero-length result
+	// (nil and empty are both accepted).
+	if absent := lister.NodeIDsByKinds([]graph.NodeKind{graph.KindInterface}); len(absent) != 0 {
+		t.Fatalf("NodeIDsByKinds(Interface) = %v, want empty", absent)
+	}
+}
+
+// testMemberMethodsByType exercises the optional
+// graph.MemberMethodsByType capability. Seeds a graph with multiple
+// types, their methods, and a non-method EdgeMemberOf edge to verify
+// the kind gate.
+func testMemberMethodsByType(t *testing.T, factory Factory) {
+	t.Helper()
+	store := factory(t)
+	lister, supported := store.(graph.MemberMethodsByType)
+	if !supported {
+		t.Skip("backend does not implement graph.MemberMethodsByType")
+	}
+
+	// Nodes: types T1 and T2, three methods, and one field as noise.
+	nodes := []struct {
+		id, name, file string
+		kind           graph.NodeKind
+	}{
+		{"T1", "T1", "a.go", graph.KindType},
+		{"T2", "T2", "b.go", graph.KindType},
+		{"M1", "Foo", "a.go", graph.KindMethod},
+		{"M2", "Bar", "a.go", graph.KindMethod},
+		{"M3", "Foo", "b.go", graph.KindMethod},
+		{"F1", "Field1", "a.go", graph.KindField},
+	}
+	for _, n := range nodes {
+		store.AddNode(mkNode(n.id, n.name, n.file, n.kind))
+	}
+
+	// Member-of edges: M1 and M2 belong to T1, M3 to T2. The last
+	// edge has a field (non-method) source and must NOT appear.
+	members := [][2]string{{"M1", "T1"}, {"M2", "T1"}, {"M3", "T2"}, {"F1", "T1"}}
+	for _, pair := range members {
+		store.AddEdge(mkEdge(pair[0], pair[1], graph.EdgeMemberOf))
+	}
+
+	byType := lister.MemberMethodsByType()
+
+	// T1 must list exactly the two methods Foo and Bar.
+	t1Rows := byType["T1"]
+	t1Seen := map[string]bool{}
+	for _, row := range t1Rows {
+		t1Seen[row.Name] = true
+	}
+	if !(t1Seen["Foo"] && t1Seen["Bar"]) {
+		t.Fatalf("MemberMethodsByType T1 = %v, want {Foo, Bar}", t1Rows)
+	}
+	if n := len(t1Rows); n != 2 {
+		t.Fatalf("MemberMethodsByType T1 size = %d, want 2", n)
+	}
+
+	// T2 must list exactly one method, Foo.
+	t2Rows := byType["T2"]
+	t2Seen := map[string]bool{}
+	for _, row := range t2Rows {
+		t2Seen[row.Name] = true
+	}
+	if !t2Seen["Foo"] || len(t2Rows) != 1 {
+		t.Fatalf("MemberMethodsByType T2 = %v, want {Foo}", t2Rows)
+	}
+
+	// Every T1 row must carry a non-empty method ID and file path.
+	for _, row := range t1Rows {
+		if row.MethodID == "" || row.FilePath == "" {
+			t.Fatalf("MemberMethodsByType T1 row missing columns: %+v", row)
+		}
+	}
+
+	// A freshly created store with nothing in it must return nil.
+	fresh := factory(t)
+	if res := fresh.(graph.MemberMethodsByType).MemberMethodsByType(); res != nil {
+		t.Fatalf("MemberMethodsByType(empty) = %v, want nil", res)
+	}
+}
+
+// testStructuralParentEdges exercises the optional
+// graph.StructuralParentEdges capability.
Seeds a mix of extends / +// implements / composes edges with varying endpoint kinds. +func testStructuralParentEdges(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.StructuralParentEdges) + if !ok { + t.Skip("backend does not implement graph.StructuralParentEdges") + } + + // Types / interfaces (in-set endpoints). + s.AddNode(mkNode("C1", "Child", "a.go", graph.KindType)) + s.AddNode(mkNode("P1", "Parent", "a.go", graph.KindType)) + s.AddNode(mkNode("I1", "Iface", "a.go", graph.KindInterface)) + // A method (NOT in-set). + s.AddNode(mkNode("M1", "Foo", "a.go", graph.KindMethod)) + + // In-set: type → type extends. + e1 := mkEdge("C1", "P1", graph.EdgeExtends) + e1.Line = 1 + e1.Origin = graph.OriginASTResolved + // In-set: type → interface implements. + e2 := mkEdge("C1", "I1", graph.EdgeImplements) + e2.Line = 2 + e2.Origin = graph.OriginASTInferred + // In-set: type → type composes. + e3 := mkEdge("C1", "P1", graph.EdgeComposes) + e3.Line = 3 + // OUT: extends with a method on one side. + e4 := mkEdge("M1", "P1", graph.EdgeExtends) + e4.Line = 4 + // OUT: irrelevant kind. + e5 := mkEdge("C1", "P1", graph.EdgeCalls) + e5.Line = 5 + for _, e := range []*graph.Edge{e1, e2, e3, e4, e5} { + s.AddEdge(e) + } + + rows := scan.StructuralParentEdges() + if len(rows) != 3 { + t.Fatalf("StructuralParentEdges len = %d, want 3 (rows=%v)", len(rows), rows) + } + // Verify origin propagation on the ast_resolved row. 
+ var sawResolved, sawInferred bool + for _, r := range rows { + if r.FromID != "C1" { + t.Fatalf("unexpected FromID %q in row %v", r.FromID, r) + } + if r.FromKind != graph.KindType { + t.Fatalf("unexpected FromKind %q in row %v", r.FromKind, r) + } + if r.Origin == graph.OriginASTResolved { + sawResolved = true + } + if r.Origin == graph.OriginASTInferred { + sawInferred = true + } + } + if !sawResolved || !sawInferred { + t.Fatalf("origin not propagated: resolved=%v inferred=%v", sawResolved, sawInferred) + } + + // Empty graph returns nil/empty. + empty := factory(t) + if r := empty.(graph.StructuralParentEdges).StructuralParentEdges(); len(r) != 0 { + t.Fatalf("StructuralParentEdges(empty) = %v, want empty", r) + } +} + +// testCrossRepoCandidates exercises the optional +// graph.CrossRepoCandidates capability. Seeds same-repo and +// cross-repo edges and asserts only the distinct, non-empty +// repo-prefix pairs survive. +func testCrossRepoCandidates(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.CrossRepoCandidates) + if !ok { + t.Skip("backend does not implement graph.CrossRepoCandidates") + } + + // Repo A. + s.AddNode(mkRepoNode("A1", "fnA1", "a.go", "repoA", graph.KindFunction)) + s.AddNode(mkRepoNode("A2", "fnA2", "a.go", "repoA", graph.KindFunction)) + // Repo B. + s.AddNode(mkRepoNode("B1", "fnB1", "b.go", "repoB", graph.KindFunction)) + // No repo. + s.AddNode(mkNode("X1", "fnX1", "x.go", graph.KindFunction)) + + // Same-repo calls — must NOT appear. + e1 := mkEdge("A1", "A2", graph.EdgeCalls) + e1.Line = 1 + // Cross-repo call — in. + e2 := mkEdge("A1", "B1", graph.EdgeCalls) + e2.Line = 2 + // Cross-repo implements — in. + e3 := mkEdge("A1", "B1", graph.EdgeImplements) + e3.Line = 3 + // Cross-repo edge but kind not in baseKinds — out. + e4 := mkEdge("A1", "B1", graph.EdgeReferences) + e4.Line = 4 + // Either endpoint missing repo — out. 
+ e5 := mkEdge("A1", "X1", graph.EdgeCalls) + e5.Line = 5 + for _, e := range []*graph.Edge{e1, e2, e3, e4, e5} { + s.AddEdge(e) + } + + kinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeImplements, graph.EdgeExtends} + rows := scan.CrossRepoCandidates(kinds) + if len(rows) != 2 { + t.Fatalf("CrossRepoCandidates len = %d, want 2 (rows=%v)", len(rows), rows) + } + for _, r := range rows { + if r.FromRepo != "repoA" || r.ToRepo != "repoB" { + t.Fatalf("unexpected repos in row %v", r) + } + if r.Edge == nil || r.Edge.From != "A1" || r.Edge.To != "B1" { + t.Fatalf("unexpected edge in row %v", r) + } + } + + // Empty kinds returns nil — never a whole-table scan. + if r := scan.CrossRepoCandidates(nil); r != nil { + t.Fatalf("CrossRepoCandidates(nil) = %v, want nil", r) + } +} + +// testExtractCandidates exercises the optional +// graph.ExtractCandidatesScanner capability. Builds a graph with +// three functions: +// - Long+Hot: long body, 3 distinct callers, 6 distinct callees +// (passes every threshold). +// - Long+Cold: long body, 1 caller, 6 callees (fails minCallers). +// - Short+Hot: short body, 3 callers, 6 callees (fails minLines). +func testExtractCandidates(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.ExtractCandidatesScanner) + if !ok { + t.Skip("backend does not implement graph.ExtractCandidatesScanner") + } + + mk := func(id string, kind graph.NodeKind, start, end int) *graph.Node { + n := mkNode(id, id, "p/a.go", kind) + n.StartLine = start + n.EndLine = end + return n + } + s.AddNode(mk("LongHot", graph.KindFunction, 1, 60)) + s.AddNode(mk("LongCold", graph.KindFunction, 100, 160)) + s.AddNode(mk("ShortHot", graph.KindFunction, 200, 205)) + // Callers + callees as plain function nodes. 
+ for i := 0; i < 6; i++ { + c := mkNode(fmt.Sprintf("C%d", i), fmt.Sprintf("C%d", i), "p/c.go", graph.KindFunction) + s.AddNode(c) + t := mkNode(fmt.Sprintf("T%d", i), fmt.Sprintf("T%d", i), "p/t.go", graph.KindFunction) + s.AddNode(t) + } + // LongHot: 3 distinct callers, 6 distinct callees. + for i := 0; i < 3; i++ { + e := mkEdge(fmt.Sprintf("C%d", i), "LongHot", graph.EdgeCalls) + e.Line = i + 1 + s.AddEdge(e) + } + for i := 0; i < 6; i++ { + e := mkEdge("LongHot", fmt.Sprintf("T%d", i), graph.EdgeCalls) + e.Line = 100 + i + s.AddEdge(e) + } + // LongCold: 1 caller, 6 callees. + e := mkEdge("C0", "LongCold", graph.EdgeCalls) + e.Line = 200 + s.AddEdge(e) + for i := 0; i < 6; i++ { + e := mkEdge("LongCold", fmt.Sprintf("T%d", i), graph.EdgeCalls) + e.Line = 300 + i + s.AddEdge(e) + } + // ShortHot: 3 callers, 6 callees but too short. + for i := 0; i < 3; i++ { + e := mkEdge(fmt.Sprintf("C%d", i), "ShortHot", graph.EdgeCalls) + e.Line = 400 + i + s.AddEdge(e) + } + for i := 0; i < 6; i++ { + e := mkEdge("ShortHot", fmt.Sprintf("T%d", i), graph.EdgeCalls) + e.Line = 500 + i + s.AddEdge(e) + } + + rows := scan.ExtractCandidates( + []graph.EdgeKind{graph.EdgeCalls}, + 20, // minLines + 2, // minCallers + 5, // minFanOut + "", // no prefix + ) + byID := make(map[string]graph.ExtractCandidateRow) + for _, r := range rows { + byID[r.NodeID] = r + } + r, ok := byID["LongHot"] + if !ok { + t.Fatalf("expected LongHot in result, got %v", rows) + } + if r.CallerCount != 3 || r.FanOut != 6 || r.LineCount != 60 { + t.Fatalf("LongHot row mismatch: %+v", r) + } + if _, present := byID["LongCold"]; present { + t.Fatalf("LongCold should have been filtered (caller count < 2)") + } + if _, present := byID["ShortHot"]; present { + t.Fatalf("ShortHot should have been filtered (lines < 20)") + } + + // Path prefix narrows to only LongHot (it's the one in p/a.go; + // LongCold and ShortHot also are in p/a.go so use a prefix that + // doesn't match). 
+ none := scan.ExtractCandidates( + []graph.EdgeKind{graph.EdgeCalls}, 20, 2, 5, "no/such/", + ) + if len(none) != 0 { + t.Fatalf("ExtractCandidates with non-matching prefix = %d, want 0", len(none)) + } + // Empty kinds returns nil. + if r := scan.ExtractCandidates(nil, 0, 0, 0, ""); r != nil { + t.Fatalf("ExtractCandidates(nil kinds) = %v, want nil", r) + } +} + +// testFileSymbolNamesByPaths exercises the optional +// graph.FileSymbolNamesByPaths capability. +func testFileSymbolNamesByPaths(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.FileSymbolNamesByPaths) + if !ok { + t.Skip("backend does not implement graph.FileSymbolNamesByPaths") + } + + s.AddNode(mkNode("Alpha", "Alpha", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Beta", "Beta", "a.go", graph.KindType)) + s.AddNode(mkNode("Gamma", "Gamma", "a.go", graph.KindMethod)) + s.AddNode(mkNode("LowCardField", "LowCardField", "a.go", graph.KindField)) + s.AddNode(mkNode("Delta", "Delta", "b.go", graph.KindFunction)) + + rows := scan.FileSymbolNamesByPaths( + []string{"a.go", "b.go"}, + []graph.NodeKind{graph.KindFunction, graph.KindMethod, graph.KindType, graph.KindInterface}, + ) + byFile := make(map[string]map[string]struct{}) + for _, r := range rows { + seen := byFile[r.FilePath] + if seen == nil { + seen = make(map[string]struct{}) + byFile[r.FilePath] = seen + } + seen[r.Name] = struct{}{} + } + want := map[string]map[string]struct{}{ + "a.go": {"Alpha": {}, "Beta": {}, "Gamma": {}}, + "b.go": {"Delta": {}}, + } + for file, names := range want { + got := byFile[file] + if len(got) != len(names) { + t.Fatalf("file %q: got %v, want %v", file, got, names) + } + for n := range names { + if _, ok := got[n]; !ok { + t.Errorf("file %q: missing name %q (got %v)", file, n, got) + } + } + } + // LowCardField (KindField) must not appear because it's not in + // the requested kinds. 
+ if _, ok := byFile["a.go"]["LowCardField"]; ok { + t.Fatalf("kind filter leaked KindField row") + } + + // Empty paths returns nil. + if r := scan.FileSymbolNamesByPaths(nil, nil); r != nil { + t.Fatalf("FileSymbolNamesByPaths(nil) = %v, want nil", r) + } +} + +// testClassHierarchyTraverser exercises the optional +// graph.ClassHierarchyTraverser capability. +func testClassHierarchyTraverser(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.ClassHierarchyTraverser) + if !ok { + t.Skip("backend does not implement graph.ClassHierarchyTraverser") + } + + s.AddNode(mkNode("Animal", "Animal", "z.go", graph.KindInterface)) + s.AddNode(mkNode("Dog", "Dog", "z.go", graph.KindType)) + s.AddNode(mkNode("Puppy", "Puppy", "z.go", graph.KindType)) + // Dog implements Animal; Puppy extends Dog. + e1 := mkEdge("Dog", "Animal", graph.EdgeImplements) + e1.Line = 1 + s.AddEdge(e1) + e2 := mkEdge("Puppy", "Dog", graph.EdgeExtends) + e2.Line = 2 + s.AddEdge(e2) + + upRows := scan.ClassHierarchyTraverse( + "Puppy", "up", + []graph.EdgeKind{graph.EdgeExtends, graph.EdgeImplements, graph.EdgeComposes}, + 5, + ) + if len(upRows) != 2 { + t.Fatalf("Puppy up: %d rows, want 2 (Dog, Animal). rows=%v", len(upRows), upRows) + } + visited := map[string]bool{} + for _, r := range upRows { + for _, id := range r.Path { + visited[id] = true + } + } + if !visited["Dog"] || !visited["Animal"] { + t.Fatalf("Puppy up: missing Dog or Animal in visited set: %v", visited) + } + downRows := scan.ClassHierarchyTraverse( + "Animal", "down", + []graph.EdgeKind{graph.EdgeExtends, graph.EdgeImplements, graph.EdgeComposes}, + 5, + ) + visited = map[string]bool{} + for _, r := range downRows { + for _, id := range r.Path { + visited[id] = true + } + } + if !visited["Dog"] || !visited["Puppy"] { + t.Fatalf("Animal down: missing Dog or Puppy in visited set: %v", visited) + } + + // Empty kinds / depth=0 / unknown seed must return nil. 
+ if r := scan.ClassHierarchyTraverse("Puppy", "up", nil, 5); r != nil { + t.Fatalf("nil kinds: got %v", r) + } + if r := scan.ClassHierarchyTraverse("Puppy", "up", + []graph.EdgeKind{graph.EdgeExtends}, 0); r != nil { + t.Fatalf("depth=0: got %v", r) + } + if r := scan.ClassHierarchyTraverse("nope", "up", + []graph.EdgeKind{graph.EdgeExtends}, 5); r != nil { + t.Fatalf("unknown seed: got %v", r) + } +} + +// testFileEditingContext exercises the optional +// graph.FileEditingContext capability. +func testFileEditingContext(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.FileEditingContext) + if !ok { + t.Skip("backend does not implement graph.FileEditingContext") + } + // File node + two functions inside it; an importing file with one + // function that calls into the file; a downstream file with a + // function the file's function calls. + s.AddNode(mkNode("a.go", "a.go", "a.go", graph.KindFile)) + s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)) + s.AddNode(mkNode("a.go::Bar", "Bar", "a.go", graph.KindMethod)) + s.AddNode(mkNode("b.go", "b.go", "b.go", graph.KindFile)) + s.AddNode(mkNode("b.go::Caller", "Caller", "b.go", graph.KindFunction)) + s.AddNode(mkNode("c.go::Callee", "Callee", "c.go", graph.KindFunction)) + + // Import edge: a.go imports b.go. + e := mkEdge("a.go", "b.go", graph.EdgeImports) + e.Line = 1 + s.AddEdge(e) + // Caller in b.go calls Foo in a.go. + e = mkEdge("b.go::Caller", "a.go::Foo", graph.EdgeCalls) + e.Line = 2 + s.AddEdge(e) + // Foo in a.go calls Callee in c.go. 
+ e = mkEdge("a.go::Foo", "c.go::Callee", graph.EdgeCalls) + e.Line = 3 + s.AddEdge(e) + + res := scan.FileEditingContext("a.go", []graph.NodeKind{graph.KindFunction, graph.KindMethod}) + if res == nil { + t.Fatalf("FileEditingContext returned nil for a.go") + } + if res.FileNode == nil || res.FileNode.ID != "a.go" { + t.Fatalf("FileNode missing or wrong: %+v", res.FileNode) + } + defineIDs := map[string]bool{} + for _, n := range res.Defines { + defineIDs[n.ID] = true + } + if !defineIDs["a.go::Foo"] || !defineIDs["a.go::Bar"] { + t.Fatalf("defines missing entries: got %v", defineIDs) + } + if len(res.Imports) != 1 || res.Imports[0].To != "b.go" { + t.Fatalf("imports = %v, want one edge a.go→b.go", res.Imports) + } + calledBy := map[string]bool{} + for _, n := range res.CalledBy { + calledBy[n.ID] = true + } + if !calledBy["b.go::Caller"] { + t.Fatalf("called_by missing Caller: %v", calledBy) + } + calls := map[string]bool{} + for _, n := range res.Calls { + calls[n.ID] = true + } + if !calls["c.go::Callee"] { + t.Fatalf("calls missing Callee: %v", calls) + } + + // Empty path returns nil. + if r := scan.FileEditingContext("", nil); r != nil { + t.Fatalf("empty path: got %v, want nil", r) + } +} + +// testNodeDegreeByKinds exercises the optional +// graph.NodeDegreeByKinds capability. +func testNodeDegreeByKinds(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.NodeDegreeByKinds) + if !ok { + t.Skip("backend does not implement graph.NodeDegreeByKinds") + } + s.AddNode(mkNode("Iso", "Iso", "pkg/iso.go", graph.KindFunction)) + s.AddNode(mkNode("Hub", "Hub", "pkg/hub.go", graph.KindFunction)) + s.AddNode(mkNode("Leaf", "Leaf", "pkg/leaf.go", graph.KindMethod)) + s.AddNode(mkNode("Other", "Other", "pkg/other.go", graph.KindType)) + s.AddNode(mkNode("Caller", "Caller", "pkg/caller.go", graph.KindFunction)) + // 2 incoming + 1 outgoing on Hub. 
+	for i, from := range []string{"Caller", "Leaf"} {
+		e := mkEdge(from, "Hub", graph.EdgeCalls)
+		e.Line = i + 1
+		s.AddEdge(e)
+	}
+	e := mkEdge("Hub", "Leaf", graph.EdgeCalls)
+	e.Line = 3
+	s.AddEdge(e)
+
+	rows := scan.NodeDegreeByKinds(
+		[]graph.NodeKind{graph.KindFunction, graph.KindMethod},
+		"",
+	)
+	byID := make(map[string]graph.NodeDegreeRow)
+	for _, r := range rows {
+		byID[r.NodeID] = r
+	}
+	if got := byID["Hub"]; got.InCount != 2 || got.OutCount != 1 {
+		t.Fatalf("Hub: %+v, want in=2 out=1", got)
+	}
+	if got, ok := byID["Iso"]; !ok || got.InCount != 0 || got.OutCount != 0 {
+		t.Fatalf("Iso: ok=%v got=%+v, want in=0 out=0", ok, got)
+	}
+	if _, ok := byID["Other"]; ok {
+		t.Fatalf("Other (KindType) leaked into kind-filtered result")
+	}
+	// Empty kinds returns nil.
+	if r := scan.NodeDegreeByKinds(nil, ""); r != nil {
+		t.Fatalf("NodeDegreeByKinds(nil) = %v, want nil", r)
+	}
+	// Path prefix narrows.
+	rows = scan.NodeDegreeByKinds(
+		[]graph.NodeKind{graph.KindFunction, graph.KindMethod},
+		"pkg/leaf",
+	)
+	if len(rows) != 1 || rows[0].NodeID != "Leaf" {
+		t.Fatalf("pathPrefix scope mismatch: got %v", rows)
+	}
+}
+
+// eqShingles compares two shingle slices position by position. It
+// returns true only when both have the same length and hold the same
+// value at every index, so ordering and repeated values both matter.
+// Two zero-length slices (nil or empty) compare equal.
+func eqShingles(a, b []uint64) bool {
+	n := len(a)
+	if n != len(b) {
+		return false
+	}
+	for i := 0; i < n; i++ {
+		if b[i] != a[i] {
+			return false
+		}
+	}
+	return true
+}
+
+// testCloneShingleSidecar mirrors the FileMtime sidecar conformance:
+// set shingle sets for a few node ids under a repo prefix, Load them
+// back (asserting exact []uint64 equality with order preserved),
+// Delete a subset and re-Load (asserting the gone rows are gone and
+// the survivors untouched), verify repo-prefix scoping isolates rows,
+// and that an empty/absent load returns an empty (non-nil) map, not an
+// error.
Backends that don't implement the capability skip — both the +// in-memory Graph and the SQLite Store do implement it. +func testCloneShingleSidecar(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + w, ok := s.(graph.CloneShingleWriter) + if !ok { + t.Skip("backend does not implement graph.CloneShingleWriter") + } + r, ok := s.(graph.CloneShingleReader) + if !ok { + t.Skip("backend implements CloneShingleWriter but not CloneShingleReader") + } + + // Empty / absent load returns an empty (non-nil) map, not an error. + if got, err := r.LoadCloneShingles("repoA"); err != nil { + t.Fatalf("LoadCloneShingles(empty store): %v", err) + } else if got == nil { + t.Fatalf("LoadCloneShingles(empty store) = nil, want empty non-nil map") + } else if len(got) != 0 { + t.Fatalf("LoadCloneShingles(empty store) = %v, want empty", got) + } + + // Empty input is a no-op. + if err := w.BulkSetCloneShingles("repoA", nil); err != nil { + t.Fatalf("BulkSetCloneShingles(nil): %v", err) + } + + // Write three shingle sets under repoA. Order within each set must + // survive the round-trip, so use non-sorted, repeated-value slices. + want := map[string][]uint64{ + "a.go::Foo": {9, 1, 9, 4, 2}, + "a.go::Bar": {7}, + "b.go::Baz": {0xFFFFFFFFFFFFFFFF, 0, 42}, + } + if err := w.BulkSetCloneShingles("repoA", want); err != nil { + t.Fatalf("BulkSetCloneShingles(repoA): %v", err) + } + + got, err := r.LoadCloneShingles("repoA") + if err != nil { + t.Fatalf("LoadCloneShingles(repoA): %v", err) + } + if len(got) != len(want) { + t.Fatalf("LoadCloneShingles(repoA) len = %d, want %d", len(got), len(want)) + } + for id, ws := range want { + if !eqShingles(got[id], ws) { + t.Fatalf("LoadCloneShingles(repoA)[%q] = %v, want %v (order preserved)", id, got[id], ws) + } + } + + // Overwrite is idempotent in place: re-setting one id replaces it. 
+ if err := w.BulkSetCloneShingles("repoA", map[string][]uint64{"a.go::Bar": {7, 8, 9}}); err != nil { + t.Fatalf("BulkSetCloneShingles(overwrite): %v", err) + } + if got, err := r.LoadCloneShingles("repoA"); err != nil { + t.Fatalf("LoadCloneShingles after overwrite: %v", err) + } else if !eqShingles(got["a.go::Bar"], []uint64{7, 8, 9}) { + t.Fatalf("overwrite not in place: a.go::Bar = %v, want [7 8 9]", got["a.go::Bar"]) + } + + // Deep-copy isolation: mutating the input slice after the write must + // not corrupt stored state, and mutating the returned slice must not + // corrupt the next read. + src := []uint64{1, 2, 3} + if err := w.BulkSetCloneShingles("repoA", map[string][]uint64{"a.go::Foo": src}); err != nil { + t.Fatalf("BulkSetCloneShingles(isolation): %v", err) + } + src[0] = 999 + got2, err := r.LoadCloneShingles("repoA") + if err != nil { + t.Fatalf("LoadCloneShingles(isolation): %v", err) + } + if !eqShingles(got2["a.go::Foo"], []uint64{1, 2, 3}) { + t.Fatalf("input mutation leaked into store: a.go::Foo = %v, want [1 2 3]", got2["a.go::Foo"]) + } + got2["a.go::Foo"][0] = 777 + if got3, _ := r.LoadCloneShingles("repoA"); !eqShingles(got3["a.go::Foo"], []uint64{1, 2, 3}) { + t.Fatalf("returned-slice mutation leaked into store: a.go::Foo = %v, want [1 2 3]", got3["a.go::Foo"]) + } + + // Delete a subset and re-Load — the deleted rows must be gone; the + // survivors untouched. 
+ if err := w.DeleteCloneShingles([]string{"a.go::Bar", "b.go::Baz", "missing::id", ""}); err != nil { + t.Fatalf("DeleteCloneShingles: %v", err) + } + after, err := r.LoadCloneShingles("repoA") + if err != nil { + t.Fatalf("LoadCloneShingles after delete: %v", err) + } + if _, present := after["a.go::Bar"]; present { + t.Fatalf("a.go::Bar still present after delete") + } + if _, present := after["b.go::Baz"]; present { + t.Fatalf("b.go::Baz still present after delete") + } + if !eqShingles(after["a.go::Foo"], []uint64{1, 2, 3}) { + t.Fatalf("survivor a.go::Foo corrupted after delete: %v", after["a.go::Foo"]) + } + + // Empty delete is a no-op. + if err := w.DeleteCloneShingles(nil); err != nil { + t.Fatalf("DeleteCloneShingles(nil): %v", err) + } + + // Repo-prefix scoping: a write under repoB must not surface under + // repoA, and vice versa. + if err := w.BulkSetCloneShingles("repoB", map[string][]uint64{"c.go::Qux": {5, 6}}); err != nil { + t.Fatalf("BulkSetCloneShingles(repoB): %v", err) + } + aRows, err := r.LoadCloneShingles("repoA") + if err != nil { + t.Fatalf("LoadCloneShingles(repoA) after repoB write: %v", err) + } + if _, leaked := aRows["c.go::Qux"]; leaked { + t.Fatalf("repoB row c.go::Qux leaked into repoA scope") + } + bRows, err := r.LoadCloneShingles("repoB") + if err != nil { + t.Fatalf("LoadCloneShingles(repoB): %v", err) + } + if len(bRows) != 1 || !eqShingles(bRows["c.go::Qux"], []uint64{5, 6}) { + t.Fatalf("LoadCloneShingles(repoB) = %v, want {c.go::Qux:[5 6]}", bRows) + } +} + +// testChurnEnrichmentSidecar mirrors the clone-shingle sidecar +// conformance for the churn enrichment capability (change A): write, +// read-all vs read-by-prefix, idempotent overwrite, per-repo isolation, +// and delete. 
+func testChurnEnrichmentSidecar(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + w, ok := s.(graph.ChurnEnrichmentWriter) + if !ok { + t.Skip("backend does not implement graph.ChurnEnrichmentWriter") + } + r, ok := s.(graph.ChurnEnrichmentReader) + if !ok { + t.Skip("backend implements ChurnEnrichmentWriter but not ChurnEnrichmentReader") + } + + // Empty store + empty input are no-ops. + if got := r.ChurnRows("repoA"); len(got) != 0 { + t.Fatalf("ChurnRows(empty store) = %v, want empty", got) + } + if err := w.BulkSetChurn("repoA", nil); err != nil { + t.Fatalf("BulkSetChurn(nil): %v", err) + } + + rowsA := []graph.ChurnEnrichment{ + {NodeID: "a.go", CommitCount: 5, AgeDays: 30, ChurnRate: 1.5, LastAuthor: "x@y", LastCommitAt: "2026-01-01T00:00:00Z", HeadSHA: "abc", Branch: "main", ComputedAt: "2026-06-01T00:00:00Z"}, + {NodeID: "a.go::Foo", CommitCount: 2, AgeDays: 10, ChurnRate: 0.2, LastAuthor: "z@y", LastCommitAt: "2026-02-01T00:00:00Z"}, + } + rowsB := []graph.ChurnEnrichment{ + {NodeID: "b.go::Bar", CommitCount: 9, AgeDays: 90, ChurnRate: 0.1, LastAuthor: "q@y"}, + } + if err := w.BulkSetChurn("repoA", rowsA); err != nil { + t.Fatalf("BulkSetChurn(repoA): %v", err) + } + if err := w.BulkSetChurn("repoB", rowsB); err != nil { + t.Fatalf("BulkSetChurn(repoB): %v", err) + } + + // Per-repo read isolation. + if got := r.ChurnRows("repoA"); len(got) != 2 { + t.Fatalf("ChurnRows(repoA) len = %d, want 2", len(got)) + } + if got := r.ChurnRows("repoB"); len(got) != 1 { + t.Fatalf("ChurnRows(repoB) len = %d, want 1", len(got)) + } + // Empty prefix returns ALL rows across repos. + all := r.ChurnRows("") + if len(all) != 3 { + t.Fatalf("ChurnRows(\"\") len = %d, want 3 (all repos)", len(all)) + } + + // Field round-trip + repo_prefix stamping. 
+ byID := map[string]graph.ChurnEnrichment{} + for _, e := range all { + byID[e.NodeID] = e + } + foo := byID["a.go"] + if foo.RepoPrefix != "repoA" || foo.CommitCount != 5 || foo.ChurnRate != 1.5 || + foo.LastAuthor != "x@y" || foo.LastCommitAt != "2026-01-01T00:00:00Z" || + foo.HeadSHA != "abc" || foo.Branch != "main" { + t.Fatalf("round-trip mismatch for a.go: %+v", foo) + } + + // Idempotent overwrite (INSERT OR REPLACE on node_id). + rowsA[0].CommitCount = 99 + if err := w.BulkSetChurn("repoA", rowsA[:1]); err != nil { + t.Fatalf("BulkSetChurn(overwrite): %v", err) + } + for _, e := range r.ChurnRows("repoA") { + if e.NodeID == "a.go" && e.CommitCount != 99 { + t.Fatalf("overwrite failed: a.go commit_count = %d, want 99", e.CommitCount) + } + } + + // Delete. + if err := w.DeleteChurn([]string{"a.go", "a.go::Foo"}); err != nil { + t.Fatalf("DeleteChurn: %v", err) + } + if got := r.ChurnRows("repoA"); len(got) != 0 { + t.Fatalf("ChurnRows(repoA) after delete = %d, want 0", len(got)) + } + if got := r.ChurnRows("repoB"); len(got) != 1 { + t.Fatalf("DeleteChurn must not touch repoB: len = %d, want 1", len(got)) + } +} + +// testCoverageEnrichmentSidecar mirrors the churn sidecar conformance. 
+func testCoverageEnrichmentSidecar(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + w, ok := s.(graph.CoverageEnrichmentWriter) + if !ok { + t.Skip("backend does not implement graph.CoverageEnrichmentWriter") + } + r, ok := s.(graph.CoverageEnrichmentReader) + if !ok { + t.Skip("backend implements CoverageEnrichmentWriter but not Reader") + } + if got := r.CoverageRows("repoA"); len(got) != 0 { + t.Fatalf("CoverageRows(empty) = %v, want empty", got) + } + if err := w.BulkSetCoverage("repoA", nil); err != nil { + t.Fatalf("BulkSetCoverage(nil): %v", err) + } + rowsA := []graph.CoverageEnrichment{ + {NodeID: "a.go::Foo", CoveragePct: 87.5, NumStmt: 8, Hit: 7}, + {NodeID: "a.go::Bar", CoveragePct: 0, NumStmt: 3, Hit: 0}, + } + rowsB := []graph.CoverageEnrichment{{NodeID: "b.go::Baz", CoveragePct: 100, NumStmt: 1, Hit: 1}} + if err := w.BulkSetCoverage("repoA", rowsA); err != nil { + t.Fatalf("BulkSetCoverage(repoA): %v", err) + } + if err := w.BulkSetCoverage("repoB", rowsB); err != nil { + t.Fatalf("BulkSetCoverage(repoB): %v", err) + } + if got := r.CoverageRows("repoA"); len(got) != 2 { + t.Fatalf("CoverageRows(repoA) = %d, want 2", len(got)) + } + if got := r.CoverageRows(""); len(got) != 3 { + t.Fatalf("CoverageRows(all) = %d, want 3", len(got)) + } + byID := map[string]graph.CoverageEnrichment{} + for _, e := range r.CoverageRows("") { + byID[e.NodeID] = e + } + foo := byID["a.go::Foo"] + if foo.RepoPrefix != "repoA" || foo.CoveragePct != 87.5 || foo.NumStmt != 8 || foo.Hit != 7 { + t.Fatalf("round-trip mismatch: %+v", foo) + } + rowsA[0].CoveragePct = 12.0 + if err := w.BulkSetCoverage("repoA", rowsA[:1]); err != nil { + t.Fatalf("overwrite: %v", err) + } + for _, e := range r.CoverageRows("repoA") { + if e.NodeID == "a.go::Foo" && e.CoveragePct != 12.0 { + t.Fatalf("overwrite failed: %v", e.CoveragePct) + } + } + if err := w.DeleteCoverage([]string{"a.go::Foo", "a.go::Bar"}); err != nil { + t.Fatalf("DeleteCoverage: %v", err) + } + if got := 
r.CoverageRows("repoA"); len(got) != 0 {
+		t.Fatalf("after delete repoA = %d, want 0", len(got))
+	}
+	if got := r.CoverageRows("repoB"); len(got) != 1 {
+		t.Fatalf("delete must not touch repoB: %d", len(got))
+	}
+}
+
+// testReleaseEnrichmentSidecar mirrors the churn/coverage sidecar
+// conformance for the release enrichment capability: write rows under
+// two repo prefixes, read them back per prefix and across all
+// prefixes, check the AddedIn / RepoPrefix round-trip, then delete one
+// repo's rows and confirm the other repo's rows survive. Backends
+// that lack either the writer or the reader interface are skipped.
+func testReleaseEnrichmentSidecar(t *testing.T, factory Factory) {
+	t.Helper()
+	s := factory(t)
+	w, ok := s.(graph.ReleaseEnrichmentWriter)
+	if !ok {
+		t.Skip("backend does not implement graph.ReleaseEnrichmentWriter")
+	}
+	// Checked assertion, like the churn and coverage sidecar tests: a
+	// writer-only backend is skipped instead of panicking the suite.
+	r, ok := s.(graph.ReleaseEnrichmentReader)
+	if !ok {
+		t.Skip("backend implements ReleaseEnrichmentWriter but not ReleaseEnrichmentReader")
+	}
+
+	// Empty input must be accepted without error.
+	if err := w.BulkSetReleases("repoA", nil); err != nil {
+		t.Fatalf("BulkSetReleases(nil): %v", err)
+	}
+
+	// Two rows under repoA, one under repoB.
+	if err := w.BulkSetReleases("repoA", []graph.ReleaseEnrichment{
+		{NodeID: "a.go", AddedIn: "v1.0.0"},
+		{NodeID: "b.go", AddedIn: "v1.2.0"},
+	}); err != nil {
+		t.Fatalf("BulkSetReleases(repoA): %v", err)
+	}
+	if err := w.BulkSetReleases("repoB", []graph.ReleaseEnrichment{{NodeID: "c.go", AddedIn: "v2.0.0"}}); err != nil {
+		t.Fatalf("BulkSetReleases(repoB): %v", err)
+	}
+
+	// A prefix scopes the read; the empty prefix returns every repo's rows.
+	if got := r.ReleaseRows("repoA"); len(got) != 2 {
+		t.Fatalf("ReleaseRows(repoA) = %d, want 2", len(got))
+	}
+	if got := r.ReleaseRows(""); len(got) != 3 {
+		t.Fatalf("ReleaseRows(all) = %d, want 3", len(got))
+	}
+
+	// Field round-trip, including the repo prefix stamped on the row.
+	byID := map[string]graph.ReleaseEnrichment{}
+	for _, e := range r.ReleaseRows("") {
+		byID[e.NodeID] = e
+	}
+	if byID["a.go"].AddedIn != "v1.0.0" || byID["a.go"].RepoPrefix != "repoA" {
+		t.Fatalf("round-trip mismatch: %+v", byID["a.go"])
+	}
+
+	// Deleting repoA's node IDs empties repoA and leaves repoB alone.
+	if err := w.DeleteReleases([]string{"a.go", "b.go"}); err != nil {
+		t.Fatalf("DeleteReleases: %v", err)
+	}
+	if got := r.ReleaseRows("repoA"); len(got) != 0 {
+		t.Fatalf("after delete repoA = %d, want 0", len(got))
+	}
+	if got := r.ReleaseRows("repoB"); len(got) != 1 {
+		t.Fatalf("delete must not touch repoB: %d", len(got))
+	}
+}
+
+// testBlameEnrichmentSidecar mirrors the other enrichment sidecars.
+func testBlameEnrichmentSidecar(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + w, ok := s.(graph.BlameEnrichmentWriter) + if !ok { + t.Skip("backend does not implement graph.BlameEnrichmentWriter") + } + r := s.(graph.BlameEnrichmentReader) + if err := w.BulkSetBlame("repoA", nil); err != nil { + t.Fatalf("BulkSetBlame(nil): %v", err) + } + if err := w.BulkSetBlame("repoA", []graph.BlameEnrichment{ + {NodeID: "a.go::Foo", Commit: "abc", Email: "x@y", Timestamp: 1700000000}, + {NodeID: "a.go::Bar", Commit: "def", Email: "z@y", Timestamp: 1700001000}, + }); err != nil { + t.Fatalf("BulkSetBlame(repoA): %v", err) + } + if err := w.BulkSetBlame("repoB", []graph.BlameEnrichment{{NodeID: "b.go::Baz", Commit: "ghi", Email: "q@y", Timestamp: 1700002000}}); err != nil { + t.Fatalf("BulkSetBlame(repoB): %v", err) + } + if got := r.BlameRows("repoA"); len(got) != 2 { + t.Fatalf("BlameRows(repoA) = %d, want 2", len(got)) + } + if got := r.BlameRows(""); len(got) != 3 { + t.Fatalf("BlameRows(all) = %d, want 3", len(got)) + } + byID := map[string]graph.BlameEnrichment{} + for _, e := range r.BlameRows("") { + byID[e.NodeID] = e + } + foo := byID["a.go::Foo"] + if foo.RepoPrefix != "repoA" || foo.Commit != "abc" || foo.Email != "x@y" || foo.Timestamp != 1700000000 { + t.Fatalf("round-trip mismatch: %+v", foo) + } + if err := w.DeleteBlame([]string{"a.go::Foo", "a.go::Bar"}); err != nil { + t.Fatalf("DeleteBlame: %v", err) + } + if got := r.BlameRows("repoA"); len(got) != 0 { + t.Fatalf("after delete repoA = %d, want 0", len(got)) + } + if got := r.BlameRows("repoB"); len(got) != 1 { + t.Fatalf("delete must not touch repoB: %d", len(got)) + } +} diff --git a/internal/graph/stub.go b/internal/graph/stub.go new file mode 100644 index 00000000..df813913 --- /dev/null +++ b/internal/graph/stub.go @@ -0,0 +1,238 @@ +package graph + +import "strings" + +// Stub-node identifier conventions. 
//
// A "stub" is a placeholder Node the resolver materialises for a
// symbol the indexer can see referenced but not defined in the
// current repo's source: a stdlib call, a language builtin, an
// external module import, etc. Stubs let the graph hold edges
// to "external" targets uniformly with edges to first-party
// nodes.
//
// Format (all stubs):
//
//	<repoPrefix>::<kind>::<rest>
//
// where:
//
//	repoPrefix — the owning repo's RepoPrefix (Indexer.RepoPrefix).
//	             Empty only when the stub is created outside a
//	             per-repo context (legacy single-repo daemons).
//	kind       — one of: stdlib, builtin, external_call, module.
//	rest       — kind-specific (e.g. "fmt::Errorf" for stdlib).
//
// Why per-repo? Two repos pinned to different language SDK
// versions have semantically distinct stdlib symbols. Go 1.21's
// `min` is a builtin; in 1.20 it isn't. A global `builtin::go::min`
// node would conflate them and produce wrong cross-repo edges.
// Per-repo prefix keeps them as distinct nodes; a future
// "same-as" edge can union them when the workspace knows the
// versions actually match.
const (
	StubKindStdlib       = "stdlib"
	StubKindBuiltin      = "builtin"
	StubKindExternalCall = "external_call"
	StubKindModule       = "module"
)

// StubID composes a stub identifier with the per-repo prefix.
// Pass repoPrefix = "" when the caller is outside a per-repo
// context (single-repo daemons that haven't set a prefix).
//
// The result is repoPrefix (when non-empty), kind, and every
// element of parts joined with "::".
func StubID(repoPrefix, kind string, parts ...string) string {
	var b strings.Builder
	if repoPrefix != "" {
		b.WriteString(repoPrefix)
		b.WriteString("::")
	}
	b.WriteString(kind)
	for _, p := range parts {
		b.WriteString("::")
		b.WriteString(p)
	}
	return b.String()
}

// IsStub reports whether id is any stub kind. Cheaper than
// StubKind when callers only need a yes/no.
func IsStub(id string) bool {
	return StubKind(id) != ""
}

// StubKind extracts the stub category (stdlib / builtin /
// external_call / module) from id. Returns "" if id is not a
// stub.
//
// Format dispatch:
//   - "<kind>::<rest>"               — legacy, no repo prefix
//   - "<repoPrefix>::<kind>::<rest>" — per-repo prefix
//
// We match by looking for one of the known kind segments
// anywhere in the first two "::"-separated positions.
func StubKind(id string) string {
	for _, k := range stubKinds {
		// Without repo prefix: "<kind>::..."
		if strings.HasPrefix(id, k+"::") {
			return k
		}
	}
	// With repo prefix: "<repoPrefix>::<kind>::..."
	// Find the second "::" segment.
	first := strings.Index(id, "::")
	if first < 0 {
		return ""
	}
	rest := id[first+2:]
	for _, k := range stubKinds {
		if strings.HasPrefix(rest, k+"::") {
			return k
		}
	}
	return ""
}

// stubKinds is the closed set of stub categories. Ordered by
// expected frequency so the lookup loop bails early in the
// common case.
var stubKinds = []string{
	StubKindStdlib,
	StubKindExternalCall,
	StubKindBuiltin,
	StubKindModule,
}

// IsStdlibStub etc are convenience predicates that don't make
// the caller compare StubKind's return against a literal.
func IsStdlibStub(id string) bool       { return StubKind(id) == StubKindStdlib }
func IsBuiltinStub(id string) bool      { return StubKind(id) == StubKindBuiltin }
func IsExternalCallStub(id string) bool { return StubKind(id) == StubKindExternalCall }
func IsModuleStub(id string) bool       { return StubKind(id) == StubKindModule }

// StubRest returns the kind-specific tail of a stub id (the
// portion after "<repoPrefix>::<kind>::" or "<kind>::"). Returns "" if
// id is not a stub. Useful for the "fmt::Errorf" portion of a
// stdlib stub when callers need to inspect the symbol identity.
//
// The kind segment is located at the same position StubKind matched
// it (start of id, or right after the first "::"), never by a
// substring search: a repo prefix that happens to end in a kind name
// (e.g. "node-module::module::left-pad") would otherwise be cut in
// the middle of the prefix.
func StubRest(id string) string {
	kind := StubKind(id)
	if kind == "" {
		return ""
	}
	prefix := kind + "::"
	// Legacy form: the kind is the first segment.
	if strings.HasPrefix(id, prefix) {
		return id[len(prefix):]
	}
	// Per-repo form: StubKind matched the kind directly after the
	// first "::", so the tail starts after that segment.
	first := strings.Index(id, "::")
	if first < 0 {
		return ""
	}
	tail := id[first+2:]
	if strings.HasPrefix(tail, prefix) {
		return tail[len(prefix):]
	}
	return ""
}

// UnresolvedMarker is the prefix the extractor emits for a call/
// reference target the resolver still needs to bind to a concrete
// Node.
+// +// Forms: +// +// unresolved::Name — legacy / single-repo +// ::unresolved::Name — multi-repo COPY rewrite (in +// copyBulkLocked, to dodge +// cross-repo PK collisions) +// +// IsUnresolvedTarget / UnresolvedName / UnresolvedRepoPrefix +// normalise over both shapes so callers (resolver, MCP filters, +// data-flow tracker) don't have to know the encoding. +const UnresolvedMarker = "unresolved::" + +// IsUnresolvedTarget reports whether id names an unresolved +// extractor stub in either the bare or the multi-repo form. +func IsUnresolvedTarget(id string) bool { + if id == "" { + return false + } + if strings.HasPrefix(id, UnresolvedMarker) { + return true + } + return strings.Contains(id, "::"+UnresolvedMarker) +} + +// UnresolvedName returns the bare symbol name encoded in an +// unresolved target id, stripping the `unresolved::` prefix (and +// any leading `::`). Returns "" when id is not an +// unresolved stub. +func UnresolvedName(id string) string { + if id == "" { + return "" + } + if strings.HasPrefix(id, UnresolvedMarker) { + return id[len(UnresolvedMarker):] + } + idx := strings.Index(id, "::"+UnresolvedMarker) + if idx < 0 { + return "" + } + return id[idx+len("::"+UnresolvedMarker):] +} + +// UnresolvedRepoPrefix returns the per-repo prefix encoded in an +// unresolved target id, or "" if the id is bare or not an +// unresolved stub. +func UnresolvedRepoPrefix(id string) string { + if id == "" || strings.HasPrefix(id, UnresolvedMarker) { + return "" + } + idx := strings.Index(id, "::"+UnresolvedMarker) + if idx <= 0 { + return "" + } + return id[:idx] +} + +// StubRepoPrefix returns the per-repo prefix of a stub id, or +// "" if the id has no prefix or isn't a stub. +func StubRepoPrefix(id string) string { + kind := StubKind(id) + if kind == "" { + return "" + } + // If id starts with the kind directly, there's no repo prefix. 
+ if strings.HasPrefix(id, kind+"::") { + return "" + } + if idx := strings.Index(id, "::"); idx > 0 { + return id[:idx] + } + return "" +} + +// IsResolvableRefEdge reports whether an edge of this kind is a +// symbol-level reference that the resolver binds from an +// `unresolved::` stub — calls, references, value reads/writes, +// type positions (typed_as / returns), and type hierarchy +// (implements / extends / composes / instantiates). These are the edges +// that must survive a definition's re-index as pending stubs rather than +// be dropped wholesale. Structural edges (contains / defines / member_of +// / imports / param_of) and enrichment edges (tests / provides / spawns +// / annotated / …) are not name-resolved and are excluded — re-stubbing +// them would only create edges nothing ever rebinds. +func IsResolvableRefEdge(k EdgeKind) bool { + switch k { + case EdgeCalls, EdgeReferences, EdgeReads, EdgeWrites, + EdgeTypedAs, EdgeReturns, EdgeInstantiates, + EdgeImplements, EdgeExtends, EdgeComposes: + return true + } + return false +} + +// IsReferenceableSymbol reports whether a node of this kind can be the +// target of a cross-file symbol reference — and thus the subject of +// reverse resolution by name. Excludes files, imports, packages, +// params, closures, locals, builtins, generic params, and the +// coverage / infra node kinds, none of which a caller binds to by bare +// name from an unresolved stub. 
+func IsReferenceableSymbol(k NodeKind) bool { + switch k { + case KindFunction, KindMethod, KindType, KindInterface, + KindVariable, KindConstant, KindField, KindEnumMember: + return true + } + return false +} diff --git a/internal/graph/unresolved_helpers_test.go b/internal/graph/unresolved_helpers_test.go new file mode 100644 index 00000000..bf494a54 --- /dev/null +++ b/internal/graph/unresolved_helpers_test.go @@ -0,0 +1,45 @@ +package graph + +import "testing" + +// TestUnresolvedHelpers locks in the multi-repo unresolved target +// normalisation: a literal `unresolved::Foo` (legacy single-repo) and +// a per-repo `gortex::unresolved::Foo` (multi-repo COPY rewrite) must +// both be recognised by IsUnresolvedTarget and decoded to "Foo" by +// UnresolvedName. Pre-fix, every caller used strings.HasPrefix on the +// literal form, which silently missed the prefixed form and left +// every multi-repo call edge dangling. +func TestUnresolvedHelpers(t *testing.T) { + t.Parallel() + + cases := []struct { + id string + isU bool + name string + prefix string + }{ + // Legacy / single-repo form + {"unresolved::AddNode", true, "AddNode", ""}, + {"unresolved::*.Foo", true, "*.Foo", ""}, + {"unresolved::import::fmt", true, "import::fmt", ""}, + // Multi-repo COPY-rewrite form + {"gortex::unresolved::AddNode", true, "AddNode", "gortex"}, + {"tree-sitter-dart::unresolved::ACCEPT_TOKEN", true, "ACCEPT_TOKEN", "tree-sitter-dart"}, + // Non-stubs + {"gortex/internal/graph/graph.go::Graph.AddNode", false, "", ""}, + {"", false, "", ""}, + {"stdlib::fmt::Errorf", false, "", ""}, + {"gortex::stdlib::fmt::Errorf", false, "", ""}, + } + for _, c := range cases { + if got := IsUnresolvedTarget(c.id); got != c.isU { + t.Errorf("IsUnresolvedTarget(%q) = %v, want %v", c.id, got, c.isU) + } + if got := UnresolvedName(c.id); got != c.name { + t.Errorf("UnresolvedName(%q) = %q, want %q", c.id, got, c.name) + } + if got := UnresolvedRepoPrefix(c.id); got != c.prefix { + 
t.Errorf("UnresolvedRepoPrefix(%q) = %q, want %q", c.id, got, c.prefix) + } + } +} diff --git a/internal/hooks/probe_e2e_test.go b/internal/hooks/probe_e2e_test.go index 9f54422a..5799f0a4 100644 --- a/internal/hooks/probe_e2e_test.go +++ b/internal/hooks/probe_e2e_test.go @@ -38,6 +38,21 @@ func (f *fakeController) Shutdown(_ context.Context) error { return nil } func (f *fakeController) SearchSymbols(_ context.Context, _ daemon.SearchSymbolsParams) (daemon.SearchSymbolsResult, error) { return daemon.SearchSymbolsResult{Hits: f.hits}, nil } +func (f *fakeController) EnrichChurn(_ context.Context, _ daemon.EnrichChurnParams) (daemon.EnrichChurnResult, error) { + return daemon.EnrichChurnResult{}, nil +} +func (f *fakeController) EnrichReleases(_ context.Context, _ daemon.EnrichReleasesParams) (daemon.EnrichReleasesResult, error) { + return daemon.EnrichReleasesResult{}, nil +} +func (f *fakeController) EnrichBlame(_ context.Context, _ daemon.EnrichBlameParams) (daemon.EnrichBlameResult, error) { + return daemon.EnrichBlameResult{}, nil +} +func (f *fakeController) EnrichCoverage(_ context.Context, _ daemon.EnrichCoverageParams) (daemon.EnrichCoverageResult, error) { + return daemon.EnrichCoverageResult{}, nil +} +func (f *fakeController) EnrichCochange(_ context.Context, _ daemon.EnrichCochangeParams) (daemon.EnrichCochangeResult, error) { + return daemon.EnrichCochangeResult{}, nil +} // startTestDaemon spins up a real daemon on a short-path unix socket and // points GORTEX_DAEMON_SOCKET at it so daemon.Dial finds it. diff --git a/internal/hooks/telemetry.go b/internal/hooks/telemetry.go index aa4aedbf..775d2d5b 100644 --- a/internal/hooks/telemetry.go +++ b/internal/hooks/telemetry.go @@ -32,7 +32,7 @@ type hookDecision struct { } // hookDecisionsPath returns the telemetry file path. Respects GORTEX_HOOK_LOG -// so tests can redirect writes. Defaults to ~/.cache/gortex (or the +// so tests can redirect writes. 
Defaults to ~/.gortex/cache (or the // $XDG_CACHE_HOME equivalent when that variable is set). func hookDecisionsPath() string { if p := os.Getenv("GORTEX_HOOK_LOG"); p != "" { diff --git a/internal/indexer/clone_incremental.go b/internal/indexer/clone_incremental.go new file mode 100644 index 00000000..4f8c4739 --- /dev/null +++ b/internal/indexer/clone_incremental.go @@ -0,0 +1,308 @@ +package indexer + +import ( + "sync" + + "github.com/zzet/gortex/internal/clones" + "github.com/zzet/gortex/internal/graph" +) + +// incrementalCloneIndex maintains the clone-detection state (CMS + +// length-stratified LSH) live across single-file edits so a (re)index of +// one file updates EdgeSimilarTo edges in O(edited file) instead of the +// whole-graph detectClonesAndEmitEdges recompute. It is the steady-state +// counterpart of the batch pass: the batch pass re-baselines (corrects CMS +// drift) and runs diffusion; this index keeps the direct similar_to edges +// in step between batch passes. +// +// Source of truth in-session is the in-memory shingles cache; the durable +// copy lives in the CloneShingle* sidecar so Rebuild can reseed the CMS +// after a warm restart without re-parsing. Signatures are computed through +// the same kernel the batch pass uses (computeCloneSigFromShingles), so at +// a given corpus the incremental and batch edge sets are identical. +// +// It is NOT goroutine-safe beyond its own mutex — every method takes the +// lock — and is driven under the indexer's write path (one goroutine at a +// time), the same single-writer discipline the underlying clones.CMS / +// clones.StratifiedIndex assume. +type incrementalCloneIndex struct { + mu sync.Mutex + cms *clones.CMS + lsh *clones.StratifiedIndex + shingles map[string][]uint64 // node id -> raw shingle set (cache) + corpus int + built bool +} + +// newIncrementalCloneIndex returns an empty, un-built index. 
built stays +// false until a batch pass or Rebuild seeds it from the graph / sidecar; +// while un-built the indexer falls back to the whole-graph clone pass. +func newIncrementalCloneIndex() *incrementalCloneIndex { + return &incrementalCloneIndex{ + cms: clones.NewCMS(65536, 4), + lsh: clones.NewStratifiedIndex(), + shingles: make(map[string][]uint64), + } +} + +// tokensFromMeta reads a node's stamped normalised-token count, tolerating +// the int / int64 / float64 shapes a backend round-trip may produce. +// Mirrors the switch in detectClonesAndEmitEdgesCtx so the LSH length +// classes match the batch pass. +func tokensFromMeta(n *graph.Node) int { + if n == nil || n.Meta == nil { + return 0 + } + switch v := n.Meta[cloneTokensMetaKey].(type) { + case int: + return v + case int64: + return int(v) + case float64: + return int(v) + } + return 0 +} + +// cloneFuncNodes filters a node slice to the function/method nodes that +// participate in clone detection. +func cloneFuncNodes(nodes []*graph.Node) []*graph.Node { + out := make([]*graph.Node, 0, len(nodes)) + for _, n := range nodes { + if n == nil { + continue + } + if n.Kind == graph.KindFunction || n.Kind == graph.KindMethod { + out = append(out, n) + } + } + return out +} + +// Rebuild resets the index and reseeds it from the graph's current +// signatures plus the persisted shingle sidecar. It is the warmup / +// post-batch / warm-restart path: after the whole-graph clone pass has +// stamped clone_sig on the surviving bodies (and finaliseCloneSignatures +// has persisted clone_shingles for EVERY eligible body — survivors and +// boilerplate-dropped alike — to the sidecar), Rebuild walks this repo's +// bodies, rebuilds the CMS + corpus from the persisted shingles, banks +// each surviving signature into the live LSH index, and marks built=true +// so subsequent edits go incremental. 
+// +// The CMS and corpus MUST mirror finaliseCloneSignatures' bodies set: that +// pass builds its CMS and useFilter/threshold from ALL eligible bodies +// (every func/method node that had clone_shingles), including the ones it +// then drops as boilerplate-dominated (no clone_sig). Seeding the CMS / +// corpus only from survivors (clone_sig present) would under-count the +// sketch and shrink the corpus, so the incremental path would filter +// against a different threshold than the batch finalise and stamp +// different signatures on the edited file. We therefore seed CMS + corpus +// from every body with persisted shingles and gate ONLY the LSH Add on a +// decodable clone_sig (survivors). This makes Rebuild's CMS/corpus +// byte-match what the batch finalise produced. +// +// Repo-scoped: it walks AllNodes filtered to n.RepoPrefix == repoPrefix so +// each per-repo index's corpus counts only that repo's bodies — matching +// its repo-scoped LoadCloneShingles seed. An unfiltered AllNodes walk would +// count every repo's bodies into a single repo's corpus and skew its +// threshold. (GetRepoNodes can't be used here: in single-repo / in-memory +// mode repoPrefix is "" and nodes with an empty RepoPrefix are not tracked +// in the byRepo buckets GetRepoNodes reads, so GetRepoNodes("") is always +// empty — the AllNodes+filter form is the one that works for both regimes, +// since "" == "" matches every node.) +// +// Tolerant of a missing/partial sidecar: a body with a clone_sig but no +// persisted shingle row still enters the LSH index (so its edges are +// maintained) — that body just contributes nothing to the CMS / corpus, +// which at the re-baseline corpus is corrected at the next batch pass. 
+func (ci *incrementalCloneIndex) Rebuild(g graph.Store, repoPrefix string) { + if ci == nil || g == nil { + return + } + ci.mu.Lock() + defer ci.mu.Unlock() + + ci.cms = clones.NewCMS(65536, 4) + ci.lsh = clones.NewStratifiedIndex() + ci.shingles = make(map[string][]uint64) + ci.corpus = 0 + + var load map[string][]uint64 + if r, ok := g.(graph.CloneShingleReader); ok { + if rows, err := r.LoadCloneShingles(repoPrefix); err == nil { + load = rows + } + } + + for _, n := range g.AllNodes() { + if n == nil { + continue + } + if n.RepoPrefix != repoPrefix { + continue + } + if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { + continue + } + // Seed CMS + corpus from every eligible body that has persisted + // shingles — survivors AND boilerplate-dropped bodies — so the + // sketch and corpus mirror finaliseCloneSignatures' bodies set. + sh := load[n.ID] + if len(sh) > 0 { + for _, s := range sh { + ci.cms.Add(s) + } + ci.shingles[n.ID] = sh + ci.corpus++ + } + // Only survivors (a decodable clone_sig) enter the LSH index — + // dropped bodies have no signature and never produce edges. + if n.Meta == nil { + continue + } + enc, ok := n.Meta[cloneSigMetaKey].(string) + if !ok || enc == "" { + continue + } + sig, ok := clones.DecodeSignature(enc) + if !ok { + continue + } + ci.lsh.Add(clones.Item{ID: n.ID, Sig: sig, TokenCount: tokensFromMeta(n)}) + } + ci.built = true +} + +// EvictFuncs removes a set of function/method nodes from the index: it +// decrements their shingles out of the CMS, drops them from the LSH index +// and the in-memory cache, and deletes their rows from the persisted +// sidecar. Called with the OLD function ids of a file just before that +// file's fresh nodes are added (UpdateFuncs), so a re-index is an +// evict-then-add of only the edited file's bodies. 
+func (ci *incrementalCloneIndex) EvictFuncs(g graph.Store, ids []string) { + if ci == nil || len(ids) == 0 { + return + } + ci.mu.Lock() + defer ci.mu.Unlock() + for _, id := range ids { + sh, ok := ci.shingles[id] + if !ok { + // Not a tracked clone body (no signature / never added) — + // still remove from the LSH index in case it was banked, + // then move on. + ci.lsh.Remove(id) + continue + } + for _, s := range sh { + ci.cms.Decrement(s) + } + delete(ci.shingles, id) + ci.lsh.Remove(id) + ci.corpus-- + } + if w, ok := g.(graph.CloneShingleWriter); ok { + _ = w.DeleteCloneShingles(ids) + } +} + +// UpdateFuncs banks the freshly-parsed function/method nodes of one file +// into the index and emits the EdgeSimilarTo edges their signatures imply. +// funcNodes carry the raw shingle set on Meta (cloneShinglesMetaKey, +// stamped by applyCloneSignatures during parse) — this method computes +// their signatures through the same kernel the batch pass uses, so the two +// paths agree exactly. +// +// Two phases. First every new body's shingles are folded into the CMS, +// cached, persisted, and the corpus count bumped — so the boilerplate +// threshold the signature kernel sees reflects the new corpus, matching +// finaliseCloneSignatures. Then each body's signature is computed, stamped +// on the node, banked into the LSH index, and queried for clone pairs; +// surviving pairs are materialised as symmetric EdgeSimilarTo edges (both +// directions, mirroring detectClonesAndEmitEdgesCtx). +func (ci *incrementalCloneIndex) UpdateFuncs(g graph.Store, repoPrefix string, funcNodes []*graph.Node, threshold float64) { + if ci == nil || g == nil { + return + } + ci.mu.Lock() + defer ci.mu.Unlock() + + // Phase 1: fold every new body into the CMS + cache + sidecar and + // bump the corpus count, so the boilerplate gate below sees the same + // corpus the batch finalise would. 
+ rows := make(map[string][]uint64) + type pending struct { + node *graph.Node + shingles []uint64 + } + var todo []pending + for _, n := range funcNodes { + if n == nil || n.Meta == nil { + continue + } + if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { + continue + } + sh, ok := n.Meta[cloneShinglesMetaKey].([]uint64) + if !ok { + continue + } + for _, s := range sh { + ci.cms.Add(s) + } + ci.shingles[n.ID] = sh + ci.corpus++ + rows[n.ID] = sh + todo = append(todo, pending{node: n, shingles: sh}) + } + if w, ok := g.(graph.CloneShingleWriter); ok && len(rows) > 0 { + _ = w.BulkSetCloneShingles(repoPrefix, rows) + } + + // Corpus-based gate, matching finaliseCloneSignatures exactly. + useFilter := ci.corpus >= cmsMinCorpus + var thr uint32 + if useFilter { + thr = uint32(float64(ci.corpus) * cmsBoilerplateRatio) + if thr < 1 { + thr = 1 + } + } + + // Phase 2: compute each signature, stamp it, bank it into the LSH + // index, and remember the banked Item so we can query for pairs once + // every new body is in the index. clone_shingles is removed from Meta + // (the sidecar holds the durable copy) — mirrors finalise. + added := make([]clones.Item, 0, len(todo)) + for _, p := range todo { + n := p.node + sig, ok := computeCloneSigFromShingles(ci.cms, thr, useFilter, p.shingles) + delete(n.Meta, cloneShinglesMetaKey) + if !ok { + delete(n.Meta, cloneSigMetaKey) + continue + } + n.Meta[cloneSigMetaKey] = clones.EncodeSignature(sig) + item := clones.Item{ID: n.ID, Sig: sig, TokenCount: tokensFromMeta(n)} + ci.lsh.Add(item) + added = append(added, item) + } + + // Emit edges for every clone pair touching a newly-added body. Both + // endpoints are looked up and a symmetric EdgeSimilarTo pair is + // emitted, mirroring detectClonesAndEmitEdgesCtx's emit. AddEdge + // dedupes by edge key, so a pair surfaced from both of its endpoints + // (when two new bodies in the same file are clones of each other) + // collapses to one symmetric pair. 
+ for _, item := range added { + for _, p := range ci.lsh.QueryPairs(item, threshold) { + from := g.GetNode(p.A) + to := g.GetNode(p.B) + if from == nil || to == nil { + continue + } + emitSimilarEdge(g, from, to, p.Similarity) + emitSimilarEdge(g, to, from, p.Similarity) + } + } +} diff --git a/internal/indexer/clone_incremental_test.go b/internal/indexer/clone_incremental_test.go new file mode 100644 index 00000000..b27aeaa8 --- /dev/null +++ b/internal/indexer/clone_incremental_test.go @@ -0,0 +1,378 @@ +package indexer + +import ( + "fmt" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/clones" + "github.com/zzet/gortex/internal/graph" +) + +// Three small Go files holding cross-file near-duplicate (Type-2) function +// pairs: every identifier is renamed but the control flow is identical, so +// MinHash + LSH flags them as clones and emits EdgeSimilarTo. The shapes +// are deliberately spread across files so the incremental path exercises +// cross-file pair emission (UpdateFuncs querying the live LSH index, not +// just within-file pairs). 
+ +const cloneIncFileA = `package main + +func sumActiveItems(items []Item) int { + total := 0 + for i := 0; i < len(items); i++ { + if items[i].Active { + total += items[i].Weight * factor + } else { + total -= items[i].Penalty + } + } + if total < 0 { + total = 0 + } + return total +} + +func parseAndValidate(input string) (string, error) { + parts := splitOnComma(input) + if len(parts) == 0 { + return "", errEmpty + } + first := parts[0] + if first == "" { + return "", errBlank + } + return normalize(first), nil +} +` + +const cloneIncFileB = `package main + +func sumEnabledRecords(records []Record) int { + sum := 0 + for idx := 0; idx < len(records); idx++ { + if records[idx].Enabled { + sum += records[idx].Score * multiplier + } else { + sum -= records[idx].Fine + } + } + if sum < 0 { + sum = 0 + } + return sum +} + +func openAndScanRows(conn *Conn, statement string) error { + rows, err := conn.Query(statement) + if err != nil { + return wrap(err, "query failed") + } + defer rows.Close() + for rows.Next() { + var name string + if scanErr := rows.Scan(&name); scanErr != nil { + return scanErr + } + } + return rows.Err() +} +` + +const cloneIncFileC = `package main + +func decodeAndCheck(payload string) (string, error) { + segments := splitOnComma(payload) + if len(segments) == 0 { + return "", errEmpty + } + head := segments[0] + if head == "" { + return "", errBlank + } + return normalize(head), nil +} +` + +// writeCloneIncFixture writes the three-file fixture into dir and returns +// the absolute paths in a stable order. +func writeCloneIncFixture(t *testing.T, dir string) []string { + t.Helper() + a := filepath.Join(dir, "a.go") + b := filepath.Join(dir, "b.go") + c := filepath.Join(dir, "c.go") + writeFile(t, a, cloneIncFileA) + writeFile(t, b, cloneIncFileB) + writeFile(t, c, cloneIncFileC) + return []string{a, b, c} +} + +// similarEdgeSet returns the EdgeSimilarTo {From,To} directed-edge set. 
+func similarEdgeSet(g graph.Store) map[[2]string]struct{} { + set := make(map[[2]string]struct{}) + for _, e := range g.AllEdges() { + if e.Kind == graph.EdgeSimilarTo { + set[[2]string{e.From, e.To}] = struct{}{} + } + } + return set +} + +// TestCloneIncremental_MatchesBatch is the equivalence test: the +// EdgeSimilarTo set produced by the whole-graph batch clone pass must be +// IDENTICAL to the set produced by driving the incremental maintainer +// (EvictFuncs/UpdateFuncs) over the same files one at a time. At this small +// scale the CMS is identical between the two paths (no boilerplate +// filtering kicks in below cmsMinCorpus) so there is zero drift, making +// exact set equality the correct assertion. +func TestCloneIncremental_MatchesBatch(t *testing.T) { + dir := t.TempDir() + files := writeCloneIncFixture(t, dir) + require.Greater(t, len(files), 1, "fixture must be multi-file") + + // (a) Batch path: full cold index on graph A. + gA := graph.New() + idxA := newTestIndexer(gA) + _, err := idxA.Index(dir) + require.NoError(t, err) + batch := similarEdgeSet(gA) + require.GreaterOrEqual(t, len(batch), 1, "fixture must produce >=1 EdgeSimilarTo (non-vacuity)") + + // (b) Incremental path: fresh graph B. The full Index() seeds the + // incremental clone index (IndexCtx calls Rebuild at the end → + // built=true). Re-indexing each file then drives EvictFuncs + + // UpdateFuncs through the incremental maintainer. 
+ gB := graph.New() + idxB := newTestIndexer(gB) + _, err = idxB.Index(dir) + require.NoError(t, err) + require.True(t, idxB.cloneIndex.built, "incremental clone index must be built after full Index()") + + for _, f := range files { + require.NoError(t, idxB.IndexFile(f)) + } + incremental := similarEdgeSet(gB) + + assert.Equal(t, batch, incremental, + "incremental clone edges must exactly equal the batch clone edges") +} + +// TestCloneIncremental_WarmRestart simulates a daemon warm restart: after a +// full index, the in-memory CMS/LSH state is thrown away and the index is +// rebuilt purely from the persisted clone_shingles sidecar + the graph's +// clone_sig stamps. A subsequent single-file reindex must produce the same +// EdgeSimilarTo set as before the restart. +func TestCloneIncremental_WarmRestart(t *testing.T) { + dir := t.TempDir() + files := writeCloneIncFixture(t, dir) + require.Greater(t, len(files), 1, "fixture must be multi-file") + + g := graph.New() + idx := newTestIndexer(g) + _, err := idx.Index(dir) + require.NoError(t, err) + + want := similarEdgeSet(g) + require.GreaterOrEqual(t, len(want), 1, "fixture must produce >=1 EdgeSimilarTo (non-vacuity)") + + // Simulate restart: drop the live incremental index and rebuild a + // fresh one from scratch. Rebuild reads clone_sig off the graph and + // clone_shingles from the sidecar (the in-memory *Graph persisted + // them during finaliseCloneSignatures). No re-parse happens. + idx.cloneIndex = newIncrementalCloneIndex() + require.False(t, idx.cloneIndex.built) + idx.cloneIndex.Rebuild(g, idx.repoPrefix) + require.True(t, idx.cloneIndex.built, "Rebuild must mark the index built") + require.Greater(t, idx.cloneIndex.corpus, 1, + "Rebuild must reseed the corpus from clone_sig nodes") + + // A single-file reindex now runs through the incremental maintainer + // seeded only from the sidecar. The edge set must be unchanged. 
+ require.NoError(t, idx.IndexFile(files[0])) + got := similarEdgeSet(g) + assert.Equal(t, want, got, + "clone edges after a sidecar-only rebuild + reindex must match the pre-restart set") +} + +// writeCloneFilteredFixture writes a large fixture engineered to push the +// corpus over a (test-lowered) cmsMinCorpus so the CMS boilerplate filter +// (useFilter) engages on BOTH the batch and incremental paths. It contains +// three classes of body, one per file: +// +// - filler*: ~240 structurally varied bodies that pad the corpus. +// - boiler*: ~40 bodies sharing one identical skeleton, so every shingle +// they own is high-frequency and gets filtered out — they survive with +// too few discriminative shingles and are DROPPED (no clone_sig). These +// are the bodies whose presence the survivor-only Rebuild seeding fails +// to count. +// - cloneA / cloneB: one genuine Type-2 clone pair whose shared structure +// appears in exactly two bodies (frequency = 2 ≤ threshold), so it +// survives filtering and emits EdgeSimilarTo. +// +// The fixture is split one function per file so a single-file reindex drives +// exactly one body through EvictFuncs/UpdateFuncs. 
+func writeCloneFilteredFixture(t *testing.T, dir string) []string { + t.Helper() + var files []string + write := func(name, body string) { + p := filepath.Join(dir, name+".go") + writeFile(t, p, "package main\n\n"+body) + files = append(files, p) + } + + ops := []string{"+", "-", "*", "/", "%", "&", "|", "^"} + cmps := []string{">", "<", ">=", "<=", "==", "!="} + for k := 0; k < 240; k++ { + body := fmt.Sprintf("func filler%d(in []int) int {\n\tacc := 0\n", k) + for s := 0; s < 20; s++ { + op := ops[(k*7+s*3)%len(ops)] + op2 := ops[(k*5+s*11)%len(ops)] + cmp := cmps[(k*13+s*17)%len(cmps)] + body += fmt.Sprintf("\tif acc %s %d {\n\t\tacc = acc %s %d %s %d\n\t}\n", + cmp, (k*3+s)%17, op, (k+s)%13, op2, (k*2+s*5)%11) + } + body += "\treturn acc\n}\n" + write(fmt.Sprintf("filler%d", k), body) + } + + for k := 0; k < 40; k++ { + write(fmt.Sprintf("boiler%d", k), fmt.Sprintf(`func boiler%d(a int, b int) int { + c := a + b + d := c + a + e := d + b + f := e + c + g := f + d + return g +} +`, k)) + } + + cloneShape := func(name, p, q, r string) string { + return fmt.Sprintf(`func %s(%s []int) int { + %s := 0 + for %s := 0; %s < len(%s); %s++ { + if %s[%s] > 100 { + %s += %s[%s] * 7 - 3 + } else if %s[%s] < -50 { + %s -= %s[%s] / 2 + } else { + %s += %s[%s] & 255 + } + } + if %s > 1000 { + %s = 1000 + } + return %s +} +`, name, p, q, r, r, p, r, p, r, q, p, r, p, r, q, p, r, q, p, r, q, q, q) + } + write("clonea", cloneShape("crunchActive", "items", "acc", "i")) + write("cloneb", cloneShape("foldEnabled", "records", "sum", "j")) + return files +} + +// cloneBodyShingles recomputes, from the persisted clone_shingles sidecar, +// the (corpus, CMS) the batch finaliseCloneSignatures would have built — its +// body set is EVERY func/method node with shingles (survivors AND +// boilerplate-dropped), which is exactly what Rebuild must mirror. 
Returns +// the corpus size, a CMS seeded from all those shingles, and one sample +// shingle observed in the corpus (for a Count() spot-check). +func cloneBodyShingles(t *testing.T, g graph.Store, repoPrefix string) (corpus int, cms *clones.CMS, sample uint64) { + t.Helper() + r, ok := g.(graph.CloneShingleReader) + require.True(t, ok, "in-memory graph must implement CloneShingleReader") + rows, err := r.LoadCloneShingles(repoPrefix) + require.NoError(t, err) + cms = clones.NewCMS(65536, 4) + for _, sh := range rows { + if len(sh) == 0 { + continue + } + for _, s := range sh { + cms.Add(s) + if sample == 0 { + sample = s + } + } + corpus++ + } + return corpus, cms, sample +} + +// TestCloneIncremental_MatchesBatch_Filtered is the equivalence test with +// the CMS boilerplate filter ENGAGED. The base TestCloneIncremental_MatchesBatch +// runs below cmsMinCorpus where useFilter=false, so the survivor-only Rebuild +// seeding bug is dormant. This test lowers cmsMinCorpus so useFilter=true on +// BOTH paths over a fixture that includes boilerplate-dominated bodies that +// finaliseCloneSignatures drops (no clone_sig) but still counts into its +// CMS/corpus. The pre-fix Rebuild seeded CMS/corpus from survivors only, so: +// +// - its corpus would be ~2 (only the clone pair) instead of the full body +// count, and +// - useFilter on the next incremental update would flip to false, +// +// changing the edited file's signatures vs the batch. Both assertions below +// fail against the pre-fix Rebuild and pass after. +func TestCloneIncremental_MatchesBatch_Filtered(t *testing.T) { + // Lower the corpus floor so the filter engages on this fixture, then + // restore it so other tests see the production default. 
+ prev := cmsMinCorpus + cmsMinCorpus = 6 + t.Cleanup(func() { cmsMinCorpus = prev }) + + dir := t.TempDir() + files := writeCloneFilteredFixture(t, dir) + require.Greater(t, len(files), cmsMinCorpus, "fixture must exceed the lowered corpus floor") + + // (a) Batch path: full cold index on graph A. + gA := graph.New() + idxA := newTestIndexer(gA) + _, err := idxA.Index(dir) + require.NoError(t, err) + batch := similarEdgeSet(gA) + require.GreaterOrEqual(t, len(batch), 1, + "filtered fixture must still produce >=1 EdgeSimilarTo (non-vacuity)") + + // The batch corpus must be well above the lowered floor (so useFilter + // was true) AND well above the survivor count (so dropped bodies exist + // — that gap is what the bug mishandles). + batchCorpus, batchCMS, sample := cloneBodyShingles(t, gA, idxA.repoPrefix) + require.Greater(t, batchCorpus, cmsMinCorpus, + "batch corpus must exceed the floor so useFilter engaged") + require.NotZero(t, sample, "fixture must yield at least one shingle") + + // (b) Incremental path: fresh graph B. Full Index() seeds the + // incremental clone index via Rebuild (built=true); re-indexing each + // file then drives EvictFuncs + UpdateFuncs. + gB := graph.New() + idxB := newTestIndexer(gB) + _, err = idxB.Index(dir) + require.NoError(t, err) + require.True(t, idxB.cloneIndex.built, + "incremental clone index must be built after full Index()") + + // DIRECT seeding assertions: Rebuild's CMS+corpus must mirror the batch + // finalise's all-bodies set. The survivor-only pre-fix seeding makes + // the corpus collapse to the survivor count and undercounts the CMS — + // these assertions are the regression tripwire. 
+ idxB.cloneIndex.mu.Lock() + gotCorpus := idxB.cloneIndex.corpus + gotCount := idxB.cloneIndex.cms.Count(sample) + idxB.cloneIndex.mu.Unlock() + assert.Equal(t, batchCorpus, gotCorpus, + "Rebuild corpus must equal the batch finalise corpus (all bodies, not survivors)") + assert.Equal(t, batchCMS.Count(sample), gotCount, + "Rebuild CMS Count(sample) must equal the batch finalise CMS count") + + // EDGE-SET equivalence under the engaged filter: driving each file + // through the incremental maintainer must reproduce the batch edges. + for _, f := range files { + require.NoError(t, idxB.IndexFile(f)) + } + incremental := similarEdgeSet(gB) + assert.Equal(t, batch, incremental, + "incremental clone edges must exactly equal the batch clone edges under the CMS filter") +} diff --git a/internal/indexer/clones.go b/internal/indexer/clones.go index dd2de4a5..f7c48530 100644 --- a/internal/indexer/clones.go +++ b/internal/indexer/clones.go @@ -60,10 +60,19 @@ const cloneShinglesMetaKey = "clone_shingles" // bodies (e.g. trivial controller / DTO wrappers) land here. const ( cmsBoilerplateRatio = 0.01 - cmsMinCorpus = 2000 minSurvivingShingles = 8 ) +// cmsMinCorpus is the body-count floor below which the CMS boilerplate +// filter is disabled (useFilter=false) and the pass falls back to +// unfiltered MinHash — see the doc comment above for the rationale and +// default. It is a package-level var (not a const) purely so the clone +// equivalence tests can temporarily lower it to force useFilter=true on a +// small fixture and exercise the filtered batch/incremental paths; restore +// it via t.Cleanup. Production never mutates it — the default semantics are +// unchanged. +var cmsMinCorpus = 2000 + // applyCloneSignatures is the per-file half of clone detection. 
It runs // inside applyCoverageDomains (gated on the "clones" coverage domain), // slices each function/method body out of the file source, computes a @@ -209,6 +218,49 @@ func bodyText(lines []string, startLine, endLine int) string { return b.String() } +// computeCloneSigFromShingles is the per-body signature kernel shared by +// the whole-graph finalise pass (finaliseCloneSignatures) and the +// incremental maintainer (incrementalCloneIndex.UpdateFuncs). Both paths +// MUST route through this function so a body's signature is byte-identical +// regardless of which path stamped it — that is what lets the equivalence +// test assert exact set equality between the batch and incremental clone +// edges. +// +// cms is the corpus Count-Min Sketch; threshold is the boilerplate cutoff +// (a shingle whose CMS count exceeds it is dropped). useFilter selects the +// branch: +// +// - useFilter true: exclude high-frequency shingles, then require the +// surviving set to clear minSurvivingShingles before computing MinHash. +// - useFilter false: keep every shingle and apply no floor (legacy +// small-corpus behaviour) — cms may be nil in this branch. +// +// Returns the signature and ok=false when the body is dropped from clone +// detection (empty / below the surviving floor) — the caller then leaves +// the node without a clone_sig, exactly as the batch pass does. +func computeCloneSigFromShingles(cms *clones.CMS, threshold uint32, useFilter bool, shingles []uint64) (clones.Signature, bool) { + var filtered []uint64 + if useFilter { + filtered = make([]uint64, 0, len(shingles)) + for _, sh := range shingles { + if cms.Count(sh) > threshold { + continue + } + filtered = append(filtered, sh) + } + } else { + filtered = shingles + } + floor := minSurvivingShingles + if !useFilter { + // Without filtering, every shingle survives — fall back to the + // legacy gate so we don't silently drop bodies the old code + // would have kept. 
+ floor = 0 + } + return clones.SignatureFromShingles(filtered, floor) +} + // finaliseCloneSignatures runs after every file's shingles have been // stamped on its function / method nodes (by applyCloneSignatures // during the per-file parse). It builds a Count-Min Sketch of shingle @@ -234,7 +286,18 @@ func bodyText(lines []string, startLine, endLine int) string { // (deletes clone_shingles, sets clone_sig) across nodes that other // graph-wide passes (markTestSymbolsAndEmitEdges, ResolveTemporalCalls, // reach.BuildIndex) also touch under the same mutex. -func finaliseCloneSignatures(g *graph.Graph) { +// +// Repo-scoped: only bodies whose n.RepoPrefix == repoPrefix enter the +// CMS / signature passes, so a multi-repo graph computes each repo's +// boilerplate sketch and per-body signatures from that repo's bodies +// alone — clone detection is per-repository. A standalone single-repo +// Indexer uses repoPrefix == "" and its nodes carry RepoPrefix == "", +// so the equality matches every node and behaviour is unchanged. +// (GetRepoNodes can't be used here: GetRepoNodes("") is empty for the +// in-memory / single-repo store — see incrementalCloneIndex.Rebuild — +// so the AllNodes + equality filter is the form that works for both +// regimes, since "" == "" matches every node.) +func finaliseCloneSignatures(g graph.Store, repoPrefix string) { // First pass: collect every body that has stashed shingles. We // capture the *graph.Node pointers up front so the CMS-build pass // and the signature-compute pass don't both re-walk g.AllNodes(). @@ -243,6 +306,9 @@ func finaliseCloneSignatures(g *graph.Graph) { if n == nil || n.Meta == nil { continue } + if n.RepoPrefix != repoPrefix { + continue + } if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { continue } @@ -275,32 +341,51 @@ func finaliseCloneSignatures(g *graph.Graph) { } } + // Persist each body's raw shingle set to the clone_shingles sidecar + // BEFORE deleting it from Meta. 
This loop walks EVERY body in the + // corpus — both the survivors (which get a clone_sig below) and the + // boilerplate-dropped bodies (which do not) — persisting any with a + // non-empty shingle set. That is deliberate: incrementalCloneIndex. + // Rebuild reseeds its CMS + corpus from these rows and must mirror + // the bodies set this pass used to build its own CMS / threshold, + // which is ALL eligible bodies, not just survivors. Persisting only + // survivors here would under-seed Rebuild's sketch and skew the + // incremental threshold away from the batch one. Meta stays lean + // (the shingle set is large and only the CMS pass needs it), but the + // durable sidecar copy lets a warm restart rebuild the incremental + // CMS without re-parsing every body. Accumulate per node.RepoPrefix + // so a multi-repo graph reseeds each repo's CMS in isolation. + // Backends that don't implement CloneShingleWriter (no on-disk store) + // simply skip this — the in-session incremental index caches shingles + // in memory regardless. + if w, ok := g.(graph.CloneShingleWriter); ok { + byPrefix := make(map[string]map[string][]uint64) + for _, n := range bodies { + shingles, _ := n.Meta[cloneShinglesMetaKey].([]uint64) + if len(shingles) == 0 { + continue + } + rows := byPrefix[n.RepoPrefix] + if rows == nil { + rows = make(map[string][]uint64) + byPrefix[n.RepoPrefix] = rows + } + rows[n.ID] = shingles + } + for prefix, rows := range byPrefix { + _ = w.BulkSetCloneShingles(prefix, rows) + } + } + // Second pass: signature computation. Each body either lands a // fresh clone_sig (signature over surviving shingles) or is // dropped entirely (no clone_sig, never enters detection items - // list). In both cases clone_shingles is removed from Meta. + // list). In both cases clone_shingles is removed from Meta. The + // per-body kernel is computeCloneSigFromShingles — the incremental + // maintainer calls the same kernel so signatures match exactly. 
for _, n := range bodies { shingles, _ := n.Meta[cloneShinglesMetaKey].([]uint64) - var filtered []uint64 - if useFilter { - filtered = make([]uint64, 0, len(shingles)) - for _, sh := range shingles { - if cms.Count(sh) > threshold { - continue - } - filtered = append(filtered, sh) - } - } else { - filtered = shingles - } - floor := minSurvivingShingles - if !useFilter { - // Without filtering, every shingle survives — fall back - // to the legacy gate so we don't silently drop bodies the - // old code would have kept. - floor = 0 - } - sig, ok := clones.SignatureFromShingles(filtered, floor) + sig, ok := computeCloneSigFromShingles(cms, threshold, useFilter, shingles) delete(n.Meta, cloneShinglesMetaKey) if !ok { // Boilerplate-dominated or empty after filter — drop @@ -342,8 +427,15 @@ type CloneDetectionStats struct { // edges cannot survive — when either endpoint's file is reindexed, // EvictFile removes that node's edges in both directions before this // pass re-runs. -func detectClonesAndEmitEdges(g *graph.Graph, threshold float64) CloneDetectionStats { - return detectClonesAndEmitEdgesCtx(context.Background(), g, threshold) +// +// repoPrefix scopes the pass to one repository's nodes: every whole-graph +// walk it drives (finalise, item gather, diffusion) is filtered to +// n.RepoPrefix == repoPrefix so no cross-repo candidate pair is ever +// formed. A standalone single-repo Indexer passes "" and its nodes carry +// RepoPrefix == "", so the equality matches all nodes and the single-repo +// result is unchanged. 
+func detectClonesAndEmitEdges(g graph.Store, repoPrefix string, threshold float64) CloneDetectionStats { + return detectClonesAndEmitEdgesCtx(context.Background(), g, repoPrefix, threshold) } // detectClonesAndEmitEdgesCtx is the context-aware sibling of @@ -353,7 +445,7 @@ func detectClonesAndEmitEdges(g *graph.Graph, threshold float64) CloneDetectionS // without intra-stage reporters an operator sees just one // "clone detection pass" marker followed by minutes of silence — no // way to tell finalise-signatures from LSH from edge-emission. -func detectClonesAndEmitEdgesCtx(ctx context.Context, g *graph.Graph, threshold float64) CloneDetectionStats { +func detectClonesAndEmitEdgesCtx(ctx context.Context, g graph.Store, repoPrefix string, threshold float64) CloneDetectionStats { var stats CloneDetectionStats if g == nil { return stats @@ -384,7 +476,7 @@ func detectClonesAndEmitEdgesCtx(ctx context.Context, g *graph.Graph, threshold // (delete clone_shingles, set clone_sig) don't race the AllNodes // walk below. reporter.Report("clones: CMS-finalise signatures", 0, 0) - finaliseCloneSignatures(g) + finaliseCloneSignatures(g, repoPrefix) reporter.Report("clones: gather items", 0, 0) var items []clones.Item @@ -392,6 +484,11 @@ func detectClonesAndEmitEdgesCtx(ctx context.Context, g *graph.Graph, threshold if n == nil || n.Meta == nil { continue } + // Scope to this repo's nodes so no cross-repo candidate pair is + // ever formed. "" matches every node (single-repo / in-memory). + if n.RepoPrefix != repoPrefix { + continue + } if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { continue } @@ -527,7 +624,7 @@ type diffusionEdge struct { // directPairs carries the canonicalised clone pairs already emitted as // EdgeSimilarTo; any pair in that set is skipped so semantically_related // and similar_to partition cleanly. 
-func diffuseSimilarityEdges(g *graph.Graph, pairs []clones.Pair, directPairs map[[2]string]struct{}) (diffusedPairs, diffusedEdges int) { +func diffuseSimilarityEdges(g graph.Store, pairs []clones.Pair, directPairs map[[2]string]struct{}) (diffusedPairs, diffusedEdges int) { if g == nil || len(pairs) < 2 { return 0, 0 } @@ -633,7 +730,7 @@ func diffuseSimilarityEdges(g *graph.Graph, pairs []clones.Pair, directPairs map // node's file/line for locality. Origin is ast_inferred — the // relationship is a statistical estimate over normalised tokens, not a // structural fact. -func emitSimilarEdge(g *graph.Graph, from, to *graph.Node, similarity float64) { +func emitSimilarEdge(g graph.Store, from, to *graph.Node, similarity float64) { g.AddEdge(&graph.Edge{ From: from.ID, To: to.ID, @@ -651,7 +748,7 @@ func emitSimilarEdge(g *graph.Graph, from, to *graph.Node, similarity float64) { // edge is anchored at the source node's file/line and origin is // ast_inferred — the score is a statistical estimate over normalised // tokens, here additionally smoothed across the similarity graph. 
-func emitSemanticallyRelatedEdge(g *graph.Graph, from, to *graph.Node, similarity float64) { +func emitSemanticallyRelatedEdge(g graph.Store, from, to *graph.Node, similarity float64) { g.AddEdge(&graph.Edge{ From: from.ID, To: to.ID, diff --git a/internal/indexer/clones_indexer_test.go b/internal/indexer/clones_indexer_test.go index 632c61bb..ff983196 100644 --- a/internal/indexer/clones_indexer_test.go +++ b/internal/indexer/clones_indexer_test.go @@ -63,7 +63,7 @@ func openAndScan(conn *Conn, statement string) error { } ` -func similarToEdges(g *graph.Graph) []*graph.Edge { +func similarToEdges(g graph.Store) []*graph.Edge { var out []*graph.Edge for _, e := range g.AllEdges() { if e.Kind == graph.EdgeSimilarTo { @@ -169,12 +169,12 @@ func TestDetectClonesAndEmitEdges(t *testing.T) { FilePath: "c.go", StartLine: 1, Language: "go", }) - stats := detectClonesAndEmitEdges(g, 0) + stats := detectClonesAndEmitEdges(g, "", 0) assert.Equal(t, 1, stats.Pairs) assert.Equal(t, 2, stats.Edges) // Idempotent: a second run dedupes via graph.AddEdge. - detectClonesAndEmitEdges(g, 0) + detectClonesAndEmitEdges(g, "", 0) assert.Len(t, similarToEdges(g), 2, "second pass must not duplicate edges") } diff --git a/internal/indexer/clones_multirepo_test.go b/internal/indexer/clones_multirepo_test.go new file mode 100644 index 00000000..e35cc0ac --- /dev/null +++ b/internal/indexer/clones_multirepo_test.go @@ -0,0 +1,327 @@ +package indexer + +import ( + "context" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +// Clone detection is PER-REPOSITORY: a near-duplicate body that appears +// once in repoA and once in repoB must NOT be linked by an EdgeSimilarTo +// edge, even though the two bodies are textbook Type-2 clones of each +// other. Within each repo, genuine clone pairs are still detected. 
+// +// These fixtures build two repos that share one graph (prefixes "repoA" +// and "repoB"). Each repo holds: +// +// - a within-repo Type-2 clone pair (every identifier renamed, control +// flow identical) that MUST emit EdgeSimilarTo, and +// - a "crossDup" body that is near-identical across the two repos — the +// cross-repo near-dup that per-repo scoping must keep unlinked. + +// repoA within-repo Type-2 clone pair: sumActiveItems / sumEnabledRecords. +const mrRepoAClone1 = `package main + +func sumActiveItems(items []Item) int { + total := 0 + for i := 0; i < len(items); i++ { + if items[i].Active { + total += items[i].Weight * factor + } else { + total -= items[i].Penalty + } + } + if total < 0 { + total = 0 + } + return total +} +` + +const mrRepoAClone2 = `package main + +func sumEnabledRecords(records []Record) int { + sum := 0 + for idx := 0; idx < len(records); idx++ { + if records[idx].Enabled { + sum += records[idx].Score * multiplier + } else { + sum -= records[idx].Fine + } + } + if sum < 0 { + sum = 0 + } + return sum +} +` + +// repoB within-repo Type-2 clone pair: scanOpenRows / scanLiveRows. A +// distinct shape from repoA's pair so each repo's within-repo clone is +// independent of the other's. 
+const mrRepoBClone1 = `package main + +func scanOpenRows(conn *Conn, statement string) error { + rows, err := conn.Query(statement) + if err != nil { + return wrap(err, "query failed") + } + defer rows.Close() + for rows.Next() { + var name string + if scanErr := rows.Scan(&name); scanErr != nil { + return scanErr + } + } + return rows.Err() +} +` + +const mrRepoBClone2 = `package main + +func scanLiveRows(handle *Handle, query string) error { + cursor, qerr := handle.Run(query) + if qerr != nil { + return wrap(qerr, "run failed") + } + defer cursor.Close() + for cursor.Next() { + var label string + if readErr := cursor.Read(&label); readErr != nil { + return readErr + } + } + return cursor.Err() +} +` + +// crossDup is one body that is parsed into BOTH repos. The repoA copy and +// the repoB copy are near-identical (Type-2 clone of each other) — the +// cross-repo near-dup whose link must be suppressed by per-repo scoping. +// To make it a real Type-2 clone across repos (not byte-identical, which +// would also collide intra-repo), repoA uses one identifier set and repoB +// another. +const mrCrossDupA = `package main + +func computeDelta(values []float64, base float64) float64 { + acc := 0.0 + for k := 0; k < len(values); k++ { + if values[k] > base { + acc += values[k] - base + } else { + acc -= base - values[k] + } + } + if acc < 0 { + acc = 0 + } + return acc +} +` + +const mrCrossDupB = `package main + +func computeSpread(samples []float64, pivot float64) float64 { + agg := 0.0 + for m := 0; m < len(samples); m++ { + if samples[m] > pivot { + agg += samples[m] - pivot + } else { + agg -= pivot - samples[m] + } + } + if agg < 0 { + agg = 0 + } + return agg +} +` + +// writeMultiRepoCloneFixture lays out two repo directories under root and +// returns their absolute file paths in stable per-repo order. 
+func writeMultiRepoCloneFixture(t *testing.T, root string) (repoADir string, repoAFiles []string, repoBDir string, repoBFiles []string) { + t.Helper() + repoADir = filepath.Join(root, "repoA") + repoBDir = filepath.Join(root, "repoB") + require.NoError(t, os.MkdirAll(repoADir, 0o755)) + require.NoError(t, os.MkdirAll(repoBDir, 0o755)) + + wa := func(name, body string) { + p := filepath.Join(repoADir, name) + writeFile(t, p, body) + repoAFiles = append(repoAFiles, p) + } + wb := func(name, body string) { + p := filepath.Join(repoBDir, name) + writeFile(t, p, body) + repoBFiles = append(repoBFiles, p) + } + + wa("clone1.go", mrRepoAClone1) + wa("clone2.go", mrRepoAClone2) + wa("crossdup.go", mrCrossDupA) + + wb("clone1.go", mrRepoBClone1) + wb("clone2.go", mrRepoBClone2) + wb("crossdup.go", mrCrossDupB) + return repoADir, repoAFiles, repoBDir, repoBFiles +} + +// edgeCrossesRepos reports whether a directed edge connects a repoA node +// to a repoB node (in either direction), keyed off the node RepoPrefix. +func edgeCrossesRepos(g graph.Store, e *graph.Edge) bool { + from := g.GetNode(e.From) + to := g.GetNode(e.To) + if from == nil || to == nil { + return false + } + return from.RepoPrefix != to.RepoPrefix +} + +// assertNoCrossRepoSimilarEdge fails if any EdgeSimilarTo edge connects a +// node in one repo to a node in another. +func assertNoCrossRepoSimilarEdge(t *testing.T, g graph.Store) { + t.Helper() + for _, e := range g.AllEdges() { + if e.Kind != graph.EdgeSimilarTo { + continue + } + if edgeCrossesRepos(g, e) { + from := g.GetNode(e.From) + to := g.GetNode(e.To) + t.Fatalf("cross-repo EdgeSimilarTo leaked: %s (%s) -> %s (%s)", + e.From, from.RepoPrefix, e.To, to.RepoPrefix) + } + } +} + +// repoSimilarEdgeSet returns the EdgeSimilarTo directed-edge set whose +// endpoints both live in repoPrefix. 
+func repoSimilarEdgeSet(g graph.Store, repoPrefix string) map[[2]string]struct{} { + set := make(map[[2]string]struct{}) + for _, e := range g.AllEdges() { + if e.Kind != graph.EdgeSimilarTo { + continue + } + from := g.GetNode(e.From) + to := g.GetNode(e.To) + if from == nil || to == nil { + continue + } + if from.RepoPrefix != repoPrefix || to.RepoPrefix != repoPrefix { + continue + } + set[[2]string{e.From, e.To}] = struct{}{} + } + return set +} + +// newRepoIndexer builds a test indexer bound to a repo prefix and sharing +// the given graph — the multi-repo setup MultiIndexer drives in production. +func newRepoIndexer(g graph.Store, prefix string) *Indexer { + idx := newTestIndexer(g) + idx.SetRepoPrefix(prefix) + return idx +} + +// TestClones_PerRepo_NoCrossRepoEdges is the per-repository clone-scoping +// test. Two repos share one graph; each has a within-repo Type-2 clone +// pair plus a cross-repo near-duplicate function. Running the per-repo +// batch pass (mirroring MultiIndexer.RunGlobalGraphPasses' loop) must: +// +// (a) emit the within-repo clone pair as EdgeSimilarTo in EACH repo; +// (b) emit NO EdgeSimilarTo edge between a repoA node and a repoB node; +// (c) produce, via the per-repo incremental path (Rebuild then a file +// reindex), the SAME EdgeSimilarTo set the per-repo batch produced. +func TestClones_PerRepo_NoCrossRepoEdges(t *testing.T) { + ctx := context.Background() + + // ---- (1) Batch path: two indexers share graph gBatch. ------------- + // SetDeferGlobalPasses(true) so Index() only parses + stamps shingles; + // the clone pass is then driven manually per repo, exactly as + // MultiIndexer.RunGlobalGraphPasses does. 
+ root := t.TempDir() + repoADir, _, repoBDir, _ := writeMultiRepoCloneFixture(t, root) + + gBatch := graph.New() + idxA := newRepoIndexer(gBatch, "repoA") + idxA.SetDeferGlobalPasses(true) + idxB := newRepoIndexer(gBatch, "repoB") + idxB.SetDeferGlobalPasses(true) + _, err := idxA.Index(repoADir) + require.NoError(t, err) + _, err = idxB.Index(repoBDir) + require.NoError(t, err) + + // Per-repo batch clone pass (the new MultiIndexer loop). + csA := detectClonesAndEmitEdgesCtx(ctx, gBatch, "repoA", 0) + csB := detectClonesAndEmitEdgesCtx(ctx, gBatch, "repoB", 0) + require.Positive(t, csA.Items, "repoA must have clone-eligible bodies") + require.Positive(t, csB.Items, "repoB must have clone-eligible bodies") + + batchA := repoSimilarEdgeSet(gBatch, "repoA") + batchB := repoSimilarEdgeSet(gBatch, "repoB") + + // (a) Within-repo clone pairs emitted in each repo (non-vacuity). + require.GreaterOrEqual(t, len(batchA), 1, + "repoA must emit >=1 within-repo EdgeSimilarTo") + require.GreaterOrEqual(t, len(batchB), 1, + "repoB must emit >=1 within-repo EdgeSimilarTo") + // The within-repo pair is symmetric, so we expect exactly the two + // directed edges of repoA's sumActiveItems<->sumEnabledRecords pair. + assert.Contains(t, batchA, [2]string{"repoA/clone1.go::sumActiveItems", "repoA/clone2.go::sumEnabledRecords"}) + assert.Contains(t, batchA, [2]string{"repoA/clone2.go::sumEnabledRecords", "repoA/clone1.go::sumActiveItems"}) + assert.Contains(t, batchB, [2]string{"repoB/clone1.go::scanOpenRows", "repoB/clone2.go::scanLiveRows"}) + assert.Contains(t, batchB, [2]string{"repoB/clone2.go::scanLiveRows", "repoB/clone1.go::scanOpenRows"}) + + // (b) No EdgeSimilarTo edge crosses the repo boundary. The crossDup + // bodies are Type-2 clones of each other but live in different repos, + // so per-repo scoping must never form that candidate pair. + assertNoCrossRepoSimilarEdge(t, gBatch) + + // ---- (2) Incremental path: a fresh graph, per-repo Rebuild + reindex. 
+ // deferGlobalPasses=false so the cold Index() runs each repo's inline + // per-repo clone pass and seeds its incremental index (Rebuild); a + // subsequent IndexFile then drives EvictFuncs/UpdateFuncs. + root2 := t.TempDir() + repoADir2, repoAFiles2, repoBDir2, repoBFiles2 := writeMultiRepoCloneFixture(t, root2) + + gInc := graph.New() + incA := newRepoIndexer(gInc, "repoA") + incB := newRepoIndexer(gInc, "repoB") + _, err = incA.Index(repoADir2) + require.NoError(t, err) + _, err = incB.Index(repoBDir2) + require.NoError(t, err) + require.True(t, incA.cloneIndex.built, "repoA incremental index must be built") + require.True(t, incB.cloneIndex.built, "repoB incremental index must be built") + + // Drive each repo's files through the incremental maintainer. + for _, f := range repoAFiles2 { + require.NoError(t, incA.IndexFile(f)) + } + for _, f := range repoBFiles2 { + require.NoError(t, incB.IndexFile(f)) + } + + // (c) The per-repo incremental edge set equals the per-repo batch set, + // and still no cross-repo edge appears. + incEdgesA := repoSimilarEdgeSet(gInc, "repoA") + incEdgesB := repoSimilarEdgeSet(gInc, "repoB") + assert.Equal(t, batchA, incEdgesA, + "repoA incremental EdgeSimilarTo set must equal the batch set") + assert.Equal(t, batchB, incEdgesB, + "repoB incremental EdgeSimilarTo set must equal the batch set") + assertNoCrossRepoSimilarEdge(t, gInc) + + // Guard the directory names are wired through (the fixture writer + // returns absolute repo dirs used above) so a refactor that drops a + // repo can't silently make this test vacuous. 
+ require.NotEqual(t, repoADir2, repoBDir2) +} diff --git a/internal/indexer/contract_import_resolve.go b/internal/indexer/contract_import_resolve.go index ebf5bd5c..78026329 100644 --- a/internal/indexer/contract_import_resolve.go +++ b/internal/indexer/contract_import_resolve.go @@ -31,7 +31,7 @@ import ( // Languages other than TS / JS are skipped — Go disambiguates // bare-name collisions via package qualification (`pkg.Type`) and the // in-file resolveTypeInFile pass already handles those. -func (mi *MultiIndexer) disambiguateBareTypesViaImports(cr *contracts.Registry, g *graph.Graph) { +func (mi *MultiIndexer) disambiguateBareTypesViaImports(cr *contracts.Registry, g graph.Store) { srcCache := map[string][]byte{} importCache := map[string]map[string]string{} @@ -74,7 +74,7 @@ func (mi *MultiIndexer) disambiguateBareTypesViaImports(cr *contracts.Registry, // (so the caller leaves the bare name in place). func (mi *MultiIndexer) resolveBareTypeViaImports( srcFile, name string, - g *graph.Graph, + g graph.Store, srcCache map[string][]byte, importCache map[string]map[string]string, ) string { diff --git a/internal/indexer/contracts_bulk_commit_test.go b/internal/indexer/contracts_bulk_commit_test.go new file mode 100644 index 00000000..ea45fd50 --- /dev/null +++ b/internal/indexer/contracts_bulk_commit_test.go @@ -0,0 +1,208 @@ +package indexer + +import ( + "os" + "path/filepath" + "sync/atomic" + "testing" + + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + "github.com/zzet/gortex/internal/config" + "github.com/zzet/gortex/internal/contracts" + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/parser" +) + +// recordingBulkGraph embeds *graph.Graph (auto-satisfying graph.Store) +// and adds the BulkLoader methods so it also satisfies +// graph.BulkLoader. 
It records the order of BeginBulkLoad / AddBatch +// / FlushBulk calls so a test can assert that the contracts commit +// path routes through the bulk fast lane instead of per-row +// AddNode / AddEdge writes. +type recordingBulkGraph struct { + *graph.Graph + + calls []string + addNode atomic.Int64 + addEdge atomic.Int64 +} + +func newRecordingBulkGraph() *recordingBulkGraph { + return &recordingBulkGraph{Graph: graph.New()} +} + +func (r *recordingBulkGraph) BeginBulkLoad() { + r.calls = append(r.calls, "BeginBulkLoad") +} + +func (r *recordingBulkGraph) FlushBulk() error { + r.calls = append(r.calls, "FlushBulk") + return nil +} + +func (r *recordingBulkGraph) AddNode(n *graph.Node) { + r.addNode.Add(1) + r.Graph.AddNode(n) +} + +func (r *recordingBulkGraph) AddEdge(e *graph.Edge) { + r.addEdge.Add(1) + r.Graph.AddEdge(e) +} + +func (r *recordingBulkGraph) AddBatch(nodes []*graph.Node, edges []*graph.Edge) { + r.calls = append(r.calls, "AddBatch") + r.Graph.AddBatch(nodes, edges) +} + +// TestCommitContracts_BatchesViaAddBatch asserts that the final +// write phase of commitContracts emits all contract nodes and +// edges through a single AddBatch call and does NOT engage the +// BulkLoader COPY bracket. Contract IDs frequently coincide with +// existing source-symbol IDs (a handler appears as both a Go +// function and an HTTP-contract anchor), and the on-disk backend's +// bulk load is INSERT-only on the node table — wrapping the contracts +// pass in BeginBulkLoad/FlushBulk would crash on the first collision. +// AddBatch's per-call MERGE path absorbs duplicates safely. +func TestCommitContracts_BatchesViaAddBatch(t *testing.T) { + g := newRecordingBulkGraph() + require.Implements(t, (*graph.BulkLoader)(nil), graph.Store(g)) + + // Anchor symbol the contract's provides-edge will point from. 
+ g.Graph.AddNode(&graph.Node{ + ID: "pkg/foo.go::Handler.List", + Kind: graph.KindMethod, + Name: "List", + FilePath: "pkg/foo.go", + Language: "go", + }) + + idx := New(g, parser.NewRegistry(), config.Default().Index, zap.NewNop()) + + reg := contracts.NewRegistry() + reg.Add(contracts.Contract{ + ID: "http::GET::/v1/items", + Type: contracts.ContractHTTP, + Role: contracts.RoleProvider, + SymbolID: "pkg/foo.go::Handler.List", + FilePath: "pkg/foo.go", + Line: 42, + }) + reg.Add(contracts.Contract{ + ID: "http::POST::/v1/items", + Type: contracts.ContractHTTP, + Role: contracts.RoleConsumer, + SymbolID: "pkg/foo.go::Handler.List", + FilePath: "pkg/foo.go", + Line: 58, + }) + + idx.commitContracts(reg) + + require.Equal(t, + []string{"AddBatch"}, + g.calls, + "contracts commit must batch through a single AddBatch call", + ) + require.Zero(t, g.addNode.Load(), "no per-row AddNode calls expected") + require.Zero(t, g.addEdge.Load(), "no per-row AddEdge calls expected") + + require.NotNil(t, g.GetNode("http::GET::/v1/items")) + require.NotNil(t, g.GetNode("http::POST::/v1/items")) + + // Provider contract emits both EdgeProvides and EdgeHandlesRoute; + // consumer contract emits only EdgeConsumes. 
+ provides := g.GetOutEdges("pkg/foo.go::Handler.List") + var nProvides, nConsumes, nHandles int + for _, e := range provides { + switch e.Kind { + case graph.EdgeProvides: + nProvides++ + case graph.EdgeConsumes: + nConsumes++ + case graph.EdgeHandlesRoute: + nHandles++ + } + } + require.Equal(t, 1, nProvides, "expected 1 EdgeProvides for the provider contract") + require.Equal(t, 1, nConsumes, "expected 1 EdgeConsumes for the consumer contract") + require.Equal(t, 1, nHandles, "expected 1 EdgeHandlesRoute for the HTTP provider") +} + +// TestCommitContracts_NoBulkLoader_FallsBackToAddBatch asserts that +// when the backend does not implement graph.BulkLoader (the +// in-memory *graph.Graph case) commitContracts still issues a +// single AddBatch — not the per-row AddNode / AddEdge writes — and +// does not attempt to call BeginBulkLoad / FlushBulk. +func TestCommitContracts_NoBulkLoader_FallsBackToAddBatch(t *testing.T) { + g := graph.New() + require.NotImplements(t, (*graph.BulkLoader)(nil), graph.Store(g)) + + g.AddNode(&graph.Node{ + ID: "pkg/foo.go::Handler.List", + Kind: graph.KindMethod, + Name: "List", + FilePath: "pkg/foo.go", + Language: "go", + }) + + idx := New(g, parser.NewRegistry(), config.Default().Index, zap.NewNop()) + + reg := contracts.NewRegistry() + reg.Add(contracts.Contract{ + ID: "http::GET::/v1/items", + Type: contracts.ContractHTTP, + Role: contracts.RoleProvider, + SymbolID: "pkg/foo.go::Handler.List", + FilePath: "pkg/foo.go", + Line: 42, + }) + + idx.commitContracts(reg) + + require.NotNil(t, g.GetNode("http::GET::/v1/items")) + out := g.GetOutEdges("pkg/foo.go::Handler.List") + var nProvides, nHandles int + for _, e := range out { + switch e.Kind { + case graph.EdgeProvides: + nProvides++ + case graph.EdgeHandlesRoute: + nHandles++ + } + } + require.Equal(t, 1, nProvides) + require.Equal(t, 1, nHandles) +} + +// TestExtractGoModContracts_UsesAddBatch asserts that go.mod +// dependency-contract emission goes through a single AddBatch 
+// call (with the bulk path engaged when the backend supports it) +// instead of the per-row AddNode loop that previously did one +// round-trip per dependency on the on-disk backend. +func TestExtractGoModContracts_UsesAddBatch(t *testing.T) { + dir := t.TempDir() + goMod := []byte(`module example.com/test + +go 1.22 + +require ( + github.com/dep/one v1.0.0 + github.com/dep/two v0.5.0 +) +`) + require.NoError(t, os.WriteFile(filepath.Join(dir, "go.mod"), goMod, 0o644)) + + g := newRecordingBulkGraph() + idx := New(g, parser.NewRegistry(), config.Default().Index, zap.NewNop()) + idx.rootPath = dir + + reg := contracts.NewRegistry() + idx.extractGoModContracts(reg) + + require.Contains(t, g.calls, "AddBatch", + "extractGoModContracts must emit dep nodes via a single AddBatch") + require.Zero(t, g.addNode.Load(), "no per-row AddNode calls expected") +} diff --git a/internal/indexer/dataflow.go b/internal/indexer/dataflow.go index c8c7679d..69554432 100644 --- a/internal/indexer/dataflow.go +++ b/internal/indexer/dataflow.go @@ -51,11 +51,77 @@ func (idx *Indexer) materializeDataflowParams() { } } +// materializeDataflowParamsForFile is the single-file equivalent of +// materializeDataflowParams, used on the incremental (fsnotify / +// edit_file) re-index path so a one-line edit doesn't scan the whole +// edge set. fileEdges is the file's freshly-extracted edge slice +// (result.Edges from indexFile); only its From endpoints are read, so +// stale To/From values from before resolution don't matter. +// +// A file's arg_of / returns_to From is NOT always a node in the file, +// so node membership alone is insufficient. Two From classes exist: +// - file nodes: returns_to's From is the caller function, and an +// arg_of whose argument is a bare in-scope identifier has its From +// rewritten by the resolver to that local/param — GetFileNodes +// covers both. 
+// - synthetic ids: arg_of for a selector (obj.Field), package- +// qualified (pkg.V), global, or nested-call (f(g())) argument keeps +// a synthetic `unresolved::` / `external::` From that never becomes +// a file node. The resolver leaves these untouched, so the id the +// extractor emitted (still present in fileEdges) is the id in the +// graph. +// +// Probing the union of both, then keeping only edges whose FilePath is +// this file, yields exactly the arg_of+returns_to set the whole-graph +// pass would touch for it — faithful, not approximate. Each rewrite +// needs only the edge plus a targeted callee lookup (paramNodeAtPosition +// / findCallTarget). The batch path (Resolver.ResolveAll) still runs the +// whole-graph variant once, where amortising one scan over many files +// is the right trade. +func (idx *Indexer) materializeDataflowParamsForFile(graphPath string, fileEdges []*graph.Edge) { + g := idx.graph + fromSet := make(map[string]struct{}) + for _, n := range g.GetFileNodes(graphPath) { + if n != nil && n.ID != "" { + fromSet[n.ID] = struct{}{} + } + } + for _, e := range fileEdges { + if e != nil && (e.Kind == graph.EdgeArgOf || e.Kind == graph.EdgeReturnsTo) && e.From != "" { + fromSet[e.From] = struct{}{} + } + } + if len(fromSet) == 0 { + return + } + froms := make([]string, 0, len(fromSet)) + for id := range fromSet { + froms = append(froms, id) + } + // A synthetic From can be shared across files, so restrict the rewrite + // to edges this file actually emitted: every arg_of / returns_to edge + // carries its call-site FilePath, so the filter keeps the set exactly + // the file's own. 
+ for _, edges := range g.GetOutEdgesByNodeIDs(froms) { + for _, e := range edges { + if e == nil || e.FilePath != graphPath { + continue + } + switch e.Kind { + case graph.EdgeArgOf: + rewriteArgOf(g, e) + case graph.EdgeReturnsTo: + rewriteReturnsTo(g, e) + } + } + } +} + // rewriteArgOf walks the resolved callee's incoming param_of edges // and lifts the edge target from the function node to the param // node at the recorded position. Edges that already point at a // param node are left alone. -func rewriteArgOf(g *graph.Graph, e *graph.Edge) { +func rewriteArgOf(g graph.Store, e *graph.Edge) { if e == nil || e.Meta == nil { return } @@ -83,7 +149,7 @@ func rewriteArgOf(g *graph.Graph, e *graph.Edge) { // rewriteReturnsTo lifts the placeholder From by joining on the // resolved EdgeCalls edge from the same caller and line. -func rewriteReturnsTo(g *graph.Graph, e *graph.Edge) { +func rewriteReturnsTo(g graph.Store, e *graph.Edge) { if e == nil || e.Meta == nil { return } @@ -112,7 +178,7 @@ func rewriteReturnsTo(g *graph.Graph, e *graph.Edge) { // unresolved target string so we don't lift to the wrong call when // two calls live on the same line. Falls back to the first match // otherwise. -func findCallTarget(g *graph.Graph, callerID string, line int, calleeText string) string { +func findCallTarget(g graph.Store, callerID string, line int, calleeText string) string { out := g.GetOutEdges(callerID) var fallback string for _, e := range out { @@ -163,7 +229,7 @@ func callTargetMatches(call *graph.Edge, calleeText string) bool { // paramNodeAtPosition returns the param node ID with the recorded // position attached to ownerID via EdgeParamOf. 
-func paramNodeAtPosition(g *graph.Graph, ownerID string, pos int) string { +func paramNodeAtPosition(g graph.Store, ownerID string, pos int) string { in := g.GetInEdges(ownerID) for _, e := range in { if e.Kind != graph.EdgeParamOf { diff --git a/internal/indexer/dataflow_scoped_equiv_test.go b/internal/indexer/dataflow_scoped_equiv_test.go new file mode 100644 index 00000000..28025e32 --- /dev/null +++ b/internal/indexer/dataflow_scoped_equiv_test.go @@ -0,0 +1,401 @@ +package indexer + +import ( + "os" + "path/filepath" + "sort" + "strconv" + "strings" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +// TestMaterializeDataflowParamsForFile_EquivalentToWholeGraph proves the +// correctness claim behind the scoped per-file dataflow materialisation: +// materializeDataflowParamsForFile, run once per file, rewrites EXACTLY +// the same EdgeArgOf / EdgeReturnsTo edges — to the same (From, To, Kind) +// tuples — as the whole-graph materializeDataflowParams does in a single +// AllEdges scan. +// +// Why this holds (the invariant under test): returns_to's From is the +// enclosing caller function (a file node), while arg_of's From is the +// argument's source — a file local for a bare in-scope identifier, but a +// synthetic `unresolved::` id for selector / package-qualified / global / +// nested-call arguments, which is NOT a file node. The scoped pass +// therefore probes the union of (the file's nodes) and (the synthetic +// From ids the file's freshly-extracted edges carry), then keeps only +// edges whose FilePath is this file — exactly the arg_of+returns_to set +// the whole-graph pass would touch for it. The fixture below exercises +// all four argument shapes so the synthetic-From cases are covered. 
+// +// Method: build ONE resolved-but-not-yet-materialised graph from a small +// multi-file Go fixture (a caller file that calls a callee in another +// file, passing a parameter as an argument and assigning the return +// value), deep-clone it into two byte-identical graphs, then: +// +// (a) run materializeDataflowParams() once on gGlobal +// (b) run materializeDataflowParamsForFile(path) for each file on gScoped +// +// and assert the arg_of+returns_to {From,To,Kind} tuple sets are +// IDENTICAL. Cloning (not two independent indexings) removes any +// node-id / ordering nondeterminism, so any divergence is the scoping +// logic, not the build. +func TestMaterializeDataflowParamsForFile_EquivalentToWholeGraph(t *testing.T) { + dir := t.TempDir() + + // callee.go: a function with a declared parameter and a return value. + // The param node gives rewriteArgOf a #param: target to lift the + // arg_of edge onto; the return value gives the caller a returns_to + // edge to rewrite onto the resolved callee. + require.NoError(t, os.MkdirAll(filepath.Join(dir, "sink"), 0o755)) + writeFile(t, filepath.Join(dir, "sink", "callee.go"), `package sink + +// Transform consumes payload and returns a derived value. The declared +// parameter is what rewriteArgOf lifts an arg_of edge onto. +func Transform(payload string) string { + return payload + "!" +} +`) + + // caller.go: calls sink.Transform passing its own parameter as the + // argument (so arg_of's From is a dataflow node, not a literal) and + // assigns the return value (so returns_to is emitted). Both edges are + // anchored to nodes in THIS file. 
+ writeFile(t, filepath.Join(dir, "caller.go"), `package main + +import "fmt" + +import "`+goModName+`/sink" + +var GlobalCfg = "cfg" + +type Box struct{ Payload string } + +func Drive(input string, b Box) { + out := sink.Transform(input) // bare in-scope arg: From resolves to a file local + fmt.Println(out) // arg_of(out) + returns_to + sink.Transform(b.Payload) // selector arg: From = synthetic unresolved::*.Payload + sink.Transform(GlobalCfg) // global arg: From = synthetic unresolved::GlobalCfg + sink.Transform(echo(input)) // nested-call arg: From = synthetic unresolved::echo +} + +func echo(s string) string { return s } +`) + + // A go.mod so the cross-file import resolves to a real callee node + // (resolver.ResolveAll lifts unresolved::Transform → the sink node). + writeFile(t, filepath.Join(dir, "go.mod"), "module "+goModName+"\n\ngo 1.22\n") + + // Build ONE raw graph: index every file WITHOUT the per-file dataflow + // pass, then run the cross-file resolver so unresolved:: call targets + // are lifted — but stop short of any materialisation. This is exactly + // the state both materialise passes are designed to consume. + gRaw := graph.New() + idx := newTestIndexer(gRaw) + files := goFilesUnder(t, dir) + require.NotEmpty(t, files) + for _, f := range files { + require.NoError(t, idx.IndexFileNoResolve(f)) + } + idx.resolver.ResolveAll() + + // Sanity: the fixture must actually emit the edges we claim to test. + // If it doesn't, an "equivalent" result is vacuously true and proves + // nothing — fail loudly instead. 
+ preArg, preRet := countKinds(gRaw) + require.Greaterf(t, preArg, 0, + "fixture produced no EdgeArgOf edges; nothing to materialise (edges: %s)", dumpDataflow(gRaw)) + require.Greaterf(t, preRet, 0, + "fixture produced no EdgeReturnsTo edges; nothing to materialise (edges: %s)", dumpDataflow(gRaw)) + // Guard against a vacuous pass: the fixture MUST produce at least one + // arg_of edge whose From is a synthetic (unresolved::/external::) id — + // the selector / global / nested-call shape a node-membership scope + // misses. This is the exact regression the scoped pass must handle, so + // fail loudly if the fixture stops exercising it. + require.Truef(t, hasSyntheticArgFrom(gRaw), + "fixture produced no synthetic-From arg_of edge; the regression case is not exercised (edges: %s)", dumpDataflow(gRaw)) + + // Two byte-identical clones of the raw graph. + gGlobal := cloneGraph(gRaw) + gScoped := cloneGraph(gRaw) + require.Equal(t, dataflowTupleSet(gRaw), dataflowTupleSet(gGlobal), + "clone must reproduce the raw graph's dataflow edges before any pass runs") + require.Equal(t, dataflowTupleSet(gRaw), dataflowTupleSet(gScoped), + "clone must reproduce the raw graph's dataflow edges before any pass runs") + + // (a) whole-graph pass on gGlobal. + idxGlobal := newTestIndexer(gGlobal) + idxGlobal.materializeDataflowParams() + + // (b) scoped per-file pass on gScoped — once per file, mirroring the + // incremental re-index path that calls it after ResolveFile. 
+ idxScoped := newTestIndexer(gScoped) + for _, gp := range graphFilePaths(gScoped) { + idxScoped.materializeDataflowParamsForFile(gp, fileEdgesOf(gScoped, gp)) + } + + globalSet := dataflowTupleSet(gGlobal) + scopedSet := dataflowTupleSet(gScoped) + + // The whole point: a rewrite must have actually occurred (at least one + // arg_of lifted to a #param: target, at least one returns_to lifted to + // the resolved callee), otherwise both sets equalling the raw set + // would pass trivially without exercising the rewrite logic. + require.Truef(t, rewriteOccurred(gGlobal), + "whole-graph pass performed no rewrite; test would be vacuous (edges: %s)", dumpDataflow(gGlobal)) + + if globalSet != scopedSet { + t.Fatalf("scoped per-file dataflow materialisation diverged from the whole-graph pass\n%s", + diffTupleSets(globalSet, scopedSet)) + } +} + +const goModName = "dataflowfixture" + +// goFilesUnder returns absolute paths to every .go file under dir, sorted +// for determinism. +func goFilesUnder(t *testing.T, dir string) []string { + t.Helper() + var out []string + require.NoError(t, filepath.WalkDir(dir, func(path string, d os.DirEntry, err error) error { + if err != nil { + return err + } + if d.IsDir() { + return nil + } + if strings.HasSuffix(path, ".go") { + out = append(out, path) + } + return nil + })) + sort.Strings(out) + return out +} + +// graphFilePaths returns the distinct file-node paths in the graph +// (the keys GetFileNodes / materializeDataflowParamsForFile accept), +// sorted for determinism. 
+func graphFilePaths(g graph.Store) []string { + seen := map[string]struct{}{} + for _, n := range g.AllNodes() { + if n == nil || n.FilePath == "" { + continue + } + seen[n.FilePath] = struct{}{} + } + out := make([]string, 0, len(seen)) + for p := range seen { + out = append(out, p) + } + sort.Strings(out) + return out +} + +// fileEdgesOf returns the edges the given file emitted, matched by the +// edge's own FilePath — the test stand-in for indexFile's result.Edges, +// from which materializeDataflowParamsForFile reads From endpoints +// (including the synthetic ids that are not file nodes). +func fileEdgesOf(g graph.Store, filePath string) []*graph.Edge { + var out []*graph.Edge + for _, e := range g.AllEdges() { + if e != nil && e.FilePath == filePath { + out = append(out, e) + } + } + return out +} + +// dataflowTupleSet renders the EdgeArgOf + EdgeReturnsTo edges as a sorted, +// newline-joined set of "Kind|From|To" tuples. Two graphs with an equal +// set are indistinguishable for the dataflow edges this pass owns. +func dataflowTupleSet(g graph.Store) string { + var lines []string + for _, e := range g.AllEdges() { + if e == nil { + continue + } + if e.Kind != graph.EdgeArgOf && e.Kind != graph.EdgeReturnsTo { + continue + } + lines = append(lines, string(e.Kind)+"|"+e.From+"|"+e.To) + } + sort.Strings(lines) + return strings.Join(lines, "\n") +} + +// countKinds counts arg_of and returns_to edges in the graph. 
+func countKinds(g graph.Store) (argOf, returnsTo int) { + for _, e := range g.AllEdges() { + if e == nil { + continue + } + switch e.Kind { + case graph.EdgeArgOf: + argOf++ + case graph.EdgeReturnsTo: + returnsTo++ + } + } + return +} + +// rewriteOccurred reports whether the materialise pass actually moved an +// edge: an arg_of now points at a #param: node, or a returns_to no longer +// originates from an unresolved/placeholder caller (its From was lifted to +// the resolved callee, observable as a From that is itself the To of a +// resolved EdgeArgOf's owner — pragmatically we detect the arg_of lift, +// which is unambiguous). +func rewriteOccurred(g graph.Store) bool { + for _, e := range g.AllEdges() { + if e == nil { + continue + } + if e.Kind == graph.EdgeArgOf && strings.Contains(e.To, "#param:") { + return true + } + } + return false +} + +// hasSyntheticArgFrom reports whether any arg_of edge's From is a +// synthetic placeholder (unresolved::/external::) rather than a real file +// node — the shape that a node-membership-only scope would skip. +func hasSyntheticArgFrom(g graph.Store) bool { + for _, e := range g.AllEdges() { + if e == nil || e.Kind != graph.EdgeArgOf { + continue + } + if strings.HasPrefix(e.From, "unresolved::") || strings.HasPrefix(e.From, "external::") { + return true + } + } + return false +} + +// dumpDataflow renders the arg_of/returns_to edges (with the Meta keys the +// rewrites read) for failure diagnostics. 
+func dumpDataflow(g graph.Store) string { + var lines []string + for _, e := range g.AllEdges() { + if e == nil || (e.Kind != graph.EdgeArgOf && e.Kind != graph.EdgeReturnsTo) { + continue + } + lines = append(lines, string(e.Kind)+" "+e.From+" -> "+e.To+ + " meta{arg_position="+metaVal(e.Meta, "arg_position")+ + " returns_to_call="+metaVal(e.Meta, "returns_to_call")+ + " call_line="+metaVal(e.Meta, "call_line")+ + " callee_target="+metaVal(e.Meta, "callee_target")+"}") + } + sort.Strings(lines) + return "\n " + strings.Join(lines, "\n ") +} + +func metaVal(m map[string]any, k string) string { + if m == nil { + return "" + } + v, ok := m[k] + if !ok { + return "" + } + switch x := v.(type) { + case string: + return x + case bool: + if x { + return "true" + } + return "false" + case int: + return strconv.Itoa(x) + case int64: + return strconv.Itoa(int(x)) + case float64: + return strconv.Itoa(int(x)) + default: + return "?" + } +} + +// diffTupleSets renders a unified line-diff of two sorted tuple sets. 
+func diffTupleSets(global, scoped string) string { + g := map[string]struct{}{} + for _, l := range strings.Split(global, "\n") { + if l != "" { + g[l] = struct{}{} + } + } + s := map[string]struct{}{} + for _, l := range strings.Split(scoped, "\n") { + if l != "" { + s[l] = struct{}{} + } + } + var onlyGlobal, onlyScoped []string + for l := range g { + if _, ok := s[l]; !ok { + onlyGlobal = append(onlyGlobal, l) + } + } + for l := range s { + if _, ok := g[l]; !ok { + onlyScoped = append(onlyScoped, l) + } + } + sort.Strings(onlyGlobal) + sort.Strings(onlyScoped) + var b strings.Builder + b.WriteString("only in WHOLE-GRAPH pass (missing from scoped):\n") + for _, l := range onlyGlobal { + b.WriteString(" - " + l + "\n") + } + b.WriteString("only in SCOPED pass (missing from whole-graph):\n") + for _, l := range onlyScoped { + b.WriteString(" + " + l + "\n") + } + return b.String() +} + +// cloneGraph builds a fresh in-memory graph that is structurally identical +// to src, deep-copying every node and edge (including Meta) so a pass run +// on the clone cannot mutate src or the sibling clone. 
+func cloneGraph(src graph.Store) graph.Store { + dst := graph.New() + srcNodes := src.AllNodes() + srcEdges := src.AllEdges() + nodes := make([]*graph.Node, 0, len(srcNodes)) + for _, n := range srcNodes { + if n == nil { + continue + } + nc := *n + nc.Meta = cloneMeta(n.Meta) + nodes = append(nodes, &nc) + } + edges := make([]*graph.Edge, 0, len(srcEdges)) + for _, e := range srcEdges { + if e == nil { + continue + } + ec := *e + ec.Meta = cloneMeta(e.Meta) + edges = append(edges, &ec) + } + dst.AddBatch(nodes, edges) + return dst +} + +func cloneMeta(m map[string]any) map[string]any { + if m == nil { + return nil + } + c := make(map[string]any, len(m)) + for k, v := range m { + c[k] = v + } + return c +} diff --git a/internal/indexer/dataflow_test.go b/internal/indexer/dataflow_test.go index deb223aa..25293959 100644 --- a/internal/indexer/dataflow_test.go +++ b/internal/indexer/dataflow_test.go @@ -14,7 +14,7 @@ import ( // indexAll indexes a single-file Go fixture and runs the global // resolve + dataflow materialisation pass. Returns the graph for // assertions. -func indexAll(t *testing.T, src string) *graph.Graph { +func indexAll(t *testing.T, src string) graph.Store { t.Helper() dir := t.TempDir() require.NoError(t, os.WriteFile(filepath.Join(dir, "main.go"), []byte(src), 0o644)) @@ -28,7 +28,7 @@ func indexAll(t *testing.T, src string) *graph.Graph { } // findEdges returns all edges matching the predicate. 
-func findEdges(g *graph.Graph, kind graph.EdgeKind, match func(*graph.Edge) bool) []*graph.Edge { +func findEdges(g graph.Store, kind graph.EdgeKind, match func(*graph.Edge) bool) []*graph.Edge { var out []*graph.Edge for _, e := range g.AllEdges() { if e.Kind != kind { @@ -172,7 +172,7 @@ func Driver(z int) int { } } -func findFuncID(t *testing.T, g *graph.Graph, name string) string { +func findFuncID(t *testing.T, g graph.Store, name string) string { t.Helper() candidates := g.FindNodesByName(name) for _, n := range candidates { @@ -184,7 +184,7 @@ func findFuncID(t *testing.T, g *graph.Graph, name string) string { return "" } -func dumpAllEdges(g *graph.Graph) string { +func dumpAllEdges(g graph.Store) string { var b strings.Builder for _, e := range g.AllEdges() { b.WriteString(string(e.Kind)) diff --git a/internal/indexer/di_contracts.go b/internal/indexer/di_contracts.go index 6447eb53..11592be1 100644 --- a/internal/indexer/di_contracts.go +++ b/internal/indexer/di_contracts.go @@ -36,15 +36,17 @@ func (idx *Indexer) extractDIContracts(reg *contracts.Registry) { var discovered []contracts.Contract if idx.repoPrefix != "" { - // Multi-repo: walk only this repo's outgoing edges. - for _, n := range idx.graph.GetRepoNodes(idx.repoPrefix) { - for _, e := range idx.graph.GetOutEdges(n.ID) { - c, ok := diContractFromEdge(e) - if !ok { - continue - } - discovered = append(discovered, c) + // Multi-repo: walk only this repo's outgoing edges via a + // single backend query. The previous GetRepoNodes × + // GetOutEdges nested walk was O(repo_nodes) per-node round- + // trips on disk backends — at ~68k repo nodes that meant + // 68k backend queries per pass on a disk backend. + for _, e := range idx.graph.GetRepoEdges(idx.repoPrefix) { + c, ok := diContractFromEdge(e) + if !ok { + continue } + discovered = append(discovered, c) } } else { // Single-repo: every edge belongs to this repo. 
@@ -96,10 +98,11 @@ func (idx *Indexer) linkSpringBeans() { } if idx.repoPrefix != "" { - for _, n := range idx.graph.GetRepoNodes(idx.repoPrefix) { - for _, e := range idx.graph.GetOutEdges(n.ID) { - collectBean(e) - } + // Single backend query instead of one GetOutEdges per + // repo node — see extractDIContracts above for the round- + // trip math. + for _, e := range idx.graph.GetRepoEdges(idx.repoPrefix) { + collectBean(e) } } else { for _, e := range idx.graph.AllEdges() { diff --git a/internal/indexer/diffusion_test.go b/internal/indexer/diffusion_test.go index b72702da..6db42951 100644 --- a/internal/indexer/diffusion_test.go +++ b/internal/indexer/diffusion_test.go @@ -12,7 +12,7 @@ import ( // semanticallyRelatedEdges collects every EdgeSemanticallyRelated edge // in the graph — the diffusion-pass output surface. -func semanticallyRelatedEdges(g *graph.Graph) []*graph.Edge { +func semanticallyRelatedEdges(g graph.Store) []*graph.Edge { var out []*graph.Edge for _, e := range g.AllEdges() { if e.Kind == graph.EdgeSemanticallyRelated { @@ -24,7 +24,7 @@ func semanticallyRelatedEdges(g *graph.Graph) []*graph.Edge { // addFnNode registers a bare function node so diffuseSimilarityEdges // has real endpoints to attach edges to. -func addFnNode(g *graph.Graph, id string) { +func addFnNode(g graph.Store, id string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindFunction, Name: id, FilePath: id, StartLine: 1, Language: "go", @@ -169,7 +169,7 @@ func TestDiffuseSimilarityEdges_Chain(t *testing.T) { // diffusedScoreFor returns the similarity carried by the directed // semantically_related edge from→to, and whether such an edge exists. 
-func diffusedScoreFor(g *graph.Graph, from, to string) (float64, bool) { +func diffusedScoreFor(g graph.Store, from, to string) (float64, bool) { for _, e := range semanticallyRelatedEdges(g) { if e.From == from && e.To == to { return e.Meta["similarity"].(float64), true @@ -357,7 +357,7 @@ func TestDetectClonesAndEmitEdges_DiffusionWiring(t *testing.T) { Meta: map[string]any{cloneSigMetaKey: encAB}, }) - stats := detectClonesAndEmitEdges(g, 0) + stats := detectClonesAndEmitEdges(g, "", 0) // A, B, C all share a signature: three direct clone pairs, so the // only diffusable pairs are themselves direct clones — diffusion // correctly emits nothing (partition invariant). diff --git a/internal/indexer/grpc_resolve_test.go b/internal/indexer/grpc_resolve_test.go index 44568451..9b942e16 100644 --- a/internal/indexer/grpc_resolve_test.go +++ b/internal/indexer/grpc_resolve_test.go @@ -12,7 +12,7 @@ import ( ) // outEdgeTo returns the first out-edge of fromID whose target is toID. -func outEdgeTo(g *graph.Graph, fromID, toID string) *graph.Edge { +func outEdgeTo(g graph.Store, fromID, toID string) *graph.Edge { for _, e := range g.GetOutEdges(fromID) { if e.To == toID { return e diff --git a/internal/indexer/incremental_reindex_test.go b/internal/indexer/incremental_reindex_test.go index 1f3daae0..c9ca51db 100644 --- a/internal/indexer/incremental_reindex_test.go +++ b/internal/indexer/incremental_reindex_test.go @@ -87,7 +87,7 @@ func Gone() {} // of its structural identity (node identities + edge triples). Two // graphs with an equal projection are byte-identical for every query // the engine can answer. 
-func canonicalGraph(g *graph.Graph) string { +func canonicalGraph(g graph.Store) string { var lines []string for _, n := range g.AllNodes() { if n == nil { diff --git a/internal/indexer/incremental_resolve_test.go b/internal/indexer/incremental_resolve_test.go new file mode 100644 index 00000000..fa9ac886 --- /dev/null +++ b/internal/indexer/incremental_resolve_test.go @@ -0,0 +1,124 @@ +package indexer + +import ( + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + "github.com/zzet/gortex/internal/config" + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/search" +) + +// fnNodeID returns the function/method node ID named `name` defined in +// graph file `file`, failing the test if it is absent. +func fnNodeID(t *testing.T, g graph.Store, file, name string) string { + t.Helper() + for _, n := range g.GetFileNodes(file) { + if n.Name == name && (n.Kind == graph.KindFunction || n.Kind == graph.KindMethod) { + return n.ID + } + } + t.Fatalf("function %q in %s not found", name, file) + return "" +} + +// callTargetFrom returns the To of the (single) EdgeCalls edge leaving +// node `fromID`. +func callTargetFrom(t *testing.T, g graph.Store, fromID string) string { + t.Helper() + for _, e := range g.GetOutEdges(fromID) { + if e.Kind == graph.EdgeCalls { + return e.To + } + } + t.Fatalf("no call edge from %s", fromID) + return "" +} + +// TestIncrementalReindex_PreservesIncomingCallerEdges is the proof of +// the reverse-resolution + un-resolve fix. When file A defines Foo and +// file B calls it, B's call edge resolves to A.Foo. Re-indexing or +// deleting A must NOT silently drop B's edge: +// +// - re-indexing A (Foo unchanged): restubIncomingRefs re-stubs B's +// edge to unresolved::Foo before A is evicted, then +// ResolveIncomingForFile rebinds it to A's fresh Foo — so B's caller +// edge survives a definition edit. 
+// - deleting A: B's edge survives as an unresolved::Foo stub (the +// correct state for a call to a now-missing symbol), not dropped. +// - re-creating A: ResolveIncomingForFile rebinds the pending stub. +// +// Against the pre-fix code, step (1) FAILS: evicting A drops B's +// incoming caller edge wholesale and ResolveFile(A) only touches A's +// outgoing edges, so get_callers(Foo) goes blank until a cold reindex. +func TestIncrementalReindex_PreservesIncomingCallerEdges(t *testing.T) { + dir := t.TempDir() + aPath := filepath.Join(dir, "a.go") + bPath := filepath.Join(dir, "b.go") + writeFile(t, aPath, "package p\n\nfunc Foo() {}\n") + writeFile(t, bPath, "package p\n\nfunc Bar() { Foo() }\n") + + g := graph.New() + idx := New(g, newTestRegistry(), config.IndexConfig{Workers: 1}, zap.NewNop()) + idx.search = search.NewBM25() + idx.SetRootPath(dir) + _, err := idx.IndexCtx(testCtx(), dir) + require.NoError(t, err) + + fooID := fnNodeID(t, g, "a.go", "Foo") + barID := fnNodeID(t, g, "b.go", "Bar") + + require.Equal(t, fooID, callTargetFrom(t, g, barID), + "baseline: Bar's call must resolve to Foo") + + // (1) Re-index the DEFINITION file with Foo unchanged. The caller + // edge in b.go must survive. + require.NoError(t, idx.IndexFile(aPath)) + assert.Equal(t, fooID, callTargetFrom(t, g, barID), + "re-indexing Foo's own file must not drop Bar's caller edge") + + // (2) Delete the definition. Bar's edge must revert to an unresolved + // stub, not vanish. + idx.EvictFile(aPath) + deletedTarget := callTargetFrom(t, g, barID) + assert.True(t, graph.IsUnresolvedTarget(deletedTarget), + "deleting Foo must leave Bar's call as an unresolved stub, not drop it") + assert.Equal(t, "Foo", graph.UnresolvedName(deletedTarget), + "the re-stubbed target must carry Foo's name") + + // (3) Re-create the definition. The pending stub must rebind. 
+ require.NoError(t, idx.IndexFile(aPath)) + rebound := fnNodeID(t, g, "a.go", "Foo") + assert.Equal(t, rebound, callTargetFrom(t, g, barID), + "re-adding Foo must rebind Bar's pending caller edge via the reverse pass") +} + +// TestEvictFile_DropsEnrichmentSidecars proves the change-A eviction +// cascade: deleting a file drops its nodes' churn/coverage/blame +// sidecar rows, leaving no orphan enrichment. +func TestEvictFile_DropsEnrichmentSidecars(t *testing.T) { + idx, _ := newToggleIndexer(t) + dir := t.TempDir() + idx.SetRootPath(dir) + g := idx.graph + + g.AddBatch([]*graph.Node{ + {ID: "main.fk", Kind: graph.KindFile, Name: "main.fk", FilePath: "main.fk"}, + {ID: "main.fk::Foo", Kind: graph.KindFunction, Name: "Foo", FilePath: "main.fk"}, + }, nil) + require.NoError(t, g.(graph.ChurnEnrichmentWriter).BulkSetChurn("", []graph.ChurnEnrichment{{NodeID: "main.fk::Foo", CommitCount: 3}})) + require.NoError(t, g.(graph.CoverageEnrichmentWriter).BulkSetCoverage("", []graph.CoverageEnrichment{{NodeID: "main.fk::Foo", CoveragePct: 50}})) + require.NoError(t, g.(graph.BlameEnrichmentWriter).BulkSetBlame("", []graph.BlameEnrichment{{NodeID: "main.fk::Foo", Email: "x@y"}})) + + require.NotEmpty(t, g.(graph.ChurnEnrichmentReader).ChurnRows(""), "churn seeded") + + idx.EvictFile("main.fk") + + assert.Empty(t, g.(graph.ChurnEnrichmentReader).ChurnRows(""), "churn rows must be evicted with the file") + assert.Empty(t, g.(graph.CoverageEnrichmentReader).CoverageRows(""), "coverage rows must be evicted") + assert.Empty(t, g.(graph.BlameEnrichmentReader).BlameRows(""), "blame rows must be evicted") +} diff --git a/internal/indexer/indexer.go b/internal/indexer/indexer.go index 1a9e6e52..f7a332bd 100644 --- a/internal/indexer/indexer.go +++ b/internal/indexer/indexer.go @@ -75,9 +75,18 @@ type IndexResult struct { // (MaxExtractMillis). Each is recorded in the graph as a synthetic // file node carrying skipped_due_to_size / skipped_due_to_timeout // telemetry. 
Zero unless one of those caps is set. - SkippedFiles int `json:"skipped_files,omitempty"` - DurationMs int64 `json:"duration_ms"` - Errors []IndexError `json:"errors,omitempty"` + SkippedFiles int `json:"skipped_files,omitempty"` + // DeletedFileCount is the number of previously-indexed files that + // were evicted this pass because they no longer exist on disk (only + // populated by IncrementalReindex). Together with StaleFileCount it + // lets a batch caller — the daemon warmup loop in particular — decide + // whether a repo actually changed since the last shutdown: when both + // are zero across every repo, the persisted graph already carries + // every resolved / derived edge and the global resolution passes can + // be skipped entirely (the warm-restart fast path). + DeletedFileCount int `json:"deleted_file_count,omitempty"` + DurationMs int64 `json:"duration_ms"` + Errors []IndexError `json:"errors,omitempty"` } // EdgeSanityViolated reports the post-reindex sanity-check failure: an @@ -101,7 +110,14 @@ type IndexError struct { // Indexer walks a repository and populates the graph. type Indexer struct { - graph *graph.Graph + graph graph.Store + // indexCount tracks how many IndexCtx calls this Indexer has + // completed. Gates the cold-start shadow-swap: each per-repo + // Indexer in MultiIndexer is fresh (indexCount==0), so all of + // them take the shadow path regardless of what sibling repos + // have already drained into the shared disk store. Per-repo- + // prefixed stub IDs make the concurrent drains conflict-free. + indexCount atomic.Int32 registry *parser.Registry resolver *resolver.Resolver search search.Backend @@ -273,6 +289,15 @@ type Indexer struct { // absent file produces empty rules and a no-op pass. 
codeownersOnce sync.Once codeownersRules []codeowners.Rule + + // cloneIndex maintains the clone-detection CMS + length-stratified + // LSH live across single-file edits, so a steady-state reindex + // updates EdgeSimilarTo edges in O(edited file) instead of the + // whole-graph detectClonesAndEmitEdges recompute. Constructed empty + // (built=false) — a batch/global clone pass calls Rebuild to seed it, + // after which indexFile drives EvictFuncs/UpdateFuncs. While un-built + // indexFile falls back to the whole-graph pass. + cloneIndex *incrementalCloneIndex } // contractCacheEntry is a cached contract-extraction result for one file. @@ -281,8 +306,12 @@ type contractCacheEntry struct { contracts []contracts.Contract } -// New creates an Indexer. -func New(g *graph.Graph, reg *parser.Registry, cfg config.IndexConfig, logger *zap.Logger) *Indexer { +// New creates an Indexer that writes through the supplied graph.Store. +// Any backend (in-memory, SQLite-on-disk, remote) is acceptable — the +// indexer's mutation paths go through the Store interface methods only, +// so swapping backends is a zero-code-change configuration choice for +// callers. +func New(g graph.Store, reg *parser.Registry, cfg config.IndexConfig, logger *zap.Logger) *Indexer { idx := &Indexer{ graph: g, registry: reg, @@ -291,12 +320,21 @@ func New(g *graph.Graph, reg *parser.Registry, cfg config.IndexConfig, logger *z // corpus sizes can happen in a background goroutine without // racing with concurrent searches. Subsequent reassignments to // idx.search (Hybrid wrap, etc.) should use swap helpers below. - search: search.NewSwappable(search.NewAuto()), + // + // When the backing store implements graph.SymbolSearcher + // (today only store_sqlite), the initial backend is a thin + // adapter that forwards Search to the store's native FTS. 
+ // The in-process Bleve / BM25 build path is then bypassed + // entirely — saving ~100MB heap on a Vscode-scale repo and + // putting search in the same address space as the rest of + // the graph queries. + search: search.NewSwappable(initialSearchBackend(g)), config: cfg, transforms: newTransformPipeline(cfg.Transforms, logger), logger: logger, fileMtimes: make(map[string]int64), contractCache: make(map[string]*contractCacheEntry), + cloneIndex: newIncrementalCloneIndex(), } // Resolve JS/TS imports declared through an npm alias against the // local index. The index is built lazily on first use — the repo @@ -345,6 +383,87 @@ func searchIndexFields(n *graph.Node) []string { return []string{n.Name, n.FilePath, sig} } +// vectorSearcherDelegate is the search.VectorDelegate-shaped +// adapter the indexer hands to VectorBackend.SetDelegate when the +// underlying store implements graph.VectorSearcher. SimilarTo just +// forwards — search.VectorDelegate is defined to return +// graph.VectorHit slices directly, so there's no translation work +// here, just a small struct so the in-process search package +// doesn't depend on graph.VectorSearcher's full surface. +type vectorSearcherDelegate struct { + s graph.VectorSearcher +} + +func (d *vectorSearcherDelegate) SimilarTo(vec []float32, limit int) ([]graph.VectorHit, error) { + if d == nil || d.s == nil { + return nil, nil + } + return d.s.SimilarTo(vec, limit) +} + +// initialSearchBackend picks the search.Backend the indexer wraps +// in its Swappable on construction. When the underlying store +// implements graph.SymbolSearcher (today only store_sqlite), a +// thin adapter routes Search calls through the store's native FTS +// — the in-process BM25 / Bleve build path is bypassed entirely. +// Otherwise falls through to search.NewAuto which picks BM25 for +// small corpora and auto-upgrades to Bleve once the size warrants +// it. 
+func initialSearchBackend(g graph.Store) search.Backend { + if s, ok := g.(graph.SymbolSearcher); ok { + return search.NewSymbolSearcherBackend(s) + } + return search.NewAuto() +} + +// isSymbolSearcherBackend reports whether the swappable's currently +// active backend is the SymbolSearcher adapter. Used to suppress +// the Bleve auto-upgrade goroutine — if the active backend is +// already a native FTS, upgrading to Bleve would re-index the same +// corpus into a parallel in-process Bleve and silently swap it in, +// defeating the FTS path and pinning the ~100MB heap the FTS +// integration was meant to release. +func isSymbolSearcherBackend(b search.Backend) bool { + if b == nil { + return false + } + if sw, ok := b.(*search.Swappable); ok { + b = sw.Inner() + } + _, ok := b.(*search.SymbolSearcherBackend) + return ok +} + +// ftsTokensFor produces the pre-tokenised text the backend FTS path +// indexes. Mirrors searchIndexFields' field selection but joins +// every field through search.Tokenize (camelCase / snake_case / +// path-segment splitter) so the resulting token list matches the +// in-process BM25 corpus contract — the same query produces the +// same recall against either backend. Joined with spaces so the +// downstream COPY FROM sees a single STRING column value. +func ftsTokensFor(n *graph.Node) string { + fields := searchIndexFields(n) + if n.QualName != "" { + // QualName carries the dotted form (`pkg.Sub.Type.Method`) + // that adds qualifier-hop recall ("auth" matching + // "auth.ValidateToken"). searchIndexFields omits it for + // the legacy BM25 path (which folds qual into the + // name-token bag separately), so we add it explicitly here. + fields = append(fields, n.QualName) + } + tokens := make([]string, 0, 16) + for _, f := range fields { + if f == "" { + continue + } + tokens = append(tokens, search.Tokenize(f)...) 
+ } + if len(tokens) == 0 { + return "" + } + return strings.Join(tokens, " ") +} + // shouldIndexForSearch reports whether a node should be added to the // text search index (BM25/Bleve). File and Import nodes are never // searchable symbols. Beyond that, config.SkipSearch filters out @@ -357,6 +476,22 @@ func (idx *Indexer) shouldIndexForSearch(n *graph.Node) bool { if n.Kind == graph.KindFile || n.Kind == graph.KindImport { return false } + // KindLocal nodes are intra-function bindings emitted to satisfy + // rel-table FK constraints on the dataflow edges that target + // locals. They have a real Name (the variable identifier) but + // surfacing them in BM25 would flood every search for common + // names like `err`, `data`, `n`, `i`. Excluded unconditionally. + if n.Kind == graph.KindLocal { + return false + } + // KindBuiltin nodes are language intrinsics (append / len / + // string / int / ...). Surfacing them in name search would + // drown every other hit on common identifiers — agents already + // know `string` / `append`. They remain queryable by kind and + // by ID for the analytics passes that care. + if n.Kind == graph.KindBuiltin { + return false + } // Prose-section nodes are searchable only when prose indexing is // enabled (search.index_prose); the rest of the graph is // unaffected by the toggle. @@ -485,7 +620,7 @@ func (idx *Indexer) upgradeSearchToBleve(snapshot []bleveUpgradeEntry) { } // Graph returns the underlying graph. -func (idx *Indexer) Graph() *graph.Graph { return idx.graph } +func (idx *Indexer) Graph() graph.Store { return idx.graph } // Search returns the search backend. 
func (idx *Indexer) Search() search.Backend { return idx.search } @@ -560,7 +695,7 @@ func (idx *Indexer) RunGlobalGraphPasses(ctx context.Context) { ) } reporter.Report("clone detection pass (global)", 0, 0) - if cs := detectClonesAndEmitEdgesCtx(ctx, idx.graph, idx.cloneThreshold()); cs.Items > 0 { + if cs := detectClonesAndEmitEdgesCtx(ctx, idx.graph, idx.repoPrefix, idx.cloneThreshold()); cs.Items > 0 { idx.logger.Info("clone edges emitted (global)", zap.Int("items", cs.Items), zap.Int("clone_pairs", cs.Pairs), @@ -571,6 +706,12 @@ func (idx *Indexer) RunGlobalGraphPasses(ctx context.Context) { zap.Int("diffused_edges", cs.DiffusedEdges), ) } + // Seed the incremental clone index from the freshly-baselined + // signatures + sidecar so steady-state single-file edits after this + // batch go incremental instead of re-running the whole-graph pass. + if idx.cloneIndex != nil { + idx.cloneIndex.Rebuild(idx.graph, idx.repoPrefix) + } // gRPC stub-call resolution. Runs after InferImplements (the // interface-satisfaction fallback signal depends on its // EdgeImplements edges) and before DetectCrossRepoEdges so a @@ -644,12 +785,16 @@ func (idx *Indexer) RunDeferredPasses(ctx context.Context) { return } reporter := progress.FromContext(ctx) + tphase := time.Now() + var dGoMod, dResolve, dEnrich, dContract time.Duration // Materialise dep:: contract nodes from go.mod BEFORE // ResolveAll so the resolver's import bridge can re-target Go // imports of declared modules to their dep contract node instead // of producing an `external::` stub. idx.extractGoModContracts(idx.pendingContractReg) + dGoMod = time.Since(tphase) + tphase = time.Now() // Per-repo resolver.ResolveAll walks the entire shared graph; with R // repos and E edges that's O(R · E). 
The MultiIndexer batch driver @@ -661,6 +806,8 @@ func (idx *Indexer) RunDeferredPasses(ctx context.Context) { reporter.Report("resolving references", 0, 0) idx.resolver.ResolveAll() } + dResolve = time.Since(tphase) + tphase = time.Now() if idx.semanticMgr != nil && idx.semanticMgr.Enabled() && idx.semanticMgr.HasProviders() { reporter.Report("semantic enrichment", 0, 0) @@ -682,6 +829,9 @@ func (idx *Indexer) RunDeferredPasses(ctx context.Context) { } } + dEnrich = time.Since(tphase) + tphase = time.Now() + reporter.Report("extracting contracts", 0, 0) // extractGoModContracts already ran (see above) so dep nodes // were available during ResolveAll's import-bridge pass. @@ -689,6 +839,13 @@ func (idx *Indexer) RunDeferredPasses(ctx context.Context) { idx.extractDIContracts(idx.pendingContractReg) idx.commitContracts(idx.pendingContractReg) idx.pendingContractReg = nil + dContract = time.Since(tphase) + idx.logger.Info("DEFERRED-TIMING per-repo", + zap.String("repo", idx.repoPrefix), + zap.Duration("gomod", dGoMod), + zap.Duration("resolve", dResolve), + zap.Duration("enrich", dEnrich), + zap.Duration("contract_commit", dContract)) } // RootPath returns the root path used for relative path computation. @@ -1506,7 +1663,7 @@ func (idx *Indexer) Index(root string) (*IndexResult, error) { // is pulled from ctx via progress.FromContext — attach one with // progress.WithReporter to receive stage updates. If no reporter is attached, // stage calls are silently dropped. -func (idx *Indexer) IndexCtx(ctx context.Context, root string) (*IndexResult, error) { +func (idx *Indexer) IndexCtx(ctx context.Context, root string) (result *IndexResult, retErr error) { start := time.Now() reporter := progress.FromContext(ctx) @@ -1584,6 +1741,213 @@ func (idx *Indexer) IndexCtx(ctx context.Context, root string) (*IndexResult, er } reporter.Report("walking files", len(files), len(files)) + // In-memory shadow for cold-start indexing on disk-backed stores. 
+ // Disk backends pay ms-level per-call cost on every read; running + // the resolver against the disk store turns its ~100k+ point + // lookups into many minutes of wall time. Instead, swap idx.graph + // to an in-memory *Graph for the whole IndexCtx pipeline — parse, + // resolve, all subpasses, every per-edge MERGE/MATCH stays in + // memory at nanosecond latency. At the end, dump the final state + // to the disk backend via one BulkLoad cycle, so the disk has the + // post-resolve graph and the bench's query workload runs against + // the persisted state. + // + // Guards: + // - Backend must implement graph.BulkLoader (the on-disk backend opts in). + // - Store must be empty (NodeCount == 0 && EdgeCount == 0). The + // final dump is BulkLoad's INSERT-only fast path — running it + // against a non-empty store would corrupt or duplicate. + // Incremental / re-index flows fall through to the per-call + // AddBatch path against the disk store directly. + // - File count is below the shadow-max threshold (see + // shadowMaxFileCount). Above the threshold the shadow's RAM + // footprint would exceed available memory — Linux / Firefox + // at full scale (~10M+ edges) would push the shadow past + // 20GB. Override with GORTEX_SHADOW_MAX_FILES. + // - The swap happens before the parse worker pool starts and is + // committed before IndexCtx returns. retErr from the named + // return suppresses the commit when the pipeline errored — + // the disk store stays empty rather than capturing partial + // state. + var diskTarget graph.Store + var inMemShadow *graph.Graph + bl, blOK := idx.graph.(graph.BulkLoader) + // Per-Indexer sentinel: each *Indexer is constructed fresh + // (per-repo in MultiIndexer, once in single-repo daemons), so + // "this Indexer has indexed before" is the right question to + // gate the shadow-swap on. 
The legacy gate looked at the + // disk store's NodeCount, but in MultiIndexer the disk store + // holds data from sibling repos that already drained — the + // gate would mis-fire and force the big repo onto the per-row + // path. With per-repo-prefixed stub IDs (internal/graph/stub.go) + // concurrent shadow drains no longer conflict on PRIMARY KEY, + // so disk-non-empty is safe. + firstIndex := idx.indexCount.Load() == 0 + belowShadowMax := len(files) <= shadowMaxFileCount() + preNodes := idx.graph.NodeCount() + preEdges := idx.graph.EdgeCount() + idx.logger.Info("indexer: shadow-swap decision", + zap.String("repo", idx.RepoPrefix()), + zap.Bool("bulk_loader", blOK), + zap.Bool("first_index", firstIndex), + zap.Int("pre_nodes", preNodes), + zap.Int("pre_edges", preEdges), + zap.Int("files", len(files)), + zap.Int("shadow_max_files", shadowMaxFileCount()), + zap.Bool("below_shadow_max", belowShadowMax), + zap.Bool("shadow_taken", blOK && firstIndex && belowShadowMax), + ) + if blOK && firstIndex && belowShadowMax { + // Warm-restart safety. `firstIndex` is a PER-INDEXER sentinel, and + // a fresh per-repo Indexer is constructed on every daemon restart, + // so firstIndex is true on every restart — even when the + // persistent disk store already holds this repo's nodes from a + // prior run. The shadow drain below ends in BulkLoad's INSERT-only + // COPY, which (per this function's own contract) "running against a + // non-empty store would corrupt or duplicate". A duplicate-primary- + // key bulk load against the persisted rows would fail warmup, and + // because the repo's mtimes never get persisted when warmup dies + // first, the failure re-fires on the next restart: a crash loop. + // Evicting the repo's existing rows first makes the bulk load land + // on a clean slate. 
EvictRepo self-guards with a count query, so this is a + // cheap no-op for the genuine first-index cases (true cold start, + // a newly-tracked repo) where the disk store has no rows for this + // prefix. preNodes>0 short-circuits the call entirely on the + // first repo of a cold start (empty store). + if preNodes > 0 { + if n, e := idx.graph.EvictRepo(idx.RepoPrefix()); n > 0 || e > 0 { + idx.logger.Info("indexer: evicted stale repo rows before bulk reload (warm restart)", + zap.String("repo", idx.RepoPrefix()), + zap.Int("nodes", n), zap.Int("edges", e)) + } + } + idx.indexCount.Add(1) + diskTarget = idx.graph + inMemShadow = graph.New() + idx.graph = inMemShadow + // The resolver was constructed at indexer.New with the disk + // Store. Redirect it at the shadow too, otherwise ResolveAll + // reads from the empty disk Store, finds no pending edges, + // and short-circuits — silently disabling every resolver pass + // (module attribution, relative imports, edge in-place + // resolution, …) for any backend that takes the shadow path. + if idx.resolver != nil { + idx.resolver.SetGraph(inMemShadow) + } + defer func() { + if retErr != nil { + idx.graph = diskTarget + if idx.resolver != nil { + idx.resolver.SetGraph(diskTarget) + } + return + } + reporter.Report("persisting bulk graph", 0, 0) + drainStart := time.Now() + shadowNodeCount := inMemShadow.NodeCount() + shadowEdgeCount := inMemShadow.EdgeCount() + idx.logger.Info("indexer: drain start (shadow → disk)", + zap.String("repo", idx.RepoPrefix()), + zap.Int("shadow_nodes", shadowNodeCount), + zap.Int("shadow_edges", shadowEdgeCount), + ) + bl.BeginBulkLoad() + // Drain the shadow shard-by-shard so the indexer's hold on + // the 11-GB Linux-scale graph is released progressively + // instead of pinned until persist returns. 
The drain + // iterators free each shard's node/edge maps as they + // advance, so peak RAM during the persist window is + // roughly the chunk buffer + the backend's working set, + // not full shadow + the disk backend's bulk-COPY buffer. + // + // Collect (id, tokens) for every search-eligible node as + // the drain yields them — feeds the backend's native FTS + // at FlushBulk time when the store implements + // graph.SymbolSearcher. Nodes that fail + // shouldIndexForSearch (KindFile / KindImport / + // KindLocal / KindBuiltin / skip-search lang+kind pairs) + // are excluded so the FTS corpus matches the in-process + // BM25 corpus exactly. + searcher, hasFTS := diskTarget.(graph.SymbolSearcher) + var ftsItems []graph.SymbolFTSItem + if hasFTS { + // Pre-size to the shadow's node count to avoid grow + // churn on a 600k-node Vscode-shape repo. + ftsItems = make([]graph.SymbolFTSItem, 0, inMemShadow.NodeCount()) + } + const persistChunk = 100000 + nodeBuf := make([]*graph.Node, 0, persistChunk) + for n := range inMemShadow.DrainNodes() { + if hasFTS && idx.shouldIndexForSearch(n) { + ftsItems = append(ftsItems, graph.SymbolFTSItem{ + NodeID: n.ID, + Tokens: ftsTokensFor(n), + }) + } + nodeBuf = append(nodeBuf, n) + if len(nodeBuf) >= persistChunk { + diskTarget.AddBatch(nodeBuf, nil) + nodeBuf = nodeBuf[:0] + } + } + if len(nodeBuf) > 0 { + diskTarget.AddBatch(nodeBuf, nil) + nodeBuf = nil + } + edgeBuf := make([]*graph.Edge, 0, persistChunk) + for e := range inMemShadow.DrainEdges() { + edgeBuf = append(edgeBuf, e) + if len(edgeBuf) >= persistChunk { + diskTarget.AddBatch(nil, edgeBuf) + edgeBuf = edgeBuf[:0] + } + } + if len(edgeBuf) > 0 { + diskTarget.AddBatch(nil, edgeBuf) + edgeBuf = nil + } + flushStart := time.Now() + idx.logger.Info("indexer: FlushBulk start", + zap.String("repo", idx.RepoPrefix()), + zap.Duration("drain_elapsed", flushStart.Sub(drainStart)), + ) + if ferr := bl.FlushBulk(); ferr != nil { + retErr = fmt.Errorf("indexer: persist bulk 
graph: %w", ferr) + } + idx.logger.Info("indexer: FlushBulk complete", + zap.String("repo", idx.RepoPrefix()), + zap.Duration("flush_elapsed", time.Since(flushStart)), + zap.Duration("total_drain", time.Since(drainStart)), + zap.Int("nodes", shadowNodeCount), + zap.Int("edges", shadowEdgeCount), + ) + // Build the backend FTS after the bulk load completes so + // CREATE_FTS_INDEX has the full corpus to scan in one + // pass. BulkUpsertSymbolFTS does its own + // extension-install dance, so this is the only place the + // indexer needs to know about SymbolSearcher. + if hasFTS && len(ftsItems) > 0 { + reporter.Report("building symbol fts", 0, 0) + if ferr := searcher.BulkUpsertSymbolFTS(idx.RepoPrefix(), ftsItems); ferr != nil { + idx.logger.Warn("indexer: bulk symbol FTS upsert failed", + zap.Error(ferr)) + } else if ferr := searcher.BuildSymbolIndex(); ferr != nil { + idx.logger.Warn("indexer: backend FTS build failed", + zap.Error(ferr)) + } + reporter.Report("building symbol fts", 1, 1) + } + reporter.Report("persisting bulk graph", 1, 1) + idx.graph = diskTarget + }() + } else if diskTarget == nil && idx.graph.NodeCount() == 0 && idx.graph.EdgeCount() == 0 { + if _, isBulk := idx.graph.(graph.BulkLoader); isBulk && len(files) > shadowMaxFileCount() { + idx.logger.Info("indexer: skipping in-memory shadow above threshold", + zap.Int("files", len(files)), + zap.Int("threshold", shadowMaxFileCount())) + } + } + // Worker pool. 
workers := idx.config.Workers if workers <= 0 { @@ -1623,7 +1987,6 @@ func (idx *Indexer) IndexCtx(ctx context.Context, root string) (*IndexResult, er contractReg := contracts.NewRegistry() var contractMu sync.Mutex - fileCh := make(chan walkedFile, workers*4) var errMu sync.Mutex var errors []IndexError var processed int64 @@ -1631,156 +1994,201 @@ func (idx *Indexer) IndexCtx(ctx context.Context, root string) (*IndexResult, er var skippedByTimeout int64 var skippedByMinified int64 - var wg sync.WaitGroup - for range workers { - wg.Add(1) - go func() { - defer wg.Done() - var localContracts []contracts.Contract - for wf := range fileCh { - path := wf.path - p := atomic.AddInt64(&processed, 1) - if p == 1 || p%parseReportEvery == 0 { - reporter.Report("parsing", int(p), totalFiles) - } + // parseChunk runs the per-file worker pool over the supplied + // slice. Closure over outer state (errors, counters, contract + // registry, parsePool, quarantine) so it can be called multiple + // times — once for the non-streaming path, repeatedly for the + // streaming-flush large-repo path where each call processes a + // bounded slice into a per-chunk in-memory shadow. + parseChunk := func(chunkFiles []walkedFile) { + fileCh := make(chan walkedFile, workers*4) + var wg sync.WaitGroup + for range workers { + wg.Add(1) + go func() { + defer wg.Done() + var localContracts []contracts.Contract + for wf := range fileCh { + path := wf.path + p := atomic.AddInt64(&processed, 1) + if p == 1 || p%parseReportEvery == 0 { + reporter.Report("parsing", int(p), totalFiles) + } - src, err := os.ReadFile(path) - if err != nil { - errMu.Lock() - errors = append(errors, IndexError{FilePath: path, Error: err.Error()}) - errMu.Unlock() - continue - } + src, err := os.ReadFile(path) + if err != nil { + errMu.Lock() + errors = append(errors, IndexError{FilePath: path, Error: err.Error()}) + errMu.Unlock() + continue + } - relPath, _ := filepath.Rel(absRoot, path) - // Reuse the walk-time language. 
The walk's - // effectiveLanguage call already consulted shebang - // bytes via readSniffPrefix (512-byte probe), so a - // re-detect against the full src would change the - // answer only on the vanishingly rare case where a - // language marker lives past byte 512 — and any such - // case is content-sniffing-by-luck rather than spec'd - // behaviour. The fallback below covers the truly - // pathological case where the walk-time language has - // no extractor registered (effectively dead code). - lang := wf.lang - ext, _ := idx.registry.GetByLanguage(lang) - if ext == nil { - if relang, ok := idx.effectiveLanguage(path, src); ok { - lang = relang - ext, _ = idx.registry.GetByLanguage(lang) + relPath, _ := filepath.Rel(absRoot, path) + // Reuse the walk-time language. The walk's + // effectiveLanguage call already consulted shebang + // bytes via readSniffPrefix (512-byte probe), so a + // re-detect against the full src would change the + // answer only on the vanishingly rare case where a + // language marker lives past byte 512 — and any such + // case is content-sniffing-by-luck rather than spec'd + // behaviour. The fallback below covers the truly + // pathological case where the walk-time language has + // no extractor registered (effectively dead code). + lang := wf.lang + ext, _ := idx.registry.GetByLanguage(lang) + if ext == nil { + if relang, ok := idx.effectiveLanguage(path, src); ok { + lang = relang + ext, _ = idx.registry.GetByLanguage(lang) + } + } + if ext == nil { + continue } - } - if ext == nil { - continue - } - // Pre-ingestion transforms: rewrite the bytes before - // extraction (BOM strip, minified-bundle expansion, a - // PDF→markdown command, …). - src = idx.transforms.run(relPath, src) + // Pre-ingestion transforms: rewrite the bytes before + // extraction (BOM strip, minified-bundle expansion, a + // PDF→markdown command, …). 
+ src = idx.transforms.run(relPath, src) - result, skipped, err := idx.extractFile(parsePool, quarantine, path, relPath, lang, ext, src) - if err != nil { - errMu.Lock() - errors = append(errors, IndexError{FilePath: path, Error: err.Error()}) - errMu.Unlock() - } - if result == nil { - continue - } - if skipped && len(result.Nodes) > 0 { - if _, ok := result.Nodes[0].Meta["skipped_due_to_timeout"]; ok { - atomic.AddInt64(&skippedByTimeout, 1) + result, skipped, err := idx.extractFile(parsePool, quarantine, path, relPath, lang, ext, src) + if err != nil { + errMu.Lock() + errors = append(errors, IndexError{FilePath: path, Error: err.Error()}) + errMu.Unlock() } - if _, ok := result.Nodes[0].Meta["skipped_due_to_minified"]; ok { - atomic.AddInt64(&skippedByMinified, 1) + if result == nil { + continue + } + if skipped && len(result.Nodes) > 0 { + if _, ok := result.Nodes[0].Meta["skipped_due_to_timeout"]; ok { + atomic.AddInt64(&skippedByTimeout, 1) + } + if _, ok := result.Nodes[0].Meta["skipped_due_to_minified"]; ok { + atomic.AddInt64(&skippedByMinified, 1) + } } - } - // Append coverage artifacts (todos / licenses / - // ownership) before applyRepoPrefix so they get the - // same multi-repo namespacing treatment as - // language-extractor output. Skipped for quarantined / - // timed-out files — the coverage scanners would re-read - // a source the parser could not survive. - if !skipped { - idx.applyCoverageDomains(relPath, lang, src, result) - } + // Append coverage artifacts (todos / licenses / + // ownership) before applyRepoPrefix so they get the + // same multi-repo namespacing treatment as + // language-extractor output. Skipped for quarantined / + // timed-out files — the coverage scanners would re-read + // a source the parser could not survive. 
+ if !skipped { + idx.applyCoverageDomains(relPath, lang, src, result) + } - idx.applyRepoPrefix(result.Nodes, result.Edges) - - // Find the file node (if the extractor produced one) - // and collect its outgoing edges — contract extractors - // take the file-scope edge set (imports, etc.), not - // every intra-file edge. - var fileNodeID, fileGraphPath string - for _, n := range result.Nodes { - if n.Kind == graph.KindFile { - fileNodeID = n.ID - fileGraphPath = n.FilePath - break + idx.applyRepoPrefix(result.Nodes, result.Edges) + + // Find the file node (if the extractor produced one) + // and collect its outgoing edges — contract extractors + // take the file-scope edge set (imports, etc.), not + // every intra-file edge. + var fileNodeID, fileGraphPath string + for _, n := range result.Nodes { + if n.Kind == graph.KindFile { + fileNodeID = n.ID + fileGraphPath = n.FilePath + break + } } - } - var fileScopeEdges []*graph.Edge - if fileNodeID != "" { - for _, e := range result.Edges { - if e.From == fileNodeID { - fileScopeEdges = append(fileScopeEdges, e) + var fileScopeEdges []*graph.Edge + if fileNodeID != "" { + for _, e := range result.Edges { + if e.From == fileNodeID { + fileScopeEdges = append(fileScopeEdges, e) + } } } - } - // Batch the per-file insert into one shard-grouped pass - // so each shard's lock is acquired at most once per - // file instead of N + 2·E times. Profiling showed 69 - // of 102 workers blocked on lockTwoWrite under the - // per-edge path during cold-start warmup. - idx.graph.AddBatch(result.Nodes, result.Edges) - - if !skipped && fileGraphPath != "" { - exts := contractExtractorsByLang[lang] - if len(exts) > 0 { - c := idx.runContractExtractorsForFile( - fileGraphPath, src, result.Nodes, fileScopeEdges, exts, result.Tree) - localContracts = append(localContracts, c...) - - // Populate the per-file contract cache so a - // later IncrementalReindex can skip this file - // on a cache hit. 
Mtime comes from the walk- - // time d.Info() — no extra stat here. - if wf.mtimeNano > 0 { - idx.contractCacheMu.Lock() - idx.contractCache[fileGraphPath] = &contractCacheEntry{ - mtimeNano: wf.mtimeNano, - contracts: c, + // Batch the per-file insert into one shard-grouped pass + // so each shard's lock is acquired at most once per + // file instead of N + 2·E times. Profiling showed 69 + // of 102 workers blocked on lockTwoWrite under the + // per-edge path during cold-start warmup. + idx.graph.AddBatch(result.Nodes, result.Edges) + + if !skipped && fileGraphPath != "" { + exts := contractExtractorsByLang[lang] + if len(exts) > 0 { + c := idx.runContractExtractorsForFile( + fileGraphPath, src, result.Nodes, fileScopeEdges, exts, result.Tree) + localContracts = append(localContracts, c...) + + // Populate the per-file contract cache so a + // later IncrementalReindex can skip this file + // on a cache hit. Mtime comes from the walk- + // time d.Info() — no extra stat here. + if wf.mtimeNano > 0 { + idx.contractCacheMu.Lock() + idx.contractCache[fileGraphPath] = &contractCacheEntry{ + mtimeNano: wf.mtimeNano, + contracts: c, + } + idx.contractCacheMu.Unlock() } - idx.contractCacheMu.Unlock() } } + // Release the parse tree now that the per-file + // contract pass is done. Post-passes that need a + // tree for this file (cross-file handler resolution) + // re-parse on demand. Nil-safe. + result.Tree.Release() + atomic.AddInt64(&fileCount, 1) } - // Release the parse tree now that the per-file - // contract pass is done. Post-passes that need a - // tree for this file (cross-file handler resolution) - // re-parse on demand. Nil-safe. 
- result.Tree.Release() - atomic.AddInt64(&fileCount, 1) - } - if len(localContracts) > 0 { - contractMu.Lock() - for _, c := range localContracts { - contractReg.Add(c) + if len(localContracts) > 0 { + contractMu.Lock() + for _, c := range localContracts { + contractReg.Add(c) + } + contractMu.Unlock() } - contractMu.Unlock() + }() + } + + for _, f := range chunkFiles { + fileCh <- f + } + close(fileCh) + wg.Wait() + } + + // Streaming-flush path: above shadowMaxFileCount with a + // BulkLoader-capable backend, we can't fit the whole shadow in + // RAM but we can still amortise the per-file disk-write cost by + // chunking. Each chunk runs against its own throwaway shadow, + // then flushes via BulkLoad to disk. Resolve runs against the + // disk store afterwards (per-call, slower than the shadow path + // but bounded RAM). Activated by GORTEX_STREAMING_FLUSH=1; off + // by default since it requires the disk-only resolver path + // (~tens of minutes on huge repos) that we haven't yet + // optimised end-to-end. + if diskTarget == nil && streamingFlushActive(idx.graph, len(files)) { + bl, _ := idx.graph.(graph.BulkLoader) + streamingDisk := idx.graph + chunkSize := streamingChunkSize() + idx.logger.Info("indexer: streaming-flush parse", + zap.Int("files", len(files)), + zap.Int("chunk_size", chunkSize)) + for chunkStart := 0; chunkStart < len(files); chunkStart += chunkSize { + chunkEnd := min(chunkStart+chunkSize, len(files)) + chunkShadow := graph.New() + idx.graph = chunkShadow + parseChunk(files[chunkStart:chunkEnd]) + // Flush chunk to disk. + bl.BeginBulkLoad() + streamingDisk.AddBatch(chunkShadow.AllNodes(), chunkShadow.AllEdges()) + if err := bl.FlushBulk(); err != nil { + return nil, fmt.Errorf("indexer: streaming-flush chunk %d..%d: %w", chunkStart, chunkEnd, err) } - }() - } - - for _, f := range files { - fileCh <- f + } + // After all chunks, idx.graph points at the disk store so + // the resolver and subpasses read/mutate the merged state. 
+ idx.graph = streamingDisk + } else { + parseChunk(files) } - close(fileCh) - wg.Wait() if processed > 0 { reporter.Report("parsing", int(processed), totalFiles) @@ -1803,8 +2211,58 @@ func (idx *Indexer) IndexCtx(ctx context.Context, root string) (*IndexResult, er idx.fileMtimes[idx.relKey(f.path)] = f.mtimeNano } } + mtimeSnapshot := make(map[string]int64, len(idx.fileMtimes)) + for k, v := range idx.fileMtimes { + mtimeSnapshot[k] = v + } idx.mtimeMu.Unlock() + // Persist the per-file mtimes through the store's optional + // FileMtime sidecar table. On the on-disk backend this lets warm + // restarts seed ReconcileRepoCtx without having to read them back + // out of the gob+gzip metadata snapshot; on the in-memory + // backend the capability isn't implemented and the assertion + // short-circuits. + // + // Multi-repo bug: when the shadow-swap path is active, idx.graph + // is the in-memory shadow graph at this point — graph.Graph does + // NOT implement FileMtimeWriter, so the type assertion fails and + // persistence is silently skipped. The actual disk store is + // the local diskTarget variable; checking it first ensures warm- + // restart-skip-reindex actually works. The defer that swaps + // idx.graph back to diskTarget runs LATER, when IndexCtx returns, + // so we can't rely on it here. Falls through to idx.graph for the + // non-shadow path. + mtimeTarget := graph.Store(idx.graph) + if diskTarget != nil { + mtimeTarget = diskTarget + } + // Full-index persist is AUTHORITATIVE: replace the repo's entire mtime + // set so files deleted since the last index are pruned. An upsert-only + // write (BulkSetFileMtimes) leaves deleted-file rows behind, and warm- + // restart reconcile then detects them as phantom deletions on every + // restart — forcing a full re-track that never converges. Prefer the + // replace capability; fall back to upsert for backends without it. 
+ if len(mtimeSnapshot) > 0 { + var perr error + persisted := false + if r, ok := mtimeTarget.(graph.FileMtimeReplacer); ok { + perr, persisted = r.ReplaceFileMtimes(idx.repoPrefix, mtimeSnapshot), true + } else if w, ok := mtimeTarget.(graph.FileMtimeWriter); ok { + perr, persisted = w.BulkSetFileMtimes(idx.repoPrefix, mtimeSnapshot), true + } + if persisted { + if perr != nil { + idx.logger.Warn("persist file mtimes failed", + zap.String("repo", idx.repoPrefix), zap.Error(perr)) + } else { + idx.logger.Info("persisted file mtimes", + zap.String("repo", idx.repoPrefix), + zap.Int("count", len(mtimeSnapshot))) + } + } + } + // Retain parse errors and record index metadata. idx.parseErrors = errors idx.totalDetected = len(files) @@ -1888,7 +2346,7 @@ func (idx *Indexer) IndexCtx(ctx context.Context, root string) (*IndexResult, er ) } reporter.Report("clone detection pass", 0, 0) - if cs := detectClonesAndEmitEdgesCtx(ctx, idx.graph, idx.cloneThreshold()); cs.Items > 0 { + if cs := detectClonesAndEmitEdgesCtx(ctx, idx.graph, idx.repoPrefix, idx.cloneThreshold()); cs.Items > 0 { idx.logger.Info("clone edges emitted", zap.Int("items", cs.Items), zap.Int("clone_pairs", cs.Pairs), @@ -1899,6 +2357,14 @@ func (idx *Indexer) IndexCtx(ctx context.Context, root string) (*IndexResult, er zap.Int("diffused_edges", cs.DiffusedEdges), ) } + // Seed the incremental clone index from the freshly-baselined + // signatures + sidecar so steady-state single-file edits go + // incremental (EvictFuncs/UpdateFuncs) instead of re-running + // this whole-graph pass per file. The batch pass remains the + // re-baseline (corrects CMS drift) and owns diffusion. + if idx.cloneIndex != nil { + idx.cloneIndex.Rebuild(idx.graph, idx.repoPrefix) + } // gRPC stub-call resolution — runs once the call graph and // interface inference are final. 
Skipped under // deferGlobalPasses; the batch caller folds it into @@ -1949,7 +2415,16 @@ func (idx *Indexer) IndexCtx(ctx context.Context, root string) (*IndexResult, er // upgradeOnce gates the spawn so multi-repo warmup, which calls // IndexCtx once per tracked repo, doesn't launch one upgrade // goroutine per post-threshold repo. One per indexer lifetime. - if idx.search.Count() >= search.AutoThreshold { + // + // Skip the upgrade when the active search backend is the + // SymbolSearcher adapter: the disk store's native FTS is + // already serving search at engine-native latency, and + // spawning a parallel Bleve build would (a) waste ~100MB heap + // re-indexing the same corpus and (b) silently swap the + // adapter out for Bleve on completion — defeating the whole + // FTS path. The Swappable's current backend tells us which + // branch we're on. + if !isSymbolSearcherBackend(idx.search) && idx.search.Count() >= search.AutoThreshold { idx.upgradeOnce.Do(func() { reporter.Report("scheduling search backend upgrade", 0, 0) idx.upgradeSpawnedMu.Lock() @@ -1988,7 +2463,7 @@ func (idx *Indexer) IndexCtx(ctx context.Context, root string) (*IndexResult, er idx.indexGen.Add(1) // invalidate the trigram search cache nodes, edges := idx.repoNodeEdgeCount() - result := &IndexResult{ + result = &IndexResult{ NodeCount: nodes, EdgeCount: edges, FileCount: int(fileCount), @@ -2061,13 +2536,32 @@ func (idx *Indexer) indexFile(filePath string, resolve bool) error { // In multi-repo mode, the graph stores prefixed file paths. graphPath := idx.prefixPath(relPath) - // Evict existing data for this file (graph + search). - for _, n := range idx.graph.GetFileNodes(graphPath) { - if n.Kind != graph.KindFile && n.Kind != graph.KindImport { - idx.search.Remove(n.ID) + // Parse-then-swap: we must NOT evict the file's existing nodes/edges + // and search entries until we hold a usable parse result. 
Evicting + // first leaves the file at zero nodes whenever the on-disk bytes are + // transiently unparseable (a save mid-edit) — a failed extraction + // then returns early and the symbols stay nuked. Capturing the old + // state up front and deferring the actual eviction to evictExisting() + // keeps the file stale-but-present on failure (stale beats empty) and + // shrinks the no-nodes window to the gap between evict and AddBatch. + // + // oldFuncIDs holds this file's function/method node IDs so the + // incremental clone index can drop their CMS/LSH contributions — + // EvictFile removes the nodes (and their clone_sig) from the graph, + // so it must be captured before evictExisting runs. + var oldFuncIDs []string + evictExisting := func() { + for _, n := range idx.graph.GetFileNodes(graphPath) { + if n.Kind != graph.KindFile && n.Kind != graph.KindImport { + idx.search.Remove(n.ID) + } + if n.Kind == graph.KindFunction || n.Kind == graph.KindMethod { + oldFuncIDs = append(oldFuncIDs, n.ID) + } } + idx.restubIncomingRefs(graphPath) + idx.graph.EvictFile(graphPath) } - idx.graph.EvictFile(graphPath) src, err := os.ReadFile(absPath) if err != nil { @@ -2085,12 +2579,14 @@ func (idx *Indexer) indexFile(filePath string, resolve bool) error { // Honour the size cap on the incremental path too: an over-cap // file gets a synthetic skip node, not a parse — matching the - // bulk IndexCtx walk. + // bulk IndexCtx walk. This IS a successful result, so it evicts the + // prior state and installs the synthetic node, same as before. 
if maxSize := idx.config.MaxFileSize; maxSize > 0 && int64(len(src)) > maxSize { n := sizeSkipNode(skippedFile{ relPath: filepath.ToSlash(relPath), lang: lang, size: int64(len(src)), }, maxSize) idx.applyRepoPrefix([]*graph.Node{n}, nil) + evictExisting() idx.graph.AddBatch([]*graph.Node{n}, nil) return nil } @@ -2112,9 +2608,17 @@ func (idx *Indexer) indexFile(filePath string, resolve bool) error { _ = quarantine.Save() } if result == nil { + // No usable parse result (transient parse failure, quarantine, + // timeout). Do NOT evict — the file's prior nodes/edges/search + // entries stay intact. A stale-but-present file beats an empty + // one, and the next successful re-index swaps cleanly. return err } + // We hold a usable result: evict the old state now, then add the + // new — the window where the file has no nodes is just this gap. + evictExisting() + // Coverage extractors (todos, licenses, ownership). Same call // site exists in the bulk IndexCtx worker pool — see // applyCoverageDomains. Skipped for a quarantined / timed-out file. @@ -2128,29 +2632,59 @@ func (idx *Indexer) indexFile(filePath string, resolve bool) error { // Add new symbols to search index. shouldIndexForSearch enforces // the same SkipSearch filter used by the bulk and upgrade paths. + // When the backing store implements graph.SymbolSearcher we + // also mirror each upsert into its native FTS, so an + // incremental reindex doesn't fall out of sync with the + // bulk-built corpus. + searcher, _ := idx.graph.(graph.SymbolSearcher) for _, n := range result.Nodes { if !idx.shouldIndexForSearch(n) { continue } idx.search.Add(n.ID, searchIndexFields(n)...) 
+ if searcher != nil { + if err := searcher.UpsertSymbolFTS(n.ID, ftsTokensFor(n)); err != nil { + idx.logger.Debug("indexer: backend FTS upsert failed", + zap.String("id", n.ID), + zap.Error(err)) + } + } } if resolve { idx.resolver.ResolveFile(graphPath) + // Reverse pass: bind callers in OTHER files that reference a + // symbol (re)defined here. ResolveFile above only fixed this + // file's OUTGOING edges; a symbol newly defined or changed here + // leaves callers elsewhere pointing at the unresolved stub + // restubIncomingRefs left when the prior concrete node was + // evicted. Scoped to this file's names — not a whole-graph + // ResolveAll. + idx.resolver.ResolveIncomingForFile(graphPath) // CPG-lite dataflow placeholders for this file: inter- // procedural callees may have just been lifted by // ResolveFile, so re-run the dataflow materialisation pass // to keep arg_of / returns_to edges in sync with the - // freshly resolved EdgeCalls graph. - idx.materializeDataflowParams() + // freshly resolved EdgeCalls graph. Scoped to this file's + // out-edges — not a whole-graph AllEdges scan — so an + // incremental edit stays O(file), not O(all edges). + idx.materializeDataflowParamsForFile(graphPath, result.Edges) // Clone detection. EvictFile above removed this file's - // EdgeSimilarTo edges in both directions; a full recompute - // restores the correct set against the freshly stamped - // signatures. Skipped under deferGlobalPasses — a batch - // caller (ReconcileAll, warmup) runs the global pass once at - // the end instead of paying the O(functions) walk per file. + // EdgeSimilarTo edges in both directions. When the incremental + // clone index is built, re-bank just this file's bodies + // (EvictFuncs the old ids, UpdateFuncs the fresh nodes) — an + // O(edited file) update that restores the same edge set the + // whole-graph pass would. Until a batch/global pass has seeded + // the index (built=false) we fall back to the full recompute. 
+ // Skipped under deferGlobalPasses — a batch caller (ReconcileAll, + // warmup) runs the global pass once at the end. if !idx.deferGlobalPasses { - detectClonesAndEmitEdges(idx.graph, idx.cloneThreshold()) + if idx.cloneIndex != nil && idx.cloneIndex.built { + idx.cloneIndex.EvictFuncs(idx.graph, oldFuncIDs) + idx.cloneIndex.UpdateFuncs(idx.graph, idx.repoPrefix, cloneFuncNodes(result.Nodes), idx.cloneThreshold()) + } else { + detectClonesAndEmitEdges(idx.graph, idx.repoPrefix, idx.cloneThreshold()) + } } } @@ -2158,9 +2692,18 @@ func (idx *Indexer) indexFile(filePath string, resolve bool) error { // key (relKey applied slash + NFC), so the mtime entry lines up // with the graph file-node key and with the bulk-walk mtimes. if info, err := os.Stat(absPath); err == nil { + mtime := info.ModTime().UnixNano() idx.mtimeMu.Lock() - idx.fileMtimes[relPath] = info.ModTime().UnixNano() + idx.fileMtimes[relPath] = mtime idx.mtimeMu.Unlock() + // Also persist through the store's FileMtime sidecar so the + // next warm restart sees this incremental update without + // having to wait for the periodic gob snapshot to roll it. + // Per-file write is ~1ms on the on-disk backend; trivial under + // steady-state file-watcher load. + if w, ok := idx.graph.(graph.FileMtimeWriter); ok { + _ = w.BulkSetFileMtimes(idx.repoPrefix, map[string]int64{relPath: mtime}) + } } return nil @@ -2303,9 +2846,90 @@ func (idx *Indexer) EvictFile(filePath string) (int, int) { idx.search.Remove(n.ID) } } + idx.restubIncomingRefs(graphPath) + idx.evictEnrichment(graphPath) return idx.graph.EvictFile(graphPath) } +// restubIncomingRefs rewrites every resolved reference edge that points +// INTO a symbol of graphPath from a surviving (other-file) source back +// to an `unresolved::` stub, in place, BEFORE the file's nodes are +// evicted. 
Graph eviction otherwise drops those incoming caller edges +// wholesale (it removes the edge from the surviving source's out-edge +// bucket) and nothing recreates them until a cold reindex — so editing +// or deleting a definition silently strips its callers' edges and +// find_usages / get_callers go blank. Re-stubbing detaches the edges +// from the soon-to-be-evicted nodes so they survive as pending stubs; +// ResolveIncomingForFile (after a re-index) rebinds them to the file's +// fresh symbols, or they stay unresolved — the correct state once the +// symbol is gone. Only name-resolvable reference kinds are re-stubbed; +// structural and enrichment edges are left to be dropped. Backend- +// agnostic: GetInEdges + ReindexEdges are the same Store primitives the +// resolver uses, so this behaves identically on the in-memory and disk +// stores. +// evictEnrichment drops the per-node enrichment sidecar rows (churn, +// coverage, release, blame — change A) for a file's nodes on the +// delete/rename paths only, so a removed file leaves no orphan +// enrichment. Capability-gated. A modify re-indexes the same node IDs +// (enrichment stays valid) so it is NOT cascaded there. 
+func (idx *Indexer) evictEnrichment(graphPath string) { + nodes := idx.graph.GetFileNodes(graphPath) + if len(nodes) == 0 { + return + } + ids := make([]string, 0, len(nodes)) + for _, n := range nodes { + ids = append(ids, n.ID) + } + if w, ok := idx.graph.(graph.ChurnEnrichmentWriter); ok { + _ = w.DeleteChurn(ids) + } + if w, ok := idx.graph.(graph.CoverageEnrichmentWriter); ok { + _ = w.DeleteCoverage(ids) + } + if w, ok := idx.graph.(graph.ReleaseEnrichmentWriter); ok { + _ = w.DeleteReleases(ids) + } + if w, ok := idx.graph.(graph.BlameEnrichmentWriter); ok { + _ = w.DeleteBlame(ids) + } +} + +func (idx *Indexer) restubIncomingRefs(graphPath string) { + nodes := idx.graph.GetFileNodes(graphPath) + if len(nodes) == 0 { + return + } + evicted := make(map[string]struct{}, len(nodes)) + for _, n := range nodes { + evicted[n.ID] = struct{}{} + } + var batch []graph.EdgeReindex + for _, n := range nodes { + if n.Name == "" || !graph.IsReferenceableSymbol(n.Kind) { + continue + } + stub := graph.UnresolvedMarker + n.Name + for _, e := range idx.graph.GetInEdges(n.ID) { + if e == nil || !graph.IsResolvableRefEdge(e.Kind) { + continue + } + if _, fromEvicted := evicted[e.From]; fromEvicted { + continue // intra-file edge: the source is evicted too + } + if graph.IsUnresolvedTarget(e.To) { + continue // already a pending stub + } + oldTo := e.To + e.To = stub + batch = append(batch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) + } + } + if len(batch) > 0 { + idx.graph.ReindexEdges(batch) + } +} + // embeddingDimsOrDefault returns the embedder's reported vector width, // falling back to a neutral placeholder only when the provider cannot // state its width yet (Dimensions() == 0, the APIProvider-before-first- @@ -2786,9 +3410,38 @@ func (idx *Indexer) buildSearchIndex() { } vecBackend := search.NewVector(dims) + // VectorSearcher capability bridging: if the underlying store + // has a native HNSW, install it as the in-process backend's + // delegate — Add becomes a no-op, 
Search forwards to the + // engine, and we don't allocate `dim × 4 × N` bytes of heap + // for a parallel in-process HNSW. The indexer still drives + // the writes (BulkUpsertEmbeddings below) so the engine + // index lands with the same corpus the in-process one would + // have built. + vecSearcher, _ := idx.graph.(graph.VectorSearcher) + var backendItems []graph.VectorItem + if vecSearcher != nil { + vecBackend.SetDelegate(&vectorSearcherDelegate{s: vecSearcher}) + backendItems = make([]graph.VectorItem, 0, len(vectors)) + } for i, vec := range vectors { if vec != nil { vecBackend.Add(ids[i], vec) + if vecSearcher != nil { + backendItems = append(backendItems, graph.VectorItem{ + NodeID: ids[i], + Vec: vec, + }) + } + } + } + if vecSearcher != nil && len(backendItems) > 0 { + if err := vecSearcher.BulkUpsertEmbeddings(backendItems); err != nil { + idx.logger.Warn("indexer: backend vector bulk upsert failed", + zap.Error(err)) + } else if err := vecSearcher.BuildVectorIndex(dims); err != nil { + idx.logger.Warn("indexer: backend vector index build failed", + zap.Error(err)) } } // Install the chunk → parent-symbol mapping so HybridBackend can @@ -2908,6 +3561,24 @@ func (idx *Indexer) RefreshFileMtime(filePath string) { idx.mtimeMu.Unlock() } +// pruneDeletedFileMtimes drops the persisted mtime rows for files the +// incremental reindex just confirmed deleted. The in-memory map is already +// pruned by the caller; this keeps the store's FileMtime sidecar in step so +// a later warm restart does not re-discover them as phantom deletions and +// force a full re-track. A no-op when the backend lacks the capability +// (the in-memory backend) or the list is empty. 
+func (idx *Indexer) pruneDeletedFileMtimes(deleted []string) { + if len(deleted) == 0 { + return + } + if d, ok := idx.graph.(graph.FileMtimeDeleter); ok { + if err := d.DeleteFileMtimes(idx.repoPrefix, deleted); err != nil { + idx.logger.Warn("prune deleted file mtimes failed", + zap.String("repo", idx.repoPrefix), zap.Error(err)) + } + } +} + // SetFileMtimes restores the file modification time map from a persisted snapshot. func (idx *Indexer) SetFileMtimes(mtimes map[string]int64) { idx.mtimeMu.Lock() @@ -3093,11 +3764,17 @@ func (idx *Indexer) IncrementalReindexPaths(root string, paths []string) (*Index for _, relPath := range deletedFiles { graphPath := idx.prefixPath(relPath) + idx.restubIncomingRefs(graphPath) + idx.evictEnrichment(graphPath) idx.graph.EvictFile(graphPath) idx.mtimeMu.Lock() delete(idx.fileMtimes, relPath) idx.mtimeMu.Unlock() } + // Prune the persisted mtime rows for deleted files too, so the next + // warm restart does not see them as phantom deletions (the in-memory + // delete above does not reach the store's sidecar table). + idx.pruneDeletedFileMtimes(deletedFiles) // Re-index stale files with the same one-shot retry as the // whole-root path — a file locked or mid-write when the walk caught @@ -3133,7 +3810,19 @@ func (idx *Indexer) IncrementalReindexPaths(root string, paths []string) (*Index resolver.SynthesizeExternalCalls(idx.graph, idx.externalCallSynthesisEnabled()) } - idx.buildSearchIndex() + // Skip the search-index rebuild on a zero-change reconcile when the + // backend already persists its search structures (the on-disk + // backend keeps its FTS index and vector embeddings on disk). + // buildSearchIndex re-reads every node (GetRepoNodes) and re-embeds + // them, then BulkUpsertEmbeddings re-writes the embedding rows. On a + // warm restart that work is pure recompute of already-persisted data. + // When nothing changed there is nothing to re-embed, so skip it + // entirely — the persisted index is authoritative. 
The in-memory + // backends (BM25 / Bleve) must still rebuild from the replayed + // snapshot, so they keep the unconditional path. + if len(staleFiles) > 0 || len(deletedFiles) > 0 || !isSymbolSearcherBackend(idx.search) { + idx.buildSearchIndex() + } if len(staleFiles) > 0 || len(deletedFiles) > 0 { idx.extractContracts() @@ -3142,12 +3831,13 @@ func (idx *Indexer) IncrementalReindexPaths(root string, paths []string) (*Index nodes, edges := idx.repoNodeEdgeCount() result := &IndexResult{ - NodeCount: nodes, - EdgeCount: edges, - FileCount: len(diskFiles), - StaleFileCount: len(staleFiles), - FailedFiles: failedFiles, - DurationMs: time.Since(start).Milliseconds(), + NodeCount: nodes, + EdgeCount: edges, + FileCount: len(diskFiles), + StaleFileCount: len(staleFiles), + DeletedFileCount: len(deletedFiles), + FailedFiles: failedFiles, + DurationMs: time.Since(start).Milliseconds(), } idx.warnIfEdgeSanityViolated(result) return result, nil @@ -3278,11 +3968,17 @@ func (idx *Indexer) IncrementalReindex(root string) (*IndexResult, error) { // Evict only files that are truly absent from disk. for _, relPath := range deletedFiles { graphPath := idx.prefixPath(relPath) + idx.restubIncomingRefs(graphPath) + idx.evictEnrichment(graphPath) idx.graph.EvictFile(graphPath) idx.mtimeMu.Lock() delete(idx.fileMtimes, relPath) idx.mtimeMu.Unlock() } + // Prune the persisted mtime rows for deleted files too, so the next + // warm restart does not see them as phantom deletions (the in-memory + // delete above does not reach the store's sidecar table). + idx.pruneDeletedFileMtimes(deletedFiles) // Re-index stale files. A file that fails — most often because it // was locked or mid-write when the walk caught it — is collected @@ -3335,8 +4031,14 @@ func (idx *Indexer) IncrementalReindex(root string) (*IndexResult, error) { // the global clone pass once at the end. } - // Rebuild search index to ensure consistency. 
- idx.buildSearchIndex() + // Rebuild search index to ensure consistency — but skip it on a + // zero-change reconcile against a backend that persists its search + // structures natively (the on-disk backend). See the matching guard + // in the other incremental path: re-embedding is wasted work and + // there is nothing to rebuild when no file changed. + if len(staleFiles) > 0 || len(deletedFiles) > 0 || !isSymbolSearcherBackend(idx.search) { + idx.buildSearchIndex() + } // Update totalDetected so index_health reports correctly after cache restore. if idx.totalDetected == 0 { @@ -3351,12 +4053,13 @@ func (idx *Indexer) IncrementalReindex(root string) (*IndexResult, error) { nodes, edges := idx.repoNodeEdgeCount() result := &IndexResult{ - NodeCount: nodes, - EdgeCount: edges, - FileCount: len(diskFiles), - StaleFileCount: len(staleFiles), - FailedFiles: failedFiles, - DurationMs: time.Since(start).Milliseconds(), + NodeCount: nodes, + EdgeCount: edges, + FileCount: len(diskFiles), + StaleFileCount: len(staleFiles), + DeletedFileCount: len(deletedFiles), + FailedFiles: failedFiles, + DurationMs: time.Since(start).Milliseconds(), } idx.warnIfEdgeSanityViolated(result) return result, nil @@ -3514,52 +4217,77 @@ func (idx *Indexer) commitContracts(reg *contracts.Registry) { // the wire format. idx.inlineEnvelopeShapes(reg) - for _, c := range reg.All() { - contractNode := &graph.Node{ - ID: c.ID, - Kind: graph.KindContract, - Name: c.ID, - FilePath: c.FilePath, - Language: "contract", - Meta: map[string]any{"type": string(c.Type), "role": string(c.Role)}, + all := reg.All() + nodes := make([]*graph.Node, 0, len(all)) + edges := make([]*graph.Edge, 0, len(all)) + for _, c := range all { + // dep:: nodes were materialised by extractGoModContracts + // before ResolveAll (so the import bridge could find them); + // re-emitting them here would PK-collide on backends whose bulk + // load is INSERT-only (the on-disk backend). 
The pre-pass is the single + // writer for that contract type. + if c.Type == contracts.ContractDependency { + continue } - idx.graph.AddNode(contractNode) + nodes = append(nodes, &graph.Node{ + ID: c.ID, + Kind: graph.KindContract, + Name: c.ID, + FilePath: c.FilePath, + Language: "contract", + RepoPrefix: c.RepoPrefix, + WorkspaceID: c.EffectiveWorkspace(), + ProjectID: c.EffectiveProject(), + Meta: map[string]any{ + "type": string(c.Type), + "role": string(c.Role), + "symbol_id": c.SymbolID, + "line": c.Line, + "confidence": c.Confidence, + "contract_meta": c.Meta, + }, + }) + if c.SymbolID == "" { + continue + } edgeKind := graph.EdgeProvides if c.Role == contracts.RoleConsumer { edgeKind = graph.EdgeConsumes } - if c.SymbolID != "" { - idx.graph.AddEdge(&graph.Edge{ + edges = append(edges, &graph.Edge{ + From: c.SymbolID, + To: c.ID, + Kind: edgeKind, + FilePath: c.FilePath, + Line: c.Line, + }) + // Framework-layer EdgeHandlesRoute. Emitted alongside + // EdgeProvides for HTTP / gRPC / WS / GraphQL / topic + // providers so `analyze kind=routes` and other + // framework-aware tools walk one targeted edge instead + // of filtering EdgeProvides by contract type. Consumers + // (callers of routes) and non-route contract types (env, + // OpenAPI specs, DI tokens) intentionally skip this + // edge — they aren't route handlers. + if c.Role == contracts.RoleProvider && isRouteContractType(c.Type) { + edges = append(edges, &graph.Edge{ From: c.SymbolID, To: c.ID, - Kind: edgeKind, + Kind: graph.EdgeHandlesRoute, FilePath: c.FilePath, Line: c.Line, + Meta: map[string]any{ + "contract_type": string(c.Type), + }, }) - // Framework-layer EdgeHandlesRoute. Emitted alongside - // EdgeProvides for HTTP / gRPC / WS / GraphQL / topic - // providers so `analyze kind=routes` and other - // framework-aware tools walk one targeted edge instead - // of filtering EdgeProvides by contract type. 
Consumers - // (callers of routes) and non-route contract types (env, - // OpenAPI specs, DI tokens) intentionally skip this - // edge — they aren't route handlers. - if c.Role == contracts.RoleProvider && isRouteContractType(c.Type) { - idx.graph.AddEdge(&graph.Edge{ - From: c.SymbolID, - To: c.ID, - Kind: graph.EdgeHandlesRoute, - FilePath: c.FilePath, - Line: c.Line, - Meta: map[string]any{ - "contract_type": string(c.Type), - }, - }) - } } } + bulkStart := time.Now() + idx.bulkCommit(nodes, edges) + bulkElapsed := time.Since(bulkStart) + idx.contractRegistry = reg repo := idx.rootPath if idx.repoPrefix != "" { @@ -3567,7 +4295,24 @@ func (idx *Indexer) commitContracts(reg *contracts.Registry) { } idx.logger.Info("contracts extracted", zap.String("repo", repo), - zap.Int("count", len(reg.All()))) + zap.Int("count", len(all)), + zap.Duration("commit_bulk_elapsed", bulkElapsed)) +} + +// bulkCommit writes nodes + edges in one AddBatch call. The bulk +// load path is intentionally NOT used here: contract IDs often +// coincide with existing source-symbol IDs (a route handler shows +// up as both a Go function and an HTTP-contract anchor), and the +// on-disk backend's bulk load is INSERT-only on the node table so +// any collision fails the whole batch. AddBatch's non-bulk path +// upserts every row so duplicates are absorbed in place; the +// per-call cost is amortised by the chunked write path the backend +// uses internally. 
+func (idx *Indexer) bulkCommit(nodes []*graph.Node, edges []*graph.Edge) { + if len(nodes) == 0 && len(edges) == 0 { + return + } + idx.graph.AddBatch(nodes, edges) } // isRouteContractType reports whether a ContractType corresponds to a @@ -4933,6 +5678,7 @@ func (idx *Indexer) extractGoModContracts(reg *contracts.Registry) { found := goModExtractor.Extract(goModFilePath, goModSrc, nil, nil) reg.AddAllScoped(found, idx.repoPrefix, idx.workspaceID, idx.projectID) + var nodes []*graph.Node for i := range found { c := found[i] if c.Type != contracts.ContractDependency { @@ -4941,7 +5687,7 @@ func (idx *Indexer) extractGoModContracts(reg *contracts.Registry) { if idx.graph.GetNode(c.ID) != nil { continue } - idx.graph.AddNode(&graph.Node{ + nodes = append(nodes, &graph.Node{ ID: c.ID, Kind: graph.KindContract, Name: c.ID, @@ -4951,6 +5697,9 @@ func (idx *Indexer) extractGoModContracts(reg *contracts.Registry) { Meta: map[string]any{"type": string(c.Type), "role": string(c.Role)}, }) } + if len(nodes) > 0 { + idx.graph.AddBatch(nodes, nil) + } } // extractContracts scans all file nodes in the graph and runs contract @@ -4979,6 +5728,33 @@ func (idx *Indexer) extractContracts() { nodes = idx.graph.AllNodes() } + // Pre-bucket the already-fetched node slice by FilePath so the + // per-file body can look up its co-located nodes in O(1) instead + // of firing a fresh GetFileNodes query per file. Likewise pre- + // fetch every out-edge whose source is in this repo as ONE backend + // call and bucket by From so the per-file body can replace + // GetOutEdges(fileNode.ID) — on disk backends the per-file query + // path was the second-largest source of round-trips in + // deferred_passes (after the DI walk). 
+ nodesByFile := make(map[string][]*graph.Node, len(nodes)) + for _, n := range nodes { + if n == nil { + continue + } + nodesByFile[n.FilePath] = append(nodesByFile[n.FilePath], n) + } + var edgesByFrom map[string][]*graph.Edge + if idx.repoPrefix != "" { + repoEdges := idx.graph.GetRepoEdges(idx.repoPrefix) + edgesByFrom = make(map[string][]*graph.Edge, len(nodes)) + for _, e := range repoEdges { + if e == nil { + continue + } + edgesByFrom[e.From] = append(edgesByFrom[e.From], e) + } + } + for _, fileNode := range nodes { if fileNode.Kind != graph.KindFile { continue @@ -5022,8 +5798,15 @@ func (idx *Indexer) extractContracts() { continue } - fileNodes := idx.graph.GetFileNodes(fileNode.FilePath) - fileEdges := idx.graph.GetOutEdges(fileNode.ID) + var fileNodes []*graph.Node + var fileEdges []*graph.Edge + if idx.repoPrefix != "" { + fileNodes = nodesByFile[fileNode.FilePath] + fileEdges = edgesByFrom[fileNode.ID] + } else { + fileNodes = idx.graph.GetFileNodes(fileNode.FilePath) + fileEdges = idx.graph.GetOutEdges(fileNode.ID) + } // Language-filtered dispatch: skip extractors that don't list // this file's language in SupportedLanguages(). On big repos @@ -5074,6 +5857,84 @@ func (idx *Indexer) extractContracts() { // Unicode form than fileMtimes was keyed with still resolves — without // the fold the lookup would miss and the file be reported permanently // stale, re-indexing it under a second key on every pass. +// HasChangesSinceMtimes reports whether any indexable file under root +// changed (mtime differs or is new) or was deleted, relative to the +// indexer's currently-loaded fileMtimes. It runs the SAME walk + +// staleness + deletion logic as IncrementalReindex but writes nothing. 
+// +// The daemon warmup uses it to choose a reconcile strategy for a +// reopened repo: a repo with zero changes takes the fast no-op +// IncrementalReindex path, while a repo that changed while the daemon +// was down is routed through the shadow/bulk re-track path instead. +// That routing matters because IncrementalReindex re-resolves changed +// files through per-edge graph.ReindexEdges, and the per-edge write +// path against a freshly reopened disk store is slow and unreliable. +// The shadow path resolves entirely in an in-memory graph and commits +// the result in one bulk load, so it never issues a per-edge write to +// the reopened store. It re-indexes the whole repo (more work than a +// true incremental pass), but it is reliable, and only repos that +// actually changed during downtime pay the cost. +// +// Conservative on error: anything it can't determine (bad root, walk +// error) returns true so the caller re-indexes rather than silently +// serving a stale graph. +func (idx *Indexer) HasChangesSinceMtimes(root string) bool { + absRoot, err := filepath.Abs(root) + if err != nil { + return true + } + idx.rootPath = absRoot + + diskFiles := make(map[string]bool) + errStop := errors.New("stop-walk") + walkErr := filepath.WalkDir(absRoot, func(path string, d os.DirEntry, werr error) error { + if werr != nil { + return nil + } + if d.IsDir() { + if idx.shouldExclude(path, absRoot, true) { + return filepath.SkipDir + } + return nil + } + if _, ok := idx.effectiveLanguage(path, nil); !ok { + return nil + } + if idx.shouldExclude(path, absRoot, false) { + return nil + } + rel := idx.relKey(path) + diskFiles[rel] = true + if idx.IsStale(rel) { + return errStop // a single changed/new file is enough + } + return nil + }) + if errors.Is(walkErr, errStop) { + return true + } + if walkErr != nil { + return true + } + + // Deletion check: a previously-indexed file absent from the walk and + // confirmed gone from disk counts as a change (its edges must drop). 
+ idx.mtimeMu.RLock() + var candidates []string + for rel := range idx.fileMtimes { + if !diskFiles[rel] { + candidates = append(candidates, rel) + } + } + idx.mtimeMu.RUnlock() + for _, rel := range candidates { + if _, err := os.Stat(filepath.Join(absRoot, filepath.FromSlash(rel))); errors.Is(err, os.ErrNotExist) { + return true + } + } + return false +} + func (idx *Indexer) IsStale(relPath string) bool { relPath = pathkey.Normalize(filepath.ToSlash(relPath)) diff --git a/internal/indexer/indexer_test.go b/internal/indexer/indexer_test.go index 2fcba073..1b12e725 100644 --- a/internal/indexer/indexer_test.go +++ b/internal/indexer/indexer_test.go @@ -64,7 +64,7 @@ func writeFile(t *testing.T, path, content string) { require.NoError(t, os.WriteFile(path, []byte(content), 0o644)) } -func newTestIndexer(g *graph.Graph) *Indexer { +func newTestIndexer(g graph.Store) *Indexer { reg := parser.NewRegistry() reg.Register(languages.NewGoExtractor()) cfg := config.Default().Index diff --git a/internal/indexer/multi.go b/internal/indexer/multi.go index d70f7e8f..d7ef9eb3 100644 --- a/internal/indexer/multi.go +++ b/internal/indexer/multi.go @@ -45,7 +45,7 @@ type RepoMetadata struct { // MultiIndexer orchestrates indexing across multiple repositories. 
type MultiIndexer struct { - graph *graph.Graph + graph graph.Store registry *parser.Registry search search.Backend embedder embedding.Provider @@ -349,6 +349,7 @@ func (mi *MultiIndexer) RunDeferredPassesAll(ctx context.Context) { } if mi.graph != nil { master := resolver.New(mi.graph) + master.SetLogger(mi.logger) // Mirror the resolve-time LSP helper onto the master pass // too — RunDeferredPassesAll is where placeholder edges // added by deferred per-repo passes get resolved in batch, @@ -359,7 +360,9 @@ func (mi *MultiIndexer) RunDeferredPassesAll(ctx context.Context) { } master.SetNpmAliasResolver(mi.npmAliasResolver()) master.SetWorkspaceMembership(mi.workspaceMembershipResolver()) + mt := time.Now() master.ResolveAll() + mi.logger.Info("DEFERRED-TIMING master.ResolveAll", zap.Duration("elapsed", time.Since(mt))) } } @@ -379,6 +382,26 @@ func (mi *MultiIndexer) EndBatch() { mi.RunGlobalGraphPasses(context.Background()) } +// ResetBatch clears deferred-batch mode WITHOUT running the graph-wide +// derivation passes. It is the warm-restart fast-path counterpart to +// EndBatch: when the warmup reconcile loop observed zero changed files +// across every repo, the persistent backend already holds every resolved +// and derived edge from the prior run, so RunGlobalGraphPasses (plus the +// RunDeferredPassesAll / RunGlobalResolve the caller also skips) would +// only recompute what's already on disk — the work that turns a warm +// restart into a 30s–500s stall. The per-Indexer SetDeferGlobalPasses +// flag is still restored so a later watch-triggered TrackRepoCtx / +// IncrementalReindex runs its passes inline as normal. 
+func (mi *MultiIndexer) ResetBatch() { + mi.mu.Lock() + defer mi.mu.Unlock() + mi.deferGlobalPasses = false + mi.deferResolve = false + for _, idx := range mi.indexers { + idx.SetDeferGlobalPasses(false) + } +} + // RunGlobalGraphPasses runs the graph-wide derivation passes once // against the shared graph: InferImplements (structural interface // satisfaction), InferOverrides (method-level overrides on @@ -404,17 +427,46 @@ func (mi *MultiIndexer) RunGlobalGraphPasses(ctx context.Context) { zap.Int("edges", emitted), ) } + // Clone detection is PER-REPOSITORY: each tracked repo gets its own + // finalise + detect over its own nodes (scoped by RepoPrefix), so no + // cross-repo candidate pair is ever formed and each repo's boilerplate + // CMS / threshold is computed from that repo's bodies alone. This + // matches the per-repo incremental maintainer (cloneIndex.Rebuild / + // UpdateFuncs) so the batch and incremental edge sets agree. reporter.Report("clone detection pass (global)", 0, 0) - if cs := detectClonesAndEmitEdgesCtx(ctx, mi.graph, mi.cloneThreshold()); cs.Items > 0 { - mi.logger.Info("clone edges emitted (global)", - zap.Int("items", cs.Items), - zap.Int("clone_pairs", cs.Pairs), - zap.Int("edges", cs.Edges), - zap.Int("skipped_buckets", cs.SkippedBuckets), - zap.Int("skipped_bucket_items", cs.SkippedBucketItems), - zap.Int("diffused_pairs", cs.DiffusedPairs), - zap.Int("diffused_edges", cs.DiffusedEdges), - ) + mi.mu.RLock() + cloneIdx := make([]*Indexer, 0, len(mi.indexers)) + for _, idx := range mi.indexers { + cloneIdx = append(cloneIdx, idx) + } + mi.mu.RUnlock() + for _, idx := range cloneIdx { + // Per-repo threshold, NOT a max-over-repos value: the batch must use + // the same cutoff the per-repo incremental maintainer uses + // (UpdateFuncs/Rebuild → idx.cloneThreshold()), or the batch and + // incremental edge sets diverge for any repo whose configured + // threshold differs from the workspace maximum. 
+ if cs := detectClonesAndEmitEdgesCtx(ctx, mi.graph, idx.repoPrefix, idx.cloneThreshold()); cs.Items > 0 { + mi.logger.Info("clone edges emitted (global)", + zap.String("repo", idx.repoPrefix), + zap.Int("items", cs.Items), + zap.Int("clone_pairs", cs.Pairs), + zap.Int("edges", cs.Edges), + zap.Int("skipped_buckets", cs.SkippedBuckets), + zap.Int("skipped_bucket_items", cs.SkippedBucketItems), + zap.Int("diffused_pairs", cs.DiffusedPairs), + zap.Int("diffused_edges", cs.DiffusedEdges), + ) + } + } + // Seed each per-repo indexer's incremental clone index from the + // freshly-baselined signatures + sidecar (scoped to that repo's + // prefix) so steady-state single-file edits after this batch go + // incremental instead of re-running the whole-graph pass per file. + for _, idx := range cloneIdx { + if idx.cloneIndex != nil { + idx.cloneIndex.Rebuild(mi.graph, idx.repoPrefix) + } } // gRPC stub-call resolution. After InferImplements (the // interface-satisfaction fallback signal) and before @@ -456,23 +508,6 @@ func (mi *MultiIndexer) RunGlobalGraphPasses(ctx context.Context) { } } -// cloneThreshold resolves the graph-wide Jaccard similarity cutoff for -// clone detection. Thresholds are configured per-repo but the LSH pass -// is graph-wide, so the strictest (highest) configured value across -// tracked repos wins — fewer false-positive EdgeSimilarTo edges. Zero -// (no repo set one) falls through to the clones package default. -func (mi *MultiIndexer) cloneThreshold() float64 { - mi.mu.RLock() - defer mi.mu.RUnlock() - best := 0.0 - for _, idx := range mi.indexers { - if t := idx.cloneThreshold(); t > best { - best = t - } - } - return best -} - // externalCallSynthesisEnabled resolves whether external-call placeholder // synthesis should run over the shared graph. 
The pass is graph-wide, so // it is enabled when any tracked repo opted in — a repo that wants the @@ -491,7 +526,7 @@ func (mi *MultiIndexer) externalCallSynthesisEnabled() bool { // NewMultiIndexer creates a MultiIndexer. func NewMultiIndexer( - g *graph.Graph, + g graph.Store, reg *parser.Registry, s search.Backend, cm *config.ConfigManager, @@ -1085,7 +1120,40 @@ func (mi *MultiIndexer) ReconcileRepoCtx(ctx context.Context, entry config.RepoE idx.SetRootPath(absPath) idx.SetFileMtimes(priorMtimes) - result, err := idx.IncrementalReindex(absPath) + // Choose the reconcile strategy. A repo that changed while the + // daemon was down must NOT take IncrementalReindex's per-file path: + // re-resolving a changed file there goes through per-edge + // graph.ReindexEdges, and the per-edge write against a freshly + // reopened disk store is slow and unreliable. The shadow/bulk + // re-track path (IndexCtx) resolves in an in-memory shadow and + // commits one bulk load, so it never issues a per-edge write to the + // reopened store. It re-indexes the whole repo, but only repos that + // actually changed pay it, and it is reliable where the per-edge path + // is not. A repo with zero changes keeps the fast IncrementalReindex + // no-op (walk + 0 stale → return), which is what makes an unchanged + // warm restart near-instant. + // The shadow/bulk re-track path for the per-edge ReindexEdges + // problem applies ONLY to disk-backed stores, which is where the + // per-edge write to a reopened store is unreliable. The in-memory + // backend (*graph.Graph) has + // no reopen and no CGo write path, and IncrementalReindex is the + // authoritative path there — it evicts offline-deleted files in place + // (a re-track of a shared in-memory graph would not). Gate on the + // store type so the memory backend keeps its exact prior behaviour. 
+ _, memoryBacked := mi.graph.(*graph.Graph) + var result *IndexResult + if !memoryBacked && idx.HasChangesSinceMtimes(absPath) { + result, err = idx.IndexCtx(ctx, absPath) + if err == nil && result != nil && result.StaleFileCount == 0 { + // Signal "this repo did re-indexing work" to the warmup + // change-detector (which keys on StaleFileCount): a full + // re-track touches every file, so the daemon's global + // resolution passes must run. + result.StaleFileCount = result.FileCount + } + } else { + result, err = idx.IncrementalReindex(absPath) + } if err != nil { return nil, fmt.Errorf("reconciling %s: %w", absPath, err) } @@ -1149,7 +1217,14 @@ func (mi *MultiIndexer) ReconcileAll() map[string]*IndexResult { // don't suppress it. With ~100 repos that's ~100× the work for the // hourly janitor. mi.BeginBatch() - defer mi.EndBatch() + // Always restore batch flags on exit (incl. panic) WITHOUT running the + // graph-wide derivation passes — those are run explicitly below, and + // only when a repo actually reindexed. The hourly janitor used to run + // EndBatch unconditionally, walking the full graph (InferImplements / + // InferOverrides / clone detection over hundreds of thousands of + // edges) every cycle even when nothing changed — wasted CPU and, on a + // small resident buffer pool, needless memory churn. + defer mi.ResetBatch() results := make(map[string]*IndexResult, len(prefixes)) reindexed := 0 @@ -1187,6 +1262,10 @@ func (mi *MultiIndexer) ReconcileAll() map[string]*IndexResult { if reindexed > 0 { mi.ReconcileContractEdges() + // Only now — when at least one repo actually reindexed — is it + // worth the full-graph derivation pass. Nothing changed → skip it + // (the deferred ResetBatch still clears the batch flags). 
+ mi.RunGlobalGraphPasses(context.Background()) } return results } @@ -1587,7 +1666,7 @@ func (mi *MultiIndexer) MergedContractRegistry() *contracts.Registry { // re-extract shapes (the type nodes already have them from // snapshotContractShapes if they were referenced anywhere), it just // attaches them to the new contract entries. -func (mi *MultiIndexer) attachInlinedShapes(cr *contracts.Registry, g *graph.Graph) { +func (mi *MultiIndexer) attachInlinedShapes(cr *contracts.Registry, g graph.Store) { idsToTouch := map[string]bool{} for _, c := range cr.All() { if c.Meta == nil { @@ -2036,7 +2115,7 @@ func (mi *MultiIndexer) ReconcileContractEdges() int { // have the contract ID can also look up the topic node directly. // Meta on the node carries the broker family and the raw topic name // for filterless queries. -func emitTopicEdges(g *graph.Graph, m contracts.CrossLink, topicNodes map[string]struct{}) { +func emitTopicEdges(g graph.Store, m contracts.CrossLink, topicNodes map[string]struct{}) { // Trust the matcher to bucket only same-broker contracts together // because Contract.ID already includes the broker token; if the // broker isn't on the provider Meta, fall through to the contract @@ -2136,7 +2215,7 @@ func parseTopicContractID(id string) (broker, name string, ok bool) { } // Graph returns the underlying shared graph. -func (mi *MultiIndexer) Graph() *graph.Graph { +func (mi *MultiIndexer) Graph() graph.Store { return mi.graph } diff --git a/internal/indexer/multi_contract_edges_test.go b/internal/indexer/multi_contract_edges_test.go index d938a068..d6e1ab66 100644 --- a/internal/indexer/multi_contract_edges_test.go +++ b/internal/indexer/multi_contract_edges_test.go @@ -880,7 +880,7 @@ func TestReconcileContractEdges_OpenAPIBridge(t *testing.T) { // matchEdgeSummaries dumps all EdgeMatches as "from → to" strings for // failure-message context when the expected bridges aren't present. 
-func matchEdgeSummaries(g *graph.Graph) []string { +func matchEdgeSummaries(g graph.Store) []string { var out []string for _, e := range g.AllEdges() { if e.Kind == graph.EdgeMatches { @@ -927,7 +927,7 @@ func TestReconcileContractEdges_PurgesStaleOnUntrack(t *testing.T) { len(remaining), remaining) } -func collectMatchEdges(g *graph.Graph) []string { +func collectMatchEdges(g graph.Store) []string { var out []string for _, e := range g.AllEdges() { if e.Kind == graph.EdgeMatches { diff --git a/internal/indexer/multi_global_passes_test.go b/internal/indexer/multi_global_passes_test.go index b426cc5c..d0b65707 100644 --- a/internal/indexer/multi_global_passes_test.go +++ b/internal/indexer/multi_global_passes_test.go @@ -50,7 +50,7 @@ func TestRunGreet(t *testing.T) { return dir } -func countEdges(g *graph.Graph, kind graph.EdgeKind) int { +func countEdges(g graph.Store, kind graph.EdgeKind) int { n := 0 for _, e := range g.AllEdges() { if e.Kind == kind { diff --git a/internal/indexer/multi_node_id_test.go b/internal/indexer/multi_node_id_test.go index d58a414a..47483359 100644 --- a/internal/indexer/multi_node_id_test.go +++ b/internal/indexer/multi_node_id_test.go @@ -130,7 +130,7 @@ func TestMultiRepo_ResolvesCallEdges(t *testing.T) { } } -func outEdgeSummaries(g *graph.Graph, id string) []string { +func outEdgeSummaries(g graph.Store, id string) []string { var out []string for _, e := range g.GetOutEdges(id) { out = append(out, string(e.Kind)+":"+e.To) @@ -176,9 +176,22 @@ func TestTrackRepoCtx_FirstOfManyStillGetsPrefix(t *testing.T) { // Every node must carry a non-empty RepoPrefix and its FilePath must // live under that prefix. Any violation means a code path bypassed - // applyRepoPrefix. + // applyRepoPrefix. KindModule and KindBuiltin are deliberately + // cross-repo singletons (one `module::pypi:requests` / + // `builtin::go::type::string` shared across every repo that uses + // them) so they're exempt from the per-repo prefix rule. 
var missingPrefix, badFilePaths []string for _, n := range g.AllNodes() { + if n.Kind == graph.KindModule || n.Kind == graph.KindBuiltin { + continue + } + if ext, _ := n.Meta["external"].(bool); ext { + // External call targets the resolver materialises as + // KindFunction with meta.external=true are cross-repo + // singletons (one `stdlib::fmt::Sprintf` shared across + // every repo that calls it) — same as KindModule. + continue + } if n.RepoPrefix == "" { missingPrefix = append(missingPrefix, n.ID) continue diff --git a/internal/indexer/multi_test.go b/internal/indexer/multi_test.go index 3cc88ad7..2f4c5aae 100644 --- a/internal/indexer/multi_test.go +++ b/internal/indexer/multi_test.go @@ -747,7 +747,7 @@ func TestPropertyReindexIsolation(t *testing.T) { } // countRepoEdges counts edges where at least one endpoint belongs to the given repo prefix. -func countRepoEdges(g *graph.Graph, repoPrefix string) int { +func countRepoEdges(g graph.Store, repoPrefix string) int { prefix := repoPrefix + "/" count := 0 for _, e := range g.AllEdges() { diff --git a/internal/indexer/multi_topic_edges_test.go b/internal/indexer/multi_topic_edges_test.go index 52db7f62..66b06505 100644 --- a/internal/indexer/multi_topic_edges_test.go +++ b/internal/indexer/multi_topic_edges_test.go @@ -25,7 +25,7 @@ import ( // findTopicNode walks the graph for a KindTopic node by ID and // returns it (or nil if absent). Used by topic-edge tests to assert // node materialisation alongside edge presence. -func findTopicNode(g *graph.Graph, id string) *graph.Node { +func findTopicNode(g graph.Store, id string) *graph.Node { for _, n := range g.AllNodes() { if n.Kind == graph.KindTopic && n.ID == id { return n @@ -36,7 +36,7 @@ func findTopicNode(g *graph.Graph, id string) *graph.Node { // collectTopicEdges returns every produces_topic / consumes_topic // edge in the graph as "from→to" strings, for diagnostic output. 
-func collectTopicEdges(g *graph.Graph, kind graph.EdgeKind) []string { +func collectTopicEdges(g graph.Store, kind graph.EdgeKind) []string { var out []string for _, e := range g.AllEdges() { if e.Kind == kind { @@ -264,7 +264,7 @@ func TestReconcileContractEdges_TopicEdges_CrossWorkspaceIsolation(t *testing.T) } // topicNodeIDs returns the ID of every KindTopic node in the graph. -func topicNodeIDs(g *graph.Graph) []string { +func topicNodeIDs(g graph.Store) []string { var out []string for _, n := range g.AllNodes() { if n.Kind == graph.KindTopic { diff --git a/internal/indexer/multi_watcher.go b/internal/indexer/multi_watcher.go index bddbdbfc..70c044c5 100644 --- a/internal/indexer/multi_watcher.go +++ b/internal/indexer/multi_watcher.go @@ -211,9 +211,18 @@ func (mw *MultiWatcher) forwardEvents(prefix string, w *Watcher) { return } - // After re-indexing, trigger cross-repo resolution. + // After re-indexing, trigger cross-repo resolution — scoped + // to the file that changed, not the whole repo. ResolveForRepo + // materialised the repo's entire edge set on every save (the + // per-edit allocation flood); ResolveForFile only re-resolves + // the changed file's out-edges. The watcher path is absolute, + // so convert it to the repo-relative graph key first. 
if mw.multi.IsMultiRepo() { - stats := mw.resolver.ResolveForRepo(prefix) + relPath := ev.FilePath + if w.indexer != nil { + relPath = w.indexer.RelKey(ev.FilePath) + } + stats := mw.resolver.ResolveForFile(prefix, relPath) if stats.CrossRepoEdges > 0 { mw.logger.Debug("cross-repo edges updated after file change", zap.String("repo", prefix), diff --git a/internal/indexer/npm_alias_resolve_test.go b/internal/indexer/npm_alias_resolve_test.go index 467777d2..c78b7f42 100644 --- a/internal/indexer/npm_alias_resolve_test.go +++ b/internal/indexer/npm_alias_resolve_test.go @@ -116,7 +116,7 @@ func TestNpmAliasIndex_NilRootsYieldsNil(t *testing.T) { // addPackageNode registers a KindPackage node with the given qualified // name — this is what CrossRepoResolver.resolveImport matches an // import path against (mirrors the existing cross-repo import tests). -func addPackageNode(g *graph.Graph, repo, file, qualName string) { +func addPackageNode(g graph.Store, repo, file, qualName string) { g.AddNode(&graph.Node{ ID: file, Kind: graph.KindPackage, Name: qualName, QualName: qualName, FilePath: file, Language: "typescript", RepoPrefix: repo, diff --git a/internal/indexer/poller.go b/internal/indexer/poller.go index e4f67ffb..d5804dc2 100644 --- a/internal/indexer/poller.go +++ b/internal/indexer/poller.go @@ -205,9 +205,16 @@ func (p *Poller) pollGitHead() bool { } p.mu.Lock() oldSHA := p.lastSHA - p.lastSHA = newSHA p.mu.Unlock() - if oldSHA == "" || oldSHA == newSHA { + if oldSHA == "" { + // First observation: seed lastSHA and don't diff against a + // phantom range. There is no prior commit to reconcile from. + p.mu.Lock() + p.lastSHA = newSHA + p.mu.Unlock() + return false + } + if oldSHA == newSHA { return false } @@ -215,6 +222,9 @@ func (p *Poller) pollGitHead() bool { defer cancel() changes, err := pollerDiffNameStatus(ctx, p.rootPath, oldSHA, newSHA) if err != nil { + // Leave lastSHA at oldSHA so the next cycle retries this exact + // range. 
Advancing it here would permanently skip the + // un-reconciled oldSHA..newSHA span on a transient diff failure. if p.logger != nil { p.logger.Debug("watcher: poller git diff failed", zap.String("from", oldSHA), zap.String("to", newSHA), @@ -222,6 +232,15 @@ func (p *Poller) pollGitHead() bool { } return false } + + // Diff succeeded — the range is now safe to mark reconciled. Advance + // lastSHA before dispatching so a concurrent poll doesn't re-diff the + // same span; dispatch failures of individual files are best-effort + // and don't warrant re-running the whole diff. + p.mu.Lock() + p.lastSHA = newSHA + p.mu.Unlock() + n := 0 for _, c := range changes { switch c.Status { diff --git a/internal/indexer/realtime_reliability_test.go b/internal/indexer/realtime_reliability_test.go new file mode 100644 index 00000000..49ccd744 --- /dev/null +++ b/internal/indexer/realtime_reliability_test.go @@ -0,0 +1,567 @@ +package indexer + +import ( + "errors" + "os" + "os/exec" + "path/filepath" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/sgtdi/fswatcher" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + "go.uber.org/zap/zaptest/observer" + + "github.com/zzet/gortex/internal/config" + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/parser" + "github.com/zzet/gortex/internal/search" +) + +// toggleExtractor is a parser.Extractor whose Extract result is flipped +// at runtime. In good mode it emits a file node plus one function node; +// in fail mode it returns (nil, err) — the exact shape that drives +// indexFile's `result == nil` branch (a transient parse failure / +// quarantine), which tree-sitter's error-tolerant grammars never +// produce for real Go source. The custom ".fk" extension keeps it off +// every other extractor's turf. 
+type toggleExtractor struct { + mu sync.Mutex + fail bool + funcs []string +} + +func (e *toggleExtractor) Language() string { return "faketoggle" } +func (e *toggleExtractor) Extensions() []string { return []string{".fk"} } + +func (e *toggleExtractor) setFail(f bool) { + e.mu.Lock() + e.fail = f + e.mu.Unlock() +} + +func (e *toggleExtractor) setFuncs(names ...string) { + e.mu.Lock() + e.funcs = names + e.mu.Unlock() +} + +func (e *toggleExtractor) Extract(filePath string, src []byte) (*parser.ExtractionResult, error) { + e.mu.Lock() + fail := e.fail + funcs := append([]string(nil), e.funcs...) + e.mu.Unlock() + if fail { + return nil, errors.New("toggleExtractor: forced parse failure") + } + nodes := []*graph.Node{{ + ID: filePath, + Kind: graph.KindFile, + Name: filepath.Base(filePath), + FilePath: filePath, + Language: "faketoggle", + }} + for _, fn := range funcs { + nodes = append(nodes, &graph.Node{ + ID: filePath + "::" + fn, + Kind: graph.KindFunction, + Name: fn, + FilePath: filePath, + Language: "faketoggle", + }) + } + return &parser.ExtractionResult{Nodes: nodes}, nil +} + +func newToggleIndexer(t *testing.T) (*Indexer, *toggleExtractor) { + t.Helper() + ext := &toggleExtractor{} + reg := parser.NewRegistry() + reg.Register(ext) + g := graph.New() + idx := New(g, reg, config.IndexConfig{Workers: 1}, zap.NewNop()) + idx.search = search.NewBM25() + return idx, ext +} + +func searchHasID(idx *Indexer, query, id string) bool { + for _, r := range idx.search.Search(query, 50) { + if r.ID == id { + return true + } + } + return false +} + +// TestIndexFile_ParseFailureKeepsPriorNodes is the central proof of the +// parse-then-swap fix: re-indexing a file whose new bytes are +// unparseable must NOT zero the file's prior nodes / edges / search +// entries. Stale-but-present beats empty. A clean re-index then swaps +// them as normal. 
+// +// Against the pre-fix evict-first code this test FAILS: indexFile +// evicted the graph + search entries before parsing and returned early +// on result == nil, leaving the file at zero nodes. +func TestIndexFile_ParseFailureKeepsPriorNodes(t *testing.T) { + idx, ext := newToggleIndexer(t) + dir := t.TempDir() + path := filepath.Join(dir, "main.fk") + idx.SetRootPath(dir) + + // First index, good mode — one function lands in graph + search. + ext.setFail(false) + ext.setFuncs("Alpha") + writeFile(t, path, "alpha body") + require.NoError(t, idx.IndexFile(path)) + + funcID := "main.fk::Alpha" + require.NotNil(t, idx.graph.GetNode(funcID), "Alpha must be indexed before the bad edit") + require.True(t, searchHasID(idx, "Alpha", funcID), "Alpha must be in the search index before the bad edit") + nodesBefore := len(idx.graph.GetFileNodes("main.fk")) + require.Equal(t, 2, nodesBefore, "file node + Alpha") + + // Save a transiently unparseable edit. extractFile returns + // (nil, err); indexFile must NOT evict. + ext.setFail(true) + writeFile(t, path, "this no longer parses") + require.Error(t, idx.IndexFile(path), + "a failed parse should surface the extractor error") + + // The prior state survives, untouched. + assert.Equal(t, nodesBefore, len(idx.graph.GetFileNodes("main.fk")), + "a failed re-index must leave the file's prior nodes intact, not zero them") + assert.NotNil(t, idx.graph.GetNode(funcID), + "Alpha must still exist after the failed re-index") + assert.True(t, searchHasID(idx, "Alpha", funcID), + "Alpha must still be in the search index after the failed re-index") + + // A subsequent valid re-index swaps cleanly: Alpha gone, Beta in. 
+ ext.setFail(false) + ext.setFuncs("Beta") + writeFile(t, path, "beta body") + require.NoError(t, idx.IndexFile(path)) + + assert.Nil(t, idx.graph.GetNode(funcID), + "a successful re-index must evict the old Alpha node") + assert.False(t, searchHasID(idx, "Alpha", funcID), + "a successful re-index must remove Alpha's stale search entry") + betaID := "main.fk::Beta" + assert.NotNil(t, idx.graph.GetNode(betaID), "Beta must be indexed by the clean swap") + assert.True(t, searchHasID(idx, "Beta", betaID), "Beta must be in the search index after the clean swap") +} + +// TestPatchGraphModify_ParseFailureKeepsPriorNodes proves the LIVE +// watcher path is parse-safe, not just indexFile in isolation. The +// editor-save path goes Watcher event -> patchGraph(ChangeModified), +// which used to call EvictFile BEFORE IndexFile — so a transiently +// unparseable save dropped the file's nodes even with indexFile itself +// fixed. With the pre-evict removed, a failed modify through patchGraph +// must leave the file's prior nodes / search entries intact, and a clean +// modify must still swap. Against the pre-fix patchGraph this FAILS. +func TestPatchGraphModify_ParseFailureKeepsPriorNodes(t *testing.T) { + idx, ext := newToggleIndexer(t) + dir := t.TempDir() + idx.SetRootPath(dir) + path := filepath.Join(dir, "main.fk") + + w, err := NewWatcher(idx, config.WatchConfig{Enabled: true, DebounceMs: 10}, zap.NewNop()) + require.NoError(t, err) + + // Initial index through the live create patch. + ext.setFail(false) + ext.setFuncs("Alpha") + writeFile(t, path, "alpha body") + w.patchGraph(path, ChangeCreated) + + funcID := "main.fk::Alpha" + require.NotNil(t, idx.graph.GetNode(funcID), "Alpha must be indexed via the create patch") + require.True(t, searchHasID(idx, "Alpha", funcID)) + nodesBefore := len(idx.graph.GetFileNodes("main.fk")) + require.Equal(t, 2, nodesBefore, "file node + Alpha") + + // A transiently-unparseable save arrives as a Modify on the live path. 
+ ext.setFail(true) + writeFile(t, path, "this no longer parses") + w.patchGraph(path, ChangeModified) + + assert.Equal(t, nodesBefore, len(idx.graph.GetFileNodes("main.fk")), + "a failed modify through the live watcher path must not zero the file's nodes") + assert.NotNil(t, idx.graph.GetNode(funcID), "Alpha must survive the failed live modify") + assert.True(t, searchHasID(idx, "Alpha", funcID), "Alpha's search entry must survive the failed live modify") + + // A clean modify swaps cleanly. + ext.setFail(false) + ext.setFuncs("Beta") + writeFile(t, path, "beta body") + w.patchGraph(path, ChangeModified) + assert.Nil(t, idx.graph.GetNode(funcID), "a clean live modify evicts Alpha") + assert.NotNil(t, idx.graph.GetNode("main.fk::Beta"), "a clean live modify indexes Beta") +} + +// TestPollGitHead_DiffFailureRetriesRange proves the lastSHA-advance +// fix: when the git diff for the moved range errors, pollGitHead must +// leave lastSHA at the old SHA so the next cycle retries the same +// (un-reconciled) range. Advancing it on failure would permanently skip +// that span. +// +// We force the diff failure by seeding lastSHA with a bogus SHA — `git +// diff <bogus>..HEAD` errors with "unknown revision". The fix then +// requires lastSHA to stay bogus across the failing poll, and the range +// to reconcile once lastSHA is a real prior commit.
+func TestPollGitHead_DiffFailureRetriesRange(t *testing.T) { + if !haveGit(t) { + t.Skip("git binary not available in PATH") + } + repoDir := t.TempDir() + runGit(t, repoDir, "init", "-q", "-b", "main") + runGit(t, repoDir, "config", "user.email", "test@example.com") + runGit(t, repoDir, "config", "user.name", "Test") + runGit(t, repoDir, "config", "commit.gpgsign", "false") + + writeFile(t, filepath.Join(repoDir, "a.go"), "package main\nfunc Alpha() {}\n") + runGit(t, repoDir, "add", ".") + runGit(t, repoDir, "commit", "-q", "-m", "main: Alpha") + firstSHA, err := pollerHeadSHA(repoDir) + require.NoError(t, err) + + writeFile(t, filepath.Join(repoDir, "b.go"), "package main\nfunc Beta() {}\n") + runGit(t, repoDir, "add", ".") + runGit(t, repoDir, "commit", "-q", "-m", "main: Beta") + + g := graph.New() + idx := New(g, newTestRegistry(), config.IndexConfig{Workers: 1}, zap.NewNop()) + idx.search = search.NewBM25() + idx.SetRootPath(repoDir) + _, err = idx.IndexCtx(testCtx(), repoDir) + require.NoError(t, err) + require.NotEmpty(t, g.GetFileNodes("a.go")) + require.NotEmpty(t, g.GetFileNodes("b.go")) + + w, err := NewWatcher(idx, config.WatchConfig{Enabled: true, DebounceMs: 10}, zap.NewNop()) + require.NoError(t, err) + p := newPoller(w, idx, zap.NewNop()) + + // Seed lastSHA with a SHA git can't resolve — the diff for this + // cycle's range will error. + const bogus = "0000000000000000000000000000000000000000" + p.mu.Lock() + p.lastSHA = bogus + p.mu.Unlock() + + // Failing cycle: diff errors, so lastSHA must NOT advance. + require.False(t, p.pollGitHead(), "a failed diff reports no reconcile") + p.mu.Lock() + stuck := p.lastSHA + p.mu.Unlock() + require.Equal(t, bogus, stuck, + "a failed git diff must leave lastSHA at the old SHA so the range is retried, not skipped") + + // Now point lastSHA at a real prior commit; the retry reconciles + // the same HEAD range that the failure left un-reconciled. 
+ p.mu.Lock() + p.lastSHA = firstSHA + p.mu.Unlock() + require.True(t, p.pollGitHead(), "the retry must reconcile the previously-failed range") + head, err := pollerHeadSHA(repoDir) + require.NoError(t, err) + p.mu.Lock() + settled := p.lastSHA + p.mu.Unlock() + assert.Equal(t, head, settled, "a successful diff advances lastSHA to HEAD") +} + +func haveGit(t *testing.T) bool { + t.Helper() + _, err := exec.LookPath("git") + return err == nil +} + +// TestWatcher_OverflowEventTriggersReconcile proves the kernel-overflow +// gap is closed: a pathless EventOverflow on the Events channel triggers +// a coalesced full-tree reconcile (the signal the Linux inotify backend +// raises when its queue overflows and events are lost). The reconcileFn +// seam stands in for IncrementalReindex so the assertion is +// deterministic and platform-independent. +func TestWatcher_OverflowEventTriggersReconcile(t *testing.T) { + idx, _ := newToggleIndexer(t) + idx.SetRootPath(t.TempDir()) + w, err := NewWatcher(idx, config.WatchConfig{Enabled: true, DebounceMs: 10}, zap.NewNop()) + require.NoError(t, err) + + var calls int32 + done := make(chan struct{}, 1) + w.reconcileMu.Lock() + w.reconcileFn = func() { + atomic.AddInt32(&calls, 1) + select { + case done <- struct{}{}: + default: + } + } + w.reconcileMu.Unlock() + + w.handleEvent(fswatcher.WatchEvent{Types: []fswatcher.EventType{fswatcher.EventOverflow}}) + + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("overflow event did not trigger a reconcile") + } + assert.Equal(t, int32(1), atomic.LoadInt32(&calls), "exactly one reconcile from one overflow") +} + +// TestWatcher_OverflowReconcileCoalesces proves a burst of overflow +// signals collapses into at most one reconcile in flight — the loop is +// never blocked and the tree isn't re-walked per dropped event. 
+func TestWatcher_OverflowReconcileCoalesces(t *testing.T) { + idx, _ := newToggleIndexer(t) + idx.SetRootPath(t.TempDir()) + w, err := NewWatcher(idx, config.WatchConfig{Enabled: true, DebounceMs: 10}, zap.NewNop()) + require.NoError(t, err) + + var calls int32 + release := make(chan struct{}) + started := make(chan struct{}, 1) + w.reconcileMu.Lock() + w.reconcileFn = func() { + atomic.AddInt32(&calls, 1) + select { + case started <- struct{}{}: + default: + } + <-release // hold the reconcile "in flight" + } + w.reconcileMu.Unlock() + + // First signal starts the (blocked) reconcile. + w.triggerOverflowReconcile("queue-overflow") + select { + case <-started: + case <-time.After(2 * time.Second): + t.Fatal("first reconcile never started") + } + + // A burst while one is in flight must be coalesced away. + for i := 0; i < 25; i++ { + w.triggerOverflowReconcile("queue-overflow") + } + assert.Equal(t, int32(1), atomic.LoadInt32(&calls), + "overflow signals during an in-flight reconcile must coalesce to one") + + close(release) + // Once the in-flight reconcile drains, a fresh signal runs again. + require.Eventually(t, func() bool { + w.reconcileMu.Lock() + pending := w.reconcilePending + w.reconcileMu.Unlock() + return !pending + }, 2*time.Second, 5*time.Millisecond, "reconcilePending must clear after the reconcile finishes") +} + +// TestWatcher_OverflowReconcileIndexesMissedFile is the end-to-end proof +// that the real reconcile path (IncrementalReindex) recovers a file +// whose create/modify event was lost. We index a tree, drop a brand-new +// file on disk (simulating a missed inotify create), then drive an +// overflow through the real reconcile and assert the new file is now in +// the graph. 
+func TestWatcher_OverflowReconcileIndexesMissedFile(t *testing.T) {
+	dir := t.TempDir()
+	ext := &toggleExtractor{}
+	reg := parser.NewRegistry()
+	reg.Register(ext)
+	g := graph.New()
+	idx := New(g, reg, config.IndexConfig{Workers: 1}, zap.NewNop())
+	idx.search = search.NewBM25()
+	idx.SetRootPath(dir)
+
+	// Seed the graph with one indexed file so the reconcile has a
+	// baseline to diff against.
+	ext.setFail(false)
+	ext.setFuncs("Seed")
+	writeFile(t, filepath.Join(dir, "seed.fk"), "seed body")
+	_, err := idx.IndexCtx(testCtx(), dir)
+	require.NoError(t, err)
+	require.NotEmpty(t, g.GetFileNodes("seed.fk"))
+
+	w, err := NewWatcher(idx, config.WatchConfig{Enabled: true, DebounceMs: 10}, zap.NewNop())
+	require.NoError(t, err)
+
+	// A new file appears on disk but its create event was "lost".
+	ext.setFuncs("Recovered")
+	writeFile(t, filepath.Join(dir, "missed.fk"), "recovered body")
+	require.Empty(t, g.GetFileNodes("missed.fk"), "missed file must be absent before the reconcile")
+
+	// Drive the real IncrementalReindex through the overflow path, with
+	// a thin wrapper only to know when it finishes. The wrapper runs on
+	// the watcher's reconcile goroutine, where require/FailNow must not
+	// be called, so the error travels back over the channel and is
+	// asserted on the test goroutine below.
+	done := make(chan error, 1)
+	w.reconcileMu.Lock()
+	w.reconcileFn = func() {
+		_, rerr := idx.IncrementalReindex(dir)
+		done <- rerr
+	}
+	w.reconcileMu.Unlock()
+
+	w.handleEvent(fswatcher.WatchEvent{Types: []fswatcher.EventType{fswatcher.EventOverflow}})
+	select {
+	case rerr := <-done:
+		require.NoError(t, rerr)
+	case <-time.After(5 * time.Second):
+		t.Fatal("overflow reconcile never ran")
+	}
+
+	assert.NotEmpty(t, g.GetFileNodes("missed.fk"),
+		"the overflow-driven reconcile must index the previously-missed file")
+}
+
+// TestWatcher_NewSubdirScanIndexesPreWatchFile proves the new-subdir
+// race is closed: a file written into a freshly-created directory before
+// its watch attaches (so its own create event is never delivered) is
+// still indexed, because the directory's create event triggers a scoped
+// subtree scan.
We drive the real path — handleEvent -> enqueueDirScan
+// -> runDirScan -> IncrementalReindexPaths, no seam — and assert the
+// pre-watch file lands in the graph.
+func TestWatcher_NewSubdirScanIndexesPreWatchFile(t *testing.T) {
+	root := t.TempDir()
+	extractor := &toggleExtractor{}
+	registry := parser.NewRegistry()
+	registry.Register(extractor)
+	g := graph.New()
+	idx := New(g, registry, config.IndexConfig{Workers: 1}, zap.NewNop())
+	idx.search = search.NewBM25()
+	idx.SetRootPath(root)
+
+	// Baseline: one seed file indexed before the watcher exists.
+	extractor.setFail(false)
+	extractor.setFuncs("Seed")
+	seedPath := filepath.Join(root, "seed.fk")
+	writeFile(t, seedPath, "seed body")
+	_, err := idx.IndexCtx(testCtx(), root)
+	require.NoError(t, err)
+	require.NotEmpty(t, g.GetFileNodes("seed.fk"))
+
+	watchCfg := config.WatchConfig{Enabled: true, DebounceMs: 10}
+	w, err := NewWatcher(idx, watchCfg, zap.NewNop())
+	require.NoError(t, err)
+
+	// The directory shows up with a file already in it. That file's own
+	// create event is never delivered here (it stands in for a write
+	// that landed before the watch attached), so the directory create
+	// below is the only signal the watcher receives.
+	pkgDir := filepath.Join(root, "pkg")
+	require.NoError(t, os.MkdirAll(pkgDir, 0o755))
+	extractor.setFuncs("Buried")
+	buriedPath := filepath.Join(pkgDir, "buried.fk")
+	writeFile(t, buriedPath, "buried body")
+	require.Empty(t, g.GetFileNodes("pkg/buried.fk"),
+		"the pre-watch file must be absent before the directory scan")
+
+	dirCreated := fswatcher.WatchEvent{
+		Path:  pkgDir,
+		Types: []fswatcher.EventType{fswatcher.EventCreate},
+	}
+	w.handleEvent(dirCreated)
+
+	// The scan runs off the caller's goroutine, so poll for the result.
+	buriedIndexed := func() bool {
+		return len(g.GetFileNodes("pkg/buried.fk")) > 0
+	}
+	require.Eventually(t, buriedIndexed, 5*time.Second, 10*time.Millisecond,
+		"the new-directory create must trigger a scoped scan that indexes the pre-watch file")
+}
+
+// TestWatcher_DirEventScanGating proves the scan trigger is gated on a
+// Create: a directory create enqueues a scoped scan, while a bare
+// directory modify (an mtime bump with no Create) does not — entry
+// changes inside an existing directory fire their own file events. Uses
+// the scanFn seam.
+func TestWatcher_DirEventScanGating(t *testing.T) { + idx, _ := newToggleIndexer(t) + dir := t.TempDir() + idx.SetRootPath(dir) + subdir := filepath.Join(dir, "sub") + require.NoError(t, os.MkdirAll(subdir, 0o755)) + + w, err := NewWatcher(idx, config.WatchConfig{Enabled: true, DebounceMs: 10}, zap.NewNop()) + require.NoError(t, err) + + scanned := make(chan map[string]struct{}, 4) + w.reconcileMu.Lock() + w.scanFn = func(dirs map[string]struct{}) { scanned <- dirs } + w.reconcileMu.Unlock() + + // A bare modify on the directory must NOT enqueue a scan. + w.handleEvent(fswatcher.WatchEvent{ + Path: subdir, + Types: []fswatcher.EventType{fswatcher.EventMod}, + }) + select { + case <-scanned: + t.Fatal("a directory modify without a Create must not trigger a scan") + case <-time.After(150 * time.Millisecond): + } + + // A create on the directory must enqueue a scoped scan of it. + w.handleEvent(fswatcher.WatchEvent{ + Path: subdir, + Types: []fswatcher.EventType{fswatcher.EventCreate}, + }) + select { + case dirs := <-scanned: + _, ok := dirs[subdir] + assert.True(t, ok, "the scan set must contain the newly-created directory") + case <-time.After(2 * time.Second): + t.Fatal("a directory create must trigger a scoped scan") + } +} + +// panicOnReadStore wraps a real Store but panics on GetFileNodes once +// armed — the shape store_sqlite's panicOnFatal produces when the DB is +// closed/locked (e.g. mid daemon-restart) or its schema is missing. +type panicOnReadStore struct { + graph.Store + armed atomic.Bool +} + +func (s *panicOnReadStore) GetFileNodes(p string) []*graph.Node { + if s.armed.Load() { + panic("simulated fatal store error") + } + return s.Store.GetFileNodes(p) +} + +// TestWatcher_PatchPanicRecoveredNotCrash proves the watcher panic +// firewall: a fatal store error during a debounced patch is recovered +// and logged, not propagated out of the timer goroutine to crash the +// whole daemon. 
The fsnotify-driven goroutines don't route through the +// MCP wrapToolHandler firewall, so a closed/locked DB during a restart +// (panicOnFatal) used to take the process down — the exact shape of the +// observed crash. Against the pre-firewall code the panic escapes the +// AfterFunc goroutine and aborts the test binary. +func TestWatcher_PatchPanicRecoveredNotCrash(t *testing.T) { + ext := &toggleExtractor{} + reg := parser.NewRegistry() + reg.Register(ext) + store := &panicOnReadStore{Store: graph.New()} + idx := New(store, reg, config.IndexConfig{Workers: 1}, zap.NewNop()) + idx.search = search.NewBM25() + dir := t.TempDir() + idx.SetRootPath(dir) + path := filepath.Join(dir, "main.fk") + + ext.setFail(false) + ext.setFuncs("Alpha") + writeFile(t, path, "alpha body") + require.NoError(t, idx.IndexFile(path)) + + core, logs := observer.New(zapcore.ErrorLevel) + w, err := NewWatcher(idx, config.WatchConfig{Enabled: true, DebounceMs: 5}, zap.New(core)) + require.NoError(t, err) + + // Arm the store so the next read panics, then drive the debounced + // patch path. The panic fires in the AfterFunc goroutine. 
+ store.armed.Store(true) + w.handleEvent(fswatcher.WatchEvent{ + Path: path, + Types: []fswatcher.EventType{fswatcher.EventMod}, + }) + + require.Eventually(t, func() bool { + return logs.FilterMessageSnippet("recovered from panic").Len() > 0 + }, 2*time.Second, 10*time.Millisecond, + "a panic in the debounced patch must be recovered and logged, not crash the daemon") +} diff --git a/internal/indexer/shadow_resolver_test.go b/internal/indexer/shadow_resolver_test.go new file mode 100644 index 00000000..c946c6bb --- /dev/null +++ b/internal/indexer/shadow_resolver_test.go @@ -0,0 +1,122 @@ +package indexer + +import ( + "context" + "path/filepath" + "sort" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + "github.com/zzet/gortex/internal/config" + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/graph/store_sqlite" + "github.com/zzet/gortex/internal/parser" + "github.com/zzet/gortex/internal/parser/languages" +) + +// TestShadowSwap_ResolverFollowsGraphPointer guards against the regression +// where the indexer's in-memory shadow swap reassigned idx.graph but left +// idx.resolver pointing at the empty disk Store. The symptom was that +// every resolver pass (module attribution, relative imports, edge in-place +// resolution, ...) silently no-op'd for any backend that opted into the +// shadow swap — because the resolver's r.graph.EdgesWithUnresolvedTarget() +// returned 0 against the empty disk store and ResolveAll short-circuited +// on len(pending) == 0. +// +// The test indexes the same Python project twice — once into an in-memory +// *Graph (no shadow swap), once into a sqlite *Store (shadow swap engaged) +// — and asserts both produce the same node ID set and the same module +// attribution output (KindModule nodes for pypi imports). 
+func TestShadowSwap_ResolverFollowsGraphPointer(t *testing.T) { + dir := t.TempDir() + + // A pyproject.toml so the dep scanner discovers pypi:requests as + // an external dependency, which the resolver then materialises as + // a KindModule node via attributeNonGoModuleImports. + writeFile(t, filepath.Join(dir, "pyproject.toml"), ` +[project] +name = "regression" +dependencies = ["requests>=2.0"] +`) + + // Source file imports the pypi package and a stdlib module. Both + // flow through the same module-attribution pass. + writeFile(t, filepath.Join(dir, "app.py"), ` +import os +import requests + +def fetch(url): + return requests.get(url).text +`) + + newIdx := func(t *testing.T, g graph.Store) *Indexer { + t.Helper() + reg := parser.NewRegistry() + reg.Register(languages.NewPythonExtractor()) + cfg := config.Default().Index + cfg.Workers = 2 + return New(g, reg, cfg, zap.NewNop()) + } + + indexAndCollect := func(t *testing.T, g graph.Store) map[string]string { + t.Helper() + _, err := newIdx(t, g).IndexCtx(context.Background(), dir) + require.NoError(t, err) + ids := map[string]string{} + for _, n := range g.AllNodes() { + ids[n.ID] = string(n.Kind) + } + return ids + } + + memG := graph.New() + memIDs := indexAndCollect(t, memG) + + sqliteDir := t.TempDir() + sqliteStore, err := store_sqlite.Open(filepath.Join(sqliteDir, "store.sqlite")) + require.NoError(t, err) + t.Cleanup(func() { _ = sqliteStore.Close() }) + + // Sanity: sqlite implements BulkLoader so the shadow swap engages. + _, isBulk := graph.Store(sqliteStore).(graph.BulkLoader) + require.True(t, isBulk, "sqlite must implement BulkLoader for this regression to exercise the shadow swap") + + dskIDs := indexAndCollect(t, sqliteStore) + + // The KindModule node the resolver materialises for `import requests` + // is the canary — without the fix it never gets written, because + // ResolveAll short-circuits before attributeNonGoModuleImports runs. 
+ require.Contains(t, memIDs, "module::pypi:requests", + "baseline: in-memory backend must materialise the pypi module node") + assert.Contains(t, dskIDs, "module::pypi:requests", + "shadow-swap path must materialise the pypi module node — regression: resolver pointed at empty disk store") + + // Stdlib import gets the same treatment. + require.Contains(t, memIDs, "module::python:stdlib::os", + "baseline: in-memory backend must materialise the python stdlib module node") + assert.Contains(t, dskIDs, "module::python:stdlib::os", + "shadow-swap path must materialise the python stdlib module node") + + // Beyond the canary, both backends must produce the same set of + // node IDs. Any divergence means some resolver pass is still missing + // from one of the two paths. + onlyMem := setDiff(memIDs, dskIDs) + onlyDsk := setDiff(dskIDs, memIDs) + sort.Strings(onlyMem) + sort.Strings(onlyDsk) + assert.Empty(t, onlyMem, "nodes only in memory: %v", onlyMem) + assert.Empty(t, onlyDsk, "nodes only in sqlite: %v", onlyDsk) +} + +func setDiff(a, b map[string]string) []string { + out := []string{} + for id := range a { + if _, ok := b[id]; !ok { + out = append(out, id) + } + } + return out +} diff --git a/internal/indexer/shadow_threshold.go b/internal/indexer/shadow_threshold.go new file mode 100644 index 00000000..ee1b0468 --- /dev/null +++ b/internal/indexer/shadow_threshold.go @@ -0,0 +1,74 @@ +package indexer + +import ( + "os" + "strconv" + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// defaultShadowMaxFileCount caps the file count above which IndexCtx +// refuses to swap idx.graph for an in-memory shadow during cold start. +// Picked empirically from the in-memory store's prior profiling: at +// ~35k C files (drivers/) the in-memory store peaked at 8.6GB RSS; at +// 60k+ the peak is well past 16GB. 
The shadow path doubles that +// footprint (in-memory + persisted disk copy at the FlushBulk step), +// so the safe ceiling for a 32GB dev machine sits around 50k source +// files. Above that we fall through to the per-call disk path — +// slower per IndexCtx but bounded RAM. +const defaultShadowMaxFileCount = 50000 + +// defaultStreamingChunkSize is the per-chunk file count used by the +// streaming-flush path. At ~30 nodes / ~100 edges per file, 5000 +// files per chunk yields a ~600MB shadow that fits comfortably in +// RAM even on 8GB build agents. +const defaultStreamingChunkSize = 5000 + +// shadowMaxFileCount returns the active file-count ceiling for the +// IndexCtx in-memory shadow swap. GORTEX_SHADOW_MAX_FILES overrides +// the default; setting it to 0 disables the shadow entirely (always +// run against the disk store directly), setting it to a high value +// (e.g. 10_000_000) effectively disables the guard. Non-numeric or +// negative values fall back to the default. +func shadowMaxFileCount() int { + if v := os.Getenv("GORTEX_SHADOW_MAX_FILES"); v != "" { + n, err := strconv.Atoi(v) + if err == nil && n >= 0 { + return n + } + } + return defaultShadowMaxFileCount +} + +// streamingFlushActive reports whether the streaming-flush parse path +// should engage for this IndexCtx. 
Requirements: +// +// - the backing store implements graph.BulkLoader (the on-disk backend does) +// - the file count is above the shadow-max threshold (small repos +// stay on the all-in-memory shadow path) +// - GORTEX_STREAMING_FLUSH is enabled (off by default — the +// streaming path leaves resolve to the disk-only per-call path, +// so it's only useful when shadow swap can't fit in RAM) +func streamingFlushActive(store graph.Store, fileCount int) bool { + if _, ok := store.(graph.BulkLoader); !ok { + return false + } + if fileCount <= shadowMaxFileCount() { + return false + } + v := os.Getenv("GORTEX_STREAMING_FLUSH") + return v == "1" || strings.EqualFold(v, "true") +} + +// streamingChunkSize returns the per-chunk file count for the +// streaming-flush path. Override via GORTEX_STREAMING_CHUNK_SIZE. +func streamingChunkSize() int { + if v := os.Getenv("GORTEX_STREAMING_CHUNK_SIZE"); v != "" { + n, err := strconv.Atoi(v) + if err == nil && n > 0 { + return n + } + } + return defaultStreamingChunkSize +} diff --git a/internal/indexer/should_index_for_search_test.go b/internal/indexer/should_index_for_search_test.go new file mode 100644 index 00000000..d3702666 --- /dev/null +++ b/internal/indexer/should_index_for_search_test.go @@ -0,0 +1,43 @@ +package indexer + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "go.uber.org/zap" + + "github.com/zzet/gortex/internal/config" + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/parser" +) + +// TestShouldIndexForSearch_ExcludesKindLocal is the regression that +// guards the search-index default-filter for KindLocal. The Go +// dataflow walker materialises every intra-function binding as a +// KindLocal node; without the search-side exclusion, common names +// (`err` / `data` / `n` / `i`) would flood every search result with +// thousands of per-function copies. 
+func TestShouldIndexForSearch_ExcludesKindLocal(t *testing.T) { + idx := New(graph.New(), parser.NewRegistry(), config.Default().Index, zap.NewNop()) + + cases := []struct { + name string + node *graph.Node + want bool + }{ + {"function passes", &graph.Node{ID: "f", Kind: graph.KindFunction, Name: "Foo"}, true}, + {"method passes", &graph.Node{ID: "m", Kind: graph.KindMethod, Name: "Bar"}, true}, + {"type passes", &graph.Node{ID: "t", Kind: graph.KindType, Name: "Baz"}, true}, + {"param passes", &graph.Node{ID: "p", Kind: graph.KindParam, Name: "x"}, true}, + {"closure passes", &graph.Node{ID: "c", Kind: graph.KindClosure, Name: "closure@4"}, true}, + {"file excluded", &graph.Node{ID: "fl", Kind: graph.KindFile, Name: "foo.go"}, false}, + {"import excluded", &graph.Node{ID: "im", Kind: graph.KindImport, Name: "fmt"}, false}, + {"local excluded — the regression", &graph.Node{ID: "l", Kind: graph.KindLocal, Name: "err"}, false}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := idx.shouldIndexForSearch(c.node) + assert.Equal(t, c.want, got) + }) + } +} diff --git a/internal/indexer/test_edges.go b/internal/indexer/test_edges.go index e52b813c..77a16beb 100644 --- a/internal/indexer/test_edges.go +++ b/internal/indexer/test_edges.go @@ -28,7 +28,7 @@ import ( // // Returns counts for telemetry: number of nodes marked as test, // number of EdgeTests emitted. -func markTestSymbolsAndEmitEdges(g *graph.Graph) (markedTests int, edgesEmitted int) { +func markTestSymbolsAndEmitEdges(g graph.Store) (markedTests int, edgesEmitted int) { if g == nil { return 0, 0 } @@ -40,11 +40,15 @@ func markTestSymbolsAndEmitEdges(g *graph.Graph) (markedTests int, edgesEmitted g.ResolveMutex().Lock() defer g.ResolveMutex().Unlock() - // Pass 1: classify file nodes, then function/method nodes. 
- testFiles := map[string]bool{} // file node ID → is test file - fileRunners := map[string]string{} // file FilePath → test runner - for _, n := range g.AllNodes() { - if n == nil || n.Kind != graph.KindFile { + // Pass 1: classify file nodes, then function/method nodes. Build + // a local testNodes set keyed by node id so Pass 2 can probe it + // without re-walking the Meta. (Node.Meta mutations on returned + // nodes don't persist back to disk backends, so a later GetNode + // in Pass 2 wouldn't see the is_test flag we set here.) + testFiles := map[string]bool{} // file node ID → is test file + fileRunners := map[string]string{} // file FilePath → test runner + for n := range g.NodesByKind(graph.KindFile) { + if n == nil { continue } if IsTestFile(n.FilePath) { @@ -60,22 +64,10 @@ func markTestSymbolsAndEmitEdges(g *graph.Graph) (markedTests int, edgesEmitted } } - for _, n := range g.AllNodes() { - if n == nil { - continue - } - if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { - continue - } - // Test-file membership is the authoritative signal. No standard - // runner (go test, pytest, ...) picks up a test by name outside - // a test file, so a production function that merely starts with - // "Test"/"Benchmark" (e.g. TestRole) must not be flagged. The - // name convention only refines the *role* — benchmark / fuzz / - // example — for symbols already inside a test file; anything - // else there is test support code: role "test". 
+ testNodes := map[string]bool{} + stampTestSymbol := func(n *graph.Node) { if !testFiles[n.FilePath] { - continue + return } role := TestRole(n.Name, n.Language) if role == "" { @@ -89,31 +81,49 @@ func markTestSymbolsAndEmitEdges(g *graph.Graph) (markedTests int, edgesEmitted if runner := fileRunners[n.FilePath]; runner != "" { n.Meta["test_runner"] = runner } + testNodes[n.ID] = true markedTests++ } + for n := range g.NodesByKind(graph.KindFunction) { + if n != nil { + // Test-file membership is the authoritative signal. No + // standard runner (go test, pytest, ...) picks up a test + // by name outside a test file, so a production function + // that merely starts with "Test"/"Benchmark" (e.g. + // TestRole) must not be flagged. The name convention only + // refines the *role* — benchmark / fuzz / example — for + // symbols already inside a test file; anything else there + // is test support code: role "test". + stampTestSymbol(n) + } + } + for n := range g.NodesByKind(graph.KindMethod) { + if n != nil { + stampTestSymbol(n) + } + } // Pass 2: walk EdgeCalls; for each (test, non-test) pair, emit a // parallel EdgeTests. We dedupe per (From, To) because a single - // test can call the same subject multiple times. + // test can call the same subject multiple times. The testNodes set + // built in Pass 1 is the authoritative source — no inline GetNode + // is needed because the From / To kind filter is already enforced + // by "From must be a test symbol" (only function/method ids land + // in testNodes). 
seen := map[string]bool{} type pair struct{ from, to string } var pending []struct { pair pair edge *graph.Edge } - for _, e := range g.AllEdges() { - if e == nil || e.Kind != graph.EdgeCalls { - continue - } - fromNode := g.GetNode(e.From) - toNode := g.GetNode(e.To) - if fromNode == nil || toNode == nil { + for e := range g.EdgesByKind(graph.EdgeCalls) { + if e == nil { continue } - if !isTestNode(fromNode) { + if !testNodes[e.From] { continue } - if isTestNode(toNode) { + if testNodes[e.To] { continue // test → test calls are infrastructure, not subject coverage } key := e.From + "\x00" + e.To @@ -141,14 +151,6 @@ func markTestSymbolsAndEmitEdges(g *graph.Graph) (markedTests int, edgesEmitted return markedTests, edgesEmitted } -func isTestNode(n *graph.Node) bool { - if n == nil || n.Meta == nil { - return false - } - v, _ := n.Meta["is_test"].(bool) - return v -} - // detectTestRunnerForFile resolves the runner identifier for a test file // node by consulting three signals, in priority order: // @@ -173,7 +175,7 @@ func isTestNode(n *graph.Node) bool { // // Returns "" when no signal applies; the caller leaves test_runner // unset rather than guessing. -func detectTestRunnerForFile(g *graph.Graph, fileNode *graph.Node) string { +func detectTestRunnerForFile(g graph.Store, fileNode *graph.Node) string { if fileNode == nil { return "" } @@ -215,7 +217,7 @@ func detectTestRunnerForFile(g *graph.Graph, fileNode *graph.Node) string { // (mirrors DetectJSTSTestRunner so files compiled by a non-JS / TS // extractor still classify correctly), Python (pytest / unittest), // and Ruby (rspec / minitest). 
-func detectRunnerFromImportEdges(g *graph.Graph, fileNode *graph.Node) string { +func detectRunnerFromImportEdges(g graph.Store, fileNode *graph.Node) string { const prefix = "unresolved::import::" for _, e := range g.GetOutEdges(fileNode.ID) { if e == nil || e.Kind != graph.EdgeImports { diff --git a/internal/indexer/unicode_path_test.go b/internal/indexer/unicode_path_test.go index 1973b868..81ffefc7 100644 --- a/internal/indexer/unicode_path_test.go +++ b/internal/indexer/unicode_path_test.go @@ -47,7 +47,7 @@ func goSrc(funcName string) string { // fileKindNodes returns only the file-kind nodes the graph holds for // the given key — used to detect a duplicate file-node leaking after a // re-index. -func fileKindNodes(g *graph.Graph, key string) []*graph.Node { +func fileKindNodes(g graph.Store, key string) []*graph.Node { var out []*graph.Node for _, n := range g.GetFileNodes(key) { if n.Kind == graph.KindFile { diff --git a/internal/indexer/warm_restart_mtime_prune_test.go b/internal/indexer/warm_restart_mtime_prune_test.go new file mode 100644 index 00000000..07097ebe --- /dev/null +++ b/internal/indexer/warm_restart_mtime_prune_test.go @@ -0,0 +1,126 @@ +package indexer + +import ( + "context" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + "github.com/zzet/gortex/internal/config" + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/graph/store_sqlite" +) + +// TestWarmRestart_PrunesDeletedFileMtimes_FastPath is the end-to-end +// regression for the "warm restart re-indexes everything even though nothing +// changed on disk" bug. +// +// Root cause: the full-index mtime persist used to be an upsert +// (BulkSetFileMtimes), so a file deleted since the last index left its row in +// the store forever. 
On every warm restart HasChangesSinceMtimes hit that +// phantom-deletion row, flagged the repo as changed, and forced a full +// re-track + all global passes — which never converged, because the re-track +// re-persisted with the same upsert. +// +// The fix makes the full-index persist authoritative (ReplaceFileMtimes). This +// test proves: (1) a deleted file's row is pruned on the next full index, and +// (2) the subsequent unchanged warm restart takes the fast path +// (HasChangesSinceMtimes == false). +func TestWarmRestart_PrunesDeletedFileMtimes_FastPath(t *testing.T) { + dir := t.TempDir() + repoPath := filepath.Join(dir, "repo") + require.NoError(t, os.MkdirAll(repoPath, 0o755)) + writeFile(t, filepath.Join(repoPath, "a.go"), "package main\nfunc Alpha() {}\n") + writeFile(t, filepath.Join(repoPath, "b.go"), "package main\nfunc Beta() {}\n") + + s, err := store_sqlite.Open(filepath.Join(t.TempDir(), "store.sqlite")) + require.NoError(t, err) + t.Cleanup(func() { _ = s.Close() }) + + // sqlite must advertise the replace capability or the fix can't engage. + _, isReplacer := graph.Store(s).(graph.FileMtimeReplacer) + require.True(t, isReplacer, "sqlite store must implement FileMtimeReplacer") + + newIdx := func() *Indexer { + idx := New(graph.Store(s), newTestRegistry(), config.Default().Index, zap.NewNop()) + idx.SetRepoPrefix("repo") + idx.SetRootPath(repoPath) + return idx + } + + // First index: both files persisted. + _, err = newIdx().IndexCtx(context.Background(), repoPath) + require.NoError(t, err) + + got := s.LoadFileMtimes("repo") + require.Contains(t, got, "a.go", "first index must persist a.go") + require.Contains(t, got, "b.go", "first index must persist b.go") + + // Delete b.go on disk — the analogue of the deleted store_ladybug files. 
+ require.NoError(t, os.Remove(filepath.Join(repoPath, "b.go"))) + + // Warm restart #1: a fresh indexer seeded from the persisted snapshot + // must DETECT the deletion — this is correct behaviour the first time. + idxR1 := newIdx() + idxR1.SetFileMtimes(s.LoadFileMtimes("repo")) + require.True(t, idxR1.HasChangesSinceMtimes(repoPath), + "the first warm restart after a deletion must detect the change") + + // Re-track (full index). The authoritative persist must prune b.go. + _, err = idxR1.IndexCtx(context.Background(), repoPath) + require.NoError(t, err) + + got = s.LoadFileMtimes("repo") + assert.Contains(t, got, "a.go", "surviving file must stay persisted") + assert.NotContains(t, got, "b.go", + "deleted file's mtime row must be pruned by the authoritative full-index persist") + + // Warm restart #2: nothing changed and the persisted set now matches + // disk, so the reconcile must take the FAST PATH — no phantom deletion, + // no full re-track, no global passes. + idxR2 := newIdx() + idxR2.SetFileMtimes(s.LoadFileMtimes("repo")) + assert.False(t, idxR2.HasChangesSinceMtimes(repoPath), + "after pruning, an unchanged warm restart must take the fast path") +} + +// TestIncrementalReindex_PrunesDeletedFileMtimes covers the watcher / +// incremental path: a file deleted between scans must have its persisted +// mtime row removed by IncrementalReindex (via DeleteFileMtimes), not just +// its in-memory entry — otherwise the next warm restart re-discovers it as a +// phantom deletion. 
+func TestIncrementalReindex_PrunesDeletedFileMtimes(t *testing.T) { + dir := t.TempDir() + repoPath := filepath.Join(dir, "repo") + require.NoError(t, os.MkdirAll(repoPath, 0o755)) + writeFile(t, filepath.Join(repoPath, "keep.go"), "package main\nfunc Keep() {}\n") + writeFile(t, filepath.Join(repoPath, "drop.go"), "package main\nfunc Drop() {}\n") + + s, err := store_sqlite.Open(filepath.Join(t.TempDir(), "store.sqlite")) + require.NoError(t, err) + t.Cleanup(func() { _ = s.Close() }) + + idx := New(graph.Store(s), newTestRegistry(), config.Default().Index, zap.NewNop()) + idx.SetRepoPrefix("repo") + idx.SetRootPath(repoPath) + + _, err = idx.IndexCtx(context.Background(), repoPath) + require.NoError(t, err) + require.Contains(t, s.LoadFileMtimes("repo"), "drop.go") + + // Delete drop.go and run the incremental path (the janitor / watcher + // route), not a full re-track. + require.NoError(t, os.Remove(filepath.Join(repoPath, "drop.go"))) + res, err := idx.IncrementalReindex(repoPath) + require.NoError(t, err) + assert.Equal(t, 1, res.DeletedFileCount, "incremental reindex must report the deletion") + + got := s.LoadFileMtimes("repo") + assert.Contains(t, got, "keep.go") + assert.NotContains(t, got, "drop.go", + "incremental reindex must prune the deleted file's persisted mtime row") +} diff --git a/internal/indexer/watcher.go b/internal/indexer/watcher.go index 06b7f418..d629ed17 100644 --- a/internal/indexer/watcher.go +++ b/internal/indexer/watcher.go @@ -94,6 +94,29 @@ type Watcher struct { // down in Stop alongside the fsnotify backend. nil when the // per-repo watcher is disabled via WatchConfig.Enabled. poller *Poller + + // reconcileMu guards the overflow-driven full-tree reconcile. 
+ // reconcilePending coalesces a burst of overflow / dropped-event + // signals into at most one reconcile in flight: the kernel inotify + // queue can overflow (EventOverflow) or the backend can drop events + // under backpressure (the Dropped() channel), and either means we + // may have lost a create/modify with no path to re-index. macOS + // FSEvents self-heals (it re-scans on UserDropped/KernelDropped), + // but Linux inotify does not — without this the lost event waits on + // the up-to-1h janitor. reconcileFn is a test seam: nil in + // production (the real IncrementalReindex runs). + reconcileMu sync.Mutex + reconcilePending bool + reconcileFn func() + + // pendingScanDirs coalesces newly-created directories awaiting a + // scoped subtree re-index — the new-subdir race (see enqueueDirScan). + // dirScanActive guards a single in-flight drainer goroutine; scanFn + // is a test seam, nil in production (the real IncrementalReindexPaths + // runs). All three are guarded by reconcileMu. + pendingScanDirs map[string]struct{} + dirScanActive bool + scanFn func(map[string]struct{}) } const maxHistory = 1000 @@ -375,6 +398,7 @@ func (w *Watcher) loop() { return } eventsCh := w.fsw.Events() + droppedCh := w.fsw.Dropped() for { select { case <-w.done: @@ -384,11 +408,189 @@ func (w *Watcher) loop() { return } w.handleEvent(event) + case _, ok := <-droppedCh: + if !ok { + // Backend tore down its dropped channel; keep + // draining Events only. + droppedCh = nil + continue + } + // The backend dropped an event under backpressure (the + // main Events channel was full). We don't know which path + // was lost, so reconcile the whole tree. + w.triggerOverflowReconcile("dropped-event") } } } +// guardWatcherPanic recovers a panic in a watcher background goroutine — +// a debounced patch, a storm drain, an overflow reconcile, or a +// new-directory scan. 
Those goroutines call into the graph store, and
+// store_sqlite turns a fatal storage error (a closed DB during a daemon
+// restart, a busy/locked DB, disk-full) into a panic via panicOnFatal.
+// The MCP tool path has its own firewall (wrapToolHandler); these
+// fsnotify-driven goroutines don't route through it, so without this a
+// single transient store error during a restart or rebuild takes the
+// whole daemon down. Recovering aborts just that unit of work — the file
+// stays stale until the next event or the reconcile janitor — instead of
+// crashing the process.
+func (w *Watcher) guardWatcherPanic(op string) {
+	// recover only stops a panic when called directly by the deferred
+	// function, so this method has to be the target of the defer itself
+	// (`defer w.guardWatcherPanic(...)`), never called from a wrapper.
+	cause := recover()
+	if cause == nil {
+		return
+	}
+	// The panic is swallowed even without a logger; only the log line
+	// is skipped.
+	if w.logger == nil {
+		return
+	}
+	w.logger.Error("watcher: recovered from panic in background re-index",
+		zap.String("op", op),
+		zap.Any("panic", cause),
+		zap.Stack("stack"))
+}
+
+// triggerOverflowReconcile schedules a single coalesced full-tree
+// reconcile in response to a lost-event signal (a kernel inotify queue
+// overflow or a backpressure-dropped event). A burst of signals
+// collapses into at most one reconcile in flight: the first caller sets
+// reconcilePending and runs the reconcile off the event loop; concurrent
+// callers observe the flag and return immediately. Best-effort and
+// logged — the event loop is never blocked.
+func (w *Watcher) triggerOverflowReconcile(reason string) { + w.reconcileMu.Lock() + if w.reconcilePending { + w.reconcileMu.Unlock() + return + } + w.reconcilePending = true + fn := w.reconcileFn + w.reconcileMu.Unlock() + + if w.logger != nil { + w.logger.Warn("watcher: event signal lost — scheduling full-tree reconcile", + zap.String("reason", reason), + zap.String("root", w.indexer.rootPath)) + } + + go func() { + defer func() { + w.reconcileMu.Lock() + w.reconcilePending = false + w.reconcileMu.Unlock() + }() + defer w.guardWatcherPanic("overflow-reconcile") + if fn != nil { + fn() + return + } + if _, err := w.indexer.IncrementalReindex(w.indexer.rootPath); err != nil { + if w.logger != nil { + w.logger.Warn("watcher: overflow reconcile failed", + zap.String("reason", reason), + zap.Error(err)) + } + } + }() +} + +// dirScanEscalateCap bounds the scoped new-directory scan: a burst that +// creates more than this many directories (a large checkout or unpack) +// escalates to a single full-tree reconcile instead of fanning out into +// that many scoped subtree walks. +const dirScanEscalateCap = 64 + +// enqueueDirScan schedules a scoped re-index of a newly-created +// directory's subtree, closing the new-subdir race: on Linux inotify a +// file written into a directory before its watch attaches fires no +// event. A burst of directory creates coalesces into a single in-flight +// drainer (mirrors triggerOverflowReconcile) — the first caller starts +// the goroutine, concurrent callers add their directory to +// pendingScanDirs and return. The drainer loops until the set is empty, +// so a directory enqueued while a scan is in flight is still picked up; +// nothing is lost and there is no debounce-timing race. 
+func (w *Watcher) enqueueDirScan(dir string) { + w.reconcileMu.Lock() + if w.pendingScanDirs == nil { + w.pendingScanDirs = make(map[string]struct{}) + } + w.pendingScanDirs[dir] = struct{}{} + if w.dirScanActive { + w.reconcileMu.Unlock() + return + } + w.dirScanActive = true + w.reconcileMu.Unlock() + + go func() { + for { + w.reconcileMu.Lock() + dirs := w.pendingScanDirs + w.pendingScanDirs = nil + if len(dirs) == 0 { + w.dirScanActive = false + w.reconcileMu.Unlock() + return + } + fn := w.scanFn + w.reconcileMu.Unlock() + func() { + defer w.guardWatcherPanic("dir-scan") + w.runDirScan(dirs, fn) + }() + } + }() +} + +// runDirScan re-indexes the accumulated new directories. A large burst +// escalates to one full-tree reconcile (dirScanEscalateCap); otherwise +// the scoped subtrees are walked in a single IncrementalReindexPaths +// call, which IsStale-gates each file so already-current files cost only +// a stat. fn is the test seam. +func (w *Watcher) runDirScan(dirs map[string]struct{}, fn func(map[string]struct{})) { + if fn != nil { + fn(dirs) + return + } + if len(dirs) > dirScanEscalateCap { + if w.logger != nil { + w.logger.Info("watcher: large new-directory burst — full-tree reconcile", + zap.Int("dirs", len(dirs)), zap.String("root", w.indexer.rootPath)) + } + if _, err := w.indexer.IncrementalReindex(w.indexer.rootPath); err != nil && w.logger != nil { + w.logger.Warn("watcher: new-directory reconcile failed", zap.Error(err)) + } + return + } + paths := make([]string, 0, len(dirs)) + for d := range dirs { + paths = append(paths, d) + } + if _, err := w.indexer.IncrementalReindexPaths(w.indexer.rootPath, paths); err != nil && w.logger != nil { + w.logger.Warn("watcher: new-directory scan failed", + zap.Strings("dirs", paths), zap.Error(err)) + } +} + +// hasEventType reports whether the aggregated event-type set contains want. 
+func hasEventType(types []fswatcher.EventType, want fswatcher.EventType) bool { + for _, t := range types { + if t == want { + return true + } + } + return false +} + func (w *Watcher) handleEvent(event fswatcher.WatchEvent) { + // Kernel queue overflow arrives as a pathless EventOverflow on the + // Events channel: the Linux inotify and Windows backends emit it when + // the kernel drops events and cannot tell us which paths were lost. + // macOS FSEvents never emits it — the darwin backend absorbs + // UserDropped/KernelDropped by re-scanning the affected subtree + // internally — so this branch is effectively Linux/Windows-only. With + // no path to re-index, trigger a coalesced full-tree reconcile and + // stop; every path-based step below would misfire on the empty path. + for _, t := range event.Types { + if t == fswatcher.EventOverflow { + w.triggerOverflowReconcile("queue-overflow") + return + } + } + path := normalizeEventPath(event.Path, w.indexer.rootPath) // Probe artifacts: sentinel files Start writes to confirm the @@ -416,11 +618,24 @@ func (w *Watcher) handleEvent(event fswatcher.WatchEvent) { return } - // fswatcher with WatchNested is recursive on every backend, so we - // don't need to manually re-attach watches on directory creates; - // drop dir events before they reach indexer logic. + // Directory events. fswatcher with WatchNested attaches the watch + // for a new directory itself, so we never re-attach. But on Linux + // inotify that watch lands only AFTER the directory's create event is + // read, so a file written into the directory in that gap fires no + // event and would stay invisible until the hourly janitor. 
When the + // event carries a Create, scan the new directory's subtree on disk so + // those pre-watch files are picked up regardless of whether an event + // ever fired ("watch first, then scan": files created after the watch + // fire normal events, files created before are caught by the scan, + // and the overlap is at worst a redundant idempotent re-index). A dir + // event without a Create — a bare mtime bump on an existing dir — + // needs no scan: entry changes inside it fire their own file events. + // Either way the directory event itself reaches no indexer logic. if kind == ChangeCreated || kind == ChangeModified { if info, err := os.Stat(path); err == nil && info.IsDir() { + if hasEventType(event.Types, fswatcher.EventCreate) { + w.enqueueDirScan(path) + } return } } @@ -451,10 +666,15 @@ func (w *Watcher) handleEvent(event fswatcher.WatchEvent) { } debounce := time.Duration(w.config.DebounceMs) * time.Millisecond w.pending[path] = time.AfterFunc(debounce, func() { + // Clean up the pending entry even if the patch panics, then + // recover so a fatal store error can't crash the daemon. + defer func() { + w.mu.Lock() + delete(w.pending, path) + w.mu.Unlock() + }() + defer w.guardWatcherPanic("patch " + path) w.patchGraph(path, kind) - w.mu.Lock() - delete(w.pending, path) - w.mu.Unlock() }) w.mu.Unlock() } @@ -526,6 +746,7 @@ func (w *Watcher) recordInStorm(path string, kind ChangeKind) { // then one global ResolveAll at the end. Cuts a 500-file checkout // from "resolver runs 500 times" to "resolver runs once." func (w *Watcher) drainStorm() { + defer w.guardWatcherPanic("storm-drain") w.stormMu.Lock() batch := w.stormBatch w.stormBatch = make(map[string]ChangeKind) @@ -627,15 +848,30 @@ func (w *Watcher) patchGraph(path string, kind ChangeKind) { return } - nr, er := w.indexer.EvictFile(path) - nodesRemoved = nr - edgesRemoved = er + // Do NOT pre-evict. 
IndexFile parse-then-swaps internally: it + // evicts the file's prior nodes and re-adds the new ones only on a + // successful parse, and leaves the prior nodes intact on a parse + // failure. Pre-evicting here was the node-loss bug — a transiently + // unparseable save (mid-edit) dropped the file's symbols from the + // graph until the next clean save. Capture the file's prior node + // count first (still present pre-swap) so removed/added telemetry + // stays gross: a rename removes one node and adds one even though + // the net node delta is zero. + priorFileNodes := len(w.indexer.graph.GetFileNodes(relPath)) if err := w.indexer.IndexFile(path); err != nil { w.logger.Warn("reindex file failed", zap.String("path", path), zap.Error(err)) return } - nodesAdded = w.indexer.graph.NodeCount() - (nodesBefore - nr) - edgesAdded = w.indexer.graph.EdgeCount() - (edgesBefore - er) + nodesRemoved = priorFileNodes + nodesAdded = len(w.indexer.graph.GetFileNodes(relPath)) + // Edge churn as the net graph-wide delta — per-file edge counting + // would need a subgraph walk, which this watch-patch telemetry + // doesn't need. + if edgesAfter := w.indexer.graph.EdgeCount(); edgesAfter >= edgesBefore { + edgesAdded = edgesAfter - edgesBefore + } else { + edgesRemoved = edgesBefore - edgesAfter + } // Notify callback with old and new symbols. w.symbolChangeCbMu.RLock() diff --git a/internal/indexer/workspace_resolve.go b/internal/indexer/workspace_resolve.go index 92efae61..d635e8ca 100644 --- a/internal/indexer/workspace_resolve.go +++ b/internal/indexer/workspace_resolve.go @@ -15,7 +15,7 @@ import ( // Resolution order (highest priority first): // // 1. RepoEntry.Workspace — user-level override in -// `~/.config/gortex/config.yaml`. Lets users pin OSS / read-only +// `~/.gortex/config.yaml`. 
Lets users pin OSS / read-only // repos to a workspace without leaving a `.gortex.yaml` artifact // in the repo itself, and lets users override a workspace the // repo author chose (the OSS author's slug shouldn't pollute the @@ -278,6 +278,7 @@ func (mi *MultiIndexer) RunGlobalResolve() { return } cr := resolver.NewCrossRepo(mi.graph) + cr.SetLogger(mi.logger) cr.SetCrossWorkspaceDepLookup(mi.crossWorkspaceLookup()) cr.SetNpmAliasResolver(mi.npmAliasResolver()) cr.SetWorkspaceMembership(mi.workspaceMembershipResolver()) diff --git a/internal/indexer/zzbench_backends_test.go b/internal/indexer/zzbench_backends_test.go new file mode 100644 index 00000000..b34f1811 --- /dev/null +++ b/internal/indexer/zzbench_backends_test.go @@ -0,0 +1,210 @@ +package indexer_test + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "sort" + "strconv" + "strings" + "testing" + "time" + + "go.uber.org/zap" + + "github.com/zzet/gortex/internal/config" + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/graph/store_sqlite" + "github.com/zzet/gortex/internal/indexer" + "github.com/zzet/gortex/internal/parser" + "github.com/zzet/gortex/internal/parser/languages" +) + +// TestBackendBench cold-indexes GORTEX_BENCH_ROOT through the full indexer +// pipeline into the backend named by GORTEX_BENCH_BACKEND (memory | sqlite), +// then runs a fixed query workload. Reports cold-index time, graph size, +// process RSS, and query throughput so the sqlite backend can be compared +// head-to-head with the in-memory baseline on real repositories. 
+// +// GORTEX_BENCH_ROOT=/Users/zzet/code/my/gortex/gortex \ +// GORTEX_BENCH_BACKEND=sqlite \ +// go test ./internal/indexer/ -run TestBackendBench -timeout 40m -v +func TestBackendBench(t *testing.T) { + root := os.Getenv("GORTEX_BENCH_ROOT") + if root == "" { + t.Skip("bench harness; set GORTEX_BENCH_ROOT= and GORTEX_BENCH_BACKEND=memory|sqlite") + } + if _, err := os.Stat(root); err != nil { + t.Skipf("bench root not available: %v", err) + } + backendName := os.Getenv("GORTEX_BENCH_BACKEND") + if backendName == "" { + backendName = "memory" + } + + store, cleanup := openBenchStore(t, backendName) + defer cleanup() + + reg := parser.NewRegistry() + languages.RegisterAll(reg) + workers := runtime.NumCPU() + idx := indexer.New(store, reg, config.IndexConfig{Workers: workers}, zap.NewNop()) + + var m0 runtime.MemStats + runtime.ReadMemStats(&m0) + + start := time.Now() + res, err := idx.IndexCtx(context.Background(), root) + indexDur := time.Since(start) + if err != nil { + t.Fatalf("index: %v", err) + } + rssAfterIndex := processRSSMB() + var m1 runtime.MemStats + runtime.ReadMemStats(&m1) + fmt.Fprintf(os.Stderr, ">>> %s INDEX DONE in %s (files=%d nodes=%d edges=%d) — querying\n", + backendName, indexDur.Round(time.Millisecond), res.FileCount, res.NodeCount, res.EdgeCount) + + qStart := time.Now() + q := runQueryWorkload(store) + fmt.Fprintf(os.Stderr, ">>> %s QUERY WORKLOAD DONE in %s\n", backendName, time.Since(qStart).Round(time.Millisecond)) + + mb := func(b uint64) float64 { return float64(b) / (1024 * 1024) } + t.Logf("================ BACKEND BENCH ================") + t.Logf("backend=%s root=%s workers=%d", backendName, root, workers) + t.Logf("cold index : %s files=%d nodes=%d edges=%d errors=%d", + indexDur.Round(time.Millisecond), res.FileCount, res.NodeCount, res.EdgeCount, len(res.Errors)) + if indexDur.Seconds() > 0 { + t.Logf("throughput : %.0f files/s %.0f nodes/s", + float64(res.FileCount)/indexDur.Seconds(), 
float64(res.NodeCount)/indexDur.Seconds()) + } + t.Logf("memory : processRSS=%.0fMB goHeapAlloc=%.0fMB goTotalAlloc=%.0fMB", + rssAfterIndex, mb(m1.HeapAlloc), mb(m1.TotalAlloc-m0.TotalAlloc)) + t.Logf("queries : %s", q) + t.Logf("==============================================") + runtime.KeepAlive(store) +} + +func openBenchStore(t *testing.T, name string) (graph.Store, func()) { + t.Helper() + switch strings.ToLower(name) { + case "", "memory", "mem": + return graph.New(), func() {} + case "sqlite", "sqlite3": + s, err := store_sqlite.Open(filepath.Join(t.TempDir(), "bench.sqlite")) + if err != nil { + t.Fatalf("open sqlite: %v", err) + } + return s, func() { _ = s.Close() } + default: + t.Fatalf("unknown GORTEX_BENCH_BACKEND %q (memory|sqlite)", name) + return nil, func() {} + } +} + +// runQueryWorkload times a fixed, deterministic read mix against the freshly +// indexed store: point lookups + adjacency over a node sample, exact-name +// lookups, substring search, Stats, and a full AllEdges scan. +func runQueryWorkload(store graph.Store) string { + nodes := store.AllNodes() + sort.Slice(nodes, func(i, j int) bool { return nodes[i].ID < nodes[j].ID }) + sample := sampleNodes(nodes, 2000) + + ptStart := time.Now() + ptOps := 0 + for _, n := range sample { + store.GetNode(n.ID) + store.GetOutEdges(n.ID) + store.GetInEdges(n.ID) + ptOps += 3 + } + ptDur := time.Since(ptStart) + + // Query DISTINCT names once each — real lookup traffic asks for a name + // once, not N times. (A naive per-sample loop re-queries hyper-common + // names like markdown "json code block", which match ~25k rows, hundreds + // of times and measures result-set serialization, not lookup latency.) 
+ seenName := make(map[string]struct{}, len(sample)) + var names []string + for _, n := range sample { + if n.Name == "" { + continue + } + if _, ok := seenName[n.Name]; ok { + continue + } + seenName[n.Name] = struct{}{} + names = append(names, n.Name) + } + nameStart := time.Now() + nameRows := 0 + for _, nm := range names { + nameRows += len(store.FindNodesByName(nm)) + } + nameDur := time.Since(nameStart) + nameOps := len(names) + + subStart := time.Now() + for _, frag := range []string{"Index", "resolve", "Store", "config", "handler"} { + store.FindNodesByNameContaining(frag, 50) + } + subDur := time.Since(subStart) + + statsStart := time.Now() + st := store.Stats() + statsDur := time.Since(statsStart) + + allStart := time.Now() + allEdges := store.AllEdges() + allDur := time.Since(allStart) + + opsPerSec := func(ops int, d time.Duration) float64 { + if d <= 0 { + return 0 + } + return float64(ops) / d.Seconds() + } + return fmt.Sprintf( + "sample=%d | point %d ops %s (%.0f op/s) | name %d distinct %s (%.0f op/s, %d rows) | substr 5q %s | Stats(%dn/%de) %s | AllEdges %d %s", + len(sample), + ptOps, ptDur.Round(time.Millisecond), opsPerSec(ptOps, ptDur), + nameOps, nameDur.Round(time.Millisecond), opsPerSec(nameOps, nameDur), nameRows, + subDur.Round(time.Millisecond), + st.TotalNodes, st.TotalEdges, statsDur.Round(time.Millisecond), + len(allEdges), allDur.Round(time.Millisecond), + ) +} + +func sampleNodes(nodes []*graph.Node, n int) []*graph.Node { + if len(nodes) <= n { + return nodes + } + step := len(nodes) / n + out := make([]*graph.Node, 0, n) + for i := 0; i < len(nodes) && len(out) < n; i += step { + out = append(out, nodes[i]) + } + return out +} + +// processRSSMB returns the current process RSS in MiB (reads /proc on Linux, +// falls back to `ps` on macOS). 
+func processRSSMB() float64 { + if b, err := os.ReadFile("/proc/self/statm"); err == nil { + if f := strings.Fields(string(b)); len(f) >= 2 { + if pages, err := strconv.ParseInt(f[1], 10, 64); err == nil { + return float64(pages*int64(os.Getpagesize())) / (1024 * 1024) + } + } + } + out, err := exec.Command("ps", "-o", "rss=", "-p", strconv.Itoa(os.Getpid())).Output() + if err == nil { + if kb, err := strconv.ParseInt(strings.TrimSpace(string(out)), 10, 64); err == nil { + return float64(kb) / 1024 + } + } + return 0 +} diff --git a/internal/mcp/combo_apply.go b/internal/mcp/combo_apply.go index c90cdf32..3dccc3e1 100644 --- a/internal/mcp/combo_apply.go +++ b/internal/mcp/combo_apply.go @@ -1,16 +1,18 @@ package mcp import ( + "time" + "github.com/zzet/gortex/internal/graph" "github.com/zzet/gortex/internal/search/rerank" ) -// applyRerankBoosts is the I13 entry point that runs the full -// 11-signal rerank.Pipeline over the candidate set with the -// session-aware Context wired in (locality, combo, frecency, -// feedback, churn, community). The structural signals (BM25 rank, -// fan-in / fan-out, MinHash similarity, signature match, recency) -// are computed off the graph + the candidate's current index. +// applyRerankBoostsTimed is the I13 entry point that runs the full +// 11-signal rerank.Pipeline over the candidate set with the session- +// aware Context wired in (locality, combo, frecency, feedback, churn, +// community). Structural signals (BM25 rank, fan-in / fan-out, +// MinHash similarity, signature match, recency) are computed off the +// graph + the candidate's current index. // // rerankCtx is the per-request Context built by the server; pass nil // and the pipeline falls back to a structural-only rerank using just @@ -18,13 +20,19 @@ import ( // candidate slice — when non-nil it carries per-signal contributions // out to the caller for debug / winnow surfacing; pass nil if the // caller only wants the sorted nodes. 
-func applyRerankBoosts(s *Server, nodes []*graph.Node, query string, rerankCtx *rerank.Context, lastResults *[]*rerank.Candidate) []*graph.Node { +// +// Returns the rerank's prepare and signals phase durations separately +// so the search_symbols handler's per-phase Debug log can attribute +// time honestly between the batched edge fetch (prepare) and the +// in-process scoring loop (signals). Zero durations when there's no +// work to do. +func applyRerankBoostsTimed(s *Server, nodes []*graph.Node, query string, rerankCtx *rerank.Context, lastResults *[]*rerank.Candidate) (result []*graph.Node, prepare time.Duration, signals time.Duration) { if len(nodes) < 2 || s == nil || s.engine == nil { - return nodes + return nodes, 0, 0 } pipeline := s.engine.Rerank() if pipeline == nil { - return nodes + return nodes, 0, 0 } cands := make([]*rerank.Candidate, 0, len(nodes)) for i, n := range nodes { @@ -38,15 +46,27 @@ func applyRerankBoosts(s *Server, nodes []*graph.Node, query string, rerankCtx * if rerankCtx.Graph == nil { rerankCtx.Graph = s.graph } + + // Phase 1: prepare — the batched in/out edge fetch + scratch fields. + // Exposed via the explicit Prepare call; Pipeline.Rerank detects the + // already-prepared slice and skips the duplicate work. + prepStart := time.Now() + rerankCtx.Prepare(cands) + prepare = time.Since(prepStart) + + // Phase 2: signals — the in-process scoring loop + final sort. 
+ sigStart := time.Now() pipeline.Rerank(query, cands, rerankCtx) - out := make([]*graph.Node, 0, len(cands)) + signals = time.Since(sigStart) + + result = make([]*graph.Node, 0, len(cands)) for _, c := range cands { - out = append(out, c.Node) + result = append(result, c.Node) } if lastResults != nil { *lastResults = cands } - return out + return result, prepare, signals } // recordLastSearchFromNodes stores the query + top-limit IDs on the session diff --git a/internal/mcp/etag.go b/internal/mcp/etag.go index 055609bb..ab5d95e6 100644 --- a/internal/mcp/etag.go +++ b/internal/mcp/etag.go @@ -2,23 +2,75 @@ package mcp import ( "crypto/sha256" + "encoding/binary" "encoding/hex" "encoding/json" "sort" "strconv" "github.com/mark3labs/mcp-go/mcp" + "github.com/zzet/gortex/internal/query" ) // computeETag produces a short content hash suitable for conditional fetch. -// The hash is computed from the JSON serialization of the data. +// Streams the JSON serialization straight into the hash so we don't +// allocate the full marshaled byte slice (significant on large +// payloads — a 500-symbol SubGraph used to allocate ~100 KiB just to +// feed sha256). func computeETag(data any) string { - b, err := json.Marshal(data) - if err != nil { + h := sha256.New() + if err := json.NewEncoder(h).Encode(data); err != nil { return "" } - h := sha256.Sum256(b) - return hex.EncodeToString(h[:8]) // 16 hex chars — collision-safe for session use + sum := h.Sum(nil) + return hex.EncodeToString(sum[:8]) // 16 hex chars — collision-safe for session use +} + +// etagSubGraph is a fast structural ETag specialised for query.SubGraph +// payloads (the get_file_summary / get_editing_context hot path). +// Instead of going through json.Marshal on every node + edge + Meta map +// (which is the dominant cost for a 500-symbol file), it hashes a +// stable structural fingerprint: each node's id + line range, each +// edge's (from, to, kind), and the truncation / total counts. 
That +// keeps the invariant the callers depend on — "the etag changes when +// the file's listing changes" — without paying for the body of every +// Meta map on every call. +func etagSubGraph(sg *query.SubGraph) string { + if sg == nil { + return "" + } + h := sha256.New() + var buf [16]byte + for _, n := range sg.Nodes { + if n == nil { + continue + } + h.Write([]byte(n.ID)) + binary.BigEndian.PutUint32(buf[0:4], uint32(n.StartLine)) + binary.BigEndian.PutUint32(buf[4:8], uint32(n.EndLine)) + h.Write(buf[:8]) + h.Write([]byte{0}) + } + h.Write([]byte{1}) + for _, e := range sg.Edges { + if e == nil { + continue + } + h.Write([]byte(e.From)) + h.Write([]byte{31}) + h.Write([]byte(e.To)) + h.Write([]byte{31}) + h.Write([]byte(e.Kind)) + h.Write([]byte{0}) + } + binary.BigEndian.PutUint64(buf[0:8], uint64(sg.TotalNodes)) + binary.BigEndian.PutUint64(buf[8:16], uint64(sg.TotalEdges)) + h.Write(buf[:16]) + if sg.Truncated { + h.Write([]byte{1}) + } + sum := h.Sum(nil) + return hex.EncodeToString(sum[:8]) } // notModifiedResult returns a minimal "not modified" response with the matching etag. diff --git a/internal/mcp/gcx.go b/internal/mcp/gcx.go index c7f96aed..3c7db20a 100644 --- a/internal/mcp/gcx.go +++ b/internal/mcp/gcx.go @@ -497,13 +497,20 @@ func encodeSubGraph(tool string, sg *query.SubGraph) ([]byte, error) { } // encodeFileSummary emits one row per symbol in a file plus a trailing -// edge-distribution comment. +// edge-distribution comment. Pulls the edge total from sg.TotalEdges +// rather than len(sg.Edges) so the count-only handler path (which +// leaves the Edge slice nil to avoid materialising every adjacent +// edge over cgo) still reports the right number. 
func encodeFileSummary(sg *query.SubGraph, etag string) ([]byte, error) { var buf bytes.Buffer + totalEdges := sg.TotalEdges + if totalEdges == 0 { + totalEdges = len(sg.Edges) + } enc := newGCX(&buf, "get_file_summary", []string{"id", "kind", "name", "line", "sig"}, "total_nodes", fmt.Sprintf("%d", sg.TotalNodes), - "total_edges", fmt.Sprintf("%d", len(sg.Edges)), + "total_edges", fmt.Sprintf("%d", totalEdges), "truncated", boolString(sg.Truncated), "etag", etag, ) diff --git a/internal/mcp/memories.go b/internal/mcp/memories.go index 55667734..a5039e88 100644 --- a/internal/mcp/memories.go +++ b/internal/mcp/memories.go @@ -14,6 +14,11 @@ import ( "github.com/zzet/gortex/internal/persistence" ) +// maxMemoriesCap is the soft ceiling on stored memories per repo +// scope. Trimming honours pinned + high-importance memories. Matches +// the prior gob.gz cap. +const maxMemoriesCap = 10000 + // memoryManager owns the cross-session development-memory store. // It mirrors notesManager structurally (same persistence + filter // shape) but its entries have no SessionID — every memory is @@ -21,28 +26,44 @@ import ( // compounds the longer a team uses Gortex. // // Memories live alongside the graph as a separate, persistent -// side-store written into the same per-repo cache directory as -// notes / feedback / combo / frecency. Empty dir yields an -// in-memory-only manager (test fixtures, single-shot CLI calls). +// side-store backed by the SQLite sidecar DB. The in-memory slice + +// scorers are unchanged from the gob.gz era; only the persistence +// layer changed. A nil sidecar yields an in-memory-only manager +// (test fixtures, single-shot CLI calls). type memoryManager struct { - mu sync.Mutex - store persistence.MemoryStore - dir string + mu sync.Mutex + store persistence.MemoryStore + sidecar *persistence.SidecarStore + repoKey string } -// newMemoryManager constructs a manager, lazily loading any -// existing memories from disk. 
Empty cacheDir/repoPath yields a -// no-disk manager. +// newMemoryManager constructs a manager, lazily loading any existing +// memories from the sidecar. Empty cacheDir/repoPath yields a no-disk +// manager. The sidecar lives at <cacheDir>/sidecar.sqlite; any legacy +// memories.gob.gz under the per-repo cache subdir is imported once, +// then renamed to *.bak. func newMemoryManager(cacheDir, repoPath string) *memoryManager { if cacheDir == "" || repoPath == "" { return &memoryManager{} } - dir := persistence.MemoriesDir(cacheDir, repoPath) - mm := &memoryManager{dir: dir} + sidecar, err := persistence.OpenSidecar(persistence.DefaultSidecarPath(cacheDir)) + if err != nil || sidecar == nil { + return &memoryManager{} + } + return newMemoryManagerFromSidecar(sidecar, persistence.RepoCacheKey(repoPath), persistence.MemoriesDir(cacheDir, repoPath)) +} - loaded, err := persistence.LoadMemories(dir) - if err == nil && loaded != nil { - mm.store = *loaded +// newMemoryManagerFromSidecar builds a memory manager bound to an +// already-open sidecar + repo key, importing legacyDir/memories.gob.gz +// once. Used by the daemon path where the sidecar is opened once and +// shared across managers.
+func newMemoryManagerFromSidecar(sidecar *persistence.SidecarStore, repoKey, legacyDir string) *memoryManager { + mm := &memoryManager{sidecar: sidecar, repoKey: repoKey} + if sidecar != nil { + _ = sidecar.MigrateLegacyMemories(repoKey, legacyDir) + if rows, err := sidecar.LoadMemoriesRows(repoKey); err == nil { + mm.store.Entries = rows + } } return mm } @@ -120,9 +141,10 @@ func (mm *memoryManager) Save(entry persistence.MemoryEntry) (string, error) { } mm.store.Entries = append(mm.store.Entries, entry) - if err := mm.flushLocked(); err != nil { + if err := mm.persistLocked(entry); err != nil { return entry.ID, err } + mm.trimLocked() return entry.ID, nil } @@ -175,7 +197,7 @@ func (mm *memoryManager) Update(id string, patch MemoryPatch) (persistence.Memor } e.UpdatedAt = time.Now().UTC() mm.store.Entries[idx] = e - if err := mm.flushLocked(); err != nil { + if err := mm.persistLocked(e); err != nil { return e, err } return e, nil @@ -191,7 +213,10 @@ func (mm *memoryManager) Delete(id string) error { return nil } mm.store.Entries = append(mm.store.Entries[:idx], mm.store.Entries[idx+1:]...) - return mm.flushLocked() + if mm.sidecar == nil { + return nil + } + return mm.sidecar.DeleteMemory(mm.repoKey, id) } // Get returns a single memory by ID, or (zero, false) when not found. @@ -215,7 +240,6 @@ func (mm *memoryManager) MarkAccessed(ids []string) { mm.mu.Lock() defer mm.mu.Unlock() now := time.Now().UTC() - touched := false for _, id := range ids { idx := mm.findLocked(id) if idx < 0 { @@ -223,10 +247,7 @@ func (mm *memoryManager) MarkAccessed(ids []string) { } mm.store.Entries[idx].AccessCount++ mm.store.Entries[idx].LastAccessed = now - touched = true - } - if touched { - _ = mm.flushLocked() + _ = mm.persistLocked(mm.store.Entries[idx]) } } @@ -324,11 +345,28 @@ func (mm *memoryManager) findLocked(id string) int { return -1 } -func (mm *memoryManager) flushLocked() error { - if mm.dir == "" { +// persistLocked writes a single memory row to the sidecar. 
No-op for +// an in-memory-only manager. Callers hold mm.mu. +func (mm *memoryManager) persistLocked(e persistence.MemoryEntry) error { + if mm.sidecar == nil { return nil } - return persistence.SaveMemories(mm.dir, &mm.store) + return mm.sidecar.UpsertMemory(mm.repoKey, e) +} + +// trimLocked enforces the soft cap (maxMemoriesCap) via the two-pass +// bounded DELETE on the sidecar, then reconciles the in-memory slice. +// No-op when under cap or in-memory-only. Callers hold mm.mu. +func (mm *memoryManager) trimLocked() { + if mm.sidecar == nil || len(mm.store.Entries) <= maxMemoriesCap { + return + } + if err := mm.sidecar.TrimMemories(mm.repoKey, maxMemoriesCap); err != nil { + return + } + if rows, err := mm.sidecar.LoadMemoriesRows(mm.repoKey); err == nil { + mm.store.Entries = rows + } } // --------------------------------------------------------------------------- diff --git a/internal/mcp/notebook.go b/internal/mcp/notebook.go index d30aa8cc..6f311085 100644 --- a/internal/mcp/notebook.go +++ b/internal/mcp/notebook.go @@ -5,13 +5,14 @@ import ( "encoding/hex" "errors" "fmt" - "os" "path/filepath" "regexp" "sort" "strings" "sync" "time" + + "github.com/zzet/gortex/internal/persistence" ) // notebookEntry is a single repository-local persistent notebook @@ -33,44 +34,113 @@ type notebookEntry struct { Body string } -// notebookManager owns the on-disk notebook store. The directory is -// the repo's .gortex/notebook/ tree; an empty dir yields a no-op -// manager so test fixtures and single-shot CLI calls don't fail. +// notebookManager owns the repository notebook store, now backed by +// the SQLite sidecar DB (the sidecar lives at <repoPath>/.gortex/ +// sidecar.sqlite, co-located with the repo as the markdown layout was). +// A nil sidecar yields a no-op manager so test fixtures and +// single-shot CLI calls don't fail. The notebookEntry shape is +// unchanged; only the persistence layer moved from per-entry markdown +// files to sqlite rows.
type notebookManager struct { - mu sync.Mutex - dir string + mu sync.Mutex + sidecar *persistence.SidecarStore + repoKey string + // legacyDir is the historical /.gortex/notebook/ markdown + // directory, kept so the one-shot migration can find + rename old + // .md files. Empty when uninitialised. + legacyDir string // ttl applies to LastUsed when set: entries unused for longer // than ttl are pruned at save time. 0 disables pruning. ttl time.Duration } -// newNotebookManager returns a manager rooted at /.gortex/ -// notebook/. Empty repoPath yields a no-disk manager (the methods -// are still safe to call, they just no-op the persistence). +// newNotebookManager returns a manager whose sidecar DB lives at +// /.gortex/sidecar.sqlite. Empty repoPath yields a no-disk +// manager (the methods are still safe to call, they just no-op the +// persistence and Save returns an honest "not initialised" error). Any +// legacy /.gortex/notebook/.md files are imported once, +// then renamed to .md.bak. func newNotebookManager(repoPath string) *notebookManager { if repoPath == "" { return &notebookManager{} } + gortexDir := filepath.Join(repoPath, ".gortex") + sidecar, err := persistence.OpenSidecar(persistence.DefaultSidecarPath(gortexDir)) + if err != nil || sidecar == nil { + return &notebookManager{} + } + repoKey := persistence.RepoCacheKey(repoPath) + legacyDir := filepath.Join(gortexDir, "notebook") + _ = sidecar.MigrateLegacyNotebook(repoKey, legacyDir, importLegacyNotebookMD) return &notebookManager{ - dir: filepath.Join(repoPath, ".gortex", "notebook"), - ttl: 30 * 24 * time.Hour, + sidecar: sidecar, + repoKey: repoKey, + legacyDir: legacyDir, + ttl: 30 * 24 * time.Hour, + } +} + +// importLegacyNotebookMD parses a markdown notebook file's contents +// into a sidecar NotebookRow for the one-shot migration. 
+func importLegacyNotebookMD(id, contents string) (persistence.NotebookRow, bool) { + e, err := notebookUnmarshal(contents) + if err != nil { + return persistence.NotebookRow{}, false + } + return persistence.NotebookRow{ + ID: id, + Title: e.Title, + Body: e.Body, + Tags: e.Tags, + UsedCount: e.UsedCount, + LastUsed: e.LastUsed, + Created: e.Created, + Updated: e.Updated, + }, true +} + +// rowToEntry / entryToRow convert between the public notebookEntry and +// the sidecar NotebookRow. +func rowToEntry(r persistence.NotebookRow) notebookEntry { + return notebookEntry{ + ID: r.ID, + Title: r.Title, + Tags: r.Tags, + Created: r.Created, + Updated: r.Updated, + LastUsed: r.LastUsed, + UsedCount: r.UsedCount, + Body: r.Body, + } +} + +func entryToRow(e notebookEntry) persistence.NotebookRow { + return persistence.NotebookRow{ + ID: e.ID, + Title: e.Title, + Body: e.Body, + Tags: e.Tags, + UsedCount: e.UsedCount, + LastUsed: e.LastUsed, + Created: e.Created, + Updated: e.Updated, } } // Save persists a notebook entry. Generates an ID when missing. -// Returns the entry as it landed on disk (id + timestamps set). +// Returns the entry as it landed in the sidecar (id + timestamps set). // -// Errors when the manager has no backing directory — the daemon's -// multi-repo path historically called InitNotebook("") which left -// nm.dir empty, and the old behaviour was to *silently succeed*: the -// caller got an ID and timestamps back but no entry ever landed on -// disk, so notebook_list / notebook_find / notebook_show / notebook_used -// all returned empty afterwards. Honest failure beats phantom success. 
+// Errors when the manager has no backing sidecar — the daemon's +// multi-repo path historically called InitNotebook("") which left the +// manager empty, and the old behaviour was to *silently succeed*: the +// caller got an ID and timestamps back but no entry ever persisted, so +// notebook_list / notebook_find / notebook_show / notebook_used all +// returned empty afterwards. Honest failure beats phantom success. func (nm *notebookManager) Save(entry notebookEntry) (notebookEntry, error) { nm.mu.Lock() defer nm.mu.Unlock() - if nm.dir == "" { + if nm.sidecar == nil { return notebookEntry{}, errors.New("notebook is not initialised") } @@ -83,10 +153,7 @@ func (nm *notebookManager) Save(entry notebookEntry) (notebookEntry, error) { } entry.Updated = now - if err := os.MkdirAll(nm.dir, 0o755); err != nil { - return entry, fmt.Errorf("mkdir notebook: %w", err) - } - if err := os.WriteFile(nm.entryPath(entry.ID), []byte(notebookMarshal(entry)), 0o644); err != nil { + if err := nm.sidecar.UpsertNotebook(nm.repoKey, entryToRow(entry)); err != nil { return entry, fmt.Errorf("write notebook: %w", err) } // Best-effort TTL prune. Failures don't fail the save — the @@ -96,43 +163,34 @@ func (nm *notebookManager) Save(entry notebookEntry) (notebookEntry, error) { } // Get loads a single entry by id. Returns (entry, true) on hit, -// (zero, false) when the file is missing. +// (zero, false) when the entry is missing. func (nm *notebookManager) Get(id string) (notebookEntry, bool) { nm.mu.Lock() defer nm.mu.Unlock() - if nm.dir == "" { + if nm.sidecar == nil { return notebookEntry{}, false } - body, err := os.ReadFile(nm.entryPath(id)) - if err != nil { - return notebookEntry{}, false - } - entry, err := notebookUnmarshal(string(body)) - if err != nil { + row, ok := nm.sidecar.GetNotebookRow(nm.repoKey, id) + if !ok { return notebookEntry{}, false } - entry.ID = id - return entry, true + return rowToEntry(row), true } -// Delete removes an entry from disk. 
Missing files are not errors — -// callers can use Delete unconditionally. +// Delete removes an entry. Missing entries are not errors — callers +// can use Delete unconditionally. func (nm *notebookManager) Delete(id string) error { nm.mu.Lock() defer nm.mu.Unlock() - if nm.dir == "" { + if nm.sidecar == nil { return nil } - err := os.Remove(nm.entryPath(id)) - if err != nil && !errors.Is(err, os.ErrNotExist) { - return err - } - return nil + return nm.sidecar.DeleteNotebook(nm.repoKey, id) } -// List returns every entry on disk sorted by Updated DESC. Cheap -// enough for typical notebook sizes (hundreds of entries); the cap -// at the call site keeps responses bounded. +// List returns every entry sorted by Updated DESC. Cheap enough for +// typical notebook sizes (hundreds of entries); the cap at the call +// site keeps responses bounded. func (nm *notebookManager) List() []notebookEntry { nm.mu.Lock() defer nm.mu.Unlock() @@ -140,28 +198,16 @@ func (nm *notebookManager) List() []notebookEntry { } func (nm *notebookManager) listLocked() []notebookEntry { - if nm.dir == "" { + if nm.sidecar == nil { return nil } - entries, err := os.ReadDir(nm.dir) + rows, err := nm.sidecar.LoadNotebookRows(nm.repoKey) if err != nil { return nil } - out := make([]notebookEntry, 0, len(entries)) - for _, de := range entries { - if de.IsDir() || !strings.HasSuffix(de.Name(), ".md") { - continue - } - body, err := os.ReadFile(filepath.Join(nm.dir, de.Name())) - if err != nil { - continue - } - e, err := notebookUnmarshal(string(body)) - if err != nil { - continue - } - e.ID = strings.TrimSuffix(de.Name(), ".md") - out = append(out, e) + out := make([]notebookEntry, 0, len(rows)) + for _, r := range rows { + out = append(out, rowToEntry(r)) } sort.Slice(out, func(i, j int) bool { return out[i].Updated.After(out[j].Updated) @@ -204,52 +250,35 @@ func (nm *notebookManager) Find(query string) []notebookEntry { func (nm *notebookManager) MarkUsed(id string) (notebookEntry, error) { 
nm.mu.Lock() defer nm.mu.Unlock() - if nm.dir == "" { + if nm.sidecar == nil { return notebookEntry{}, fmt.Errorf("notebook is not initialised") } - body, err := os.ReadFile(nm.entryPath(id)) - if err != nil { - return notebookEntry{}, err + row, ok := nm.sidecar.GetNotebookRow(nm.repoKey, id) + if !ok { + return notebookEntry{}, fmt.Errorf("notebook entry %q not found", id) } - entry, err := notebookUnmarshal(string(body)) - if err != nil { - return notebookEntry{}, err - } - entry.ID = id + entry := rowToEntry(row) entry.UsedCount++ entry.LastUsed = time.Now().UTC() - if err := os.WriteFile(nm.entryPath(id), []byte(notebookMarshal(entry)), 0o644); err != nil { + if err := nm.sidecar.UpsertNotebook(nm.repoKey, entryToRow(entry)); err != nil { return notebookEntry{}, err } return entry, nil } // pruneLocked removes entries whose LastUsed (or Updated, when never -// used) is older than the TTL. Best-effort — silent on individual -// errors so a permission glitch on one file doesn't poison the -// rest of the call. +// used) is older than the TTL via a bounded DELETE on the sidecar. +// Best-effort — a failure is silent so the next call retries. func (nm *notebookManager) pruneLocked() { - if nm.dir == "" || nm.ttl <= 0 { + if nm.sidecar == nil || nm.ttl <= 0 { return } cutoff := time.Now().UTC().Add(-nm.ttl) - for _, e := range nm.listLocked() { - ref := e.LastUsed - if ref.IsZero() { - ref = e.Updated - } - if ref.Before(cutoff) { - _ = os.Remove(nm.entryPath(e.ID)) - } - } -} - -func (nm *notebookManager) entryPath(id string) string { - return filepath.Join(nm.dir, id+".md") + _ = nm.sidecar.NotebookPrune(nm.repoKey, cutoff) } -// newNotebookID returns a short random hex string suitable for a -// file basename. 16 chars = 8 bytes = ample collision resistance +// newNotebookID returns a short random hex string suitable for an +// entry id. 16 chars = 8 bytes = ample collision resistance // for a per-repo notebook. 
func newNotebookID() string { var buf [8]byte diff --git a/internal/mcp/notes.go b/internal/mcp/notes.go index f2bb5db4..d0e4d1ef 100644 --- a/internal/mcp/notes.go +++ b/internal/mcp/notes.go @@ -15,33 +15,54 @@ import ( "github.com/zzet/gortex/internal/persistence" ) +// maxNotesCap is the soft ceiling on stored notes per repo scope. +// Trimming honours pinned notes: the oldest non-pinned notes are shed +// first. Matches the prior gob.gz cap. +const maxNotesCap = 5000 + // notesManager owns the session-memory side-store: thread-safe note -// CRUD with gob+gzip persistence. Mirrors the lifecycle of -// feedbackManager — one per server, init-once, cache-dir-or-noop. +// CRUD backed by the SQLite sidecar DB. Mirrors the lifecycle of +// feedbackManager — one per server, init-once, sidecar-or-noop. // -// Notes are written into the same per-repo cache directory as -// feedback / combo / frecency. When dir is empty the manager -// operates in-memory only (test fixtures, single-shot CLI calls). +// The in-memory slice + scorers are unchanged from the gob.gz era; +// only the persistence layer changed: rows load into the slice on +// construction and each mutation writes its row(s) to the sidecar. +// When sidecar is nil the manager operates in-memory only (test +// fixtures, single-shot CLI calls with no cache dir). type notesManager struct { - mu sync.Mutex - store persistence.NoteStore - dir string + mu sync.Mutex + store persistence.NoteStore + sidecar *persistence.SidecarStore + repoKey string } // newNotesManager constructs a manager, lazily loading any existing -// notes from disk. Empty cacheDir/repoPath yields a no-disk manager -// — useful for tests and for the daemon path that wires per-session -// state without a stable repo path. +// notes from the sidecar. Empty cacheDir/repoPath yields a no-disk +// manager. The sidecar lives at /sidecar.sqlite; any legacy +// notes.gob.gz under the per-repo cache subdir is imported once, then +// renamed to *.bak. 
func newNotesManager(cacheDir, repoPath string) *notesManager { if cacheDir == "" || repoPath == "" { return &notesManager{} } - dir := persistence.NotesDir(cacheDir, repoPath) - nm := &notesManager{dir: dir} + sidecar, err := persistence.OpenSidecar(persistence.DefaultSidecarPath(cacheDir)) + if err != nil || sidecar == nil { + return &notesManager{} + } + return newNotesManagerFromSidecar(sidecar, persistence.RepoCacheKey(repoPath), persistence.NotesDir(cacheDir, repoPath)) +} - loaded, err := persistence.LoadNotes(dir) - if err == nil && loaded != nil { - nm.store = *loaded +// newNotesManagerFromSidecar builds a notes manager bound to an +// already-open sidecar + repo key, importing legacyDir/notes.gob.gz +// once. Used by the daemon path where the sidecar is opened once and +// shared across managers. +func newNotesManagerFromSidecar(sidecar *persistence.SidecarStore, repoKey, legacyDir string) *notesManager { + nm := &notesManager{sidecar: sidecar, repoKey: repoKey} + if sidecar != nil { + _ = sidecar.MigrateLegacyNotes(repoKey, legacyDir) + if rows, err := sidecar.LoadNotesRows(repoKey); err == nil { + nm.store.Entries = rows + } } return nm } @@ -82,9 +103,10 @@ func (nm *notesManager) Save(entry persistence.NoteEntry) (string, error) { entry.Tags = dedupeStrings(normaliseTags(entry.Tags)) nm.store.Entries = append(nm.store.Entries, entry) - if err := nm.flushLocked(); err != nil { + if err := nm.persistLocked(entry); err != nil { return entry.ID, err } + nm.trimLocked() return entry.ID, nil } @@ -114,7 +136,7 @@ func (nm *notesManager) Update(id string, body *string, tags []string, pinned *b } e.UpdatedAt = time.Now().UTC() nm.store.Entries[idx] = e - if err := nm.flushLocked(); err != nil { + if err := nm.persistLocked(e); err != nil { return e, err } return e, nil @@ -131,7 +153,10 @@ func (nm *notesManager) Delete(id string) error { return nil } nm.store.Entries = append(nm.store.Entries[:idx], nm.store.Entries[idx+1:]...) 
- return nm.flushLocked() + if nm.sidecar == nil { + return nil + } + return nm.sidecar.DeleteNote(nm.repoKey, id) } // Get returns a single note by ID, or (zero, false) when not found. @@ -220,11 +245,28 @@ func (nm *notesManager) findLocked(id string) int { return -1 } -func (nm *notesManager) flushLocked() error { - if nm.dir == "" { +// persistLocked writes a single note row to the sidecar. No-op for an +// in-memory-only manager. Callers hold nm.mu. +func (nm *notesManager) persistLocked(e persistence.NoteEntry) error { + if nm.sidecar == nil { return nil } - return persistence.SaveNotes(nm.dir, &nm.store) + return nm.sidecar.UpsertNote(nm.repoKey, e) +} + +// trimLocked enforces the soft cap (maxNotesCap) via a bounded DELETE +// on the sidecar, then reconciles the in-memory slice so it stays in +// sync. No-op when under cap or in-memory-only. Callers hold nm.mu. +func (nm *notesManager) trimLocked() { + if nm.sidecar == nil || len(nm.store.Entries) <= maxNotesCap { + return + } + if err := nm.sidecar.TrimNotes(nm.repoKey, maxNotesCap); err != nil { + return + } + if rows, err := nm.sidecar.LoadNotesRows(nm.repoKey); err == nil { + nm.store.Entries = rows + } } // distillResult is the structured digest returned by DistillSession. @@ -602,7 +644,7 @@ func defaultAutoLinkOptions() autoLinkOptions { // // The function never panics — a nil graph or empty body just // returns no links. Results are deduplicated and capped. -func autoLinkBody(body string, g *graph.Graph, workspaceID string, opts autoLinkOptions) []string { +func autoLinkBody(body string, g graph.Store, workspaceID string, opts autoLinkOptions) []string { if g == nil || body == "" { return nil } @@ -628,9 +670,13 @@ func autoLinkBody(body string, g *graph.Graph, workspaceID string, opts autoLink } // (1) Direct ID matches — anything containing "::" is treated as - // a candidate ID. The regexp-free scan keeps this hot path cheap. 
- for _, candidate := range extractIDCandidates(body) { - node := g.GetNode(candidate) + // a candidate ID. Batch the lookup so even auto-linkers with many + // candidates on long notes only pay one backend round-trip on + // disk-backed stores. + candidates := extractIDCandidates(body) + candidateNodes := g.GetNodesByIDs(candidates) + for _, candidate := range candidates { + node := candidateNodes[candidate] if node == nil { continue } diff --git a/internal/mcp/overlay.go b/internal/mcp/overlay.go index db7c096b..fde934df 100644 --- a/internal/mcp/overlay.go +++ b/internal/mcp/overlay.go @@ -11,6 +11,7 @@ import ( "github.com/mark3labs/mcp-go/mcp" mcpserver "github.com/mark3labs/mcp-go/server" + "go.uber.org/zap" "github.com/zzet/gortex/internal/daemon" ) @@ -73,7 +74,28 @@ func (s *Server) wrapToolHandler(h mcpserver.ToolHandlerFunc) mcpserver.ToolHand // Prompt-injection screening sits closest to the handler so it // sees the real arguments and the real result (see sanitize.go). h = s.sanitizeToolHandler(h) - return func(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { + return func(ctx context.Context, req mcp.CallToolRequest) (res *mcp.CallToolResult, retErr error) { + // Last-resort panic firewall around EVERY tool handler. A Go + // panic in any handler (e.g. when the store surfaces a fatal + // engine error) would otherwise unwind past the mcp-go server + // loop and crash the whole daemon — dropping every session's + // MCP transport, not just the offending call. Convert it to a + // structured tool error so the panicking tool fails in + // isolation and the daemon survives. This supersedes the + // per-handler recover that get_file_summary carried; every + // tool now gets the same protection. 
+ defer func() { + if r := recover(); r != nil { + if s.logger != nil { + s.logger.Error("tool handler panic recovered", + zap.String("tool", req.Params.Name), + zap.Any("panic", r), + zap.Stack("stack")) + } + res = mcp.NewToolResultError(fmt.Sprintf("tool %q internal error: %v", req.Params.Name, r)) + retErr = nil + } + }() // Tolerate hallucinated / mistyped parameter names before the // handler reads arguments (e.g. "symbol" accepted as "id"). s.reconcileToolParams(&req) diff --git a/internal/mcp/overlay_view.go b/internal/mcp/overlay_view.go index 19402b8a..42f7da9e 100644 --- a/internal/mcp/overlay_view.go +++ b/internal/mcp/overlay_view.go @@ -447,7 +447,7 @@ func (s *Server) resolveOverlayEdges(layer *graph.OverlayLayer) { // in-place via AddEdge / removal pattern (layer is meant // to be append-only post-construction; the resolver pass runs // before the layer is handed to the View, so we still own it). - for from, edges := range layer.OutEdgesByFromAll() { + for _, edges := range layer.OutEdgesByFromAll() { for _, e := range edges { if !strings.HasPrefix(e.To, unresolvedPrefix) { continue @@ -464,13 +464,12 @@ func (s *Server) resolveOverlayEdges(layer *graph.OverlayLayer) { if target == "" { continue } - resolved := s.lookupOverlayTarget(layer, target, from) + resolved := s.lookupOverlayTarget(layer, target) if resolved == "" { continue } e.To = resolved } - _ = from } // Rebuild the layer's inEdges index now that targets may have // changed. The layer exposes a Rebuild helper so we don't have @@ -482,7 +481,7 @@ func (s *Server) resolveOverlayEdges(layer *graph.OverlayLayer) { // short name in (layer ∪ base). Returns the node ID on a unique // match, empty string otherwise. Tied matches return empty so the // edge stays as a placeholder rather than picking the wrong target. 
-func (s *Server) lookupOverlayTarget(layer *graph.OverlayLayer, name, _fromID string) string { +func (s *Server) lookupOverlayTarget(layer *graph.OverlayLayer, name string) string { overlay := layer.NodesByName(name) if len(overlay) == 1 { return overlay[0].ID diff --git a/internal/mcp/resources_analyzer.go b/internal/mcp/resources_analyzer.go index 89a12bd8..d6d189ee 100644 --- a/internal/mcp/resources_analyzer.go +++ b/internal/mcp/resources_analyzer.go @@ -113,7 +113,7 @@ func (s *Server) handleResourceReport(ctx context.Context, req mcp.ReadResourceR var hotspotCount int if len(scoped) >= 10 { - for _, h := range analysis.FindHotspots(s.graph, s.getCommunities(), 0) { + for _, h := range s.getHotspots() { if inScope == nil || inScope[h.ID] { hotspotCount++ } @@ -173,7 +173,7 @@ func (s *Server) handleResourceGodNodes(_ context.Context, req mcp.ReadResourceR }) } - entries := analysis.FindHotspots(s.graph, s.getCommunities(), 0) + entries := s.getHotspots() totalCount := len(entries) truncated := false if len(entries) > 20 { @@ -205,7 +205,7 @@ func (s *Server) handleResourceSurprises(_ context.Context, req mcp.ReadResource var topHubs []analysis.HotspotEntry if s.graph.NodeCount() >= 10 { - hot := analysis.FindHotspots(s.graph, communities, 0) + hot := s.getHotspots() // Top hubs == hotspots with at least one community crossing. for _, h := range hot { if h.CommunityCrossings > 0 { diff --git a/internal/mcp/scopes.go b/internal/mcp/scopes.go index eea28ebd..19bade2a 100644 --- a/internal/mcp/scopes.go +++ b/internal/mcp/scopes.go @@ -1,13 +1,13 @@ package mcp import ( - "encoding/json" "os" "path/filepath" "sort" "strings" "sync" + "github.com/zzet/gortex/internal/persistence" "github.com/zzet/gortex/internal/platform" ) @@ -27,21 +27,22 @@ type SavedScope struct { Paths []string `json:"paths,omitempty"` } -// scopeStore is a small JSON-file-backed registry of SavedScopes. It -// survives daemon restarts. All exported methods are safe for concurrent -// use. 
+// scopeStore is a small registry of SavedScopes backed by the SQLite +// sidecar DB. It survives daemon restarts. Scopes are global (not +// repo-scoped). The in-memory byName map mirrors the scopes table so +// reads stay lock-cheap; mutations write through to the sidecar. All +// exported methods are safe for concurrent use. type scopeStore struct { - mu sync.Mutex - path string - byName map[string]SavedScope + mu sync.Mutex + sidecar *persistence.SidecarStore + byName map[string]SavedScope } -// scopesFilePath returns the on-disk location of the saved-scope store, -// honouring GORTEX_SCOPES_PATH (used by tests) over the cache default. -// -// An absolute $XDG_CACHE_HOME wins; otherwise the store stays under -// os.UserCacheDir() — the historical location, kept so an existing -// scopes file is not orphaned. +// scopesFilePath returns the legacy on-disk location of the saved-scope +// store, honouring GORTEX_SCOPES_PATH (used by tests) over the cache +// default. The sidecar DB lives next to it (/sidecar.sqlite); a +// pre-existing scopes.json at this path is imported once, then renamed +// to scopes.json.bak. func scopesFilePath() string { if p := strings.TrimSpace(os.Getenv("GORTEX_SCOPES_PATH")); p != "" { return p @@ -49,46 +50,32 @@ func scopesFilePath() string { return filepath.Join(platform.OSCacheDir(), "scopes.json") } -// newScopeStore builds a store at path and loads any persisted scopes. -func newScopeStore(path string) *scopeStore { - st := &scopeStore{path: path, byName: map[string]SavedScope{}} - st.load() - return st +// newScopeStore builds a store whose sidecar DB lives next to the given +// legacy scopes.json path. Any scopes.json present is imported once, +// then the in-memory map is hydrated from the sidecar. A nil sidecar +// (open failure) yields an in-memory-only store. 
+func newScopeStore(legacyPath string) *scopeStore { + sidecarPath := persistence.DefaultSidecarPath(filepath.Dir(legacyPath)) + sidecar, _ := persistence.OpenSidecar(sidecarPath) + return newScopeStoreFromSidecar(sidecar, legacyPath) } -// load reads persisted scopes; a missing or unreadable file leaves the -// store empty. Called only from the constructor, so it takes no lock. -func (st *scopeStore) load() { - data, err := os.ReadFile(st.path) - if err != nil { - return - } - var scopes []SavedScope - if json.Unmarshal(data, &scopes) != nil { - return - } - for _, sc := range scopes { - if sc.Name != "" { - st.byName[sc.Name] = sc +// newScopeStoreFromSidecar builds a scope store bound to an already-open +// sidecar, importing legacyPath/scopes.json once. Used by the daemon +// path where the sidecar is opened once and shared. +func newScopeStoreFromSidecar(sidecar *persistence.SidecarStore, legacyPath string) *scopeStore { + st := &scopeStore{sidecar: sidecar, byName: map[string]SavedScope{}} + if sidecar != nil { + _ = sidecar.MigrateLegacyScopes(legacyPath) + if rows, err := sidecar.LoadScopes(); err == nil { + for _, r := range rows { + if r.Name != "" { + st.byName[r.Name] = SavedScope{Name: r.Name, Description: r.Description, Repos: r.Repos, Paths: r.Paths} + } + } } } -} - -// save persists the store. Callers hold st.mu. 
-func (st *scopeStore) save() error { - scopes := make([]SavedScope, 0, len(st.byName)) - for _, sc := range st.byName { - scopes = append(scopes, sc) - } - sort.Slice(scopes, func(i, j int) bool { return scopes[i].Name < scopes[j].Name }) - data, err := json.MarshalIndent(scopes, "", " ") - if err != nil { - return err - } - if err := os.MkdirAll(filepath.Dir(st.path), 0o755); err != nil { - return err - } - return os.WriteFile(st.path, data, 0o644) + return st } func (st *scopeStore) get(name string) (SavedScope, bool) { @@ -113,7 +100,12 @@ func (st *scopeStore) put(sc SavedScope) error { st.mu.Lock() defer st.mu.Unlock() st.byName[sc.Name] = sc - return st.save() + if st.sidecar == nil { + return nil + } + return st.sidecar.UpsertScope(persistence.ScopeRow{ + Name: sc.Name, Description: sc.Description, Repos: sc.Repos, Paths: sc.Paths, + }) } func (st *scopeStore) remove(name string) (bool, error) { @@ -123,7 +115,10 @@ func (st *scopeStore) remove(name string) (bool, error) { return false, nil } delete(st.byName, name) - return true, st.save() + if st.sidecar == nil { + return true, nil + } + return true, st.sidecar.DeleteScope(name) } // scopeStoreOrInit lazily constructs the per-server saved-scope store. diff --git a/internal/mcp/search_equivalence.go b/internal/mcp/search_equivalence.go index f7f97f80..2b367b2d 100644 --- a/internal/mcp/search_equivalence.go +++ b/internal/mcp/search_equivalence.go @@ -54,6 +54,22 @@ func (m expandMode) allowsEquivalenceExpansion() bool { return m == expandBoth || m == expandEquivalenceOnly } +// isIdentifierClass reports whether the query class is one of the +// identifier-shape classes (symbol / path / signature) — the classes +// where the rerank's classWeightTable already proves the semantic +// channel contributes near-zero useful signal (0.65 / 0.45 / 0.80 vs +// the baseline 1.00 for concept). 
The handler routes these queries +// through the identifier-shape fast path: expansion off, vector +// channel off, fetch slack tightened. +func isIdentifierClass(c rerank.QueryClass) bool { + switch c { + case rerank.QueryClassSymbol, rerank.QueryClassPath, rerank.QueryClassSignature: + return true + default: + return false + } +} + // expandEquivalenceClasses returns the deterministic expansion terms // for a query: for every query token, its curated-equivalence-table // siblings and its per-repo auto-mined concept siblings. The result diff --git a/internal/mcp/server.go b/internal/mcp/server.go index ee4ac548..c8713055 100644 --- a/internal/mcp/server.go +++ b/internal/mcp/server.go @@ -5,7 +5,6 @@ import ( "encoding/json" "math" "os" - "path/filepath" "sort" "strings" "sync" @@ -85,7 +84,7 @@ func (sh *symbolHistory) All() map[string][]SymbolModification { type Server struct { mcpServer *server.MCPServer engine *query.Engine - graph *graph.Graph + graph graph.Store indexer *indexer.Indexer watcher watcherHistory multiIndexer *indexer.MultiIndexer @@ -118,7 +117,22 @@ type Server struct { // of the whole graph. nil until the first clusters request; // guarded by analysisMu. leidenCache *analysis.LeidenPartitionCache - analysisMu sync.RWMutex + // communitiesToken snapshots the graph identity that backed + // s.communities — (NodeCount, EdgeCount, EdgeIdentityRevisions). + // handleAnalyzeClusters reads this before calling the incremental + // detector: if the token still matches the live graph, the cached + // communities are reused without scanning AllNodes / AllEdges to + // fingerprint packages. On a disk backend the fingerprint scan alone is + // ~140s; the cache check is three scalar reads. + communitiesToken communityCacheToken + // hotspots is the default-threshold (mean + 2*stddev) hotspot + // ranking. 
FindHotspots' inner ComputeBetweenness pass dominates + // the wall clock of get_repo_outline / get_architecture / + // gortex_wakeup / the analyze(hotspots) resource — caching it + // once per RunAnalysis turn turns repeat calls into a map lookup. + // Rebuilt each RunAnalysis pass; guarded by analysisMu. + hotspots []analysis.HotspotEntry + analysisMu sync.RWMutex // cochange caches the git-history co-change graph. cochangeByFile // maps a file path to its co-changing file paths and association @@ -446,10 +460,7 @@ type tokenStats struct { // returned and fullFile are token counts (cl100k_base via internal/tokens). func (ts *tokenStats) record(node *graph.Node, tool string, returned, fullFile int64) { ts.mu.Lock() - saved := fullFile - returned - if saved < 0 { - saved = 0 - } + saved := max(fullFile-returned, 0) ts.tokensSaved += saved ts.tokensReturned += returned ts.callCount++ @@ -724,7 +735,7 @@ const serverInstructions = `Gortex is a code-intelligence graph server — it in - Pass format:"gcx" to list-shaped tools for a compact, round-trippable wire format (~27% fewer tokens).` // NewServer creates an MCP server with all Gortex tools registered. 
-func NewServer(engine *query.Engine, g *graph.Graph, idx *indexer.Indexer, watcher *indexer.Watcher, logger *zap.Logger, guardRules []config.GuardRule, opts ...MultiRepoOptions) *Server { +func NewServer(engine *query.Engine, g graph.Store, idx *indexer.Indexer, watcher *indexer.Watcher, logger *zap.Logger, guardRules []config.GuardRule, opts ...MultiRepoOptions) *Server { s := &Server{ engine: engine, graph: g, @@ -836,6 +847,8 @@ func NewServer(engine *query.Engine, g *graph.Graph, idx *indexer.Indexer, watch s.registerGenerateSkillTool() s.registerInspectionsTools() s.registerChurnRateTool() + s.registerEnrichChurnTool() + s.registerEnrichReleasesTool() s.registerCoChangeTool() s.registerArtifactTools() s.registerCouplingMetricsTool() @@ -951,13 +964,13 @@ func (s *Server) InitNotebook(repoPath string) { func (s *Server) InitMemories(cacheDir, repoPath string) { s.memories = newMemoryManager(cacheDir, repoPath) - // Mount the user-level global store. Defaults to - // ~/.gortex/memories-cache; an absolute $XDG_DATA_HOME relocates it - // to /gortex/memories-cache. Failures (no $HOME, - // unreadable home) leave globalMemories nil; tools detect that and - // surface a clear error rather than silently dropping global writes. + // Mount the user-level global store. Defaults to ~/.gortex/memories; + // an absolute $XDG_DATA_HOME relocates it to + // /gortex/memories. Failures (no $HOME, unreadable + // home) leave globalMemories nil; tools detect that and surface a + // clear error rather than silently dropping global writes. 
if home, err := os.UserHomeDir(); err == nil && home != "" { - s.globalMemories = newMemoryManager(filepath.Join(platform.LegacyDataDir(), "memories-cache"), "global") + s.globalMemories = newMemoryManager(platform.MemoriesDir(), "global") } } @@ -1146,6 +1159,59 @@ func (s *Server) scopedNodes(ctx context.Context) []*graph.Node { return out } +// scopedNodesByKinds is the kind-pushdown sibling of scopedNodes for +// handlers that only need a specific kind set. When the backend +// implements graph.NodesByKindsScanner the kind predicate runs server- +// side (one kind-filtered scan over the node table) instead of +// the legacy AllNodes()-then-Go-side filter. The metadata analyzers +// (todos, stale_code, stale_flags, ownership, coverage_gaps, +// coverage_summary, cgo_users, wasm_users, orphan_tables, +// unreferenced_tables) each keep one or two kinds out of the whole +// node table; pushing that filter is the entire win. +// +// Workspace-bound sessions still narrow Go-side: the capability does +// not know about ScopeAllows, and adding workspace_id to every analyze +// query would tie the capability to the session-scope concept. The +// secondary filter is cheap because the kind pushdown already shrank +// the row count by 1-2 orders of magnitude. +// +// Empty kinds returns nil — defensive against caller bugs that would +// otherwise drop into the full-AllNodes fallback path. +func (s *Server) scopedNodesByKinds(ctx context.Context, kinds []graph.NodeKind) []*graph.Node { + if len(kinds) == 0 { + return nil + } + var nodes []*graph.Node + if scan, ok := s.graph.(graph.NodesByKindsScanner); ok { + nodes = scan.NodesByKinds(kinds) + } else { + // Fallback: same behaviour as scopedNodes, kind-filtered Go-side. 
+ all := s.graph.AllNodes() + allowed := make(map[graph.NodeKind]struct{}, len(kinds)) + for _, k := range kinds { + allowed[k] = struct{}{} + } + nodes = make([]*graph.Node, 0, len(all)) + for _, n := range all { + if _, ok := allowed[n.Kind]; ok { + nodes = append(nodes, n) + } + } + } + sessWS, _, bound := s.sessionScope(ctx) + if !bound { + return nodes + } + opts := query.QueryOptions{WorkspaceID: sessWS} + out := make([]*graph.Node, 0, len(nodes)) + for _, n := range nodes { + if opts.ScopeAllows(n) { + out = append(out, n) + } + } + return out +} + // scopedNodeSlice filters an existing node slice to the session's // workspace. Convenience for handlers that already hold a node list // (engine list methods that don't take QueryOptions). @@ -1395,6 +1461,25 @@ func (s *Server) ResolveToolScope(toolName string, repo any) (*ScopedRepos, *mcp return ResolveScopedRepos(scope, s.bind, repo) } +// communityCacheToken is the per-graph identity tuple +// handleAnalyzeClusters checks before re-running the incremental +// detector. EdgeIdentity moves on any structural mutation; NodeCount +// and EdgeCount cover pure additions / removals that leave the +// identity counter alone. A zero token is "never populated". 
+type communityCacheToken struct { + edgeIdentity int + nodeCount int + edgeCount int +} + +func (s *Server) currentCommunityToken() communityCacheToken { + return communityCacheToken{ + edgeIdentity: s.graph.EdgeIdentityRevisions(), + nodeCount: s.graph.NodeCount(), + edgeCount: s.graph.EdgeCount(), + } +} + // RunAnalysis performs community detection and process discovery on // the current graph, then pushes a `notifications/resources/updated` // for every bootstrap resource so subscribed clients can refresh @@ -1409,6 +1494,7 @@ func (s *Server) RunAnalysis() { communities, cache, _ := analysis.DetectCommunitiesLeidenIncremental(s.graph, s.leidenCache) s.communities = communities s.leidenCache = cache + s.communitiesToken = s.currentCommunityToken() s.processes = analysis.DiscoverProcesses(s.graph) s.pageRank = analysis.ComputePageRank(s.graph) // Auto-concept vocabulary: mine domain phrases from symbol names @@ -1418,6 +1504,10 @@ func (s *Server) RunAnalysis() { // HITS authority/hub scores -- fed into the search rerank as an // authority signal that complements raw fan-in. s.hits = analysis.ComputeHITS(s.graph) + // Default-threshold hotspot ranking — cached because FindHotspots + // triggers ComputeBetweenness which is the shared wall-clock + // floor for outline / architecture / wakeup / the resource view. + s.hotspots = analysis.FindHotspots(s.graph, communities, 0) s.analysisMu.Unlock() // Bootstrap-resource payloads (graph_stats, index_health, etc.) @@ -1444,11 +1534,59 @@ func (s *Server) getCommunities() *analysis.CommunityResult { // packages. The cache it returns is stored back under analysisMu so // the next clusters request can build on it. The accompanying stats // describe whether the fast path or a full recompute ran. 
+// +// Short-circuits when the cached communities are still valid for the +// live graph: the (NodeCount, EdgeCount, EdgeIdentityRevisions) token +// captured by the last detector run is compared against the current +// graph identity in three scalar reads. On a disk backend a match skips the +// AllNodes / AllEdges fingerprint scan that otherwise dominates the +// call (~140s on a fresh daemon) and serves the existing partition +// straight from the cache. The reported stats describe a no-op +// incremental run (no changed packages, no repartitioned nodes) so +// callers see the cache hit on the wire. func (s *Server) incrementalCommunities() (*analysis.CommunityResult, analysis.IncrementalCommunityStats) { s.analysisMu.Lock() defer s.analysisMu.Unlock() + cur := s.currentCommunityToken() + if s.communities != nil && s.communitiesToken == cur { + stats := analysis.IncrementalCommunityStats{ + Incremental: true, + } + if s.leidenCache != nil { + stats.TotalPackages = len(s.leidenCache.PackageFingerprints()) + } + if s.logger != nil { + s.logger.Debug("incrementalCommunities cache hit", + zap.Int("nodes", cur.nodeCount), + zap.Int("edges", cur.edgeCount), + zap.Int("edge_identity_rev", cur.edgeIdentity)) + } + return s.communities, stats + } + if s.logger != nil { + // INFO-level on the miss path so a regression that re-introduces + // a steady-state cache miss is visible without flipping the + // daemon to debug. The full token diff is here precisely to + // catch background-mutation regressions (some pass keeps drifting + // the edge count under the cache and the Leiden walk runs every + // call). A real first-call miss is a single line in the log. 
+ s.logger.Info("incrementalCommunities cache miss", + zap.Bool("communities_nil", s.communities == nil), + zap.Int("cached_nodes", s.communitiesToken.nodeCount), + zap.Int("cur_nodes", cur.nodeCount), + zap.Int("cached_edges", s.communitiesToken.edgeCount), + zap.Int("cur_edges", cur.edgeCount), + zap.Int("cached_edge_rev", s.communitiesToken.edgeIdentity), + zap.Int("cur_edge_rev", cur.edgeIdentity)) + } result, cache, stats := analysis.DetectCommunitiesLeidenIncremental(s.graph, s.leidenCache) + s.communities = result s.leidenCache = cache + // Capture the token AFTER the algo finishes — if the graph mutated + // during the (potentially slow) detector run, the token reflects + // the state the result was actually computed against, and the next + // call's token comparison stays meaningful. + s.communitiesToken = s.currentCommunityToken() return result, stats } @@ -1482,6 +1620,17 @@ func (s *Server) getHITS() *analysis.HITSResult { return s.hits } +// getHotspots returns the default-threshold hotspot ranking computed +// by the most recent RunAnalysis pass. Nil/empty until the first +// pass; callers use the live FindHotspots(threshold) path when they +// need a non-default threshold. Returned slice is shared and must +// not be mutated by the caller. +func (s *Server) getHotspots() []analysis.HotspotEntry { + s.analysisMu.RLock() + defer s.analysisMu.RUnlock() + return s.hotspots +} + // SetArchitecture installs the declarative architecture-rules DSL so // check_guards evaluates layered violations alongside the flat guard // rules. 
Called by the server / daemon entrypoint right after diff --git a/internal/mcp/server_test.go b/internal/mcp/server_test.go index 186acb15..3b36c0eb 100644 --- a/internal/mcp/server_test.go +++ b/internal/mcp/server_test.go @@ -26,16 +26,17 @@ import ( func setupTestServer(t *testing.T) (*Server, string) { t.Helper() dir := t.TempDir() + // Fixture deliberately has zero external imports so the + // resolver's attributeGoExternalCalls pass doesn't auto-add a + // `module::go:*` node — that lets the external-calls analyser + // tests assert on an exact set of manually-added modules. _ = os.WriteFile(filepath.Join(dir, "main.go"), []byte(`package main -import "fmt" - type Config struct { Port int } func main() { - fmt.Println("hello") helper() } diff --git a/internal/mcp/sidecar_migration_test.go b/internal/mcp/sidecar_migration_test.go new file mode 100644 index 00000000..2f28b85d --- /dev/null +++ b/internal/mcp/sidecar_migration_test.go @@ -0,0 +1,128 @@ +package mcp + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/persistence" +) + +// TestNotesManager_MigratesLegacyGobGz proves a pre-existing +// notes.gob.gz is imported into the sidecar on first manager open and +// the legacy file is renamed to *.bak (never deleted). +func TestNotesManager_MigratesLegacyGobGz(t *testing.T) { + cache := t.TempDir() + repo := "/tmp/migrate-notes-repo" + legacyDir := persistence.NotesDir(cache, repo) + require.NoError(t, persistence.SaveNotes(legacyDir, &persistence.NoteStore{ + Entries: []persistence.NoteEntry{ + {ID: "nt-legacy", Body: "legacy note", SessionID: "s1", Pinned: true}, + }, + })) + + nm := newNotesManager(cache, repo) + require.True(t, nm.HasData(), "legacy note imported") + got, ok := nm.Get("nt-legacy") + require.True(t, ok) + assert.Equal(t, "legacy note", got.Body) + assert.True(t, got.Pinned) + + // Legacy gob.gz renamed to .bak. 
+ _, errOrig := os.Stat(filepath.Join(legacyDir, "notes.gob.gz")) + assert.True(t, os.IsNotExist(errOrig), "legacy notes.gob.gz renamed away") + _, errBak := os.Stat(filepath.Join(legacyDir, "notes.gob.gz.bak")) + assert.NoError(t, errBak, ".bak preserved") + + // A fresh manager over the same cache sees the migrated note from + // the sidecar and does not re-import (idempotent). + nm2 := newNotesManager(cache, repo) + assert.Equal(t, 1, nm2.Count()) +} + +// TestMemoryManager_MigratesLegacyGobGz proves the same for memories. +func TestMemoryManager_MigratesLegacyGobGz(t *testing.T) { + cache := t.TempDir() + repo := "/tmp/migrate-mem-repo" + legacyDir := persistence.MemoriesDir(cache, repo) + require.NoError(t, persistence.SaveMemories(legacyDir, &persistence.MemoryStore{ + Entries: []persistence.MemoryEntry{ + {ID: "mem-legacy", Body: "legacy memory", Kind: "invariant", Importance: 5}, + }, + })) + + mm := newMemoryManager(cache, repo) + out := mm.Query(MemoryQueryFilter{}) + require.Len(t, out, 1) + assert.Equal(t, "mem-legacy", out[0].ID) + assert.Equal(t, "invariant", out[0].Kind) + + _, errBak := os.Stat(filepath.Join(legacyDir, "memories.gob.gz.bak")) + assert.NoError(t, errBak) +} + +// TestScopeStore_MigratesLegacyJSON proves a pre-existing scopes.json +// is imported into the sidecar and renamed to *.bak. +func TestScopeStore_MigratesLegacyJSON(t *testing.T) { + dir := t.TempDir() + legacyPath := filepath.Join(dir, "scopes.json") + require.NoError(t, os.WriteFile(legacyPath, + []byte(`[{"name":"backend","description":"be","repos":["api","core"]}]`), 0o644)) + + st := newScopeStore(legacyPath) + got, ok := st.get("backend") + require.True(t, ok, "legacy scope imported") + assert.Equal(t, []string{"api", "core"}, got.Repos) + + _, errBak := os.Stat(legacyPath + ".bak") + assert.NoError(t, errBak, ".bak preserved") + + // A fresh store over the same dir reads from the sidecar. 
+ st2 := newScopeStore(legacyPath) + _, ok = st2.get("backend") + assert.True(t, ok) +} + +// TestNotebookManager_MigratesLegacyMarkdown proves pre-existing +// {repo}/.gortex/notebook/{id}.md files are imported into the sidecar +// and renamed to *.bak. +func TestNotebookManager_MigratesLegacyMarkdown(t *testing.T) { + repo := t.TempDir() + mdDir := filepath.Join(repo, ".gortex", "notebook") + require.NoError(t, os.MkdirAll(mdDir, 0o755)) + md := notebookMarshal(notebookEntry{ + ID: "nbold", + Title: "legacy nb", + Tags: []string{"design"}, + Body: "legacy body\n", + }) + require.NoError(t, os.WriteFile(filepath.Join(mdDir, "nbold.md"), []byte(md), 0o644)) + + nm := newNotebookManager(repo) + got, ok := nm.Get("nbold") + require.True(t, ok, "legacy markdown entry imported") + assert.Equal(t, "legacy nb", got.Title) + assert.Contains(t, got.Body, "legacy body") + assert.Equal(t, []string{"design"}, got.Tags) + + _, errBak := os.Stat(filepath.Join(mdDir, "nbold.md.bak")) + assert.NoError(t, errBak, ".bak preserved") +} + +// TestNotebookManager_PersistsAcrossRestart proves notebook entries +// survive a manager restart (the sidecar is the durable store). +func TestNotebookManager_PersistsAcrossRestart(t *testing.T) { + repo := t.TempDir() + nm1 := newNotebookManager(repo) + saved, err := nm1.Save(notebookEntry{Title: "t1", Body: "b1", Tags: []string{"x"}}) + require.NoError(t, err) + + nm2 := newNotebookManager(repo) + got, ok := nm2.Get(saved.ID) + require.True(t, ok, "entry survives a manager restart via the sidecar") + assert.Equal(t, "t1", got.Title) + assert.Equal(t, "b1", got.Body) +} diff --git a/internal/mcp/streamable/transport.go b/internal/mcp/streamable/transport.go index 918b542e..2122c24d 100644 --- a/internal/mcp/streamable/transport.go +++ b/internal/mcp/streamable/transport.go @@ -438,14 +438,20 @@ func (t *Transport) localDispatch(r *http.Request, state SessionState, frame []b // roster, and proxy the call there.
A return value of (_, _, false) // means "fall through to local dispatch". func (t *Transport) tryRouteToolCall(r *http.Request, state SessionState, frame []byte) ([]byte, int, bool) { + // Decode the JSON-RPC envelope keeping the inbound `arguments` + // object as raw bytes — we MUST forward every caller-supplied key + // (e.g. `query`, `limit`, etc.) to the downstream executor, not + // just the workspace+cwd peek fields. A previous version + // re-marshalled only the typed peek struct, which silently + // stripped every other argument and made every router-routed tool + // call see an empty args map ("X is required" failures). Mirror + // the daemon dispatcher's tryProxyToolCall: peek workspace+cwd + // without dropping the rest. var envelope struct { ID json.RawMessage `json:"id"` Params struct { - Name string `json:"name"` - Arguments struct { - Workspace string `json:"workspace"` - Cwd string `json:"cwd"` - } `json:"arguments"` + Name string `json:"name"` + Arguments json.RawMessage `json:"arguments"` } `json:"params"` } if err := json.Unmarshal(frame, &envelope); err != nil { @@ -454,23 +460,39 @@ func (t *Transport) tryRouteToolCall(r *http.Request, state SessionState, frame if envelope.Params.Name == "" { return nil, 0, false } - scope := envelope.Params.Arguments.Workspace + // Second decode is only used to peek the routing hints. 
+ var peek struct { + Workspace string `json:"workspace"` + Cwd string `json:"cwd"` + } + if len(envelope.Params.Arguments) > 0 { + _ = json.Unmarshal(envelope.Params.Arguments, &peek) + } + scope := peek.Workspace if scope == "" { scope = state.Workspace } - cwd := envelope.Params.Arguments.Cwd + cwd := peek.Cwd if cwd == "" { cwd = state.CWD } if cwd == "" { cwd = strings.TrimSpace(r.Header.Get("X-Gortex-Cwd")) } - argsJSON, err := json.Marshal(envelope.Params.Arguments) + // Wrap the original raw arguments under `{"arguments": {...}}` so + // the local executor's nested-arguments unmarshal path (see + // cmd/gortex/server_router.go newLocalToolExecutor) finds them. + // This matches cmd/gortex/daemon_mcp.go:tryProxyToolCall exactly. + rawArgs := envelope.Params.Arguments + if len(rawArgs) == 0 { + rawArgs = json.RawMessage(`{}`) + } + body, err := json.Marshal(map[string]json.RawMessage{"arguments": rawArgs}) if err != nil { return nil, 0, false } out, status, rerr := t.router.RouteToolCall(r.Context(), - envelope.Params.Name, argsJSON, daemon.RouteContext{ + envelope.Params.Name, body, daemon.RouteContext{ ScopeOverride: scope, Cwd: cwd, }) diff --git a/internal/mcp/streamable/transport_test.go b/internal/mcp/streamable/transport_test.go index c483645f..b12d5bc7 100644 --- a/internal/mcp/streamable/transport_test.go +++ b/internal/mcp/streamable/transport_test.go @@ -16,6 +16,8 @@ import ( "github.com/mark3labs/mcp-go/mcp" mcpserver "github.com/mark3labs/mcp-go/server" + "github.com/zzet/gortex/internal/daemon" + "go.uber.org/zap" ) // newTestMCPServer mints an mcp-go server pre-loaded with an `echo` @@ -713,6 +715,91 @@ func TestMCPServerDispatcherNilFailsCleanly(t *testing.T) { } } +// TestRouterPreservesFullArguments pins the regression fix: when the +// streamable transport routes a tools/call through a daemon.Router +// whose local executor unmarshals to a map, the executor must see the +// caller's ORIGINAL arguments — not a stripped {workspace,cwd} peek. 
+// +// A previous version of tryRouteToolCall re-marshalled only the typed +// peek struct (workspace+cwd) and dropped every other key on the +// floor, breaking every real MCP usage with "X is required" because +// the args map was effectively empty. This test fails on that bug +// and passes on the fix. +func TestRouterPreservesFullArguments(t *testing.T) { + var seenBody []byte + router := daemon.NewRouter(daemon.RouterConfig{ + LocalExecute: func(_ context.Context, _ string, body []byte) ([]byte, int, error) { + seenBody = append([]byte(nil), body...) + // Mirror the production local executor: unwrap + // `{"arguments": {...}}` then assert every caller key + // survived the round-trip. + var nested struct { + Arguments map[string]any `json:"arguments"` + } + if err := json.Unmarshal(body, &nested); err != nil { + return nil, 500, err + } + if nested.Arguments == nil { + return []byte(`{"error":"no arguments"}`), 200, nil + } + out, _ := json.Marshal(map[string]any{"ok": true, "args": nested.Arguments}) + return out, 200, nil + }, + Logger: zap.NewNop(), + }) + + store := NewMemoryStore(time.Minute) + defer store.Close() + tr := New(Config{ + Dispatcher: MCPServerDispatcher{Server: newTestMCPServer()}, + Store: store, + Router: router, + }) + + // Seed an initialized session so the transport accepts the call. + sid, err := store.Create(SessionState{Initialized: true, ClientName: "test"}) + if err != nil { + t.Fatalf("seed Create: %v", err) + } + + callBody := jsonRPC(7, "tools/call", map[string]any{ + "name": "search_symbols", + "arguments": map[string]any{ + "query": "NewServer", + "limit": 10, + }, + }) + rec := doPOST(t, tr, callBody, map[string]string{HeaderSessionID: sid}) + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, body=%s", rec.Code, rec.Body.String()) + } + + // 1) The local executor must have seen the original args. 
+ var nested struct { + Arguments map[string]any `json:"arguments"` + } + if err := json.Unmarshal(seenBody, &nested); err != nil { + t.Fatalf("local executor body not JSON: %v\nbody=%s", err, string(seenBody)) + } + if nested.Arguments == nil { + t.Fatalf("local executor saw nil arguments — args were stripped. body=%s", string(seenBody)) + } + if got, _ := nested.Arguments["query"].(string); got != "NewServer" { + t.Errorf("query = %q, want %q (args stripped before reaching executor). body=%s", + got, "NewServer", string(seenBody)) + } + // JSON numbers decode to float64 in interface{}; compare as such. + if got, _ := nested.Arguments["limit"].(float64); got != 10 { + t.Errorf("limit = %v, want 10 (args stripped before reaching executor). body=%s", + got, string(seenBody)) + } + + // 2) The wrapped tool result must reach the client too. + if !strings.Contains(rec.Body.String(), "NewServer") { + t.Errorf("client response missing forwarded args: %s", rec.Body.String()) + } +} + // TestHTTPRoundTripEndToEnd — fires the transport behind an // httptest.Server so the body actually flows through net/http; covers // the boundary the per-test recorder can't. 
diff --git a/internal/mcp/tools_analyze_annotation_users_test.go b/internal/mcp/tools_analyze_annotation_users_test.go index 65b573e8..099ee610 100644 --- a/internal/mcp/tools_analyze_annotation_users_test.go +++ b/internal/mcp/tools_analyze_annotation_users_test.go @@ -30,7 +30,7 @@ func callAnalyzeAnnotationUsers(t *testing.T, srv *Server, args map[string]any) return out } -func addAnnotationNode(g *graph.Graph, id, name string) { +func addAnnotationNode(g graph.Store, id, name string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindType, @@ -39,7 +39,7 @@ func addAnnotationNode(g *graph.Graph, id, name string) { }) } -func addAnnotatedEdge(g *graph.Graph, from, to, args string) { +func addAnnotatedEdge(g graph.Store, from, to, args string) { e := &graph.Edge{From: from, To: to, Kind: graph.EdgeAnnotated, FilePath: "x.go", Line: 1} if args != "" { e.Meta = map[string]any{"args": args} diff --git a/internal/mcp/tools_analyze_blame_test.go b/internal/mcp/tools_analyze_blame_test.go index 07968b8a..c594cf5c 100644 --- a/internal/mcp/tools_analyze_blame_test.go +++ b/internal/mcp/tools_analyze_blame_test.go @@ -63,18 +63,23 @@ func TestAnalyzeBlame_StampsLastAuthored(t *testing.T) { } // Spot-check at least one symbol got authorship metadata. + // blame now persists in the typed sidecar (change A), not Node.Meta. found := false - for _, n := range srv.graph.AllNodes() { - if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { - continue + if r, ok := srv.graph.(graph.BlameEnrichmentReader); ok { + for _, e := range r.BlameRows("") { + if e.Email == "test@example.com" { + found = true + break + } } - la, ok := n.Meta["last_authored"].(map[string]any) - if !ok { - continue - } - if la["email"] == "test@example.com" { - found = true - break + } + if !found { + // Fallback for capability-less backends: scan Meta. 
+ for _, n := range srv.graph.AllNodes() { + if la, ok := n.Meta["last_authored"].(map[string]any); ok && la["email"] == "test@example.com" { + found = true + break + } } } if !found { diff --git a/internal/mcp/tools_analyze_channel_ops_test.go b/internal/mcp/tools_analyze_channel_ops_test.go index d5e3cd15..9031f70e 100644 --- a/internal/mcp/tools_analyze_channel_ops_test.go +++ b/internal/mcp/tools_analyze_channel_ops_test.go @@ -30,7 +30,7 @@ func callAnalyzeChannelOps(t *testing.T, srv *Server, args map[string]any) map[s return out } -func addChannelEdge(g *graph.Graph, kind graph.EdgeKind, from, to, file string, line int) { +func addChannelEdge(g graph.Store, kind graph.EdgeKind, from, to, file string, line int) { g.AddEdge(&graph.Edge{ From: from, To: to, diff --git a/internal/mcp/tools_analyze_clusters.go b/internal/mcp/tools_analyze_clusters.go index 699162c2..4f0c3e13 100644 --- a/internal/mcp/tools_analyze_clusters.go +++ b/internal/mcp/tools_analyze_clusters.go @@ -63,12 +63,6 @@ func (s *Server) handleAnalyzeClusters(ctx context.Context, req mcp.CallToolRequ }) } - scoped := s.scopedNodes(ctx) - scopedSet := make(map[string]*graph.Node, len(scoped)) - for _, n := range scoped { - scopedSet[n.ID] = n - } - type clusterRow struct { ID string `json:"id"` Label string `json:"label"` @@ -82,8 +76,18 @@ func (s *Server) handleAnalyzeClusters(ctx context.Context, req mcp.CallToolRequ MemberSample []string `json:"member_sample,omitempty"` } - rows := make([]clusterRow, 0, len(cr.Communities)) - for _, c := range cr.Communities { + // First pass: keep only the clusters that survive size + path-prefix + // gates, then sort + truncate to the requested limit. The density, + // language-mix, and top-files work below is bounded by the truncated + // row count instead of every community in the partition — important + // on a disk backend where each member touches the graph store. 
+ type pending struct { + c *analysis.Community + row clusterRow + } + survivors := make([]pending, 0, len(cr.Communities)) + for i := range cr.Communities { + c := &cr.Communities[i] if c.Size < minSize { continue } @@ -99,30 +103,77 @@ func (s *Server) handleAnalyzeClusters(ctx context.Context, req mcp.CallToolRequ continue } } - row := clusterRow{ ID: c.ID, Label: c.Label, Hub: c.Hub, Size: c.Size, Files: len(c.Files), Languages: map[string]int{}, } - - // File-spread = files-per-member; 1.0 means every member - // lives in its own file (boundary-heavy), close to 0 means - // many members per file (file-bound cluster). if c.Size > 0 { row.FileSpread = roundScore(float64(len(c.Files)) / float64(c.Size)) } + survivors = append(survivors, pending{c: c, row: row}) + } + sort.Slice(survivors, func(i, j int) bool { + if survivors[i].c.Size != survivors[j].c.Size { + return survivors[i].c.Size > survivors[j].c.Size + } + return survivors[i].c.ID < survivors[j].c.ID + }) + truncated := false + if len(survivors) > limit { + survivors = survivors[:limit] + truncated = true + } - // Density requires the intra-cluster edge count. Use the - // member set + graph in-place; cheap on cluster-sized - // node lists. - memberSet := make(map[string]bool, len(c.Members)) - for _, m := range c.Members { - memberSet[m] = true + // Batch every surviving cluster's member ids and pull their nodes + + // outgoing edges in two calls — one round-trip each on + // a disk backend, against the per-member GetNode / GetOutEdges loop the + // previous shape ran (N members × 2 round-trips). Members from + // communities that didn't survive the truncate above never reach + // the store. + // + // Per-cluster member cap: communities can hold thousands of nodes + // each. 
On a disk backend, fetching tens of thousands of nodes + edges per + // call is several seconds of cost — the rendered response only + // uses these to compute density / language mix / top files, all of + // which converge on a representative sample long before they need + // every member. With a default 50-cluster limit and ~200 sampled + // members per cluster, the IN-list stays under 10k IDs and the + // rendering stays sub-second. The exact `size` field still reflects + // the true cluster size because it comes from c.Size, not from the + // sampled set. + const sampleCap = 200 + sampleMemberIDs := make([]string, 0, len(survivors)*sampleCap) + sampleSets := make([]map[string]bool, 0, len(survivors)) + for _, p := range survivors { + members := p.c.Members + if len(members) > sampleCap { + members = members[:sampleCap] + } + set := make(map[string]bool, len(members)) + for _, m := range members { + set[m] = true } + sampleSets = append(sampleSets, set) + sampleMemberIDs = append(sampleMemberIDs, members...) + } + memberNodes := s.graph.GetNodesByIDs(sampleMemberIDs) + memberOutEdges := s.graph.GetOutEdgesByNodeIDs(sampleMemberIDs) + + rows := make([]clusterRow, 0, len(survivors)) + for i, p := range survivors { + c := p.c + row := p.row + memberSet := sampleSets[i] + sampleSize := len(memberSet) + + // Density on the sample, normalised against (sampleSize · + // (sampleSize-1)) to keep the ratio meaningful when only part + // of the cluster was inspected. Intra-sample edges restricted + // to the call / reference kinds the clusterer cares about. intra := 0 - for _, m := range c.Members { - for _, e := range s.graph.GetOutEdges(m) { + for m := range memberSet { + for _, e := range memberOutEdges[m] { if e.Kind != graph.EdgeCalls && e.Kind != graph.EdgeReferences { continue } @@ -131,16 +182,14 @@ func (s *Server) handleAnalyzeClusters(ctx context.Context, req mcp.CallToolRequ } } } - // Density = intra-edges / possible-directed-pairs. 
- if c.Size > 1 { - possible := c.Size * (c.Size - 1) + if sampleSize > 1 { + possible := sampleSize * (sampleSize - 1) row.Density = roundScore(float64(intra) / float64(possible)) } - // Language mix + top files. fileCounts := map[string]int{} - for _, m := range c.Members { - n := scopedSet[m] + for m := range memberSet { + n := memberNodes[m] if n == nil { continue } @@ -156,17 +205,6 @@ func (s *Server) handleAnalyzeClusters(ctx context.Context, req mcp.CallToolRequ rows = append(rows, row) } - sort.Slice(rows, func(i, j int) bool { - if rows[i].Size != rows[j].Size { - return rows[i].Size > rows[j].Size - } - return rows[i].ID < rows[j].ID - }) - truncated := false - if len(rows) > limit { - rows = rows[:limit] - truncated = true - } resp := map[string]any{ "clusters": rows, diff --git a/internal/mcp/tools_analyze_components.go b/internal/mcp/tools_analyze_components.go new file mode 100644 index 00000000..bcb2b2f9 --- /dev/null +++ b/internal/mcp/tools_analyze_components.go @@ -0,0 +1,164 @@ +// wcc / scc — connected-component diagnostics. +// +// `analyze kind=wcc` returns the weakly connected components: pairs +// of symbols reachable from each other ignoring edge direction. A +// healthy index has a small number of large WCCs (the connected +// codebase) plus a long tail of singletons (isolated extracted +// symbols). A WCC count that explodes between reindexes signals +// extraction drift, not code change. +// +// `analyze kind=scc` returns the strongly connected components: +// pairs of symbols mutually reachable along directed edges. Every +// non-trivial SCC (size > 1) is a recursion ring — mutual +// recursion in calls, two-way references between data types, +// circular module dependencies. Useful for cycle audits beyond +// what kind=cycles surfaces today. +// +// Routing: +// +// - When the backing graph.Store implements graph.ComponentFinder +// (today only store_sqlite), both kinds delegate to the +// engine-native algorithm. 
+// +// - Otherwise the in-process analysis.ComputeWCC / +// analysis.ComputeSCC runs. SCC uses an iterative Tarjan so a +// deep call graph won't blow the goroutine stack. + +package mcp + +import ( + "context" + "fmt" + "sort" + "strings" + + "github.com/mark3labs/mcp-go/mcp" + + "github.com/zzet/gortex/internal/analysis" + "github.com/zzet/gortex/internal/graph" +) + +// componentRow is the per-component shape the analyzer returns. +type componentRow struct { + ID int `json:"id"` + Size int `json:"size"` + Members []string `json:"members"` +} + +// handleAnalyzeConnectedComponents serves both `analyze kind=wcc` +// and `analyze kind=scc`. The directed flag picks SCC; unset picks +// WCC. +func (s *Server) handleAnalyzeConnectedComponents( + ctx context.Context, req mcp.CallToolRequest, directed bool, +) (*mcp.CallToolResult, error) { + args := req.GetArguments() + + limit := 50 + if v, ok := args["limit"].(float64); ok && v > 0 { + limit = int(v) + } + minSize := 0 + if v, ok := args["min_size"].(float64); ok && v > 0 { + minSize = int(v) + } + memberLimit := 100 + if v, ok := args["member_limit"].(float64); ok && v > 0 { + memberLimit = int(v) + } + + kindLabel := "wcc" + if directed { + kindLabel = "scc" + } + + results := s.runComponents(directed, analysis.ComponentOptions{MinSize: minSize}) + if limit > 0 && limit < len(results) { + results = results[:limit] + } + + rows := make([]componentRow, 0, len(results)) + for _, r := range results { + members := r.Members + if memberLimit > 0 && memberLimit < len(members) { + members = members[:memberLimit] + } + rows = append(rows, componentRow{ID: r.ID, Size: r.Size, Members: members}) + } + + if s.isGCX(ctx, req) { + return s.gcxResponseWithBudget(req)(encodeAnalyze(kindLabel, rows)) + } + if isCompact(req) { + var b strings.Builder + for _, r := range rows { + fmt.Fprintf(&b, "id=%d size=%d members=%v\n", r.ID, r.Size, r.Members) + } + return mcp.NewToolResultText(b.String()), nil + } + return 
s.respondJSONOrTOON(ctx, req, map[string]any{ + "components": rows, + "total": len(rows), + "kind": kindLabel, + }) +} + +// runComponents picks the engine-native path when the backing +// store implements graph.ComponentFinder, otherwise falls back to +// the in-process analysis.ComputeWCC / ComputeSCC. +func (s *Server) runComponents(directed bool, opts analysis.ComponentOptions) []analysis.ComponentResult { + if store := s.backendStore(); store != nil { + if cf, ok := store.(graph.ComponentFinder); ok { + hits, err := callComponentFinder(cf, directed, graph.ComponentOpts{ + NodeKinds: opts.NodeKinds, + EdgeKinds: opts.EdgeKinds, + }) + if err == nil { + return collectHits(hits, opts.MinSize) + } + // Engine-native error falls through to the in-process + // path rather than returning a half-done result. + } + } + if directed { + return analysis.ComputeSCC(s.graph, opts) + } + return analysis.ComputeWCC(s.graph, opts) +} + +func callComponentFinder(cf graph.ComponentFinder, directed bool, opts graph.ComponentOpts) ([]graph.ComponentHit, error) { + if directed { + return cf.StronglyConnectedComponents(opts) + } + return cf.WeaklyConnectedComponents(opts) +} + +// collectHits groups ComponentHits by ID, applies MinSize, sorts +// for determinism, and renumbers — mirrors analysis.collectComponents +// without exporting that internal helper.
+func collectHits(hits []graph.ComponentHit, minSize int) []analysis.ComponentResult { + groups := make(map[int64][]string) + for _, h := range hits { + groups[h.ComponentID] = append(groups[h.ComponentID], h.NodeID) + } + out := make([]analysis.ComponentResult, 0, len(groups)) + for _, members := range groups { + if minSize > 0 && len(members) < minSize { + continue + } + sort.Strings(members) + out = append(out, analysis.ComponentResult{Members: members, Size: len(members)}) + } + sort.Slice(out, func(i, j int) bool { + if out[i].Size != out[j].Size { + return out[i].Size > out[j].Size + } + if len(out[i].Members) > 0 && len(out[j].Members) > 0 { + return out[i].Members[0] < out[j].Members[0] + } + return false + }) + for i := range out { + out[i].ID = i + } + return out +} diff --git a/internal/mcp/tools_analyze_concurrency.go b/internal/mcp/tools_analyze_concurrency.go index b57586ac..14a9de44 100644 --- a/internal/mcp/tools_analyze_concurrency.go +++ b/internal/mcp/tools_analyze_concurrency.go @@ -72,10 +72,7 @@ func (s *Server) handleAnalyzeRaceWrites(ctx context.Context, req mcp.CallToolRe } var rows []raceRow - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeWrites { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeWrites) { if !goroutineReachable[e.From] { continue } @@ -162,10 +159,7 @@ func (s *Server) handleAnalyzeRaceWrites(ctx context.Context, req mcp.CallToolRe func (s *Server) buildGoroutineReachableSet() map[string]bool { reach := map[string]bool{} var roots []string - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeSpawns { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeSpawns) { if !reach[e.To] { reach[e.To] = true roots = append(roots, e.To) @@ -282,10 +276,7 @@ func (s *Server) handleAnalyzeUnclosedChannels(ctx context.Context, req mcp.Call // channel"; the channel arg isn't tracked so the membership test // is per-function, not per-channel. 
closesIn := map[string]bool{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeCalls { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeCalls) { if callTargetName(e) != "close" { continue } @@ -303,10 +294,7 @@ func (s *Server) handleAnalyzeUnclosedChannels(ctx context.Context, req mcp.Call Line int } byChannel := map[string]*channelInfo{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeSends && e.Kind != graph.EdgeRecvs { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeSends, graph.EdgeRecvs) { info := byChannel[e.To] if info == nil { info = &channelInfo{ @@ -366,7 +354,7 @@ func (s *Server) handleAnalyzeUnclosedChannels(ctx context.Context, req mcp.Call if anyCloser { continue } - risk, reason := classifyUnclosed(info.Sends, len(info.Senders), info.Recvs) + risk, reason := classifyUnclosed(len(info.Senders), info.Recvs) rows = append(rows, unclosedRow{ Channel: info.Channel, FilePath: info.FilePath, @@ -434,7 +422,7 @@ func (s *Server) handleAnalyzeUnclosedChannels(ctx context.Context, req mcp.Call // receivers — the receiver may or may not range; without arg flow // we can't tell. Low: senders without receivers, almost always a // fire-and-forget signal. 
-func classifyUnclosed(sends, senders, recvs int) (string, string) { +func classifyUnclosed(senders, recvs int) (string, string) { switch { case senders >= 2 && recvs >= 1: return "high", "multiple senders with consumer(s) and no detected close — receivers will hang on range" diff --git a/internal/mcp/tools_analyze_concurrency_test.go b/internal/mcp/tools_analyze_concurrency_test.go index 466b57b1..b1db8739 100644 --- a/internal/mcp/tools_analyze_concurrency_test.go +++ b/internal/mcp/tools_analyze_concurrency_test.go @@ -34,15 +34,15 @@ func concurrencyServer(t *testing.T) *Server { return NewServer(eng, g, idx, nil, zap.NewNop(), nil) } -func addFn(g *graph.Graph, id, name, path string) { +func addFn(g graph.Store, id, name, path string) { g.AddNode(&graph.Node{ID: id, Kind: graph.KindFunction, Name: name, FilePath: path, Language: "go"}) } -func addField(g *graph.Graph, id, name, path string) { +func addField(g graph.Store, id, name, path string) { g.AddNode(&graph.Node{ID: id, Kind: graph.KindField, Name: name, FilePath: path, Language: "go"}) } -func addEdge(g *graph.Graph, from, to string, kind graph.EdgeKind, path string, line int) { +func addEdge(g graph.Store, from, to string, kind graph.EdgeKind, path string, line int) { g.AddEdge(&graph.Edge{From: from, To: to, Kind: kind, FilePath: path, Line: line, Confidence: 1}) } @@ -328,15 +328,15 @@ func TestAnalyzeRaceWrites_GCXEncodesRow(t *testing.T) { // addMethod / addType / addTypedField build the node shapes the // concurrency classifier reads: a method linked to its receiver type // via EdgeMemberOf, and a typed field linked to its owning type. 
-func addMethod(g *graph.Graph, id, name, path string) { +func addMethod(g graph.Store, id, name, path string) { g.AddNode(&graph.Node{ID: id, Kind: graph.KindMethod, Name: name, FilePath: path, Language: "go"}) } -func addType(g *graph.Graph, id, name, path string) { +func addType(g graph.Store, id, name, path string) { g.AddNode(&graph.Node{ID: id, Kind: graph.KindType, Name: name, FilePath: path, Language: "go"}) } -func addTypedField(g *graph.Graph, id, name, fieldType, path string) { +func addTypedField(g graph.Store, id, name, fieldType, path string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindField, Name: name, FilePath: path, Language: "go", Meta: map[string]any{"field_type": fieldType}, diff --git a/internal/mcp/tools_analyze_config_readers_test.go b/internal/mcp/tools_analyze_config_readers_test.go index c53aed2a..e0a656a2 100644 --- a/internal/mcp/tools_analyze_config_readers_test.go +++ b/internal/mcp/tools_analyze_config_readers_test.go @@ -30,7 +30,7 @@ func callAnalyzeConfigReaders(t *testing.T, srv *Server, args map[string]any) ma return out } -func addConfigKeyNode(g *graph.Graph, id, name, source string) { +func addConfigKeyNode(g graph.Store, id, name, source string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindConfigKey, @@ -39,7 +39,7 @@ func addConfigKeyNode(g *graph.Graph, id, name, source string) { }) } -func addReadConfigEdge(g *graph.Graph, from, to string) { +func addReadConfigEdge(g graph.Store, from, to string) { g.AddEdge(&graph.Edge{From: from, To: to, Kind: graph.EdgeReadsConfig}) } diff --git a/internal/mcp/tools_analyze_coverage_gaps_test.go b/internal/mcp/tools_analyze_coverage_gaps_test.go index b2c0e441..a5f90e79 100644 --- a/internal/mcp/tools_analyze_coverage_gaps_test.go +++ b/internal/mcp/tools_analyze_coverage_gaps_test.go @@ -11,7 +11,7 @@ import ( // addCoveredNode wires a function node with synthetic // coverage_pct meta — emulating coverage.EnrichGraph output. 
-func addCoveredNode(g *graph.Graph, id, file string, pct float64, numStmt, hit int) { +func addCoveredNode(g graph.Store, id, file string, pct float64, numStmt, hit int) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindFunction, diff --git a/internal/mcp/tools_analyze_coverage_test.go b/internal/mcp/tools_analyze_coverage_test.go index b65e121f..70540776 100644 --- a/internal/mcp/tools_analyze_coverage_test.go +++ b/internal/mcp/tools_analyze_coverage_test.go @@ -19,12 +19,15 @@ func TestAnalyzeCoverage_StampsMeta(t *testing.T) { _ = os.WriteFile(filepath.Join(dir, "go.mod"), []byte("module example.test/repo\n\ngo 1.22\n"), 0o644) - // Synthetic cover profile: covers `main` (line 9), uncovered - // segment for `helper` (line 14). The file path is the - // module-qualified form Go's cover tool emits. + // Synthetic cover profile: covers `main` (line 7-9), uncovered + // segment for `helper` (line 11). Line numbers match the + // setupTestServer fixture in server_test.go — after the fmt + // import was dropped to keep external-call attribution clean, + // the function bodies shifted up by 2 lines. The file path is + // the module-qualified form Go's cover tool emits. profile := []byte(`mode: set -example.test/repo/main.go:9.13,11.2 1 1 -example.test/repo/main.go:14.13,14.16 1 0 +example.test/repo/main.go:7.13,9.2 1 1 +example.test/repo/main.go:11.13,11.16 1 0 `) profilePath := filepath.Join(dir, "cover.out") if err := os.WriteFile(profilePath, profile, 0o644); err != nil { @@ -60,11 +63,22 @@ example.test/repo/main.go:14.13,14.16 1 0 // Spot-check the function node got coverage_pct. 
hasCovered, hasUncovered := false, false + covByID := map[string]float64{} + if r, ok := srv.graph.(graph.CoverageEnrichmentReader); ok { + for _, e := range r.CoverageRows("") { + covByID[e.NodeID] = e.CoveragePct + } + } for _, n := range srv.graph.AllNodes() { if n.Kind != graph.KindFunction { continue } - pct, ok := n.Meta["coverage_pct"].(float64) + pct, ok := covByID[n.ID] + if !ok { + if p, has := n.Meta["coverage_pct"].(float64); has { + pct, ok = p, true + } + } if !ok { continue } diff --git a/internal/mcp/tools_analyze_cross_repo_test.go b/internal/mcp/tools_analyze_cross_repo_test.go index 4940c33c..b347593c 100644 --- a/internal/mcp/tools_analyze_cross_repo_test.go +++ b/internal/mcp/tools_analyze_cross_repo_test.go @@ -33,7 +33,7 @@ func callAnalyzeCrossRepo(t *testing.T, srv *Server, args map[string]any) map[st // seedCrossRepoGraph wires three repos with a handful of cross-repo // edges so the analyzer has something to group. -func seedCrossRepoGraph(g *graph.Graph) { +func seedCrossRepoGraph(g graph.Store) { add := func(id, repo string) { g.AddNode(&graph.Node{ID: id, Kind: graph.KindFunction, Name: id, FilePath: id, RepoPrefix: repo}) } diff --git a/internal/mcp/tools_analyze_edges.go b/internal/mcp/tools_analyze_edges.go index d4f8e844..ebeaaed0 100644 --- a/internal/mcp/tools_analyze_edges.go +++ b/internal/mcp/tools_analyze_edges.go @@ -20,6 +20,8 @@ package mcp import ( "context" "fmt" + "iter" + "slices" "sort" "strings" @@ -68,10 +70,9 @@ func (s *Server) handleAnalyzeChannelOps(ctx context.Context, req mcp.CallToolRe return row } - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeSends && e.Kind != graph.EdgeRecvs { - continue - } + // One scan over Sends+Recvs only — replaces the legacy AllEdges() + // walk that pulled every edge over cgo just to keep two kinds. 
+ for e := range edgesByKinds(s.graph, graph.EdgeSends, graph.EdgeRecvs) { if pathPrefix != "" && !strings.HasPrefix(e.FilePath, pathPrefix) { continue } @@ -156,10 +157,7 @@ func (s *Server) handleAnalyzeGoroutineSpawns(ctx context.Context, req mcp.CallT } byTarget := map[string]*spawnRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeSpawns { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeSpawns) { mode, _ := e.Meta["mode"].(string) key := e.To + "|" + mode row, ok := byTarget[key] @@ -271,10 +269,7 @@ func (s *Server) handleAnalyzeFieldWriters(ctx context.Context, req mcp.CallTool } byField := map[string]*writerRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeWrites { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeWrites) { if idFilter != "" && e.To != idFilter { continue } @@ -379,8 +374,8 @@ func (s *Server) handleAnalyzeAnnotationUsers(ctx context.Context, req mcp.CallT Args string `json:"args,omitempty"` } var rows []annotatedRow - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeAnnotated || e.To != idFilter { + for e := range edgesByKinds(s.graph, graph.EdgeAnnotated) { + if e.To != idFilter { continue } argsStr, _ := e.Meta["args"].(string) @@ -433,10 +428,7 @@ func (s *Server) handleAnalyzeAnnotationUsers(ctx context.Context, req mcp.CallT Users int `json:"users"` } byID := map[string]*annoRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeAnnotated { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeAnnotated) { row, ok := byID[e.To] if !ok { n := s.graph.GetNode(e.To) @@ -523,10 +515,7 @@ func (s *Server) handleAnalyzeConfigReaders(ctx context.Context, req mcp.CallToo Reads int `json:"reads"` } byKey := map[string]*configRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeReadsConfig { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeReadsConfig) { row, ok := byKey[e.To] if !ok { n := 
s.graph.GetNode(e.To) @@ -636,10 +625,7 @@ func (s *Server) handleAnalyzeEnvVarUsers(ctx context.Context, req mcp.CallToolR Reads int `json:"reads"` } byKey := map[string]*envRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeReadsConfig { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeReadsConfig) { row, ok := byKey[e.To] if !ok { n := s.graph.GetNode(e.To) @@ -727,10 +713,7 @@ func (s *Server) handleAnalyzeEventEmitters(ctx context.Context, req mcp.CallToo Emitters []string `json:"emitters,omitempty"` } byEvent := map[string]*eventRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeEmits { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeEmits) { // Level filter: an emit edge stores the method on the edge // (e.g. "Errorf"); the event node may carry an event_kind. // We accept either source so both per-event and per-call @@ -880,7 +863,7 @@ func (s *Server) handleAnalyzePubsub(ctx context.Context, req mcp.CallToolReques return row } - for _, e := range s.graph.AllEdges() { + for e := range edgesByKinds(s.graph, graph.EdgeEmits, graph.EdgeListensOn) { switch e.Kind { case graph.EdgeEmits: row := ensureRow(e.To) @@ -987,61 +970,74 @@ func (s *Server) handleAnalyzeErrorSurface(ctx context.Context, req mcp.CallTool Errors []string `json:"errors"` ErrorMsgs []string `json:"error_msgs,omitempty"` } - byThrower := map[string]*throwerRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeThrows { - continue - } - if pathPrefix != "" && !strings.HasPrefix(e.FilePath, pathPrefix) { - continue - } - row, ok := byThrower[e.From] - if !ok { - n := s.graph.GetNode(e.From) - file := e.FilePath - line := e.Line - if n != nil { - if file == "" { - file = n.FilePath - } - if line == 0 { - line = n.StartLine - } + rows := make([]*throwerRow, 0) + if surfacer, ok := s.graph.(graph.ThrowerErrorSurfacer); ok { + // Server-side path: one server-side aggregate for the per-thrower + // 
throws+targets dedup, one for the per-thrower error-msg + // attachment. No per-thrower GetOutEdges fanout. + for _, r := range surfacer.ThrowerErrorSurface(pathPrefix) { + row := &throwerRow{ + Symbol: r.ThrowerID, + File: r.FilePath, + Line: r.Line, + Throws: r.Throws, + Errors: append([]string(nil), r.ErrorTargets...), + ErrorMsgs: append([]string(nil), r.ErrorMsgs...), } - row = &throwerRow{Symbol: e.From, File: file, Line: line} - byThrower[e.From] = row - } - row.Throws++ - row.Errors = appendUnique(row.Errors, e.To) - } - // For every thrower, also surface the error_msg KindString - // literals it emits. EdgeThrows targets error types; the - // data-side companion (errors.New("…") → string::error_msg::…) - // carries the literal message. Joining both gives an agent both - // "what error types propagate" and "what literal messages - // originate here" in one row. - for thrower, row := range byThrower { - for _, e := range s.graph.GetOutEdges(thrower) { - if e == nil || e.Kind != graph.EdgeEmits { + sort.Strings(row.Errors) + sort.Strings(row.ErrorMsgs) + rows = append(rows, row) + } + } else { + byThrower := map[string]*throwerRow{} + for e := range edgesByKinds(s.graph, graph.EdgeThrows) { + if pathPrefix != "" && !strings.HasPrefix(e.FilePath, pathPrefix) { continue } - n := s.graph.GetNode(e.To) - if n == nil || n.Kind != graph.KindString { - continue + row, ok := byThrower[e.From] + if !ok { + n := s.graph.GetNode(e.From) + file := e.FilePath + line := e.Line + if n != nil { + if file == "" { + file = n.FilePath + } + if line == 0 { + line = n.StartLine + } + } + row = &throwerRow{Symbol: e.From, File: file, Line: line} + byThrower[e.From] = row } - ctxLabel, _ := n.Meta["context"].(string) - if ctxLabel != "error_msg" { - continue + row.Throws++ + row.Errors = appendUnique(row.Errors, e.To) + } + // For every thrower, also surface the error_msg KindString + // literals it emits. 
EdgeThrows targets error types; the + // data-side companion (errors.New("…") → string::error_msg::…) + // carries the literal message. + for thrower, row := range byThrower { + for _, e := range s.graph.GetOutEdges(thrower) { + if e == nil || e.Kind != graph.EdgeEmits { + continue + } + n := s.graph.GetNode(e.To) + if n == nil || n.Kind != graph.KindString { + continue + } + ctxLabel, _ := n.Meta["context"].(string) + if ctxLabel != "error_msg" { + continue + } + row.ErrorMsgs = appendUnique(row.ErrorMsgs, n.Name) } - row.ErrorMsgs = appendUnique(row.ErrorMsgs, n.Name) } - } - - rows := make([]*throwerRow, 0, len(byThrower)) - for _, r := range byThrower { - sort.Strings(r.Errors) - sort.Strings(r.ErrorMsgs) - rows = append(rows, r) + for _, r := range byThrower { + sort.Strings(r.Errors) + sort.Strings(r.ErrorMsgs) + rows = append(rows, r) + } } sort.Slice(rows, func(i, j int) bool { // Throwers with the most distinct error targets surface @@ -1163,7 +1159,11 @@ func (s *Server) handleAnalyzeCrossRepo(ctx context.Context, req mcp.CallToolReq return "" } - for _, e := range s.graph.AllEdges() { + for e := range edgesByKinds(s.graph, + graph.EdgeCrossRepoCalls, + graph.EdgeCrossRepoImplements, + graph.EdgeCrossRepoExtends, + ) { base, ok := graph.BaseKindForCrossRepo(e.Kind) if !ok { continue @@ -1262,6 +1262,41 @@ func (s *Server) handleAnalyzeCrossRepo(ctx context.Context, req mcp.CallToolReq // shared helpers // --------------------------------------------------------------------------- +// edgesByKinds streams every edge whose Kind is in the supplied set +// using the EdgesByKindsScanner capability when the backend +// implements it (one round-trip with a `kind IN (…)` filter), or +// falls back to per-kind EdgesByKind iteration otherwise. +// +// The edge-driven analyzers below use it instead of `for _, e := range +// s.graph.AllEdges() { switch e.Kind … }` so the disk backends stop +// materialising the full edge table over cgo for a handful of kinds. 
+// Pass each kind as a separate argument — the parameter is variadic +// so call sites read as `edgesByKinds(g, EdgeEmits, +// EdgeListensOn)` rather than constructing a slice each time. +// +// An empty kinds list yields nothing — matches both the capability +// contract and the original semantics (no kinds requested means no rows). +func edgesByKinds(g graph.Store, kinds ...graph.EdgeKind) iter.Seq[*graph.Edge] { + if len(kinds) == 0 { + return func(yield func(*graph.Edge) bool) {} + } + if scanner, ok := g.(graph.EdgesByKindsScanner); ok { + return scanner.EdgesByKinds(kinds) + } + return func(yield func(*graph.Edge) bool) { + for _, k := range kinds { + if k == "" { + continue + } + for e := range g.EdgesByKind(k) { + if !yield(e) { + return + } + } + } + } +} + // appendUnique returns dst with v added if not already present. // Used by every analyzer above to dedupe the From-side caller list // without falling back to a map (the lists are small per row, so a @@ -1270,10 +1305,8 @@ func appendUnique(dst []string, v string) []string { if v == "" { return dst } - for _, x := range dst { - if x == v { - return dst - } + if slices.Contains(dst, v) { + return dst } return append(dst, v) } diff --git a/internal/mcp/tools_analyze_error_surface_test.go b/internal/mcp/tools_analyze_error_surface_test.go index e8e3baa5..420255a9 100644 --- a/internal/mcp/tools_analyze_error_surface_test.go +++ b/internal/mcp/tools_analyze_error_surface_test.go @@ -30,7 +30,7 @@ func callAnalyzeErrorSurface(t *testing.T, srv *Server, args map[string]any) map return out } -func addThrowsEdge(g *graph.Graph, from, to, file string, line int) { +func addThrowsEdge(g graph.Store, from, to, file string, line int) { g.AddEdge(&graph.Edge{ From: from, To: to, diff --git a/internal/mcp/tools_analyze_event_emitters_test.go b/internal/mcp/tools_analyze_event_emitters_test.go index 54af6e20..fbfd357e 100644 --- a/internal/mcp/tools_analyze_event_emitters_test.go +++ 
b/internal/mcp/tools_analyze_event_emitters_test.go @@ -30,7 +30,7 @@ func callAnalyzeEventEmitters(t *testing.T, srv *Server, args map[string]any) ma return out } -func addEventNode(g *graph.Graph, id, name, kind string) { +func addEventNode(g graph.Store, id, name, kind string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindEvent, @@ -39,7 +39,7 @@ func addEventNode(g *graph.Graph, id, name, kind string) { }) } -func addEmitsEdge(g *graph.Graph, from, to, method string) { +func addEmitsEdge(g graph.Store, from, to, method string) { e := &graph.Edge{From: from, To: to, Kind: graph.EdgeEmits} if method != "" { e.Meta = map[string]any{"method": method} diff --git a/internal/mcp/tools_analyze_external_calls.go b/internal/mcp/tools_analyze_external_calls.go index 77e03618..429f7cdf 100644 --- a/internal/mcp/tools_analyze_external_calls.go +++ b/internal/mcp/tools_analyze_external_calls.go @@ -247,7 +247,7 @@ func suffixVersion(v string) string { // countCallersToExternal counts every incoming non-EdgeDependsOnModule // edge to an external symbol node — those are the calls / references // that goanalysis attributed. -func countCallersToExternal(g *graph.Graph, nodeID string) int { +func countCallersToExternal(g graph.Store, nodeID string) int { n := 0 for _, e := range g.GetInEdges(nodeID) { if e.Kind == graph.EdgeDependsOnModule { @@ -260,7 +260,7 @@ func countCallersToExternal(g *graph.Graph, nodeID string) int { // tallyExternalCallers returns (totalCallEdges, distinctCallers) — the // detail surface for the per-module symbol listing. 
-func tallyExternalCallers(g *graph.Graph, nodeID string) (int, int) { +func tallyExternalCallers(g graph.Store, nodeID string) (int, int) { calls := 0 seen := map[string]struct{}{} for _, e := range g.GetInEdges(nodeID) { diff --git a/internal/mcp/tools_analyze_external_calls_test.go b/internal/mcp/tools_analyze_external_calls_test.go index 956ea6de..cfd86cd1 100644 --- a/internal/mcp/tools_analyze_external_calls_test.go +++ b/internal/mcp/tools_analyze_external_calls_test.go @@ -30,7 +30,7 @@ func callAnalyzeExternalCalls(t *testing.T, srv *Server, args map[string]any) ma return out } -func addExternalModuleNode(g *graph.Graph, id, path, version, kind string) { +func addExternalModuleNode(g graph.Store, id, path, version, kind string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindModule, @@ -44,7 +44,7 @@ func addExternalModuleNode(g *graph.Graph, id, path, version, kind string) { }) } -func addExternalSymbolNode(g *graph.Graph, id, name, importPath, moduleID string, kind graph.NodeKind) { +func addExternalSymbolNode(g graph.Store, id, name, importPath, moduleID string, kind graph.NodeKind) { g.AddNode(&graph.Node{ ID: id, Kind: kind, @@ -63,7 +63,7 @@ func addExternalSymbolNode(g *graph.Graph, id, name, importPath, moduleID string }) } -func addExternalCall(g *graph.Graph, from, to string) { +func addExternalCall(g graph.Store, from, to string) { g.AddEdge(&graph.Edge{ From: from, To: to, diff --git a/internal/mcp/tools_analyze_field_writers_test.go b/internal/mcp/tools_analyze_field_writers_test.go index 4c17c4a0..e98ca1ba 100644 --- a/internal/mcp/tools_analyze_field_writers_test.go +++ b/internal/mcp/tools_analyze_field_writers_test.go @@ -30,11 +30,11 @@ func callAnalyzeFieldWriters(t *testing.T, srv *Server, args map[string]any) map return out } -func addFieldNode(g *graph.Graph, id, name string) { +func addFieldNode(g graph.Store, id, name string) { g.AddNode(&graph.Node{ID: id, Kind: graph.KindField, Name: name}) } -func addWriteEdge(g *graph.Graph, 
from, to string) { +func addWriteEdge(g graph.Store, from, to string) { g.AddEdge(&graph.Edge{From: from, To: to, Kind: graph.EdgeWrites}) } diff --git a/internal/mcp/tools_analyze_framework.go b/internal/mcp/tools_analyze_framework.go index 566b68e6..300e55e9 100644 --- a/internal/mcp/tools_analyze_framework.go +++ b/internal/mcp/tools_analyze_framework.go @@ -39,10 +39,7 @@ func (s *Server) handleAnalyzeRoutes(ctx context.Context, req mcp.CallToolReques Line int `json:"line"` } var rows []*routeRow - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeHandlesRoute { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeHandlesRoute) { contractNode := s.graph.GetNode(e.To) if contractNode == nil { continue @@ -154,10 +151,7 @@ func (s *Server) handleAnalyzeModels(ctx context.Context, req mcp.CallToolReques Line int `json:"line"` } var rows []*modelRow - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeModelsTable { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeModelsTable) { modelNode := s.graph.GetNode(e.From) if modelNode == nil { continue @@ -269,10 +263,7 @@ func (s *Server) componentsRollup(ctx context.Context, req mcp.CallToolRequest, stats[id] = row return row } - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeRendersChild { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeRendersChild) { parent := get(e.From) parent.FanOut++ // Skip the child if it never resolved to a real node — leaving @@ -454,7 +445,7 @@ func (s *Server) handleAnalyzeDbtModels(ctx context.Context, req mcp.CallToolReq // Second pass: tally columns (EdgeMemberOf → model) and lineage // (EdgeDependsOn between two model nodes) in one walk of AllEdges. 
- for _, e := range s.graph.AllEdges() { + for e := range edgesByKinds(s.graph, graph.EdgeMemberOf, graph.EdgeDependsOn) { switch e.Kind { case graph.EdgeMemberOf: if r := rowByID[e.To]; r != nil { diff --git a/internal/mcp/tools_analyze_framework_test.go b/internal/mcp/tools_analyze_framework_test.go index 00f5f285..365f3a87 100644 --- a/internal/mcp/tools_analyze_framework_test.go +++ b/internal/mcp/tools_analyze_framework_test.go @@ -30,7 +30,7 @@ func callAnalyzeFramework(t *testing.T, srv *Server, kind string, args map[strin return out } -func addContractNode(g *graph.Graph, id, ctype string, meta map[string]any) { +func addContractNode(g graph.Store, id, ctype string, meta map[string]any) { full := map[string]any{"type": ctype, "role": "provider"} for k, v := range meta { full[k] = v @@ -40,7 +40,7 @@ func addContractNode(g *graph.Graph, id, ctype string, meta map[string]any) { }) } -func addHandlesRouteEdge(g *graph.Graph, from, to, file string, line int) { +func addHandlesRouteEdge(g graph.Store, from, to, file string, line int) { g.AddEdge(&graph.Edge{ From: from, To: to, Kind: graph.EdgeHandlesRoute, FilePath: file, Line: line, @@ -97,7 +97,7 @@ func TestAnalyzeRoutes_FilterByKind(t *testing.T) { } } -func addModelTableEdge(g *graph.Graph, from, to, orm, table, derivation string) { +func addModelTableEdge(g graph.Store, from, to, orm, table, derivation string) { g.AddNode(&graph.Node{ID: to, Kind: graph.KindTable, Name: table, Language: "go", Meta: map[string]any{"dialect": "orm"}}) g.AddEdge(&graph.Edge{ From: from, To: to, Kind: graph.EdgeModelsTable, @@ -151,7 +151,7 @@ func TestAnalyzeModels_FilterByTableSubstring(t *testing.T) { } } -func addRendersChildEdge(g *graph.Graph, from, to, name string, line int) { +func addRendersChildEdge(g graph.Store, from, to, name string, line int) { g.AddEdge(&graph.Edge{ From: from, To: to, Kind: graph.EdgeRendersChild, Line: line, @@ -224,7 +224,7 @@ func TestAnalyzeComponents_EmptyOnNoEdges(t *testing.T) { } } 
-func addDbtModelNode(g *graph.Graph, id, name, framework, resourceType, materialized string) { +func addDbtModelNode(g graph.Store, id, name, framework, resourceType, materialized string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindTable, Name: name, Language: "sql", FilePath: name + ".sql", StartLine: 1, @@ -235,7 +235,7 @@ func addDbtModelNode(g *graph.Graph, id, name, framework, resourceType, material }) } -func addDbtColumn(g *graph.Graph, modelID, col string) { +func addDbtColumn(g graph.Store, modelID, col string) { colID := modelID + "::" + col g.AddNode(&graph.Node{ID: colID, Kind: graph.KindColumn, Name: col, Language: "sql"}) g.AddEdge(&graph.Edge{From: colID, To: modelID, Kind: graph.EdgeMemberOf}) diff --git a/internal/mcp/tools_analyze_goroutine_spawns_test.go b/internal/mcp/tools_analyze_goroutine_spawns_test.go index e70113ef..69df7f4a 100644 --- a/internal/mcp/tools_analyze_goroutine_spawns_test.go +++ b/internal/mcp/tools_analyze_goroutine_spawns_test.go @@ -34,7 +34,7 @@ func callAnalyzeGoroutineSpawns(t *testing.T, srv *Server, args map[string]any) // site is unique under the graph's edge-dedup key. Meta is dropped // when mode is empty so the analyzer's "modeless spawn" path is // exercisable. 
-func addSpawnEdge(g *graph.Graph, from, to, mode string, line int) { +func addSpawnEdge(g graph.Store, from, to, mode string, line int) { e := &graph.Edge{From: from, To: to, Kind: graph.EdgeSpawns, FilePath: "f.go", Line: line} if mode != "" { e.Meta = map[string]any{"mode": mode} diff --git a/internal/mcp/tools_analyze_health_score.go b/internal/mcp/tools_analyze_health_score.go index a61c4e58..790d9e69 100644 --- a/internal/mcp/tools_analyze_health_score.go +++ b/internal/mcp/tools_analyze_health_score.go @@ -10,6 +10,7 @@ import ( mcp "github.com/mark3labs/mcp-go/mcp" + "github.com/zzet/gortex/internal/analysis" "github.com/zzet/gortex/internal/graph" ) @@ -60,13 +61,13 @@ const ( // healthRollupRow is one per-file / per-repo aggregate row produced // when `roll_up` selects a non-symbol scope. type healthRollupRow struct { - Scope string `json:"scope"` // "file" | "repo" - Key string `json:"key"` // file path or repo prefix - AvgScore float64 `json:"avg_score"` - MinScore float64 `json:"min_score"` - MaxScore float64 `json:"max_score"` - Symbols int `json:"symbols"` - Grade string `json:"grade"` // derived from AvgScore + Scope string `json:"scope"` // "file" | "repo" + Key string `json:"key"` // file path or repo prefix + AvgScore float64 `json:"avg_score"` + MinScore float64 `json:"min_score"` + MaxScore float64 `json:"max_score"` + Symbols int `json:"symbols"` + Grade string `json:"grade"` // derived from AvgScore GradeCount map[string]int `json:"grade_counts"` } @@ -90,28 +91,28 @@ type healthDistribution struct { // input it was derived from, so the consumer can both rank and // explain the score. 
type healthScoreRow struct { - ID string `json:"id"` - Name string `json:"name"` - Kind string `json:"kind"` - File string `json:"file"` - Line int `json:"line"` + ID string `json:"id"` + Name string `json:"name"` + Kind string `json:"kind"` + File string `json:"file"` + Line int `json:"line"` Score float64 `json:"score"` Grade string `json:"grade"` // Axes — "_pct" suffix is the 0..100 health value; "_raw" is // the underlying input. Pointers because "no data" is a real // signal distinct from "score is zero". - CoveragePct *float64 `json:"coverage_pct,omitempty"` + CoveragePct *float64 `json:"coverage_pct,omitempty"` ComplexityPct *float64 `json:"complexity_pct,omitempty"` - RecencyPct *float64 `json:"recency_pct,omitempty"` - ChurnPct *float64 `json:"churn_pct,omitempty"` - - FanIn int `json:"fan_in"` - FanOut int `json:"fan_out"` - Crossings int `json:"community_crossings"` - AgeDays *int `json:"age_days,omitempty"` - Mods int `json:"session_mods"` - AxesUsed int `json:"axes_used"` + RecencyPct *float64 `json:"recency_pct,omitempty"` + ChurnPct *float64 `json:"churn_pct,omitempty"` + + FanIn int `json:"fan_in"` + FanOut int `json:"fan_out"` + Crossings int `json:"community_crossings"` + AgeDays *int `json:"age_days,omitempty"` + Mods int `json:"session_mods"` + AxesUsed int `json:"axes_used"` } // handleAnalyzeHealthScore aggregates the shipped enrichment into one @@ -120,15 +121,15 @@ type healthScoreRow struct { // Filters: // - path_prefix — keep only symbols whose file path starts with this. // - kinds — comma-separated (default function,method); "all" -// keeps every blame-eligible kind. +// keeps every blame-eligible kind. // - grade — comma-separated A..F subset; keeps only matching rows. // - min_score — drop rows whose composite score is below this. // - max_score — drop rows whose composite score is above this. 
// - min_axes — drop rows backed by fewer than this many axes -// (default 1; raise to 2-3 to demand multi-signal -// confidence at the cost of fewer rows). +// (default 1; raise to 2-3 to demand multi-signal +// confidence at the cost of fewer rows). // - limit — cap rows (default 200). Total still reports -// pre-truncation count. +// pre-truncation count. func (s *Server) handleAnalyzeHealthScore(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { args := req.GetArguments() pathPrefix := strings.TrimSpace(stringArg(args, "path_prefix")) @@ -156,25 +157,52 @@ func (s *Server) handleAnalyzeHealthScore(ctx context.Context, req mcp.CallToolR allowedKinds = parseAnalyzeKindsFilter(k) } - // Build fan-in / fan-out / community-crossing maps in one edge - // pass. Same arithmetic shape as FindHotspots — we read the - // raw axes here rather than calling FindHotspots so the per- - // node fan-in is available for symbols below its threshold. + // Build fan-in / fan-out / community-crossing maps. Same + // arithmetic shape as FindHotspots -- we read the raw axes here + // rather than calling FindHotspots so the per-node fan-in is + // available for symbols below its threshold. + // + // Fan-in / fan-out go through analysis.CollectFanCounts, which + // uses the NodeFanAggregator capability when the backend + // supports it (one bulk query per direction over the candidate + // id set) and falls back to a per-kind EdgesByKind stream + // otherwise. Crossings still need per-edge (from, to) for the + // Calls + References kinds -- streamed via EdgesByKind so even + // the fallback path never materialises the full edge set. 
nodeToComm := map[string]string{} if c := s.getCommunities(); c != nil { nodeToComm = c.NodeToComm } - fanIn := map[string]int{} - fanOut := map[string]int{} - crossings := map[string]int{} - for _, e := range s.graph.AllEdges() { - if e.Kind == graph.EdgeCalls || e.Kind == graph.EdgeReferences { - fanIn[e.To]++ + + // Pull only the candidate kinds from the store — most workspaces + // keep ~5-15% of nodes as functions/methods, so the kind pushdown + // drops the AllNodes materialisation by 1-2 orders of magnitude. + kindList := make([]graph.NodeKind, 0, len(allowedKinds)) + for k := range allowedKinds { + kindList = append(kindList, k) + } + scoped := s.scopedNodesByKinds(ctx, kindList) + candidateIDs := make([]string, 0, len(scoped)) + for _, n := range scoped { + if n == nil { + continue } - if e.Kind == graph.EdgeCalls { - fanOut[e.From]++ + if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { + continue } - if e.Kind == graph.EdgeCalls || e.Kind == graph.EdgeReferences { + candidateIDs = append(candidateIDs, n.ID) + } + fanIn, fanOut := analysis.CollectFanCounts(s.graph, candidateIDs, + []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences}, + []graph.EdgeKind{graph.EdgeCalls}, + ) + + crossings := map[string]int{} + for _, kind := range []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} { + for e := range s.graph.EdgesByKind(kind) { + if e == nil { + continue + } from := nodeToComm[e.From] to := nodeToComm[e.To] if from != "" && to != "" && from != to { @@ -190,9 +218,11 @@ func (s *Server) handleAnalyzeHealthScore(ctx context.Context, req mcp.CallToolR now := time.Now() + covRows := s.coverageByID() + blame := blameRowsByID(s.graph) rows := make([]healthScoreRow, 0, 128) - for _, n := range s.scopedNodes(ctx) { - if _, ok := allowedKinds[n.Kind]; !ok { + for _, n := range scoped { + if n == nil { continue } if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { @@ -215,7 +245,7 @@ func (s *Server) 
handleAnalyzeHealthScore(ctx context.Context, req mcp.CallToolR // Coverage axis — direct mapping (coverage_pct is already // 0..100, higher is healthier). - if pct, ok := n.Meta["coverage_pct"].(float64); ok { + if pct, ok := coveragePctFrom(covRows, n); ok { covHealth := clamp01(pct) row.CoveragePct = &covHealth weighted += covHealth * healthWeightCoverage @@ -243,11 +273,8 @@ func (s *Server) handleAnalyzeHealthScore(ctx context.Context, req mcp.CallToolR // Linear piecewise: fresh (≤30d) = 100; ok-zone // (30..365d) = 100→50; stale-zone (365..1095d) = 50→0; // dead (>1095d) = 0. - if ts, ok := extractTimestamp(n.Meta); ok { - ageDays := int(now.Sub(time.Unix(ts, 0)).Hours() / 24) - if ageDays < 0 { - ageDays = 0 - } + if ts, ok := lastAuthoredTSFrom(blame, n); ok { + ageDays := max(int(now.Sub(time.Unix(ts, 0)).Hours()/24), 0) row.AgeDays = &ageDays recHealth := recencyScore(ageDays) row.RecencyPct = &recHealth @@ -492,7 +519,7 @@ func computeHealthDistribution(rows []healthScoreRow) healthDistribution { // ascending slice of non-negative values. 0 = perfectly equal; // approaches 1 = maximally unequal. Standard formula: // -// G = ( 2 · Σ i·x_i / (n · Σ x_i) ) − (n+1)/n +// G = ( 2 · Σ i·x_i / (n · Σ x_i) ) − (n+1)/n // // Bails to 0 on the trivial cases (empty / all-zero) since dividing // by zero would produce NaN and the consumer reads "0" as the @@ -587,7 +614,6 @@ func repoPrefixForPath(s *Server, path string) string { return path } - // recencyScore maps days-since-last-commit to a 0..100 health value. // Piecewise linear so the curve is predictable to a human auditor; // no exponential decay because the threshold cliffs already encode @@ -629,24 +655,6 @@ func scoreGrade(score float64) string { } } -// extractTimestamp pulls the `timestamp` field out of meta.last_authored. -// Accepts both int64 (in-process enrichment) and float64 (json/gob -// round-trip lands integers as float64). Same shape recovery the -// stale-code analyzer uses. 
-func extractTimestamp(meta map[string]any) (int64, bool) { - la, ok := meta["last_authored"].(map[string]any) - if !ok { - return 0, false - } - if ts, ok := la["timestamp"].(int64); ok { - return ts, true - } - if f, ok := la["timestamp"].(float64); ok { - return int64(f), true - } - return 0, false -} - func clamp01(v float64) float64 { if v < 0 { return 0 diff --git a/internal/mcp/tools_analyze_health_score_test.go b/internal/mcp/tools_analyze_health_score_test.go index 05b54853..e42eea04 100644 --- a/internal/mcp/tools_analyze_health_score_test.go +++ b/internal/mcp/tools_analyze_health_score_test.go @@ -38,7 +38,7 @@ func callAnalyzeHealth(t *testing.T, srv *Server, extra map[string]any) map[stri // addHealthFn drops one function node into the graph with the given // id/file. Avoids re-using `addFn` from tools_analyze_concurrency_test.go // to keep this test file self-contained. -func addHealthFn(g *graph.Graph, id, file string, meta map[string]any) *graph.Node { +func addHealthFn(g graph.Store, id, file string, meta map[string]any) *graph.Node { n := &graph.Node{ ID: id, Kind: graph.KindFunction, Name: id, FilePath: file, StartLine: 1, EndLine: 5, diff --git a/internal/mcp/tools_analyze_history.go b/internal/mcp/tools_analyze_history.go index 20cd30b0..aa06161d 100644 --- a/internal/mcp/tools_analyze_history.go +++ b/internal/mcp/tools_analyze_history.go @@ -170,3 +170,43 @@ func (s *Server) symbolNamesInFile(filePath string) []string { sort.Strings(names) return names } + +// symbolNamesByFiles is the batched sibling of symbolNamesInFile. +// Returns a map filePath → sorted distinct names for every input +// path in one backend round-trip when the store implements +// FileSymbolNamesByPaths; falls back to the per-file loop otherwise. 
+// Used by find_co_changing_symbols and analyze fixes_history where +// the row count after truncation is bounded but each per-row name +// lookup was a separate query before — multiple thousand +// query-engine entry points per call on a disk backend. +func (s *Server) symbolNamesByFiles(paths []string) map[string][]string { + if len(paths) == 0 { + return nil + } + kinds := []graph.NodeKind{graph.KindFunction, graph.KindMethod, graph.KindType, graph.KindInterface} + out := make(map[string][]string, len(paths)) + if scanner, ok := s.graph.(graph.FileSymbolNamesByPaths); ok { + rows := scanner.FileSymbolNamesByPaths(paths, kinds) + seenPerFile := make(map[string]map[string]bool, len(paths)) + for _, r := range rows { + seen := seenPerFile[r.FilePath] + if seen == nil { + seen = make(map[string]bool) + seenPerFile[r.FilePath] = seen + } + if r.Name == "" || seen[r.Name] { + continue + } + seen[r.Name] = true + out[r.FilePath] = append(out[r.FilePath], r.Name) + } + for f := range out { + sort.Strings(out[f]) + } + return out + } + for _, p := range paths { + out[p] = s.symbolNamesInFile(p) + } + return out +} diff --git a/internal/mcp/tools_analyze_hotspot_modes.go b/internal/mcp/tools_analyze_hotspot_modes.go index 2783c4e1..c672575f 100644 --- a/internal/mcp/tools_analyze_hotspot_modes.go +++ b/internal/mcp/tools_analyze_hotspot_modes.go @@ -30,12 +30,13 @@ import ( // We don't fail when the meta is absent — the analyzer treats this // as a soft ranker, not a strict filter, so callers get *some* // ranking even on un-enriched graphs (the unweighted baseline). 
-func rerankHotspots(entries []analysis.HotspotEntry, g *graph.Graph, mode, direction string, windowDays int) []analysis.HotspotEntry { +func rerankHotspots(entries []analysis.HotspotEntry, g graph.Store, mode, direction string, windowDays int) []analysis.HotspotEntry { if windowDays <= 0 { windowDays = 30 } now := time.Now().UTC() window := time.Duration(windowDays) * 24 * time.Hour + blame := blameRowsByID(g) weighted := make([]analysis.HotspotEntry, 0, len(entries)) for _, e := range entries { @@ -46,7 +47,7 @@ func rerankHotspots(entries []analysis.HotspotEntry, g *graph.Graph, mode, direc var weight float64 switch mode { case "novelty": - weight = noveltyWeight(n, now, window) + weight = noveltyWeight(blame, n, now, window) case "directional": weight = directionalWeight(n, now, window, direction) default: @@ -70,8 +71,8 @@ func rerankHotspots(entries []analysis.HotspotEntry, g *graph.Graph, mode, direc // noveltyWeight returns 1.0 - days_since_last_authored / windowDays, // clamped to [0, 1]. Symbols missing the meta return 0 — they sort // to the bottom rather than getting a free "fully novel" pass. -func noveltyWeight(n *graph.Node, now time.Time, window time.Duration) float64 { - ts := nodeLastAuthoredTime(n) +func noveltyWeight(blame map[string]graph.BlameEnrichment, n *graph.Node, now time.Time, window time.Duration) float64 { + ts := nodeLastAuthoredTime(blame, n) if ts.IsZero() { return 0 } @@ -113,15 +114,12 @@ func directionalWeight(n *graph.Node, now time.Time, window time.Duration, direc // time.Time, or zero when the field isn't populated. Blame writes // the timestamp as a Unix int64; releases enrichment may write an // RFC3339 string — we tolerate both. 
-func nodeLastAuthoredTime(n *graph.Node) time.Time { - if n.Meta == nil { - return time.Time{} - } - la, ok := n.Meta["last_authored"].(map[string]any) - if !ok { +func nodeLastAuthoredTime(blame map[string]graph.BlameEnrichment, n *graph.Node) time.Time { + e, ok := lastAuthoredFrom(blame, n) + if !ok || e.Timestamp == 0 { return time.Time{} } - return decodeMetaTimestamp(la["timestamp"]) + return time.Unix(e.Timestamp, 0) } // nodeAddedInTime returns meta.added_in.timestamp as a time.Time, diff --git a/internal/mcp/tools_analyze_hotspot_modes_test.go b/internal/mcp/tools_analyze_hotspot_modes_test.go index e9492425..0a9bc5f9 100644 --- a/internal/mcp/tools_analyze_hotspot_modes_test.go +++ b/internal/mcp/tools_analyze_hotspot_modes_test.go @@ -12,7 +12,7 @@ import ( // buildHotspotRerankFixture seeds three function nodes with deterministic // complexity scores AND varying blame / releases metadata so the // novelty / directional modes can reorder them in predictable ways. -func buildHotspotRerankFixture(t *testing.T, now time.Time) (*graph.Graph, []analysis.HotspotEntry) { +func buildHotspotRerankFixture(t *testing.T, now time.Time) (graph.Store, []analysis.HotspotEntry) { t.Helper() g := graph.New() @@ -127,11 +127,11 @@ func TestNoveltyWeight_LinearDecay(t *testing.T) { window := 30 * 24 * time.Hour // Day 0 → weight 1.0 n := &graph.Node{Meta: map[string]any{"last_authored": map[string]any{"timestamp": now.Unix()}}} - assert.InDelta(t, 1.0, noveltyWeight(n, now, window), 1e-6) + assert.InDelta(t, 1.0, noveltyWeight(nil, n, now, window), 1e-6) // Day 15 → weight 0.5 n.Meta["last_authored"] = map[string]any{"timestamp": now.Add(-15 * 24 * time.Hour).Unix()} - assert.InDelta(t, 0.5, noveltyWeight(n, now, window), 1e-2) + assert.InDelta(t, 0.5, noveltyWeight(nil, n, now, window), 1e-2) // Day 30+ → weight 0 n.Meta["last_authored"] = map[string]any{"timestamp": now.Add(-31 * 24 * time.Hour).Unix()} - assert.InDelta(t, 0.0, noveltyWeight(n, now, window), 1e-6) + 
assert.InDelta(t, 0.0, noveltyWeight(nil, n, now, window), 1e-6) } diff --git a/internal/mcp/tools_analyze_impact.go b/internal/mcp/tools_analyze_impact.go index 4235c695..6618bc4c 100644 --- a/internal/mcp/tools_analyze_impact.go +++ b/internal/mcp/tools_analyze_impact.go @@ -9,6 +9,7 @@ import ( mcp "github.com/mark3labs/mcp-go/mcp" + "github.com/zzet/gortex/internal/analysis" "github.com/zzet/gortex/internal/graph" "github.com/zzet/gortex/internal/reach" ) @@ -135,14 +136,61 @@ func (s *Server) handleAnalyzeImpactComposite(ctx context.Context, req mcp.CallT nodeToComm = c.NodeToComm } - // One edge pass builds direct fan-in plus, per symbol, the set of - // distinct communities its call/reference neighbours belong to. - fanIn := map[string]int{} + // Build the candidate id set up front so both the fan-in + // aggregator and the per-edge community walk stay bounded by + // the kinds / path / ids the caller actually asked for. Without + // this, the analyzer paid for an unfiltered AllEdges() + // materialisation per call -- ~500k edges over cgo on the gortex + // workspace, the bulk of the wall-clock cost on a disk backend. + scoped := s.scopedNodes(ctx) + candidateIDs := make([]string, 0, len(scoped)) + candidateSet := make(map[string]struct{}, len(scoped)) + for _, n := range scoped { + if n == nil { + continue + } + if allowedKinds != nil { + if _, ok := allowedKinds[n.Kind]; !ok { + continue + } + } + if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { + continue + } + if len(idFilter) > 0 { + if _, ok := idFilter[n.ID]; !ok { + continue + } + } + candidateIDs = append(candidateIDs, n.ID) + candidateSet[n.ID] = struct{}{} + } + + // fan-in: uses the NodeFanAggregator capability when the + // backend supports it (one bulk query per direction over the + // candidate id set) and falls back to a per-kind EdgesByKind + // stream otherwise. fanOutKinds is empty -- impact only reads + // fan-in. 
+ fanIn, _ := analysis.CollectFanCounts(s.graph, candidateIDs, + []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences}, + nil, + ) + + // neighborComms[n] = set of distinct communities of n's call / + // reference neighbours (both directions). Streamed via + // EdgesByKind per kind so neither backend pays for an + // unfiltered AllEdges walk; the per-kind MATCH on disk backends + // is the same plan EdgesByKind feeds every other analyzer. + // Membership is restricted to candidate ids -- a node outside + // the result set has nowhere to receive a span count. neighborComms := map[string]map[string]struct{}{} addComm := func(node, comm string) { if comm == "" { return } + if _, ok := candidateSet[node]; !ok { + return + } set := neighborComms[node] if set == nil { set = map[string]struct{}{} @@ -150,29 +198,23 @@ func (s *Server) handleAnalyzeImpactComposite(ctx context.Context, req mcp.CallT } set[comm] = struct{}{} } - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeCalls && e.Kind != graph.EdgeReferences { - continue - } - fanIn[e.To]++ - addComm(e.From, nodeToComm[e.To]) - addComm(e.To, nodeToComm[e.From]) - } - - rows := make([]impactRow, 0, 128) - for _, n := range s.scopedNodes(ctx) { - if allowedKinds != nil { - if _, ok := allowedKinds[n.Kind]; !ok { + for _, kind := range []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} { + for e := range s.graph.EdgesByKind(kind) { + if e == nil { continue } + addComm(e.From, nodeToComm[e.To]) + addComm(e.To, nodeToComm[e.From]) } - if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { + } + + rows := make([]impactRow, 0, len(candidateIDs)) + for _, n := range scoped { + if n == nil { continue } - if len(idFilter) > 0 { - if _, ok := idFilter[n.ID]; !ok { - continue - } + if _, ok := candidateSet[n.ID]; !ok { + continue } prVal := pr.ScoreOf(n.ID) diff --git a/internal/mcp/tools_analyze_infra.go b/internal/mcp/tools_analyze_infra.go index f15b1427..5537e3aa 100644 --- 
a/internal/mcp/tools_analyze_infra.go +++ b/internal/mcp/tools_analyze_infra.go @@ -67,12 +67,14 @@ func (s *Server) handleAnalyzeK8sResources(ctx context.Context, req mcp.CallTool c.usesEnv++ } } - for _, e := range s.graph.AllEdges() { - switch e.Kind { - case graph.EdgeDependsOn, graph.EdgeConfigures, graph.EdgeMounts, - graph.EdgeExposes, graph.EdgeUsesEnv: - bump(e.From, e.Kind) - } + for e := range edgesByKinds(s.graph, + graph.EdgeDependsOn, + graph.EdgeConfigures, + graph.EdgeMounts, + graph.EdgeExposes, + graph.EdgeUsesEnv, + ) { + bump(e.From, e.Kind) } var rows []*resourceRow @@ -148,10 +150,7 @@ func (s *Server) handleAnalyzeImages(ctx context.Context, req mcp.CallToolReques } consumers := make(map[string]int) - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeDependsOn { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeDependsOn) { consumers[e.To]++ } @@ -227,11 +226,8 @@ func (s *Server) handleAnalyzeKustomize(ctx context.Context, req mcp.CallToolReq c.res++ } } - for _, e := range s.graph.AllEdges() { - switch e.Kind { - case graph.EdgeDependsOn, graph.EdgeReferences: - bump(e.From, e.Kind) - } + for e := range edgesByKinds(s.graph, graph.EdgeDependsOn, graph.EdgeReferences) { + bump(e.From, e.Kind) } var rows []*overlayRow diff --git a/internal/mcp/tools_analyze_infra_test.go b/internal/mcp/tools_analyze_infra_test.go index fe13942b..2a78550e 100644 --- a/internal/mcp/tools_analyze_infra_test.go +++ b/internal/mcp/tools_analyze_infra_test.go @@ -33,7 +33,7 @@ func callAnalyzeInfra(t *testing.T, srv *Server, kind string, args map[string]an return out } -func seedK8sFixture(g *graph.Graph) { +func seedK8sFixture(g graph.Store) { deploy := &graph.Node{ ID: "k8s::Deployment::prod::api", Kind: graph.KindResource, Name: "api", FilePath: "k8s/api.yaml", StartLine: 1, diff --git a/internal/mcp/tools_analyze_kcore.go b/internal/mcp/tools_analyze_kcore.go new file mode 100644 index 00000000..09f97bf4 --- /dev/null +++ 
b/internal/mcp/tools_analyze_kcore.go @@ -0,0 +1,142 @@ +// kcore — find the densely connected core of the graph. +// +// k-core decomposition assigns every node a k-degree: the largest +// k for which the node remains in the k-core after iteratively +// pruning nodes with degree < k. Nodes with high k-degree sit at +// the densely connected centre of the graph — useful for "what's +// the core infrastructure every other layer depends on", and as a +// complement to PageRank (which weights by random-walk authority, +// not local density). +// +// Routing: +// +// - When the backing graph.Store implements graph.KCorer (today +// only store_sqlite), the analyzer delegates to the engine- +// native parallel implementation. +// +// - Otherwise analysis.ComputeKCore runs in-process. The +// implementation is the classic Batagelj & Zaversnik bucket +// algorithm — O(V + E), no recursion. + +package mcp + +import ( + "context" + "fmt" + "sort" + "strings" + + "github.com/mark3labs/mcp-go/mcp" + + "github.com/zzet/gortex/internal/analysis" + "github.com/zzet/gortex/internal/graph" +) + +// kcoreRow is the per-symbol shape the analyzer returns. +type kcoreRow struct { + ID string `json:"id"` + Name string `json:"name,omitempty"` + Kind string `json:"kind,omitempty"` + FilePath string `json:"file_path,omitempty"` + Line int `json:"line,omitempty"` + KDegree int `json:"k_degree"` +} + +func (s *Server) handleAnalyzeKCore(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { + args := req.GetArguments() + + limit := 20 + if v, ok := args["limit"].(float64); ok && v > 0 { + limit = int(v) + } + minDegree := 0 + if v, ok := args["min_degree"].(float64); ok && v > 0 { + minDegree = int(v) + } + + hits := s.runKCore(graph.KCoreOpts{ + NodeKinds: parseKindFilter(stringArg(args, "node_kinds")), + }) + + // Filter by min_degree (drop trivial low-core nodes), then cap. 
+ if minDegree > 0 { + filtered := hits[:0] + for _, h := range hits { + if h.KDegree >= int64(minDegree) { + filtered = append(filtered, h) + } + } + hits = filtered + } + if limit > 0 && limit < len(hits) { + hits = hits[:limit] + } + + // Batch-materialise hit nodes in one backend round-trip — same + // rationale as analyze(pagerank). Preserves the descending + // k-degree order from runKCore. + ids := make([]string, 0, len(hits)) + for _, h := range hits { + if h.NodeID != "" { + ids = append(ids, h.NodeID) + } + } + nodeByID := s.graph.GetNodesByIDs(ids) + + rows := make([]kcoreRow, 0, len(hits)) + for _, h := range hits { + row := kcoreRow{ID: h.NodeID, KDegree: int(h.KDegree)} + if n := nodeByID[h.NodeID]; n != nil { + row.Name = n.Name + row.Kind = string(n.Kind) + row.FilePath = n.FilePath + row.Line = n.StartLine + } + rows = append(rows, row) + } + + if s.isGCX(ctx, req) { + return s.gcxResponseWithBudget(req)(encodeAnalyze("kcore", rows)) + } + if isCompact(req) { + var b strings.Builder + for _, r := range rows { + fmt.Fprintf(&b, "%s %s %s:%d k=%d\n", r.Kind, r.ID, r.FilePath, r.Line, r.KDegree) + } + return mcp.NewToolResultText(b.String()), nil + } + return s.respondJSONOrTOON(ctx, req, map[string]any{"kcore": rows, "count": len(rows)}) +} + +// runKCore picks the engine-native KCorer when available, +// otherwise falls back to the in-process implementation. Returns +// hits sorted by k-degree descending (the engine-native CALL +// returns them unordered; the in-process ComputeKCore returns +// already sorted — normalise both here so the handler doesn't +// have to re-sort). 
+func (s *Server) runKCore(opts graph.KCoreOpts) []graph.KCoreHit { + if store := s.backendStore(); store != nil { + if kc, ok := store.(graph.KCorer); ok { + hits, err := kc.KCoreDecomposition(opts) + if err == nil { + sort.Slice(hits, func(i, j int) bool { + if hits[i].KDegree != hits[j].KDegree { + return hits[i].KDegree > hits[j].KDegree + } + return hits[i].NodeID < hits[j].NodeID + }) + return hits + } + // Engine-native error falls through. + } + } + res := analysis.ComputeKCore(s.graph, analysis.KCoreOptions{ + NodeKinds: opts.NodeKinds, + EdgeKinds: opts.EdgeKinds, + }) + out := make([]graph.KCoreHit, len(res)) + for i, h := range res { + out[i] = graph.KCoreHit{NodeID: h.NodeID, KDegree: int64(h.KDegree)} + } + return out +} diff --git a/internal/mcp/tools_analyze_orphan_tables_test.go b/internal/mcp/tools_analyze_orphan_tables_test.go index 9ad3295e..6c57fb1b 100644 --- a/internal/mcp/tools_analyze_orphan_tables_test.go +++ b/internal/mcp/tools_analyze_orphan_tables_test.go @@ -33,7 +33,7 @@ func callAnalyzeOrphanTables(t *testing.T, srv *Server, args map[string]any) map // addTable + addQuery + addMigration are tiny helpers that mirror the // shape the indexer produces. Kept inside the test so it doesn't grow // production-side scaffolding. 
-func addTable(g *graph.Graph, id, table, dialect string) { +func addTable(g graph.Store, id, table, dialect string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindTable, @@ -45,7 +45,7 @@ func addTable(g *graph.Graph, id, table, dialect string) { }) } -func addQueryEdge(g *graph.Graph, fromID, toID string) { +func addQueryEdge(g graph.Store, fromID, toID string) { g.AddEdge(&graph.Edge{ From: fromID, To: toID, @@ -53,7 +53,7 @@ func addQueryEdge(g *graph.Graph, fromID, toID string) { }) } -func addMigrationEdge(g *graph.Graph, fromID, toID string) { +func addMigrationEdge(g graph.Store, fromID, toID string) { g.AddEdge(&graph.Edge{ From: fromID, To: toID, diff --git a/internal/mcp/tools_analyze_ownership_test.go b/internal/mcp/tools_analyze_ownership_test.go index b5042b7e..a6496b73 100644 --- a/internal/mcp/tools_analyze_ownership_test.go +++ b/internal/mcp/tools_analyze_ownership_test.go @@ -33,7 +33,7 @@ func callAnalyzeOwnership(t *testing.T, srv *Server, args map[string]any) map[st // addBlameNode wires a function node with synthetic last_authored // meta keyed off email + timestamp. -func addBlameNode(g *graph.Graph, id, file, email string, ts int64) { +func addBlameNode(g graph.Store, id, file, email string, ts int64) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindFunction, diff --git a/internal/mcp/tools_analyze_pagerank.go b/internal/mcp/tools_analyze_pagerank.go new file mode 100644 index 00000000..c5274d36 --- /dev/null +++ b/internal/mcp/tools_analyze_pagerank.go @@ -0,0 +1,277 @@ +// pagerank — graph-EXTRACTION-flavoured centrality analysis. +// +// analyze kind=pagerank ranks symbols by PageRank authority: a +// symbol is "central" when central symbols depend on it, so a +// rarely-called API that's invoked from every domain layer ranks +// higher than a heavily-called test helper. This is qualitatively +// different from the degree-based `hotspots` analyzer — random-walk +// authority weights influence by reach, not by raw fan-in count. 
+// +// Routing: +// +// - When the backing graph.Store implements graph.PageRanker +// (today only store_sqlite), the analyzer delegates to the +// engine-native parallel implementation (Ligra-based). Saves +// the per-call cost of a fresh Go-side power iteration. +// +// - Otherwise (in-memory store), falls back to +// analysis.ComputePageRank — the same pure-Go implementation +// the search rerank pipeline consumes via the cached +// Server.pageRank field. + +package mcp + +import ( + "context" + "fmt" + "sort" + "strings" + + "github.com/mark3labs/mcp-go/mcp" + + "github.com/zzet/gortex/internal/analysis" + "github.com/zzet/gortex/internal/graph" +) + +// pageRankRow is the per-symbol shape the analyzer returns. +type pageRankRow struct { + ID string `json:"id"` + Name string `json:"name,omitempty"` + Kind string `json:"kind,omitempty"` + FilePath string `json:"file_path,omitempty"` + Line int `json:"line,omitempty"` + Rank float64 `json:"rank"` +} + +func (s *Server) handleAnalyzePageRank(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { + args := req.GetArguments() + + limit := 20 + if v, ok := args["limit"].(float64); ok && v > 0 { + limit = int(v) + } + damping := 0.0 + if v, ok := args["damping"].(float64); ok && v > 0 && v < 1 { + damping = v + } + maxIter := 0 + if v, ok := args["max_iterations"].(float64); ok && v > 0 { + maxIter = int(v) + } + tolerance := 0.0 + if v, ok := args["tolerance"].(float64); ok && v > 0 { + tolerance = v + } + nodeKinds := parseKindFilter(stringArg(args, "node_kinds")) + + hits := s.runPageRank(graph.PageRankOpts{ + NodeKinds: nodeKinds, + DampingFactor: damping, + MaxIterations: maxIter, + Tolerance: tolerance, + Limit: limit, + }) + + // Batch-materialise hit nodes in one backend round-trip instead + // of per-id GetNode. On a disk backend each GetNode is a + // round-trip; on the default limit (20) the per-id path issued 20 + // round-trips per pagerank invocation. 
Single GetNodesByIDs + // collapses that into one bulk query while preserving rank order + // (the local map lookup is keyed by NodeID). + ids := make([]string, 0, len(hits)) + for _, h := range hits { + if h.NodeID != "" { + ids = append(ids, h.NodeID) + } + } + nodeByID := s.graph.GetNodesByIDs(ids) + + rows := make([]pageRankRow, 0, len(hits)) + for _, h := range hits { + row := pageRankRow{ID: h.NodeID, Rank: h.Rank} + if n := nodeByID[h.NodeID]; n != nil { + row.Name = n.Name + row.Kind = string(n.Kind) + row.FilePath = n.FilePath + row.Line = n.StartLine + } + rows = append(rows, row) + } + + if s.isGCX(ctx, req) { + return s.gcxResponseWithBudget(req)(encodeAnalyze("pagerank", rows)) + } + if isCompact(req) { + var b strings.Builder + for _, r := range rows { + fmt.Fprintf(&b, "%s %s %s:%d rank=%.6f\n", r.Kind, r.ID, r.FilePath, r.Line, r.Rank) + } + return mcp.NewToolResultText(b.String()), nil + } + return s.respondJSONOrTOON(ctx, req, map[string]any{"pagerank": rows, "count": len(rows)}) +} + +// runPageRank picks the engine-native PageRanker when the +// backing store implements it, otherwise falls back to the +// in-process power iteration. +func (s *Server) runPageRank(opts graph.PageRankOpts) []graph.PageRankHit { + if store := s.backendStore(); store != nil { + if pr, ok := store.(graph.PageRanker); ok { + hits, err := pr.PageRank(opts) + if err == nil { + return hits + } + // Fall through to the in-process path on backend + // error rather than surface a half-completed + // result; engine-native is a hot path optimisation, + // not the source of truth. + } + } + // Fallback: pure-Go power iteration on the in-memory mirror. + // analysis.ComputePageRank doesn't accept the same options + // as the engine-native call yet — it uses fixed damping / + // iteration constants — so opts.DampingFactor / MaxIterations + // / Tolerance are silently ignored on the fallback path. The + // NodeKinds filter is honoured by post-filtering the result. 
+ res := analysis.ComputePageRank(s.graph) + if res == nil || len(res.Scores) == 0 { + return nil + } + allow := makeKindAllow(opts.NodeKinds) + hits := make([]graph.PageRankHit, 0, len(res.Scores)) + for id, rank := range res.Scores { + if !allow(s.graph.GetNode(id)) { + continue + } + hits = append(hits, graph.PageRankHit{NodeID: id, Rank: rank}) + } + sort.Slice(hits, func(i, j int) bool { return hits[i].Rank > hits[j].Rank }) + if opts.Limit > 0 && opts.Limit < len(hits) { + hits = hits[:opts.Limit] + } + return hits +} + +// backendStore returns the underlying graph.Store the indexer +// writes to — which is what implements the capability interfaces +// (PageRanker, CommunityDetector, …). Falls back to s.graph when +// no indexer is wired so test fixtures keep working. +func (s *Server) backendStore() graph.Store { + if s.indexer != nil { + return s.indexer.Graph() + } + return s.graph +} + +// parseKindFilter parses a comma-separated list of graph node +// kinds (e.g. "function,method,type") into a typed slice. Empty +// input → empty slice (caller treats that as "no filter"). +func parseKindFilter(in string) []graph.NodeKind { + in = strings.TrimSpace(in) + if in == "" { + return nil + } + parts := strings.Split(in, ",") + out := make([]graph.NodeKind, 0, len(parts)) + for _, p := range parts { + p = strings.TrimSpace(p) + if p == "" { + continue + } + out = append(out, graph.NodeKind(p)) + } + return out +} + +// handleAnalyzeLouvain returns the Louvain partitioning of the +// graph. When the backing store implements graph.CommunityDetector +// (today only store_sqlite), the partitioning is delegated to the +// engine-native implementation and threaded through the existing +// label / hub / cohesion / parent post-processing +// (analysis.DetectCommunitiesLouvainBackend) so the response is +// shape-identical to the in-process path. Otherwise the in-process +// DetectCommunitiesLouvain runs. 
+// +// Distinct from `analyze kind=clusters` which uses the Leiden +// algorithm (the Server's cached communities). Louvain produces +// different — typically more granular — partitions; this kind +// exposes it as a first-class result for clients that want the +// Louvain shape specifically. +func (s *Server) handleAnalyzeLouvain(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { + limit := 50 + if v, ok := req.GetArguments()["limit"].(float64); ok && v > 0 { + limit = int(v) + } + + result := s.runLouvain() + if result == nil { + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "communities": []any{}, + "modularity": 0.0, + }) + } + + communities := result.Communities + if limit > 0 && limit < len(communities) { + communities = communities[:limit] + } + + if s.isGCX(ctx, req) { + return s.gcxResponseWithBudget(req)(encodeAnalyze("louvain", map[string]any{ + "communities": communities, + "modularity": result.Modularity, + })) + } + if isCompact(req) { + var b strings.Builder + fmt.Fprintf(&b, "modularity=%.4f communities=%d\n", result.Modularity, len(result.Communities)) + for _, c := range communities { + fmt.Fprintf(&b, " %s size=%d cohesion=%.3f label=%s hub=%s\n", + c.ID, c.Size, c.Cohesion, c.Label, c.Hub) + } + return mcp.NewToolResultText(b.String()), nil + } + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "communities": communities, + "modularity": result.Modularity, + "total": len(result.Communities), + }) +} + +// runLouvain picks the engine-native CommunityDetector when the +// backing store implements it, otherwise falls back to the +// pure-Go in-process Louvain. The output shape is identical +// either way (analysis.DetectCommunitiesLouvainBackend threads +// the engine-native partition through the same post-processing). 
+func (s *Server) runLouvain() *analysis.CommunityResult { + if store := s.backendStore(); store != nil { + if cd, ok := store.(graph.CommunityDetector); ok { + if r := analysis.DetectCommunitiesLouvainBackend(s.graph, cd); r != nil { + return r + } + // Engine-native error path falls through to the + // in-process implementation rather than surfacing + // a half-completed result. + } + } + return analysis.DetectCommunitiesLouvain(s.graph) +} + +// makeKindAllow returns a predicate that reports whether a node's +// kind passes the filter. nil node is always rejected (defensive). +func makeKindAllow(kinds []graph.NodeKind) func(*graph.Node) bool { + if len(kinds) == 0 { + return func(n *graph.Node) bool { return n != nil } + } + set := make(map[graph.NodeKind]struct{}, len(kinds)) + for _, k := range kinds { + set[k] = struct{}{} + } + return func(n *graph.Node) bool { + if n == nil { + return false + } + _, ok := set[n.Kind] + return ok + } +} diff --git a/internal/mcp/tools_analyze_pubsub_test.go b/internal/mcp/tools_analyze_pubsub_test.go index 1675cb49..d860bc87 100644 --- a/internal/mcp/tools_analyze_pubsub_test.go +++ b/internal/mcp/tools_analyze_pubsub_test.go @@ -30,7 +30,7 @@ func callAnalyzePubsub(t *testing.T, srv *Server, args map[string]any) map[strin return out } -func addPubsubTopic(g *graph.Graph, id, name, transport string) { +func addPubsubTopic(g graph.Store, id, name, transport string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindEvent, @@ -39,7 +39,7 @@ func addPubsubTopic(g *graph.Graph, id, name, transport string) { }) } -func addListensOnEdge(g *graph.Graph, from, to string) { +func addListensOnEdge(g graph.Store, from, to string) { g.AddEdge(&graph.Edge{From: from, To: to, Kind: graph.EdgeListensOn}) } diff --git a/internal/mcp/tools_analyze_role.go b/internal/mcp/tools_analyze_role.go index 7d7c1ee2..a07ac165 100644 --- a/internal/mcp/tools_analyze_role.go +++ b/internal/mcp/tools_analyze_role.go @@ -103,7 +103,7 @@ func (s *Server) 
handleAnalyzeRole(ctx context.Context, req mcp.CallToolRequest) // the first matching label. Rules are deliberately conservative; // false-negatives (defaulting to "core") are preferable to noisy // false-positives on a label that pretends to be authoritative. -func classifyRole(n *graph.Node, fanIn, fanOut int, g *graph.Graph, nodeToComm map[string]string) string { +func classifyRole(n *graph.Node, fanIn, fanOut int, g graph.Store, nodeToComm map[string]string) string { switch { case fanIn == 0 && fanOut == 0: return "dead" diff --git a/internal/mcp/tools_analyze_stale_code_test.go b/internal/mcp/tools_analyze_stale_code_test.go index c9ca9143..9e185a82 100644 --- a/internal/mcp/tools_analyze_stale_code_test.go +++ b/internal/mcp/tools_analyze_stale_code_test.go @@ -13,7 +13,7 @@ import ( // addBlameEnrichedNode wires a function node with synthetic // last_authored meta — emulating what blame.EnrichGraph would have // produced after a real run. -func addBlameEnrichedNode(g *graph.Graph, id, file string, line int, email, commit string, ageDays int) { +func addBlameEnrichedNode(g graph.Store, id, file string, line int, email, commit string, ageDays int) { ts := time.Now().Add(-time.Duration(ageDays*24) * time.Hour).Unix() g.AddNode(&graph.Node{ ID: id, diff --git a/internal/mcp/tools_analyze_stale_flags_test.go b/internal/mcp/tools_analyze_stale_flags_test.go index a0eab3c9..59d44f29 100644 --- a/internal/mcp/tools_analyze_stale_flags_test.go +++ b/internal/mcp/tools_analyze_stale_flags_test.go @@ -33,7 +33,7 @@ func callAnalyzeStaleFlags(t *testing.T, srv *Server, args map[string]any) map[s // addFlagWithCallers wires a flag node + N caller functions, each // stamped with last_authored.timestamp = ageDays ago. 
-func addFlagWithCallers(g *graph.Graph, flagID, provider, name string, callers map[string]int /* callerID → ageDays */) { +func addFlagWithCallers(g graph.Store, flagID, provider, name string, callers map[string]int /* callerID → ageDays */) { g.AddNode(&graph.Node{ ID: flagID, Kind: graph.KindFlag, diff --git a/internal/mcp/tools_analyze_string_downstream.go b/internal/mcp/tools_analyze_string_downstream.go index 9941c005..faf96bc3 100644 --- a/internal/mcp/tools_analyze_string_downstream.go +++ b/internal/mcp/tools_analyze_string_downstream.go @@ -52,10 +52,7 @@ func (s *Server) handleAnalyzeLogEvents(ctx context.Context, req mcp.CallToolReq Emitters []string `json:"emitters,omitempty"` } byString := map[string]*logRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeEmits { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeEmits) { n := s.graph.GetNode(e.To) if n == nil || n.Kind != graph.KindString { continue @@ -224,10 +221,7 @@ func (s *Server) handleAnalyzeSQLCallSites(ctx context.Context, req mcp.CallTool Writes int `json:"writes"` } bySite := map[string]*sqlCallSite{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeQueries { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeQueries) { row, ok := bySite[e.From] if !ok { name, file := e.From, "" diff --git a/internal/mcp/tools_analyze_string_downstream_test.go b/internal/mcp/tools_analyze_string_downstream_test.go index e7bbc1f1..8fc6f568 100644 --- a/internal/mcp/tools_analyze_string_downstream_test.go +++ b/internal/mcp/tools_analyze_string_downstream_test.go @@ -36,7 +36,7 @@ func callAnalyze(t *testing.T, srv *Server, kind string, extra map[string]any) m // addEmitToKindString builds a (caller, KindString) emit pair with // the given context and meta. Used by the registry-downstream // analyzers' tests. 
-func addEmitToKindString(g *graph.Graph, caller, strID, value, ctx string, nodeMeta, edgeMeta map[string]any) { +func addEmitToKindString(g graph.Store, caller, strID, value, ctx string, nodeMeta, edgeMeta map[string]any) { meta := map[string]any{ "context": ctx, "value": value, diff --git a/internal/mcp/tools_analyze_string_emitters.go b/internal/mcp/tools_analyze_string_emitters.go index d96c8e58..6b51087d 100644 --- a/internal/mcp/tools_analyze_string_emitters.go +++ b/internal/mcp/tools_analyze_string_emitters.go @@ -34,10 +34,7 @@ func (s *Server) handleAnalyzeStringEmitters(ctx context.Context, req mcp.CallTo Emitters []string `json:"emitters,omitempty"` } byString := map[string]*stringRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeEmits { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeEmits) { n := s.graph.GetNode(e.To) if n == nil || n.Kind != graph.KindString { continue diff --git a/internal/mcp/tools_analyze_string_emitters_test.go b/internal/mcp/tools_analyze_string_emitters_test.go index 4406bdad..ca3aa829 100644 --- a/internal/mcp/tools_analyze_string_emitters_test.go +++ b/internal/mcp/tools_analyze_string_emitters_test.go @@ -30,7 +30,7 @@ func callAnalyzeStringEmitters(t *testing.T, srv *Server, args map[string]any) m return out } -func addStringNode(g *graph.Graph, id, value, ctx string) { +func addStringNode(g graph.Store, id, value, ctx string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindString, @@ -39,7 +39,7 @@ func addStringNode(g *graph.Graph, id, value, ctx string) { }) } -func addStringEmitEdge(g *graph.Graph, from, to, ctx, method string) { +func addStringEmitEdge(g graph.Store, from, to, ctx, method string) { g.AddEdge(&graph.Edge{ From: from, To: to, diff --git a/internal/mcp/tools_analyze_tests.go b/internal/mcp/tools_analyze_tests.go index 6e24d98d..fbce7c63 100644 --- a/internal/mcp/tools_analyze_tests.go +++ b/internal/mcp/tools_analyze_tests.go @@ -57,10 +57,7 @@ func (s *Server) 
handleAnalyzeTestsAsEdges(ctx context.Context, req mcp.CallTool testsBySymbol := make(map[string][]string) symbolsByTest := make(map[string][]string) edgeCount := 0 - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeTests { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeTests) { edgeCount++ testsBySymbol[e.To] = append(testsBySymbol[e.To], e.From) symbolsByTest[e.From] = append(symbolsByTest[e.From], e.To) @@ -71,9 +68,27 @@ func (s *Server) handleAnalyzeTestsAsEdges(ctx context.Context, req mcp.CallTool primary = symbolsByTest } + // Batch-fetch every primary key and every related ID in one bulk + // round-trip. On a repo with thousands of EdgeTests edges the old + // per-id GetNode pattern burned one round-trip per row plus + // one per related ID on a disk backend — easily 5-10k round-trips per + // analyze kind=tests_as_edges call. + idSet := make(map[string]struct{}, len(primary)) + for id, relatedIDs := range primary { + idSet[id] = struct{}{} + for _, rid := range relatedIDs { + idSet[rid] = struct{}{} + } + } + allIDs := make([]string, 0, len(idSet)) + for id := range idSet { + allIDs = append(allIDs, id) + } + nodeByID := s.graph.GetNodesByIDs(allIDs) + rows := make([]testEdgeRow, 0, len(primary)) for id, relatedIDs := range primary { - n := s.graph.GetNode(id) + n := nodeByID[id] if n == nil { continue } @@ -88,7 +103,7 @@ func (s *Server) handleAnalyzeTestsAsEdges(ctx context.Context, req mcp.CallTool } seen[rid] = true name := rid - if rn := s.graph.GetNode(rid); rn != nil { + if rn := nodeByID[rid]; rn != nil { name = rn.Name } related = append(related, testEdgeRef{ID: rid, Name: name}) diff --git a/internal/mcp/tools_analyze_todos_test.go b/internal/mcp/tools_analyze_todos_test.go index e2960fcd..2eaff6fe 100644 --- a/internal/mcp/tools_analyze_todos_test.go +++ b/internal/mcp/tools_analyze_todos_test.go @@ -12,7 +12,7 @@ import ( // addTodoNode is a small helper for these tests — wires a KindTodo // node directly into 
the graph without going through the indexer's // per-file pipeline. -func addTodoNode(g *graph.Graph, id string, line int, meta map[string]any) { +func addTodoNode(g graph.Store, id string, line int, meta map[string]any) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindTodo, diff --git a/internal/mcp/tools_architecture.go b/internal/mcp/tools_architecture.go index 78887a2c..9ab02ed7 100644 --- a/internal/mcp/tools_architecture.go +++ b/internal/mcp/tools_architecture.go @@ -2,6 +2,7 @@ package mcp import ( "context" + "maps" "sort" "strings" @@ -62,23 +63,37 @@ func (s *Server) handleGetArchitecture(ctx context.Context, req mcp.CallToolRequ topEntryPoints := max(req.GetInt("top_entry_points", 10), 1) pathPrefix := strings.TrimSpace(req.GetString("path_prefix", "")) - scoped := s.scopedNodes(ctx) - inScope := make(map[string]*graph.Node, len(scoped)) - for _, n := range scoped { - if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { - continue + // scoped + inScope are only needed when the session is bound or + // the caller supplied a path-prefix narrowing. Otherwise every + // node is in scope and downstream membership tests are tautologies + // the helpers handle via nil inScope. + _, _, bound := s.sessionScope(ctx) + needScoped := bound || pathPrefix != "" + var scoped []*graph.Node + var inScope map[string]bool + var totalNodesScoped int + if needScoped { + scoped = s.scopedNodes(ctx) + inScope = make(map[string]bool, len(scoped)) + for _, n := range scoped { + if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { + continue + } + inScope[n.ID] = true } - inScope[n.ID] = n + totalNodesScoped = len(inScope) + } else { + totalNodesScoped = s.graph.NodeCount() } // 1. Summary — language mix + node/edge counts. - summary := architectureSummary(scoped, inScope, s.graph) + summary := architectureSummary(scoped, inScope, totalNodesScoped, s.graph) // 2. Communities — same shape as the outline tool, capped here. 
communitiesSection := architectureCommunities(s.getCommunities(), inScope, topCommunities) // 3. Hotspots — load-bearing symbols, scoped + capped. - hotspots := architectureHotspots(s.graph, s.getCommunities(), inScope, topHotspots) + hotspots := architectureHotspots(s.getHotspots(), inScope, topHotspots) // 4. Entry points — functions with zero in-edges that have // out-edges (called by no one, calls into the system). Sorted @@ -127,7 +142,7 @@ func (s *Server) handleGetArchitecture(ctx context.Context, req mcp.CallToolRequ // unrecognised tier returns ("", message) so the handler can surface a // clean error. Otherwise it rolls the base graph up to the requested // tier via analysis.BuildHierarchy and returns the wire shape. -func architectureHierarchy(g *graph.Graph, cr *analysis.CommunityResult, resolution string) (map[string]any, string) { +func architectureHierarchy(g graph.Store, cr *analysis.CommunityResult, resolution string) (map[string]any, string) { resolution = strings.ToLower(strings.TrimSpace(resolution)) if resolution == "" { return nil, "" @@ -169,11 +184,23 @@ func architectureHierarchy(g *graph.Graph, cr *analysis.CommunityResult, resolut // architectureSummary builds the language mix + node/edge count // header. Edges are bounded to the scoped subgraph so multi-repo -// callers don't see cross-workspace numbers. -func architectureSummary(allScoped []*graph.Node, inScope map[string]*graph.Node, g *graph.Graph) map[string]any { +// callers don't see cross-workspace numbers. nil inScope is the +// signal that every node is in scope — the helper short-circuits +// the lang count through Stats() and the edge count through +// EdgeCount() rather than materialising the whole graph over cgo. 
+func architectureSummary(allScoped []*graph.Node, inScope map[string]bool, totalNodes int, g graph.Store) map[string]any { langCounts := map[string]int{} - for _, n := range inScope { - if n.Language != "" { + if inScope == nil { + // Unbound session + no path-prefix — pull the aggregate from + // the backend's cached stats. One indexed groupby vs a + // whole-table scan over cgo. + stats := g.Stats() + maps.Copy(langCounts, stats.ByLanguage) + } else { + for _, n := range allScoped { + if !inScope[n.ID] || n.Language == "" { + continue + } langCounts[n.Language]++ } } @@ -192,15 +219,23 @@ func architectureSummary(allScoped []*graph.Node, inScope map[string]*graph.Node return languages[i].Name < languages[j].Name }) - totalEdges := 0 - for _, e := range g.AllEdges() { - if _, ok := inScope[e.From]; !ok { - continue - } - if _, ok := inScope[e.To]; !ok { - continue + // Common case — unbound session + no path-prefix — every node + // is in scope so the edge count is exactly the backend's + // EdgeCount(), which is an O(1) lookup. Skips materialising + // every edge over cgo just to count them. 
+ var totalEdges int + if inScope == nil { + totalEdges = g.EdgeCount() + } else { + for _, e := range g.AllEdges() { + if !inScope[e.From] { + continue + } + if !inScope[e.To] { + continue + } + totalEdges++ } - totalEdges++ } primary := "" @@ -208,32 +243,40 @@ func architectureSummary(allScoped []*graph.Node, inScope map[string]*graph.Node primary = languages[0].Name } + unscopedCount := totalNodes + if inScope != nil { + unscopedCount = len(allScoped) + } return map[string]any{ - "total_nodes": len(inScope), - "total_nodes_unscoped": len(allScoped), + "total_nodes": totalNodes, + "total_nodes_unscoped": unscopedCount, "total_edges": totalEdges, "primary_language": primary, "languages": languages, } } -func architectureCommunities(cr *analysis.CommunityResult, inScope map[string]*graph.Node, top int) map[string]any { +func architectureCommunities(cr *analysis.CommunityResult, inScope map[string]bool, top int) map[string]any { out := map[string]any{"count": 0} if cr == nil { return out } kept := make([]analysis.Community, 0, len(cr.Communities)) for _, c := range cr.Communities { - // Drop communities with no members in scope. - match := false - for _, m := range c.Members { - if _, ok := inScope[m]; ok { - match = true - break + // nil inScope means "every node is in scope" — keep the + // community unconditionally. Otherwise drop the community + // when no member lands inside the session's workspace. 
+ if inScope != nil { + match := false + for _, m := range c.Members { + if inScope[m] { + match = true + break + } + } + if !match { + continue } - } - if !match { - continue } kept = append(kept, c) } @@ -261,13 +304,13 @@ func architectureCommunities(cr *analysis.CommunityResult, inScope map[string]*g return out } -func architectureHotspots(g *graph.Graph, cr *analysis.CommunityResult, inScope map[string]*graph.Node, top int) []map[string]any { +func architectureHotspots(hotspots []analysis.HotspotEntry, inScope map[string]bool, top int) []map[string]any { out := []map[string]any{} - for _, h := range analysis.FindHotspots(g, cr, 0) { + for _, h := range hotspots { if len(out) >= top { break } - if _, ok := inScope[h.ID]; !ok { + if inScope != nil && !inScope[h.ID] { continue } out = append(out, map[string]any{ @@ -284,24 +327,87 @@ func architectureHotspots(g *graph.Graph, cr *analysis.CommunityResult, inScope return out } -func architectureEntryPoints(inScope map[string]*graph.Node, g *graph.Graph, top int) []map[string]any { +// architectureEntryPoints returns functions/methods with zero +// incoming edges and at least one outgoing edge — the "called by +// no one, calls into the system" pattern. +// +// The candidate pool is either the kind-filtered subset of an in-scope +// node map (bound session / path-prefix narrowing) or — when inScope +// is nil — the function+method slice pulled directly from the storage +// layer via NodesByKindsScanner. The legacy code path walked the full +// scoped-nodes slice every call just to keep the callable subset. +// +// Uses NodeDegreeAggregator when the backend implements it (one +// batched in/out count instead of 2N GetInEdges/GetOutEdges +// round-trips on a disk backend — the per-node loop was the entire +// wall-clock cost of this section on large repos). 
+func architectureEntryPoints(inScope map[string]bool, g graph.Store, top int) []map[string]any { type entryCandidate struct { node *graph.Node fanOut int } - cands := make([]entryCandidate, 0, len(inScope)) - for _, n := range inScope { - if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { - continue + // Pre-filter on kind Go-side first. When inScope is nil pull + // only function/method via the kind scanner; otherwise project + // the same subset out of the supplied scope set. + var pool []*graph.Node + if inScope == nil { + if scan, ok := g.(graph.NodesByKindsScanner); ok { + pool = scan.NodesByKinds([]graph.NodeKind{graph.KindFunction, graph.KindMethod}) + } else { + all := g.AllNodes() + pool = make([]*graph.Node, 0, len(all)) + for _, n := range all { + if n.Kind == graph.KindFunction || n.Kind == graph.KindMethod { + pool = append(pool, n) + } + } } - if len(g.GetInEdges(n.ID)) > 0 { - continue + } else { + // Materialise the callable subset out of the in-scope node + // id set. The caller's scoped slice already lives in memory, + // so this stays cheap — but the inScope map carries bools, + // not nodes, so we re-resolve via GetNode for each id. 
+ pool = make([]*graph.Node, 0, len(inScope)) + for id := range inScope { + n := g.GetNode(id) + if n == nil { + continue + } + if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { + continue + } + pool = append(pool, n) } - out := len(g.GetOutEdges(n.ID)) - if out == 0 { - continue + } + cands := make([]entryCandidate, 0, len(pool)) + if agg, ok := g.(graph.NodeDegreeAggregator); ok && len(pool) > 0 { + ids := make([]string, 0, len(pool)) + byID := make(map[string]*graph.Node, len(pool)) + for _, n := range pool { + ids = append(ids, n.ID) + byID[n.ID] = n + } + for _, r := range agg.NodeDegreeCounts(ids, nil) { + if r.InCount > 0 || r.OutCount == 0 { + continue + } + n := byID[r.NodeID] + if n == nil { + continue + } + cands = append(cands, entryCandidate{node: n, fanOut: r.OutCount}) + } + } else { + for _, n := range pool { + if len(g.GetInEdges(n.ID)) > 0 { + continue + } + out := len(g.GetOutEdges(n.ID)) + if out == 0 { + continue + } + cands = append(cands, entryCandidate{node: n, fanOut: out}) } - cands = append(cands, entryCandidate{node: n, fanOut: out}) } sort.Slice(cands, func(i, j int) bool { if cands[i].fanOut != cands[j].fanOut { @@ -324,13 +430,13 @@ func architectureEntryPoints(inScope map[string]*graph.Node, g *graph.Graph, top return out } -func architectureProcesses(pr *analysis.ProcessResult, inScope map[string]*graph.Node, top int) []architectureProcess { +func architectureProcesses(pr *analysis.ProcessResult, inScope map[string]bool, top int) []architectureProcess { if pr == nil { return []architectureProcess{} } kept := make([]analysis.Process, 0, len(pr.Processes)) for _, p := range pr.Processes { - if _, ok := inScope[p.EntryPoint]; !ok { + if inScope != nil && !inScope[p.EntryPoint] { continue } kept = append(kept, p) @@ -361,22 +467,34 @@ func architectureProcesses(pr *analysis.ProcessResult, inScope map[string]*graph // architectureCrossRepo bundles every cross_repo_* edge into a // (from_repo, to_repo, kind) → count rollup. 
Empty list when no // cross-repo edges exist (single-repo mode). -func architectureCrossRepo(g *graph.Graph) []crossRepoRow { +// +// Picks the CrossRepoEdgeAggregator capability when the backend +// implements it (one server-side aggregate replaces the AllEdges + +// per-edge GetNode pair — typically ~286k edge rows + thousands +// of GetNode round-trips on a disk backend for <100 rows of output). Falls +// back to the AllEdges-driven loop on backends that don't. +func architectureCrossRepo(g graph.Store) []crossRepoRow { type key struct { kind, fromRepo, toRepo string } counts := map[key]int{} - for _, e := range g.AllEdges() { - if _, isCross := graph.BaseKindForCrossRepo(e.Kind); !isCross { - continue + if ag, ok := g.(graph.CrossRepoEdgeAggregator); ok { + for _, r := range ag.CrossRepoEdgeCounts() { + counts[key{kind: string(r.Kind), fromRepo: r.FromRepo, toRepo: r.ToRepo}] = r.Count } - from := g.GetNode(e.From) - to := g.GetNode(e.To) - if from == nil || to == nil { - continue + } else { + for _, e := range g.AllEdges() { + if _, isCross := graph.BaseKindForCrossRepo(e.Kind); !isCross { + continue + } + from := g.GetNode(e.From) + to := g.GetNode(e.To) + if from == nil || to == nil { + continue + } + k := key{kind: string(e.Kind), fromRepo: from.RepoPrefix, toRepo: to.RepoPrefix} + counts[k]++ } - k := key{kind: string(e.Kind), fromRepo: from.RepoPrefix, toRepo: to.RepoPrefix} - counts[k]++ } rows := make([]crossRepoRow, 0, len(counts)) for k, c := range counts { diff --git a/internal/mcp/tools_ast.go b/internal/mcp/tools_ast.go index 427c2e43..af8b83ac 100644 --- a/internal/mcp/tools_ast.go +++ b/internal/mcp/tools_ast.go @@ -178,8 +178,14 @@ func (s *Server) buildASTTargets(language, pathPrefix string, allowedRepos map[s return nil, fmt.Errorf("search_ast: no graph available") } out := make([]astquery.Target, 0, 256) - for _, n := range s.graph.AllNodes() { - if n.Kind != graph.KindFile { + // File nodes are a fraction of the node table; iterating the + 
// KindFile bucket via NodesByKind lets the backend stream only + // those rows instead of materialising the full table over cgo. + // Repo / language / path filters compose AND, so they stay Go- + // side — they can't be projected onto the bucket index without + // duplicating the predicate set across both call sites. + for n := range s.graph.NodesByKind(graph.KindFile) { + if n == nil { continue } if allowedRepos != nil && n.RepoPrefix != "" && !allowedRepos[n.RepoPrefix] { @@ -227,7 +233,7 @@ func (s *Server) buildASTTargets(language, pathPrefix string, allowedRepos map[s // than `min` incoming edges. Without an enclosing symbol, the // match is preserved (we'd otherwise silently swallow file-level // matches that legitimately have no caller graph). -func filterByMinFanIn(g *graph.Graph, matches []astquery.Match, min int) []astquery.Match { +func filterByMinFanIn(g graph.Store, matches []astquery.Match, min int) []astquery.Match { if g == nil || min <= 0 { return matches } diff --git a/internal/mcp/tools_check_references.go b/internal/mcp/tools_check_references.go index 28080a44..f958a781 100644 --- a/internal/mcp/tools_check_references.go +++ b/internal/mcp/tools_check_references.go @@ -81,14 +81,38 @@ func (s *Server) handleCheckReferences(ctx context.Context, req mcp.CallToolRequ callers := map[string]bool{} totalEdges := 0 if target != nil { - for _, e := range s.graph.GetInEdges(target.ID) { + // Pre-filter the in-edges and batch-fetch the surviving + // `From` nodes in one round-trip. On a disk backend the per-edge + // GetNode pattern was a round-trip per inbound edge — + // for heavily-referenced symbols (hundreds of callers) the + // cost was dominant. One GetNodesByIDs gives us the same + // data in a single bulk query. 
+ inEdges := s.graph.GetInEdges(target.ID) + fromIDs := make([]string, 0, len(inEdges)) + seenFrom := make(map[string]struct{}, len(inEdges)) + for _, e := range inEdges { if !isCheckRefEdge(e.Kind) { continue } if minTier != "" && !atOrAboveTier(string(e.Origin), minTier) { continue } - from := s.graph.GetNode(e.From) + if _, dup := seenFrom[e.From]; dup { + continue + } + seenFrom[e.From] = struct{}{} + fromIDs = append(fromIDs, e.From) + } + fromByID := s.graph.GetNodesByIDs(fromIDs) + + for _, e := range inEdges { + if !isCheckRefEdge(e.Kind) { + continue + } + if minTier != "" && !atOrAboveTier(string(e.Origin), minTier) { + continue + } + from := fromByID[e.From] if from != nil && excludeTests && isTestPath(from.FilePath) { continue } @@ -149,39 +173,13 @@ func (s *Server) handleCheckReferences(ctx context.Context, req mcp.CallToolRequ } } - // Importing-files scan — every node whose FilePath imports the - // target's FilePath. Today the graph encodes file-level imports - // via EdgeImports between file/import nodes; we walk those to - // answer "is the home package consumed at all?". - importingFiles := []string{} - if target != nil && target.FilePath != "" { - seen := map[string]bool{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeImports { - continue - } - toNode := s.graph.GetNode(e.To) - if toNode == nil { - continue - } - if toNode.FilePath != target.FilePath && toNode.ID != target.FilePath { - continue - } - fromNode := s.graph.GetNode(e.From) - if fromNode == nil { - continue - } - if excludeTests && isTestPath(fromNode.FilePath) { - continue - } - if seen[fromNode.FilePath] { - continue - } - seen[fromNode.FilePath] = true - importingFiles = append(importingFiles, fromNode.FilePath) - } - sort.Strings(importingFiles) - } + // Importing-files scan — every file whose nodes carry an + // EdgeImports edge into the target's FilePath. 
Backends that + // implement graph.FileImporters serve this from one query + // (no AllEdges() materialisation, no per-edge GetNode round- + // trip). The legacy AllEdges + per-edge GetNode loop stays as + // the fallback for backends that don't ship the capability. + importingFiles := s.collectImportingFiles(target, excludeTests) referenced := totalEdges > 0 || len(sameName) > 0 || len(importingFiles) > 0 @@ -199,6 +197,67 @@ func (s *Server) handleCheckReferences(ctx context.Context, req mcp.CallToolRequ }) } +// collectImportingFiles answers "which files import the file that +// holds target?". Prefers the graph.FileImporters capability when +// the backend implements it — that path runs one query +// instead of an AllEdges() scan plus 2× per-edge GetNode round-trip. +// Returns a sorted, deduplicated, optionally test-filtered slice +// of file paths. +// +// When target is nil or has no FilePath the question is undefined; +// returns an empty slice (consistent with the legacy behaviour). +func (s *Server) collectImportingFiles(target *graph.Node, excludeTests bool) []string { + importingFiles := []string{} + if target == nil || target.FilePath == "" { + return importingFiles + } + seen := map[string]bool{} + add := func(fromFile string) { + if fromFile == "" { + return + } + if excludeTests && isTestPath(fromFile) { + return + } + if seen[fromFile] { + return + } + seen[fromFile] = true + importingFiles = append(importingFiles, fromFile) + } + + if fi, ok := s.graph.(graph.FileImporters); ok { + for _, row := range fi.FileImporters(target.FilePath) { + add(row.FromFile) + } + sort.Strings(importingFiles) + return importingFiles + } + + // Fallback: pull every edge and filter Go-side. Identical + // pre-capability behaviour — only the cgo-heavy backend ever + // reaches this path. 
+ for _, e := range s.graph.AllEdges() { + if e.Kind != graph.EdgeImports { + continue + } + toNode := s.graph.GetNode(e.To) + if toNode == nil { + continue + } + if toNode.FilePath != target.FilePath && toNode.ID != target.FilePath { + continue + } + fromNode := s.graph.GetNode(e.From) + if fromNode == nil { + continue + } + add(fromNode.FilePath) + } + sort.Strings(importingFiles) + return importingFiles +} + // isCheckRefEdge identifies edges that mean "this symbol is being // used". Mirrors safe_delete_symbol's referencing-edge filter so // the two tools agree on what "referenced" means. diff --git a/internal/mcp/tools_churn.go b/internal/mcp/tools_churn.go index 5c6aa027..f53082d4 100644 --- a/internal/mcp/tools_churn.go +++ b/internal/mcp/tools_churn.go @@ -4,27 +4,26 @@ import ( "context" "sort" "strings" - "time" "github.com/mark3labs/mcp-go/mcp" - "github.com/zzet/gortex/internal/blame" + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/query" ) -// registerChurnRateTool wires get_churn_rate — a standalone MCP tool -// that exposes per-symbol git-commit density. The metric is already -// implicit in `analyze hotspots` (composite); this tool surfaces the -// raw number so refactor planning, code review, and bus-factor work -// can read it directly. +// registerChurnRateTool wires get_churn_rate — a pure graph scan over +// per-symbol churn metadata pre-computed by `gortex enrich churn`. // -// Computation: walk the scoped subgraph for function/method nodes, -// group by file_path, run `git blame -p` once per unique file, count -// distinct commits whose blame range intersects the symbol's line -// range. Bounded by file count, not symbol count. +// At read time the handler does NOT shell out to git. Every value it +// returns lives in n.Meta["churn"] on the node, populated either by +// the CLI/git-hook (which writes through the on-disk backend) or by +// an in-process call to the enrich_churn MCP tool. 
When no node in +// scope has the data, the response is a structured error pointing +// the agent at the enrich command. func (s *Server) registerChurnRateTool() { s.addTool( mcp.NewTool("get_churn_rate", - mcp.WithDescription("Per-symbol git-commit density. For each function/method in scope, runs `git blame -p` once per unique file and counts distinct commits intersecting the symbol's line range. Returns {symbol_id, name, file, churn_rate (commits per active day), commit_count, age_days, last_author, last_commit_at}. Sort and filter by churn_rate or commit_count to find unstable abstractions, hidden coupling, and bus-factor risks. Pairs with `analyze hotspots` — that returns the composite; this returns the raw signal."), + mcp.WithDescription("Per-symbol git-commit density, read from pre-computed graph data. For each function/method in scope returns {symbol_id, name, file, churn_rate (commits per active day), commit_count, age_days, last_author, last_commit_at}. Sort and filter by churn_rate or commit_count to find unstable abstractions, hidden coupling, and bus-factor risks. Data is populated by `gortex enrich churn` (or the enrich_churn MCP tool); when nothing in scope has churn meta the tool returns a structured error with the suggested next command. No git subprocess at request time — sub-second on indexed repos."), mcp.WithString("path_prefix", mcp.Description("Scope analysis to nodes under this file-path prefix.")), mcp.WithNumber("min_commits", mcp.Description("Only return symbols with at least this many commits (default: 1).")), mcp.WithString("kinds", mcp.Description("Comma-separated kinds (default: function,method). Pass 'all' for every symbol.")), @@ -65,104 +64,93 @@ func (s *Server) handleGetChurnRate(ctx context.Context, req mcp.CallToolRequest allowed = nil } - // Resolve the repo root once so blame.Run can be called with a - // fixed cwd. 
In multi-repo mode each file lives under one of the - // MultiIndexer repos; we resolve per-file with resolveFilePath. - scoped := s.scopedNodes(ctx) - byFile := map[string][]*graph.Node{} - for _, n := range scoped { - if allowed != nil { - if _, ok := allowed[n.Kind]; !ok { - continue - } - } - if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { - continue - } - if n.StartLine == 0 { - continue - } - byFile[n.FilePath] = append(byFile[n.FilePath], n) - } - - rows := make([]churnRow, 0, len(scoped)) - scannedFiles := 0 - for filePath, nodes := range byFile { - abs, _, err := s.resolveFilePath(filePath) - if err != nil { - continue - } - workTree := repoRootContaining(abs) - if workTree == "" { - continue - } - // Convert absolute path back to a path relative to the git - // work tree — git blame takes tree-relative paths. - gitRel := abs - if rel, err := stripPathPrefix(abs, workTree+"/"); err == nil { - gitRel = rel - } - lines, err := blame.Run(workTree, gitRel) - if err != nil || len(lines) == 0 { - continue - } - scannedFiles++ + rows := make([]churnRow, 0, 64) + seenFiles := map[string]struct{}{} + sawMeta := false - for _, n := range nodes { - endLine := n.EndLine - if endLine == 0 { - endLine = n.StartLine + usedSidecar := false + if reader, ok := s.graph.(graph.ChurnEnrichmentReader); ok { + // Sidecar fast-path (change A): read the typed churn rows via an + // index over the (small) enriched set, then resolve their nodes + // in one batch — instead of scanning AllNodes and gob-decoding + // every meta blob to peek at Meta["churn"]. 
+ if enrich := reader.ChurnRows(""); len(enrich) > 0 { + usedSidecar = true + sawMeta = true + ids := make([]string, 0, len(enrich)) + for _, e := range enrich { + ids = append(ids, e.NodeID) + } + nodes := s.graph.GetNodesByIDs(ids) + sessWS, _, bound := s.sessionScope(ctx) + var opts query.QueryOptions + if bound { + opts = query.QueryOptions{WorkspaceID: sessWS} } - commits := map[string]bool{} - oldest, newest := time.Time{}, time.Time{} - latestEmail := "" - for line := n.StartLine; line <= endLine; line++ { - a, ok := lines[line] - if !ok { + for _, e := range enrich { + n := nodes[e.NodeID] + if n == nil { continue } - if !commits[a.Commit] { - commits[a.Commit] = true + if bound && !opts.ScopeAllows(n) { + continue } - if oldest.IsZero() || a.Timestamp.Before(oldest) { - oldest = a.Timestamp + if allowed != nil { + if _, ok := allowed[n.Kind]; !ok { + continue + } } - if newest.IsZero() || a.Timestamp.After(newest) { - newest = a.Timestamp - latestEmail = a.Email + if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { + continue } + if e.CommitCount < minCommits { + continue + } + rows = append(rows, churnRowFromEnrichment(n, e)) + seenFiles[n.FilePath] = struct{}{} } - if len(commits) == 0 || len(commits) < minCommits { - continue - } - ageDays := 0 - if !oldest.IsZero() { - ageDays = int(time.Since(oldest).Hours() / 24) + } + } + if !usedSidecar { + // Fallback: no sidecar rows yet (un-migrated DB, recompute-on- + // next-enrich) or a backend without the capability — read + // Meta["churn"] off a full AllNodes scan. + for _, n := range s.scopedNodes(ctx) { + if allowed != nil { + if _, ok := allowed[n.Kind]; !ok { + continue + } } - // Churn rate: commits per active day. A symbol active for - // 1 day with 3 commits gets churn_rate=3.0; one active for - // 100 days with the same 3 commits gets 0.03. The minimum - // denominator of 1 day stops a fresh symbol from looking - // infinitely churny. 
- activeDays := ageDays - if activeDays < 1 { - activeDays = 1 + if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { + continue } - row := churnRow{ - ID: n.ID, Name: n.Name, File: n.FilePath, - StartLine: n.StartLine, EndLine: endLine, - CommitCount: len(commits), - AgeDays: ageDays, - ChurnRate: roundScore(float64(len(commits)) / float64(activeDays)), - LastAuthor: latestEmail, + row, ok := churnRowFromMeta(n) + if !ok { + continue } - if !newest.IsZero() { - row.LastCommitAt = newest.UTC().Format(time.RFC3339) + sawMeta = true + if row.CommitCount < minCommits { + continue } rows = append(rows, row) + seenFiles[n.FilePath] = struct{}{} } } + if !sawMeta { + // No node in scope carries meta.churn — the agent needs to + // run the enricher before this tool can answer. We surface + // the gap loudly rather than returning an empty result that + // looks like "nothing churns" (which is misleading). + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "error": "no churn data in scope; run `gortex enrich churn` (or call the enrich_churn MCP tool) to populate meta.churn", + "suggestion": "gortex enrich churn", + "symbols": []churnRow{}, + "total": 0, + "truncated": false, + }) + } + sort.Slice(rows, func(i, j int) bool { switch sortBy { case "commit_count": @@ -187,23 +175,102 @@ func (s *Server) handleGetChurnRate(ctx context.Context, req mcp.CallToolRequest } return s.respondJSONOrTOON(ctx, req, map[string]any{ - "symbols": rows, - "total": len(rows), - "truncated": truncated, - "scanned_files": scannedFiles, - "sort_by": sortBy, - "min_commits": minCommits, + "symbols": rows, + "total": len(rows), + "truncated": truncated, + "scanned_files": len(seenFiles), + "sort_by": sortBy, + "min_commits": minCommits, }) } -// stripPathPrefix returns path with prefix stripped iff path begins -// with prefix. Used to convert absolute paths back to git-tree-relative. 
-func stripPathPrefix(path, prefix string) (string, error) { - if strings.HasPrefix(path, prefix) { - return path[len(prefix):], nil +// churnRowFromEnrichment builds a response row from a node + its typed +// sidecar churn enrichment (change A read path). +func churnRowFromEnrichment(n *graph.Node, e graph.ChurnEnrichment) churnRow { + endLine := n.EndLine + if endLine == 0 { + endLine = n.StartLine + } + return churnRow{ + ID: n.ID, Name: n.Name, File: n.FilePath, + StartLine: n.StartLine, EndLine: endLine, + CommitCount: e.CommitCount, + AgeDays: e.AgeDays, + ChurnRate: e.ChurnRate, + LastAuthor: e.LastAuthor, + LastCommitAt: e.LastCommitAt, + } +} + +// churnRowFromMeta projects a node's meta.churn payload into the +// response row. Returns (zero, false) when the node has no churn +// metadata — the caller distinguishes "missing data" from +// "filtered out". The Meta layout matches what +// internal/churn.EnrichGraph writes: +// +// meta.churn = { +// commit_count: int, +// age_days: int, +// churn_rate: float64, +// last_author: string, +// last_commit_at: RFC3339 string, +// } +// +// Numeric fields tolerate both int and float64 because Meta round- +// trips through the on-disk backend or JSON (snapshots), which can widen +// ints to floats. Missing fields default to zero — they're stamped +// together so partial payloads are unexpected, but a defensive read +// is cheaper than asserting and crashing on an old snapshot. 
+func churnRowFromMeta(n *graph.Node) (churnRow, bool) { + if n == nil || n.Meta == nil { + return churnRow{}, false + } + raw, ok := n.Meta["churn"].(map[string]any) + if !ok || len(raw) == 0 { + return churnRow{}, false + } + endLine := n.EndLine + if endLine == 0 { + endLine = n.StartLine + } + row := churnRow{ + ID: n.ID, Name: n.Name, File: n.FilePath, + StartLine: n.StartLine, EndLine: endLine, + CommitCount: intFromAny(raw["commit_count"]), + AgeDays: intFromAny(raw["age_days"]), + ChurnRate: floatFromAny(raw["churn_rate"]), + } + if v, ok := raw["last_author"].(string); ok { + row.LastAuthor = v + } + if v, ok := raw["last_commit_at"].(string); ok { + row.LastCommitAt = v + } + return row, true +} + +func intFromAny(v any) int { + switch x := v.(type) { + case int: + return x + case int64: + return int(x) + case float64: + return int(x) } - if path == strings.TrimSuffix(prefix, "/") { - return "", nil + return 0 +} + +func floatFromAny(v any) float64 { + switch x := v.(type) { + case float64: + return x + case float32: + return float64(x) + case int: + return float64(x) + case int64: + return float64(x) } - return path, errPathUnresolved + return 0 } diff --git a/internal/mcp/tools_churn_test.go b/internal/mcp/tools_churn_test.go index ce84e286..c8b25143 100644 --- a/internal/mcp/tools_churn_test.go +++ b/internal/mcp/tools_churn_test.go @@ -3,85 +3,57 @@ package mcp import ( "context" "encoding/json" - "os" - "os/exec" - "path/filepath" "testing" "time" "github.com/mark3labs/mcp-go/mcp" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/zzet/gortex/internal/graph" ) -// seedChurnRepo creates a real git repo at dir, with several commits -// touching different parts of foo.go so blame returns distinct -// authors and timestamps per line range. Returns absolute path. -func seedChurnRepo(t *testing.T) string { - t.Helper() - dir := t.TempDir() - - gitInit := func(args ...string) { - cmd := exec.Command("git", args...) 
- cmd.Dir = dir - if out, err := cmd.CombinedOutput(); err != nil { - t.Fatalf("git %v: %v\n%s", args, err, out) - } - } - gitInit("init", "-q") - gitInit("config", "user.email", "alice@example.com") - gitInit("config", "user.name", "alice") - gitInit("config", "commit.gpgsign", "false") - - write := func(content string) { - require.NoError(t, os.WriteFile(filepath.Join(dir, "foo.go"), []byte(content), 0o644)) - } - - // Commit 1: initial file. dead and live each at one line range. - write(`package foo - -func dead() int { - return 1 -} - -func live() int { - return 1 -} -`) - gitInit("add", "foo.go") - gitInit("commit", "-q", "-m", "init") - - // Commits 2-4: modify live() body three times, dead() once. - for i := 2; i <= 4; i++ { - write(`package foo - -func dead() int { - return ` + string(rune('1'+i)) + ` -} - -func live() int { - return ` + string(rune('1'+i)) + ` -} -`) - gitInit("commit", "-aq", "-m", "edit "+string(rune('1'+i))+"") - } - - return dir -} - -func newChurnTestServer(t *testing.T, dir string) *Server { +// seedChurnGraph builds a small graph with two function nodes whose +// meta.churn data the read-side handler is supposed to surface. We +// stamp the metadata directly instead of running the enricher — the +// read path is what's under test here; the enrich pass has its own +// tests in internal/churn. 
+func seedChurnGraph(t *testing.T) *Server { t.Helper() g := graph.New() - absFoo := filepath.Join(dir, "foo.go") + now := time.Now().UTC() g.AddNode(&graph.Node{ - ID: absFoo + "::dead", Name: "dead", Kind: graph.KindFunction, - FilePath: absFoo, StartLine: 3, EndLine: 5, Language: "go", + ID: "foo.go::dead", + Kind: graph.KindFunction, + Name: "dead", + FilePath: "foo.go", + StartLine: 3, EndLine: 5, + Meta: map[string]any{ + "churn": map[string]any{ + "commit_count": 1, + "age_days": 0, + "churn_rate": 1.0, + "last_author": "alice@example.com", + "last_commit_at": now.Format(time.RFC3339), + }, + }, }) g.AddNode(&graph.Node{ - ID: absFoo + "::live", Name: "live", Kind: graph.KindFunction, - FilePath: absFoo, StartLine: 7, EndLine: 9, Language: "go", + ID: "foo.go::live", + Kind: graph.KindFunction, + Name: "live", + FilePath: "foo.go", + StartLine: 7, EndLine: 9, + Meta: map[string]any{ + "churn": map[string]any{ + "commit_count": 4, + "age_days": 2, + "churn_rate": 2.0, + "last_author": "bob@example.com", + "last_commit_at": now.Format(time.RFC3339), + }, + }, }) return &Server{ @@ -112,45 +84,35 @@ func callChurnHandler(t *testing.T, s *Server, args map[string]any) map[string]a } func TestChurnRate_BothFunctionsSurface(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - + s := seedChurnGraph(t) out := callChurnHandler(t, s, map[string]any{}) symbols, _ := out["symbols"].([]any) require.Len(t, symbols, 2, "both dead and live should surface") } -func TestChurnRate_LiveHasHigherCommitCount(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - +func TestChurnRate_SortByCommitCount(t *testing.T) { + s := seedChurnGraph(t) out := callChurnHandler(t, s, map[string]any{"sort_by": "commit_count"}) symbols, _ := out["symbols"].([]any) require.Len(t, symbols, 2) first := symbols[0].(map[string]any) second := symbols[1].(map[string]any) - // Both functions get edited by the same 4 commits — blame attribution - // will 
treat the entire file's lines as touched in each commit. The - // ordering should at least be stable; the count should be ≥1. - assert.GreaterOrEqual(t, int(first["commit_count"].(float64)), 1) - assert.GreaterOrEqual(t, int(second["commit_count"].(float64)), 1) + assert.Greater(t, int(first["commit_count"].(float64)), int(second["commit_count"].(float64))) + assert.Equal(t, "live", first["name"], "live has 4 commits, should rank above dead's 1") } func TestChurnRate_MinCommitsFilter(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - - // Very high threshold should drop everything. - out := callChurnHandler(t, s, map[string]any{"min_commits": 100}) + s := seedChurnGraph(t) + // dead has 1, live has 4 — threshold of 3 keeps only live. + out := callChurnHandler(t, s, map[string]any{"min_commits": 3}) symbols, _ := out["symbols"].([]any) - assert.Empty(t, symbols) + require.Len(t, symbols, 1) + assert.Equal(t, "live", symbols[0].(map[string]any)["name"]) } func TestChurnRate_LimitTruncates(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - + s := seedChurnGraph(t) out := callChurnHandler(t, s, map[string]any{"limit": 1}) symbols, _ := out["symbols"].([]any) assert.Len(t, symbols, 1) @@ -158,47 +120,27 @@ func TestChurnRate_LimitTruncates(t *testing.T) { } func TestChurnRate_PathPrefixFilter(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - - // Use a prefix that won't match anything. + s := seedChurnGraph(t) + // Prefix that matches none of the nodes' file paths. out := callChurnHandler(t, s, map[string]any{"path_prefix": "/no/such/path"}) - symbols, _ := out["symbols"].([]any) - assert.Empty(t, symbols) + // With no in-scope nodes carrying meta we hit the structured + // error path — assert the suggestion is present. 
+ assert.Equal(t, "gortex enrich churn", out["suggestion"]) } func TestChurnRate_ScannedFilesCount(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - + s := seedChurnGraph(t) out := callChurnHandler(t, s, map[string]any{}) // One file (foo.go) — scanned once even with two symbols. assert.EqualValues(t, 1, out["scanned_files"].(float64)) } -func TestChurnRate_AgeDaysWithinFreshRepo(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - - out := callChurnHandler(t, s, map[string]any{}) - symbols, _ := out["symbols"].([]any) - require.NotEmpty(t, symbols) - first := symbols[0].(map[string]any) - // Fresh repo — age_days < 1 most of the time. Allow some slack. - age := int(first["age_days"].(float64)) - assert.LessOrEqual(t, age, 1, "fresh repo: symbol age should be 0 or 1 day") -} - -func TestChurnRate_RejectsNonGitDirectory(t *testing.T) { - dir := t.TempDir() - // Create a file but no git repo. - abs := filepath.Join(dir, "foo.go") - require.NoError(t, os.WriteFile(abs, []byte("package foo\nfunc x() {}\n"), 0o644)) - +func TestChurnRate_ErrorsWhenNoMeta(t *testing.T) { + // Graph with a function node but no meta.churn → error response. 
g := graph.New() g.AddNode(&graph.Node{ - ID: abs + "::x", Name: "x", Kind: graph.KindFunction, - FilePath: abs, StartLine: 2, EndLine: 2, + ID: "bar.go::x", Name: "x", Kind: graph.KindFunction, + FilePath: "bar.go", StartLine: 2, EndLine: 2, }) s := &Server{ graph: g, @@ -208,16 +150,13 @@ func TestChurnRate_RejectsNonGitDirectory(t *testing.T) { sessions: newSessionMap(), toolScopes: newScopeRegistry(), } - out := callChurnHandler(t, s, map[string]any{}) - symbols, _ := out["symbols"].([]any) - assert.Empty(t, symbols, "non-git directories return zero rows, not an error") + require.NotEmpty(t, out["error"], "expected structured error when no meta.churn is present") + assert.Equal(t, "gortex enrich churn", out["suggestion"]) } func TestChurnRate_SortByOptions(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - + s := seedChurnGraph(t) for _, sortBy := range []string{"churn_rate", "commit_count", "age_days"} { out := callChurnHandler(t, s, map[string]any{"sort_by": sortBy}) assert.Equal(t, sortBy, out["sort_by"], "sort_by echoed") @@ -226,20 +165,8 @@ func TestChurnRate_SortByOptions(t *testing.T) { } } -func TestStripPathPrefix(t *testing.T) { - got, err := stripPathPrefix("/a/b/c.go", "/a/") - require.NoError(t, err) - assert.Equal(t, "b/c.go", got) - - _, err = stripPathPrefix("/x/y.go", "/a/") - assert.Error(t, err) -} - -// Smoke test: roundtrip Unix timestamp through time.Time matches RFC3339. func TestChurnRate_TimestampShape(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - + s := seedChurnGraph(t) out := callChurnHandler(t, s, map[string]any{}) symbols, _ := out["symbols"].([]any) require.NotEmpty(t, symbols) @@ -249,3 +176,71 @@ func TestChurnRate_TimestampShape(t *testing.T) { _, err := time.Parse(time.RFC3339, ts) require.NoError(t, err) } + +func TestChurnRate_TolerantMetaTypes(t *testing.T) { + // gob → JSON → Go round-trip can widen ints to float64. 
Verify the + // projection handles both forms transparently. + g := graph.New() + g.AddNode(&graph.Node{ + ID: "f.go::a", Name: "a", Kind: graph.KindFunction, + FilePath: "f.go", StartLine: 1, EndLine: 1, + Meta: map[string]any{ + "churn": map[string]any{ + "commit_count": float64(7), // came back from JSON + "age_days": int64(3), // came back from gob int64 + "churn_rate": float64(2.33), + "last_author": "x@y", + "last_commit_at": "2026-05-01T00:00:00Z", + }, + }, + }) + s := &Server{ + graph: g, + session: newSessionState(), + tokenStats: &tokenStats{}, + symHistory: &symbolHistory{entries: make(map[string][]SymbolModification)}, + sessions: newSessionMap(), + toolScopes: newScopeRegistry(), + } + out := callChurnHandler(t, s, map[string]any{}) + symbols, _ := out["symbols"].([]any) + require.Len(t, symbols, 1) + row := symbols[0].(map[string]any) + assert.EqualValues(t, 7, row["commit_count"]) + assert.EqualValues(t, 3, row["age_days"]) + assert.InDelta(t, 2.33, row["churn_rate"].(float64), 0.001) +} + +// TestChurnRate_SidecarReadPath proves the change-A primary path: +// churn populated in the typed sidecar (BulkSetChurn) — with NO +// Meta["churn"] on the nodes — is surfaced by get_churn_rate via the +// ChurnEnrichmentReader index read, not the AllNodes Meta scan. 
+func TestChurnRate_SidecarReadPath(t *testing.T) { + g := graph.New() + g.AddNode(&graph.Node{ID: "foo.go::a", Kind: graph.KindFunction, Name: "a", FilePath: "foo.go", StartLine: 1, EndLine: 2}) + g.AddNode(&graph.Node{ID: "foo.go::b", Kind: graph.KindFunction, Name: "b", FilePath: "foo.go", StartLine: 3, EndLine: 4}) + require.NoError(t, g.BulkSetChurn("", []graph.ChurnEnrichment{ + {NodeID: "foo.go::a", CommitCount: 7, ChurnRate: 3.0, LastAuthor: "a@x"}, + {NodeID: "foo.go::b", CommitCount: 2, ChurnRate: 0.5, LastAuthor: "b@x"}, + })) + s := &Server{ + graph: g, + session: newSessionState(), + tokenStats: &tokenStats{}, + symHistory: &symbolHistory{entries: make(map[string][]SymbolModification)}, + sessions: newSessionMap(), + toolScopes: newScopeRegistry(), + } + + out := callChurnHandler(t, s, map[string]any{"sort_by": "commit_count"}) + symbols, _ := out["symbols"].([]any) + require.Len(t, symbols, 2, "both sidecar rows must surface") + first, _ := symbols[0].(map[string]any) + assert.Equal(t, "foo.go::a", first["symbol_id"], "sort_by commit_count: a (7) before b (2)") + assert.EqualValues(t, 7, first["commit_count"]) + assert.Equal(t, "a@x", first["last_author"]) + + out2 := callChurnHandler(t, s, map[string]any{"min_commits": 5}) + syms2, _ := out2["symbols"].([]any) + require.Len(t, syms2, 1, "min_commits=5 keeps only a") +} diff --git a/internal/mcp/tools_clones.go b/internal/mcp/tools_clones.go index 4fc1ccbd..54d5e443 100644 --- a/internal/mcp/tools_clones.go +++ b/internal/mcp/tools_clones.go @@ -83,10 +83,16 @@ func (s *Server) handleFindClones(ctx context.Context, req mcp.CallToolRequest) // Walk EdgeSimilarTo edges. 
The graph holds them symmetrically // (fA→fB and fB→fA); canonicalise to A rows[j].Score + sort.Slice(pendings, func(i, j int) bool { + if pendings[i].score != pendings[j].score { + return pendings[i].score > pendings[j].score } - return rows[i].File < rows[j].File + return pendings[i].file < pendings[j].file }) truncated := false - if len(rows) > limit { - rows = rows[:limit] + if len(pendings) > limit { + pendings = pendings[:limit] truncated = true } + keepFiles := make([]string, 0, len(pendings)) + for _, p := range pendings { + keepFiles = append(keepFiles, p.file) + } + symbolsByFile := s.symbolNamesByFiles(keepFiles) + rows := make([]coChangeRow, 0, len(pendings)) + for _, p := range pendings { + rows = append(rows, coChangeRow{ + File: p.file, + Score: roundScore(p.score), + Count: p.count, + Symbols: symbolsByFile[p.file], + }) + } result := map[string]any{ "target_file": targetFile, @@ -96,25 +117,106 @@ func (s *Server) handleFindCoChangingSymbols(ctx context.Context, req mcp.CallTo if symbolID != "" { result["symbol_id"] = symbolID } + // When the cache is empty AND the background mine has not finished + // yet, surface an in-progress marker so the caller can distinguish + // "this file has no co-change data" from "the daemon hasn't built + // the data yet". The mine is fired at daemon-ready by RunAnalysis; + // a fresh daemon on a disk backend takes tens of seconds before the cache is + // populated. + if len(rows) == 0 && !s.coChangeReady() { + result["mining_in_progress"] = true + result["note"] = "co-change graph is still being mined; retry shortly" + } return s.respondJSONOrTOON(ctx, req, result) } -// ensureCoChange mines the co-change graph exactly once per daemon -// lifetime. Safe for concurrent callers — later callers block until -// the first mine completes, then return immediately. +// ensureCoChange triggers the co-change mine if it has not run yet +// and returns IMMEDIATELY — the mine itself runs asynchronously. +// +// Why async? 
On a disk backend with no pre-existing +// EdgeCoChange edges, mineCoChange spends 60+ seconds in +// cochange.AddEdges: an AllNodes full-table scan plus thousands of +// per-pair AddEdge round-trips. Wrapping that in sync.Once.Do +// turned every queued tool call into a blocked-for-60s caller. The +// async shape keeps the request path off the slow path. +// +// PrewarmCoChange (called from RunAnalysis at daemon-ready) fires +// the mine ahead of any user-visible call so the cache is already +// populated by the time the first find_co_changing_symbols arrives. +// +// Returning immediately means the first user call may see an empty +// cache when the prewarm goroutine has not yet completed. That is +// the deliberate trade-off — the alternative is a 60s blocked tool +// call. The handler surfaces a `mining_in_progress` flag when the cache is +// empty so callers know to retry rather than treating the file as +// genuinely uncoupled. func (s *Server) ensureCoChange() { - s.cochangeOnce.Do(s.mineCoChange) + s.cochangeOnce.Do(func() { + go s.mineCoChange() + }) +} + +// PrewarmCoChange triggers the co-change mine in the background so a +// later find_co_changing_symbols / search rerank call sees a +// populated cache without blocking. Safe to call multiple times — the +// underlying sync.Once still gates the work to one execution. +// +// Returns immediately: it delegates to ensureCoChange, whose Once body +// only spawns the goroutine, so no Do caller ever waits on the mine. +func (s *Server) PrewarmCoChange() { + s.ensureCoChange() +} + +// coChangeReady reports whether the mine has completed and the cache +// is populated. Used by the handler to set the `mining_in_progress` flag +// when the cache is empty but mining is still running. +func (s *Server) coChangeReady() bool { + s.cochangeMu.RLock() + defer s.cochangeMu.RUnlock() + return s.cochangeByFile != nil } // mineCoChange populates the co-change caches.
It prefers EdgeCoChange // edges already present in the graph (an enriched snapshot); only when -// none exist does it mine `git log` and materialise the edges. +// none exist does it mine `git log`. +// +// The mine populates the in-memory caches AND persists the mined +// pairs as EdgeCoChange edges (cochange.AddEdges) so a subsequent daemon +// start takes the coChangeFromEdges fast path instead of re-mining +// `git log` (the 5-15s restart cost). +// +// The earlier version deliberately skipped the persist to avoid the +// analyze[clusters] partition cache (keyed on NodeCount/EdgeCount/ +// EdgeIdentityRevisions) being invalidated by edge-count drift. That +// concern was about CONTINUOUS drift; here the persist is bounded — +// mineCoChange runs once per process (sync.Once) and the fast path skips +// the mine once edges exist — so the edge count (and the clusters token) +// moves at most ONCE per graph, triggering a single recompute rather +// than per-restart thrash. Co-change edges are partition-irrelevant +// (edgeWeight 0; both endpoints are KindFile nodes, filtered out of +// community detection), so that one recompute yields the same partition. +// +// Reads are unaffected: find_co_changing_symbols and the search rerank's +// CoChangeOf hook both read the in-memory cache. The CLI cochange.EnrichGraph +// path already persisted via AddEdges; this aligns the lazy daemon path +// with it. Refreshing stale co-change after a HEAD move is still a manual +// `gortex enrich cochange` (or a cold reindex) — the lazy path does not +// auto-re-mine once edges exist. func (s *Server) mineCoChange() { scores := map[string]map[string]float64{} counts := map[string]map[string]int{} if s.coChangeFromEdges(scores, counts) { s.storeCoChange(scores, counts) + // The co-change graph COULD be refreshed by re-mining git log, + // but was NOT: persisted EdgeCoChange edges already exist, so the + // lazy path serves them as-is. 
If history advanced since the last + // mine these counts are stale until an explicit refresh. Surface + // that rather than silently serving possibly-stale data. + if s.logger != nil { + s.logger.Info("co-change served from persisted edges; not re-mined (could be updated, but was not) — run `gortex enrich cochange` to refresh after history changes", + zap.Int("file_relations", len(scores))) + } return } @@ -123,7 +225,6 @@ func (s *Server) mineCoChange() { if len(res.Pairs) == 0 { continue } - cochange.AddEdges(s.graph, res.Pairs, prefix) for _, p := range res.Pairs { fa, fb := p.FileA, p.FileB if prefix != "" { @@ -133,6 +234,13 @@ func (s *Server) mineCoChange() { addCoChangeLink(scores, counts, fa, fb, p.Score, p.Count) addCoChangeLink(scores, counts, fb, fa, p.Score, p.Count) } + // Persist the mined pairs as EdgeCoChange edges so a later daemon + // start takes the coChangeFromEdges fast path instead of re-mining + // git log (the 5-15s restart cost). Bounded: mineCoChange runs once + // per process (sync.Once) and the fast path above skips the mine + // once edges exist, so this persist (and its one clusters-cache + // token bump) happens at most once per graph, not per restart. + cochange.AddEdges(s.graph, res.Pairs, prefix) } s.storeCoChange(scores, counts) } @@ -141,18 +249,27 @@ func (s *Server) mineCoChange() { // edges already in the graph. Returns true when at least one edge was // found — the signal that an enriched snapshot is loaded and no fresh // git mine is needed. +// +// EdgesByKind streams only the CoChange edges; the endpoint nodes are +// fetched in one batched GetNodesByIDs call instead of two GetNode +// round-trips per edge. On disk backends that drops the +// whole-graph AllEdges materialisation plus the per-edge +// GetNode trips that loaded the file paths. 
func (s *Server) coChangeFromEdges(scores map[string]map[string]float64, counts map[string]map[string]int) bool { - found := false - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeCoChange { - continue - } - from := s.graph.GetNode(e.From) - to := s.graph.GetNode(e.To) - if from == nil || to == nil { + // First pass: collect CoChange edges + the set of node IDs they + // reference. Both can stream from EdgesByKind in one + // round-trip on disk backends. + type ccEdge struct { + from, to string + score float64 + count int + } + var edges []ccEdge + idSet := make(map[string]struct{}) + for e := range s.graph.EdgesByKind(graph.EdgeCoChange) { + if e == nil { continue } - found = true score := e.Confidence if e.Meta != nil { if v, ok := e.Meta["score"].(float64); ok { @@ -170,9 +287,35 @@ func (s *Server) coChangeFromEdges(scores map[string]map[string]float64, counts count = int(v) } } - addCoChangeLink(scores, counts, from.FilePath, to.FilePath, score, count) + edges = append(edges, ccEdge{from: e.From, to: e.To, score: score, count: count}) + idSet[e.From] = struct{}{} + idSet[e.To] = struct{}{} + } + if len(edges) == 0 { + return false + } + + // Batched endpoint resolution — one batched id-IN query vs. + // 2 * len(edges) per-row GetNode trips. On a workspace with + // thousands of co-change edges this is the bulk of the latency. + ids := make([]string, 0, len(idSet)) + for id := range idSet { + ids = append(ids, id) + } + nodes := s.graph.GetNodesByIDs(ids) + + // Only edges whose endpoints both resolve count as found: a graph of + // dangling co-change edges must still fall through to the git mine. + found := false + for _, e := range edges { + from, to := nodes[e.from], nodes[e.to] + if from == nil || to == nil { + continue + } + found = true + addCoChangeLink(scores, counts, from.FilePath, to.FilePath, e.score, e.count) } return found } // addCoChangeLink records one directed co-change relationship. 
diff --git a/internal/mcp/tools_cochange_test.go b/internal/mcp/tools_cochange_test.go index 152adfce..23da2139 100644 --- a/internal/mcp/tools_cochange_test.go +++ b/internal/mcp/tools_cochange_test.go @@ -7,6 +7,7 @@ import ( "github.com/mark3labs/mcp-go/mcp" "github.com/stretchr/testify/require" + "github.com/zzet/gortex/internal/cochange" "github.com/zzet/gortex/internal/graph" ) @@ -115,3 +116,33 @@ func TestFindCoChanging_UnknownSymbol(t *testing.T) { _, isErr := callFindCoChanging(t, s, map[string]any{"symbol_id": "does/not::Exist"}) require.True(t, isErr) } + +// TestCoChange_PersistedEdgesTakeFastPath proves change B's mechanism: +// mineCoChange persists mined pairs as EdgeCoChange edges (via +// cochange.AddEdges), so a subsequent daemon start reads them back via +// coChangeFromEdges (the fast path) instead of re-mining git log. +func TestCoChange_PersistedEdgesTakeFastPath(t *testing.T) { + g := graph.New() + g.AddNode(&graph.Node{ID: "a.go", Kind: graph.KindFile, Name: "a.go", FilePath: "a.go", Language: "go"}) + g.AddNode(&graph.Node{ID: "b.go", Kind: graph.KindFile, Name: "b.go", FilePath: "b.go", Language: "go"}) + + // What mineCoChange now does after a git mine: persist the pairs. + n := cochange.AddEdges(g, []cochange.Pair{{FileA: "a.go", FileB: "b.go", Score: 0.9, Count: 5}}, "") + require.Positive(t, n, "AddEdges must persist EdgeCoChange edges") + + // A fresh server over the same graph takes the coChangeFromEdges + // fast path (no git mine) and surfaces the persisted co-change. 
+ s := &Server{ + graph: g, + session: newSessionState(), + tokenStats: &tokenStats{}, + symHistory: &symbolHistory{entries: make(map[string][]SymbolModification)}, + sessions: newSessionMap(), + toolScopes: newScopeRegistry(), + } + scores := map[string]map[string]float64{} + counts := map[string]map[string]int{} + require.True(t, s.coChangeFromEdges(scores, counts), "persisted edges must take the fast path") + require.InDelta(t, 0.9, scores["a.go"]["b.go"], 1e-9) + require.Equal(t, 5, counts["a.go"]["b.go"]) +} diff --git a/internal/mcp/tools_coding.go b/internal/mcp/tools_coding.go index d02c258a..01418d17 100644 --- a/internal/mcp/tools_coding.go +++ b/internal/mcp/tools_coding.go @@ -264,6 +264,34 @@ func resolveKeepPredicate(keep string, symbols []*graph.Node) (func(elide.Decl) return pred, resolved } +// editingContextSymbolNodes reconstructs the *graph.Node slice the +// elide.KeepAny predicate needs from the editing-context Defines +// rows. We carry the node IDs only on the wire, but a `keep` token +// can target a node by id, name, or kind — so we re-resolve every +// defines row to a node here. Used only when compress_bodies=true. 
+func (s *Server) editingContextSymbolNodes(filePath string, defines []map[string]any) []*graph.Node { + if len(defines) == 0 { + return nil + } + ids := make([]string, 0, len(defines)) + for _, d := range defines { + if id, _ := d["id"].(string); id != "" { + ids = append(ids, id) + } + } + if len(ids) == 0 { + return nil + } + nodes := s.graph.GetNodesByIDs(ids) + out := make([]*graph.Node, 0, len(ids)) + for _, id := range ids { + if n, ok := nodes[id]; ok && n != nil { + out = append(out, n) + } + } + return out +} + func (s *Server) handleGetEditingContext(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { fp, err := req.RequireString("path") if err != nil { @@ -274,99 +302,164 @@ func (s *Server) handleGetEditingContext(ctx context.Context, req mcp.CallToolRe s.ensureFresh([]string{fp}) s.sessionFor(ctx).recordFile(fp) - sg := s.engineFor(ctx).GetFileSymbols(fp) - if len(sg.Nodes) == 0 { - return mcp.NewToolResultError("no symbols found for file: " + fp), nil - } - // A file outside the session's workspace is reported as not found - // — its symbols all share one repo, so the first node decides. - if !s.nodeInSessionScope(ctx, sg.Nodes[0]) { - return mcp.NewToolResultError("no symbols found for file: " + fp), nil - } - // Confine the caller/callee neighbourhoods below to the session - // workspace so editing context never reaches across the boundary. - sessWS, _, _ := s.sessionScope(ctx) - // Frecency: a file-level editing context is effectively an access to - // every symbol defined in that file. Credit each of them — this is - // the signal that "the agent is working in this area right now." - for _, n := range sg.Nodes { - if n.Kind == graph.KindFile { - continue - } - s.frecency.Record(n.ID) - } - out := editingContext{} - - // File info. 
- for _, n := range sg.Nodes { - if n.Kind == graph.KindFile { - out.File = map[string]any{"id": n.ID, "language": n.Language} - break + var fileNodeForScope *graph.Node + callerCap := 20 + calleeCap := 20 + + // Fast path: when the backend implements FileEditingContext we + // take all five projections in a small fixed number of + // round-trips instead of the per-symbol GetCallers / GetCallChain + // loop. The fallback retains the previous engine-based shape so + // the in-memory backend is unaffected. + if fc, ok := s.graph.(graph.FileEditingContext); ok { + bundle := fc.FileEditingContext(fp, []graph.NodeKind{graph.KindFunction, graph.KindMethod}) + if bundle == nil || (bundle.FileNode == nil && len(bundle.Defines) == 0) { + return mcp.NewToolResultError("no symbols found for file: " + fp), nil + } + fileNodeForScope = bundle.FileNode + if fileNodeForScope == nil && len(bundle.Defines) > 0 { + fileNodeForScope = bundle.Defines[0] + } + if !s.nodeInSessionScope(ctx, fileNodeForScope) { + return mcp.NewToolResultError("no symbols found for file: " + fp), nil + } + for _, n := range bundle.Defines { + s.frecency.Record(n.ID) + } + if bundle.FileNode != nil { + out.File = map[string]any{"id": bundle.FileNode.ID, "language": bundle.FileNode.Language} + } + for _, n := range bundle.Defines { + entry := map[string]any{ + "id": n.ID, + "kind": n.Kind, + "name": n.Name, + "start_line": n.StartLine, + } + if n.Meta != nil { + if sig, ok := n.Meta["signature"]; ok { + entry["signature"] = sig + } + } + out.Defines = append(out.Defines, entry) } - } - - // Defines: all non-file symbols in this file. 
- for _, n := range sg.Nodes { - if n.Kind == graph.KindFile { - continue + for _, e := range bundle.Imports { + out.Imports = append(out.Imports, map[string]any{ + "id": e.To, + "external": strings.HasPrefix(e.To, "external::"), + }) } - entry := map[string]any{ - "id": n.ID, - "kind": n.Kind, - "name": n.Name, - "start_line": n.StartLine, + // Workspace-scope post-filter mirrors the legacy GetCallers / + // GetCallChain WorkspaceID gate. + sessWS, _, bound := s.sessionScope(ctx) + var opts query.QueryOptions + if bound { + opts.WorkspaceID = sessWS } - if sig, ok := n.Meta["signature"]; ok { - entry["signature"] = sig + for _, n := range bundle.CalledBy { + if bound && !opts.ScopeAllows(n) { + continue + } + if len(out.CalledBy) >= callerCap { + break + } + out.CalledBy = append(out.CalledBy, map[string]any{ + "id": n.ID, + "name": n.Name, + "file_path": n.FilePath, + "start_line": n.StartLine, + }) } - out.Defines = append(out.Defines, entry) - } - - // Imports: outgoing import edges from the file node. - for _, e := range sg.Edges { - if e.Kind == graph.EdgeImports { - importInfo := map[string]any{ - "id": e.To, - "external": strings.HasPrefix(e.To, "external::"), + for _, n := range bundle.Calls { + if bound && !opts.ScopeAllows(n) { + continue + } + if len(out.Calls) >= calleeCap { + break } - out.Imports = append(out.Imports, importInfo) + out.Calls = append(out.Calls, map[string]any{ + "id": n.ID, + "name": n.Name, + "file_path": n.FilePath, + "start_line": n.StartLine, + }) } - } - - // CalledBy: who calls symbols in this file (depth 1). 
- callerSeen := make(map[string]bool) - for _, n := range sg.Nodes { - if n.Kind == graph.KindFunction || n.Kind == graph.KindMethod { - callers := s.engineFor(ctx).GetCallers(n.ID, query.QueryOptions{Depth: 1, Limit: 20, Detail: "brief", WorkspaceID: sessWS}) - for _, cn := range callers.Nodes { - if cn.FilePath != fp && !callerSeen[cn.ID] { - callerSeen[cn.ID] = true - out.CalledBy = append(out.CalledBy, map[string]any{ - "id": cn.ID, - "name": cn.Name, - "file_path": cn.FilePath, - "start_line": cn.StartLine, - }) + } else { + sg := s.engineFor(ctx).GetFileSymbols(fp) + if len(sg.Nodes) == 0 { + return mcp.NewToolResultError("no symbols found for file: " + fp), nil + } + if !s.nodeInSessionScope(ctx, sg.Nodes[0]) { + return mcp.NewToolResultError("no symbols found for file: " + fp), nil + } + sessWS, _, _ := s.sessionScope(ctx) + // Single pass: credit frecency to every defined symbol and pin the + // first file node as the anchor the compress_bodies step reads. + for _, n := range sg.Nodes { + if n.Kind != graph.KindFile { + s.frecency.Record(n.ID) + continue + } + if fileNodeForScope == nil { + fileNodeForScope = n + out.File = map[string]any{"id": n.ID, "language": n.Language} + } + } + for _, n := range sg.Nodes { + if n.Kind == graph.KindFile { + continue + } + entry := map[string]any{ + "id": n.ID, + "kind": n.Kind, + "name": n.Name, + "start_line": n.StartLine, + } + if sig, ok := n.Meta["signature"]; ok { + entry["signature"] = sig + } + out.Defines = append(out.Defines, entry) + } + for _, e := range sg.Edges { + if e.Kind == graph.EdgeImports { + out.Imports = append(out.Imports, map[string]any{ + "id": e.To, + "external": strings.HasPrefix(e.To, "external::"), + }) + } + } + callerSeen := make(map[string]bool) + for _, n := range sg.Nodes { + if n.Kind == graph.KindFunction || n.Kind == graph.KindMethod { + callers := s.engineFor(ctx).GetCallers(n.ID, query.QueryOptions{Depth: 1, Limit: callerCap, Detail: "brief", WorkspaceID: sessWS}) + for _, cn := range callers.Nodes { + if cn.FilePath != fp && !callerSeen[cn.ID] { + callerSeen[cn.ID] = true + out.CalledBy = 
append(out.CalledBy, map[string]any{ + "id": cn.ID, + "name": cn.Name, + "file_path": cn.FilePath, + "start_line": cn.StartLine, + }) + } } } } - } - - // Calls: what symbols in this file call (depth 1). - callSeen := make(map[string]bool) - for _, n := range sg.Nodes { - if n.Kind == graph.KindFunction || n.Kind == graph.KindMethod { - chain := s.engineFor(ctx).GetCallChain(n.ID, query.QueryOptions{Depth: 1, Limit: 20, Detail: "brief", WorkspaceID: sessWS}) - for _, cn := range chain.Nodes { - if cn.FilePath != fp && !callSeen[cn.ID] { - callSeen[cn.ID] = true - out.Calls = append(out.Calls, map[string]any{ - "id": cn.ID, - "name": cn.Name, - "file_path": cn.FilePath, - "start_line": cn.StartLine, - }) + callSeen := make(map[string]bool) + for _, n := range sg.Nodes { + if n.Kind == graph.KindFunction || n.Kind == graph.KindMethod { + chain := s.engineFor(ctx).GetCallChain(n.ID, query.QueryOptions{Depth: 1, Limit: calleeCap, Detail: "brief", WorkspaceID: sessWS}) + for _, cn := range chain.Nodes { + if cn.FilePath != fp && !callSeen[cn.ID] { + callSeen[cn.ID] = true + out.Calls = append(out.Calls, map[string]any{ + "id": cn.ID, + "name": cn.Name, + "file_path": cn.FilePath, + "start_line": cn.StartLine, + }) + } } } } @@ -388,18 +481,20 @@ func (s *Server) handleGetEditingContext(ctx context.Context, req mcp.CallToolRe } } if language != "" && elide.IsSupported(language) { - // Use the first non-file node to find the on-disk path. + // Use the file node (cached above from the editing-context + // bundle) to find the on-disk path. Falls back to the first + // defines node if no file node materialised (defensive — the + // FileEditingContext implementation always returns one when + // the file is indexed). 
var fileBytes []byte - for _, n := range sg.Nodes { - if n.Kind == graph.KindFile { - if absPath, rerr := s.resolveNodePath(n); rerr == nil { - if content, ok := s.overlayContentFor(ctx, absPath); ok { - fileBytes = []byte(content) - } else if b, ferr := os.ReadFile(absPath); ferr == nil { - fileBytes = b - } + anchor := fileNodeForScope + if anchor != nil { + if absPath, rerr := s.resolveNodePath(anchor); rerr == nil { + if content, ok := s.overlayContentFor(ctx, absPath); ok { + fileBytes = []byte(content) + } else if b, ferr := os.ReadFile(absPath); ferr == nil { + fileBytes = b } - break } } if len(fileBytes) > 0 { @@ -407,7 +502,8 @@ func (s *Server) handleGetEditingContext(ctx context.Context, req mcp.CallToolRe // verbatim bodies while the rest of the file is still // stubbed — keep the functions being edited at full // source and compress everything else. - keepPred, resolved := resolveKeepPredicate(req.GetString("keep", ""), sg.Nodes) + keepNodes := s.editingContextSymbolNodes(fp, out.Defines) + keepPred, resolved := resolveKeepPredicate(req.GetString("keep", ""), keepNodes) keptSymbols = resolved if compressed, cerr := elide.CompressWith(fileBytes, language, elide.Options{Keep: keepPred}); cerr == nil { sourceCompressed = string(compressed) diff --git a/internal/mcp/tools_core.go b/internal/mcp/tools_core.go index 21dc896f..57ed2190 100644 --- a/internal/mcp/tools_core.go +++ b/internal/mcp/tools_core.go @@ -7,6 +7,7 @@ import ( "path/filepath" "sort" "strings" + "time" "github.com/mark3labs/mcp-go/mcp" toon "github.com/toon-format/toon-go" @@ -563,11 +564,22 @@ func filterSubGraph(sg *query.SubGraph, allowed map[string]bool) *query.SubGraph edges = append(edges, e) } } + totalEdges := len(edges) + // Counts-only payloads arrive with Edges == nil and TotalEdges + // pre-populated — preserve the upstream count instead of zeroing + // it. 
Inexact in the presence of a non-trivial filter (we'd need + // the edges to know which belong to filtered-out nodes), but the + // gcx output that asks for the count-only path runs with the + // session's workspace scope already applied at the store, so the + // filter pass is typically a no-op. + if len(sg.Edges) == 0 && sg.TotalEdges > 0 { + totalEdges = sg.TotalEdges + } return &query.SubGraph{ Nodes: nodes, Edges: edges, TotalNodes: len(nodes), - TotalEdges: len(edges), + TotalEdges: totalEdges, Truncated: sg.Truncated, } } @@ -619,6 +631,69 @@ func enrichSubGraphEdges(sg *query.SubGraph) { } } +// isNonDefinitionNode reports whether a node kind is NOT a file-level +// definition and should be dropped from a get_file_summary view. It +// excludes the file node itself, imports, and the function-body-internal +// nodes (locals, params, closures, generic params, builtins) that the +// file_path lookup pulls in but that the "symbols a file defines" +// contract never wanted. Without this filter the summary floods with +// hundreds of locals/params (the old defines-edge query excluded them by +// construction; the GetFileNodes-based path does not). +func isNonDefinitionNode(k graph.NodeKind) bool { + switch k { + case graph.KindFile, graph.KindImport, graph.KindLocal, + graph.KindParam, graph.KindClosure, graph.KindGenericParam, + graph.KindBuiltin: + return true + } + return false +} + +// stripNonDefinitionNodes returns a copy of sg with non-definition +// nodes removed (and edges that reference them dropped). Used by +// handleGetFileSummary to keep its output focused on the symbols a +// file *defines* — the file node and per-statement import nodes are +// useful internals (e.g. for the file-neighbourhood walk that drives +// the disk-backend pushdown) but noise in the agent-visible payload. 
+func stripNonDefinitionNodes(sg *query.SubGraph) *query.SubGraph { + if sg == nil { + return nil + } + keep := make(map[string]bool, len(sg.Nodes)) + nodes := make([]*graph.Node, 0, len(sg.Nodes)) + for _, n := range sg.Nodes { + if n == nil || isNonDefinitionNode(n.Kind) { + continue + } + nodes = append(nodes, n) + keep[n.ID] = true + } + edges := make([]*graph.Edge, 0, len(sg.Edges)) + for _, e := range sg.Edges { + if e == nil || !keep[e.From] || !keep[e.To] { + continue + } + edges = append(edges, e) + } + totalEdges := len(edges) + // Counts-only payloads arrive with Edges == nil and TotalEdges + // already populated by the store. Keep that count — the file + + // import nodes we're stripping pulled some edges with them so it's + // a slight overcount, but the gcx callers that take this path + // only render it as a header scalar, not as anything load-bearing. + if len(sg.Edges) == 0 && sg.TotalEdges > 0 { + totalEdges = sg.TotalEdges + } + return &query.SubGraph{ + Nodes: nodes, + Edges: edges, + TotalNodes: len(nodes), + TotalEdges: totalEdges, + Truncated: sg.Truncated, + CallerNotes: sg.CallerNotes, + } +} + // compactSubGraph formats a SubGraph as compact text. func compactSubGraph(sg *query.SubGraph) string { var b strings.Builder @@ -726,7 +801,7 @@ func (s *Server) registerCoreTools() { mcp.WithString("assist", mcp.Description("LLM assist mode: \"auto\" (default — engages on natural-language queries, skips identifier lookups), \"on\" (force engage), \"off\" (bypass), \"deep\" (on + a body-grounded verification pass that reads candidate code and HONESTLY drops irrelevant matches — slower, may return empty results when nothing genuinely matches). 
Requires an LLM provider configured via `llm.provider` (local / anthropic / openai / ollama / claudecli / gemini / bedrock / deepseek); behaves as \"off\" when none is available.")), mcp.WithBoolean("debug", mcp.Description("When true, attach a `rerank` block to the response carrying per-candidate scores and per-signal contributions from the 11-signal rerank pipeline (bm25, semantic, fan_in, hits, fan_out, churn, community, minhash, api_signature, type_signature, recency, feedback) plus the active per-signal weight map. Off by default; enable to inspect ranking decisions or tune `.gortex.yaml::search::weights`.")), mcp.WithString("query_class", mcp.Description("Advisory hint that tunes the bm25-vs-semantic balance of the rerank: \"auto\" (default — detect from query shape), \"symbol\" (identifier / API lookup — BM25-heavy), \"concept\" (natural-language description — balanced), \"path\" (file-path query — most BM25-heavy), \"signature\" (type/function-signature fragment — BM25-leaning), \"keyword_soup\" (a degenerate boolean OR-list \u2014 suppresses LLM expansion and splits the soup into per-disjunct BM25 fetches; a `query_advice` nudge rides on the response). The class actually used is echoed back as `query_class` in the response.")), - mcp.WithString("expand", mcp.Description("Query-expansion channels: \"both\" (default \u2014 LLM expansion when the assist gate engages, plus the deterministic equivalence-class table), \"equivalence\" (only the LLM-free curated synonym table + per-repo auto-mined concepts), \"llm\" (only LLM expansion), \"off\" (pure BM25, no expansion). 
Equivalence expansion bridges query vocabulary to the words a symbol uses (auth->login, delete->remove) and runs even with no LLM provider configured.")), + mcp.WithString("expand", mcp.Description("Query-expansion channels: \"both\" (default \u2014 LLM expansion when the assist gate engages, plus the deterministic equivalence-class table), \"equivalence\" (only the LLM-free curated synonym table + per-repo auto-mined concepts), \"llm\" (only LLM expansion), \"off\" (pure BM25, no expansion). Equivalence expansion bridges query vocabulary to the words a symbol uses (auth->login, delete->remove) and runs even with no LLM provider configured. For identifier queries (query_class symbol / path / signature) the server auto-disables expansion + vector even when expand is set \u2014 these classes match best on BM25 + exact-name alone.")), mcp.WithString("corpus", mcp.Description("Which corpus to search: \"code\" (default \u2014 code symbols only), \"docs\" (only Markdown prose-section nodes \u2014 the heading-delimited documentation sections), \"all\" (both). With docs/all a prose query matches the right README / guide section by its body text.")), mcp.WithNumber("max_per_file", mcp.Description("Cap how many results a single source file may contribute to the diverse head of the result set (default 3). Hits beyond the cap are demoted below not-yet-capped results — never dropped — so the top of the list spans more files. Set 0 to disable diversification.")), ), @@ -735,7 +810,7 @@ func (s *Server) registerCoreTools() { s.addTool( mcp.NewTool("get_file_summary", - mcp.WithDescription("Use instead of Read to understand a file's role: returns all its symbols and imports without reading source lines."), + mcp.WithDescription("Use instead of Read to understand a file's role: returns the symbols a file defines (functions, methods, types, fields, …) without reading source lines. 
The file node itself and import nodes are excluded — use find_import_path or get_dependencies for import-shape queries."), mcp.WithString("path", mcp.Required(), mcp.Description("Relative file path")), mcp.WithBoolean("compact", mcp.Description("One-line-per-symbol text output (saves 50-70% tokens)")), mcp.WithString("format", mcp.Description("Output format: json (default), gcx (GCX1 compact wire format), or toon")), @@ -1103,7 +1178,15 @@ func (s *Server) handleSearchSymbols(ctx context.Context, req mcp.CallToolReques projectArg = fq.Project } scopeWS, scopeProj := s.resolveQueryScope(ctx, workspaceArg, projectArg) - scope := query.QueryOptions{WorkspaceID: scopeWS, ProjectID: scopeProj} + // Per-phase timing for the search hot path. The struct is populated + // across the engine boundary (BM25 backend call wall-clock attributes + // to BM25*MS in fetchAndMergeBM25Timed; GetNodes / FindName / Fallback + // land here from inside Engine.gatherBackendCandidates) and surfaced + // at the end as a single debug log line. Nil-safe: callers without + // debug logging pay zero overhead. + timings := &query.SearchTimings{} + phaseStart := time.Now() + scope := query.QueryOptions{WorkspaceID: scopeWS, ProjectID: scopeProj, SearchTimings: timings} // Keyword-soup defense: a degenerate boolean / OR-list query // ("A OR B OR 'no access'") defeats ordinary retrieval. Detect it @@ -1120,6 +1203,37 @@ func (s *Server) handleSearchSymbols(ctx context.Context, req mcp.CallToolReques soupReason = "query reads as a boolean OR-list; search ranks best on a single concept or symbol name -- run one query per disjunct, or describe the intent in plain words" } + // Identifier-shape fast path. 
ClassifyQuery is the structural + // detector the rerank uses; QueryClassSymbol / Path / Signature + // are queries where the rerank's classWeightTable already proves + // the semantic channel contributes near-zero signal (0.65 / 0.45 / + // 0.80 vs the baseline 1.00) — see internal/search/rerank/ + // query_kind.go::classWeightTable. For these classes the handler + // forces expansion off and tells the engine to skip the vector + // channel entirely; the rest of the pipeline (BM25 + bundle + + // rerank) is the only path that matters. An explicit + // query_class arg pin on one of these three classes engages the + // fast path too. A soup query never engages the fast path — + // keyword_soup has its own split-disjunct treatment. + // + // Validation of the query_class arg happens here so the early + // gating uses the same validated value the rerank below uses; + // invalid input is rejected before the engine runs. + queryClass := rerank.ClassifyQuery(q) + if qcArg := strings.TrimSpace(req.GetString("query_class", "")); qcArg != "" { + parsed, ok := rerank.ParseQueryClass(qcArg) + if !ok { + return mcp.NewToolResultError("invalid query_class: " + qcArg + " (want auto, symbol, concept, path, signature, or keyword_soup)"), nil + } + if parsed != rerank.QueryClassUnknown { + queryClass = parsed + } + } + identifierFastPath := !isSoup && isIdentifierClass(queryClass) + if identifierFastPath { + scope.SkipVectorChannel = true + } + // LLM assist gate: decides whether the expansion + rerank passes // run for this query. The service-enabled check is layered inside // the helpers so a stub build is a clean bypass. A soup query @@ -1129,6 +1243,14 @@ func (s *Server) handleSearchSymbols(ctx context.Context, req mcp.CallToolReques // expand mode picks which query-expansion channels run -- LLM, // the deterministic equivalence table, both (default), or off. 
expand := parseExpandMode(req) + // Identifier-shape queries skip every expansion channel — the + // rerank's classWeightTable shows BM25 is near-perfect for these + // classes; expansion would only add the combined-OR fan-out's + // extra backend call without lifting recall on a literal-token + // query. The explicit arg pin still wins for soup / concept. + if identifierFastPath { + expand = expandOff + } engage := shouldEngageAssist(assist, q) && s.llmService != nil && s.llmService.Enabled() if isSoup || !expand.allowsLLMExpansion() { engage = false @@ -1139,6 +1261,14 @@ func (s *Server) handleSearchSymbols(ctx context.Context, req mcp.CallToolReques // Slightly widen the BM25 over-fetch when we're going to // rerank: more head candidates means a more useful reorder. fetchLimit = offset + limit + rerankCap + } else if identifierFastPath { + // Identifier-shape fast path: no expansion, no vector channel, + // no LLM rerank — the only down-stream consumer is the + // structural rerank pipeline scoring a single FTS-ranked head. + // A wide head is wasted work; every extra candidate drags an + // in/out edge pair through the bundle phase. Tighten to + // +5 so the post-filter slack still leaves a full page. + fetchLimit = offset + limit + 5 } // Expansion terms feeding the BM25 OR-merge: LLM-derived synonyms @@ -1162,14 +1292,28 @@ func (s *Server) handleSearchSymbols(ctx context.Context, req mcp.CallToolReques } expandedTerms := mergeExpansionTerms(soupFragments, llmTerms, equivTerms) + // Build the rerank context BEFORE the BM25 fetch so the engine's + // bundle path can seed its edge caches as the BM25 calls land. + // The handler-side applyRerankBoostsTimed reuses this same rctx, + // so the merged candidate set's edges are already cached when + // prepare() runs against the post-filter slice. 
Without this + // pre-fetch construction the engine's bundle would build a + // throwaway cache on each BM25 call and the handler's later + // rerank would still fetch every candidate's edges itself. + rctx := s.buildRerankContext(ctx, q) + scope.RerankContext = rctx + var nodes []*graph.Node var primaryCount int if len(expandedTerms) > 0 { - nodes, primaryCount = fetchAndMergeBM25(s.engineFor(ctx), q, expandedTerms, fetchLimit, scope) + nodes, primaryCount = fetchAndMergeBM25Timed(s.engineFor(ctx), q, expandedTerms, fetchLimit, scope, timings) } else { + bm25Start := time.Now() nodes = s.engineFor(ctx).SearchSymbolsScoped(q, fetchLimit, scope) + timings.BM25PrimaryMS += time.Since(bm25Start).Milliseconds() primaryCount = len(nodes) } + candsAfterGather := len(nodes) mergedCount := len(nodes) // pre-filter; comparable to primaryCount // Apply repo/project/ref filter. @@ -1253,34 +1397,45 @@ func (s *Server) handleSearchSymbols(ctx context.Context, req mcp.CallToolReques // feedback, churn) layer on top once the agent has spent time // in the codebase. Cold queries with no session data fall back // to a structural-only pass. - rctx := s.buildRerankContext(ctx, q) - // Per-class rerank weighting: detect the query class (or honour an - // explicit query_class hint) and pin it on the rerank Context so - // the pipeline scales the bm25 / semantic blend accordingly. - queryClass := rerank.ClassifyQuery(q) - if qcArg := strings.TrimSpace(req.GetString("query_class", "")); qcArg != "" { - parsed, ok := rerank.ParseQueryClass(qcArg) - if !ok { - return mcp.NewToolResultError("invalid query_class: " + qcArg + " (want auto, symbol, concept, path, signature, or keyword_soup)"), nil - } - if parsed != rerank.QueryClassUnknown { - queryClass = parsed - } - } - // A detected soup query reports the keyword_soup class even when - // the caller did not pin it, so the response surfaces the class - // the handler actually treated the query as. 
+ // + // rctx was built above (before the BM25 fetch) so the engine's + // bundle path could seed its edge caches into the same rctx the + // handler-side rerank will read from. + // queryClass was classified + validated at the top of the handler + // so the identifier-shape fast path could read it. Re-apply the + // soup override here — soup detection happens after classification + // and reports keyword_soup regardless of what the structural + // detector thought the query looked like. if isSoup { queryClass = rerank.QueryClassKeywordSoup } - rctx.QueryClass = queryClass + if rctx != nil { + rctx.QueryClass = queryClass + } + candsAfterFilter := len(nodes) + // Capture the post-filter candidate ID set so we can ask the rctx + // what fraction of these candidates' edges were already cached by + // the bundle pre-seed (vs needing prepare's own batched fetch). + // Hit-rate is reported on the debug log as cache_hit_rate. + if rctx != nil { + preIDs := make([]string, 0, len(nodes)) + for _, n := range nodes { + if n != nil { + preIDs = append(preIDs, n.ID) + } + } + timings.CacheHitRate = rctx.EdgeCacheHitRate(preIDs) + } var rerankBreakdown []*rerank.Candidate - nodes = applyRerankBoosts(s, nodes, q, rctx, &rerankBreakdown) + var rerankPrepare, rerankSignals time.Duration + nodes, rerankPrepare, rerankSignals = applyRerankBoostsTimed(s, nodes, q, rctx, &rerankBreakdown) // Per-file diversification: keep one file's many symbols from // monopolising the head of the result set. Runs after the rerank // so demotion acts on final scores; nothing is dropped. + diversifyStart := time.Now() nodes, rerankBreakdown = diversifyByFile(nodes, rerankBreakdown, req.GetInt("max_per_file", defaultMaxPerFile)) + diversifyMS := time.Since(diversifyStart).Milliseconds() // Remember the returned IDs for attribution on later consume calls. // Cap at top limit so unseen "overflow" results don't get credited. 
@@ -1392,6 +1547,53 @@ func (s *Server) handleSearchSymbols(ctx context.Context, req mcp.CallToolReques } resp["rerank"] = encodeRerankBreakdown(pageBreakdown, s.engineFor(ctx).Rerank()) } + + // Per-phase Debug log line — single zap.Debug call carrying every + // timing field for this search_symbols invocation. The bench harness + // greps for the "search_symbols phases" message at --log-level + // debug; production runs at info level pay nothing. Tracked phases: + // BM25 primary / expansion calls (wall-clock around the engine), + // the inner GetNodesByIDs / FindNodesByName / Fallback hops (from + // the engine), rerank prepare (batched edge fetch) and signals + // (in-process scoring), diversify, and the candidate counts at + // gather → filter → final. + if s.logger != nil { + totalMS := time.Since(phaseStart).Milliseconds() + // "BM25 backend" cost = the BM25 wall-clock minus the inner + // phases the engine also accumulated under that call. Negative + // values are clamped to 0 (clock granularity / contention). + // BundleMS is subtracted too — it's a fold of the FTS + nodes + // + edge fetches that, on the legacy path, would have shown up + // in TextBackend / GetNodes / (no field for edges) separately. 
+ bm25Backend := timings.BM25PrimaryMS + timings.BM25ExpansionMS - timings.GetNodesMS - timings.FindNameMS - timings.FallbackMS - timings.BundleMS + if bm25Backend < 0 { + bm25Backend = 0 + } + s.logger.Debug("search_symbols phases", + zap.String("query", q), + zap.Int("expansion_terms", len(expandedTerms)), + zap.Int64("bm25_primary_ms", timings.BM25PrimaryMS), + zap.Int64("bm25_expansion_ms", timings.BM25ExpansionMS), + zap.Int64("bm25_backend_ms", bm25Backend), + zap.Int64("text_backend_ms", timings.TextBackendMS), + zap.Int64("embed_ms", timings.EmbedMS), + zap.Int64("vector_search_ms", timings.VectorSearchMS), + zap.Int64("engine_rerank_ms", timings.EngineRerankMS), + zap.Int64("bundle_ms", timings.BundleMS), + zap.Float64("cache_hit_rate", timings.CacheHitRate), + zap.Int64("get_nodes_ms", timings.GetNodesMS), + zap.Int64("find_name_ms", timings.FindNameMS), + zap.Int64("fallback_ms", timings.FallbackMS), + zap.Int64("rerank_prepare_ms", rerankPrepare.Milliseconds()), // integer ms, same unit as every other *_ms field + zap.Int64("rerank_signals_ms", rerankSignals.Milliseconds()), // zap.Duration would follow the logger's duration encoder instead of ms + zap.Int64("diversify_ms", diversifyMS), + zap.Int64("total_ms", totalMS), + zap.Int("cands_after_gather", candsAfterGather), + zap.Int("cands_after_filter", candsAfterFilter), + zap.Int("cands_final", len(nodes)), + ) + } + return s.respondJSONOrTOON(ctx, req, resp) } @@ -1446,7 +1648,20 @@ func roundTo(v float64, places int) float64 { return float64(int64(v*pow+0.5)) / pow } -func (s *Server) handleGetFileSummary(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { +func (s *Server) handleGetFileSummary(ctx context.Context, req mcp.CallToolRequest) (res *mcp.CallToolResult, retErr error) { + // Defensive panic recovery — get_file_summary has been observed + // to crash the MCP transport in multi-repo mode (file-content + // validation gap). Surface the panic as a tool error so the + // session survives.
+ defer func() { + if r := recover(); r != nil { + // s.logger is nil-checked elsewhere in this file, so guard it + // here too: a nil logger must not re-panic inside the + // recovery path and take the session down anyway. + if s.logger != nil { + s.logger.Error("get_file_summary panic recovered", + zap.String("path", req.GetString("path", "")), + zap.Any("panic", r)) + } + res = mcp.NewToolResultError(fmt.Sprintf("get_file_summary internal error: %v", r)) + retErr = nil + } + }() fp, err := req.RequireString("path") if err != nil { return mcp.NewToolResultError("path is required"), nil @@ -1455,7 +1670,21 @@ func (s *Server) handleGetFileSummary(ctx context.Context, req mcp.CallToolReque // Auto re-index stale file before querying. s.ensureFresh([]string{fp}) - sg := s.engineFor(ctx).GetFileSymbols(fp) + // gcx is the high-volume agent format and only emits total_edges + // in its meta header — never per-edge rows. Route gcx-only calls + // through the count-only path so the disk backends skip + // materialising every adjacent edge across cgo (a 4 000-row + // round-trip on a 500-symbol file becomes two scalar aggregates). + // compact + json paths still take the full SubGraph because + // compact summarises edges per confidence label and json ships + // every edge in the body. + gcxOnly := s.isGCX(ctx, req) && !isCompact(req) + var sg *query.SubGraph + if gcxOnly { + sg = s.engineFor(ctx).GetFileSymbolsCounts(fp) + } else { + sg = s.engineFor(ctx).GetFileSymbols(fp) + } if len(sg.Nodes) == 0 { return mcp.NewToolResultError("no symbols found for file: " + fp), nil } @@ -1470,12 +1699,26 @@ func (s *Server) handleGetFileSummary(ctx context.Context, req mcp.CallToolReque return mcp.NewToolResultError("no symbols found for file in specified scope: " + fp), nil } + // get_file_summary's contract is "what symbols does this file + // define" — the file node itself and import nodes ride on + // GetFileSubGraph because they're useful for other walkers, but + // the encoder layer wants the symbols-only view. The compact + // path already filtered both kinds inline; the cleaner home is + // here so every output format (compact, gcx, json, toon) sees the + // same shape.
+ sg = stripNonDefinitionNodes(sg) + if len(sg.Nodes) == 0 { + return mcp.NewToolResultError("no symbols found for file: " + fp), nil + } + if isCompact(req) { return mcp.NewToolResultText(compactSubGraph(sg)), nil } - // ETag conditional fetch. - etag := computeETag(sg) + // ETag conditional fetch. Use the structural SubGraph hash — + // json.Marshal'ing the whole SubGraph + Meta on every call was the + // dominant cost on large files (~2 ms / call on a 500-symbol file). + etag := etagSubGraph(sg) if ifNoneMatch := req.GetString("if_none_match", ""); ifNoneMatch != "" && ifNoneMatch == etag { return notModifiedResult(etag), nil } diff --git a/internal/mcp/tools_coupling.go b/internal/mcp/tools_coupling.go index 4618fb5e..a8e7ba65 100644 --- a/internal/mcp/tools_coupling.go +++ b/internal/mcp/tools_coupling.go @@ -14,9 +14,9 @@ import ( // classic Robert C. Martin metrics computed per package or // community. // -// Ca (afferent coupling) — how many external units depend on us -// Ce (efferent coupling) — how many external units we depend on -// I (instability) — Ce / (Ca + Ce). 0 = max stable, 1 = max unstable +// Ca (afferent coupling) — how many external units depend on us +// Ce (efferent coupling) — how many external units we depend on +// I (instability) — Ce / (Ca + Ce). 0 = max stable, 1 = max unstable // // The painful packages are the ones with **high Ca + high I** — // load-bearing and changing all the time. 
The tool returns rows @@ -97,25 +97,45 @@ func (s *Server) handleGetCouplingMetrics(ctx context.Context, req mcp.CallToolR stats[u] = &units{ca: map[string]bool{}, ce: map[string]bool{}} } - for _, e := range s.graph.AllEdges() { - if !isCouplingEdge(e.Kind) { - continue - } - fromUnit, fromOK := nodeToUnit[e.From] - toUnit, toOK := nodeToUnit[e.To] - if !fromOK || !toOK { - continue - } - if fromUnit == toUnit { - stats[fromUnit].internal++ + // Iterate the coupling-edge buckets directly via EdgesByKind + // instead of AllEdges() + a Go-side filter — the disk backend's + // EdgesByKind runs one indexed query per kind and ships only + // the matching rows. Structural edges (defines / member_of / + // contains-file-of-symbol) which dominate edge counts on large + // repos drop out before they cross the storage boundary. Order is fixed so the + // loop body stays trivially identical to the legacy AllEdges + // branch. + for _, k := range []graph.EdgeKind{ + graph.EdgeCalls, + graph.EdgeImports, + graph.EdgeImplements, + graph.EdgeExtends, + graph.EdgeReferences, + graph.EdgeInstantiates, + graph.EdgeCrossRepoCalls, + graph.EdgeCrossRepoImplements, + graph.EdgeCrossRepoExtends, + } { + for e := range s.graph.EdgesByKind(k) { + if e == nil { + continue + } + fromUnit, fromOK := nodeToUnit[e.From] + toUnit, toOK := nodeToUnit[e.To] + if !fromOK || !toOK { + continue + } + if fromUnit == toUnit { + stats[fromUnit].internal++ + stats[fromUnit].total++ + continue + } + // Cross-unit: counts as ce for the source unit, ca for the target. + stats[fromUnit].ce[toUnit] = true stats[fromUnit].total++ - continue + stats[toUnit].ca[fromUnit] = true + stats[toUnit].total++ } - // Cross-unit: counts as ce for the source unit, ca for the target. 
- stats[fromUnit].ce[toUnit] = true - stats[fromUnit].total++ - stats[toUnit].ca[fromUnit] = true - stats[toUnit].total++ } rows := make([]couplingRow, 0, len(stats)) @@ -178,11 +198,11 @@ func (s *Server) handleGetCouplingMetrics(ctx context.Context, req mcp.CallToolR } return s.respondJSONOrTOON(ctx, req, map[string]any{ - "units": rows, - "total": len(rows), - "truncated": truncated, - "unit_kind": unitKind, - "sort_by": sortBy, + "units": rows, + "total": len(rows), + "truncated": truncated, + "unit_kind": unitKind, + "sort_by": sortBy, }) } @@ -212,22 +232,3 @@ func packageOfPath(path string, depth int) string { } return strings.Join(parts[:depth], "/") } - -// isCouplingEdge identifies edges that signal real dependency -// — calls, imports, implements, extends, references, instantiates. -// Structural edges (defines, member_of) don't count. -func isCouplingEdge(k graph.EdgeKind) bool { - switch k { - case graph.EdgeCalls, - graph.EdgeImports, - graph.EdgeImplements, - graph.EdgeExtends, - graph.EdgeReferences, - graph.EdgeInstantiates, - graph.EdgeCrossRepoCalls, - graph.EdgeCrossRepoImplements, - graph.EdgeCrossRepoExtends: - return true - } - return false -} diff --git a/internal/mcp/tools_enhancements.go b/internal/mcp/tools_enhancements.go index 88557812..0c7808c4 100644 --- a/internal/mcp/tools_enhancements.go +++ b/internal/mcp/tools_enhancements.go @@ -8,6 +8,7 @@ import ( "math" "os" "path/filepath" + "slices" "sort" "strings" "time" @@ -22,7 +23,6 @@ import ( "github.com/zzet/gortex/internal/graph" "github.com/zzet/gortex/internal/persistence" "github.com/zzet/gortex/internal/query" - "github.com/zzet/gortex/internal/releases" "github.com/zzet/gortex/internal/tokens" "go.uber.org/zap" ) @@ -35,6 +35,17 @@ func (s *Server) ensureFresh(filePaths []string) []string { if s.watcher != nil { return nil } + // In multi-repo mode the legacy single-Indexer's fileMtimes is + // always empty for cross-repo paths, so IsStale returns true for + // every file → 
IndexFile fires → race with the daemon's read + // surface, which has been observed to crash the MCP transport + // (a concurrency hazard against the live read surface). The MultiIndexer's own + // per-repo watcher / Reconcile path owns freshness here; the + // single-Indexer auto-refresh is dead weight that does more harm + // than good. + if s.multiIndexer != nil { + return nil + } if s.indexer == nil { return nil } @@ -145,7 +156,7 @@ func (s *Server) registerEnhancementTools() { mcp.WithNumber("min_pct", mcp.Description("(coverage_gaps) Lower-inclusive coverage threshold — default 0")), mcp.WithNumber("max_pct", mcp.Description("(coverage_gaps) Upper-exclusive coverage threshold — default 100, i.e. anything not fully covered")), mcp.WithString("provider", mcp.Description("(stale_flags) Filter to a single provider — launchdarkly, growthbook, unleash, internal")), - mcp.WithString("tag", mcp.Description("(todos) Filter by tag — TODO / FIXME / HACK / XXX / NOTE — case-insensitive")), + mcp.WithString("tag", mcp.Description("(todos) Filter by tag — TODO / FIXME / HACK / XXX / NOTE — case-insensitive. (releases) Filter to one release tag — returns the file list whose meta.added_in matches; populate via enrich_releases first.")), mcp.WithString("assignee", mcp.Description("(todos) Filter by exact assignee — case-sensitive")), mcp.WithString("ticket", mcp.Description("(todos) Filter by exact ticket reference — e.g. PROJ-42")), mcp.WithBoolean("has_assignee", mcp.Description("(todos) Keep only TODOs that have an assignee set")), @@ -443,7 +454,7 @@ func (s *Server) handlePrefetchContext(ctx context.Context, req mcp.CallToolRequ // Gather recent symbols from parameter or session state. 
var recentIDs []string if recentStr != "" { - for _, id := range strings.Split(recentStr, ",") { + for id := range strings.SplitSeq(recentStr, ",") { recentIDs = append(recentIDs, strings.TrimSpace(id)) } } @@ -578,14 +589,7 @@ func (s *Server) handlePrefetchContext(ctx context.Context, req mcp.CallToolRequ var candidates []prefetchCandidate for id, sc := range scoreMap { // Exclude recently viewed symbols themselves - isRecent := false - for _, rid := range recentIDs { - if id == rid { - isRecent = true - break - } - } - if isRecent { + if slices.Contains(recentIDs, id) { continue } @@ -629,14 +633,8 @@ func (s *Server) handlePrefetchContext(ctx context.Context, req mcp.CallToolRequ if limit <= 0 { limit = 10 } - offset := decodeCursor(req.GetString("cursor", "")) - if offset > totalCount { - offset = totalCount - } - endIdx := offset + limit - if endIdx > totalCount { - endIdx = totalCount - } + offset := min(decodeCursor(req.GetString("cursor", "")), totalCount) + endIdx := min(offset+limit, totalCount) candidates = candidates[offset:endIdx] truncated := endIdx < totalCount nextCursor := "" @@ -697,7 +695,7 @@ func (s *Server) handlePrefetchContext(ctx context.Context, req mcp.CallToolRequ func (s *Server) handleAnalyze(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { kind, err := req.RequireString("kind") if err != nil { - return mcp.NewToolResultError("kind is required (one of: dead_code, hotspots, cycles, would_create_cycle, todos, blame, coverage, stale_code, ownership, coverage_gaps, stale_flags, releases, cgo_users, wasm_users, orphan_tables, unreferenced_tables, coverage_summary, channel_ops, goroutine_spawns, field_writers, race_writes, unclosed_channels, unsafe_patterns, health_score, annotation_users, config_readers, event_emitters, pubsub, string_emitters, error_surface, log_events, sql_rebuild, external_calls, routes, models, components, k8s_resources, images, kustomize, cross_repo, impact, named, tests_as_edges, 
connectivity_health)"), nil + return mcp.NewToolResultError("kind is required (one of: dead_code, hotspots, cycles, would_create_cycle, todos, blame, coverage, stale_code, ownership, coverage_gaps, stale_flags, releases, cgo_users, wasm_users, orphan_tables, unreferenced_tables, coverage_summary, channel_ops, goroutine_spawns, field_writers, race_writes, unclosed_channels, unsafe_patterns, health_score, annotation_users, config_readers, event_emitters, pubsub, string_emitters, error_surface, log_events, sql_rebuild, external_calls, routes, models, components, k8s_resources, images, kustomize, cross_repo, impact, named, tests_as_edges, connectivity_health, pagerank, louvain, wcc, scc, kcore)"), nil } switch kind { case "dead_code": @@ -810,8 +808,18 @@ func (s *Server) handleAnalyze(ctx context.Context, req mcp.CallToolRequest) (*m return s.handleAnalyzeTestsAsEdges(ctx, req) case "connectivity_health": return s.handleAnalyzeConnectivityHealth(ctx, req) + case "pagerank": + return s.handleAnalyzePageRank(ctx, req) + case "louvain": + return s.handleAnalyzeLouvain(ctx, req) + case "wcc": + return s.handleAnalyzeConnectedComponents(ctx, req, false) + case "scc": + return s.handleAnalyzeConnectedComponents(ctx, req, true) + case "kcore": + return s.handleAnalyzeKCore(ctx, req) default: - return mcp.NewToolResultError("unknown analyze kind: " + kind + " (expected: dead_code, hotspots, cycles, would_create_cycle, todos, blame, coverage, stale_code, ownership, coverage_gaps, stale_flags, releases, cgo_users, wasm_users, orphan_tables, unreferenced_tables, coverage_summary, channel_ops, goroutine_spawns, field_writers, race_writes, unclosed_channels, unsafe_patterns, sast, hygiene, health_score, annotation_users, config_readers, env_var_users, sql_call_sites, fixes_history, edge_audit, domain, event_emitters, pubsub, string_emitters, error_surface, log_events, sql_rebuild, external_calls, routes, models, components, k8s_resources, images, kustomize, cross_repo, dbt_models, 
impact, named, tests_as_edges, connectivity_health)"), nil + return mcp.NewToolResultError("unknown analyze kind: " + kind + " (expected: dead_code, hotspots, cycles, would_create_cycle, todos, blame, coverage, stale_code, ownership, coverage_gaps, stale_flags, releases, cgo_users, wasm_users, orphan_tables, unreferenced_tables, coverage_summary, channel_ops, goroutine_spawns, field_writers, race_writes, unclosed_channels, unsafe_patterns, sast, hygiene, health_score, annotation_users, config_readers, env_var_users, sql_call_sites, fixes_history, edge_audit, domain, event_emitters, pubsub, string_emitters, error_surface, log_events, sql_rebuild, external_calls, routes, models, components, k8s_resources, images, kustomize, cross_repo, dbt_models, impact, named, tests_as_edges, connectivity_health, pagerank, louvain, wcc, scc, kcore)"), nil } } @@ -847,10 +855,10 @@ func (s *Server) handleAnalyzeTodos(ctx context.Context, req mcp.CallToolRequest } var rows []todoRow - for _, n := range s.scopedNodes(ctx) { - if n.Kind != graph.KindTodo { - continue - } + // Push the kind filter into the storage layer — todos are a + // tiny slice of the node table, so the AllNodes scan was the + // dominant cost on a disk backend. + for _, n := range s.scopedNodesByKinds(ctx, []graph.NodeKind{graph.KindTodo}) { tag, _ := n.Meta["tag"].(string) assignee, _ := n.Meta["assignee"].(string) ticket, _ := n.Meta["ticket"].(string) @@ -1006,33 +1014,24 @@ func (s *Server) handleAnalyzeStaleCode(ctx context.Context, req mcp.CallToolReq AgeDays int `json:"age_days"` } var rows []staleRow - for _, n := range s.scopedNodes(ctx) { - if _, ok := allowedKinds[n.Kind]; !ok { - continue - } - la, ok := n.Meta["last_authored"].(map[string]any) - if !ok { + // Push the kind filter into the storage layer; the meta gate + // (last_authored.timestamp) stays in Go since the meta column is + // opaque to the query layer. 
+ blame := blameRowsByID(s.graph) + for _, n := range s.scopedNodesByKinds(ctx, allowedKindsSlice(allowedKinds)) { + la, ok := lastAuthoredFrom(blame, n) + if !ok || la.Timestamp == 0 { continue } - ts, ok := la["timestamp"].(int64) - if !ok { - // JSON unmarshal lands ints as float64 in some paths; - // accept both shapes so the analyzer works on graphs - // loaded from snapshots and graphs enriched in-process. - if f, isFloat := la["timestamp"].(float64); isFloat { - ts = int64(f) - } else { - continue - } - } + ts := la.Timestamp if ts > cutoffSec { continue } - email, _ := la["email"].(string) + email := la.Email if emailFilter != "" && email != emailFilter { continue } - commit, _ := la["commit"].(string) + commit := la.Commit ageSec := time.Now().Unix() - ts rows = append(rows, staleRow{ ID: n.ID, @@ -1069,6 +1068,21 @@ func (s *Server) handleAnalyzeStaleCode(ctx context.Context, req mcp.CallToolReq }) } +// allowedKindsSlice returns the keys of an analyzer's allowedKinds +// set so the caller can hand them to scopedNodesByKinds. Kept as a +// helper rather than inlined at every call site so the order is +// deterministic — not load-bearing for correctness (the capability +// dedupes), but it keeps test expectations stable when the IN list +// is logged. +func allowedKindsSlice(allowed map[graph.NodeKind]struct{}) []graph.NodeKind { + out := make([]graph.NodeKind, 0, len(allowed)) + for k := range allowed { + out = append(out, k) + } + slices.Sort(out) + return out +} + // parseAnalyzeKindsFilter parses a comma-separated kinds argument // into the set used by handleAnalyzeStaleCode. The literal "all" // returns the broadest blame-eligible kind set so callers can drop @@ -1076,7 +1090,7 @@ func (s *Server) handleAnalyzeStaleCode(ctx context.Context, req mcp.CallToolReq // fields included too. 
func parseAnalyzeKindsFilter(arg string) map[graph.NodeKind]struct{} { out := map[graph.NodeKind]struct{}{} - for _, k := range strings.Split(arg, ",") { + for k := range strings.SplitSeq(arg, ",") { k = strings.TrimSpace(strings.ToLower(k)) if k == "" { continue @@ -1144,22 +1158,23 @@ func (s *Server) handleAnalyzeOwnership(ctx context.Context, req mcp.CallToolReq } byEmail := map[string]*ownerStats{} - for _, n := range s.scopedNodes(ctx) { - if _, ok := allowedKinds[n.Kind]; !ok { - continue - } + // Kind pushdown — owners are derived from the blame meta on + // function/method (or wider) nodes; the analyzer scans tens of + // thousands of irrelevant nodes without it on a disk backend. + ownBlame := blameRowsByID(s.graph) + for _, n := range s.scopedNodesByKinds(ctx, allowedKindsSlice(allowedKinds)) { if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { continue } - la, ok := n.Meta["last_authored"].(map[string]any) + la, ok := lastAuthoredFrom(ownBlame, n) if !ok { continue } - email, _ := la["email"].(string) + email := la.Email if email == "" { continue } - ts := tsFromMeta(la["timestamp"]) + ts := la.Timestamp if ts == 0 { continue } @@ -1286,14 +1301,14 @@ func (s *Server) handleAnalyzeCoverageGaps(ctx context.Context, req mcp.CallTool Hit int `json:"hit"` } var rows []gapRow - for _, n := range s.scopedNodes(ctx) { - if _, ok := allowedKinds[n.Kind]; !ok { - continue - } + covRows := s.coverageByID() + // Kind pushdown — coverage_pct only ever lands on executable + // kinds, so the IN-list IS the candidate set. 
+ for _, n := range s.scopedNodesByKinds(ctx, allowedKindsSlice(allowedKinds)) { if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { continue } - pct, ok := n.Meta["coverage_pct"].(float64) + pct, ok := coveragePctFrom(covRows, n) if !ok { continue } @@ -1306,7 +1321,10 @@ func (s *Server) handleAnalyzeCoverageGaps(ctx context.Context, req mcp.CallTool Line: n.StartLine, Pct: pct, } - if cov, ok := n.Meta["coverage"].(map[string]any); ok { + if e, ok := covRows[n.ID]; ok { + row.NumStmt = e.NumStmt + row.Hit = e.Hit + } else if cov, ok := n.Meta["coverage"].(map[string]any); ok { if v, ok := cov["num_stmt"].(int); ok { row.NumStmt = v } else if f, ok := cov["num_stmt"].(float64); ok { @@ -1401,10 +1419,13 @@ func (s *Server) handleAnalyzeStaleFlags(ctx context.Context, req mcp.CallToolRe var rows []staleFlag unscored := 0 - for _, n := range s.scopedNodes(ctx) { - if n.Kind != graph.KindFlag { - continue - } + // Kind pushdown — KindFlag is a few hundred nodes max even on + // the biggest workspaces, so pulling AllNodes() to find them + // was pure overhead. The caller batch below still does per- + // flag GetInEdges; pushing that into a single query join is a + // separate follow-up since the join semantics differ per flag. 
+ flagBlame := blameRowsByID(s.graph) + for _, n := range s.scopedNodesByKinds(ctx, []graph.NodeKind{graph.KindFlag}) { provider, _ := n.Meta["provider"].(string) if providerFilter != "" && provider != providerFilter { continue @@ -1437,11 +1458,11 @@ func (s *Server) handleAnalyzeStaleFlags(ctx context.Context, req mcp.CallToolRe if caller == nil { continue } - la, ok := caller.Meta["last_authored"].(map[string]any) + la, ok := lastAuthoredFrom(flagBlame, caller) if !ok { continue } - ts := tsFromMeta(la["timestamp"]) + ts := la.Timestamp if ts == 0 { continue } @@ -1536,10 +1557,9 @@ func (s *Server) handleAnalyzeOrphanTables(ctx context.Context, req mcp.CallTool QueryCount int `json:"query_count"` } var rows []orphanRow - for _, n := range s.scopedNodes(ctx) { - if n.Kind != graph.KindTable { - continue - } + // Kind pushdown — only KindTable carries the providers/queries + // fan-in we care about; the rest of the node table is noise. + for _, n := range s.scopedNodesByKinds(ctx, []graph.NodeKind{graph.KindTable}) { // Walk incoming edges to detect both providers (migrations) // and consumers (query call sites). hasProvider := false @@ -1617,10 +1637,8 @@ func (s *Server) handleAnalyzeUnreferencedTables(ctx context.Context, req mcp.Ca ProviderCount int `json:"provider_count"` } var rows []unrefRow - for _, n := range s.scopedNodes(ctx) { - if n.Kind != graph.KindTable { - continue - } + // Kind pushdown — same story as orphan_tables. 
+ for _, n := range s.scopedNodesByKinds(ctx, []graph.NodeKind{graph.KindTable}) { providerCount := 0 queryCount := 0 for _, e := range s.graph.GetInEdges(n.ID) { @@ -1703,15 +1721,14 @@ func (s *Server) handleAnalyzeCoverageSummary(ctx context.Context, req mcp.CallT sumPct float64 // running sum, hidden from JSON } byDir := map[string]*dirStats{} + covRows := s.coverageByID() - for _, n := range s.scopedNodes(ctx) { - if _, ok := allowedKinds[n.Kind]; !ok { - continue - } + // Kind pushdown — coverage_pct only lives on executable kinds. + for _, n := range s.scopedNodesByKinds(ctx, allowedKindsSlice(allowedKinds)) { if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { continue } - pct, ok := n.Meta["coverage_pct"].(float64) + pct, ok := coveragePctFrom(covRows, n) if !ok { continue } @@ -1797,10 +1814,10 @@ func (s *Server) handleAnalyzeInteropUsers(ctx context.Context, req mcp.CallTool ID string `json:"id"` } var rows []interopFile - for _, n := range s.scopedNodes(ctx) { - if n.Kind != graph.KindFile { - continue - } + // Kind pushdown — uses_cgo / uses_wasm_bindgen sentinels only + // live on file nodes; pulling AllNodes() to find them was pure + // overhead on a disk backend. + for _, n := range s.scopedNodesByKinds(ctx, []graph.NodeKind{graph.KindFile}) { if v, _ := n.Meta[metaKey].(bool); !v { continue } @@ -1830,34 +1847,161 @@ func (s *Server) handleAnalyzeInteropUsers(ctx context.Context, req mcp.CallTool }) } -// handleAnalyzeReleases walks git tags chronologically and stamps -// meta.added_in on every file node with the earliest tag whose -// tree contained that file. Symbols inherit indirectly via their -// owning file — answers "added in v1.4?" with one graph hop from -// any symbol to its file. Re-runnable: each call re-walks tags -// and overwrites existing meta. +// handleAnalyzeReleases reads the pre-computed release timeline from +// the graph. 
Inputs come from meta.added_in (stamped on KindFile +// nodes) and the KindRelease nodes the enricher materialises — one +// per tag, ordered, carrying file_count metadata. No git subprocess +// at read time. +// +// When nothing in scope carries release metadata the tool returns a +// structured error pointing the agent at `enrich_releases` (or the +// `gortex enrich releases` CLI) rather than silently returning an +// empty result; the latter would look like "this repo has no +// releases" even when the cause is "you haven't enriched yet". +// +// Optional filter `tag` returns only the named release with the list +// of files whose meta.added_in matches it — answers "what shipped in +// v1.4?" with a single graph scan. func (s *Server) handleAnalyzeReleases(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { - roots := s.collectRepoRoots(req.GetString("repo", "")) - if len(roots) == 0 { - return mcp.NewToolResultError("releases enrichment requires at least one indexed repo with a root path"), nil - } - total := 0 - perRepo := make(map[string]any, len(roots)) - for prefix, root := range roots { - count, err := releases.EnrichGraphWithRepoPrefix(s.graph, root, prefix) - if err != nil { - perRepo[prefix] = map[string]any{"root": root, "error": err.Error()} + if s.graph == nil { + return mcp.NewToolResultError("graph not initialized"), nil + } + repoFilter := strings.TrimSpace(req.GetString("repo", "")) + tagFilter := strings.TrimSpace(req.GetString("tag", "")) + + type releaseRow struct { + ID string `json:"id"` + Tag string `json:"tag"` + RepoPrefix string `json:"repo_prefix,omitempty"` + FileCount int `json:"file_count"` + Order int `json:"order"` + Files []string `json:"files,omitempty"` + } + releaseByTag := map[string]*releaseRow{} + for _, n := range s.graph.AllNodes() { + if n.Kind != graph.KindRelease { continue } - total += count - perRepo[prefix] = map[string]any{"root": root, "enriched": count} + if repoFilter != "" && 
n.RepoPrefix != repoFilter { + continue + } + row := &releaseRow{ + ID: n.ID, + Tag: n.Name, + RepoPrefix: n.RepoPrefix, + } + if n.Meta != nil { + row.FileCount = intFromAny(n.Meta["file_count"]) + row.Order = intFromAny(n.Meta["order"]) + } + key := releaseKey(n.RepoPrefix, n.Name) + releaseByTag[key] = row + } + + if tagFilter != "" { + // Caller wants the file list for one release. We surface it + // from meta.added_in rather than a tree walk, so the answer + // is whatever the last enrich pass observed. + row, ok := releaseByTag[releaseKey(repoFilter, tagFilter)] + if !ok { + // Tolerate the no-prefix form: agents pass "v1.4" without + // realising multi-repo releases are keyed as + // "<repo_prefix>/v1.4". Fall back to a tag-name-only match. + // Map iteration order is random, so when several repos + // carry the same tag pick the smallest key to keep the + // answer stable across calls. + bestKey := "" + for k, r := range releaseByTag { + if r.Tag != tagFilter { + continue + } + if row == nil || k < bestKey { + row, bestKey = r, k + } + } + } + if row == nil { + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "error": fmt.Sprintf("no KindRelease node for tag %q; run `enrich_releases` first", tagFilter), + "suggestion": "enrich_releases", + "releases": []releaseRow{}, + "total": 0, + }) + } + relByID := s.releaseByID() + for _, n := range s.graph.AllNodes() { + if n.Kind != graph.KindFile || n.FilePath == "" { + continue + } + if repoFilter != "" && n.RepoPrefix != repoFilter { + continue + } + // Stay inside the matched release's repo: another repo + // may carry the same tag name, and its files must not + // be attributed to this release. + if row.RepoPrefix != "" && n.RepoPrefix != row.RepoPrefix { + continue + } + added, ok := addedInFrom(relByID, n) + if !ok || added != row.Tag { + continue + } + row.Files = append(row.Files, n.FilePath) + } + sort.Strings(row.Files) + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "releases": []releaseRow{*row}, + "total": 1, + "tag": tagFilter, + "file_hits": len(row.Files), + }) + } + + // No tag filter: return the timeline. Use `order` (oldest=0) so + // callers can flip to newest-first via reverse. + if len(releaseByTag) == 0 { + // Distinguish "no enrichment yet" from "repo has no tags" by + // peeking at any file's meta.added_in.
If even one file has + // the field set the enrichment ran and produced no releases + // (an unlikely combination; surface as an empty timeline); + // otherwise return the structured error. + hasAnyAddedIn := false + if relByID := s.releaseByID(); len(relByID) > 0 { + hasAnyAddedIn = true + } else { + for _, n := range s.graph.AllNodes() { + if n.Kind == graph.KindFile && n.Meta != nil { + if _, ok := n.Meta["added_in"].(string); ok { + hasAnyAddedIn = true + break + } + } + } + } + if !hasAnyAddedIn { + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "error": "no release timeline in scope; run `enrich_releases` (or `gortex enrich releases`) to populate KindRelease nodes and meta.added_in", + "suggestion": "enrich_releases", + "releases": []releaseRow{}, + "total": 0, + }) + } } + rows := make([]releaseRow, 0, len(releaseByTag)) + for _, r := range releaseByTag { + rows = append(rows, *r) + } + sort.Slice(rows, func(i, j int) bool { + if rows[i].Order != rows[j].Order { + return rows[i].Order < rows[j].Order + } + return rows[i].Tag < rows[j].Tag + }) return s.respondJSONOrTOON(ctx, req, map[string]any{ - "enriched": total, - "per_repo": perRepo, + "releases": rows, + "total": len(rows), }) } +// releaseKey builds the lookup key from a (repoPrefix, tag) pair so +// the tag-filtered path can compare scoped IDs against the bare +// agent input. 
+func releaseKey(repoPrefix, tag string) string { + if repoPrefix == "" { + return tag + } + return repoPrefix + "/" + tag +} + // handleAnalyzeBlame runs `git blame -p` against the indexed // repository and stamps meta.last_authored on each function / // method / type / interface / field / variable / constant / @@ -2039,7 +2183,12 @@ func (s *Server) handleFindHotspots(ctx context.Context, req mcp.CallToolRequest threshold = v } - entries := analysis.FindHotspots(s.graph, s.getCommunities(), threshold) + var entries []analysis.HotspotEntry + if threshold == 0 { + entries = s.getHotspots() + } else { + entries = analysis.FindHotspots(s.graph, s.getCommunities(), threshold) + } // K17: optional novelty / directional reranking modes. Default // "complexity" preserves the legacy ranking. @@ -2114,7 +2263,7 @@ func (s *Server) handleFindHotspots(ctx context.Context, req mcp.CallToolRequest // multi-repo mode. type scaffoldReader struct{ s *Server } -func (r scaffoldReader) Graph() *graph.Graph { return r.s.graph } +func (r scaffoldReader) Graph() graph.Store { return r.s.graph } func (r scaffoldReader) ResolveFilePath(graphPath string) string { abs, err := r.s.resolveGraphPath(graphPath) if err != nil { @@ -2162,13 +2311,8 @@ func (s *Server) handleScaffold(ctx context.Context, req mcp.CallToolRequest) (* return mcp.NewToolResultError(fmt.Sprintf("could not read %s: %v", edit.FilePath, readErr)), nil } lines := strings.Split(string(content), "\n") - insertIdx := edit.InsertionLine - 1 - if insertIdx < 0 { - insertIdx = 0 - } - if insertIdx > len(lines) { - insertIdx = len(lines) - } + insertIdx := max(edit.InsertionLine-1, 0) + insertIdx = min(insertIdx, len(lines)) newLines := make([]string, 0, len(lines)+strings.Count(edit.Code, "\n")+2) newLines = append(newLines, lines[:insertIdx]...) 
newLines = append(newLines, "") @@ -2520,10 +2664,7 @@ func (s *Server) buildIndexHealthPayload() map[string]any { } } - successfullyIndexed := totalDetected - len(parseErrors) - if successfullyIndexed < 0 { - successfullyIndexed = 0 - } + successfullyIndexed := max(totalDetected-len(parseErrors), 0) var healthScore float64 if totalDetected > 0 { @@ -2886,10 +3027,7 @@ func (s *Server) handleBatchEdit(ctx context.Context, req mcp.CallToolRequest) ( for i := 0; i < node.StartLine-1 && i < len(lines); i++ { symbolStart += len(lines[i]) + 1 } - symbolEnd := symbolStart + len(symbolSource) - if symbolEnd > len(fileStr) { - symbolEnd = len(fileStr) - } + symbolEnd := min(symbolStart+len(symbolSource), len(fileStr)) offset := strings.Index(fileStr[symbolStart:symbolEnd], o.edit.OldSource) if offset < 0 { @@ -3063,10 +3201,7 @@ func (s *Server) handleGetContracts(ctx context.Context, req mcp.CallToolRequest if contractsOffset > contractsTotal { contractsOffset = contractsTotal } - contractsEnd := contractsOffset + contractsLimit - if contractsEnd > contractsTotal { - contractsEnd = contractsTotal - } + contractsEnd := min(contractsOffset+contractsLimit, contractsTotal) filtered = filtered[contractsOffset:contractsEnd] contractsTruncated := contractsEnd < contractsTotal contractsNextCursor := "" @@ -3703,3 +3838,101 @@ func (s *Server) handleAuditAgentConfig(ctx context.Context, req mcp.CallToolReq return s.respondJSONOrTOON(ctx, req, report) } + +// coverageByID batch-loads the coverage sidecar (change A) into an +// id->row map; nil when the backend lacks the capability (callers then +// fall back to Node.Meta). One read per handler call, not per-node. 
+func (s *Server) coverageByID() map[string]graph.CoverageEnrichment { + r, ok := s.graph.(graph.CoverageEnrichmentReader) + if !ok { + return nil + } + rows := r.CoverageRows("") + m := make(map[string]graph.CoverageEnrichment, len(rows)) + for _, e := range rows { + m[e.NodeID] = e + } + return m +} + +// coveragePctFrom returns a node's coverage %, preferring the sidecar map +// and falling back to Meta["coverage_pct"] for un-migrated DBs. +func coveragePctFrom(cov map[string]graph.CoverageEnrichment, n *graph.Node) (float64, bool) { + if e, ok := cov[n.ID]; ok { + return e.CoveragePct, true + } + if pct, ok := n.Meta["coverage_pct"].(float64); ok { + return pct, true + } + return 0, false +} + +// releaseByID batch-loads the release sidecar (change A) into an +// id->tag map; nil when the backend lacks the capability. +func (s *Server) releaseByID() map[string]string { + r, ok := s.graph.(graph.ReleaseEnrichmentReader) + if !ok { + return nil + } + rows := r.ReleaseRows("") + m := make(map[string]string, len(rows)) + for _, e := range rows { + m[e.NodeID] = e.AddedIn + } + return m +} + +// addedInFrom returns a node's "added_in" tag, preferring the sidecar +// map and falling back to Meta["added_in"] for un-migrated DBs. +func addedInFrom(rel map[string]string, n *graph.Node) (string, bool) { + if tag, ok := rel[n.ID]; ok { + return tag, true + } + if n.Meta != nil { + if tag, ok := n.Meta["added_in"].(string); ok { + return tag, true + } + } + return "", false +} + +// blameRowsByID batch-loads the blame sidecar (change A) into an +// id->row map; nil when the backend lacks the capability. 
+func blameRowsByID(g graph.Store) map[string]graph.BlameEnrichment { + r, ok := g.(graph.BlameEnrichmentReader) + if !ok { + return nil + } + rows := r.BlameRows("") + m := make(map[string]graph.BlameEnrichment, len(rows)) + for _, e := range rows { + m[e.NodeID] = e + } + return m +} + +// lastAuthoredFrom returns a node's blame, preferring the sidecar map and +// falling back to Meta["last_authored"] for un-migrated DBs. +func lastAuthoredFrom(blame map[string]graph.BlameEnrichment, n *graph.Node) (graph.BlameEnrichment, bool) { + if e, ok := blame[n.ID]; ok { + return e, true + } + if n.Meta != nil { + if la, ok := n.Meta["last_authored"].(map[string]any); ok { + e := graph.BlameEnrichment{NodeID: n.ID} + e.Commit, _ = la["commit"].(string) + e.Email, _ = la["email"].(string) + e.Timestamp = tsFromMeta(la["timestamp"]) + return e, true + } + } + return graph.BlameEnrichment{}, false +} + +// lastAuthoredTSFrom is the timestamp-only convenience over lastAuthoredFrom. +func lastAuthoredTSFrom(blame map[string]graph.BlameEnrichment, n *graph.Node) (int64, bool) { + if e, ok := lastAuthoredFrom(blame, n); ok && e.Timestamp != 0 { + return e.Timestamp, true + } + return 0, false +} diff --git a/internal/mcp/tools_enrich_churn.go b/internal/mcp/tools_enrich_churn.go new file mode 100644 index 00000000..ec3d8f9b --- /dev/null +++ b/internal/mcp/tools_enrich_churn.go @@ -0,0 +1,102 @@ +package mcp + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/mark3labs/mcp-go/mcp" + + "github.com/zzet/gortex/internal/churn" +) + +// registerEnrichChurnTool exposes the churn enricher as an MCP tool so +// agents (and the post-commit / post-merge git hook driving `gortex +// enrich churn`) can refresh per-symbol churn data without going +// through the daemon control socket. 
The handler runs the enricher +// in-process against s.graph, so it inherits whatever backend the +// daemon was launched with — the on-disk backend for persistence, in-memory for +// CI / one-off invocations. +// +// The accompanying `get_churn_rate` tool reads from the same +// meta.churn fields this tool writes; pre-computation here is what +// makes the read path a sub-second graph scan. +func (s *Server) registerEnrichChurnTool() { + s.addTool( + mcp.NewTool("enrich_churn", + mcp.WithDescription("Pre-compute per-file and per-symbol git churn data and stamp it on graph nodes so `get_churn_rate` can answer without a git subprocess. Walks `git log ` and `git blame ` once per file, then projects line-range commit counts onto every function/method node. The branch is the repository's default branch (origin/main, then origin/master, then local main/master/trunk) unless `branch` overrides. Idempotent: re-running updates the same Meta fields in place. Disk-backed daemons (sqlite) persist the result across restarts; in-memory daemons recompute on next call."), + mcp.WithString("branch", mcp.Description("Branch / tag / SHA to compute churn against. Empty means resolve the repository's default branch.")), + mcp.WithString("path", mcp.Description("Optional path or repo prefix to scope the enrichment. Multi-repo daemons enrich every tracked repo when empty.")), + mcp.WithString("format", mcp.Description("Output format: json (default), gcx, or toon")), + ), + s.handleEnrichChurn, + ) +} + +func (s *Server) handleEnrichChurn(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { + if s.graph == nil { + return mcp.NewToolResultError("graph not initialized"), nil + } + branch := strings.TrimSpace(req.GetString("branch", "")) + pathArg := strings.TrimSpace(req.GetString("path", "")) + + // Resolve targets: one repo root per tracked repo, optionally + // filtered by path (matched as either prefix or absolute root). 
+ type target struct { + prefix string + root string + } + var targets []target + if s.multiIndexer != nil { + for prefix, meta := range s.multiIndexer.AllMetadata() { + if pathArg != "" && pathArg != prefix && pathArg != meta.RootPath { + continue + } + targets = append(targets, target{prefix: prefix, root: meta.RootPath}) + } + } + if len(targets) == 0 { + return mcp.NewToolResultError(fmt.Sprintf("no tracked repo matches %q", pathArg)), nil + } + + started := time.Now() + type perRepo struct { + Prefix string `json:"prefix"` + Branch string `json:"branch"` + HeadSHA string `json:"head_sha"` + Files int `json:"files"` + Symbols int `json:"symbols"` + Skipped string `json:"skipped,omitempty"` + } + var per []perRepo + totalFiles, totalSymbols := 0, 0 + for _, t := range targets { + b := branch + if b == "" { + b = churn.DefaultBranch(t.root) + } + if b == "" { + per = append(per, perRepo{Prefix: t.prefix, Skipped: "no default branch resolvable"}) + continue + } + res, err := churn.EnrichGraph(ctx, s.graph, t.root, churn.Options{Branch: b}) + if err != nil { + per = append(per, perRepo{Prefix: t.prefix, Branch: b, Skipped: err.Error()}) + continue + } + per = append(per, perRepo{ + Prefix: t.prefix, Branch: res.Branch, HeadSHA: res.HeadSHA, + Files: res.Files, Symbols: res.Symbols, + }) + totalFiles += res.Files + totalSymbols += res.Symbols + } + + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "repos": per, + "files": totalFiles, + "symbols": totalSymbols, + "duration_ms": time.Since(started).Milliseconds(), + }) +} diff --git a/internal/mcp/tools_enrich_releases.go b/internal/mcp/tools_enrich_releases.go new file mode 100644 index 00000000..691a61c6 --- /dev/null +++ b/internal/mcp/tools_enrich_releases.go @@ -0,0 +1,92 @@ +package mcp + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/mark3labs/mcp-go/mcp" + + "github.com/zzet/gortex/internal/churn" + "github.com/zzet/gortex/internal/releases" +) + +// registerEnrichReleasesTool 
exposes the releases enricher as an MCP +// tool. `analyze kind=releases` is now a pure read — populating the +// per-file meta.added_in and the KindRelease timeline is this tool's +// job (counterpart to enrich_churn). +// +// Branch constrains the considered tags to those reachable from the +// branch — typically the repo's default branch — so topic-branch tags +// don't pollute the timeline. Empty branch means "every tag", matching +// the legacy behaviour. +func (s *Server) registerEnrichReleasesTool() { + s.addTool( + mcp.NewTool("enrich_releases", + mcp.WithDescription("Pre-compute the release timeline: list tags on the default branch (or `branch` override), stamp meta.added_in on every file present in each tag's tree, and materialise one KindRelease node per tag. The read tool `analyze kind=releases` then answers from this Meta without re-walking git. Idempotent; disk-backed daemons (sqlite) persist the result across restarts."), + mcp.WithString("branch", mcp.Description("Branch / tag / SHA whose reachable tag set bounds the timeline. Empty resolves the repo's default branch; pass a value to override.")), + mcp.WithString("path", mcp.Description("Optional path or repo prefix to scope the enrichment. 
Multi-repo daemons enrich every tracked repo when empty.")), + mcp.WithString("format", mcp.Description("Output format: json (default), gcx, or toon")), + ), + s.handleEnrichReleases, + ) +} + +func (s *Server) handleEnrichReleases(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { + if s.graph == nil { + return mcp.NewToolResultError("graph not initialized"), nil + } + branchArg := strings.TrimSpace(req.GetString("branch", "")) + pathArg := strings.TrimSpace(req.GetString("path", "")) + + type target struct { + prefix string + root string + } + var targets []target + if s.multiIndexer != nil { + for prefix, meta := range s.multiIndexer.AllMetadata() { + if pathArg != "" && pathArg != prefix && pathArg != meta.RootPath { + continue + } + targets = append(targets, target{prefix: prefix, root: meta.RootPath}) + } + } + if len(targets) == 0 { + return mcp.NewToolResultError(fmt.Sprintf("no tracked repo matches %q", pathArg)), nil + } + _ = ctx + + started := time.Now() + type perRepo struct { + Prefix string `json:"prefix"` + Branch string `json:"branch,omitempty"` + Files int `json:"files"` + Skipped string `json:"skipped,omitempty"` + } + var per []perRepo + totalFiles := 0 + for _, t := range targets { + b := branchArg + if b == "" { + b = churn.DefaultBranch(t.root) + // b can stay "" — releases.EnrichGraphForBranch treats + // that as "every tag", the right fallback when no default + // branch resolves. 
+ } + count, err := releases.EnrichGraphForBranch(s.graph, t.root, t.prefix, b) + if err != nil { + per = append(per, perRepo{Prefix: t.prefix, Branch: b, Skipped: err.Error()}) + continue + } + per = append(per, perRepo{Prefix: t.prefix, Branch: b, Files: count}) + totalFiles += count + } + + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "repos": per, + "files": totalFiles, + "duration_ms": time.Since(started).Milliseconds(), + }) +} diff --git a/internal/mcp/tools_extract_candidates.go b/internal/mcp/tools_extract_candidates.go index aedb26a3..ab1e6f81 100644 --- a/internal/mcp/tools_extract_candidates.go +++ b/internal/mcp/tools_extract_candidates.go @@ -38,16 +38,16 @@ func (s *Server) registerExtractionCandidatesTool() { } type extractCandidateRow struct { - ID string `json:"symbol_id"` - Name string `json:"name"` - File string `json:"file"` - StartLine int `json:"start_line"` - EndLine int `json:"end_line"` - LineCount int `json:"line_count"` - CallerCount int `json:"caller_count"` - FanOut int `json:"fan_out"` - Score float64 `json:"score"` - Rationale string `json:"rationale"` + ID string `json:"symbol_id"` + Name string `json:"name"` + File string `json:"file"` + StartLine int `json:"start_line"` + EndLine int `json:"end_line"` + LineCount int `json:"line_count"` + CallerCount int `json:"caller_count"` + FanOut int `json:"fan_out"` + Score float64 `json:"score"` + Rationale string `json:"rationale"` } func (s *Server) handleGetExtractionCandidates(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { @@ -57,6 +57,93 @@ func (s *Server) handleGetExtractionCandidates(ctx context.Context, req mcp.Call pathPrefix := strings.TrimSpace(req.GetString("path_prefix", "")) limit := max(req.GetInt("limit", 25), 1) + rows := s.collectExtractionCandidates(ctx, minLines, minCallers, minFanOut, pathPrefix) + + sort.Slice(rows, func(i, j int) bool { + if rows[i].Score != rows[j].Score { + return rows[i].Score > rows[j].Score + } + return 
rows[i].ID < rows[j].ID + }) + truncated := false + if len(rows) > limit { + rows = rows[:limit] + truncated = true + } + + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "candidates": rows, + "total": len(rows), + "truncated": truncated, + "thresholds": map[string]any{ + "min_lines": minLines, + "min_callers": minCallers, + "min_fan_out": minFanOut, + }, + }) +} + +// collectExtractionCandidates evaluates the three threshold gates +// (min lines, min callers, min fan-out) over every function/method +// in scope, returning the surviving rows. +// +// Picks ExtractCandidatesScanner when the backend implements it: that +// path runs the caller-count + fan-out aggregations server-side in +// one query per direction instead of the AllNodes + per-node +// GetInEdges + GetOutEdges loop the fallback runs. On a disk backend the +// fallback fires 2N round-trips per call and materialises every +// edge bucket just to count distinct endpoints. The pushdown drops +// the call to two aggregations the planner can index. +// +// The session's workspace scope is applied as a post-filter when +// the capability is used — kind / threshold pre-filtering is the +// dominant win, so workspace gating Go-side is cheap. +func (s *Server) collectExtractionCandidates( + ctx context.Context, + minLines, minCallers, minFanOut int, + pathPrefix string, +) []extractCandidateRow { + callKinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeCrossRepoCalls} + if scanner, ok := s.graph.(graph.ExtractCandidatesScanner); ok { + raw := scanner.ExtractCandidates(callKinds, minLines, minCallers, minFanOut, pathPrefix) + // Session-scope post-filter: skip the lookup when the session + // is unbound (every node is in scope) so the bench-friendly + // path stays a pure stream of rows. 
+ _, _, bound := s.sessionScope(ctx) + var scopeIDs map[string]*graph.Node + if bound { + ids := make([]string, 0, len(raw)) + for _, r := range raw { + ids = append(ids, r.NodeID) + } + scopeIDs = s.graph.GetNodesByIDs(ids) + } + out := make([]extractCandidateRow, 0, len(raw)) + for _, r := range raw { + if bound { + n := scopeIDs[r.NodeID] + if n == nil || !s.nodeInSessionScope(ctx, n) { + continue + } + } + score := math.Log1p(float64(r.LineCount)) * + math.Log1p(float64(r.CallerCount)) * + math.Log1p(float64(r.FanOut)) + out = append(out, extractCandidateRow{ + ID: r.NodeID, Name: r.Name, File: r.FilePath, + StartLine: r.StartLine, + EndLine: r.EndLine, + LineCount: r.LineCount, + CallerCount: r.CallerCount, + FanOut: r.FanOut, + Score: roundScore(score), + Rationale: buildExtractRationale(r.LineCount, r.CallerCount, r.FanOut), + }) + } + return out + } + // In-memory fallback — kept inline so the call site doesn't + // branch on the capability twice. scoped := s.scopedNodes(ctx) rows := make([]extractCandidateRow, 0, len(scoped)) for _, n := range scoped { @@ -73,7 +160,6 @@ func (s *Server) handleGetExtractionCandidates(ctx context.Context, req mcp.Call if lineCount < minLines { continue } - callers := callerCount(s.graph, n.ID) if callers < minCallers { continue @@ -82,13 +168,9 @@ func (s *Server) handleGetExtractionCandidates(ctx context.Context, req mcp.Call if fanOut < minFanOut { continue } - - // Log-scaled composite — long-tail values don't dominate the - // short-tail. Adding 1 inside each log keeps the score >= 0. 
score := math.Log1p(float64(lineCount)) * math.Log1p(float64(callers)) * math.Log1p(float64(fanOut)) - rows = append(rows, extractCandidateRow{ ID: n.ID, Name: n.Name, File: n.FilePath, StartLine: n.StartLine, EndLine: n.EndLine, @@ -99,34 +181,12 @@ func (s *Server) handleGetExtractionCandidates(ctx context.Context, req mcp.Call Rationale: buildExtractRationale(lineCount, callers, fanOut), }) } - - sort.Slice(rows, func(i, j int) bool { - if rows[i].Score != rows[j].Score { - return rows[i].Score > rows[j].Score - } - return rows[i].ID < rows[j].ID - }) - truncated := false - if len(rows) > limit { - rows = rows[:limit] - truncated = true - } - - return s.respondJSONOrTOON(ctx, req, map[string]any{ - "candidates": rows, - "total": len(rows), - "truncated": truncated, - "thresholds": map[string]any{ - "min_lines": minLines, - "min_callers": minCallers, - "min_fan_out": minFanOut, - }, - }) + return rows } // callerCount returns the number of distinct call-site origins for // the given node. Counts EdgeCalls and the cross-repo call variant. -func callerCount(g *graph.Graph, id string) int { +func callerCount(g graph.Store, id string) int { seen := map[string]bool{} for _, e := range g.GetInEdges(id) { if e.Kind != graph.EdgeCalls && e.Kind != graph.EdgeCrossRepoCalls { @@ -140,7 +200,7 @@ func callerCount(g *graph.Graph, id string) int { // distinctCalleeCount returns how many distinct functions/methods // the node calls. Proxy for internal complexity — a function that // orchestrates 20 different callees is probably doing too much. 
-func distinctCalleeCount(g *graph.Graph, id string) int { +func distinctCalleeCount(g graph.Store, id string) int { seen := map[string]bool{} for _, e := range g.GetOutEdges(id) { if e.Kind != graph.EdgeCalls && e.Kind != graph.EdgeCrossRepoCalls { diff --git a/internal/mcp/tools_fileops.go b/internal/mcp/tools_fileops.go index eb899252..2e047791 100644 --- a/internal/mcp/tools_fileops.go +++ b/internal/mcp/tools_fileops.go @@ -237,7 +237,7 @@ func (s *Server) resolveNodePath(node *graph.Node) (string, error) { if root, ok := s.multiIndexer.RepoRoot(node.RepoPrefix); ok { // applyRepoPrefix stamps `/` onto node.FilePath // at index time, so a node's FilePath looks like - // `gortex/internal/exporter/cypher.go`. RepoRoot returns + // `gortex/internal/mcp/tools_fileops.go`. RepoRoot returns // the on-disk path that ALREADY corresponds to the repo // (e.g. `/Users/zzet/code/my/gortex/gortex`). Joining as-is // duplicates the prefix segment when the repo's basename diff --git a/internal/mcp/tools_find_declaration.go b/internal/mcp/tools_find_declaration.go index 23538970..ff34f6ea 100644 --- a/internal/mcp/tools_find_declaration.go +++ b/internal/mcp/tools_find_declaration.go @@ -88,7 +88,12 @@ func (s *Server) handleFindDeclaration(ctx context.Context, req mcp.CallToolRequ // Stage 2 — resolve each use site to a declaration. eng := s.engineFor(ctx) - fileIdx := buildDeclFileIndex(eng, matches) + // Pass the NodesInFilesByKindFinder capability when the backend + // implements it; buildDeclFileIndex falls back to AllNodes() when + // finder is nil (e.g. behind an overlay view that doesn't expose + // the capability). + finder, _ := s.graph.(graph.NodesInFilesByKindFinder) + fileIdx := buildDeclFileIndex(eng, finder, matches) groups := make(map[string]*declGroup) var declOrder []string @@ -173,24 +178,63 @@ func (s *Server) findUseSiteMatches(useSite string, isRegex bool, pathPrefix str // matches, so the enclosing symbol of any match line can be found // quickly. 
It mirrors buildFileSymbolIndex but is keyed off the match // set directly rather than astquery targets. -func buildDeclFileIndex(eng *query.Engine, matches []trigram.Match) map[string]*fileSymbolIndex { +// +// finder may be nil when no NodesInFilesByKindFinder-capable backend +// is available (e.g. when running through an editor-buffer overlay +// whose underlying view doesn't expose the capability); the function +// then falls back to walking eng.AllNodes() Go-side, identical to +// the pre-capability shape. Backends that ship the capability +// (the disk backend) collapse the per-call node fetch into one query +// scoped to the trigram-match file set — on the gortex workspace +// that was ~70k AllNodes() rows over the storage boundary just to keep the few +// hundred whose FilePath sat in the small match-file set. +func buildDeclFileIndex(eng *query.Engine, finder graph.NodesInFilesByKindFinder, matches []trigram.Match) map[string]*fileSymbolIndex { wanted := make(map[string]struct{}, len(matches)) + files := make([]string, 0, len(matches)) for _, m := range matches { + if _, ok := wanted[m.Path]; ok { + continue + } wanted[m.Path] = struct{}{} + files = append(files, m.Path) } out := make(map[string]*fileSymbolIndex, len(wanted)) - for _, n := range eng.AllNodes() { - if _, ok := wanted[n.FilePath]; !ok { - continue + + add := func(n *graph.Node) { + if n == nil { + return } - switch n.Kind { - case graph.KindFunction, graph.KindMethod, graph.KindClosure, graph.KindType, graph.KindInterface: - idx := out[n.FilePath] - if idx == nil { - idx = &fileSymbolIndex{} - out[n.FilePath] = idx + idx := out[n.FilePath] + if idx == nil { + idx = &fileSymbolIndex{} + out[n.FilePath] = idx + } + idx.add(n) + } + + if finder != nil { + kinds := []graph.NodeKind{ + graph.KindFunction, + graph.KindMethod, + graph.KindClosure, + graph.KindType, + graph.KindInterface, + } + for _, n := range finder.NodesInFilesByKind(files, kinds) { + if _, ok := wanted[n.FilePath]; !ok { + 
continue + } + add(n) + } + } else { + for _, n := range eng.AllNodes() { + if _, ok := wanted[n.FilePath]; !ok { + continue + } + switch n.Kind { + case graph.KindFunction, graph.KindMethod, graph.KindClosure, graph.KindType, graph.KindInterface: + add(n) } - idx.add(n) } } for _, idx := range out { @@ -212,7 +256,7 @@ func resolveUseSiteDecl(eng *query.Engine, fileIdx map[string]*fileSymbolIndex, if e.Line != m.Line || !declResolveKinds[e.Kind] { continue } - if strings.HasPrefix(e.To, "unresolved::") || strings.HasPrefix(e.To, "external::") { + if graph.IsUnresolvedTarget(e.To) || strings.HasPrefix(e.To, "external::") { continue } // Prefer a call edge over a plain reference when the diff --git a/internal/mcp/tools_graph_completion.go b/internal/mcp/tools_graph_completion.go index ade6f675..dc9c588f 100644 --- a/internal/mcp/tools_graph_completion.go +++ b/internal/mcp/tools_graph_completion.go @@ -85,12 +85,12 @@ func (s *Server) handleGraphCompletionSearch(ctx context.Context, req mcp.CallTo } return s.respondJSONOrTOON(ctx, req, map[string]any{ - "results": rows, - "total": len(rows), - "retriever": retriever.Name(), - "seed_count": countSeeds(cands), - "expanded": len(cands) - countSeeds(cands), - "edge_kinds": edgeKindStrings(edgeKinds), + "results": rows, + "total": len(rows), + "retriever": retriever.Name(), + "seed_count": countSeeds(cands), + "expanded": len(cands) - countSeeds(cands), + "edge_kinds": edgeKindStrings(edgeKinds), }) } @@ -100,14 +100,22 @@ func (s *Server) handleGraphCompletionSearch(ctx context.Context, req mcp.CallTo // substring (case-insensitive). Replaceable by callers who plug in // vector search or another retrieval scheme via the public Retriever // interface. 
-func (s *Server) nameMatchSeeder(ctx context.Context, g *graph.Graph, query string, limit int) ([]*rerank.Candidate, error) { - q := strings.ToLower(query) - out := make([]*rerank.Candidate, 0, limit) - for _, n := range g.AllNodes() { - if ctx.Err() != nil { - return out, ctx.Err() - } - if !strings.Contains(strings.ToLower(n.Name), q) { +func (s *Server) nameMatchSeeder(ctx context.Context, g graph.Store, query string, limit int) ([]*rerank.Candidate, error) { + // FindNodesByNameContaining pushes the case-insensitive substring + // filter into the backend — on a disk backend that's an indexed + // substring filter against the name column, so only matching rows + // cross the storage boundary instead of the legacy AllNodes() + // materialisation + per-row Go string check. The in-memory backend + // already had a tight implementation behind the same surface, so + // this is a strict win on disk backends and matches today's cost + // in-memory. + matches := g.FindNodesByNameContaining(query, limit) + if ctx.Err() != nil { + return nil, ctx.Err() + } + out := make([]*rerank.Candidate, 0, len(matches)) + for _, n := range matches { + if n == nil { continue } out = append(out, &rerank.Candidate{Node: n, TextRank: len(out)}) diff --git a/internal/mcp/tools_graph_query.go b/internal/mcp/tools_graph_query.go index db62fd9c..c1dd2671 100644 --- a/internal/mcp/tools_graph_query.go +++ b/internal/mcp/tools_graph_query.go @@ -3,6 +3,7 @@ package mcp import ( "context" "fmt" + "iter" "regexp" "strings" @@ -270,12 +271,47 @@ func evalGraphQuery(eng *query.Engine, stages []gqStage, limit int) (*query.SubG for _, st := range stages { switch st.kind { case gqStageNodes: - for _, n := range eng.AllNodes() { - if matchesAll(n, st.filters) { - add(n) - if len(working) >= limit { + // When the pipeline opens with a `kind=` predicate (the + // common case — e.g. `nodes kind=function ...`), iterate + // the backend's per-kind bucket instead of AllNodes(). 
On + // a disk backend NodesByKind hits a server-side filter and only + // the matching rows cross the storage boundary; AllNodes() materialised the + // whole node table per request. Other filters + // (`name~`/`path=`/`lang=`) still post-filter in Go. + // + // Overlay views (NodesByKindReader-unaware) fall through + // to the AllNodes() walk — they're already in-memory, so + // the bucket optimisation has no win there. + seedKinds := seedKindsFromFilters(st.filters) + byKind, _ := eng.Reader().(nodesByKindReader) + if byKind != nil && len(seedKinds) > 0 { + done := false + for _, k := range seedKinds { + if done { break } + for n := range byKind.NodesByKind(k) { + if n == nil { + continue + } + if !matchesAll(n, st.filters) { + continue + } + add(n) + if len(working) >= limit { + done = true + break + } + } + } + } else { + for _, n := range eng.AllNodes() { + if matchesAll(n, st.filters) { + add(n) + if len(working) >= limit { + break + } + } } } @@ -340,7 +376,7 @@ func evalGraphQuery(eng *query.Engine, stages []gqStage, limit int) (*query.SubG } targetID = e.From } - if strings.HasPrefix(targetID, "unresolved::") || + if graph.IsUnresolvedTarget(targetID) || strings.HasPrefix(targetID, "external::") { continue } @@ -398,3 +434,35 @@ func evalGraphQuery(eng *query.Engine, stages []gqStage, limit int) (*query.SubG TotalEdges: len(edges), }, nil } + +// nodesByKindReader is the optional read-side capability the eng.Reader +// underlying type may implement. *graph.Graph satisfies it directly +// (Store has NodesByKind); OverlaidView does not, which is fine — +// overlays already work in-memory and don't benefit from the bucket +// fast path. +type nodesByKindReader interface { + NodesByKind(kind graph.NodeKind) iter.Seq[*graph.Node] +} + +// seedKindsFromFilters extracts every `kind=` predicate from a stage's +// filter list so the seed loop can iterate the corresponding NodesByKind +// buckets instead of AllNodes(). 
Returns nil when no `kind=` filter is +// present — the caller falls back to the AllNodes() walk in that case. +// Duplicates are deduped so a sloppy author writing `kind=function +// kind=function` doesn't double-iterate. +func seedKindsFromFilters(filters []gqFilter) []graph.NodeKind { + var out []graph.NodeKind + seen := make(map[graph.NodeKind]struct{}, len(filters)) + for _, f := range filters { + if f.op != "kind=" { + continue + } + k := graph.NodeKind(f.value) + if _, ok := seen[k]; ok { + continue + } + seen[k] = struct{}{} + out = append(out, k) + } + return out +} diff --git a/internal/mcp/tools_inspections.go b/internal/mcp/tools_inspections.go index 3dea6f80..f24f7446 100644 --- a/internal/mcp/tools_inspections.go +++ b/internal/mcp/tools_inspections.go @@ -312,6 +312,7 @@ func runTodosInspection(s *Server, scope inspectionScope) []inspectionViolation func runCoverageGapsInspection(s *Server, scope inspectionScope) []inspectionViolation { out := make([]inspectionViolation, 0) + covRows := s.coverageByID() for _, n := range s.graph.AllNodes() { if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { continue @@ -319,7 +320,7 @@ func runCoverageGapsInspection(s *Server, scope inspectionScope) []inspectionVio if !scope.keep(n.FilePath) { continue } - pct, ok := n.Meta["coverage_pct"].(float64) + pct, ok := coveragePctFrom(covRows, n) if !ok || pct >= 100.0 { continue } diff --git a/internal/mcp/tools_knowledge_gaps.go b/internal/mcp/tools_knowledge_gaps.go index 9d6c5e7d..9249b759 100644 --- a/internal/mcp/tools_knowledge_gaps.go +++ b/internal/mcp/tools_knowledge_gaps.go @@ -34,11 +34,11 @@ func (s *Server) registerKnowledgeGapsTool() { // edges. Almost always either dead code or an isolated utility // nobody wired up. 
type gapDisconnected struct { - ID string `json:"id"` - Name string `json:"name"` - Kind string `json:"kind"` - File string `json:"file"` - Line int `json:"line"` + ID string `json:"id"` + Name string `json:"name"` + Kind string `json:"kind"` + File string `json:"file"` + Line int `json:"line"` } // gapCommunity — for thin and single-file communities the caller @@ -56,13 +56,13 @@ type gapCommunity struct { // gate so we surface load-bearing nodes even in small repos where // the analyzer is conservative. type gapUntestedHotspot struct { - ID string `json:"id"` - Name string `json:"name"` - File string `json:"file"` - Line int `json:"line"` - FanIn int `json:"fan_in"` - Coverage float64 `json:"coverage_pct"` - HasCoverage bool `json:"has_coverage"` + ID string `json:"id"` + Name string `json:"name"` + File string `json:"file"` + Line int `json:"line"` + FanIn int `json:"fan_in"` + Coverage float64 `json:"coverage_pct"` + HasCoverage bool `json:"has_coverage"` } func (s *Server) handleGetKnowledgeGaps(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { @@ -78,11 +78,17 @@ func (s *Server) handleGetKnowledgeGaps(ctx context.Context, req mcp.CallToolReq perCategoryLimit := max(req.GetInt("limit_per_category", 20), 1) pathPrefix := strings.TrimSpace(req.GetString("path_prefix", "")) - scoped := s.scopedNodes(ctx) + // degreeByID maps node id -> (in, out) edge counts for every + // function/method in scope, computed once via the backend's + // NodeDegreeByKinds path when available. The legacy + // NodeDegreeCounts route shipped a 30k-element IN-list per call + // on a disk backend; NodeDegreeByKinds runs the same aggregate over the + // kind-filtered node set so the planner never builds the list. 
+ degreeByID, scoped := s.scopedFunctionDegrees(ctx, pathPrefix) - disconnected := s.collectDisconnected(scoped, pathPrefix, perCategoryLimit) + disconnected := s.collectDisconnected(scoped, pathPrefix, perCategoryLimit, degreeByID) thin, singleFile := s.collectCommunityGaps(thinSize, pathPrefix, perCategoryLimit) - untested := s.collectUntestedHotspots(scoped, pathPrefix, hotspotLimit, minCov, perCategoryLimit) + untested := s.collectUntestedHotspots(scoped, pathPrefix, hotspotLimit, minCov, perCategoryLimit, degreeByID) return s.respondJSONOrTOON(ctx, req, map[string]any{ "disconnected_nodes": disconnected, @@ -104,27 +110,102 @@ func (s *Server) handleGetKnowledgeGaps(ctx context.Context, req mcp.CallToolReq }) } +// scopedFunctionDegrees returns the per-node in/out degree map and +// the scoped function/method node list, in two pushdown calls. +// NodeDegreeByKinds runs server-side over the kind-filtered node +// table — the previous path fed NodeDegreeCounts a 30k-element +// IN-list, which the planner had to materialise before joining. The +// scoped node list is built from NodesByKinds (or AllNodes when the +// backend has no NodesByKindsScanner) and post-filtered for the +// session workspace, matching scopedNodesByKinds' contract. +func (s *Server) scopedFunctionDegrees(ctx context.Context, pathPrefix string) (map[string]graph.NodeDegreeRow, []*graph.Node) { + kinds := []graph.NodeKind{graph.KindFunction, graph.KindMethod} + scoped := s.scopedNodesByKinds(ctx, kinds) + var degByID map[string]graph.NodeDegreeRow + if dk, ok := s.graph.(graph.NodeDegreeByKinds); ok { + rows := dk.NodeDegreeByKinds(kinds, pathPrefix) + degByID = make(map[string]graph.NodeDegreeRow, len(rows)) + for _, r := range rows { + degByID[r.NodeID] = r + } + } + return degByID, scoped +} + // collectDisconnected returns function/method nodes with zero // incoming and zero outgoing edges in the scoped subgraph. 
The // kind filter mirrors handleAnalyzeCoverageGaps' default — variables // and constants always look disconnected, so including them would // flood the result. -func (s *Server) collectDisconnected(scoped []*graph.Node, pathPrefix string, limit int) []gapDisconnected { - out := make([]gapDisconnected, 0) +// +// Reads from the prebuilt degree map when present (the storage +// backend computed it once in scopedFunctionDegrees), falls back to +// per-node GetInEdges / GetOutEdges otherwise. The legacy +// NodeDegreeAggregator path is kept as a tertiary fallback for +// backends that publish NodeDegreeCounts but not NodeDegreeByKinds. +func (s *Server) collectDisconnected(scoped []*graph.Node, pathPrefix string, limit int, degreeByID map[string]graph.NodeDegreeRow) []gapDisconnected { + candidates := make([]*graph.Node, 0, len(scoped)) for _, n := range scoped { - if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { - continue - } if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { continue } - if len(s.graph.GetInEdges(n.ID)) > 0 || len(s.graph.GetOutEdges(n.ID)) > 0 { - continue + candidates = append(candidates, n) + } + + out := make([]gapDisconnected, 0) + switch { + case degreeByID != nil: + for _, n := range candidates { + r, ok := degreeByID[n.ID] + if !ok { + // Absent from the aggregate => zero edges, by + // definition of the kind-filtered aggregate. 
+ out = append(out, gapDisconnected{ + ID: n.ID, Name: n.Name, Kind: string(n.Kind), + File: n.FilePath, Line: n.StartLine, + }) + continue + } + if r.InCount > 0 || r.OutCount > 0 { + continue + } + out = append(out, gapDisconnected{ + ID: n.ID, Name: n.Name, Kind: string(n.Kind), + File: n.FilePath, Line: n.StartLine, + }) + } + default: + if agg, ok := s.graph.(graph.NodeDegreeAggregator); ok && len(candidates) > 0 { + ids := make([]string, 0, len(candidates)) + byID := make(map[string]*graph.Node, len(candidates)) + for _, n := range candidates { + ids = append(ids, n.ID) + byID[n.ID] = n + } + for _, r := range agg.NodeDegreeCounts(ids, nil) { + if r.InCount > 0 || r.OutCount > 0 { + continue + } + n := byID[r.NodeID] + if n == nil { + continue + } + out = append(out, gapDisconnected{ + ID: n.ID, Name: n.Name, Kind: string(n.Kind), + File: n.FilePath, Line: n.StartLine, + }) + } + } else { + for _, n := range candidates { + if len(s.graph.GetInEdges(n.ID)) > 0 || len(s.graph.GetOutEdges(n.ID)) > 0 { + continue + } + out = append(out, gapDisconnected{ + ID: n.ID, Name: n.Name, Kind: string(n.Kind), + File: n.FilePath, Line: n.StartLine, + }) + } } - out = append(out, gapDisconnected{ - ID: n.ID, Name: n.Name, Kind: string(n.Kind), - File: n.FilePath, Line: n.StartLine, - }) } sort.Slice(out, func(i, j int) bool { if out[i].File != out[j].File { @@ -193,20 +274,50 @@ func (s *Server) collectCommunityGaps(thinSize int, pathPrefix string, limit int // coverage_pct < minCov or no coverage data at all. Independent of // analyze hotspots (which gates on mean+2σ) so it still surfaces // load-bearing nodes in small repos. 
-func (s *Server) collectUntestedHotspots(scoped []*graph.Node, pathPrefix string, hotspotLimit int, minCov float64, limit int) []gapUntestedHotspot { +// +// Reads from the prebuilt NodeDegreeByKinds aggregate when present; +// falls back to NodeDegreeAggregator (the older IN-list shape) for +// backends that only publish that one, and finally to per-node +// GetInEdges for everyone else. +func (s *Server) collectUntestedHotspots(scoped []*graph.Node, pathPrefix string, hotspotLimit int, minCov float64, limit int, degreeByID map[string]graph.NodeDegreeRow) []gapUntestedHotspot { type ranked struct { node *graph.Node fanIn int } - candidates := make([]ranked, 0, len(scoped)) + pool := make([]*graph.Node, 0, len(scoped)) for _, n := range scoped { - if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { - continue - } if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { continue } - candidates = append(candidates, ranked{node: n, fanIn: len(s.graph.GetInEdges(n.ID))}) + pool = append(pool, n) + } + candidates := make([]ranked, 0, len(pool)) + switch { + case degreeByID != nil: + for _, n := range pool { + r := degreeByID[n.ID] + candidates = append(candidates, ranked{node: n, fanIn: r.InCount}) + } + default: + if agg, ok := s.graph.(graph.NodeDegreeAggregator); ok && len(pool) > 0 { + ids := make([]string, 0, len(pool)) + byID := make(map[string]*graph.Node, len(pool)) + for _, n := range pool { + ids = append(ids, n.ID) + byID[n.ID] = n + } + for _, r := range agg.NodeDegreeCounts(ids, nil) { + n := byID[r.NodeID] + if n == nil { + continue + } + candidates = append(candidates, ranked{node: n, fanIn: r.InCount}) + } + } else { + for _, n := range pool { + candidates = append(candidates, ranked{node: n, fanIn: len(s.graph.GetInEdges(n.ID))}) + } + } } sort.Slice(candidates, func(i, j int) bool { return candidates[i].fanIn > candidates[j].fanIn @@ -216,6 +327,7 @@ func (s *Server) collectUntestedHotspots(scoped []*graph.Node, pathPrefix 
string } out := make([]gapUntestedHotspot, 0) + covRows := s.coverageByID() for _, c := range candidates { // A "hotspot" with zero callers isn't a hotspot — drop it. // Disconnected functions are already covered by the @@ -223,7 +335,7 @@ func (s *Server) collectUntestedHotspots(scoped []*graph.Node, pathPrefix string if c.fanIn == 0 { continue } - pct, has := c.node.Meta["coverage_pct"].(float64) + pct, has := coveragePctFrom(covRows, c.node) if has && pct >= minCov { continue } diff --git a/internal/mcp/tools_nav.go b/internal/mcp/tools_nav.go index 88a6dc34..97e118b0 100644 --- a/internal/mcp/tools_nav.go +++ b/internal/mcp/tools_nav.go @@ -272,7 +272,7 @@ func navNeighbours(eng engineLike, edges []*graph.Edge, kind graph.EdgeKind, for } else { id = e.From } - if seen[id] || strings.HasPrefix(id, "unresolved::") || strings.HasPrefix(id, "external::") { + if seen[id] || graph.IsUnresolvedTarget(id) || strings.HasPrefix(id, "external::") { continue } n := eng.GetSymbol(id) diff --git a/internal/mcp/tools_nav_test.go b/internal/mcp/tools_nav_test.go index 363ce6ac..d539205c 100644 --- a/internal/mcp/tools_nav_test.go +++ b/internal/mcp/tools_nav_test.go @@ -22,7 +22,7 @@ import ( // setupNavServer indexes a Go source with a deeper call graph and a type // carrying several methods, so the nav tool's into / up / sibling moves // have real candidates to choose between. -func setupNavServer(t *testing.T) (*Server, *graph.Graph) { +func setupNavServer(t *testing.T) (*Server, graph.Store) { t.Helper() dir := t.TempDir() src := `package svc @@ -73,7 +73,7 @@ func navResult(t *testing.T, result *mcplib.CallToolResult) map[string]any { } // navFindMethod returns the graph ID of a method named `name`. 
-func navFindMethod(t *testing.T, g *graph.Graph, name string) string { +func navFindMethod(t *testing.T, g graph.Store, name string) string { t.Helper() for _, n := range g.AllNodes() { if n.Name == name && (n.Kind == graph.KindMethod || n.Kind == graph.KindFunction) { diff --git a/internal/mcp/tools_notebook_test.go b/internal/mcp/tools_notebook_test.go index 6edce254..9eaff757 100644 --- a/internal/mcp/tools_notebook_test.go +++ b/internal/mcp/tools_notebook_test.go @@ -3,8 +3,6 @@ package mcp import ( "context" "encoding/json" - "os" - "path/filepath" "strings" "testing" "time" @@ -13,6 +11,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/persistence" ) func newNotebookTestServer(t *testing.T) (*Server, string) { @@ -47,8 +46,8 @@ func callNotebookHandler(t *testing.T, h func(context.Context, mcp.CallToolReque return m } -func TestNotebook_SaveCreatesFile(t *testing.T) { - s, dir := newNotebookTestServer(t) +func TestNotebook_SavePersists(t *testing.T) { + s, _ := newNotebookTestServer(t) out := callNotebookHandler(t, s.handleNotebookSave, map[string]any{ "title": "auth invariant", "body": "Bar must hold the mutex.", @@ -57,12 +56,15 @@ func TestNotebook_SaveCreatesFile(t *testing.T) { id := out["id"].(string) require.NotEmpty(t, id) - path := filepath.Join(dir, ".gortex", "notebook", id+".md") - body, err := os.ReadFile(path) - require.NoError(t, err) - assert.Contains(t, string(body), "title: auth invariant") - assert.Contains(t, string(body), "Bar must hold the mutex") - assert.Contains(t, string(body), "tags: [invariant, auth]") + + // The entry round-trips through the sidecar DB (no markdown file). 
+ shown := callNotebookHandler(t, s.handleNotebookShow, map[string]any{"id": id}) + assert.Equal(t, "auth invariant", shown["title"]) + assert.Contains(t, shown["body"], "Bar must hold the mutex") + tags, _ := shown["tags"].([]any) + require.Len(t, tags, 2) + assert.Equal(t, "invariant", tags[0]) + assert.Equal(t, "auth", tags[1]) } func TestNotebook_UpdatePreservesCreated(t *testing.T) { @@ -152,7 +154,7 @@ func TestNotebook_ShowReturnsBody(t *testing.T) { id := created["id"].(string) out := callNotebookHandler(t, s.handleNotebookShow, map[string]any{"id": id}) - assert.Equal(t, "the full markdown body here\n", out["body"], "show returns full body including trailing newline") + assert.Equal(t, "the full markdown body here", out["body"], "show returns the verbatim body") } func TestNotebook_ShowUnknownIDErrors(t *testing.T) { @@ -228,18 +230,23 @@ func TestNotebook_PrunesByTTL(t *testing.T) { dir := t.TempDir() nm := newNotebookManager(dir) nm.ttl = 1 * time.Millisecond - // Write an entry with Updated far in the past so the prune - // purges it on the next save. - stale := notebookEntry{ID: "stale", Title: "stale", Updated: time.Now().Add(-time.Hour)} - _ = os.MkdirAll(filepath.Join(dir, ".gortex", "notebook"), 0o755) - _ = os.WriteFile(filepath.Join(dir, ".gortex", "notebook", "stale.md"), []byte(notebookMarshal(stale)), 0o644) + require.NotNil(t, nm.sidecar) + // Insert a row with Updated far in the past directly into the + // sidecar so the next Save's prune sweeps it. + require.NoError(t, nm.sidecar.UpsertNotebook(nm.repoKey, persistence.NotebookRow{ + ID: "stale", + Title: "stale", + Updated: time.Now().UTC().Add(-time.Hour), + })) // Trigger a save which fires the prune. - _, _ = nm.Save(notebookEntry{Title: "fresh", Body: "x"}) + _, err := nm.Save(notebookEntry{Title: "fresh", Body: "x"}) + require.NoError(t, err) - // stale.md should be gone. 
- _, err := os.Stat(filepath.Join(dir, ".gortex", "notebook", "stale.md")) - assert.True(t, os.IsNotExist(err), "TTL-expired entry pruned") + // The stale entry should be gone; the fresh one survives. + _, ok := nm.Get("stale") + assert.False(t, ok, "TTL-expired entry pruned") + assert.Len(t, nm.List(), 1, "only the fresh entry remains") } func TestNotebook_DeleteIdempotent(t *testing.T) { diff --git a/internal/mcp/tools_outline.go b/internal/mcp/tools_outline.go index dbf6b0b5..a82b51a0 100644 --- a/internal/mcp/tools_outline.go +++ b/internal/mcp/tools_outline.go @@ -2,6 +2,7 @@ package mcp import ( "context" + "maps" "sort" "github.com/mark3labs/mcp-go/mcp" @@ -35,10 +36,18 @@ func (s *Server) handleGetRepoOutline(ctx context.Context, req mcp.CallToolReque // outline is byte-identical to the legacy global view. inScope is // the node-ID set used to bound the edge-driven and analyzer-driven // sections; nil for an unbound session means "no filter". - scoped := s.scopedNodes(ctx) _, _, bound := s.sessionScope(ctx) + + // Pull the full scoped node slice only when the session is bound + // — the lang count, total-node count, and edge filter need it then. + // Unbound sessions get the same numbers from the backend's cached + // Stats() (one indexed groupby on disk backends) and the + // callable-only entry-point pass, neither of which materialises + // the whole node table over cgo. 
+ var scoped []*graph.Node var inScope map[string]bool if bound { + scoped = s.scopedNodes(ctx) inScope = make(map[string]bool, len(scoped)) for _, n := range scoped { inScope[n.ID] = true @@ -52,10 +61,20 @@ func (s *Server) handleGetRepoOutline(ctx context.Context, req mcp.CallToolReque Nodes int `json:"nodes"` } langCounts := make(map[string]int) - for _, n := range scoped { - if n.Language != "" { - langCounts[n.Language]++ + totalScopedNodes := 0 + if bound { + for _, n := range scoped { + if n.Language != "" { + langCounts[n.Language]++ + } } + totalScopedNodes = len(scoped) + } else { + // Unbound: Stats().ByLanguage already aggregates this server- + // side; the cgo cost is one GROUP BY instead of one row per node. + stats := s.graph.Stats() + maps.Copy(langCounts, stats.ByLanguage) + totalScopedNodes = stats.TotalNodes } var languages []langEntry for name, n := range langCounts { @@ -76,16 +95,23 @@ func (s *Server) handleGetRepoOutline(ctx context.Context, req mcp.CallToolReque } // Edge count, bounded to edges whose endpoints are both in scope. + // Unbound sessions never set inScope, so the count is exactly + // the backend's EdgeCount() — an O(1) lookup that skips + // materialising every edge over cgo. totalEdges := 0 - for _, e := range s.graph.AllEdges() { - if inScope != nil && (!inScope[e.From] || !inScope[e.To]) { - continue + if inScope == nil { + totalEdges = s.graph.EdgeCount() + } else { + for _, e := range s.graph.AllEdges() { + if !inScope[e.From] || !inScope[e.To] { + continue + } + totalEdges++ } - totalEdges++ } summary := map[string]any{ - "total_nodes": len(scoped), + "total_nodes": totalScopedNodes, "total_edges": totalEdges, "primary_language": primaryLang, "languages": languages, @@ -126,7 +152,7 @@ func (s *Server) handleGetRepoOutline(ctx context.Context, req mcp.CallToolReque // threshold to ensure we get the top N regardless of repo size. // Post-filtered to the session's workspace. 
hotspotsSection := []map[string]any{} - hs := analysis.FindHotspots(s.graph, s.getCommunities(), 0) + hs := s.getHotspots() for _, h := range hs { if len(hotspotsSection) >= topHotspotsN { break @@ -150,7 +176,7 @@ func (s *Server) handleGetRepoOutline(ctx context.Context, req mcp.CallToolReque "communities": communitiesSection, "hotspots": hotspotsSection, "most_imported_files": mostImportedFiles(s.graph, inScope, topMostImportedN), - "entry_points": entryPoints(scoped, topEntryPointsN), + "entry_points": entryPoints(s.graph, inScope, topEntryPointsN), }) } @@ -176,31 +202,55 @@ func topCommunitiesSummary(comms []analysis.Community) []map[string]any { // "here's where the gravity lives" signal for newcomers. // inScope, when non-nil, bounds the ranking to imports whose target // node is inside the session's workspace. -func mostImportedFiles(g *graph.Graph, inScope map[string]bool, topN int) []map[string]any { +// +// Picks the FileImportAggregator capability when the backend +// implements it (one server-side aggregate ships back the per-file count +// instead of materialising every edge over cgo just to bucket). +// Falls back to the AllEdges-driven loop on backends that don't. +func mostImportedFiles(g graph.Store, inScope map[string]bool, topN int) []map[string]any { type fileCount struct { path string count int } counts := make(map[string]int) - for _, e := range g.AllEdges() { - if e.Kind != graph.EdgeImports { - continue - } - target := g.GetNode(e.To) - if target == nil { - continue + if ag, ok := g.(graph.FileImportAggregator); ok { + var scope []string + if inScope != nil { + scope = make([]string, 0, len(inScope)) + for id := range inScope { + scope = append(scope, id) + } + // An empty inScope means "nothing matches" — the + // aggregator contract maps that to nil so we never + // fire a whole-graph scan on a bound session. 
+ if len(scope) == 0 { + scope = []string{} + } } - if inScope != nil && !inScope[target.ID] { - continue + for _, r := range ag.FileImportCounts(scope) { + counts[r.FilePath] = r.Count } - // Aggregate at the file level. For Import-kind nodes the node's - // FilePath is the file being imported; for File-kind nodes the - // ID is already the path. - path := target.FilePath - if path == "" { - path = target.ID + } else { + for _, e := range g.AllEdges() { + if e.Kind != graph.EdgeImports { + continue + } + target := g.GetNode(e.To) + if target == nil { + continue + } + if inScope != nil && !inScope[target.ID] { + continue + } + // Aggregate at the file level. For Import-kind nodes the node's + // FilePath is the file being imported; for File-kind nodes the + // ID is already the path. + path := target.FilePath + if path == "" { + path = target.ID + } + counts[path]++ } - counts[path]++ } var ranked []fileCount @@ -231,18 +281,28 @@ func mostImportedFiles(g *graph.Graph, inScope map[string]bool, topN int) []map[ // (the Go / Rust / C convention) and top-level functions with no callers // in files named `main.*` or `cmd/**`. Good enough for the outline; a // fuller process-based walk is what `get_processes` does separately. -func entryPoints(nodes []*graph.Node, topN int) []map[string]any { +// +// Lookup goes through FindNodesByName so the name index runs server- +// side on disk backends — the legacy nodes-slice walk pulled the whole +// node table just to keep the ~10 nodes literally named "main". When +// an inScope filter is supplied (bound session), it's applied after +// the name lookup so a bound session never sees mains from other +// workspaces. 
+func entryPoints(g graph.Store, inScope map[string]bool, topN int) []map[string]any { type ep struct { id string name string filePath string } var out []ep - for _, n := range nodes { + for _, n := range g.FindNodesByName("main") { + if n == nil { + continue + } if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { continue } - if n.Name != "main" { + if inScope != nil && !inScope[n.ID] { continue } out = append(out, ep{id: n.ID, name: n.Name, filePath: n.FilePath}) diff --git a/internal/mcp/tools_releases_test.go b/internal/mcp/tools_releases_test.go new file mode 100644 index 00000000..61ca593c --- /dev/null +++ b/internal/mcp/tools_releases_test.go @@ -0,0 +1,119 @@ +package mcp + +import ( + "context" + "encoding/json" + "testing" + + "github.com/mark3labs/mcp-go/mcp" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +// seedReleasesGraph populates the graph with a KindRelease timeline +// and a couple of file nodes whose meta.added_in maps onto the +// releases. Mirrors what releases.EnrichGraphForBranch would have +// written; lets the read-side handler be tested without a real git +// repo. 
+func seedReleasesGraph(t *testing.T) *Server { + t.Helper() + g := graph.New() + g.AddNode(&graph.Node{ + ID: "release::v0.1", + Kind: graph.KindRelease, + Name: "v0.1", + Meta: map[string]any{ + "tag": "v0.1", + "file_count": 1, + "order": 0, + }, + }) + g.AddNode(&graph.Node{ + ID: "release::v0.2", + Kind: graph.KindRelease, + Name: "v0.2", + Meta: map[string]any{ + "tag": "v0.2", + "file_count": 2, + "order": 1, + }, + }) + g.AddNode(&graph.Node{ + ID: "a.go", Kind: graph.KindFile, FilePath: "a.go", + Meta: map[string]any{"added_in": "v0.1"}, + }) + g.AddNode(&graph.Node{ + ID: "b.go", Kind: graph.KindFile, FilePath: "b.go", + Meta: map[string]any{"added_in": "v0.2"}, + }) + return &Server{ + graph: g, + session: newSessionState(), + tokenStats: &tokenStats{}, + symHistory: &symbolHistory{entries: make(map[string][]SymbolModification)}, + sessions: newSessionMap(), + toolScopes: newScopeRegistry(), + } +} + +func callAnalyzeReleases(t *testing.T, s *Server, args map[string]any) map[string]any { + t.Helper() + req := mcp.CallToolRequest{} + req.Params.Arguments = args + res, err := s.handleAnalyzeReleases(context.Background(), req) + require.NoError(t, err) + require.NotNil(t, res) + tc, ok := res.Content[0].(mcp.TextContent) + require.True(t, ok) + var m map[string]any + require.NoError(t, json.Unmarshal([]byte(tc.Text), &m)) + return m +} + +func TestAnalyzeReleases_Timeline(t *testing.T) { + s := seedReleasesGraph(t) + out := callAnalyzeReleases(t, s, map[string]any{}) + releases, _ := out["releases"].([]any) + require.Len(t, releases, 2) + first := releases[0].(map[string]any) + assert.Equal(t, "v0.1", first["tag"], "ordered by Meta.order asc — oldest first") + assert.EqualValues(t, 0, first["order"]) + assert.EqualValues(t, 1, first["file_count"]) +} + +func TestAnalyzeReleases_TagFilterReturnsFiles(t *testing.T) { + s := seedReleasesGraph(t) + out := callAnalyzeReleases(t, s, map[string]any{"tag": "v0.2"}) + releases, _ := out["releases"].([]any) + 
require.Len(t, releases, 1) + first := releases[0].(map[string]any) + files, _ := first["files"].([]any) + require.Len(t, files, 1) + assert.Equal(t, "b.go", files[0]) + assert.EqualValues(t, 1, out["file_hits"]) +} + +func TestAnalyzeReleases_TagFilterUnknownTag(t *testing.T) { + s := seedReleasesGraph(t) + out := callAnalyzeReleases(t, s, map[string]any{"tag": "v99"}) + require.NotEmpty(t, out["error"]) + assert.Equal(t, "enrich_releases", out["suggestion"]) +} + +func TestAnalyzeReleases_ErrorsWhenNoMeta(t *testing.T) { + g := graph.New() + g.AddNode(&graph.Node{ID: "x.go", Kind: graph.KindFile, FilePath: "x.go"}) + s := &Server{ + graph: g, + session: newSessionState(), + tokenStats: &tokenStats{}, + symHistory: &symbolHistory{entries: make(map[string][]SymbolModification)}, + sessions: newSessionMap(), + toolScopes: newScopeRegistry(), + } + out := callAnalyzeReleases(t, s, map[string]any{}) + require.NotEmpty(t, out["error"]) + assert.Equal(t, "enrich_releases", out["suggestion"]) +} diff --git a/internal/mcp/tools_replay_episode.go b/internal/mcp/tools_replay_episode.go index 4eed9358..e90ed7b4 100644 --- a/internal/mcp/tools_replay_episode.go +++ b/internal/mcp/tools_replay_episode.go @@ -30,13 +30,13 @@ func (s *Server) registerReplayEpisodeTool() { } type replayTimelineRow struct { - ID string `json:"id"` - Name string `json:"name"` - FilePath string `json:"file_path"` - LastCommitAt string `json:"last_commit_at,omitempty"` - LastAuthor string `json:"last_author,omitempty"` - SessionEdits int `json:"session_edits,omitempty"` - SignatureFlux bool `json:"signature_flux,omitempty"` + ID string `json:"id"` + Name string `json:"name"` + FilePath string `json:"file_path"` + LastCommitAt string `json:"last_commit_at,omitempty"` + LastAuthor string `json:"last_author,omitempty"` + SessionEdits int `json:"session_edits,omitempty"` + SignatureFlux bool `json:"signature_flux,omitempty"` } type replayCallerRow struct { @@ -95,13 +95,13 @@ func (s *Server) 
handleReplayEpisode(ctx context.Context, req mcp.CallToolReques "name": anchorNode.Name, "file_path": anchorNode.FilePath, }, - "window_days": windowDays, - "depth": depth, - "radius_size": len(radius), - "timeline": timeline, - "callers": callers, - "coverage_gaps": coverage, - "memories": memories, + "window_days": windowDays, + "depth": depth, + "radius_size": len(radius), + "timeline": timeline, + "callers": callers, + "coverage_gaps": coverage, + "memories": memories, }) } @@ -137,9 +137,17 @@ func (s *Server) replayTimeline(radius map[string]int, windowDays, limit int) [] if windowDays > 0 { cutoff = time.Now().Add(-time.Duration(windowDays) * 24 * time.Hour) } + // Batch-fetch every node in the radius; the radius is the BFS + // frontier (often hundreds of IDs), and per-id GetNode on a disk + // backend would issue that many round-trips per replay call. + ids := make([]string, 0, len(radius)) + for id := range radius { + ids = append(ids, id) + } + nodeByID := s.graph.GetNodesByIDs(ids) rows := make([]replayTimelineRow, 0, len(radius)) for id := range radius { - n := s.graph.GetNode(id) + n := nodeByID[id] if n == nil { continue } @@ -197,12 +205,23 @@ func (s *Server) replayTimeline(radius map[string]int, windowDays, limit int) [] } func (s *Server) replayCallers(radius map[string]int, anchor string, limit int) []replayCallerRow { + // Batch-fetch the radius minus the anchor; same rationale as + // replayTimeline — per-id GetNode on a disk backend costs one + // round-trip per BFS node. 
+ ids := make([]string, 0, len(radius)) + for id := range radius { + if id == anchor { + continue + } + ids = append(ids, id) + } + nodeByID := s.graph.GetNodesByIDs(ids) rows := make([]replayCallerRow, 0, len(radius)) for id, d := range radius { if id == anchor { continue } - n := s.graph.GetNode(id) + n := nodeByID[id] if n == nil { continue } @@ -226,13 +245,20 @@ func (s *Server) replayCallers(radius map[string]int, anchor string, limit int) } func (s *Server) replayCoverageGaps(radius map[string]int, limit int) []replayCoverageRow { + // Batch-fetch the radius — same rationale as replayTimeline. + ids := make([]string, 0, len(radius)) + for id := range radius { + ids = append(ids, id) + } + nodeByID := s.graph.GetNodesByIDs(ids) + covRows := s.coverageByID() rows := make([]replayCoverageRow, 0) for id := range radius { - n := s.graph.GetNode(id) + n := nodeByID[id] if n == nil { continue } - pct, has := n.Meta["coverage_pct"].(float64) + pct, has := coveragePctFrom(covRows, n) if has && pct >= 100.0 { continue } diff --git a/internal/mcp/tools_safe_delete.go b/internal/mcp/tools_safe_delete.go index 3f9b73a7..fb848c5d 100644 --- a/internal/mcp/tools_safe_delete.go +++ b/internal/mcp/tools_safe_delete.go @@ -363,7 +363,7 @@ func expandDeleteRange(node *graph.Node, lines []string) (int, int) { // target. Iteration is bounded by cascadeIterationCap; if hit, the // caller surfaces cascade_truncated so the agent knows the closure // may be incomplete. 
-func computeCascadeClosure(g *graph.Graph, target *graph.Node, cascadeIntoTests bool) ([]cascadeClosureEntry, bool) { +func computeCascadeClosure(g graph.Store, target *graph.Node, cascadeIntoTests bool) ([]cascadeClosureEntry, bool) { closure := []cascadeClosureEntry{} inClosure := map[string]bool{target.ID: true} reasons := map[string]string{} @@ -423,7 +423,7 @@ func computeCascadeClosure(g *graph.Graph, target *graph.Node, cascadeIntoTests // collectCascadeCandidates returns every distinct node ID that an // in-closure node points at via a referencing edge — the only // possible new entrants to the closure on this iteration. -func collectCascadeCandidates(g *graph.Graph, inClosure map[string]bool) []string { +func collectCascadeCandidates(g graph.Store, inClosure map[string]bool) []string { seen := map[string]bool{} out := []string{} for from := range inClosure { @@ -448,7 +448,7 @@ func collectCascadeCandidates(g *graph.Graph, inClosure map[string]bool) []strin // reports whether the node has no caller outside the current // closure. Returns a human-readable reason string when the node // qualifies (used for the response payload). -func candidateQualifies(g *graph.Graph, cn *graph.Node, inClosure map[string]bool, cascadeIntoTests bool) (string, bool) { +func candidateQualifies(g graph.Store, cn *graph.Node, inClosure map[string]bool, cascadeIntoTests bool) (string, bool) { targetWS := "" // Build an "in-closure caller" list so the reason string can // name the symbol(s) that are the only ones still calling this @@ -540,7 +540,7 @@ func workspaceKey(n *graph.Node) string { // represents real use (someone calls, implements, extends, or // references this symbol). Structural edges (defines, member_of) // are excluded because they don't block a delete. 
-func collectReferencingEdges(g *graph.Graph, id string) []safeDeleteReference { +func collectReferencingEdges(g graph.Store, id string) []safeDeleteReference { out := make([]safeDeleteReference, 0) seen := map[string]bool{} for _, e := range g.GetInEdges(id) { diff --git a/internal/mcp/tools_search_assist.go b/internal/mcp/tools_search_assist.go index 42c56067..98483536 100644 --- a/internal/mcp/tools_search_assist.go +++ b/internal/mcp/tools_search_assist.go @@ -3,6 +3,7 @@ package mcp import ( "context" "strings" + "time" mcpgo "github.com/mark3labs/mcp-go/mcp" @@ -149,25 +150,72 @@ func expandSearchTerms(ctx context.Context, s *Server, query string) []string { return res.Terms } -// fetchAndMergeBM25 runs BM25 once per term (original + expansions), -// then folds the results into a single deduplicated slice. The -// original query's hits win position; expansion hits append in their -// own BM25 order with duplicates skipped. +// fetchAndMergeBM25 fires (at most) two BM25 calls — one for the +// primary query alone (so we can attribute primaryCount honestly for +// the debug surface) and one for the combined OR-merge of every +// expansion term — then folds the results into a single deduplicated +// slice. The original query's hits win position; the combined- +// expansion hits append in their own BM25 order with duplicates +// skipped. // -// fetchLimit is the per-term over-fetch budget. Bounded by the caller -// so a wide expansion can't blow up the candidate pool. +// Both BM25 backends (BM25Backend and the on-disk backend's FTS) +// treat a multi-token query as an OR-style union +// with a single global BM25 score, so one combined call replaces +// the prior N per-term fan-out (the N+1 round-trip pattern dominated +// the search hot path on disk backends). +// +// A per-fragment exact-name rescue runs after the combined call — +// one batched FindNodesByNames on the engine's reader. 
This +// preserves the per-term behaviour where a fragment like +// "BillingInvoice" finds its exact-name node even when BM25 +// tokenisation drops the PascalCase concatenation. +// +// fetchLimit caps each call so a wide expansion can't blow up the +// candidate pool. // // primaryCount is the size of the original-query BM25 result before -// merging; useful for diagnostic / debug surfaces that want to show -// how many candidates expansion contributed. +// merging — surfaced on the assist debug field so callers can see how +// much expansion contributed. func fetchAndMergeBM25(eng *query.Engine, original string, expanded []string, fetchLimit int, scope query.QueryOptions) (merged []*graph.Node, primaryCount int) { + return fetchAndMergeBM25Timed(eng, original, expanded, fetchLimit, scope, nil) +} + +// fetchAndMergeBM25Timed is fetchAndMergeBM25 with per-phase wall-clock +// breakdowns. The MCP handler hands a fresh SearchTimings struct so +// the resulting Debug log line attributes BM25 time honestly across +// the primary call and the combined-expansion call. Pass nil to skip +// instrumentation (e.g. unit tests that don't care). +func fetchAndMergeBM25Timed(eng *query.Engine, original string, expanded []string, fetchLimit int, scope query.QueryOptions, timings *query.SearchTimings) (merged []*graph.Node, primaryCount int) { + // The merged candidate set is reranked by the handler with the + // full session-aware context; the per-call inner rerank inside + // SearchSymbolsRanked would be wasted work whose output the + // merge discards. SkipInnerRerank collapses the N+1 engine + // rerank invocations to zero — drops ~150-300ms per call on + // a disk backend (each inner rerank's Context.prepare costs at minimum + // two batched edge fetches when the bundle cache misses). 
+ scope.SkipInnerRerank = true + primaryStart := time.Now() primary := eng.SearchSymbolsScoped(original, fetchLimit, scope) primaryCount = len(primary) - if len(expanded) == 0 { + if timings != nil { + timings.BM25PrimaryMS += time.Since(primaryStart).Milliseconds() + } + + // Trim and de-empty the expansion list. When nothing useful + // survives we skip the combined call entirely. + cleanedExpansion := make([]string, 0, len(expanded)) + for _, t := range expanded { + t = strings.TrimSpace(t) + if t != "" { + cleanedExpansion = append(cleanedExpansion, t) + } + } + if len(cleanedExpansion) == 0 { return primary, primaryCount } - seen := make(map[string]bool, len(primary)) - merged = make([]*graph.Node, 0, len(primary)) + + seen := make(map[string]bool, len(primary)+fetchLimit) + merged = make([]*graph.Node, 0, len(primary)+fetchLimit) for _, n := range primary { if seen[n.ID] { continue @@ -175,23 +223,83 @@ func fetchAndMergeBM25(eng *query.Engine, original string, expanded []string, fe seen[n.ID] = true merged = append(merged, n) } - for _, term := range expanded { - term = strings.TrimSpace(term) - if term == "" { + + // Combined OR-merge: pass every expansion term — concatenated by + // whitespace — as ONE BM25 call. Tokenisation + IDF scoring run + // once across the whole bag of terms instead of N times. + // + // The concatenated bag of terms is never going to match any + // node's literal Name, so the engine's exact-name splice would + // pay a guaranteed-empty FindNodesByName round-trip every + // fan-out. SkipExactNameSplice tells gatherBackendCandidates to + // skip it — the per-fragment exact-name rescue below covers the + // load-bearing PascalCase-fragment case the splice was insuring + // against, so dropping the round-trip is safe. 
+ combined := strings.Join(cleanedExpansion, " ") + expansionScope := scope + expansionScope.SkipExactNameSplice = true + expansionStart := time.Now() + extra := eng.SearchSymbolsScoped(combined, fetchLimit, expansionScope) + if timings != nil { + timings.BM25ExpansionMS += time.Since(expansionStart).Milliseconds() + } + for _, n := range extra { + if seen[n.ID] { continue } - extra := eng.SearchSymbolsScoped(term, fetchLimit, scope) - for _, n := range extra { - if seen[n.ID] { - continue + seen[n.ID] = true + merged = append(merged, n) + } + + // Per-fragment exact-name union — cheap (one name-bucket lookup + // per term on in-memory, a single batched name-IN query on a + // disk backend via FindNodesByNames). Preserves the + // per-term behaviour where a fragment like "BillingInvoice" + // finds its exact-name node even when BM25 tokenisation misses + // the PascalCase concatenated token. Without this rescue, + // soup-split mode silently dropped exact matches that the + // per-term loop used to surface via the engine's FindNodesByName + // fallback. + if rdr, ok := graphReaderFromEngine(eng); ok { + nameMap := rdr.FindNodesByNames(cleanedExpansion) + for _, term := range cleanedExpansion { + for _, n := range nameMap[term] { + if n == nil || seen[n.ID] { + continue + } + if n.Kind == graph.KindFile || n.Kind == graph.KindImport { + continue + } + if scope.WorkspaceID != "" && !scope.ScopeAllows(n) { + continue + } + seen[n.ID] = true + merged = append(merged, n) } - seen[n.ID] = true - merged = append(merged, n) } } return merged, primaryCount } +// graphReaderFromEngine returns the engine's underlying graph reader +// if it also exposes the batched FindNodesByNames method (every +// production backend does — in-memory, the on-disk backend, and OverlaidView via +// the layered base). 
Falls back to (nil, false) when an embedded +// test engine wires a stripped-down reader — the rescue step is then +// skipped, matching the contract that callers without a names-batch +// reader simply get the BM25-only result. +type namesReader interface { + FindNodesByNames(names []string) map[string][]*graph.Node +} + +func graphReaderFromEngine(eng *query.Engine) (namesReader, bool) { + if eng == nil { + return nil, false + } + r, ok := eng.Reader().(namesReader) + return r, ok +} + // rerankCap bounds how many candidates the rerank pass sees. The // model has limited working memory; past ~25 items its judgement // degrades and the prompt blows the assist context. Trailing diff --git a/internal/mcp/tools_search_assist_test.go b/internal/mcp/tools_search_assist_test.go index 69968ce9..e4e87e77 100644 --- a/internal/mcp/tools_search_assist_test.go +++ b/internal/mcp/tools_search_assist_test.go @@ -176,6 +176,84 @@ func TestFetchAndMergeBM25_DedupesAcrossTerms(t *testing.T) { assert.Equal(t, idsOf(primary), idsOf(merged)) } +// TestFetchAndMergeBM25_CombinedQueryUnionIsSuperset is the load-bearing +// guard for the "combine expansion terms into one BM25 query" +// optimisation. The merged result MUST contain at least every node +// that a per-term fan-out would have returned — otherwise switching +// from N BM25 calls to (primary + combined) drops candidates the +// rerank pipeline used to see. Exact-name rescue (the per-fragment +// FindNodesByNames step) is what makes this hold for tokenisation +// edge cases like PascalCase concatenated names that BM25 misses. +func TestFetchAndMergeBM25_CombinedQueryUnionIsSuperset(t *testing.T) { + srv, _ := setupTestServer(t) + scope := query.QueryOptions{} + + // Per-term fan-out (the OLD behaviour). For each fragment, run + // the engine search separately and collect every distinct node ID + // it surfaces — this is the worst-case "no candidate may be + // dropped by collapsing into one query" set. 
+ terms := []string{"helper", "main"} + unionExpected := map[string]bool{} + for _, t := range terms { + for _, n := range srv.engine.SearchSymbolsScoped(t, 20, scope) { + unionExpected[n.ID] = true + } + } + require.NotEmpty(t, unionExpected, "per-term fan-out produced nothing — test corpus drifted") + + // New behaviour: primary + combined-OR + per-fragment exact-name + // rescue, all driven by fetchAndMergeBM25. + merged, _ := fetchAndMergeBM25(srv.engine, terms[0], terms[1:], 20, scope) + mergedSet := map[string]bool{} + for _, n := range merged { + mergedSet[n.ID] = true + } + + for id := range unionExpected { + require.True(t, mergedSet[id], "merged result missing per-term hit %q", id) + } +} + +// TestFetchAndMergeBM25_ExactNameRescuePreserved is the regression +// guard for the soup-mode + PascalCase fragment case that per-term +// fan-out used to handle implicitly. When BM25 tokenisation misses +// a fragment ("BillingInvoice" tokenises to one term `billinginvoice` +// which the camelCase-split index doesn't carry), the per-fragment +// FindNodesByNames rescue MUST still surface its exact-name node. +// This mirrors the failure mode TestSearchSymbols_PathScoping caught +// when soup-split fragments first went through the combined query +// path. +func TestFetchAndMergeBM25_ExactNameRescuePreserved(t *testing.T) { + srv, _ := setupTestServer(t) + + // The test corpus carries no PascalCase-concatenated names by + // default, so add three synthetic ones — these never reach BM25 + // (we don't re-index it for the test) but they are what the + // rescue step has to surface. 
+ for path, name := range map[string]string{ + "svc/billing/Invoice.go": "BillingInvoice", + "svc/auth/Login.go": "AuthLogin", + "libs/money/Amount.go": "MoneyAmount", + } { + id := path + "::" + name + srv.graph.AddNode(&graph.Node{ + ID: id, Kind: graph.KindFunction, Name: name, + FilePath: path, StartLine: 1, EndLine: 5, Language: "go", + }) + } + + terms := []string{"BillingInvoice", "AuthLogin", "MoneyAmount"} + merged, _ := fetchAndMergeBM25(srv.engine, terms[0], terms[1:], 20, query.QueryOptions{}) + + mergedNames := map[string]bool{} + for _, n := range merged { + mergedNames[n.Name] = true + } + for _, want := range terms { + require.True(t, mergedNames[want], "exact-name rescue dropped %q from merged result", want) + } +} + // TestFetchAndMergeBM25_AppendsNewMatches verifies that expansion // terms bring in additional candidates the primary term missed. func TestFetchAndMergeBM25_AppendsNewMatches(t *testing.T) { diff --git a/internal/mcp/tools_search_fast_path_test.go b/internal/mcp/tools_search_fast_path_test.go new file mode 100644 index 00000000..6ff98ca1 --- /dev/null +++ b/internal/mcp/tools_search_fast_path_test.go @@ -0,0 +1,207 @@ +package mcp + +import ( + "context" + "encoding/json" + "sync/atomic" + "testing" + + mcplib "github.com/mark3labs/mcp-go/mcp" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/query" + "github.com/zzet/gortex/internal/search" +) + +// recordingBackend is a search.Backend that counts how many times the +// engine called into Search, VectorChannelOnly, and +// SearchSymbolBundles. The identifier-shape fast path test reads these +// counters to assert the handler skipped the vector channel and skipped +// the combined-OR fan-out. +// +// Implements search.Backend, search.ChannelSearcher, +// search.SymbolBundleSearcherBackend, and the VectorChannelOnly +// duck-typed interface the engine queries on the bundle-bypass path. 
+type recordingBackend struct { + hits []search.SearchResult + nodes map[string]*graph.Node + searchCalls atomic.Int32 + bundleCalls atomic.Int32 + vectorOnlyCalls atomic.Int32 + channelCalls atomic.Int32 + queriesMu atomic.Pointer[[]string] +} + +func newRecordingBackend(nodes map[string]*graph.Node, hits []search.SearchResult) *recordingBackend { + rb := &recordingBackend{hits: hits, nodes: nodes} + empty := []string{} + rb.queriesMu.Store(&empty) + return rb +} + +func (rb *recordingBackend) recordQuery(q string) { + for { + oldPtr := rb.queriesMu.Load() + newList := append([]string(nil), *oldPtr...) + newList = append(newList, q) + if rb.queriesMu.CompareAndSwap(oldPtr, &newList) { + return + } + } +} + +func (rb *recordingBackend) queries() []string { + return *rb.queriesMu.Load() +} + +func (rb *recordingBackend) Add(id string, fields ...string) {} +func (rb *recordingBackend) Remove(id string) {} +func (rb *recordingBackend) Count() int { return len(rb.hits) } +func (rb *recordingBackend) Close() {} + +func (rb *recordingBackend) Search(query string, limit int) []search.SearchResult { + rb.searchCalls.Add(1) + rb.recordQuery(query) + return rb.hits +} + +func (rb *recordingBackend) SearchChannels(query string, limit int) ([]search.SearchResult, []string) { + rb.channelCalls.Add(1) + rb.recordQuery(query) + return rb.hits, nil +} + +func (rb *recordingBackend) VectorChannelOnly(query string, limit int) ([]string, search.ChannelTimings) { + rb.vectorOnlyCalls.Add(1) + return nil, search.ChannelTimings{} +} + +// SearchSymbolBundles satisfies the bundle interface so the engine +// takes the bundle fast path on this backend. Edges are nil — the +// rerank tolerates an empty edge cache (it'll fall back to per-node +// fetches via Graph, but for the test we just care that the call +// signature flows through). 
+func (rb *recordingBackend) SearchSymbolBundles(query string, limit int) []search.SymbolBundle { + rb.bundleCalls.Add(1) + rb.recordQuery(query) + if len(rb.hits) == 0 { + return nil + } + out := make([]search.SymbolBundle, 0, len(rb.hits)) + for _, h := range rb.hits { + n := rb.nodes[h.ID] + if n == nil { + continue + } + out = append(out, search.SymbolBundle{Node: n, Score: h.Score}) + } + return out +} + +// identifierFastPathTestServer wires a Server around the recording backend so a +// search_symbols call can be inspected for vector / expansion fan-out +// activity. +func identifierFastPathTestServer(t *testing.T, names []string) (*Server, *recordingBackend) { + t.Helper() + g := graph.New() + nodes := make(map[string]*graph.Node, len(names)) + hits := make([]search.SearchResult, 0, len(names)) + for i, n := range names { + id := "pkg/" + n + ".go::" + n + node := &graph.Node{ + ID: id, Kind: graph.KindFunction, Name: n, + FilePath: "pkg/" + n + ".go", StartLine: i + 1, EndLine: i + 5, Language: "go", + } + g.AddNode(node) + nodes[id] = node + hits = append(hits, search.SearchResult{ID: id, Score: 1.0 / float64(i+1)}) + } + rb := newRecordingBackend(nodes, hits) + eng := query.NewEngine(g) + eng.SetSearch(rb) + srv := NewServer(eng, g, nil, nil, zap.NewNop(), nil) + srv.RunAnalysis() + return srv, rb +} + +// TestSearchSymbols_IdentifierFastPath_SkipsVectorAndExpansion is the +// behavioural guard for the QueryClassSymbol / Path / Signature fast +// path. Three contracts must hold: +// +// 1. The vector channel (VectorChannelOnly on the bundle path, +// SearchChannels on the legacy path) is NEVER called. +// 2. Only the primary query reaches the backend — no combined-OR +// fan-out gets emitted (no second Search / Bundle call carrying +// a concatenated expansion-term string). +// 3. The query_class echoed back in the response matches what the +// handler actually treated the query as. 
+// +// "NewServer" is the canonical identifier-shape probe (PascalCase, no +// whitespace, no separator) — classifies as QueryClassSymbol. +func TestSearchSymbols_IdentifierFastPath_SkipsVectorAndExpansion(t *testing.T) { + srv, rb := identifierFastPathTestServer(t, []string{"NewServer", "NewClient", "StartServer", "Server"}) + + req := mcplib.CallToolRequest{} + req.Params.Name = "search_symbols" + req.Params.Arguments = map[string]any{"query": "NewServer", "limit": 10} + res, err := srv.handleSearchSymbols(context.Background(), req) + require.NoError(t, err) + require.False(t, res.IsError, "search errored: %v", res.Content) + + // Contract 1: no vector channel call. The bundle path's + // VectorChannelOnly is the production-shape probe; SearchChannels + // is the legacy fallback. Neither may fire for an identifier query. + require.Equal(t, int32(0), rb.vectorOnlyCalls.Load(), + "identifier fast path must not call VectorChannelOnly; queries=%v", rb.queries()) + require.Equal(t, int32(0), rb.channelCalls.Load(), + "identifier fast path must not call SearchChannels; queries=%v", rb.queries()) + + // Contract 2: only the primary query reaches the backend. Bundle + // path: one call to SearchSymbolBundles with the bare query. + // Fallback Search may also fire (zero candidates → fallback tier), + // but the combined-OR expansion call is the regression to guard + // against — no Search/Bundle query carries a multi-token expansion + // payload like "NewServer StartServer Server …". + require.Equal(t, int32(1), rb.bundleCalls.Load(), + "primary bundle call should fire exactly once; queries=%v", rb.queries()) + for _, q := range rb.queries() { + require.Equal(t, "NewServer", q, + "only the original query is allowed to reach the backend on the identifier fast path; saw %q in %v", q, rb.queries()) + } + + // Contract 3: response echoes the class. 
+ var resp map[string]any + require.NoError(t, json.Unmarshal([]byte(res.Content[0].(mcplib.TextContent).Text), &resp)) + require.Equal(t, "symbol", resp["query_class"], + "response must echo the classified query_class") +} + +// TestSearchSymbols_ConceptQuery_DoesNotEngageFastPath is the negative +// guard: a natural-language query (concept class) keeps the legacy +// pipeline — vector channel allowed, expansion allowed. Without this +// the fast-path optimisation could silently swallow concept queries. +func TestSearchSymbols_ConceptQuery_DoesNotEngageFastPath(t *testing.T) { + srv, rb := identifierFastPathTestServer(t, []string{"AuthMiddleware", "ValidateToken", "ParseConfig", "Helper"}) + + req := mcplib.CallToolRequest{} + req.Params.Name = "search_symbols" + // Multi-word natural-language query → QueryClassConcept. + req.Params.Arguments = map[string]any{"query": "where do we validate the user token auth", "limit": 10} + res, err := srv.handleSearchSymbols(context.Background(), req) + require.NoError(t, err) + require.False(t, res.IsError, "search errored: %v", res.Content) + + // Concept queries MUST still let the engine fan out to the vector + // channel — the bundle's VectorChannelOnly call fires on the + // bundle hot path. Anything that prevented this would silently + // downgrade the natural-language search experience. 
+ require.GreaterOrEqual(t, rb.vectorOnlyCalls.Load(), int32(1), + "concept query must still pull the vector channel; queries=%v", rb.queries()) + + var resp map[string]any + require.NoError(t, json.Unmarshal([]byte(res.Content[0].(mcplib.TextContent).Text), &resp)) + require.Equal(t, "concept", resp["query_class"], + "NL query must classify as concept") +} diff --git a/internal/mcp/tools_suggest_queries.go b/internal/mcp/tools_suggest_queries.go index f3f59509..7da254d5 100644 --- a/internal/mcp/tools_suggest_queries.go +++ b/internal/mcp/tools_suggest_queries.go @@ -78,7 +78,7 @@ func (s *Server) buildSuggestedQueries(scoped []*graph.Node, inScope map[string] } // 1. Entry points — where the program starts executing. - for i, ep := range entryPoints(scoped, 3) { + for i, ep := range entryPoints(s.graph, inScope, 3) { if i >= 2 { break } @@ -90,27 +90,42 @@ func (s *Server) buildSuggestedQueries(scoped []*graph.Node, inScope map[string] // and by how many of those edges cross a community boundary. Done // directly off the graph rather than via FindHotspots, whose // mean+2σ threshold returns nothing on small repositories. + // + // EdgesByKind streams from the storage layer (one query per kind + // on a disk backend, an indexed bucket scan in-memory) so the cost is + // O(call+reference edges) once — replacing the per-node + // GetInEdges loop that was N cgo round-trips materialising the + // full in-edge bucket per candidate. 
nodeToComm := map[string]string{} if comms := s.getCommunities(); comms != nil { nodeToComm = comms.NodeToComm } - var stats []symbolStat + statByID := make(map[string]*symbolStat, len(scoped)) + stats := make([]symbolStat, 0, len(scoped)) for _, n := range scoped { if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod && n.Kind != graph.KindType { continue } - st := symbolStat{node: n} - myComm := nodeToComm[n.ID] - for _, e := range s.graph.GetInEdges(n.ID) { - if e.Kind != graph.EdgeCalls && e.Kind != graph.EdgeReferences { + stats = append(stats, symbolStat{node: n}) + } + for i := range stats { + statByID[stats[i].node.ID] = &stats[i] + } + for _, k := range []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} { + for e := range s.graph.EdgesByKind(k) { + if e == nil { + continue + } + st, ok := statByID[e.To] + if !ok { continue } st.fanIn++ + myComm := nodeToComm[e.To] if c := nodeToComm[e.From]; myComm != "" && c != "" && c != myComm { st.crossings++ } } - stats = append(stats, st) } // 2. Bridges — symbols pulled at from the most other subsystems. diff --git a/internal/mcp/tools_surprising.go b/internal/mcp/tools_surprising.go index 9a65c196..88ba0c34 100644 --- a/internal/mcp/tools_surprising.go +++ b/internal/mcp/tools_surprising.go @@ -32,16 +32,16 @@ func (s *Server) registerSurprisingConnectionsTool() { // decide whether the anomaly is real or expected without an extra // get_symbol_source round-trip. 
type surprisingEdgeRow struct { - From string `json:"from"` - FromName string `json:"from_name,omitempty"` - FromFile string `json:"from_file,omitempty"` - To string `json:"to"` - ToName string `json:"to_name,omitempty"` - ToFile string `json:"to_file,omitempty"` - Kind string `json:"kind"` - Score float64 `json:"score"` - Signals map[string]float64 `json:"signals"` - Reasons []string `json:"reasons"` + From string `json:"from"` + FromName string `json:"from_name,omitempty"` + FromFile string `json:"from_file,omitempty"` + To string `json:"to"` + ToName string `json:"to_name,omitempty"` + ToFile string `json:"to_file,omitempty"` + Kind string `json:"kind"` + Score float64 `json:"score"` + Signals map[string]float64 `json:"signals"` + Reasons []string `json:"reasons"` } func (s *Server) handleGetSurprisingConnections(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { @@ -61,27 +61,55 @@ func (s *Server) handleGetSurprisingConnections(ctx context.Context, req mcp.Cal nodeToComm = cr.NodeToComm } - // Build a fast scoped-node index and an in-edge counter for - // the hub check. Counting once is cheaper than calling - // GetInEdges per edge. + // Build a fast scoped-node index. We still need ALL kinds here — + // edges in the surprise tally can land on any node, not just + // function/method. Use scopedNodes' single bulk pull rather than + // the per-edge GetNode lookups the legacy path fell back to. scopedSet := make(map[string]*graph.Node, 1024) for _, n := range s.scopedNodes(ctx) { scopedSet[n.ID] = n } - allEdges := s.graph.AllEdges() - inDegree := make(map[string]int, len(scopedSet)) + // Kind tally — short-circuit the AllEdges scan when the backend + // implements EdgeKindCounter (returns one row per distinct kind, + // not one per edge — a few-dozen-row response replaces a ~286k + // edge round-trip on a disk backend). The total edge count then comes + // from the per-kind sum so we don't need a second backend call. 
kindCounts := make(map[graph.EdgeKind]int, 16) + totalEdges := 0 + var allEdges []*graph.Edge + if counter, ok := s.graph.(graph.EdgeKindCounter); ok { + for k, c := range counter.EdgeKindCounts() { + kindCounts[k] = c + totalEdges += c + } + } else { + allEdges = s.graph.AllEdges() + for _, e := range allEdges { + kindCounts[e.Kind]++ + } + totalEdges = len(allEdges) + } + + // In-degree still walks edges Go-side — the per-edge anomaly walk + // further down already pulls the full edge stream, so bucketing + // fan-in during that traversal is free. The InDegreeForNodes + // capability runs one COUNT { … } per id; on the gortex workspace + // the scoped set is ~30k function/method nodes, and tens of + // thousands of indexed subqueries are noticeably slower than the + // single AllEdges materialisation the anomaly walk already pays. + if allEdges == nil { + allEdges = s.graph.AllEdges() + } + inDegree := make(map[string]int, len(scopedSet)) for _, e := range allEdges { if _, ok := scopedSet[e.To]; ok { inDegree[e.To]++ } - kindCounts[e.Kind]++ } // Determine which edge kinds are "unusual" — share of total // edges is at or below rare_kind_pct. Recomputed once per call. - totalEdges := len(allEdges) rareKinds := make(map[graph.EdgeKind]bool, len(kindCounts)) if totalEdges > 0 { thresholdFrac := rareKindPct / 100.0 diff --git a/internal/mcp/tools_untested.go b/internal/mcp/tools_untested.go index 53096f26..f2d8f0fc 100644 --- a/internal/mcp/tools_untested.go +++ b/internal/mcp/tools_untested.go @@ -33,12 +33,11 @@ func (s *Server) handleGetUntestedSymbols(ctx context.Context, req mcp.CallToolR // Fan-in map for ranking — incoming calls/references only; imports and // defines would flood every exported symbol with meaningless coverage. 
- fanIn := make(map[string]int) - for _, e := range s.graph.AllEdges() { - if e.Kind == graph.EdgeCalls || e.Kind == graph.EdgeReferences { - fanIn[e.To]++ - } - } + // Backends that implement graph.InEdgeCounter serve this from one + // count(*) join — on a disk backend the legacy AllEdges() loop + // materialised every edge over the storage boundary just to bucket two kinds. The + // fallback walks AllEdges() as before. + fanIn := collectFanInByKind(s.graph, []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences}) type untestedEntry struct { ID string `json:"id"` @@ -51,10 +50,8 @@ func (s *Server) handleGetUntestedSymbols(ctx context.Context, req mcp.CallToolR var entries []untestedEntry totalCandidates := 0 - for _, n := range s.scopedNodes(ctx) { - if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { - continue - } + scoped := s.scopedNodesByKinds(ctx, []graph.NodeKind{graph.KindFunction, graph.KindMethod}) + for _, n := range scoped { // Skip symbols defined inside test files — those ARE test code. if isTestFile(n.FilePath) { continue @@ -117,26 +114,49 @@ func (s *Server) handleGetUntestedSymbols(ctx context.Context, req mcp.CallToolR // Test files are detected via isTestFile so this works across languages // (Go _test.go, Python test_*.py, JS .spec.ts, etc.) without per-language // special-casing here. -func reachableFromTests(g *graph.Graph) map[string]bool { - covered := make(map[string]bool) - - // Seed: every function/method defined in a test file. - var frontier []string - for _, n := range g.AllNodes() { - if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { - continue - } - if !isTestFile(n.FilePath) { - continue +// +// Seeds the frontier via NodesByKind(function|method) so disk backends +// only materialise the two kinds rather than the whole node table. +// The test-file predicate is a Go string heuristic — the backend has +// no equivalent — so it stays in the post-filter. 
+// +// The BFS itself runs through graph.ReachableForwardByKinds when the +// backend implements it (one query per layer over the frontier +// IN-list instead of N+1 GetOutEdges round-trips). Falls back to +// the per-id GetOutEdges loop on backends that don't. +func reachableFromTests(g graph.Store) map[string]bool { + // Seed: every function/method defined in a test file. NodesByKind + // pushes the kind filter into the backend; isTestFile stays Go. + seeds := make([]string, 0) + for _, kind := range []graph.NodeKind{graph.KindFunction, graph.KindMethod} { + for n := range g.NodesByKind(kind) { + if n == nil || !isTestFile(n.FilePath) { + continue + } + seeds = append(seeds, n.ID) } - if !covered[n.ID] { - covered[n.ID] = true - frontier = append(frontier, n.ID) + } + if len(seeds) == 0 { + return map[string]bool{} + } + + kinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} + if rf, ok := g.(graph.ReachableForwardByKinds); ok { + if got := rf.ReachableForwardByKinds(seeds, kinds); got != nil { + return got } + return map[string]bool{} } - // Forward BFS along calls + references. A test function that calls X - // covers X; X transitively covers whatever X calls, etc. + // Fallback: layer-by-layer BFS using per-id GetOutEdges. + covered := make(map[string]bool, len(seeds)) + frontier := make([]string, 0, len(seeds)) + for _, id := range seeds { + if !covered[id] { + covered[id] = true + frontier = append(frontier, id) + } + } for len(frontier) > 0 { next := frontier[:0:0] for _, id := range frontier { @@ -154,3 +174,32 @@ func reachableFromTests(g *graph.Graph) map[string]bool { } return covered } + +// collectFanInByKind returns the per-target incoming-edge count for +// every edge whose kind is in the allowlist. Prefers the +// graph.InEdgeCounter capability — backends that ship it run one +// count(*) per request instead of an AllEdges() materialisation +// + Go-side bucketing. 
+func collectFanInByKind(g graph.Store, kinds []graph.EdgeKind) map[string]int { + if len(kinds) == 0 { + return map[string]int{} + } + if ic, ok := g.(graph.InEdgeCounter); ok { + if got := ic.InEdgeCountsByKind(kinds); got != nil { + return got + } + return map[string]int{} + } + allowed := make(map[graph.EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + allowed[k] = struct{}{} + } + out := make(map[string]int) + for _, e := range g.AllEdges() { + if _, ok := allowed[e.Kind]; !ok { + continue + } + out[e.To]++ + } + return out +} diff --git a/internal/mcp/tools_wakeup.go b/internal/mcp/tools_wakeup.go index da04d12d..5047a472 100644 --- a/internal/mcp/tools_wakeup.go +++ b/internal/mcp/tools_wakeup.go @@ -41,6 +41,13 @@ type WakeupOptions struct { TopCommunities int TopHotspots int TopEntryPoints int + // PrecomputedHotspots, when non-nil, is the default-threshold + // hotspot ranking the caller has already paid for. Threaded by + // the MCP handler from the server-wide cache so the wakeup turn + // skips a redundant FindHotspots (and its ComputeBetweenness + // pass). nil means BuildWakeup computes it fresh — the CLI + // `gortex wakeup` path. + PrecomputedHotspots []analysis.HotspotEntry } // DefaultWakeupOptions returns the defaults the MCP handler uses. @@ -58,7 +65,7 @@ func DefaultWakeupOptions() WakeupOptions { // communities. Returns the markdown body and an approximate token // count (bytes / 4). Exposed so CLI and MCP paths share one // implementation. 
-func BuildWakeup(g *graph.Graph, communities *analysis.CommunityResult, opts WakeupOptions) (markdown string, tokensEst int) { +func BuildWakeup(g graph.Store, communities *analysis.CommunityResult, opts WakeupOptions) (markdown string, tokensEst int) { if opts.MaxTokens <= 0 { opts.MaxTokens = 500 } @@ -72,16 +79,23 @@ func BuildWakeup(g *graph.Graph, communities *analysis.CommunityResult, opts Wak opts.TopEntryPoints = 5 } - nodes := g.AllNodes() + // Wakeup is a whole-repo digest — language tally + hotspot list + + // entry-point list, with no session scoping. The lang count can + // come from Stats() (one indexed groupby on disk backends); + // hotspots and entry points already iterate the function/method + // subset via the analyzers / NodesByKindsScanner path, so the + // AllNodes() pull the legacy build used to feed the lang summary + // just adds a redundant 107k-row trip on a disk backend. + stats := g.Stats() var b strings.Builder b.WriteString("# Codebase wakeup\n\n") - // Summary line: total nodes, top 3 languages. langCounts := map[string]int{} - for _, n := range nodes { - if n.Language != "" { - langCounts[n.Language]++ + for lang, c := range stats.ByLanguage { + if lang == "" { + continue } + langCounts[lang] = c } type langRow struct { name string @@ -105,8 +119,9 @@ func BuildWakeup(g *graph.Graph, communities *analysis.CommunityResult, opts Wak for _, l := range topLangs { langSummary = append(langSummary, fmt.Sprintf("%s (%d)", l.name, l.count)) } + fileCount := stats.ByKind[string(graph.KindFile)] fmt.Fprintf(&b, "**Scale.** %d indexed symbols across %d files. Primary: %s.\n\n", - len(nodes), countFileNodes(nodes), strings.Join(langSummary, ", ")) + stats.TotalNodes, fileCount, strings.Join(langSummary, ", ")) // Communities. if communities != nil && len(communities.Communities) > 0 { @@ -131,7 +146,12 @@ func BuildWakeup(g *graph.Graph, communities *analysis.CommunityResult, opts Wak } // Hotspots. 
- hotspots := analysis.FindHotspots(g, communities, 0) + var hotspots []analysis.HotspotEntry + if opts.PrecomputedHotspots != nil { + hotspots = opts.PrecomputedHotspots + } else { + hotspots = analysis.FindHotspots(g, communities, 0) + } if len(hotspots) > opts.TopHotspots { hotspots = hotspots[:opts.TopHotspots] } @@ -144,7 +164,7 @@ func BuildWakeup(g *graph.Graph, communities *analysis.CommunityResult, opts Wak } // Entry points. - entries := wakeupEntryPoints(nodes, g, opts.TopEntryPoints) + entries := wakeupEntryPoints(g, opts.TopEntryPoints) if len(entries) > 0 { b.WriteString("**Entry points.**\n") for _, e := range entries { @@ -158,42 +178,79 @@ func BuildWakeup(g *graph.Graph, communities *analysis.CommunityResult, opts Wak return out, len(out) / 4 } -func countFileNodes(nodes []*graph.Node) int { - n := 0 - for _, x := range nodes { - if x.Kind == graph.KindFile { - n++ +// wakeupEntryPoints returns functions/methods with zero incoming +// edges and at least one outgoing edge, ranked by out-degree. +// +// Uses NodeDegreeAggregator when the backend implements it (one +// batched in/out count instead of up to 3N GetInEdges/GetOutEdges +// round-trips on a disk backend — the sort path called GetOutEdges +// twice per candidate, the worst single hot spot in this file). We +// stash the fan-out alongside each node so the sort never has to +// re-query. +func wakeupEntryPoints(g graph.Store, top int) []*graph.Node { + type entry struct { + node *graph.Node + fanOut int + } + // Pull only the callable subset via NodesByKindsScanner so disk + // backends never materialise the whole node table for an entry- + // point candidate set that only ranges across function + method. 
+ var pool []*graph.Node + if scan, ok := g.(graph.NodesByKindsScanner); ok { + pool = scan.NodesByKinds([]graph.NodeKind{graph.KindFunction, graph.KindMethod}) + } else { + all := g.AllNodes() + pool = make([]*graph.Node, 0, len(all)) + for _, n := range all { + if n.Kind == graph.KindFunction || n.Kind == graph.KindMethod { + pool = append(pool, n) + } } } - return n -} - -func wakeupEntryPoints(nodes []*graph.Node, g *graph.Graph, top int) []*graph.Node { - candidates := make([]*graph.Node, 0) - for _, n := range nodes { - if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { - continue + entries := make([]entry, 0, len(pool)) + if agg, ok := g.(graph.NodeDegreeAggregator); ok && len(pool) > 0 { + ids := make([]string, 0, len(pool)) + byID := make(map[string]*graph.Node, len(pool)) + for _, n := range pool { + ids = append(ids, n.ID) + byID[n.ID] = n } - if len(g.GetInEdges(n.ID)) > 0 { - continue + for _, r := range agg.NodeDegreeCounts(ids, nil) { + if r.InCount > 0 || r.OutCount == 0 { + continue + } + n := byID[r.NodeID] + if n == nil { + continue + } + entries = append(entries, entry{node: n, fanOut: r.OutCount}) } - if len(g.GetOutEdges(n.ID)) == 0 { - continue + } else { + for _, n := range pool { + if len(g.GetInEdges(n.ID)) > 0 { + continue + } + out := len(g.GetOutEdges(n.ID)) + if out == 0 { + continue + } + entries = append(entries, entry{node: n, fanOut: out}) } - candidates = append(candidates, n) } - sort.Slice(candidates, func(i, j int) bool { - oi := len(g.GetOutEdges(candidates[i].ID)) - oj := len(g.GetOutEdges(candidates[j].ID)) - if oi != oj { - return oi > oj + sort.Slice(entries, func(i, j int) bool { + if entries[i].fanOut != entries[j].fanOut { + return entries[i].fanOut > entries[j].fanOut } - return candidates[i].ID < candidates[j].ID + return entries[i].node.ID < entries[j].node.ID }) - if len(candidates) > top { - candidates = candidates[:top] + if len(entries) > top { + entries = entries[:top] + } + out := 
make([]*graph.Node, 0, len(entries)) + for _, e := range entries { + out = append(out, e.node) } - return candidates + return out } // trimToTokens caps the markdown to the requested approximate token @@ -226,6 +283,7 @@ func (s *Server) handleGortexWakeup(ctx context.Context, req mcp.CallToolRequest opts.TopEntryPoints = v } + opts.PrecomputedHotspots = s.getHotspots() md, est := BuildWakeup(s.graph, s.getCommunities(), opts) format := strings.ToLower(strings.TrimSpace(req.GetString("format", "markdown"))) diff --git a/internal/modules/scanner.go b/internal/modules/scanner.go index 2630aa20..3357fbd5 100644 --- a/internal/modules/scanner.go +++ b/internal/modules/scanner.go @@ -948,7 +948,7 @@ func BuildGraphArtifacts(filePath string, specs []Spec) ([]*graph.Node, []*graph // dependencies. Multi-version imports (Go's `module/v2` shape) // match the longest spec; a manifest declaring both `bar` and // `bar/v2` will resolve `import bar/v2/sub` to the v2 spec. -func LinkImports(g *graph.Graph, specs []Spec, ownModulePath string) int { +func LinkImports(g graph.Store, specs []Spec, ownModulePath string) int { if g == nil { return 0 } @@ -961,7 +961,7 @@ func LinkImports(g *graph.Graph, specs []Spec, ownModulePath string) int { // in multi-repo mode should pass the repo's own KindImport nodes (e.g. // from g.GetRepoNodes(repoPrefix) filtered by Kind) so each pass stays // O(repo size). 
-func LinkImportsIn(g *graph.Graph, importNodes []*graph.Node, specs []Spec, ownModulePath string) int { +func LinkImportsIn(g graph.Store, importNodes []*graph.Node, specs []Spec, ownModulePath string) int { if g == nil || len(specs) == 0 || len(importNodes) == 0 { return 0 } diff --git a/internal/parser/languages/go_dataflow.go b/internal/parser/languages/go_dataflow.go index 1b6c6d5c..d32a0213 100644 --- a/internal/parser/languages/go_dataflow.go +++ b/internal/parser/languages/go_dataflow.go @@ -23,13 +23,24 @@ import ( // `x := …` / `var x = …` / a range clause / a type-switch / a for- // statement init clause maps to a synthetic ID: // -// #local:@ +// #local:@+ // -// where ownerID is the enclosing function/method node and line is -// the 1-based decl line. These IDs are valid edge endpoints — the -// BFS in `flow_between` traverses them — but no graph node is -// materialised, keeping symbol search free of every transient -// binding in every function body. +// where ownerID is the enclosing function/method node and the +// offset is the local's 1-based line minus the function-decl's +// 1-based line. The leading `+` flags the value as a relative +// offset rather than an absolute line — important for the +// incremental indexer: adding a line *above* the enclosing +// function leaves every local-binding ID inside it stable, so the +// per-save edge churn collapses from O(locals-in-file) to +// O(locals-below-the-edit). +// +// Each binding is materialised as a KindLocal graph node anchored +// to the enclosing function via EdgeMemberOf, so dataflow edges +// targeting locals are not orphan endpoints — they navigate to a +// first-class node like every other edge. KindLocal nodes are +// excluded from the BM25 search index (see +// internal/indexer.shouldIndexForSearch) so identifiers like +// `err` / `data` / `n` / `i` don't flood search results. // // v1 limitations: // @@ -46,7 +57,7 @@ import ( // mirrors the call edge for the same call site. 
Indexer post- // resolution rewrites them once the callee is known — see // `materializeDataflowParams` in internal/indexer. -func emitGoDataflow(ownerID string, body *sitter.Node, paramsByName map[string]string, src []byte, filePath string, result *parser.ExtractionResult) { +func emitGoDataflow(ownerID string, ownerStartLine int, body *sitter.Node, paramsByName map[string]string, imports map[string]string, src []byte, filePath string, result *parser.ExtractionResult) { if body == nil { return } @@ -59,15 +70,51 @@ func emitGoDataflow(ownerID string, body *sitter.Node, paramsByName map[string]s scope.bindings[name] = []string{paramID} } walker := &goFlowWalker{ - ownerID: ownerID, - filePath: filePath, - src: src, - scope: scope, - result: result, + ownerID: ownerID, + ownerStartLine: ownerStartLine, + filePath: filePath, + src: src, + scope: scope, + result: result, + emittedLocals: map[string]struct{}{}, + imports: imports, } walker.walk(body) } +// bindLocal computes the canonical local-binding ID, registers it in +// scope, and on first sight emits the corresponding KindLocal node + +// EdgeMemberOf edge so the binding is a first-class graph element +// rather than a phantom edge endpoint. Returns the ID. Dedupe key is +// the ID itself: a binding visited through multiple walk paths still +// produces one node row. 
+func (w *goFlowWalker) bindLocal(name string, line int) string { + id := w.localID(name, line) + w.scope.bindings[name] = []string{id} + if _, ok := w.emittedLocals[id]; ok { + return id + } + w.emittedLocals[id] = struct{}{} + w.result.Nodes = append(w.result.Nodes, &graph.Node{ + ID: id, + Kind: graph.KindLocal, + Name: name, + FilePath: w.filePath, + StartLine: line, + EndLine: line, + Language: "go", + }) + w.result.Edges = append(w.result.Edges, &graph.Edge{ + From: id, + To: w.ownerID, + Kind: graph.EdgeMemberOf, + FilePath: w.filePath, + Line: line, + Origin: graph.OriginASTResolved, + }) + return id +} + // goFlowScope tracks the most recent source IDs for each named // binding inside a function body. Reassignment replaces the slice @@ -83,13 +130,29 @@ func newGoFlowScope() *goFlowScope { // goFlowWalker carries the per-function state needed to emit // dataflow edges. ownerID is the enclosing function node ID; -// scope tracks live bindings; result accumulates emitted edges. +// ownerStartLine is the 1-based source line of the function's +// declaration — local-binding IDs are anchored to it so edits +// above the function don't churn every binding inside; +// scope tracks live bindings; result accumulates emitted edges; +// emittedLocals dedupes KindLocal node emissions so a binding +// visited through more than one walk path doesn't produce +// duplicate node rows. type goFlowWalker struct { - ownerID string - filePath string - src []byte - scope *goFlowScope - result *parser.ExtractionResult + ownerID string + ownerStartLine int + filePath string + src []byte + scope *goFlowScope + result *parser.ExtractionResult + emittedLocals map[string]struct{} + // imports maps the file's package aliases to their import paths + // (`fmt → "fmt"`, `assert → "github.com/stretchr/testify/assert"`). 
+ // Threaded through so the selector-expression cases in calleeRef / + // exprSources can emit `unresolved::extern::::` + // when the LHS identifier is an imported package — matching the + // shape the call extractor uses — instead of collapsing the + // qualifier to `*.` and losing the resolution evidence. + imports map[string]string } func (w *goFlowWalker) walk(n *sitter.Node) { @@ -126,10 +189,20 @@ func (w *goFlowWalker) walk(n *sitter.Node) { } // localID returns the synthetic local-binding ID for `name` at the -// given line. Always anchored to ownerID so two functions can have -// identically-named locals without colliding. +// given absolute line. Always anchored to ownerID so two functions +// can have identically-named locals without colliding. The line is +// encoded as an offset from the owner's declaration line (prefixed +// `+` so it's unambiguous): a same-function shift caused by an edit +// above the function leaves the ID stable. A defensive zero-anchor +// fallback handles cases where the caller didn't supply an owner +// start line (the walker is constructed with one in production; the +// fallback keeps misuse from producing IDs missing the @ separator). func (w *goFlowWalker) localID(name string, line int) string { - return w.ownerID + "#local:" + name + "@" + strconv.Itoa(line) + offset := line + if w.ownerStartLine > 0 { + offset = line - w.ownerStartLine + 1 + } + return w.ownerID + "#local:" + name + "@+" + strconv.Itoa(offset) } func (w *goFlowWalker) handleShortVarDecl(n *sitter.Node) { @@ -224,9 +297,7 @@ func (w *goFlowWalker) declareTarget(lhs *sitter.Node, decl bool, line int) (str if name == "" || name == "_" { return "", false } - id := w.localID(name, line) - w.scope.bindings[name] = []string{id} - return id, true + return w.bindLocal(name, line), true case "selector_expression": // `x.field = …` — write goes to the field node when known. 
field := lhs.ChildByFieldName("field") @@ -343,8 +414,7 @@ func (w *goFlowWalker) handleRangeClause(n *sitter.Node) { if name == "" || name == "_" { continue } - id := w.localID(name, line) - w.scope.bindings[name] = []string{id} + id := w.bindLocal(name, line) for _, src := range rhsSources { if src == "" || src == id { continue @@ -477,11 +547,22 @@ func (w *goFlowWalker) calleeRef(call *sitter.Node) string { if method == "" { return "" } - // Receiver-typed targets (e.g. an import alias dispatch) - // can't be reconstructed without the file's import map. - // Fall through to the generic "*." form — same shape the - // call extractor uses when receiver is a local. - _ = recv + // Package-qualified call: when the receiver is a bare + // identifier matching one of the file's import aliases, + // emit the same `unresolved::extern::::` + // shape the call extractor uses for explicit calls (see + // golang.go::Extract `imports[c.receiver]` branch). The + // resolver's resolveExtern pass then lands these on + // stdlib::/dep::/external:: targets or the real cross-repo + // symbol when the import path resolves to an indexed file. + // Without this branch the qualifier is dropped and we leak + // `unresolved::*.` for every package call inside a + // dataflow context. + if recv != nil && recv.Type() == "identifier" { + if importPath := w.importPathFor(recv.Content(w.src)); importPath != "" { + return "unresolved::extern::" + importPath + "::" + method + } + } return "unresolved::*." + method case "generic_function": // `f[T](args)` — strip the type instantiation wrapper. @@ -551,6 +632,17 @@ func (w *goFlowWalker) exprSources(n *sitter.Node) []string { if fieldName == "" { return nil } + // Package-qualified value: when the receiver is a bare + // identifier matching one of the file's import aliases, + // emit `unresolved::extern::::` so the + // resolver can land it on stdlib::/dep::/external::. See + // the matching comment in calleeRef. 
+ operand := n.ChildByFieldName("operand") + if operand != nil && operand.Type() == "identifier" { + if importPath := w.importPathFor(operand.Content(w.src)); importPath != "" { + return []string{"unresolved::extern::" + importPath + "::" + fieldName} + } + } return []string{"unresolved::*." + fieldName} case "call_expression": ref := w.calleeRef(n) @@ -666,3 +758,17 @@ func (w *goFlowWalker) emitValueFlow(src, dst string, line int) { Origin: graph.OriginASTResolved, }) } + +// importPathFor returns the import path the given identifier names +// as a package alias in the current file, or "" when the identifier +// doesn't match any import. The walker's imports map is the same +// map populated by the Go extractor's emitImport handler, so an +// `assert` alias for `github.com/stretchr/testify/assert` resolves +// here exactly as it does in the call extractor's +// `imports[c.receiver]` branch. +func (w *goFlowWalker) importPathFor(name string) string { + if name == "" || w.imports == nil { + return "" + } + return w.imports[name] +} diff --git a/internal/parser/languages/go_dataflow_local_nodes_test.go b/internal/parser/languages/go_dataflow_local_nodes_test.go new file mode 100644 index 00000000..1aa6e622 --- /dev/null +++ b/internal/parser/languages/go_dataflow_local_nodes_test.go @@ -0,0 +1,118 @@ +package languages + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +// TestGoDataflow_LocalsMaterialiseAsKindLocal is the regression for +// the design change that lifted intra-function bindings from +// edge-endpoint-only IDs to first-class KindLocal nodes. Storage +// backends that enforce edge-endpoint foreign keys (the disk backend) +// had to auto-stub empty Node rows for every local-binding edge endpoint — +// 51k+ stubs on the gortex codebase. 
Materialising as KindLocal +// converges every backend's node count and gives locals a proper +// home in the graph via EdgeMemberOf to the enclosing function. +func TestGoDataflow_LocalsMaterialiseAsKindLocal(t *testing.T) { + src := `package foo + +func Handler(x int) int { + y := x + z := y + return z +} +` + fix := runGoExtract(t, src) + owner := "pkg/foo.go::Handler" + + locals := fix.nodesByKind[graph.KindLocal] + require.NotEmpty(t, locals, "extractor should emit KindLocal nodes for short_var_decl bindings") + + names := map[string]*graph.Node{} + for _, n := range locals { + names[n.Name] = n + } + for _, want := range []string{"y", "z"} { + n, ok := names[want] + require.Truef(t, ok, "missing KindLocal for %q; got: %v", want, names) + assert.Equal(t, graph.KindLocal, n.Kind) + assert.Equal(t, "pkg/foo.go", n.FilePath, "local %q should carry the file it lives in", want) + assert.Equal(t, "go", n.Language, "local %q should carry language", want) + assert.Greater(t, n.StartLine, 0, "local %q should carry a source line", want) + // The node ID must be exactly the same string the dataflow + // edges target — they're keyed by edge endpoint, so a + // mismatch silently breaks flow_between BFS. + assert.True(t, strings.HasPrefix(n.ID, owner+"#local:"+want+"@+"), + "local node ID must follow the function-relative offset convention, got %q", n.ID) + } + + // Every materialised local must have an EdgeMemberOf edge to the + // enclosing function — that's what makes the local discoverable + // as a member of its owner via get_callers / class_hierarchy. 
+ memberEdges := fix.edgesByKind[graph.EdgeMemberOf] + memberOwners := map[string]string{} + for _, e := range memberEdges { + memberOwners[e.From] = e.To + } + for _, n := range locals { + owner, ok := memberOwners[n.ID] + assert.Truef(t, ok, "local %q must have an EdgeMemberOf edge", n.Name) + assert.Equalf(t, "pkg/foo.go::Handler", owner, + "local %q's EdgeMemberOf target must be the enclosing function", n.Name) + } +} + +// TestGoDataflow_LocalsDedupedAcrossWalks guards against duplicate +// KindLocal node emissions if the same binding is visited through +// more than one walk path (e.g., short_var + a subsequent reference +// in the same scope). The walker's emittedLocals set must collapse +// repeat visits to one node row. +func TestGoDataflow_LocalsDedupedAcrossWalks(t *testing.T) { + src := `package foo + +func Multi() { + y := 1 + _ = y + _ = y + _ = y +} +` + fix := runGoExtract(t, src) + ys := []string{} + for _, n := range fix.nodesByKind[graph.KindLocal] { + if n.Name == "y" { + ys = append(ys, n.ID) + } + } + assert.Lenf(t, ys, 1, "exactly one KindLocal row per (function, binding) — got: %v", ys) +} + +// TestGoDataflow_RangeClauseEmitsKindLocal covers the second binding +// site (the range-clause path) — confirms the materialisation isn't +// limited to short_var_decl / var_spec. 
+func TestGoDataflow_RangeClauseEmitsKindLocal(t *testing.T) { + src := `package foo + +func Iter(xs []int) int { + total := 0 + for i, v := range xs { + _ = i + total += v + } + return total +} +` + fix := runGoExtract(t, src) + names := map[string]bool{} + for _, n := range fix.nodesByKind[graph.KindLocal] { + names[n.Name] = true + } + for _, want := range []string{"total", "i", "v"} { + assert.Truef(t, names[want], "missing KindLocal for range binding %q; got %v", want, names) + } +} diff --git a/internal/parser/languages/go_dataflow_offset_test.go b/internal/parser/languages/go_dataflow_offset_test.go new file mode 100644 index 00000000..ab63f4e2 --- /dev/null +++ b/internal/parser/languages/go_dataflow_offset_test.go @@ -0,0 +1,177 @@ +package languages + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/zzet/gortex/internal/graph" +) + +// TestGoDataflow_LocalIDsAreFunctionRelative is the regression for +// the absolute-line local-ID encoding that produced O(locals-in-file) +// edge churn on every save: adding an unrelated line above a function +// shifted every local-binding ID inside it, so the per-file +// incremental update had to delete + re-insert every dataflow edge +// even when nothing inside the function changed. +// +// The function-relative encoding (#local:@+) +// anchors each binding's ID to the owner's declaration line, so the +// IDs are invariant under shifts of the function as a whole — only +// edits *inside* the function above a binding shift that binding's +// ID. The test indexes the same source twice — once verbatim, once +// with a comment inserted above the function — and asserts the local +// IDs match exactly. +func TestGoDataflow_LocalIDsAreFunctionRelative(t *testing.T) { + original := `package foo + +func Handler(x int) int { + y := x + z := y + return z +} +` + // Same Handler, but with 5 unrelated lines of comments above it. 
+ // If local IDs used absolute lines, every #local: target in the + // extracted edges would shift by 5 and would NOT match the + // originals. + shifted := `package foo + +// shimmer +// shimmer +// shimmer +// shimmer +// shimmer +func Handler(x int) int { + y := x + z := y + return z +} +` + + collectLocalIDs := func(t *testing.T, src string) map[string]struct{} { + t.Helper() + fix := runGoExtract(t, src) + ids := map[string]struct{}{} + for _, edges := range fix.edgesByKind { + for _, e := range edges { + for _, ep := range []string{e.From, e.To} { + if strings.Contains(ep, "#local:") { + ids[ep] = struct{}{} + } + } + } + } + return ids + } + + origIDs := collectLocalIDs(t, original) + shiftedIDs := collectLocalIDs(t, shifted) + + // Sanity: the function actually has locals to compare. + assert.NotEmpty(t, origIDs, "extractor should emit #local: edge endpoints") + + // The two sets must match. Any divergence means a local-ID shifted + // because of the lines added *above* the function — the exact + // churn case the offset encoding is meant to prevent. + assert.Equal(t, origIDs, shiftedIDs, + "local IDs must stay stable when only lines ABOVE the function move") + + // Belt + suspenders: every #local: ID must carry the offset + // marker (`@+`) rather than the legacy `@`. + for id := range origIDs { + at := strings.LastIndex(id, "@") + assert.Greater(t, at, 0, "id has no @ separator: %q", id) + assert.Equal(t, byte('+'), id[at+1], "id must encode offset (`@+`), got %q", id) + } +} + +// TestGoDataflow_LocalIDsShiftOnIntraFunctionEdit confirms the +// converse: edits *inside* the function above a binding still shift +// that binding's ID. (The offset encoding only neutralises edits +// outside the function, not inside it — local-line motion within the +// function is the load-bearing disambiguator for the same name +// shadowed at different lines.) 
+func TestGoDataflow_LocalIDsShiftOnIntraFunctionEdit(t *testing.T) { + base := `package foo + +func Handler(x int) int { + y := x + return y +} +` + withInternalShift := `package foo + +func Handler(x int) int { + _ = 1 // <-- inserted INSIDE the function, above y + y := x + return y +} +` + collect := func(t *testing.T, src string) map[string]struct{} { + t.Helper() + ids := map[string]struct{}{} + for _, edges := range runGoExtract(t, src).edgesByKind { + for _, e := range edges { + for _, ep := range []string{e.From, e.To} { + if strings.Contains(ep, "#local:y@") { + ids[ep] = struct{}{} + } + } + } + } + return ids + } + + a := collect(t, base) + b := collect(t, withInternalShift) + assert.NotEmpty(t, a) + assert.NotEmpty(t, b) + assert.NotEqual(t, a, b, + "adding a line INSIDE the function above the binding MUST shift the local ID — this is the disambiguator for re-bound names") +} + +// TestGoClosureIDsAreFunctionRelative is the closure analogue of the +// local-binding test. The closure's anchor used to be the absolute +// `#closure@`; switching it to `#closure@+` gives the +// same churn-reduction benefit. The Name field still carries the +// absolute line for human readability in outlines. +func TestGoClosureIDsAreFunctionRelative(t *testing.T) { + original := `package foo + +func Outer() func() int { + return func() int { return 42 } +} +` + shifted := `package foo + +// a +// b +// c +func Outer() func() int { + return func() int { return 42 } +} +` + closureNodes := func(t *testing.T, src string) map[string]*graph.Node { + t.Helper() + fix := runGoExtract(t, src) + out := map[string]*graph.Node{} + for _, n := range fix.nodesByKind[graph.KindClosure] { + out[n.ID] = n + } + return out + } + + a := closureNodes(t, original) + b := closureNodes(t, shifted) + assert.NotEmpty(t, a, "extractor should emit at least one closure node") + + // IDs must match across the shift. 
+ for id := range a { + assert.Contains(t, b, id, + "closure ID must stay stable when only lines ABOVE the enclosing function move") + assert.True(t, strings.Contains(id, "#closure@+"), + "closure ID must use the `@+` form, got %q", id) + } +} diff --git a/internal/parser/languages/go_dataflow_qualifier_test.go b/internal/parser/languages/go_dataflow_qualifier_test.go new file mode 100644 index 00000000..561ac1d3 --- /dev/null +++ b/internal/parser/languages/go_dataflow_qualifier_test.go @@ -0,0 +1,161 @@ +package languages + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/zzet/gortex/internal/graph" +) + +// TestGoDataflow_SelectorCallPreservesPackageQualifier is the +// regression for the dataflow walker dropping the package qualifier +// on selector calls (`fmt.Sprintf`, `strings.Join`, `assert.True`) +// and leaking `unresolved::*.` instead of the proper +// `unresolved::extern::::` shape the call +// extractor uses. The resolver's resolveExtern pass then lands +// these on stdlib::/dep::/external::, so without preserving the +// qualifier here every package-qualified call inside a dataflow +// context (argument source, return target, value flow) stays as +// an unresolved phantom. +func TestGoDataflow_SelectorCallPreservesPackageQualifier(t *testing.T) { + src := `package foo + +import ( + "fmt" + "strings" +) + +func Handler(input string) string { + cleaned := strings.TrimSpace(input) + return fmt.Sprintf("got: %s", cleaned) +} +` + fix := runGoExtract(t, src) + + // Every `unresolved::extern::::` target the + // dataflow walker emits must use the canonical import path, + // not the `*.method` collapsed form. 
+ var hasStringsTrimSpace, hasFmtSprintf bool + for _, edges := range fix.edgesByKind { + for _, e := range edges { + switch e.To { + case "unresolved::extern::strings::TrimSpace": + hasStringsTrimSpace = true + case "unresolved::extern::fmt::Sprintf": + hasFmtSprintf = true + } + } + } + + assert.True(t, hasStringsTrimSpace, + "dataflow walker must preserve the `strings` qualifier on TrimSpace(...) calls — got: %s", + dumpDataflowSelectorTargets(fix)) + assert.True(t, hasFmtSprintf, + "dataflow walker must preserve the `fmt` qualifier on Sprintf(...) calls — got: %s", + dumpDataflowSelectorTargets(fix)) + + // And the collapsed `*.TrimSpace`/`*.Sprintf` shape must NOT + // appear for these calls. + for _, edges := range fix.edgesByKind { + for _, e := range edges { + assert.NotEqual(t, "unresolved::*.TrimSpace", e.To, + "package-qualified Trim should never land as `unresolved::*.TrimSpace`") + assert.NotEqual(t, "unresolved::*.Sprintf", e.To, + "package-qualified Sprintf should never land as `unresolved::*.Sprintf`") + } + } +} + +// TestGoDataflow_NonImportedReceiverFallsBack ensures the pass +// doesn't false-positive: when the receiver is NOT a package alias +// (a local variable, a struct field), it must keep emitting the +// `unresolved::*.` form so other passes can apply their +// own heuristics. +func TestGoDataflow_NonImportedReceiverFallsBack(t *testing.T) { + src := `package foo + +type Buffer struct{} + +func (b *Buffer) Write(p []byte) {} + +func Run(buf *Buffer, data []byte) { + buf.Write(data) +} +` + fix := runGoExtract(t, src) + + // `buf.Write(data)` — buf is a parameter, NOT an import; the + // walker's fallback path must keep `*.` (the call extractor's + // own path already records receiver_type on the call edge). 
+ var seen bool + for _, edges := range fix.edgesByKind { + for _, e := range edges { + if e.To == "unresolved::*.Write" { + seen = true + } + assert.NotEqual(t, "unresolved::extern::buf::Write", e.To, + "`buf` is a parameter — must not be classified as a package alias") + } + } + assert.True(t, seen, "the walker must still emit `unresolved::*.Write` for non-import receivers; "+ + "got: %s", dumpDataflowSelectorTargets(fix)) +} + +func dumpDataflowSelectorTargets(fix *extractedFixture) string { + var b strings.Builder + for _, edges := range fix.edgesByKind { + for _, e := range edges { + if strings.Contains(e.To, "Sprintf") || strings.Contains(e.To, "TrimSpace") || strings.Contains(e.To, "Write") { + b.WriteString("\n [" + string(e.Kind) + "] " + e.From + " -> " + e.To) + } + } + } + return b.String() +} + +// guard: also verifies the same fix applies in exprSources (not just +// calleeRef) — a selector accessed as a value (not invoked) should +// also preserve its qualifier. Uses a real stdlib import so the +// extractor's emitImport handler matches its production code path. +func TestGoDataflow_SelectorValuePreservesQualifier(t *testing.T) { + src := `package foo + +import "os" + +func DefaultPerm() any { + return os.ModePerm +} +` + fix := runGoExtract(t, src) + _ = graph.KindFunction + + var foundProperShape bool + for _, edges := range fix.edgesByKind { + for _, e := range edges { + // handleReturn emits `From: src, To: owner` — flow goes + // FROM the value source TO the function's owner. So the + // qualified target lives on e.From, not e.To. 
+ if strings.HasPrefix(e.From, "unresolved::extern::os::") || + strings.HasPrefix(e.To, "unresolved::extern::os::") { + foundProperShape = true + } + } + } + assert.True(t, foundProperShape, + "selector-value access (os.ModePerm) must emit the extern:: shape; got:\n%s", + dumpAllSelectorish(fix)) +} + +func dumpAllSelectorish(fix *extractedFixture) string { + var b strings.Builder + for _, edges := range fix.edgesByKind { + for _, e := range edges { + if strings.Contains(e.To, "ModePerm") || strings.Contains(e.To, "::os::") || strings.HasPrefix(e.To, "unresolved::*.") { + b.WriteString(" [" + string(e.Kind) + "] " + e.From + " -> " + e.To + "\n") + } + } + } + return b.String() +} diff --git a/internal/parser/languages/go_function_shape.go b/internal/parser/languages/go_function_shape.go index 27cebdc5..7b6211ce 100644 --- a/internal/parser/languages/go_function_shape.go +++ b/internal/parser/languages/go_function_shape.go @@ -24,7 +24,7 @@ import ( // declLine is the 1-based line of the declaration, used as the // anchor for nodes/edges that don't have a finer-grained AST // position to reference. 
-func emitGoFunctionShape(ownerID string, defNode *sitter.Node, paramsCap, resultCap *parser.CapturedNode, src []byte, filePath string, declLine int, result *parser.ExtractionResult) { +func emitGoFunctionShape(ownerID string, defNode *sitter.Node, paramsCap, resultCap *parser.CapturedNode, src []byte, filePath string, declLine int, imports map[string]string, result *parser.ExtractionResult) { if defNode == nil { return } @@ -32,15 +32,21 @@ func emitGoFunctionShape(ownerID string, defNode *sitter.Node, paramsCap, result emitGoReturnEdges(ownerID, resultCap, src, filePath, declLine, result) emitGoGenericParamNodes(ownerID, defNode, src, filePath, declLine, result) if body := goFuncBody(defNode); body != nil { - emitGoClosureNodes(ownerID, body, src, filePath, result) + emitGoClosureNodes(ownerID, declLine, body, src, filePath, result) emitGoChannelOps(ownerID, body, src, filePath, result) // CPG-lite intra-procedural dataflow: emits EdgeValueFlow, // EdgeArgOf, and EdgeReturnsTo placeholders. Inter-procedural // targets are lifted by the indexer's // MaterializeDataflowParams pass once the call resolver - // has landed every callee. + // has landed every callee. declLine anchors local-binding + // IDs as offsets so edits above the function don't churn + // every binding inside. imports are the file's package + // aliases so selector-expression cases inside the walker + // can rewrite `pkg.Method` calls to the proper + // `unresolved::extern::::` shape + // instead of dropping the qualifier. paramsByName := goParamNamesFromCapture(paramsCap, src) - emitGoDataflow(ownerID, body, paramsByName, src, filePath, result) + emitGoDataflow(ownerID, declLine, body, paramsByName, imports, src, filePath, result) } } @@ -388,7 +394,7 @@ func emitGoGenericParamNodes(ownerID string, defNode *sitter.Node, src []byte, f // enclosing function. Re-attributing them would require teaching // the call-emit walker to recognise closure boundaries — tracked as // a Phase 1.5 follow-up. 
-func emitGoClosureNodes(ownerID string, body *sitter.Node, src []byte, filePath string, result *parser.ExtractionResult) { +func emitGoClosureNodes(ownerID string, ownerStartLine int, body *sitter.Node, src []byte, filePath string, result *parser.ExtractionResult) { if body == nil { return } @@ -398,7 +404,15 @@ func emitGoClosureNodes(ownerID string, body *sitter.Node, src []byte, filePath return true } startLine := int(n.StartPoint().Row) + 1 - closureID := ownerID + "#closure@" + strconv.Itoa(startLine) + // ID anchors on the owner-relative offset (+ prefix) so edits + // above the enclosing function don't churn the closure's ID. + // Name keeps the absolute line for human readability in search + // results / outlines. + offset := startLine + if ownerStartLine > 0 { + offset = startLine - ownerStartLine + 1 + } + closureID := ownerID + "#closure@+" + strconv.Itoa(offset) // If two anonymous functions start on the same line, append a // stable suffix so IDs stay unique. Rare in practice but // defensive. diff --git a/internal/parser/languages/golang.go b/internal/parser/languages/golang.go index 50a3a8b3..1f9a5c6c 100644 --- a/internal/parser/languages/golang.go +++ b/internal/parser/languages/golang.go @@ -279,10 +279,10 @@ func (e *GoExtractor) Extract(filePath string, src []byte) (*parser.ExtractionRe // No-op (the package name is not currently surfaced as a node). 
case m.Captures["func.def"] != nil: - e.emitFunction(m, filePath, fileID, src, result, paramsByFunc) + e.emitFunction(m, filePath, fileID, src, result, paramsByFunc, imports) case m.Captures["method.def"] != nil: - e.emitMethod(m, filePath, fileID, src, result, paramsByFunc) + e.emitMethod(m, filePath, fileID, src, result, paramsByFunc, imports) case m.Captures["typedef.def"] != nil: e.emitTypeDecl(m, filePath, fileID, src, result, seenTypeName) @@ -831,7 +831,7 @@ func (e *GoExtractor) Extract(filePath string, src []byte) (*parser.ExtractionRe // --- Per-match emit helpers ----------------------------------------- -func (e *GoExtractor) emitFunction(m parser.QueryResult, filePath, fileID string, src []byte, result *parser.ExtractionResult, paramsByFunc map[string]typeEnv) { +func (e *GoExtractor) emitFunction(m parser.QueryResult, filePath, fileID string, src []byte, result *parser.ExtractionResult, paramsByFunc map[string]typeEnv, imports map[string]string) { name := m.Captures["func.name"].Text def := m.Captures["func.def"] id := filePath + "::" + name @@ -875,7 +875,7 @@ func (e *GoExtractor) emitFunction(m parser.QueryResult, filePath, fileID string }) emitGoThrowsEdges(node, m.Captures["func.result"], filePath, result) emitGoFunctionShape(id, def.Node, m.Captures["func.params"], m.Captures["func.result"], - src, filePath, def.StartLine+1, result) + src, filePath, def.StartLine+1, imports, result) } // goFuncBody returns the `block` body child of a function/method @@ -897,7 +897,7 @@ func goFuncBody(decl *sitter.Node) *sitter.Node { return nil } -func (e *GoExtractor) emitMethod(m parser.QueryResult, filePath, fileID string, src []byte, result *parser.ExtractionResult, paramsByFunc map[string]typeEnv) { +func (e *GoExtractor) emitMethod(m parser.QueryResult, filePath, fileID string, src []byte, result *parser.ExtractionResult, paramsByFunc map[string]typeEnv, imports map[string]string) { name := m.Captures["method.name"].Text def := m.Captures["method.def"] 
receiverText := m.Captures["method.receiver"].Text @@ -958,7 +958,7 @@ func (e *GoExtractor) emitMethod(m parser.QueryResult, filePath, fileID string, }) emitGoThrowsEdges(node, m.Captures["method.result"], filePath, result) emitGoFunctionShape(id, def.Node, m.Captures["method.params"], m.Captures["method.result"], - src, filePath, def.StartLine+1, result) + src, filePath, def.StartLine+1, imports, result) } // goTypeParams reads the `type_parameters` child of a Go declaration @@ -1459,12 +1459,15 @@ func (e *GoExtractor) emitImport(m parser.QueryResult, filePath, fileID string, Language: "go", Meta: importMeta, }) - // File → import-node edge (Defines), so get_file_summary picks - // it up under the file's children. + // File → import-node edge. EdgeContains is the semantic fit (the + // file *contains* an import statement; it doesn't *define* the + // imported package). The disk-backed GetFileSubGraph walks + // EdgeDefines ∪ EdgeContains from the file node to enumerate the + // full neighbourhood in one edge-index pass. result.Edges = append(result.Edges, &graph.Edge{ From: fileID, To: importNodeID, - Kind: graph.EdgeDefines, + Kind: graph.EdgeContains, FilePath: filePath, Line: line, }) diff --git a/internal/parser/languages/python.go b/internal/parser/languages/python.go index b689cae9..9cee7cb4 100644 --- a/internal/parser/languages/python.go +++ b/internal/parser/languages/python.go @@ -876,9 +876,13 @@ func pyEmitImportNode(filePath, fileID, importPath, alias string, line int, resu Language: "python", Meta: meta, }) + // File → import-node uses EdgeContains (the file contains an + // import statement; it doesn't define the imported module). + // GetFileSubGraph walks EdgeDefines ∪ EdgeContains to recover the + // full file neighbourhood. 
result.Edges = append(result.Edges, &graph.Edge{ From: fileID, To: importNodeID, - Kind: graph.EdgeDefines, FilePath: filePath, Line: line, + Kind: graph.EdgeContains, FilePath: filePath, Line: line, }) } diff --git a/internal/parser/languages/ts_dataflow.go b/internal/parser/languages/ts_dataflow.go new file mode 100644 index 00000000..6c5e405d --- /dev/null +++ b/internal/parser/languages/ts_dataflow.go @@ -0,0 +1,244 @@ +package languages + +import ( + "strconv" + + sitter "github.com/zzet/gortex/internal/parser/tsitter" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/parser" +) + +// emitTSLocalBindings walks a TypeScript / JavaScript function body +// and materialises a KindLocal node for every introduced binding +// (`let x = …`, `const x = …`, `var x = …`, destructured shorthand, +// for-in/for-of induction vars, catch clause bindings, ...). Each +// binding gets: +// +// - ID `#local:@+` +// (function-relative offset like the Go walker, so an edit +// above the function leaves the IDs stable), +// - Name = the identifier, +// - FilePath / StartLine = the binding's source position, +// - EdgeMemberOf back to the enclosing function so the resolver's +// scope-aware bare-name binding (#81) can find it by walking +// the function's inbound EdgeMemberOf of KindLocal. +// +// TS doesn't (yet) have a dataflow walker analogous to +// emitGoDataflow, so no value_flow / arg_of / returns_to edges +// target these locals today. Their value is semantic parity with +// Go: every introduced binding is a first-class graph node with +// stable identity, ready for the dataflow / scope-resolution +// passes downstream. KindLocal is excluded from BM25 search via +// shouldIndexForSearch so the materialisation doesn't pollute name +// lookups with per-function `err` / `data` / `i` rows. +// +// Mirrors emitGoDataflow's bindLocal helper for the +// node-emission side; the walk shape is TypeScript-specific +// (different AST node types). 
+func emitTSLocalBindings(ownerID string, ownerStartLine int, body *sitter.Node, src []byte, filePath string, result *parser.ExtractionResult) { + if body == nil || ownerID == "" { + return + } + w := &tsBindingWalker{ + ownerID: ownerID, + ownerStartLine: ownerStartLine, + filePath: filePath, + src: src, + result: result, + emitted: map[string]struct{}{}, + } + w.walk(body) +} + +type tsBindingWalker struct { + ownerID string + ownerStartLine int + filePath string + src []byte + result *parser.ExtractionResult + emitted map[string]struct{} +} + +func (w *tsBindingWalker) walk(n *sitter.Node) { + if n == nil { + return + } + switch n.Type() { + case "function_declaration", "method_definition", "function", "arrow_function", "generator_function", "generator_function_declaration", "function_expression": + // Don't descend into nested functions — their bindings + // belong to the inner function's scope. The TS extractor's + // own pass handles each inner function separately. + return + case "lexical_declaration", "variable_declaration": + w.handleVarDecl(n) + // Fall through to children for any nested expressions + // (e.g. an initializer that contains a destructure pattern + // is already captured by handleVarDecl; no extra walk). + return + case "for_in_statement", "for_of_statement": + w.handleForInOf(n) + // Continue into the body to pick up nested declarations. + if body := n.ChildByFieldName("body"); body != nil { + w.walk(body) + } + return + case "catch_clause": + w.handleCatchClause(n) + if body := n.ChildByFieldName("body"); body != nil { + w.walk(body) + } + return + } + for i := 0; i < int(n.NamedChildCount()); i++ { + w.walk(n.NamedChild(i)) + } +} + +// handleVarDecl visits `let`, `const`, `var` declarations and emits +// a KindLocal node per declarator. 
Each declarator's `name` field +// is either an identifier (simplest case) or a destructure pattern +// (object_pattern / array_pattern) — for patterns we descend and +// emit one node per shorthand identifier. +func (w *tsBindingWalker) handleVarDecl(decl *sitter.Node) { + for i := 0; i < int(decl.NamedChildCount()); i++ { + c := decl.NamedChild(i) + if c == nil || c.Type() != "variable_declarator" { + continue + } + name := c.ChildByFieldName("name") + if name == nil { + continue + } + w.emitFromPattern(name, int(decl.StartPoint().Row)+1) + } +} + +// handleForInOf visits `for (const x of items)` / `for (let k in obj)` +// and materialises the induction var(s) declared on the LHS. +func (w *tsBindingWalker) handleForInOf(n *sitter.Node) { + left := n.ChildByFieldName("left") + if left == nil { + return + } + line := int(n.StartPoint().Row) + 1 + switch left.Type() { + case "lexical_declaration", "variable_declaration": + w.handleVarDecl(left) + case "identifier": + w.bindLocal(left.Content(w.src), line) + default: + w.emitFromPattern(left, line) + } +} + +// handleCatchClause materialises the catch parameter (`catch (err) +// { ... }`). TS supports both an identifier and a destructure +// pattern as the catch binding. +func (w *tsBindingWalker) handleCatchClause(n *sitter.Node) { + param := n.ChildByFieldName("parameter") + if param == nil { + return + } + w.emitFromPattern(param, int(n.StartPoint().Row)+1) +} + +// emitFromPattern recursively visits a binding pattern (identifier +// at the leaf; object_pattern / array_pattern in the middle) and +// emits a KindLocal node for every leaf identifier. Shorthand +// (`{ a, b }`) and renamed (`{ a: aliased }`) both produce +// identifier leaves the walker handles uniformly. 
+func (w *tsBindingWalker) emitFromPattern(node *sitter.Node, line int) { + if node == nil { + return + } + switch node.Type() { + case "identifier", "shorthand_property_identifier_pattern": + w.bindLocal(node.Content(w.src), line) + case "object_pattern", "array_pattern": + for i := 0; i < int(node.NamedChildCount()); i++ { + c := node.NamedChild(i) + if c == nil { + continue + } + switch c.Type() { + case "pair_pattern": + // `{ a: aliased }` — the bound name lives on the + // `value` field. + if v := c.ChildByFieldName("value"); v != nil { + w.emitFromPattern(v, line) + } + case "rest_pattern": + for j := 0; j < int(c.NamedChildCount()); j++ { + w.emitFromPattern(c.NamedChild(j), line) + } + default: + w.emitFromPattern(c, line) + } + } + case "assignment_pattern": + // `let x = 1` inside a destructure — the bound name is on + // the `left` field; the right is the default. + if l := node.ChildByFieldName("left"); l != nil { + w.emitFromPattern(l, line) + } + case "rest_pattern": + for i := 0; i < int(node.NamedChildCount()); i++ { + w.emitFromPattern(node.NamedChild(i), line) + } + } +} + +// bindLocal emits the KindLocal node + owner edge. Idempotent on +// the binding ID so a name visited through more than one walk path +// produces exactly one node row. +func (w *tsBindingWalker) bindLocal(name string, line int) { + if name == "" || name == "_" { + return + } + offset := line + if w.ownerStartLine > 0 { + offset = line - w.ownerStartLine + 1 + } + id := w.ownerID + "#local:" + name + "@+" + strconv.Itoa(offset) + if _, ok := w.emitted[id]; ok { + return + } + w.emitted[id] = struct{}{} + // Language tag mirrors the file's source language; the + // extractor's caller passes the file path so we recover it + // from the suffix. Defaults to typescript when ambiguous. 
+ lang := "typescript" + switch { + case hasSuffix(w.filePath, ".tsx"): + lang = "tsx" + case hasSuffix(w.filePath, ".jsx"): + lang = "javascript" + case hasSuffix(w.filePath, ".js"), hasSuffix(w.filePath, ".mjs"), hasSuffix(w.filePath, ".cjs"): + lang = "javascript" + } + w.result.Nodes = append(w.result.Nodes, &graph.Node{ + ID: id, + Kind: graph.KindLocal, + Name: name, + FilePath: w.filePath, + StartLine: line, + EndLine: line, + Language: lang, + }) + w.result.Edges = append(w.result.Edges, &graph.Edge{ + From: id, + To: w.ownerID, + Kind: graph.EdgeMemberOf, + FilePath: w.filePath, + Line: line, + Origin: graph.OriginASTResolved, + }) +} + +func hasSuffix(s, suf string) bool { + if len(s) < len(suf) { + return false + } + return s[len(s)-len(suf):] == suf +} diff --git a/internal/parser/languages/ts_dataflow_test.go b/internal/parser/languages/ts_dataflow_test.go new file mode 100644 index 00000000..d0731363 --- /dev/null +++ b/internal/parser/languages/ts_dataflow_test.go @@ -0,0 +1,188 @@ +package languages + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +// runTSLocalExtract is a thin adapter over the package's runTSExtract +// (declared in ts_function_shape_test.go) that returns the nodes and +// edges as a single struct convenient for the binding assertions +// below. +type tsLocalFixture struct { + nodes []*graph.Node + edges []*graph.Edge +} + +func runTSLocalExtract(t *testing.T, fileName, src string) tsLocalFixture { + t.Helper() + nodes, edges := runTSExtract(t, "pkg/"+fileName, src) + return tsLocalFixture{nodes: nodes, edges: edges} +} + +// TestEmitTSLocalBindings_LetConstVar covers the headline case: +// `let`, `const`, `var` declarations each produce a KindLocal node +// anchored to the enclosing function via EdgeMemberOf, with a +// function-relative offset ID so the binding stays stable across +// edits above the function. 
+func TestEmitTSLocalBindings_LetConstVar(t *testing.T) { + src := `function handler(req: any): string { + const raw = req.headers.authorization; + let token = raw.replace("Bearer ", ""); + var fallback = "anon"; + return token || fallback; +} +` + result := runTSLocalExtract(t, "auth.ts", src) + owner := "pkg/auth.ts::handler" + + locals := map[string]*graph.Node{} + for _, n := range result.nodes { + if n.Kind == graph.KindLocal { + locals[n.Name] = n + } + } + for _, want := range []string{"raw", "token", "fallback"} { + n, ok := locals[want] + require.Truef(t, ok, "missing KindLocal %q; got %v", want, mapKeys(locals)) + assert.Equal(t, graph.KindLocal, n.Kind) + assert.Equal(t, "pkg/auth.ts", n.FilePath) + assert.Truef(t, strings.HasPrefix(n.ID, owner+"#local:"+want+"@+"), + "local %q ID must be function-relative; got %q", want, n.ID) + } + + // Every local must have an EdgeMemberOf back to the owner. + memberFor := map[string]string{} + for _, e := range result.edges { + if e.Kind == graph.EdgeMemberOf { + memberFor[e.From] = e.To + } + } + for _, n := range locals { + assert.Equal(t, owner, memberFor[n.ID], + "local %q must own-link to enclosing function", n.Name) + } +} + +// TestEmitTSLocalBindings_DestructurePatterns ensures the walker +// handles object and array destructure patterns — common in JS/TS +// codebases (`const { foo, bar: aliased } = obj`). 
+func TestEmitTSLocalBindings_DestructurePatterns(t *testing.T) { + src := `function unpack(obj: any) { + const { foo, bar: aliased } = obj; + const [first, second] = obj.list; +} +` + result := runTSLocalExtract(t, "unpack.ts", src) + names := map[string]bool{} + for _, n := range result.nodes { + if n.Kind == graph.KindLocal { + names[n.Name] = true + } + } + for _, want := range []string{"foo", "aliased", "first", "second"} { + assert.Truef(t, names[want], "missing KindLocal for destructure %q; got %v", want, names) + } +} + +// TestEmitTSLocalBindings_ForOfBinding covers for-of induction vars +// — the parser's other binding-introduction site beyond plain +// declarations. +func TestEmitTSLocalBindings_ForOfBinding(t *testing.T) { + src := `function each(items: any[]) { + for (const item of items) { + const inner = item.value; + } +} +` + result := runTSLocalExtract(t, "each.ts", src) + names := map[string]bool{} + for _, n := range result.nodes { + if n.Kind == graph.KindLocal { + names[n.Name] = true + } + } + assert.True(t, names["item"], "for-of induction var must be materialised") + assert.True(t, names["inner"], "binding inside the loop body must be materialised") +} + +// TestEmitTSLocalBindings_NestedFunctionsScopeIsolated guards the +// walker against descending into nested functions (their bindings +// belong to their own scope, not the outer function's). 
+func TestEmitTSLocalBindings_NestedFunctionsScopeIsolated(t *testing.T) { + src := `function outer() { + const x = 1; + function inner() { + const y = 2; + } +} +` + result := runTSLocalExtract(t, "nested.ts", src) + outerOwner := "pkg/nested.ts::outer" + memberOwners := map[string]string{} + for _, e := range result.edges { + if e.Kind == graph.EdgeMemberOf { + memberOwners[e.From] = e.To + } + } + for _, n := range result.nodes { + if n.Kind != graph.KindLocal { + continue + } + switch n.Name { + case "x": + assert.Equal(t, outerOwner, memberOwners[n.ID], + "outer's local must own-link to outer") + case "y": + assert.NotEqual(t, outerOwner, memberOwners[n.ID], + "inner's local must NOT own-link to outer — different scope") + } + } +} + +// TestEmitTSLocalBindings_FunctionRelativeOffsetIsStable mirrors the +// Go regression at #76: adding a line above the function must NOT +// shift any local-binding ID inside it. +func TestEmitTSLocalBindings_FunctionRelativeOffsetIsStable(t *testing.T) { + orig := `function f() { + const x = 1; + const y = 2; +} +` + shifted := `// header +// header +// header +function f() { + const x = 1; + const y = 2; +} +` + collect := func(t *testing.T, src string) map[string]struct{} { + t.Helper() + ids := map[string]struct{}{} + for _, n := range runTSLocalExtract(t, "stable.ts", src).nodes { + if n.Kind == graph.KindLocal { + ids[n.ID] = struct{}{} + } + } + return ids + } + a := collect(t, orig) + b := collect(t, shifted) + assert.NotEmpty(t, a) + assert.Equal(t, a, b, + "local IDs must stay stable when only lines ABOVE the function move") +} + +func mapKeys(m map[string]*graph.Node) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + return out +} diff --git a/internal/parser/languages/ts_function_shape.go b/internal/parser/languages/ts_function_shape.go index d58062cf..26018cc4 100644 --- a/internal/parser/languages/ts_function_shape.go +++ b/internal/parser/languages/ts_function_shape.go 
@@ -34,6 +34,13 @@ func emitTSFunctionShape(ownerID string, declNode *sitter.Node, src []byte, file if body := tsFunctionBody(declNode); body != nil { emitTSAsyncSpawns(ownerID, body, src, filePath, result) emitTSFieldAccess(ownerID, body, src, filePath, result) + // Materialise let / const / var / range / catch bindings as + // KindLocal nodes — semantic parity with the Go extractor's + // #77 work. Idempotent on the binding ID (function-relative + // offset), excluded from BM25 search by shouldIndexForSearch, + // and consumed by the resolver's scope-aware bare-name bind + // (#81) for future dataflow / scope-resolution work. + emitTSLocalBindings(ownerID, declLine, body, src, filePath, result) } } diff --git a/internal/parser/languages/typescript.go b/internal/parser/languages/typescript.go index 8af445a3..528b5659 100644 --- a/internal/parser/languages/typescript.go +++ b/internal/parser/languages/typescript.go @@ -803,9 +803,14 @@ func (e *TypeScriptExtractor) emitImport(m parser.QueryResult, filePath, fileID Language: "typescript", Meta: importMeta, }) + // File → import-node uses EdgeContains (the file contains the + // import statement; it doesn't define the imported module). The + // resolver-facing file → unresolved::import path stays on + // EdgeImports unchanged — that's a file-to-file dependency, a + // different relationship. result.Edges = append(result.Edges, &graph.Edge{ From: fileID, To: importNodeID, - Kind: graph.EdgeDefines, FilePath: filePath, Line: line, + Kind: graph.EdgeContains, FilePath: filePath, Line: line, }) result.Edges = append(result.Edges, &graph.Edge{ From: fileID, To: "unresolved::import::" + importPath, diff --git a/internal/persistence/file_store.go b/internal/persistence/file_store.go index 33ef57d7..65ae577a 100644 --- a/internal/persistence/file_store.go +++ b/internal/persistence/file_store.go @@ -36,7 +36,7 @@ type FileStore struct { } // NewFileStore creates a file-based persistence store. 
-// If dir is empty, defaults to the Gortex cache dir (~/.cache/gortex/, +// If dir is empty, defaults to the Gortex cache dir (~/.gortex/cache/, // or the $XDG_CACHE_HOME equivalent when that variable is set). func NewFileStore(dir, version string) (*FileStore, error) { if dir == "" { diff --git a/internal/persistence/file_store_test.go b/internal/persistence/file_store_test.go index 1d2b5914..cf58db9f 100644 --- a/internal/persistence/file_store_test.go +++ b/internal/persistence/file_store_test.go @@ -276,10 +276,21 @@ func TestFileStore_ConcurrentReadWrite(t *testing.T) { return default: } + var e error if i%2 == 0 { - errs <- fs.Save(snap) + e = fs.Save(snap) } else { - errs <- fs.Evict(snap.RepoPath, snap.Branch, snap.CommitHash) + e = fs.Evict(snap.RepoPath, snap.Branch, snap.CommitHash) + } + // Honour stop while sending: errs is buffered, and the + // writer outruns the buffer in microseconds. Without the + // stop arm here the writer blocks on a full errs channel, + // never re-checks stop, and wg.Wait() deadlocks (the buffer + // only drains after wg.Wait()). + select { + case errs <- e: + case <-stop: + return } } }() diff --git a/internal/persistence/sidecar_sqlite.go b/internal/persistence/sidecar_sqlite.go new file mode 100644 index 00000000..290718de --- /dev/null +++ b/internal/persistence/sidecar_sqlite.go @@ -0,0 +1,855 @@ +package persistence + +import ( + "database/sql" + "encoding/json" + "fmt" + "os" + "path/filepath" + "sort" + "sync" + "time" + + _ "modernc.org/sqlite" +) + +// SidecarStore is the SQLite-backed side-store for the agent's +// non-graph knowledge: session notes, cross-session development +// memories, saved scopes, and repository notebooks. It is a SEPARATE +// database file from the graph store — independent of the graph +// --backend — so notes/memories/scopes/notebooks persist even when +// the graph runs with the in-memory backend. 
+// +// The file lives at /sidecar.sqlite by default (see +// DefaultSidecarPath); tests and the per-repo `gortex mcp` subprocess +// can point it at a cache-dir-local path for isolation. +// +// Rows are scoped by repo_key (the same RepoCacheKey hash the gob.gz +// layout used as a directory name) so a single sidecar file holds the +// notes/memories/notebooks of every repo the daemon serves. Scopes are +// global (no repo_key) — they were never per-repo. +// +// The managers in internal/mcp keep their in-memory slice + scorers +// unchanged; this store only swaps the persistence layer: load rows +// into the slice on open, write rows on each mutation, trim via a +// bounded DELETE. +type SidecarStore struct { + db *sql.DB + // writeMu serialises mutations. SQLite serialises writers + // internally; mirroring that on the Go side turns SQLITE_BUSY + // contention into clean lock-wait. + writeMu sync.Mutex +} + +const sidecarSchema = ` +CREATE TABLE IF NOT EXISTS notes ( + id TEXT NOT NULL, + repo_key TEXT NOT NULL, + session_id TEXT NOT NULL DEFAULT '', + client_name TEXT NOT NULL DEFAULT '', + body TEXT NOT NULL DEFAULT '', + symbol_id TEXT NOT NULL DEFAULT '', + file_path TEXT NOT NULL DEFAULT '', + repo_prefix TEXT NOT NULL DEFAULT '', + workspace_id TEXT NOT NULL DEFAULT '', + project_id TEXT NOT NULL DEFAULT '', + tags TEXT NOT NULL DEFAULT '[]', + auto_links TEXT NOT NULL DEFAULT '[]', + pinned INTEGER NOT NULL DEFAULT 0, + created_at INTEGER NOT NULL DEFAULT 0, + updated_at INTEGER NOT NULL DEFAULT 0, + PRIMARY KEY (repo_key, id) +) WITHOUT ROWID; +CREATE INDEX IF NOT EXISTS idx_notes_session ON notes (repo_key, session_id); +CREATE INDEX IF NOT EXISTS idx_notes_workspace ON notes (repo_key, workspace_id, project_id); +CREATE INDEX IF NOT EXISTS idx_notes_updated ON notes (repo_key, updated_at DESC); + +CREATE TABLE IF NOT EXISTS memories ( + id TEXT NOT NULL, + repo_key TEXT NOT NULL, + kind TEXT NOT NULL DEFAULT '', + source TEXT NOT NULL DEFAULT '', + 
body TEXT NOT NULL DEFAULT '', + title TEXT NOT NULL DEFAULT '', + confidence REAL NOT NULL DEFAULT 0, + importance INTEGER NOT NULL DEFAULT 0, + author_agent TEXT NOT NULL DEFAULT '', + symbol_ids TEXT NOT NULL DEFAULT '[]', + file_paths TEXT NOT NULL DEFAULT '[]', + auto_links TEXT NOT NULL DEFAULT '[]', + tags TEXT NOT NULL DEFAULT '[]', + workspace_id TEXT NOT NULL DEFAULT '', + project_id TEXT NOT NULL DEFAULT '', + repo_prefix TEXT NOT NULL DEFAULT '', + pinned INTEGER NOT NULL DEFAULT 0, + superseded_by TEXT NOT NULL DEFAULT '', + access_count INTEGER NOT NULL DEFAULT 0, + last_accessed INTEGER NOT NULL DEFAULT 0, + created_at INTEGER NOT NULL DEFAULT 0, + updated_at INTEGER NOT NULL DEFAULT 0, + PRIMARY KEY (repo_key, id) +) WITHOUT ROWID; +CREATE INDEX IF NOT EXISTS idx_memories_workspace ON memories (repo_key, workspace_id, project_id); +CREATE INDEX IF NOT EXISTS idx_memories_updated ON memories (repo_key, updated_at DESC); +CREATE INDEX IF NOT EXISTS idx_memories_kind ON memories (repo_key, kind); + +CREATE TABLE IF NOT EXISTS scopes ( + name TEXT NOT NULL PRIMARY KEY, + description TEXT NOT NULL DEFAULT '', + repos TEXT NOT NULL DEFAULT '[]', + paths TEXT NOT NULL DEFAULT '[]' +) WITHOUT ROWID; + +CREATE TABLE IF NOT EXISTS notebooks ( + id TEXT NOT NULL, + repo_key TEXT NOT NULL, + title TEXT NOT NULL DEFAULT '', + body TEXT NOT NULL DEFAULT '', + tags TEXT NOT NULL DEFAULT '[]', + symbol_ids TEXT NOT NULL DEFAULT '[]', + used_count INTEGER NOT NULL DEFAULT 0, + last_used INTEGER NOT NULL DEFAULT 0, + created_at INTEGER NOT NULL DEFAULT 0, + updated_at INTEGER NOT NULL DEFAULT 0, + PRIMARY KEY (repo_key, id) +) WITHOUT ROWID; +CREATE INDEX IF NOT EXISTS idx_notebooks_updated ON notebooks (repo_key, updated_at DESC); + +CREATE TABLE IF NOT EXISTS migration_marks ( + repo_key TEXT NOT NULL, + kind TEXT NOT NULL, + done_at INTEGER NOT NULL DEFAULT 0, + PRIMARY KEY (repo_key, kind) +) WITHOUT ROWID; +` + +// DefaultSidecarPath is the canonical location of 
the side-store DB: +// /sidecar.sqlite (~/.gortex/sidecar.sqlite by default). An +// absolute $XDG_DATA_HOME relocates it under that tree, same as the +// graph store and models. +func DefaultSidecarPath(dataDir string) string { + return filepath.Join(dataDir, "sidecar.sqlite") +} + +// --------------------------------------------------------------------------- +// Process-shared sidecar cache. +// +// A single sidecar file may back several managers (notes + memories + +// notebooks + scopes for one repo, plus every other repo a daemon +// serves). Opening one *sql.DB per manager would multiply the pool and +// risk lock contention, so stores are cached by absolute path and +// reused. Tests that pass distinct temp paths get distinct handles. +// --------------------------------------------------------------------------- + +var ( + sidecarMu sync.Mutex + sidecarCache = map[string]*SidecarStore{} +) + +// OpenSidecar opens (or creates) the sidecar DB at path, reusing an +// already-open handle for the same absolute path. An empty path yields +// (nil, nil): callers treat a nil store as "in-memory only, no disk" +// — the behaviour the gob.gz managers had when their cache dir was +// empty. +func OpenSidecar(path string) (*SidecarStore, error) { + if path == "" { + return nil, nil + } + abs, err := filepath.Abs(path) + if err != nil { + abs = path + } + + sidecarMu.Lock() + defer sidecarMu.Unlock() + if st, ok := sidecarCache[abs]; ok { + return st, nil + } + + if dir := filepath.Dir(abs); dir != "" { + if err := os.MkdirAll(dir, 0o755); err != nil { + return nil, fmt.Errorf("persistence: mkdir sidecar dir: %w", err) + } + } + + // Same WAL + synchronous=NORMAL + busy_timeout tradeoff the graph + // store_sqlite backend uses for write-heavy embedded workloads. 
+ dsn := abs + "?_pragma=journal_mode(WAL)&_pragma=synchronous(NORMAL)&_pragma=busy_timeout(5000)&_pragma=foreign_keys(OFF)" + db, err := sql.Open("sqlite", dsn) + if err != nil { + return nil, fmt.Errorf("persistence: open sidecar: %w", err) + } + if _, err := db.Exec(sidecarSchema); err != nil { + _ = db.Close() + return nil, fmt.Errorf("persistence: sidecar schema: %w", err) + } + + st := &SidecarStore{db: db} + sidecarCache[abs] = st + return st, nil +} + +// Close closes the underlying *sql.DB and drops it from the shared +// cache. Primarily for tests; the daemon keeps its sidecar open for +// the process lifetime. +func (s *SidecarStore) Close() error { + if s == nil { + return nil + } + sidecarMu.Lock() + for k, v := range sidecarCache { + if v == s { + delete(sidecarCache, k) + } + } + sidecarMu.Unlock() + return s.db.Close() +} + +// --------------------------------------------------------------------------- +// JSON helpers for []string columns. +// --------------------------------------------------------------------------- + +func encodeStrings(in []string) string { + if len(in) == 0 { + return "[]" + } + b, err := json.Marshal(in) + if err != nil { + return "[]" + } + return string(b) +} + +func decodeStrings(s string) []string { + if s == "" || s == "[]" { + return nil + } + var out []string + if err := json.Unmarshal([]byte(s), &out); err != nil { + return nil + } + return out +} + +// unixOrZero converts a time to a UTC unix-nano stamp; the zero time +// maps to 0 so a NULL/absent value round-trips back to the zero time. +func unixOrZero(t time.Time) int64 { + if t.IsZero() { + return 0 + } + return t.UTC().UnixNano() +} + +func fromUnix(n int64) time.Time { + if n == 0 { + return time.Time{} + } + return time.Unix(0, n).UTC() +} + +// --------------------------------------------------------------------------- +// Migration bookkeeping. 
+// --------------------------------------------------------------------------- + +// migrationDone reports whether a one-shot legacy import has already +// run for (repoKey, kind). Idempotency guard for the gob.gz/json/md → +// sqlite import. +func (s *SidecarStore) migrationDone(repoKey, kind string) bool { + var n int + row := s.db.QueryRow(`SELECT COUNT(1) FROM migration_marks WHERE repo_key = ? AND kind = ?`, repoKey, kind) + if err := row.Scan(&n); err != nil { + return false + } + return n > 0 +} + +func (s *SidecarStore) markMigrated(repoKey, kind string) { + _, _ = s.db.Exec(`INSERT OR REPLACE INTO migration_marks (repo_key, kind, done_at) VALUES (?,?,?)`, + repoKey, kind, time.Now().UTC().UnixNano()) +} + +// countRows returns the number of rows for a repo_key in the given +// table — used to guard "sqlite already has rows" before importing. +func (s *SidecarStore) countRows(table, repoKey string) int { + var n int + row := s.db.QueryRow(`SELECT COUNT(1) FROM `+table+` WHERE repo_key = ?`, repoKey) + if err := row.Scan(&n); err != nil { + return 0 + } + return n +} + +// =========================================================================== +// Notes +// =========================================================================== + +// LoadNotesRows reads every note for a repo_key, oldest-first (the +// managers append-load into a chronological slice). +func (s *SidecarStore) LoadNotesRows(repoKey string) ([]NoteEntry, error) { + rows, err := s.db.Query(` + SELECT id, session_id, client_name, body, symbol_id, file_path, + repo_prefix, workspace_id, project_id, tags, auto_links, + pinned, created_at, updated_at + FROM notes WHERE repo_key = ? 
+ ORDER BY created_at ASC, id ASC`, repoKey) + if err != nil { + return nil, fmt.Errorf("persistence: query notes: %w", err) + } + defer func() { _ = rows.Close() }() + + var out []NoteEntry + for rows.Next() { + var ( + e NoteEntry + tags, links string + pinned int + createdAt, updatedAt int64 + ) + if err := rows.Scan(&e.ID, &e.SessionID, &e.ClientName, &e.Body, &e.SymbolID, + &e.FilePath, &e.RepoPrefix, &e.WorkspaceID, &e.ProjectID, &tags, &links, + &pinned, &createdAt, &updatedAt); err != nil { + return out, fmt.Errorf("persistence: scan note: %w", err) + } + e.Tags = decodeStrings(tags) + e.AutoLinks = decodeStrings(links) + e.Pinned = pinned != 0 + e.Timestamp = fromUnix(createdAt) + e.UpdatedAt = fromUnix(updatedAt) + out = append(out, e) + } + return out, rows.Err() +} + +// UpsertNote writes (or replaces) a single note row. +func (s *SidecarStore) UpsertNote(repoKey string, e NoteEntry) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + pinned := 0 + if e.Pinned { + pinned = 1 + } + _, err := s.db.Exec(` + INSERT OR REPLACE INTO notes + (id, repo_key, session_id, client_name, body, symbol_id, file_path, + repo_prefix, workspace_id, project_id, tags, auto_links, pinned, + created_at, updated_at) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)`, + e.ID, repoKey, e.SessionID, e.ClientName, e.Body, e.SymbolID, e.FilePath, + e.RepoPrefix, e.WorkspaceID, e.ProjectID, encodeStrings(e.Tags), + encodeStrings(e.AutoLinks), pinned, unixOrZero(e.Timestamp), unixOrZero(e.UpdatedAt)) + if err != nil { + return fmt.Errorf("persistence: upsert note: %w", err) + } + return nil +} + +// DeleteNote removes a single note row. Missing rows are not errors. +func (s *SidecarStore) DeleteNote(repoKey, id string) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + _, err := s.db.Exec(`DELETE FROM notes WHERE repo_key = ? 
AND id = ?`, repoKey, id) + return err +} + +// TrimNotes enforces the soft cap: when the repo_key holds more than +// cap notes, the oldest non-pinned notes are deleted first until the +// count is within cap (pinned notes are never deleted). Mirrors the +// gob.gz trimNotes semantics as a bounded DELETE. +func (s *SidecarStore) TrimNotes(repoKey string, cap int) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + + var total int + if err := s.db.QueryRow(`SELECT COUNT(1) FROM notes WHERE repo_key = ?`, repoKey).Scan(&total); err != nil { + return err + } + if total <= cap { + return nil + } + excess := total - cap + // Delete the oldest non-pinned notes first. + _, err := s.db.Exec(` + DELETE FROM notes + WHERE repo_key = ? AND pinned = 0 AND id IN ( + SELECT id FROM notes + WHERE repo_key = ? AND pinned = 0 + ORDER BY created_at ASC, id ASC + LIMIT ? + )`, repoKey, repoKey, excess) + return err +} + +// =========================================================================== +// Memories +// =========================================================================== + +// LoadMemoriesRows reads every memory for a repo_key, oldest-first. +func (s *SidecarStore) LoadMemoriesRows(repoKey string) ([]MemoryEntry, error) { + rows, err := s.db.Query(` + SELECT id, kind, source, body, title, confidence, importance, + author_agent, symbol_ids, file_paths, auto_links, tags, + workspace_id, project_id, repo_prefix, pinned, superseded_by, + access_count, last_accessed, created_at, updated_at + FROM memories WHERE repo_key = ? 
+ ORDER BY created_at ASC, id ASC`, repoKey) + if err != nil { + return nil, fmt.Errorf("persistence: query memories: %w", err) + } + defer func() { _ = rows.Close() }() + + var out []MemoryEntry + for rows.Next() { + var ( + e MemoryEntry + syms, files, links, tags string + confidence float64 + pinned int + accessCount int64 + lastAccessed, created, updated int64 + ) + if err := rows.Scan(&e.ID, &e.Kind, &e.Source, &e.Body, &e.Title, &confidence, + &e.Importance, &e.AuthorAgent, &syms, &files, &links, &tags, + &e.WorkspaceID, &e.ProjectID, &e.RepoPrefix, &pinned, &e.SupersededBy, + &accessCount, &lastAccessed, &created, &updated); err != nil { + return out, fmt.Errorf("persistence: scan memory: %w", err) + } + e.Confidence = float32(confidence) + e.SymbolIDs = decodeStrings(syms) + e.FilePaths = decodeStrings(files) + e.AutoLinks = decodeStrings(links) + e.Tags = decodeStrings(tags) + e.Pinned = pinned != 0 + e.AccessCount = uint64(accessCount) + e.LastAccessed = fromUnix(lastAccessed) + e.Timestamp = fromUnix(created) + e.UpdatedAt = fromUnix(updated) + out = append(out, e) + } + return out, rows.Err() +} + +// UpsertMemory writes (or replaces) a single memory row. 
+func (s *SidecarStore) UpsertMemory(repoKey string, e MemoryEntry) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + pinned := 0 + if e.Pinned { + pinned = 1 + } + _, err := s.db.Exec(` + INSERT OR REPLACE INTO memories + (id, repo_key, kind, source, body, title, confidence, importance, + author_agent, symbol_ids, file_paths, auto_links, tags, workspace_id, + project_id, repo_prefix, pinned, superseded_by, access_count, + last_accessed, created_at, updated_at) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)`, + e.ID, repoKey, e.Kind, e.Source, e.Body, e.Title, float64(e.Confidence), + e.Importance, e.AuthorAgent, encodeStrings(e.SymbolIDs), encodeStrings(e.FilePaths), + encodeStrings(e.AutoLinks), encodeStrings(e.Tags), e.WorkspaceID, e.ProjectID, + e.RepoPrefix, pinned, e.SupersededBy, int64(e.AccessCount), + unixOrZero(e.LastAccessed), unixOrZero(e.Timestamp), unixOrZero(e.UpdatedAt)) + if err != nil { + return fmt.Errorf("persistence: upsert memory: %w", err) + } + return nil +} + +// DeleteMemory removes a single memory row. Missing rows are not errors. +func (s *SidecarStore) DeleteMemory(repoKey, id string) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + _, err := s.db.Exec(`DELETE FROM memories WHERE repo_key = ? AND id = ?`, repoKey, id) + return err +} + +// TrimMemories enforces the soft cap with the two-pass policy the +// gob.gz trimMemories used: first shed non-pinned importance<=2 rows, +// then (if still over cap) shed the oldest non-pinned rows. Pinned +// rows are never deleted. +func (s *SidecarStore) TrimMemories(repoKey string, cap int) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + + var total int + if err := s.db.QueryRow(`SELECT COUNT(1) FROM memories WHERE repo_key = ?`, repoKey).Scan(&total); err != nil { + return err + } + if total <= cap { + return nil + } + excess := total - cap + + // Pass 1: oldest non-pinned, low-importance (<=2) rows. + res, err := s.db.Exec(` + DELETE FROM memories + WHERE repo_key = ? 
AND pinned = 0 AND importance <= 2 AND id IN ( + SELECT id FROM memories + WHERE repo_key = ? AND pinned = 0 AND importance <= 2 + ORDER BY created_at ASC, id ASC + LIMIT ? + )`, repoKey, repoKey, excess) + if err != nil { + return err + } + dropped, _ := res.RowsAffected() + remaining := excess - int(dropped) + if remaining <= 0 { + return nil + } + + // Pass 2: oldest non-pinned rows regardless of importance. + _, err = s.db.Exec(` + DELETE FROM memories + WHERE repo_key = ? AND pinned = 0 AND id IN ( + SELECT id FROM memories + WHERE repo_key = ? AND pinned = 0 + ORDER BY created_at ASC, id ASC + LIMIT ? + )`, repoKey, repoKey, remaining) + return err +} + +// =========================================================================== +// Scopes (global — no repo_key) +// =========================================================================== + +// ScopeRow mirrors the SavedScope shape without importing the mcp +// package. The mcp scopeStore converts between this and SavedScope. +type ScopeRow struct { + Name string + Description string + Repos []string + Paths []string +} + +// LoadScopes reads every saved scope, name-sorted. +func (s *SidecarStore) LoadScopes() ([]ScopeRow, error) { + rows, err := s.db.Query(`SELECT name, description, repos, paths FROM scopes ORDER BY name ASC`) + if err != nil { + return nil, fmt.Errorf("persistence: query scopes: %w", err) + } + defer func() { _ = rows.Close() }() + + var out []ScopeRow + for rows.Next() { + var ( + r ScopeRow + repos, paths string + ) + if err := rows.Scan(&r.Name, &r.Description, &repos, &paths); err != nil { + return out, fmt.Errorf("persistence: scan scope: %w", err) + } + r.Repos = decodeStrings(repos) + r.Paths = decodeStrings(paths) + out = append(out, r) + } + return out, rows.Err() +} + +// UpsertScope writes (or replaces) a single scope row. 
+func (s *SidecarStore) UpsertScope(r ScopeRow) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + _, err := s.db.Exec(` + INSERT OR REPLACE INTO scopes (name, description, repos, paths) + VALUES (?,?,?,?)`, + r.Name, r.Description, encodeStrings(r.Repos), encodeStrings(r.Paths)) + if err != nil { + return fmt.Errorf("persistence: upsert scope: %w", err) + } + return nil +} + +// DeleteScope removes a scope by name. Missing rows are not errors. +func (s *SidecarStore) DeleteScope(name string) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + _, err := s.db.Exec(`DELETE FROM scopes WHERE name = ?`, name) + return err +} + +// ScopeCount returns the number of saved scopes — used to guard the +// legacy scopes.json import. +func (s *SidecarStore) ScopeCount() int { + var n int + if err := s.db.QueryRow(`SELECT COUNT(1) FROM scopes`).Scan(&n); err != nil { + return 0 + } + return n +} + +// =========================================================================== +// Notebooks +// =========================================================================== + +// NotebookRow is the persisted notebook shape. SymbolIDs is carried +// for forward-compatibility (the markdown layout never had it, but the +// schema reserves the column); the mcp notebookEntry maps onto this. +type NotebookRow struct { + ID string + Title string + Body string + Tags []string + SymbolIDs []string + UsedCount uint64 + LastUsed time.Time + Created time.Time + Updated time.Time +} + +// LoadNotebookRows reads every notebook entry for a repo_key, +// newest-first by Updated. +func (s *SidecarStore) LoadNotebookRows(repoKey string) ([]NotebookRow, error) { + rows, err := s.db.Query(` + SELECT id, title, body, tags, symbol_ids, used_count, last_used, + created_at, updated_at + FROM notebooks WHERE repo_key = ? 
+ ORDER BY updated_at DESC, id ASC`, repoKey) + if err != nil { + return nil, fmt.Errorf("persistence: query notebooks: %w", err) + } + defer func() { _ = rows.Close() }() + + var out []NotebookRow + for rows.Next() { + var ( + r NotebookRow + tags, syms string + usedCount int64 + lastUsed, created, updated int64 + ) + if err := rows.Scan(&r.ID, &r.Title, &r.Body, &tags, &syms, &usedCount, + &lastUsed, &created, &updated); err != nil { + return out, fmt.Errorf("persistence: scan notebook: %w", err) + } + r.Tags = decodeStrings(tags) + r.SymbolIDs = decodeStrings(syms) + r.UsedCount = uint64(usedCount) + r.LastUsed = fromUnix(lastUsed) + r.Created = fromUnix(created) + r.Updated = fromUnix(updated) + out = append(out, r) + } + return out, rows.Err() +} + +// GetNotebookRow reads a single notebook entry by id, or (zero, false). +func (s *SidecarStore) GetNotebookRow(repoKey, id string) (NotebookRow, bool) { + row := s.db.QueryRow(` + SELECT id, title, body, tags, symbol_ids, used_count, last_used, + created_at, updated_at + FROM notebooks WHERE repo_key = ? AND id = ?`, repoKey, id) + var ( + r NotebookRow + tags, syms string + usedCount int64 + lastUsed, created, updated int64 + ) + if err := row.Scan(&r.ID, &r.Title, &r.Body, &tags, &syms, &usedCount, + &lastUsed, &created, &updated); err != nil { + return NotebookRow{}, false + } + r.Tags = decodeStrings(tags) + r.SymbolIDs = decodeStrings(syms) + r.UsedCount = uint64(usedCount) + r.LastUsed = fromUnix(lastUsed) + r.Created = fromUnix(created) + r.Updated = fromUnix(updated) + return r, true +} + +// UpsertNotebook writes (or replaces) a single notebook row. 
+func (s *SidecarStore) UpsertNotebook(repoKey string, r NotebookRow) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + _, err := s.db.Exec(` + INSERT OR REPLACE INTO notebooks + (id, repo_key, title, body, tags, symbol_ids, used_count, last_used, + created_at, updated_at) + VALUES (?,?,?,?,?,?,?,?,?,?)`, + r.ID, repoKey, r.Title, r.Body, encodeStrings(r.Tags), encodeStrings(r.SymbolIDs), + int64(r.UsedCount), unixOrZero(r.LastUsed), unixOrZero(r.Created), unixOrZero(r.Updated)) + if err != nil { + return fmt.Errorf("persistence: upsert notebook: %w", err) + } + return nil +} + +// DeleteNotebook removes a notebook entry. Missing rows are not errors. +func (s *SidecarStore) DeleteNotebook(repoKey, id string) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + _, err := s.db.Exec(`DELETE FROM notebooks WHERE repo_key = ? AND id = ?`, repoKey, id) + return err +} + +// NotebookCutoff deletes notebook rows whose effective freshness stamp +// (LastUsed, falling back to Updated when never used) is older than +// cutoff. Mirrors the markdown TTL pruner as a bounded DELETE. Returns +// the deleted ids so the caller can mirror the prune elsewhere if +// needed. +func (s *SidecarStore) NotebookPrune(repoKey string, cutoff time.Time) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + c := unixOrZero(cutoff) + if c == 0 { + return nil + } + // effective = last_used when non-zero, else created/updated. + _, err := s.db.Exec(` + DELETE FROM notebooks + WHERE repo_key = ? + AND (CASE WHEN last_used > 0 THEN last_used ELSE updated_at END) < ?`, repoKey, c) + return err +} + +// =========================================================================== +// Legacy migration importers +// =========================================================================== + +// MigrateLegacyNotes imports a legacy notes.gob.gz for repoKey when the +// sqlite table is empty for that scope, then renames the legacy file to +// *.bak. 
Idempotent: guarded by a migration mark and an empty-table +// check. legacyDir is the gob.gz directory (NotesDir result). +func (s *SidecarStore) MigrateLegacyNotes(repoKey, legacyDir string) error { + if legacyDir == "" || s.migrationDone(repoKey, "notes") || s.countRows("notes", repoKey) > 0 { + return nil + } + loaded, err := LoadNotes(legacyDir) + if err != nil || loaded == nil || len(loaded.Entries) == 0 { + s.markMigrated(repoKey, "notes") + return nil + } + for _, e := range loaded.Entries { + if e.Timestamp.IsZero() { + e.Timestamp = time.Now().UTC() + } + if e.UpdatedAt.IsZero() { + e.UpdatedAt = e.Timestamp + } + if err := s.UpsertNote(repoKey, e); err != nil { + return err + } + } + s.markMigrated(repoKey, "notes") + renameLegacy(filepath.Join(legacyDir, notesFile)) + return nil +} + +// MigrateLegacyMemories imports a legacy memories.gob.gz for repoKey. +func (s *SidecarStore) MigrateLegacyMemories(repoKey, legacyDir string) error { + if legacyDir == "" || s.migrationDone(repoKey, "memories") || s.countRows("memories", repoKey) > 0 { + return nil + } + loaded, err := LoadMemories(legacyDir) + if err != nil || loaded == nil || len(loaded.Entries) == 0 { + s.markMigrated(repoKey, "memories") + return nil + } + for _, e := range loaded.Entries { + if e.Timestamp.IsZero() { + e.Timestamp = time.Now().UTC() + } + if e.UpdatedAt.IsZero() { + e.UpdatedAt = e.Timestamp + } + if err := s.UpsertMemory(repoKey, e); err != nil { + return err + } + } + s.markMigrated(repoKey, "memories") + renameLegacy(filepath.Join(legacyDir, memoriesFile)) + return nil +} + +// MigrateLegacyScopes imports a legacy scopes.json when the scopes +// table is empty, then renames the file to *.bak. Idempotent. 
+func (s *SidecarStore) MigrateLegacyScopes(legacyPath string) error { + if legacyPath == "" || s.migrationDone("global", "scopes") || s.ScopeCount() > 0 { + return nil + } + data, err := os.ReadFile(legacyPath) + if err != nil { + s.markMigrated("global", "scopes") + return nil + } + type legacyScope struct { + Name string `json:"name"` + Description string `json:"description"` + Repos []string `json:"repos"` + Paths []string `json:"paths"` + } + var legacy []legacyScope + if json.Unmarshal(data, &legacy) != nil { + s.markMigrated("global", "scopes") + return nil + } + for _, sc := range legacy { + if sc.Name == "" { + continue + } + if err := s.UpsertScope(ScopeRow(sc)); err != nil { + return err + } + } + s.markMigrated("global", "scopes") + renameLegacy(legacyPath) + return nil +} + +// MigrateLegacyNotebook imports markdown notebook files under +// legacyDir/.md into the sqlite notebooks table for repoKey, then +// renames each imported file to .md.bak. importMD parses one file's +// contents into a NotebookRow. Idempotent. 
+func (s *SidecarStore) MigrateLegacyNotebook(repoKey, legacyDir string, importMD func(id, contents string) (NotebookRow, bool)) error { + if legacyDir == "" || importMD == nil || s.migrationDone(repoKey, "notebook") || s.countRows("notebooks", repoKey) > 0 { + return nil + } + entries, err := os.ReadDir(legacyDir) + if err != nil { + s.markMigrated(repoKey, "notebook") + return nil + } + imported := make([]string, 0, len(entries)) + for _, de := range entries { + name := de.Name() + if de.IsDir() || filepath.Ext(name) != ".md" { + continue + } + full := filepath.Join(legacyDir, name) + contents, rerr := os.ReadFile(full) + if rerr != nil { + continue + } + id := name[:len(name)-len(".md")] + row, ok := importMD(id, string(contents)) + if !ok { + continue + } + row.ID = id + if err := s.UpsertNotebook(repoKey, row); err != nil { + return err + } + imported = append(imported, full) + } + s.markMigrated(repoKey, "notebook") + sort.Strings(imported) + for _, full := range imported { + renameLegacy(full) + } + return nil +} + +// renameLegacy renames a legacy file to .bak. Best-effort — +// never deletes; a missing file or rename failure is silently +// ignored so a migration that already moved the file stays idempotent. 
+func renameLegacy(path string) { + if path == "" { + return + } + if _, err := os.Stat(path); err != nil { + return + } + _ = os.Rename(path, path+".bak") +} diff --git a/internal/persistence/sidecar_sqlite_test.go b/internal/persistence/sidecar_sqlite_test.go new file mode 100644 index 00000000..dcb9f70f --- /dev/null +++ b/internal/persistence/sidecar_sqlite_test.go @@ -0,0 +1,379 @@ +package persistence + +import ( + "os" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func openTempSidecar(t *testing.T) *SidecarStore { + t.Helper() + path := filepath.Join(t.TempDir(), "sidecar.sqlite") + st, err := OpenSidecar(path) + require.NoError(t, err) + require.NotNil(t, st) + t.Cleanup(func() { _ = st.Close() }) + return st +} + +func TestSidecar_OpenEmptyPathIsNoOp(t *testing.T) { + st, err := OpenSidecar("") + require.NoError(t, err) + require.Nil(t, st) +} + +func TestSidecar_SameAbsPathReusesHandle(t *testing.T) { + path := filepath.Join(t.TempDir(), "sidecar.sqlite") + a, err := OpenSidecar(path) + require.NoError(t, err) + b, err := OpenSidecar(path) + require.NoError(t, err) + require.Same(t, a, b, "same absolute path must return the cached handle") + t.Cleanup(func() { _ = a.Close() }) +} + +func TestSidecar_NotesRoundTrip(t *testing.T) { + st := openTempSidecar(t) + now := time.Now().UTC().Truncate(time.Nanosecond) + in := NoteEntry{ + ID: "nt-1", + Timestamp: now, + UpdatedAt: now, + SessionID: "sess-1", + ClientName: "claude-code", + Body: "decision: switch to fastpath", + SymbolID: "pkg/foo.go::Bar", + FilePath: "pkg/foo.go", + RepoPrefix: "core", + WorkspaceID: "ws-a", + ProjectID: "proj-a", + Tags: []string{"decision", "perf"}, + AutoLinks: []string{"pkg/foo.go::Bar"}, + Pinned: true, + } + require.NoError(t, st.UpsertNote("rk", in)) + + rows, err := st.LoadNotesRows("rk") + require.NoError(t, err) + require.Len(t, rows, 1) + got := rows[0] + assert.Equal(t, in.ID, got.ID) + 
assert.Equal(t, in.SessionID, got.SessionID) + assert.Equal(t, in.ClientName, got.ClientName) + assert.Equal(t, in.Body, got.Body) + assert.Equal(t, in.SymbolID, got.SymbolID) + assert.Equal(t, in.FilePath, got.FilePath) + assert.Equal(t, in.WorkspaceID, got.WorkspaceID) + assert.Equal(t, in.Tags, got.Tags) + assert.Equal(t, in.AutoLinks, got.AutoLinks) + assert.True(t, got.Pinned) + assert.WithinDuration(t, in.UpdatedAt, got.UpdatedAt, time.Microsecond) + + // Scope isolation: another repo_key sees nothing. + other, err := st.LoadNotesRows("other") + require.NoError(t, err) + require.Empty(t, other) + + // Delete. + require.NoError(t, st.DeleteNote("rk", "nt-1")) + rows, err = st.LoadNotesRows("rk") + require.NoError(t, err) + require.Empty(t, rows) +} + +func TestSidecar_NotesTrimKeepsPinnedAndNewest(t *testing.T) { + st := openTempSidecar(t) + base := time.Now().UTC() + for i := 0; i < 10; i++ { + require.NoError(t, st.UpsertNote("rk", NoteEntry{ + ID: noteID(i), + Timestamp: base.Add(time.Duration(i) * time.Second), + UpdatedAt: base.Add(time.Duration(i) * time.Second), + Pinned: i == 0 || i == 5, + })) + } + require.NoError(t, st.TrimNotes("rk", 6)) + + rows, err := st.LoadNotesRows("rk") + require.NoError(t, err) + require.Len(t, rows, 6) + ids := map[string]bool{} + for _, r := range rows { + ids[r.ID] = true + } + assert.True(t, ids[noteID(0)], "pinned[0] survives") + assert.True(t, ids[noteID(5)], "pinned[5] survives") + assert.True(t, ids[noteID(9)], "newest survives") + assert.False(t, ids[noteID(1)], "oldest non-pinned dropped") +} + +func TestSidecar_MemoriesRoundTrip(t *testing.T) { + st := openTempSidecar(t) + now := time.Now().UTC() + in := MemoryEntry{ + ID: "mem-1", + Timestamp: now, + UpdatedAt: now, + LastAccessed: now, + AccessCount: 7, + Body: "lock invariant for Bar", + Title: "Bar lock invariant", + Kind: "invariant", + Source: "manual", + Confidence: 0.8, + Importance: 5, + AuthorAgent: "claude-code", + SymbolIDs: 
[]string{"pkg/foo.go::Bar"}, + FilePaths: []string{"pkg/foo.go"}, + AutoLinks: []string{"pkg/foo.go::Baz"}, + Tags: []string{"invariant", "lock"}, + WorkspaceID: "ws-a", + ProjectID: "proj-a", + RepoPrefix: "core", + Pinned: true, + SupersededBy: "mem-2", + } + require.NoError(t, st.UpsertMemory("rk", in)) + + rows, err := st.LoadMemoriesRows("rk") + require.NoError(t, err) + require.Len(t, rows, 1) + got := rows[0] + assert.Equal(t, in.ID, got.ID) + assert.Equal(t, in.Title, got.Title) + assert.Equal(t, in.Kind, got.Kind) + assert.Equal(t, in.Source, got.Source) + assert.InDelta(t, in.Confidence, got.Confidence, 1e-6) + assert.Equal(t, in.Importance, got.Importance) + assert.Equal(t, in.AuthorAgent, got.AuthorAgent) + assert.Equal(t, in.SymbolIDs, got.SymbolIDs) + assert.Equal(t, in.FilePaths, got.FilePaths) + assert.Equal(t, in.AutoLinks, got.AutoLinks) + assert.Equal(t, in.Tags, got.Tags) + assert.Equal(t, uint64(7), got.AccessCount) + assert.Equal(t, "mem-2", got.SupersededBy) + assert.True(t, got.Pinned) + + require.NoError(t, st.DeleteMemory("rk", "mem-1")) + rows, err = st.LoadMemoriesRows("rk") + require.NoError(t, err) + require.Empty(t, rows) +} + +func TestSidecar_MemoriesTrimTwoPass(t *testing.T) { + st := openTempSidecar(t) + base := time.Now().UTC() + for i := 0; i < 10; i++ { + e := MemoryEntry{ + ID: memID(i), + Timestamp: base.Add(time.Duration(i) * time.Second), + UpdatedAt: base.Add(time.Duration(i) * time.Second), + Importance: 4, + } + if i == 2 || i == 4 { + e.Importance = 1 + } + if i == 7 { + e.Pinned = true + e.Importance = 1 + } + require.NoError(t, st.UpsertMemory("rk", e)) + } + require.NoError(t, st.TrimMemories("rk", 6)) + + rows, err := st.LoadMemoriesRows("rk") + require.NoError(t, err) + require.Len(t, rows, 6) + ids := map[string]bool{} + for _, r := range rows { + ids[r.ID] = true + } + assert.True(t, ids[memID(7)], "pinned low-imp survives") + assert.False(t, ids[memID(2)], "low-imp dropped") + assert.False(t, ids[memID(4)], 
"low-imp dropped") +} + +func TestSidecar_ScopesRoundTrip(t *testing.T) { + st := openTempSidecar(t) + require.NoError(t, st.UpsertScope(ScopeRow{ + Name: "backend", Description: "be", Repos: []string{"api", "core"}, Paths: []string{"services/x"}, + })) + require.NoError(t, st.UpsertScope(ScopeRow{Name: "frontend", Repos: []string{"web"}})) + + rows, err := st.LoadScopes() + require.NoError(t, err) + require.Len(t, rows, 2) + assert.Equal(t, "backend", rows[0].Name) + assert.Equal(t, []string{"api", "core"}, rows[0].Repos) + assert.Equal(t, []string{"services/x"}, rows[0].Paths) + assert.Equal(t, 2, st.ScopeCount()) + + require.NoError(t, st.DeleteScope("backend")) + rows, err = st.LoadScopes() + require.NoError(t, err) + require.Len(t, rows, 1) + assert.Equal(t, "frontend", rows[0].Name) +} + +func TestSidecar_NotebookRoundTrip(t *testing.T) { + st := openTempSidecar(t) + now := time.Now().UTC() + in := NotebookRow{ + ID: "nb-1", + Title: "design: sidecar", + Body: "use sqlite\nfor durability", + Tags: []string{"design", "storage"}, + SymbolIDs: []string{"pkg/p.go::Q"}, + UsedCount: 3, + LastUsed: now, + Created: now, + Updated: now, + } + require.NoError(t, st.UpsertNotebook("rk", in)) + + got, ok := st.GetNotebookRow("rk", "nb-1") + require.True(t, ok) + assert.Equal(t, in.Title, got.Title) + assert.Equal(t, in.Body, got.Body) + assert.Equal(t, in.Tags, got.Tags) + assert.Equal(t, in.SymbolIDs, got.SymbolIDs) + assert.Equal(t, uint64(3), got.UsedCount) + + rows, err := st.LoadNotebookRows("rk") + require.NoError(t, err) + require.Len(t, rows, 1) + + require.NoError(t, st.DeleteNotebook("rk", "nb-1")) + _, ok = st.GetNotebookRow("rk", "nb-1") + require.False(t, ok) +} + +func TestSidecar_NotebookPrune(t *testing.T) { + st := openTempSidecar(t) + old := time.Now().UTC().Add(-2 * time.Hour) + fresh := time.Now().UTC() + require.NoError(t, st.UpsertNotebook("rk", NotebookRow{ID: "stale", Updated: old})) + require.NoError(t, st.UpsertNotebook("rk", NotebookRow{ID: 
"fresh", Updated: fresh, LastUsed: fresh})) + + require.NoError(t, st.NotebookPrune("rk", time.Now().UTC().Add(-time.Hour))) + rows, err := st.LoadNotebookRows("rk") + require.NoError(t, err) + require.Len(t, rows, 1) + assert.Equal(t, "fresh", rows[0].ID) +} + +// --------------------------------------------------------------------------- +// Migration: legacy gob.gz / json → sqlite. +// --------------------------------------------------------------------------- + +func TestSidecar_MigrateLegacyNotes(t *testing.T) { + legacyDir := t.TempDir() + require.NoError(t, SaveNotes(legacyDir, &NoteStore{Entries: []NoteEntry{ + {ID: "nt-old", Body: "legacy note", SessionID: "s1", Pinned: true}, + }})) + + st := openTempSidecar(t) + require.NoError(t, st.MigrateLegacyNotes("rk", legacyDir)) + + rows, err := st.LoadNotesRows("rk") + require.NoError(t, err) + require.Len(t, rows, 1) + assert.Equal(t, "nt-old", rows[0].ID) + assert.Equal(t, "legacy note", rows[0].Body) + assert.True(t, rows[0].Pinned) + + // Legacy file renamed to .bak. + _, errOrig := os.Stat(filepath.Join(legacyDir, notesFile)) + assert.Error(t, errOrig, "original gob.gz must be renamed away") + _, errBak := os.Stat(filepath.Join(legacyDir, notesFile+".bak")) + assert.NoError(t, errBak, ".bak must exist") + + // Idempotent: a second migrate is a no-op (no duplicate rows). 
+ require.NoError(t, st.MigrateLegacyNotes("rk", legacyDir)) + rows, err = st.LoadNotesRows("rk") + require.NoError(t, err) + require.Len(t, rows, 1) +} + +func TestSidecar_MigrateLegacyMemories(t *testing.T) { + legacyDir := t.TempDir() + require.NoError(t, SaveMemories(legacyDir, &MemoryStore{Entries: []MemoryEntry{ + {ID: "mem-old", Body: "legacy memory", Kind: "invariant", Importance: 5}, + }})) + + st := openTempSidecar(t) + require.NoError(t, st.MigrateLegacyMemories("rk", legacyDir)) + + rows, err := st.LoadMemoriesRows("rk") + require.NoError(t, err) + require.Len(t, rows, 1) + assert.Equal(t, "mem-old", rows[0].ID) + assert.Equal(t, "invariant", rows[0].Kind) + + _, errBak := os.Stat(filepath.Join(legacyDir, memoriesFile+".bak")) + assert.NoError(t, errBak, ".bak must exist") +} + +func TestSidecar_MigrateLegacyScopes(t *testing.T) { + legacyPath := filepath.Join(t.TempDir(), "scopes.json") + require.NoError(t, os.WriteFile(legacyPath, []byte(`[{"name":"be","description":"backend","repos":["api"],"paths":["svc/x"]}]`), 0o644)) + + st := openTempSidecar(t) + require.NoError(t, st.MigrateLegacyScopes(legacyPath)) + + rows, err := st.LoadScopes() + require.NoError(t, err) + require.Len(t, rows, 1) + assert.Equal(t, "be", rows[0].Name) + assert.Equal(t, []string{"api"}, rows[0].Repos) + assert.Equal(t, []string{"svc/x"}, rows[0].Paths) + + _, errBak := os.Stat(legacyPath + ".bak") + assert.NoError(t, errBak) + + // Idempotent. + require.NoError(t, st.MigrateLegacyScopes(legacyPath)) + assert.Equal(t, 1, st.ScopeCount()) +} + +func TestSidecar_MigrateLegacyNotebook(t *testing.T) { + legacyDir := t.TempDir() + md := "---\ntitle: old entry\ntags: [a, b]\nused_count: 4\n---\n\nbody text\n" + require.NoError(t, os.WriteFile(filepath.Join(legacyDir, "nbold.md"), []byte(md), 0o644)) + + st := openTempSidecar(t) + importMD := func(id, contents string) (NotebookRow, bool) { + // Minimal frontmatter parse for the test importer. 
+ return NotebookRow{ID: id, Title: "old entry", Body: "body text\n", Tags: []string{"a", "b"}, UsedCount: 4}, true + } + require.NoError(t, st.MigrateLegacyNotebook("rk", legacyDir, importMD)) + + rows, err := st.LoadNotebookRows("rk") + require.NoError(t, err) + require.Len(t, rows, 1) + assert.Equal(t, "nbold", rows[0].ID) + assert.Equal(t, "old entry", rows[0].Title) + assert.Equal(t, uint64(4), rows[0].UsedCount) + + _, errBak := os.Stat(filepath.Join(legacyDir, "nbold.md.bak")) + assert.NoError(t, errBak) +} + +func TestSidecar_MigrateSkippedWhenTableNonEmpty(t *testing.T) { + legacyDir := t.TempDir() + require.NoError(t, SaveNotes(legacyDir, &NoteStore{Entries: []NoteEntry{{ID: "nt-old", Body: "legacy"}}})) + + st := openTempSidecar(t) + // Pre-seed the table so the import is skipped (guard on existing rows). + require.NoError(t, st.UpsertNote("rk", NoteEntry{ID: "nt-existing", Body: "already here"})) + require.NoError(t, st.MigrateLegacyNotes("rk", legacyDir)) + + rows, err := st.LoadNotesRows("rk") + require.NoError(t, err) + require.Len(t, rows, 1, "import must be skipped when the table already has rows") + assert.Equal(t, "nt-existing", rows[0].ID) +} diff --git a/internal/platform/migrate.go b/internal/platform/migrate.go new file mode 100644 index 00000000..3b155271 --- /dev/null +++ b/internal/platform/migrate.go @@ -0,0 +1,99 @@ +package platform + +import ( + "os" + "path/filepath" + "strings" +) + +// MigrateToUnifiedHome relocates per-user state written by older Gortex +// versions (which split files across ~/.config/gortex, ~/.cache/gortex, +// and a flat ~/.gortex) into the unified ~/.gortex tree this package now +// resolves. It is best-effort and idempotent: a destination that already +// exists is never overwritten, and individual move failures are reported +// to logf but never abort the caller. 
+// +// It is a no-op when any XDG_*_HOME variable is set to an absolute path — +// that signals the user opted into the XDG layout, so there is nothing to +// unify. logf may be nil. Because every step short-circuits once its +// destination exists, the function is cheap to call on every startup; it +// only logs (and only does work) on the first run after the upgrade. +func MigrateToUnifiedHome(logf func(format string, args ...any)) { + if logf == nil { + logf = func(string, ...any) {} + } + // Respect an explicit XDG opt-in: relocate nothing. + for _, v := range []string{"XDG_CONFIG_HOME", "XDG_DATA_HOME", "XDG_CACHE_HOME"} { + if val := os.Getenv(v); val != "" && filepath.IsAbs(val) { + return + } + } + home, err := os.UserHomeDir() + if err != nil || home == "" { + return + } + root := filepath.Join(home, homeDir) // ~/.gortex + + // 1. Global config moves out of ~/.config/gortex into the root. + migrateInto(logf, filepath.Join(home, ".config", gortexDir, "config.yaml"), filepath.Join(root, "config.yaml")) + migrateInto(logf, filepath.Join(home, ".config", gortexDir, "servers.toml"), filepath.Join(root, "servers.toml")) + + // 2. The old ~/.cache/gortex tree folds into ~/.gortex/cache, except + // downloaded models (durable data, kept out of cache so a cache + // wipe doesn't discard them) and the stale daemon socket / pid + // (regenerated on the next start). + oldCache := filepath.Join(home, ".cache", gortexDir) + if entries, err := os.ReadDir(oldCache); err == nil { + for _, e := range entries { + switch e.Name() { + case "daemon.sock", "daemon.pid": + continue + case "models": + migrateInto(logf, filepath.Join(oldCache, "models"), filepath.Join(root, "models")) + default: + migrateInto(logf, filepath.Join(oldCache, e.Name()), filepath.Join(root, cacheSub, e.Name())) + } + } + } + + // 3. In-place reorg of the ~/.gortex root: the backend store (and its + // WAL/shm sidecars) move under store/, and the old memories-cache + // directory becomes memories/. 
+ if entries, err := os.ReadDir(root); err == nil { + for _, e := range entries { + name := e.Name() + if e.IsDir() { + continue + } + if strings.HasSuffix(name, ".store") || strings.HasPrefix(name, "store.sqlite") { + migrateInto(logf, filepath.Join(root, name), filepath.Join(root, "store", name)) + } + } + } + migrateInto(logf, filepath.Join(root, "memories-cache"), filepath.Join(root, "memories")) +} + +// migrateInto moves src to dst when src exists and dst does not. The move +// is a rename (atomic within a filesystem); a cross-device failure is +// logged and the source left in place rather than risking a partial copy +// of a live store. Idempotent: a pre-existing dst short-circuits. +func migrateInto(logf func(string, ...any), src, dst string) { + if src == dst { + return + } + if _, err := os.Lstat(src); err != nil { + return // nothing to migrate + } + if _, err := os.Lstat(dst); err == nil { + return // already present — never clobber + } + if err := os.MkdirAll(filepath.Dir(dst), 0o755); err != nil { + logf("gortex: migrate %s: mkdir parent failed: %v", dst, err) + return + } + if err := os.Rename(src, dst); err != nil { + logf("gortex: could not migrate %s -> %s (move it manually): %v", src, dst, err) + return + } + logf("gortex: migrated %s -> %s", src, dst) +} diff --git a/internal/platform/migrate_test.go b/internal/platform/migrate_test.go new file mode 100644 index 00000000..baaf42ef --- /dev/null +++ b/internal/platform/migrate_test.go @@ -0,0 +1,91 @@ +package platform + +import ( + "os" + "path/filepath" + "testing" +) + +func seed(t *testing.T, path, content string) { + t.Helper() + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatal(err) + } +} + +// TestMigrateToUnifiedHome verifies the old split layout folds into the +// unified ~/.gortex tree, the stale socket is left behind, and a second +// run is a no-op that doesn't clobber. 
+func TestMigrateToUnifiedHome(t *testing.T) { + clearXDG(t) + home := t.TempDir() + t.Setenv("HOME", home) + + seed(t, filepath.Join(home, ".config", "gortex", "config.yaml"), "cfg") + seed(t, filepath.Join(home, ".cache", "gortex", "daemon-sqlite.gob.gz"), "snap") + seed(t, filepath.Join(home, ".cache", "gortex", "models", "gte-small", "model.onnx"), "model") + seed(t, filepath.Join(home, ".cache", "gortex", "daemon.sock"), "sock") // ephemeral — skipped + seed(t, filepath.Join(home, ".gortex", "store.sqlite"), "db") + seed(t, filepath.Join(home, ".gortex", "store.sqlite-wal"), "wal") + seed(t, filepath.Join(home, ".gortex", "memories-cache", "global", "x.json"), "mem") + + MigrateToUnifiedHome(nil) + + want := map[string]string{ + filepath.Join(home, ".gortex", "config.yaml"): "cfg", + filepath.Join(home, ".gortex", "cache", "daemon-sqlite.gob.gz"): "snap", + filepath.Join(home, ".gortex", "models", "gte-small", "model.onnx"): "model", + filepath.Join(home, ".gortex", "store", "store.sqlite"): "db", + filepath.Join(home, ".gortex", "store", "store.sqlite-wal"): "wal", + filepath.Join(home, ".gortex", "memories", "global", "x.json"): "mem", + } + for p, w := range want { + got, err := os.ReadFile(p) + if err != nil { + t.Errorf("expected migrated file %s: %v", p, err) + continue + } + if string(got) != w { + t.Errorf("%s = %q, want %q", p, got, w) + } + } + + // The stale socket must NOT be carried into the unified cache. + if _, err := os.Lstat(filepath.Join(home, ".gortex", "cache", "daemon.sock")); err == nil { + t.Errorf("daemon.sock should have been skipped, not migrated") + } + // The old flat store file must have moved (not left behind). + if _, err := os.Lstat(filepath.Join(home, ".gortex", "store.sqlite")); err == nil { + t.Errorf("old flat store.sqlite should have moved under store/") + } + + // Idempotent: a second run neither errors nor clobbers. 
+ MigrateToUnifiedHome(nil) + if got, _ := os.ReadFile(filepath.Join(home, ".gortex", "config.yaml")); string(got) != "cfg" { + t.Errorf("config.yaml clobbered on second migration run") + } +} + +// TestMigrateToUnifiedHome_SkipsUnderXDG verifies an explicit XDG opt-in +// makes migration a no-op — the user chose the XDG layout. +func TestMigrateToUnifiedHome_SkipsUnderXDG(t *testing.T) { + clearXDG(t) + home := t.TempDir() + t.Setenv("HOME", home) + t.Setenv("XDG_CONFIG_HOME", t.TempDir()) + + old := filepath.Join(home, ".config", "gortex", "config.yaml") + seed(t, old, "cfg") + + MigrateToUnifiedHome(nil) + + if _, err := os.Lstat(filepath.Join(home, ".gortex", "config.yaml")); err == nil { + t.Errorf("migration must be a no-op when an XDG override is set") + } + if _, err := os.Lstat(old); err != nil { + t.Errorf("original config must be untouched under XDG: %v", err) + } +} diff --git a/internal/platform/xdg.go b/internal/platform/xdg.go index 4a3ea8f1..d84b73e8 100644 --- a/internal/platform/xdg.go +++ b/internal/platform/xdg.go @@ -5,151 +5,88 @@ import ( "path/filepath" ) -// gortexDir is the application sub-directory Gortex owns inside any of -// the XDG base directories. Every config / data / cache path Gortex -// writes lives under "/gortex/...". -const gortexDir = "gortex" - -// xdgBase resolves one XDG base directory. When the named XDG_*_HOME -// environment variable is set AND holds an absolute path, that value -// wins on every platform — this is the "consistent" behaviour: an -// explicit XDG override is always honoured, Linux / macOS / Windows -// alike. +// Gortex keeps all per-user state under one directory tree. By default +// that tree is $HOME/.gortex, holding config, cache, the on-disk store, +// downloaded models, and development memories side by side — a single +// place to find, back up, or delete. 
// -// A non-absolute XDG_*_HOME value is ignored, exactly as the XDG Base -// Directory specification mandates ("If [the variable] is set to a -// relative path the value MUST be ignored"). When the variable is -// unset, empty, or relative, the function falls back to -// filepath.Join($HOME, fallbackRel) — the historical Gortex default, -// preserved verbatim so existing installs keep resolving to the same -// location. The optional homeFallback is used only when $HOME itself -// cannot be resolved. -func xdgBase(envVar, fallbackRel, homeFallback string) string { - if v := os.Getenv(envVar); v != "" && filepath.IsAbs(v) { - return v - } +// The XDG Base Directory variables remain an explicit escape hatch: +// when XDG_CONFIG_HOME / XDG_DATA_HOME / XDG_CACHE_HOME is set to an +// absolute path it wins, and that category's files live under +// "/gortex" (standard XDG layout) instead of inside the +// unified ~/.gortex tree. This keeps XDG-strict setups, sandboxes, and +// the test suite working while giving everyone else one folder. + +const ( + // gortexDir is the application sub-directory Gortex owns inside an + // XDG base directory when an XDG_*_HOME override is in effect. + gortexDir = "gortex" + // homeDir is the unified per-user directory ($HOME/.gortex) used + // when no XDG override applies. + homeDir = ".gortex" + // cacheSub disambiguates cache from config/data inside the unified + // ~/.gortex tree. Under an XDG_CACHE_HOME override the base is + // already cache-specific, so this sub-path is not added there. + cacheSub = "cache" +) + +// Home returns the unified per-user Gortex directory ($HOME/.gortex), +// falling back to a temp-dir equivalent when $HOME can't be resolved. +// This is the root the cache / store / models / memories sub-paths hang +// off when no XDG override is in play. 
+func Home() string { home, err := os.UserHomeDir() if err != nil || home == "" { - return homeFallback + return filepath.Join(os.TempDir(), homeDir) } - return filepath.Join(home, fallbackRel) -} - -// ConfigHome returns the XDG config base directory: $XDG_CONFIG_HOME -// when set to an absolute path, otherwise $HOME/.config. This is the -// base, not Gortex-scoped — use ConfigDir for the Gortex sub-directory. -func ConfigHome() string { - return xdgBase("XDG_CONFIG_HOME", ".config", os.TempDir()) + return filepath.Join(home, homeDir) } -// DataHome returns the XDG data base directory: $XDG_DATA_HOME when set -// to an absolute path, otherwise $HOME/.local/share. -func DataHome() string { - return xdgBase("XDG_DATA_HOME", filepath.Join(".local", "share"), os.TempDir()) -} - -// CacheHome returns the XDG cache base directory: $XDG_CACHE_HOME when -// set to an absolute path, otherwise $HOME/.cache. +// unifiedDir resolves a Gortex base for one XDG category. An absolute +// $envVar wins ("/gortex" — the standard XDG location), so +// XDG-strict setups, sandboxes, and the test suite keep working. +// Otherwise the category collapses into the unified ~/.gortex tree, +// with homeSub distinguishing cache ("cache") from config/data (""). // -// Note this deliberately falls back to $HOME/.cache on every platform -// — that is Gortex's historical default and what most subsystems -// (the snapshot store, token cache, daemon state on Unix, …) have -// always used. Subsystems that historically rooted their cache at -// os.UserCacheDir() (which differs from $HOME/.cache on macOS and -// Windows) must call OSCacheHome instead, so their unset-env fallback -// stays byte-identical and existing data is not orphaned. -func CacheHome() string { - return xdgBase("XDG_CACHE_HOME", ".cache", os.TempDir()) -} - -// OSCacheHome returns the cache base directory for subsystems whose -// historical default was os.UserCacheDir() rather than $HOME/.cache. 
-// -// $XDG_CACHE_HOME still wins when set to an absolute path — that is the -// consistency the resolver guarantees, and on Linux os.UserCacheDir() -// already consults XDG_CACHE_HOME anyway. When the variable is unset -// the function falls back to os.UserCacheDir() so the resolved path is -// identical to what these subsystems used before (e.g. -// ~/Library/Caches on macOS, %LocalAppData% on Windows), keeping -// existing on-disk state reachable. -func OSCacheHome() string { - if v := os.Getenv("XDG_CACHE_HOME"); v != "" && filepath.IsAbs(v) { - return v - } - dir, err := os.UserCacheDir() - if err != nil || dir == "" { - return os.TempDir() +// A non-absolute $envVar is ignored, as the XDG Base Directory +// specification mandates ("If [the variable] is set to a relative path +// the value MUST be ignored"). +func unifiedDir(envVar, homeSub string) string { + if v := os.Getenv(envVar); v != "" && filepath.IsAbs(v) { + return filepath.Join(v, gortexDir) } - return dir + return filepath.Join(Home(), homeSub) } -// ConfigDir returns the Gortex configuration directory: -// "/gortex". Honours $XDG_CONFIG_HOME; falls back to -// $HOME/.config/gortex when unset. -func ConfigDir() string { - return filepath.Join(ConfigHome(), gortexDir) -} +// ConfigDir is where Gortex reads/writes configuration (config.yaml, +// servers.toml). Default ~/.gortex; an absolute $XDG_CONFIG_HOME +// relocates it to "/gortex". +func ConfigDir() string { return unifiedDir("XDG_CONFIG_HOME", "") } -// DataDir returns the Gortex data directory: "/gortex". -// Honours $XDG_DATA_HOME; falls back to $HOME/.local/share/gortex when -// unset. -func DataDir() string { - return filepath.Join(DataHome(), gortexDir) -} +// DataDir is the root for durable, non-disposable state (the on-disk +// store, downloaded models, development memories). Default ~/.gortex; +// an absolute $XDG_DATA_HOME relocates it to "/gortex". 
+func DataDir() string { return unifiedDir("XDG_DATA_HOME", "") } -// CacheDir returns the Gortex cache directory: "/gortex". -// Honours $XDG_CACHE_HOME; falls back to $HOME/.cache/gortex when -// unset. -func CacheDir() string { - return filepath.Join(CacheHome(), gortexDir) -} +// CacheDir is where Gortex keeps disposable state (the daemon socket / +// pid / log, snapshots, eval and token caches). Default ~/.gortex/cache; +// an absolute $XDG_CACHE_HOME relocates it to "/gortex". +func CacheDir() string { return unifiedDir("XDG_CACHE_HOME", cacheSub) } -// OSCacheDir returns the Gortex cache directory for subsystems whose -// historical root was os.UserCacheDir(): "/gortex". -// Honours $XDG_CACHE_HOME; falls back to os.UserCacheDir()/gortex when -// unset (preserving the pre-existing macOS / Windows location). -func OSCacheDir() string { - return filepath.Join(OSCacheHome(), gortexDir) -} +// OSCacheDir is retained for callers that historically rooted their +// cache at os.UserCacheDir(); under the unified layout it resolves to +// the same directory as CacheDir. +func OSCacheDir() string { return CacheDir() } -// legacyDir is the dot-directory ($HOME/.gortex) that a few subsystems -// adopted before Gortex grew an XDG-aware layout. It is already the -// Gortex-owned directory (no extra "gortex" sub-directory). New code -// should not add paths here; LegacyConfigDir / LegacyDataDir exist only -// so the pre-XDG subsystems keep an unchanged unset-env fallback. -const legacyDir = ".gortex" +// StoreDir is where the on-disk backend persists its store: +// /store (~/.gortex/store by default). +func StoreDir() string { return filepath.Join(DataDir(), "store") } -// legacyAwareDir resolves a Gortex directory for a pre-XDG subsystem. -// When the named XDG_*_HOME variable is set to an absolute path it -// wins, and the standard "/gortex" layout is used so the -// subsystem joins the same Gortex tree as everything else. 
When the -// variable is unset the legacy $HOME/.gortex location is returned -// verbatim, so an existing install's files stay reachable. -func legacyAwareDir(envVar string) string { - if v := os.Getenv(envVar); v != "" && filepath.IsAbs(v) { - return filepath.Join(v, gortexDir) - } - home, err := os.UserHomeDir() - if err != nil || home == "" { - return filepath.Join(os.TempDir(), legacyDir) - } - return filepath.Join(home, legacyDir) -} +// ModelsDir is where downloaded embedding models live: /models +// (~/.gortex/models by default). Models live under DataDir rather than +// CacheDir so a cache wipe doesn't discard multi-hundred-MB downloads. +func ModelsDir() string { return filepath.Join(DataDir(), "models") } -// LegacyConfigDir returns the Gortex config directory for subsystems -// that historically rooted config-shaped state at $HOME/.gortex -// (rather than $HOME/.config). An absolute $XDG_CONFIG_HOME wins -// ("/gortex"); otherwise the legacy $HOME/.gortex -// location is kept so existing files are not orphaned. -func LegacyConfigDir() string { - return legacyAwareDir("XDG_CONFIG_HOME") -} - -// LegacyDataDir returns the Gortex data directory for subsystems that -// historically rooted data-shaped state at $HOME/.gortex (rather than -// $HOME/.local/share). An absolute $XDG_DATA_HOME wins -// ("/gortex"); otherwise the legacy $HOME/.gortex -// location is kept so existing files are not orphaned. -func LegacyDataDir() string { - return legacyAwareDir("XDG_DATA_HOME") -} +// MemoriesDir is where cross-session development memories persist: +// /memories (~/.gortex/memories by default). 
+func MemoriesDir() string { return filepath.Join(DataDir(), "memories") } diff --git a/internal/platform/xdg_test.go b/internal/platform/xdg_test.go index 77d2abf0..2744b797 100644 --- a/internal/platform/xdg_test.go +++ b/internal/platform/xdg_test.go @@ -1,7 +1,6 @@ package platform import ( - "os" "path/filepath" "testing" ) @@ -16,8 +15,19 @@ func clearXDG(t *testing.T) { } } +// TestHome verifies the unified per-user directory is $HOME/.gortex. +func TestHome(t *testing.T) { + clearXDG(t) + home := t.TempDir() + t.Setenv("HOME", home) + + if got, want := Home(), filepath.Join(home, ".gortex"); got != want { + t.Errorf("Home() = %s, want %s", got, want) + } +} + // TestConfigDir_HonorsXDGConfigHome verifies an absolute $XDG_CONFIG_HOME -// is used verbatim. +// relocates config to the standard XDG location. func TestConfigDir_HonorsXDGConfigHome(t *testing.T) { clearXDG(t) xdg := t.TempDir() @@ -29,22 +39,21 @@ func TestConfigDir_HonorsXDGConfigHome(t *testing.T) { } } -// TestConfigDir_UnsetFallback verifies the env-unset fallback stays at -// the historical $HOME/.config/gortex location so existing installs are -// not orphaned. +// TestConfigDir_UnsetFallback verifies the env-unset default is the +// unified $HOME/.gortex directory. func TestConfigDir_UnsetFallback(t *testing.T) { clearXDG(t) home := t.TempDir() t.Setenv("HOME", home) - want := filepath.Join(home, ".config", "gortex") + want := filepath.Join(home, ".gortex") if got := ConfigDir(); got != want { - t.Errorf("ConfigDir() = %s, want %s (unset fallback must match the historical default)", got, want) + t.Errorf("ConfigDir() = %s, want %s (unified default)", got, want) } } -// TestDataDir_HonorsXDGDataHome verifies an absolute $XDG_DATA_HOME is -// used verbatim. +// TestDataDir_HonorsXDGDataHome verifies an absolute $XDG_DATA_HOME +// relocates data to the standard XDG location. 
func TestDataDir_HonorsXDGDataHome(t *testing.T) { clearXDG(t) xdg := t.TempDir() @@ -56,21 +65,21 @@ func TestDataDir_HonorsXDGDataHome(t *testing.T) { } } -// TestDataDir_UnsetFallback verifies the env-unset fallback is the XDG -// default $HOME/.local/share/gortex. +// TestDataDir_UnsetFallback verifies the env-unset default collapses +// into the unified $HOME/.gortex directory. func TestDataDir_UnsetFallback(t *testing.T) { clearXDG(t) home := t.TempDir() t.Setenv("HOME", home) - want := filepath.Join(home, ".local", "share", "gortex") + want := filepath.Join(home, ".gortex") if got := DataDir(); got != want { - t.Errorf("DataDir() = %s, want %s", got, want) + t.Errorf("DataDir() = %s, want %s (unified default)", got, want) } } // TestCacheDir_HonorsXDGCacheHome verifies an absolute $XDG_CACHE_HOME -// is used verbatim. +// relocates cache to the standard XDG location. func TestCacheDir_HonorsXDGCacheHome(t *testing.T) { clearXDG(t) xdg := t.TempDir() @@ -82,22 +91,89 @@ func TestCacheDir_HonorsXDGCacheHome(t *testing.T) { } } -// TestCacheDir_UnsetFallback verifies the env-unset fallback stays at -// the historical $HOME/.cache/gortex location. +// TestCacheDir_UnsetFallback verifies the env-unset default is the +// cache/ sub-directory inside the unified ~/.gortex tree. func TestCacheDir_UnsetFallback(t *testing.T) { clearXDG(t) home := t.TempDir() t.Setenv("HOME", home) - want := filepath.Join(home, ".cache", "gortex") + want := filepath.Join(home, ".gortex", "cache") if got := CacheDir(); got != want { - t.Errorf("CacheDir() = %s, want %s (unset fallback must match the historical default)", got, want) + t.Errorf("CacheDir() = %s, want %s (unified default)", got, want) + } +} + +// TestOSCacheDir_ConvergesWithCacheDir verifies OSCacheDir now resolves +// to the same place as CacheDir under both an XDG override and the +// unified default. 
+func TestOSCacheDir_ConvergesWithCacheDir(t *testing.T) { + clearXDG(t) + home := t.TempDir() + t.Setenv("HOME", home) + if got, want := OSCacheDir(), CacheDir(); got != want { + t.Errorf("OSCacheDir() = %s, want %s (must converge with CacheDir)", got, want) + } + if got, want := OSCacheDir(), filepath.Join(home, ".gortex", "cache"); got != want { + t.Errorf("OSCacheDir() unified = %s, want %s", got, want) + } + + xdg := t.TempDir() + t.Setenv("XDG_CACHE_HOME", xdg) + if got, want := OSCacheDir(), filepath.Join(xdg, "gortex"); got != want { + t.Errorf("OSCacheDir() with XDG_CACHE_HOME = %s, want %s", got, want) + } +} + +// TestPurposeDirs_UnsetFallback verifies the store / models / memories +// sub-directories hang off the unified ~/.gortex tree by default. +func TestPurposeDirs_UnsetFallback(t *testing.T) { + clearXDG(t) + home := t.TempDir() + t.Setenv("HOME", home) + + cases := []struct { + name string + got func() string + want string + }{ + {"store", StoreDir, filepath.Join(home, ".gortex", "store")}, + {"models", ModelsDir, filepath.Join(home, ".gortex", "models")}, + {"memories", MemoriesDir, filepath.Join(home, ".gortex", "memories")}, + } + for _, tc := range cases { + if got := tc.got(); got != tc.want { + t.Errorf("%sDir() = %s, want %s", tc.name, got, tc.want) + } + } +} + +// TestPurposeDirs_HonorXDGDataHome verifies the purpose sub-directories +// follow an absolute $XDG_DATA_HOME into the standard XDG layout. 
+func TestPurposeDirs_HonorXDGDataHome(t *testing.T) { + clearXDG(t) + xdg := t.TempDir() + t.Setenv("XDG_DATA_HOME", xdg) + + cases := []struct { + name string + got func() string + want string + }{ + {"store", StoreDir, filepath.Join(xdg, "gortex", "store")}, + {"models", ModelsDir, filepath.Join(xdg, "gortex", "models")}, + {"memories", MemoriesDir, filepath.Join(xdg, "gortex", "memories")}, + } + for _, tc := range cases { + if got := tc.got(); got != tc.want { + t.Errorf("%sDir() = %s, want %s", tc.name, got, tc.want) + } } } // TestNonAbsoluteXDGIgnored verifies a relative XDG_*_HOME value is // ignored, as the XDG Base Directory specification mandates — the -// resolver falls back to the $HOME default instead. +// resolver falls back to the unified $HOME/.gortex default instead. func TestNonAbsoluteXDGIgnored(t *testing.T) { clearXDG(t) home := t.TempDir() @@ -110,9 +186,9 @@ func TestNonAbsoluteXDGIgnored(t *testing.T) { got func() string want string }{ - {"config", "XDG_CONFIG_HOME", "relative/config", ConfigDir, filepath.Join(home, ".config", "gortex")}, - {"data", "XDG_DATA_HOME", "relative/data", DataDir, filepath.Join(home, ".local", "share", "gortex")}, - {"cache", "XDG_CACHE_HOME", "relative/cache", CacheDir, filepath.Join(home, ".cache", "gortex")}, + {"config", "XDG_CONFIG_HOME", "relative/config", ConfigDir, filepath.Join(home, ".gortex")}, + {"data", "XDG_DATA_HOME", "relative/data", DataDir, filepath.Join(home, ".gortex")}, + {"cache", "XDG_CACHE_HOME", "relative/cache", CacheDir, filepath.Join(home, ".gortex", "cache")}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { @@ -124,47 +200,3 @@ func TestNonAbsoluteXDGIgnored(t *testing.T) { }) } } - -// TestOSCacheDir_HonorsXDGCacheHome verifies the os.UserCacheDir-rooted -// helper still honours an absolute $XDG_CACHE_HOME — the consistency -// guarantee the resolver gives every subsystem. 
-func TestOSCacheDir_HonorsXDGCacheHome(t *testing.T) { - clearXDG(t) - xdg := t.TempDir() - t.Setenv("XDG_CACHE_HOME", xdg) - - want := filepath.Join(xdg, "gortex") - if got := OSCacheDir(); got != want { - t.Errorf("OSCacheDir() = %s, want %s", got, want) - } -} - -// TestOSCacheDir_UnsetFallback verifies that with $XDG_CACHE_HOME unset -// OSCacheDir falls back to os.UserCacheDir()/gortex, byte-identical to -// what the os.UserCacheDir-rooted subsystems used before this change. -func TestOSCacheDir_UnsetFallback(t *testing.T) { - clearXDG(t) - base, err := os.UserCacheDir() - if err != nil { - t.Skipf("os.UserCacheDir unavailable: %v", err) - } - want := filepath.Join(base, "gortex") - if got := OSCacheDir(); got != want { - t.Errorf("OSCacheDir() = %s, want %s (unset fallback must match os.UserCacheDir)", got, want) - } -} - -// TestOSCacheDir_NonAbsoluteIgnored verifies a relative $XDG_CACHE_HOME -// is ignored by the os.UserCacheDir-rooted helper too. -func TestOSCacheDir_NonAbsoluteIgnored(t *testing.T) { - clearXDG(t) - t.Setenv("XDG_CACHE_HOME", "relative/cache") - base, err := os.UserCacheDir() - if err != nil { - t.Skipf("os.UserCacheDir unavailable: %v", err) - } - want := filepath.Join(base, "gortex") - if got := OSCacheDir(); got != want { - t.Errorf("OSCacheDir() with relative XDG_CACHE_HOME = %s, want %s", got, want) - } -} diff --git a/internal/progress/zaplog.go b/internal/progress/zaplog.go new file mode 100644 index 00000000..65342e4d --- /dev/null +++ b/internal/progress/zaplog.go @@ -0,0 +1,114 @@ +package progress + +import ( + "context" + "sync" + "time" + + "go.uber.org/zap" +) + +// ZapReporter logs every Report call as a zap INFO line. Used in +// non-TTY environments (the daemon, CI) where the Spinner is +// silent so progress is invisible. 
Stage transitions get logged +// immediately; intra-stage progress (current/total) gets logged on +// transition AND every progressInterval seconds so a slow stage +// emits a heartbeat instead of going quiet. +type ZapReporter struct { + logger *zap.Logger + prefix string + interval time.Duration + + mu sync.Mutex + lastStage string + stageStart time.Time + lastEmitted time.Time + lastCur int + lastTotal int +} + +// NewZapReporter creates a reporter that logs to the given logger. +// prefix is added to every log line ("indexer", "multi-repo", …). +// interval is the heartbeat cadence for intra-stage progress +// (0 disables heartbeats — only stage transitions log). +func NewZapReporter(logger *zap.Logger, prefix string, interval time.Duration) *ZapReporter { + if logger == nil { + logger = zap.NewNop() + } + return &ZapReporter{ + logger: logger, + prefix: prefix, + interval: interval, + } +} + +// Report records a stage advancement. Always logs on a stage +// transition; logs intra-stage updates at most once per interval. +func (r *ZapReporter) Report(stage string, cur, total int) { + r.mu.Lock() + defer r.mu.Unlock() + now := time.Now() + if stage != r.lastStage { + if r.lastStage != "" { + r.logger.Info(r.prefix+": stage end", + zap.String("stage", r.lastStage), + zap.Duration("elapsed", now.Sub(r.stageStart)), + ) + } + r.lastStage = stage + r.stageStart = now + r.lastEmitted = now + r.lastCur = cur + r.lastTotal = total + r.logger.Info(r.prefix+": stage start", + zap.String("stage", stage), + zap.Int("current", cur), + zap.Int("total", total), + ) + return + } + // Same stage — heartbeat at most once per interval. 
+ if r.interval > 0 && now.Sub(r.lastEmitted) < r.interval { + return + } + r.lastEmitted = now + r.lastCur = cur + r.lastTotal = total + r.logger.Info(r.prefix+": stage progress", + zap.String("stage", stage), + zap.Int("current", cur), + zap.Int("total", total), + zap.Duration("elapsed", now.Sub(r.stageStart)), + ) +} + +// StartHeartbeat runs a goroutine that logs an "alive" line every +// interval until the context is done. Useful when the indexer is +// inside a long-running phase that doesn't call Report itself +// (e.g. the disk backend's bulk writes during a slow drain). +func StartHeartbeat(ctx context.Context, logger *zap.Logger, prefix string, interval time.Duration, snapshot func() map[string]any) { + if logger == nil || interval <= 0 { + return + } + go func() { + t := time.NewTicker(interval) + defer t.Stop() + start := time.Now() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + fields := []zap.Field{ + zap.Duration("elapsed", time.Since(start)), + } + if snapshot != nil { + for k, v := range snapshot() { + fields = append(fields, zap.Any(k, v)) + } + } + logger.Info(prefix+": heartbeat", fields...) + } + } + }() +} diff --git a/internal/query/class_hierarchy.go b/internal/query/class_hierarchy.go index 0feccdef..bf705cbc 100644 --- a/internal/query/class_hierarchy.go +++ b/internal/query/class_hierarchy.go @@ -50,6 +50,13 @@ var methodHierarchyEdgeKinds = map[graph.EdgeKind]bool{ // Workspace / project scope is enforced via opts.ScopeAllows on every // neighbour. opts.MinTier is applied as a post-pass over the collected // edges (consistent with the rest of the engine surface). +// +// Picks ClassHierarchyTraverser when the backend implements it: that +// path runs the BFS as one variable-length traversal per direction +// inside the engine, replacing the per-node GetNode + GetIn/OutEdges +// loop the fallback runs. 
On a disk backend a deep walk over a wide +// implementer set previously fired hundreds of round-trips per +// call — the pushdown drops to one or two queries. func (e *Engine) ClassHierarchy(seedID string, direction HierarchyDirection, depth int, includeMethods bool, opts QueryOptions) *SubGraph { if direction == "" { direction = HierarchyBoth @@ -61,6 +68,272 @@ func (e *Engine) ClassHierarchy(seedID string, direction HierarchyDirection, dep depth = 64 } + seed := e.g.GetNode(seedID) + if seed == nil { + return &SubGraph{} + } + + if _, ok := e.g.(graph.ClassHierarchyTraverser); ok { + return e.classHierarchyPushdown(seed, direction, depth, includeMethods, opts) + } + return e.classHierarchyWalk(seed, direction, depth, includeMethods, opts) +} + +// classHierarchyPushdown runs the BFS through the +// ClassHierarchyTraverser capability. Each direction issues one or +// two backend round-trips (the type-edge kinds, optionally chasing +// methods through EdgeMemberOf) instead of the per-frontier per-hop +// loop the fallback runs. +func (e *Engine) classHierarchyPushdown( + seed *graph.Node, + direction HierarchyDirection, + depth int, + includeMethods bool, + opts QueryOptions, +) *SubGraph { + tr := e.g.(graph.ClassHierarchyTraverser) + walkUp := direction == HierarchyUp || direction == HierarchyBoth + walkDown := direction == HierarchyDown || direction == HierarchyBoth + + typeKinds := []graph.EdgeKind{graph.EdgeExtends, graph.EdgeImplements, graph.EdgeComposes} + methodKinds := []graph.EdgeKind{graph.EdgeOverrides} + + // Per-direction walks: type-hierarchy kinds rooted at seed if seed + // is a type/interface; method-hierarchy kinds rooted at seed if + // seed is a method/function. Methods reached via includeMethods + // are added as separate roots in a follow-up pass. 
+ var rows []graph.ClassHierarchyRow + seedIsType := seed.Kind == graph.KindType || seed.Kind == graph.KindInterface + seedIsMethod := seed.Kind == graph.KindMethod || seed.Kind == graph.KindFunction + if seedIsType { + if walkUp { + rows = append(rows, tr.ClassHierarchyTraverse(seed.ID, "up", typeKinds, depth)...) + } + if walkDown { + rows = append(rows, tr.ClassHierarchyTraverse(seed.ID, "down", typeKinds, depth)...) + } + } else if seedIsMethod { + if walkUp { + rows = append(rows, tr.ClassHierarchyTraverse(seed.ID, "up", methodKinds, depth)...) + } + if walkDown { + rows = append(rows, tr.ClassHierarchyTraverse(seed.ID, "down", methodKinds, depth)...) + } + } + + // Collect the node IDs visited so we can resolve them in one + // batched fetch, instead of one GetNode per row. + visited := map[string]bool{seed.ID: true} + for _, r := range rows { + for _, id := range r.Path { + visited[id] = true + } + } + + // includeMethods folds in EdgeMemberOf hops from every visited + // type node. The override walk on each method then runs as a + // further pushdown call. 
+ memberLinks := []struct { + from, to string + kind graph.EdgeKind + }{} + if includeMethods { + typeIDs := make([]string, 0, len(visited)) + for id := range visited { + n := e.g.GetNode(id) + if n == nil { + continue + } + if n.Kind == graph.KindType || n.Kind == graph.KindInterface { + typeIDs = append(typeIDs, id) + } + } + if len(typeIDs) > 0 { + memberIns := e.g.GetInEdgesByNodeIDs(typeIDs) + methodRoots := []string{} + for _, id := range typeIDs { + for _, ed := range memberIns[id] { + if ed == nil || ed.Kind != graph.EdgeMemberOf { + continue + } + member := e.g.GetNode(ed.From) + if member == nil { + continue + } + if member.Kind != graph.KindMethod && member.Kind != graph.KindFunction { + continue + } + memberLinks = append(memberLinks, struct { + from, to string + kind graph.EdgeKind + }{from: member.ID, to: id, kind: graph.EdgeMemberOf}) + if !visited[member.ID] { + visited[member.ID] = true + methodRoots = append(methodRoots, member.ID) + } + } + } + for _, mid := range methodRoots { + if walkUp { + subRows := tr.ClassHierarchyTraverse(mid, "up", methodKinds, depth) + for _, sr := range subRows { + for _, id := range sr.Path { + visited[id] = true + } + } + rows = append(rows, methodPathsWithRoot(mid, subRows)...) + } + if walkDown { + subRows := tr.ClassHierarchyTraverse(mid, "down", methodKinds, depth) + for _, sr := range subRows { + for _, id := range sr.Path { + visited[id] = true + } + } + rows = append(rows, methodPathsWithRoot(mid, subRows)...) + } + } + } + } + + // Resolve every visited node + collect the edge pointers in one + // place. The capability doesn't carry edge pointers (on-disk + // backend edges aren't first-class objects), so we re-resolve them via + // GetOutEdgesByNodeIDs / GetInEdgesByNodeIDs once per direction. 
+ allIDs := make([]string, 0, len(visited)) + for id := range visited { + allIDs = append(allIDs, id) + } + nodeMap := e.g.GetNodesByIDs(allIDs) + if nodeMap[seed.ID] == nil { + nodeMap[seed.ID] = seed + } + + resultNodes := make([]*graph.Node, 0, len(allIDs)) + for _, id := range allIDs { + n := nodeMap[id] + if n == nil { + continue + } + if opts.WorkspaceID != "" && id != seed.ID && !opts.ScopeAllows(n) { + continue + } + resultNodes = append(resultNodes, n) + } + + // Reconstruct edges: each row's Path[i] → Path[i+1] (for i>=0) + // carries an edge of EdgeKinds[i]. The seed's first hop is from + // seed → Path[0]. The direction the walk came from determines + // whether the edge points seed→neighbour or neighbour→seed. + resultEdges := make([]*graph.Edge, 0) + seenEdge := make(map[string]bool) + addEdge := func(from, to string, kind graph.EdgeKind) { + // Find the actual *Edge so the downstream FilterByMinTier + // still has the origin / tier columns to read. + var found *graph.Edge + for _, ed := range e.g.GetOutEdges(from) { + if ed == nil { + continue + } + if ed.To == to && ed.Kind == kind { + found = ed + break + } + } + if found == nil { + // Direction-flipped lookup — happens when "down" walks + // hand back paths whose hops are in-edges of the seed. 
+ for _, ed := range e.g.GetInEdges(from) { + if ed == nil { + continue + } + if ed.From == to && ed.Kind == kind { + found = ed + break + } + } + } + if found == nil { + return + } + k := found.From + "→" + found.To + "::" + string(found.Kind) + ":" + edgeMetaTag(found) + if seenEdge[k] { + return + } + seenEdge[k] = true + resultEdges = append(resultEdges, found) + } + for _, r := range rows { + prev := seed.ID + for i, nb := range r.Path { + if i >= len(r.EdgeKinds) { + break + } + addEdge(prev, nb, r.EdgeKinds[i]) + prev = nb + } + } + for _, link := range memberLinks { + addEdge(link.from, link.to, link.kind) + } + + // Workspace-scope post-filter for edges (any edge whose endpoints + // were dropped from resultNodes is also dropped). + if opts.WorkspaceID != "" { + nodeSet := make(map[string]bool, len(resultNodes)) + for _, n := range resultNodes { + nodeSet[n.ID] = true + } + filtered := resultEdges[:0] + for _, ed := range resultEdges { + if !nodeSet[ed.From] || !nodeSet[ed.To] { + continue + } + filtered = append(filtered, ed) + } + resultEdges = filtered + } + + sg := &SubGraph{ + Nodes: resultNodes, + Edges: resultEdges, + TotalNodes: len(resultNodes), + TotalEdges: len(resultEdges), + } + if opts.MinTier != "" { + sg.FilterByMinTier(opts.MinTier) + } + return sg +} + +// methodPathsWithRoot rebases the traversal rows so the seed prefix +// in their paths reflects the method root they came from rather than +// the outer ClassHierarchy seed. Returned rows are otherwise +// unchanged. +func methodPathsWithRoot(root string, rows []graph.ClassHierarchyRow) []graph.ClassHierarchyRow { + out := make([]graph.ClassHierarchyRow, len(rows)) + for i, r := range rows { + newPath := append([]string{root}, r.Path...) + newKinds := append([]graph.EdgeKind{}, r.EdgeKinds...) + // The seed→Path[0] hop is encoded by EdgeMemberOf in the outer + // addEdge pass, so we keep the EdgeKinds slice aligned with + // the slice the caller iterates ([0]=Path[0]→Path[1]). 
+ out[i] = graph.ClassHierarchyRow{Path: newPath[1:], EdgeKinds: newKinds} + _ = newPath + } + return out +} + +// classHierarchyWalk is the in-memory BFS path. Kept verbatim so the +// in-memory backend has the same shape it had before the pushdown +// landed. +func (e *Engine) classHierarchyWalk( + seed *graph.Node, + direction HierarchyDirection, + depth int, + includeMethods bool, + opts QueryOptions, +) *SubGraph { walkUp := direction == HierarchyUp || direction == HierarchyBoth walkDown := direction == HierarchyDown || direction == HierarchyBoth @@ -77,9 +350,6 @@ func (e *Engine) ClassHierarchy(seedID string, direction HierarchyDirection, dep resultNodes = append(resultNodes, n) } - // Edges are deduped by their source pointer identity — the graph - // store hands out stable pointers per edge, so a pointer key is - // sufficient and avoids constructing a synthetic key per edge. edgeKey := func(ed *graph.Edge) string { return ed.From + "→" + ed.To + "::" + string(ed.Kind) + ":" + edgeMetaTag(ed) } @@ -95,17 +365,13 @@ func (e *Engine) ClassHierarchy(seedID string, direction HierarchyDirection, dep resultEdges = append(resultEdges, ed) } - seed := e.g.GetNode(seedID) - if seed == nil { - return &SubGraph{} - } addNode(seed) type queued struct { id string depth int } - queue := []queued{{id: seedID, depth: 0}} + queue := []queued{{id: seed.ID, depth: 0}} for len(queue) > 0 { cur := queue[0] @@ -122,10 +388,6 @@ func (e *Engine) ClassHierarchy(seedID string, direction HierarchyDirection, dep isType := curNode.Kind == graph.KindType || curNode.Kind == graph.KindInterface isMethod := curNode.Kind == graph.KindMethod || curNode.Kind == graph.KindFunction - // Pull in member methods of type/interface nodes when requested. - // This happens at the visit step (not as a hop), so methods land - // in the result without consuming a depth budget — they're a - // projection of the type, not a separate hierarchy hop. 
if includeMethods && isType { for _, mEdge := range e.g.GetInEdges(cur.id) { if mEdge.Kind != graph.EdgeMemberOf { @@ -143,15 +405,10 @@ func (e *Engine) ClassHierarchy(seedID string, direction HierarchyDirection, dep } addNode(member) addEdge(mEdge) - // Surface the method itself for the override walk in - // the next iteration. Same depth budget as the parent - // type so a method's overrides cost the same as walking - // to a method-seed at this depth. queue = append(queue, queued{id: member.ID, depth: cur.depth}) } } - // Pick edge kinds based on what kind of node we're standing on. var kindSet map[graph.EdgeKind]bool switch { case isType: @@ -159,7 +416,6 @@ func (e *Engine) ClassHierarchy(seedID string, direction HierarchyDirection, dep case isMethod: kindSet = methodHierarchyEdgeKinds default: - // Fields, params, files, etc. — nothing to walk. continue } diff --git a/internal/query/engine.go b/internal/query/engine.go index cb89b4a4..455c44c3 100644 --- a/internal/query/engine.go +++ b/internal/query/engine.go @@ -3,6 +3,7 @@ package query import ( "sort" "strings" + "time" "github.com/zzet/gortex/internal/graph" "github.com/zzet/gortex/internal/search" @@ -51,7 +52,7 @@ func (e *Engine) Reader() graph.Reader { return e.g } // NewEngine creates a query engine wrapping the given graph. The // default 11-signal rerank.Pipeline is wired in; callers wanting a // custom signal set / weights override via SetRerank. -func NewEngine(g *graph.Graph) *Engine { +func NewEngine(g graph.Store) *Engine { return &Engine{g: g, rerank: rerank.NewDefault()} } @@ -128,13 +129,74 @@ func (e *Engine) FindSymbols(name string, kinds ...graph.NodeKind) []*graph.Node return filtered } -// GetFileSymbols returns all symbols defined in a file. +// GetFileSymbolsCounts returns the file's symbols and the count of +// edges adjacent to them, without materialising the edges themselves. 
+// Use it instead of GetFileSymbols when the caller only needs an +// edge total (gcx + compact output paths in get_file_summary), since +// the disk backends can collapse the edge round-trip into a server- +// side aggregate that's orders of magnitude cheaper than shipping +// every row back over cgo. +// +// Backends that implement graph.FileSubGraphCountReader handle the +// count server-side; others fall through to a full GetFileSymbols call +// and report len(sg.Edges) (correct, just not cheap). +func (e *Engine) GetFileSymbolsCounts(filePath string) *SubGraph { + if pd, ok := e.g.(graph.FileSubGraphCountReader); ok { + nodes, edgeCount := pd.GetFileSubGraphCounts(filePath) + if len(nodes) == 0 { + return &SubGraph{} + } + return &SubGraph{ + Nodes: nodes, + TotalNodes: len(nodes), + TotalEdges: edgeCount, + } + } + sg := e.GetFileSymbols(filePath) + if sg == nil { + return &SubGraph{} + } + // Strip edges — the caller asked for counts only and we don't + // want stale edge buffers riding back on the SubGraph. + sg.Edges = nil + return sg +} + +// GetFileSymbols returns the file node, every symbol the file +// defines or contains, and every edge adjacent to any of them. +// +// Backends that implement graph.FileSubGraphReader (the on-disk +// store, for instance) handle the whole walk in one method call so +// they can express the symbol enumeration as a primary-key probe + +// adjacency walk instead of a property-filter scan over Node. +// Backends without the capability fall through to the +// GetFileNodes + GetOut/InEdgesByNodeIDs trio — equivalent on the +// in-memory graph (the per-id lookups are already O(1)). 
func (e *Engine) GetFileSymbols(filePath string) *SubGraph { + if pd, ok := e.g.(graph.FileSubGraphReader); ok { + nodes, edges := pd.GetFileSubGraph(filePath) + if len(nodes) == 0 { + return &SubGraph{} + } + return &SubGraph{ + Nodes: nodes, Edges: edges, + TotalNodes: len(nodes), TotalEdges: len(edges), + } + } nodes := e.g.GetFileNodes(filePath) - var edges []*graph.Edge + if len(nodes) == 0 { + return &SubGraph{} + } + ids := make([]string, 0, len(nodes)) for _, n := range nodes { - edges = append(edges, e.g.GetOutEdges(n.ID)...) - edges = append(edges, e.g.GetInEdges(n.ID)...) + ids = append(ids, n.ID) + } + outByID := e.g.GetOutEdgesByNodeIDs(ids) + inByID := e.g.GetInEdgesByNodeIDs(ids) + var edges []*graph.Edge + for _, id := range ids { + edges = append(edges, outByID[id]...) + edges = append(edges, inByID[id]...) } return &SubGraph{ Nodes: nodes, Edges: dedup(edges), @@ -304,6 +366,32 @@ func (e *Engine) FindUsagesScoped(nodeID string, opts QueryOptions) *SubGraph { edges := e.g.GetInEdges(nodeID) nodeMap := make(map[string]*graph.Node) var filtered []*graph.Edge + + // First pass: collect every From id whose edge kind qualifies as + // a usage. We need the From *Node for the workspace / test + // filters below, but the legacy loop fetched it with one GetNode + // per edge — on a disk backend that's one query round-trip per + // inbound edge, which for hot symbols (hundreds of callers) was + // the dominant cost of find_usages. Pre-filter the kinds, then + // batch the lookup so the disk backend issues one query instead + // of N. The target nodeID rides on the same batch so the + // "include the target node itself" step at the end of this + // function does not need its own per-id call. 
+ fromIDs := make([]string, 0, len(edges)+1) + seenFrom := make(map[string]struct{}, len(edges)) + for _, edge := range edges { + if !isUsageEdgeKind(edge.Kind) { + continue + } + if _, dup := seenFrom[edge.From]; dup { + continue + } + seenFrom[edge.From] = struct{}{} + fromIDs = append(fromIDs, edge.From) + } + fromIDs = append(fromIDs, nodeID) + fromByID := e.g.GetNodesByIDs(fromIDs) + for _, edge := range edges { // EdgeProvides + EdgeConsumes carry DI token relationships — // `@Inject(TOKEN)` and `{ provide: TOKEN, useValue: ... }` @@ -319,17 +407,8 @@ func (e *Engine) FindUsagesScoped(nodeID string, opts QueryOptions) *SubGraph { // callers via the legacy reads_config path; find_usages on a // Service returns Ingresses routing to it (EdgeDependsOn); // find_usages on an Image returns workloads pulling it. - if edge.Kind == graph.EdgeCalls || edge.Kind == graph.EdgeReferences || - edge.Kind == graph.EdgeInstantiates || - edge.Kind == graph.EdgeReturns || edge.Kind == graph.EdgeTypedAs || - edge.Kind == graph.EdgeImplements || edge.Kind == graph.EdgeExtends || - edge.Kind == graph.EdgeComposes || - edge.Kind == graph.EdgeProvides || edge.Kind == graph.EdgeConsumes || - edge.Kind == graph.EdgeReadsConfig || edge.Kind == graph.EdgeWritesConfig || - edge.Kind == graph.EdgeUsesEnv || edge.Kind == graph.EdgeConfigures || - edge.Kind == graph.EdgeMounts || edge.Kind == graph.EdgeExposes || - edge.Kind == graph.EdgeDependsOn { - from := e.g.GetNode(edge.From) + if isUsageEdgeKind(edge.Kind) { + from := fromByID[edge.From] if opts.WorkspaceID != "" && !opts.ScopeAllows(from) { continue } @@ -342,8 +421,8 @@ func (e *Engine) FindUsagesScoped(nodeID string, opts QueryOptions) *SubGraph { } } } - // Include the target node itself. - if n := e.g.GetNode(nodeID); n != nil { + // Include the target node itself (already in the batch above). 
+ if n := fromByID[nodeID]; n != nil { nodeMap[n.ID] = n } nodes := make([]*graph.Node, 0, len(nodeMap)) @@ -389,11 +468,25 @@ func (e *Engine) SearchSymbolsRanked(query string, limit int, opts QueryOptions, } } + // Engine-side rctx wins over the opts-piggybacked one (the explicit + // arg is the load-bearing path for callers that build the context + // inline). Callers (the MCP search_symbols handler) that build the + // rctx upstream and want both BM25 calls to share the same edge- + // cache seeding pass it through opts.RerankContext instead. + gatherCtx := rctx + if gatherCtx == nil { + gatherCtx = opts.RerankContext + } + var cands []*rerank.Candidate if s := e.getSearch(); s != nil && s.Count() > 0 { - cands = e.gatherBackendCandidates(query, fetchLimit) + cands = e.gatherBackendCandidates(query, fetchLimit, opts, gatherCtx) } else { + start := time.Now() nodes := e.searchSubstring(query, fetchLimit) + if opts.SearchTimings != nil { + opts.SearchTimings.FallbackMS += time.Since(start).Milliseconds() + } cands = make([]*rerank.Candidate, 0, len(nodes)) for i, n := range nodes { cands = append(cands, &rerank.Candidate{Node: n, TextRank: i, VectorRank: -1}) @@ -418,13 +511,27 @@ func (e *Engine) SearchSymbolsRanked(query string, limit int, opts QueryOptions, // ranking within one merged corpus. No-op for a single-repo set. crossRepoRerank(cands) - if e.rerank != nil { + if e.rerank != nil && !opts.SkipInnerRerank { ctx := rctx if ctx == nil { ctx = &rerank.Context{} } ctx.Graph = e.g + // When the caller supplied opts.RerankContext (the bundle- + // seeding handler), inherit its cached edges so this per-call + // rerank's prepare can read them — saves the 2 batched edge + // fetches per BM25 fan-out on the bundle hot path. Session + // signals stay scoped to the OUTER rerank (the one the handler + // runs against the merged candidate set); the inner rerank + // gets a structural-only context plus the bundle-cached edges. 
+ if rctx == nil && opts.RerankContext != nil { + ctx.InheritEdgeCacheFrom(opts.RerankContext) + } + rerankStart := time.Now() e.rerank.Rerank(query, cands, ctx) + if opts.SearchTimings != nil { + opts.SearchTimings.EngineRerankMS += time.Since(rerankStart).Milliseconds() + } } if len(cands) > limit { @@ -452,20 +559,186 @@ func (e *Engine) SearchSymbolsScoped(query string, limit int, opts QueryOptions) // substring / bigram-rescue matches. Each candidate carries its // 0-based TextRank and VectorRank (or -1 when the channel didn't // return it) so the rerank pipeline can score per channel. -func (e *Engine) gatherBackendCandidates(query string, limit int) []*rerank.Candidate { +// +// Bundle fast path: when the backend implements +// SymbolBundleSearcherBackend, BM25 hits + their Node payload + their +// in/out edges all arrive in one engine round-trip. The bundle's +// edges seed rctx (when non-nil) so the rerank pipeline's prepare +// pass can skip its own batched fetch entirely. Vector channel IDs +// (which don't carry edges in the bundle) still route through the +// per-call GetNodesByIDs + GetIn/OutEdgesByNodeIDs path; bundle and +// vector candidates merge into one rerank slice. +// +// Fallback (no bundle support): the legacy path — Search() / channel +// for IDs, GetNodesByIDs to materialise. On a disk backend +// the bundle fast path collapses 3 round-trips (FTS + nodes + +// the rerank's 2 edge fetches) into 4 server-side queries with no +// engine→rerank boundary crossings; the GetNodesByIDs cost goes +// away entirely for the BM25 hits. +func (e *Engine) gatherBackendCandidates(query string, limit int, opts QueryOptions, rctx *rerank.Context) []*rerank.Candidate { backend := e.getSearch() + timings := opts.SearchTimings - // Pull text + vector channels separately when the backend exposes - // them (HybridBackend). Otherwise treat plain Search() output as - // text-only. + // Bundle fast path. 
The SymbolBundleSearcherBackend assertion + // chains through Swappable → HybridBackend → SymbolSearcherBackend + // in production; both Swappable and HybridBackend forward when + // the inner backend supports it. Vector IDs still need the + // per-call materialise — bundles don't carry vector hits. var ( - textResults []search.SearchResult - vectorIDs []string + textResults []search.SearchResult + vectorIDs []string + bundleHandled bool + bundleNodeByID = make(map[string]*graph.Node) ) - if cs, ok := backend.(search.ChannelSearcher); ok { - textResults, vectorIDs = cs.SearchChannels(query, limit*2) - } else { - textResults = backend.Search(query, limit*2) + if bsb, ok := backend.(search.SymbolBundleSearcherBackend); ok { + // Pull the vector channel separately when present. Bundles + // cover BM25 only; the engine merges vector hits below. + // VectorChannelOnly avoids re-running the text BM25 path — + // the bundle already returned the BM25 hits and their full + // node + edge payload. Falling back to SearchChannels here + // would double-pay the FTS query cost per BM25 fan-out. 
+ type vectorOnly interface { + VectorChannelOnly(query string, limit int) ([]string, search.ChannelTimings) + } + vectorOnlyBackend, vectorOnlyOK := backend.(vectorOnly) + bundleStart := time.Now() + bundles := bsb.SearchSymbolBundles(query, limit*2) + if timings != nil { + timings.BundleMS += time.Since(bundleStart).Milliseconds() + } + if len(bundles) > 0 { + bundleHandled = true + textResults = make([]search.SearchResult, 0, len(bundles)) + outSeed := make(map[string][]*graph.Edge, len(bundles)) + inSeed := make(map[string][]*graph.Edge, len(bundles)) + for _, b := range bundles { + if b.Node == nil { + continue + } + bundleNodeByID[b.Node.ID] = b.Node + textResults = append(textResults, search.SearchResult{ID: b.Node.ID, Score: b.Score}) + outSeed[b.Node.ID] = b.OutEdges + inSeed[b.Node.ID] = b.InEdges + } + // Seed the rerank context's edge caches so prepare() can + // skip its own batched fetch for the bundle-covered IDs. + // preSeeded=true is the contract that prepare's batched + // edge fetch is now redundant — see rerank.Context for the + // invariant the engine relies on (the next caller's + // candidate set is fully covered by these maps for the + // BM25 hits; vector / substring fallback hits are still + // served by the per-candidate accessor fallback). + if rctx != nil { + rctx.SeedEdgeCaches(inSeed, outSeed, true) + } + } + // Vector channel: only when the bundle path took the BM25 + // branch. Otherwise the fallback path below pulls both. + // VectorChannelOnly skips the BM25 re-run (the bundle already + // returned text hits + their full payload); a few hundred + // microseconds of embed + ANN, not a second FTS query. + // + // opts.SkipVectorChannel suppresses the embed + ANN entirely. 
+ // The MCP handler flips this on for identifier-shape queries + // (QueryClassSymbol / Path / Signature) where the rerank's + // classWeightTable already proves semantic contributes near- + // zero signal vs the BM25 channel — see classWeightTable in + // internal/search/rerank/query_kind.go. + if vectorOnlyOK && !opts.SkipVectorChannel { + vecIDs, stats := vectorOnlyBackend.VectorChannelOnly(query, limit*2) + vectorIDs = vecIDs + if timings != nil { + timings.EmbedMS += stats.EmbedMS + timings.VectorSearchMS += stats.VectorSearchMS + } + } + } + + // Legacy / fallback path: bundle backend absent OR returned no + // hits. Pull text + vector channels separately when the backend + // exposes them (HybridBackend). Otherwise treat plain Search() + // output as text-only. The wall-clock for the backend search + // call lands on the outer caller's BM25*MS bucket — measuring + // around the engine boundary captures the full per-call cost + // without double-counting against the post-call GetNodesByIDs / + // FindNodesByName / Fallback phases that this function + // instruments individually below. + if !bundleHandled { + type timedChan interface { + SearchChannelsTimed(query string, limit int) ([]search.SearchResult, []string, search.ChannelTimings) + } + switch { + case opts.SkipVectorChannel: + // Identifier-shape fast path: skip the vector channel + // (no embed, no ANN) and run text-only Search. The cost + // saved is the per-call embedder + vector index hit; the + // rerank's classWeightTable proves it's not earning its + // keep for these query classes. 
+ textStart := time.Now() + textResults = backend.Search(query, limit*2) + if timings != nil { + timings.TextBackendMS += time.Since(textStart).Milliseconds() + } + default: + if tc, ok := backend.(timedChan); ok { + var stats search.ChannelTimings + textResults, vectorIDs, stats = tc.SearchChannelsTimed(query, limit*2) + if timings != nil { + timings.TextBackendMS += stats.TextMS + timings.EmbedMS += stats.EmbedMS + timings.VectorSearchMS += stats.VectorSearchMS + } + } else if cs, ok := backend.(search.ChannelSearcher); ok { + textStart := time.Now() + textResults, vectorIDs = cs.SearchChannels(query, limit*2) + if timings != nil { + timings.TextBackendMS += time.Since(textStart).Milliseconds() + } + } else { + textStart := time.Now() + textResults = backend.Search(query, limit*2) + if timings != nil { + timings.TextBackendMS += time.Since(textStart).Milliseconds() + } + } + } + } + + // Collect every ID NOT covered by the bundle path (vector hits + + // fallback path's text hits) and materialise them with one + // batched fetch. Empty IDs are tolerated — the batch lookup + // ignores them and the per-id insert short-circuits below. + idBatch := make([]string, 0, len(textResults)+len(vectorIDs)) + for _, r := range textResults { + if r.ID != "" { + if _, covered := bundleNodeByID[r.ID]; covered { + continue + } + idBatch = append(idBatch, r.ID) + } + } + for _, id := range vectorIDs { + if id != "" { + if _, covered := bundleNodeByID[id]; covered { + continue + } + idBatch = append(idBatch, id) + } + } + getNodesStart := time.Now() + nodeByID := e.g.GetNodesByIDs(idBatch) + if timings != nil { + timings.GetNodesMS += time.Since(getNodesStart).Milliseconds() + } + if nodeByID == nil { + // GetNodesByIDs returns nil for empty input — we still need a + // non-nil map below to merge the bundle's nodes into. 
+ nodeByID = make(map[string]*graph.Node, len(bundleNodeByID)) + } + // Merge the bundle's already-materialised nodes into the same + // lookup map the per-candidate insert step below reads from. + for id, n := range bundleNodeByID { + nodeByID[id] = n } idx := make(map[string]int) // node ID → slice index for dedup @@ -475,7 +748,7 @@ func (e *Engine) gatherBackendCandidates(query string, limit int) []*rerank.Cand if id == "" { return } - node := e.g.GetNode(id) + node := nodeByID[id] if node == nil || node.Kind == graph.KindFile || node.Kind == graph.KindImport { return } @@ -510,50 +783,70 @@ func (e *Engine) gatherBackendCandidates(query string, limit int) []*rerank.Cand } // Exact-name matches that BM25 might rank low — splice them in at - // the tail of the text channel so they're still text-ranked. - for _, n := range e.g.FindNodesByName(query) { - if n.Kind == graph.KindFile || n.Kind == graph.KindImport { - continue + // the tail of the text channel so they're still text-ranked. The + // caller can suppress this when the query string is known to never + // match a literal Name (the combined-OR fan-out's concatenated bag + // of expansion terms, for example) — saves the query round-trip + // that would unconditionally return zero rows. 
+ if !opts.SkipExactNameSplice { + findNameStart := time.Now() + for _, n := range e.g.FindNodesByName(query) { + if n.Kind == graph.KindFile || n.Kind == graph.KindImport { + continue + } + if _, seen := idx[n.ID]; seen { + continue + } + idx[n.ID] = len(cands) + cands = append(cands, &rerank.Candidate{Node: n, TextRank: len(textResults), VectorRank: -1}) } - if _, seen := idx[n.ID]; seen { - continue + if timings != nil { + timings.FindNameMS += time.Since(findNameStart).Milliseconds() } - idx[n.ID] = len(cands) - cands = append(cands, &rerank.Candidate{Node: n, TextRank: len(textResults), VectorRank: -1}) } // Substring fallback for remaining slots — strictly TextRank=-1 // (the rerank pipeline still considers them via signature/recency - // signals, but BM25 can't speak to them). Matches are collected, - // sorted by ID, then truncated, so the candidate set does not - // depend on the randomised map-iteration order of AllNodes(). + // signals, but BM25 can't speak to them). The store-side + // FindNodesByNameContaining pushes the predicate into the backend + // index instead of materialising every node over cgo and filtering + // in Go — the old AllNodes loop is broken at Linux-kernel scale + // (10M+ symbols, hundreds of MB of nodes per query). We over-fetch + // by a small slack factor so dedup against existing cands still + // leaves room to fill `limit`. if len(cands) < limit { - lower := strings.ToLower(query) - var subMatches []*graph.Node - for _, n := range e.g.AllNodes() { + fallbackStart := time.Now() + fetch := (limit - len(cands)) * 2 + if fetch < limit { + fetch = limit + } + subMatches := e.g.FindNodesByNameContaining(query, fetch) + // Stable ordering — backends may return in catalog order, which + // is not a meaningful relevance signal here. 
+ sort.Slice(subMatches, func(i, j int) bool { return subMatches[i].ID < subMatches[j].ID }) + for _, n := range subMatches { if n.Kind == graph.KindFile || n.Kind == graph.KindImport { continue } if _, seen := idx[n.ID]; seen { continue } - if strings.Contains(strings.ToLower(n.Name), lower) { - subMatches = append(subMatches, n) - } - } - sort.Slice(subMatches, func(i, j int) bool { return subMatches[i].ID < subMatches[j].ID }) - for _, n := range subMatches { idx[n.ID] = len(cands) cands = append(cands, &rerank.Candidate{Node: n, TextRank: -1, VectorRank: -1}) if len(cands) >= limit { break } } + if timings != nil { + timings.FallbackMS += time.Since(fallbackStart).Milliseconds() + } } // Bigram-overlap typo rescue. Same gates as the legacy path: // nothing else surfaced, query is one indivisible 4+ char token, - // backend can provide candidates. + // backend can provide candidates. The bigram backend also returns + // raw IDs — batch-materialise them too rather than fall back to + // per-id GetNode. if len(cands) == 0 && len(query) >= 4 && !strings.ContainsAny(query, " /.:_-") { if bg, ok := backend.(bigramProvider); ok { keys := len(query) - 1 @@ -561,18 +854,25 @@ func (e *Engine) gatherBackendCandidates(query string, limit int) []*rerank.Cand if minOverlap < 3 { minOverlap = 3 } - for _, id := range bg.BigramCandidates(query, minOverlap) { - if _, seen := idx[id]; seen { - continue - } - node := e.g.GetNode(id) - if node == nil || node.Kind == graph.KindFile || node.Kind == graph.KindImport { - continue - } - idx[id] = len(cands) - cands = append(cands, &rerank.Candidate{Node: node, TextRank: -1, VectorRank: -1}) - if len(cands) >= limit { - break + bigramIDs := bg.BigramCandidates(query, minOverlap) + // Skip the batch fetch entirely when the bigram backend + // returned nothing — otherwise we'd issue an empty query + // round-trip. 
+ if len(bigramIDs) > 0 { + bigramNodes := e.g.GetNodesByIDs(bigramIDs) + for _, id := range bigramIDs { + if _, seen := idx[id]; seen { + continue + } + node := bigramNodes[id] + if node == nil || node.Kind == graph.KindFile || node.Kind == graph.KindImport { + continue + } + idx[id] = len(cands) + cands = append(cands, &rerank.Candidate{Node: node, TextRank: -1, VectorRank: -1}) + if len(cands) >= limit { + break + } } } } @@ -725,18 +1025,11 @@ func (e *Engine) bfs(nodeID string, opts QueryOptions, forward bool, edgeKinds [ kindSet[k] = true } - visited := make(map[string]bool) + visited := map[string]bool{nodeID: true} var allNodes []*graph.Node var allEdges []*graph.Edge truncated := false - type item struct { - id string - depth int - } - queue := []item{{id: nodeID, depth: 0}} - visited[nodeID] = true - if n := e.g.GetNode(nodeID); n != nil { // The seed always enters the result, regardless of scope — // callers ask "what reaches X" with X already in mind. The @@ -744,92 +1037,147 @@ func (e *Engine) bfs(nodeID string, opts QueryOptions, forward bool, edgeKinds [ allNodes = append(allNodes, n) } - for len(queue) > 0 { - cur := queue[0] - queue = queue[1:] - - if cur.depth >= opts.Depth { - continue + // admit is the single place edge/node bookkeeping lives, shared by + // the batched and per-node expansion paths. It records the edge + // (unless the node budget is already full — the legacy code grew + // allEdges without bound, so a high-degree hub could pin gigabytes + // of edge structs), then admits a new, in-scope, non-test neighbour + // and returns its id to enqueue ("" = skip). + admit := func(edge *graph.Edge, neighborID string, neighbor *graph.Node) string { + // Skip unresolved/external targets. + if graph.IsUnresolvedTarget(neighborID) || strings.HasPrefix(neighborID, "external::") { + return "" } - - var edges []*graph.Edge - if bidir { - edges = append(e.g.GetOutEdges(cur.id), e.g.GetInEdges(cur.id)...) 
- } else if forward { - edges = e.g.GetOutEdges(cur.id) - } else { - edges = e.g.GetInEdges(cur.id) + // Once the node budget is full, stop recording edges too: the + // result is already truncated and an unbounded allEdges is the + // memory-blowup vector this guard closes. + if len(allNodes) >= opts.Limit { + truncated = true + return "" } + // ExcludeTests drops neighbours flagged as tests during a reverse + // traversal — a no-op for forward/bidirectional walks. + if opts.ExcludeTests && !forward && !bidir && isTestSource(neighbor) { + return "" + } + // Workspace/project scope: neighbours outside the bound scope are + // dropped along with the edge that pointed at them. + if opts.WorkspaceID != "" && neighbor != nil && !opts.ScopeAllows(neighbor) { + return "" + } + allEdges = append(allEdges, edge) + if visited[neighborID] { + return "" + } + visited[neighborID] = true + if neighbor == nil { + return "" + } + allNodes = append(allNodes, neighbor) + return neighborID + } - for _, edge := range edges { - if !bidir && !kindSet[edge.Kind] { - continue - } - - var neighborID string - if forward || bidir { - if edge.From == cur.id { - neighborID = edge.To - } else if bidir { - neighborID = edge.From - } else { + // A backend that implements graph.FrontierExpander (the on-disk + // store) returns a whole frontier's edges + neighbour nodes in one + // round-trip — no GetNode per edge, no meta decode. Bidirectional + // (cluster) walks and capability-less backends (the in-memory graph, + // whose reads are already O(1)) keep the per-node path. 
+ expander, batched := e.g.(graph.FrontierExpander) + batched = batched && !bidir && len(edgeKinds) > 0 + + frontier := []string{nodeID} + for depth := 0; depth < opts.Depth && len(frontier) > 0 && len(allNodes) < opts.Limit; depth++ { + var next []string + if batched { + for _, h := range expander.ExpandFrontier(frontier, forward, edgeKinds, opts.Limit) { + if h.Edge == nil { continue } - } else { - if edge.To == cur.id { - neighborID = edge.From - } else { - continue + neighborID := h.Edge.To + if !forward { + neighborID = h.Edge.From } - } - - // Skip unresolved/external targets. - if strings.HasPrefix(neighborID, "unresolved::") || strings.HasPrefix(neighborID, "external::") { - continue - } - - // ExcludeTests drops neighbours flagged as tests during a - // reverse traversal — for forward traversals it's a no-op - // because callers asking "who depends on X" (reverse) are - // the only consumers of this filter today. - if opts.ExcludeTests && !forward && !bidir { - if n := e.g.GetNode(neighborID); isTestSource(n) { - continue + if id := admit(h.Edge, neighborID, h.Neighbor); id != "" { + next = append(next, id) } - } - - // Workspace/project scope. When opts.WorkspaceID is set, - // neighbours outside that scope are dropped along with the - // edge that pointed at them. Cross-workspace edges produced - // by the resolver only exist when an explicit - // cross_workspace_dep allows them, so this filter also - // acts as the query-time enforcement of "find_usages on a - // tuck symbol returns hits only from tuck". - if opts.WorkspaceID != "" { - if n := e.g.GetNode(neighborID); n != nil && !opts.ScopeAllows(n) { - continue + if len(allNodes) >= opts.Limit { + truncated = true + break } } - - allEdges = append(allEdges, edge) - - if visited[neighborID] { - continue + } else { + for _, cur := range frontier { + var edges []*graph.Edge + switch { + case bidir: + edges = append(e.g.GetOutEdges(cur), e.g.GetInEdges(cur)...) 
+ case forward: + edges = e.g.GetOutEdges(cur) + default: + edges = e.g.GetInEdges(cur) + } + for _, edge := range edges { + if !bidir && !kindSet[edge.Kind] { + continue + } + var neighborID string + switch { + case forward || bidir: + if edge.From == cur { + neighborID = edge.To + } else if bidir { + neighborID = edge.From + } else { + continue + } + default: + if edge.To == cur { + neighborID = edge.From + } else { + continue + } + } + // One GetNode per neighbour (the legacy path fetched + // it twice — scope check, then materialise). + var neighbor *graph.Node + if !graph.IsUnresolvedTarget(neighborID) && !strings.HasPrefix(neighborID, "external::") { + neighbor = e.g.GetNode(neighborID) + } + if id := admit(edge, neighborID, neighbor); id != "" { + next = append(next, id) + } + if len(allNodes) >= opts.Limit { + truncated = true + break + } + } + if len(allNodes) >= opts.Limit { + break + } } - visited[neighborID] = true + } + frontier = next + } - n := e.g.GetNode(neighborID) - if n == nil { - continue + // ExpandFrontier returns meta-free neighbours; a full-detail caller + // (e.g. one reading Meta["signature"]) gets them re-hydrated in one + // batched round-trip. Brief callers (smart_context's ring, step-7) + // skip this — stripMeta would drop the meta anyway. 
+ if batched && opts.Detail != "brief" && len(allNodes) > 1 { + if hyd, ok := e.g.(interface { + GetNodesByIDs(ids []string) map[string]*graph.Node + }); ok { + ids := make([]string, 0, len(allNodes)) + for _, n := range allNodes { + ids = append(ids, n.ID) } - - if len(allNodes) >= opts.Limit { - truncated = true - continue + if full := hyd.GetNodesByIDs(ids); full != nil { + for i, n := range allNodes { + if fn := full[n.ID]; fn != nil { + allNodes[i] = fn + } + } } - - allNodes = append(allNodes, n) - queue = append(queue, item{id: neighborID, depth: cur.depth + 1}) } } @@ -853,6 +1201,28 @@ func stripMeta(sg *SubGraph) { } } +// isUsageEdgeKind reports whether an edge kind counts as a "usage" +// for FindUsages — the same predicate the legacy inline if-chain +// evaluated. Hoisted into a function so the kind set can be reused +// across the pre-filter pass and the materialisation pass without +// drifting. +func isUsageEdgeKind(k graph.EdgeKind) bool { + switch k { + case graph.EdgeCalls, graph.EdgeReferences, + graph.EdgeInstantiates, + graph.EdgeReturns, graph.EdgeTypedAs, + graph.EdgeImplements, graph.EdgeExtends, + graph.EdgeComposes, + graph.EdgeProvides, graph.EdgeConsumes, + graph.EdgeReadsConfig, graph.EdgeWritesConfig, + graph.EdgeUsesEnv, graph.EdgeConfigures, + graph.EdgeMounts, graph.EdgeExposes, + graph.EdgeDependsOn: + return true + } + return false +} + // isTestSource reports whether a node was flagged as a test by the // indexer's test-edge pass. Used by QueryOptions.ExcludeTests to drop // callers/users that originate in tests, leaving production callers. 
@@ -865,14 +1235,29 @@ func isTestSource(n *graph.Node) bool { } func dedup(edges []*graph.Edge) []*graph.Edge { - seen := make(map[string]bool) - var out []*graph.Edge + if len(edges) == 0 { + return edges + } + // Struct key avoids the per-edge string concatenation the old + // implementation paid (e.From + "->" + e.To + ":" + kind) — on a + // 4 000-edge file the alloc storm dominated GetFileSymbols. + type dedupKey struct { + from string + to string + kind graph.EdgeKind + } + seen := make(map[dedupKey]struct{}, len(edges)) + out := make([]*graph.Edge, 0, len(edges)) for _, e := range edges { - key := e.From + "->" + e.To + ":" + string(e.Kind) - if !seen[key] { - seen[key] = true - out = append(out, e) + if e == nil { + continue + } + k := dedupKey{from: e.From, to: e.To, kind: e.Kind} + if _, ok := seen[k]; ok { + continue } + seen[k] = struct{}{} + out = append(out, e) } return out } diff --git a/internal/query/subgraph.go b/internal/query/subgraph.go index b7483574..8a79392b 100644 --- a/internal/query/subgraph.go +++ b/internal/query/subgraph.go @@ -5,6 +5,7 @@ import ( "strings" "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/search/rerank" ) // SubGraph is a JSON-serializable result from a graph query. @@ -60,6 +61,92 @@ type QueryOptions struct { // indexer's test-edge pass. Lets find_usages / get_callers answer // "who depends on X *in production*" without test-noise dilution. ExcludeTests bool `json:"exclude_tests,omitempty"` + + // SearchTimings, when non-nil, is populated by the search hot path + // (SearchSymbolsScoped → gatherBackendCandidates) with per-phase + // wall-clock breakdowns. Used by the MCP search_symbols handler's + // debug log line; nil disables instrumentation. Single-call: the + // caller MUST hand a fresh struct per query (the engine does not + // reset). Never serialised — `json:"-"` keeps the option struct + // JSON shape stable. 
+ SearchTimings *SearchTimings `json:"-"` + + // RerankContext is the optional rerank context the engine uses when + // gathering bundle candidates: each bundle's in/out edges are + // seeded into the context's edge caches so the handler-side + // rerank.Pipeline.Rerank can skip its own batched edge fetch on + // the merged candidate set. Pass nil — the engine's gather path + // still works, the bundle's edges are just discarded after the + // per-call rerank. Never serialised. + RerankContext *rerank.Context `json:"-"` + + // SkipInnerRerank, when true, makes SearchSymbolsRanked skip its + // own per-call rerank.Pipeline.Rerank pass. Callers that fan a + // search across N expansion terms and merge the results themselves + // (the MCP search_symbols handler) re-run the rerank once on the + // merged candidate set with the full session-aware context — the + // inner per-call rerank is wasted work whose output is mostly + // discarded by the merge. Flipping this on collapses N+1 + // engine-side rerank invocations to zero. The merge-side rerank + // is the source of truth either way. + SkipInnerRerank bool `json:"-"` + + // SkipVectorChannel, when true, makes gatherBackendCandidates skip + // the vector channel entirely — no embedder call, no ANN search. + // Set by the MCP search_symbols handler on identifier-shape queries + // (QueryClassSymbol / QueryClassPath / QueryClassSignature) where + // the rerank's classWeightTable already proves the semantic + // channel contributes near-zero useful signal (multipliers 0.65 / + // 0.45 / 0.80 vs the baseline 1.00 for concept). Saves the embed + // + vector search round-trip on the common-case identifier lookup. + // The bundle path's vector-only branch and the legacy + // SearchChannels path both honour this flag. + SkipVectorChannel bool `json:"-"` + + // SkipExactNameSplice, when true, makes gatherBackendCandidates + // skip the FindNodesByName(query) splice-in. 
Set by callers that + // know the query string cannot match any exact node name — the + // fetchAndMergeBM25 fan-out's combined-OR call is the canonical + // case: a concatenated bag of expansion terms ("NewServer + // StartServer Server.Init …") can't be the literal Name of any + // node, so the FindNodesByName query round-trip is wasted work. + // The primary query still runs the splice. + SkipExactNameSplice bool `json:"-"` +} + +// SearchTimings carries per-phase wall-clock measurements collected +// by the BM25 retrieval pipeline. Zero-valued fields mean the phase +// didn't run on this call (e.g. FallbackMS is 0 when the BM25 result +// already saturated the limit). +type SearchTimings struct { + BM25PrimaryMS int64 // time spent in the primary BM25 backend call + BM25ExpansionMS int64 // time spent across all expansion-term BM25 calls + GetNodesMS int64 // time spent materialising BM25/vector IDs via GetNodesByIDs + FindNameMS int64 // time spent on the FindNodesByName splice-in + FallbackMS int64 // time spent in the substring/name-contains fallback + // Sub-buckets of the BM25*MS totals — proves which phase inside + // the wrapper is actually slow. Accumulated across every + // primary + expansion BM25 invocation. + TextBackendMS int64 // strictly inside Backend.Search / text channel + EmbedMS int64 // inside embedder.Embed (vector path only) + VectorSearchMS int64 // inside vector.Search ANN call (vector path only) + EngineRerankMS int64 // inside rerank.Pipeline.Rerank in SearchSymbolsRanked + // BundleMS accumulates the wall-clock spent inside + // SymbolBundleSearcherBackend.SearchSymbolBundles (one query per + // BM25 fan-out that returns Node + in/out edges in one bundle). + // When the backend supports bundles, the bundle path replaces the + // (TextBackend + GetNodes) sub-buckets; the bm25_backend_ms + // derivation in the handler subtracts BundleMS so the existing + // fields stay meaningful. 
+ BundleMS int64 + // CacheHitRate is the fraction of post-merge candidates whose + // in/out edges were already in the rerank Context cache when the + // handler-side prepare() ran. 1.0 means every candidate was + // pre-seeded from a bundle; 0.0 means the rerank had to fetch + // every candidate's edges itself. Populated by the handler when + // the bundle path is active so the search_symbols debug log can + // surface how often the seeding actually catches. + CacheHitRate float64 } // ScopeAllows reports whether a node passes the workspace/project diff --git a/internal/query/walk.go b/internal/query/walk.go index cf35a1ad..7fb070bc 100644 --- a/internal/query/walk.go +++ b/internal/query/walk.go @@ -204,7 +204,7 @@ func (e *Engine) WalkBudgeted(startID string, opts WalkOptions) *SubGraph { neighborID = edge.From } - if strings.HasPrefix(neighborID, "unresolved::") || + if graph.IsUnresolvedTarget(neighborID) || strings.HasPrefix(neighborID, "external::") { continue } diff --git a/internal/reach/reach.go b/internal/reach/reach.go index ed9edcfd..9968339a 100644 --- a/internal/reach/reach.go +++ b/internal/reach/reach.go @@ -105,7 +105,7 @@ var buildCounter uint64 // Safe to call repeatedly: existing reach_d* entries are overwritten // and the build counter advances each time so any consumer that read // an entry from a prior generation will fall back to a live walk. -func BuildIndex(g *graph.Graph) *Stats { +func BuildIndex(g graph.Store) *Stats { return BuildIndexCtx(context.Background(), g) } @@ -116,7 +116,7 @@ func BuildIndex(g *graph.Graph) *Stats { // longest stages on monorepo-scale graphs (~200 s on k8s with 150 k // impact seeds). Pure operator-visibility instrumentation: the per- // report call is cheap (no I/O when the reporter is the default no-op). 
-func BuildIndexCtx(ctx context.Context, g *graph.Graph) *Stats { +func BuildIndexCtx(ctx context.Context, g graph.Store) *Stats { if g == nil { return &Stats{} } @@ -146,6 +146,13 @@ func BuildIndexCtx(ctx context.Context, g *graph.Graph) *Stats { const reachProgressEvery = 1000 seedsDone := 0 + // Collect the seed nodes we stamp so we can persist the Meta back + // through the store in one batch at the end. On the in-memory + // backend the in-place stamp already persists (n is canonical); on + // disk backends n is a GetNode reconstruction, so without + // the write-back the whole reach index would be computed and then + // thrown away. Mirrors the per-seed AddNode in Lookup's slow path. + stamped := make([]*graph.Node, 0, seedTotal) for _, n := range nodes { if n == nil || !ImpactSeedKind(n.Kind) { continue @@ -169,6 +176,7 @@ func BuildIndexCtx(ctx context.Context, g *graph.Graph) *Stats { setOrDeleteStrings(n.Meta, MetaReachD2Label, tiers[1].Labels) setOrDeleteStrings(n.Meta, MetaReachD3Label, tiers[2].Labels) + stamped = append(stamped, n) stats.NodesIndexed++ stats.EntriesD1 += len(tiers[0].IDs) stats.EntriesD2 += len(tiers[1].IDs) @@ -179,6 +187,12 @@ func BuildIndexCtx(ctx context.Context, g *graph.Graph) *Stats { reporter.Report("reachability index", seedsDone, seedTotal) } } + // Persist every stamped node's Meta back through the store in one + // batch (no-op-ish on the in-memory backend, the durable write on + // disk backends). AddBatch with no edges only upserts the nodes. + if len(stamped) > 0 { + g.AddBatch(stamped, nil) + } reporter.Report("reachability index", seedsDone, seedTotal) return stats } @@ -221,14 +235,31 @@ func setOrDeleteFloats(m map[string]any, key string, value []float64) { // filtered with ReachableEdge so the result matches AnalyzeImpact; // file / import nodes are walked through for fan-out but excluded // from the tier slices. 
-func compute(g *graph.Graph, seedID string) [3]tier { +func compute(g graph.Store, seedID string) [3]tier { var result [3]tier visited := map[string]struct{}{seedID: {}} current := []string{seedID} - for depth := 1; depth <= 3; depth++ { + for depth := 1; depth <= 3 && len(current) > 0; depth++ { + // Batch the whole BFS level's incoming-edge fetch into one + // backend round-trip. The per-node g.GetInEdges(id) form issued + // one query per node on disk backends — an + // O(reachable-nodes) query storm that turned a single + // AnalyzeImpact live walk into a multi-minute (timeout) call on + // a disk backend. GetInEdgesByNodeIDs collapses it to one query per depth. + inEdges := g.GetInEdgesByNodeIDs(current) + + // First pass: discover this level's new From-nodes in + // deterministic (current-order, edge-order) order, recording the + // representative in-edge for each. + type cand struct { + from string + conf float64 + kind graph.EdgeKind + } var next []string + var cands []cand for _, id := range current { - for _, e := range g.GetInEdges(id) { + for _, e := range inEdges[id] { if !ReachableEdge(e.Kind) { continue } @@ -237,17 +268,30 @@ func compute(g *graph.Graph, seedID string) [3]tier { } visited[e.From] = struct{}{} next = append(next, e.From) + cands = append(cands, cand{from: e.From, conf: e.Confidence, kind: e.Kind}) + } + } - if n := g.GetNode(e.From); n == nil || - n.Kind == graph.KindFile || n.Kind == graph.KindImport { - continue - } - slot := depth - 1 - result[slot].IDs = append(result[slot].IDs, e.From) - result[slot].Conf = append(result[slot].Conf, e.Confidence) - result[slot].Labels = append(result[slot].Labels, - graph.ConfidenceLabelFor(e.Kind, e.Confidence)) + // Batch the node-kind lookups too — the original called + // g.GetNode(e.From) once per discovered node (a second per-node + // query storm on disk backends). 
File / import nodes are still + // walked through for fan-out (they stay in `next`) but excluded + // from the result tiers, exactly as before. + ids := make([]string, len(cands)) + for i := range cands { + ids[i] = cands[i].from + } + nodes := g.GetNodesByIDs(ids) + slot := depth - 1 + for _, c := range cands { + n := nodes[c.from] + if n == nil || n.Kind == graph.KindFile || n.Kind == graph.KindImport { + continue } + result[slot].IDs = append(result[slot].IDs, c.from) + result[slot].Conf = append(result[slot].Conf, c.conf) + result[slot].Labels = append(result[slot].Labels, + graph.ConfidenceLabelFor(c.kind, c.conf)) } current = next } @@ -287,7 +331,7 @@ func sortTierByID(t *tier) { // and bumps the build counter so any cached lookups dated to a prior // generation are invalidated. Use when the graph topology has shifted // so far that a full rebuild is cheaper than incremental invalidation. -func ClearIndex(g *graph.Graph) { +func ClearIndex(g graph.Store) { if g == nil { return } @@ -339,7 +383,7 @@ type Entry struct { // given seed, then caches forever. BuildIndex remains available for // `gortex enrich reach` (explicit prebuild) and for callers that // want to pay the cost up front under controlled conditions. -func Lookup(g *graph.Graph, seedID string) (d1, d2, d3 []Entry, hit bool) { +func Lookup(g graph.Store, seedID string) (d1, d2, d3 []Entry, hit bool) { if g == nil { return nil, nil, nil, false } @@ -386,6 +430,17 @@ func Lookup(g *graph.Graph, seedID string) (d1, d2, d3 []Entry, hit bool) { setOrDeleteStrings(n.Meta, MetaReachD2Label, tiers[1].Labels) setOrDeleteStrings(n.Meta, MetaReachD3Label, tiers[2].Labels) + // Persist the freshly-stamped Meta through the store. On the + // in-memory backend n is the canonical node, so the mutations above + // already stuck — AddNode re-inserts the same pointer idempotently. 
+ // On disk backends n is a per-call reconstruction returned + // by GetNode, so the in-place stamp would otherwise be discarded the + // moment this function returns: the lazy reach cache would never + // survive a single query, forcing a full recompute on every + // AnalyzeImpact / explain_change_impact / get_callers call. AddNode + // upserts the Meta column so the cache actually sticks. + g.AddNode(n) + d1 = readTier(n.Meta, MetaReachD1, MetaReachD1Conf, MetaReachD1Label) d2 = readTier(n.Meta, MetaReachD2, MetaReachD2Conf, MetaReachD2Label) d3 = readTier(n.Meta, MetaReachD3, MetaReachD3Conf, MetaReachD3Label) diff --git a/internal/releases/releases.go b/internal/releases/releases.go index 5d19a785..085577de 100644 --- a/internal/releases/releases.go +++ b/internal/releases/releases.go @@ -37,8 +37,26 @@ import ( // unavailable. Errors silently produce an empty list — releases // enrichment is best-effort like blame. func ListTags(repoRoot string) []string { - cmd := exec.Command("git", "-C", repoRoot, - "for-each-ref", "--sort=creatordate", "--format=%(refname:short)", "refs/tags/") + return ListTagsOnBranch(repoRoot, "") +} + +// ListTagsOnBranch is ListTags scoped to tags reachable from `branch`. +// Empty branch means "every tag in the repo", matching ListTags. +// +// Restricting to a single branch is the canonical defence against +// feature-branch tags polluting the release timeline: tags that were +// only ever pushed on a topic branch (a "v0.0.0-test" tag from a +// rebase scratch, for instance) shouldn't appear in the persisted +// release order. Pass the repo's default branch ("origin/main", +// "main", …) when callers want that semantic. 
+func ListTagsOnBranch(repoRoot, branch string) []string { + args := []string{"-C", repoRoot, "for-each-ref", + "--sort=creatordate", "--format=%(refname:short)"} + if strings.TrimSpace(branch) != "" { + args = append(args, "--merged="+branch) + } + args = append(args, "refs/tags/") + cmd := exec.Command("git", args...) out, err := cmd.Output() if err != nil { return nil @@ -104,7 +122,7 @@ func ReleaseNodeID(repoPrefix, tag string) string { // // Errors from individual git invocations are tolerated — a broken // ref shouldn't kill enrichment for the rest of the tag set. -func EnrichGraph(g *graph.Graph, repoRoot string) (int, error) { +func EnrichGraph(g graph.Store, repoRoot string) (int, error) { return EnrichGraphWithRepoPrefix(g, repoRoot, "") } @@ -112,11 +130,24 @@ func EnrichGraph(g *graph.Graph, repoRoot string) (int, error) { // EnrichGraph. EnrichGraph delegates to it with an empty prefix; the // multi-repo enricher passes the per-repo prefix so KindRelease IDs // stay collision-free across repos. -func EnrichGraphWithRepoPrefix(g *graph.Graph, repoRoot, repoPrefix string) (int, error) { +// +// Walks every tag in the repo. Use EnrichGraphForBranch when callers +// want to restrict the timeline to tags reachable from a specific +// branch — typically the default branch — so topic-branch tags don't +// pollute the persisted history. +func EnrichGraphWithRepoPrefix(g graph.Store, repoRoot, repoPrefix string) (int, error) { + return EnrichGraphForBranch(g, repoRoot, repoPrefix, "") +} + +// EnrichGraphForBranch is EnrichGraphWithRepoPrefix scoped to tags +// reachable from `branch`. Empty branch means "every tag", matching +// the legacy behaviour. Mutations round-trip through g.AddNode so +// disk-backed stores persist the result. 
+func EnrichGraphForBranch(g graph.Store, repoRoot, repoPrefix, branch string) (int, error) { if g == nil || repoRoot == "" { return 0, nil } - tags := ListTags(repoRoot) + tags := ListTagsOnBranch(repoRoot, branch) if len(tags) == 0 { return 0, nil } @@ -162,6 +193,8 @@ func EnrichGraphWithRepoPrefix(g *graph.Graph, repoRoot, repoPrefix string) (int } enriched := 0 + relWriter, useRelSidecar := g.(graph.ReleaseEnrichmentWriter) + var relRows []graph.ReleaseEnrichment for _, n := range g.AllNodes() { if n.Kind != graph.KindFile { continue @@ -185,11 +218,24 @@ func EnrichGraphWithRepoPrefix(g *graph.Graph, repoRoot, repoPrefix string) (int if !ok { continue } - if n.Meta == nil { - n.Meta = map[string]any{} + if useRelSidecar { + relRows = append(relRows, graph.ReleaseEnrichment{NodeID: n.ID, AddedIn: tag}) + } else { + if n.Meta == nil { + n.Meta = map[string]any{} + } + n.Meta["added_in"] = tag + // Re-upsert so disk-backed stores persist the Meta change. + g.AddNode(n) } - n.Meta["added_in"] = tag enriched++ } + // Sidecar persist (change A): release "added_in" rides in the typed + // release_enrichment table when the backend supports it. + if useRelSidecar && len(relRows) > 0 { + if err := relWriter.BulkSetReleases(repoPrefix, relRows); err != nil { + return enriched, fmt.Errorf("releases: persist sidecar: %w", err) + } + } return enriched, nil } diff --git a/internal/releases/releases_test.go b/internal/releases/releases_test.go index 0e44d30d..98a373e5 100644 --- a/internal/releases/releases_test.go +++ b/internal/releases/releases_test.go @@ -110,10 +110,15 @@ func TestEnrichGraph_AssignsEarliestTag(t *testing.T) { if count != 2 { t.Errorf("expected 2 enriched, got %d", count) } - if got := g.GetNode("a.go").Meta["added_in"]; got != "v0.1" { + // added_in now persists in the typed sidecar (change A), not Node.Meta. 
+ rel := map[string]string{} + for _, e := range g.ReleaseRows("") { + rel[e.NodeID] = e.AddedIn + } + if got := rel["a.go"]; got != "v0.1" { t.Errorf("a.go added_in = %v, want v0.1", got) } - if got := g.GetNode("b.go").Meta["added_in"]; got != "v0.2" { + if got := rel["b.go"]; got != "v0.2" { t.Errorf("b.go added_in = %v, want v0.2", got) } } @@ -135,7 +140,11 @@ func TestEnrichGraph_MultiRepoPrefixHandled(t *testing.T) { if count != 1 { t.Errorf("expected 1 enriched (with prefix-strip), got %d", count) } - if got := g.GetNode("myrepo/a.go").Meta["added_in"]; got != "v0.1" { + rel := map[string]string{} + for _, e := range g.ReleaseRows("") { + rel[e.NodeID] = e.AddedIn + } + if got := rel["myrepo/a.go"]; got != "v0.1" { t.Errorf("added_in = %v", got) } } diff --git a/internal/resolver/backend_resolver.go b/internal/resolver/backend_resolver.go new file mode 100644 index 00000000..6681f750 --- /dev/null +++ b/internal/resolver/backend_resolver.go @@ -0,0 +1,23 @@ +package resolver + +import ( + "os" + "strings" +) + +// backendResolverEnabled reports whether the resolver should consult +// graph.BackendResolver before running its Go-side worker pool. +// Default on for the disk-backed daemon: the backend resolver runs +// one query per rule rather than one round-trip per unresolved edge. +// With the multi-repo encoding exposing 100k+ `unresolved::*` edges +// at warmup, the per-edge Go path is the difference between a sub- +// 10-minute warmup and a hang / OOM. Set GORTEX_BACKEND_RESOLVER=0 +// to opt back out for the edge case where a small in-memory corpus +// can be heuristically resolved faster in RAM. 
+func backendResolverEnabled() bool { + v := os.Getenv("GORTEX_BACKEND_RESOLVER") + if v == "0" || strings.EqualFold(v, "false") { + return false + } + return true +} diff --git a/internal/resolver/bare_name_scope_bind.go b/internal/resolver/bare_name_scope_bind.go new file mode 100644 index 00000000..e8b34c2e --- /dev/null +++ b/internal/resolver/bare_name_scope_bind.go @@ -0,0 +1,195 @@ +package resolver + +import ( + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// scopeNode is the per-binding payload of the owner-keyed scope +// index built by bindBareNameScopeRefs. Kept as a named struct so +// the bind helpers can share the same signature. +type scopeNode struct { + id string + name string + startLine int + kind graph.NodeKind +} + +// bindBareNameScopeRefs rewrites `unresolved::` edges whose +// source is inside a function scope (or IS a function) onto the +// matching KindLocal / KindParam node that the enclosing function +// declares. Pre-#77 there was nothing to bind to — locals were +// edge-endpoint-only — so the resolver always fell through to +// `unresolved::*`. With #77's KindLocal materialisation the scope is +// now first-class and we can do the bind. +// +// Two precedence rules govern the choice when more than one candidate +// matches the name: +// +// 1. KindLocal beats KindParam — Go shadowing semantics, a local +// declared with the same name as a parameter takes over from its +// declaration line onwards. +// 2. Among KindLocal candidates the most recently declared one before +// the reference line wins (the standard "last shadow in scope" +// rule). The edge's Line field is the reference site; we filter +// candidates to StartLine <= reference line and pick the maximum +// StartLine. +// +// Ambiguous cases that don't resolve to one winner (e.g. 
two locals +// with the same Name on the same StartLine, or no candidate before +// the reference line) are left untouched so the downstream `unresolved` +// audit can still surface them. +// +// Scope today is Go-only — TypeScript / Python don't materialise +// locals yet, so their unresolved bare-name edges have no candidate +// to bind to. The pass naturally degenerates to a no-op for those +// languages because the candidate index will be empty for their +// owners. +func (r *Resolver) bindBareNameScopeRefs() { + // Index every KindLocal / KindParam by enclosing-function ID. Done + // once up front so the per-edge bind is an O(matching-name) walk + // rather than a graph-wide FindNodesByName. + owned := map[string][]scopeNode{} + for n := range r.graph.NodesByKind(graph.KindLocal) { + owner := enclosingFunctionForBinding(n.ID) + if owner == "" { + continue + } + owned[owner] = append(owned[owner], scopeNode{ + id: n.ID, name: n.Name, startLine: n.StartLine, kind: graph.KindLocal, + }) + } + for n := range r.graph.NodesByKind(graph.KindParam) { + owner := enclosingFunctionForBinding(n.ID) + if owner == "" { + continue + } + owned[owner] = append(owned[owner], scopeNode{ + id: n.ID, name: n.Name, startLine: n.StartLine, kind: graph.KindParam, + }) + } + if len(owned) == 0 { + return + } + + var batch []graph.EdgeReindex + for e := range r.graph.EdgesByKind(graph.EdgeReads) { + if rewrote := r.tryBindBareName(e, owned); rewrote != "" { + batch = append(batch, graph.EdgeReindex{Edge: e, OldTo: rewrote}) + } + } + for e := range r.graph.EdgesByKind(graph.EdgeReferences) { + if rewrote := r.tryBindBareName(e, owned); rewrote != "" { + batch = append(batch, graph.EdgeReindex{Edge: e, OldTo: rewrote}) + } + } + // EdgeArgOf and EdgeValueFlow carry the same shape — `unresolved::` + // is the dataflow source/target the parser couldn't bind. 
+ for e := range r.graph.EdgesByKind(graph.EdgeArgOf) { + if rewrote := r.tryBindBareName(e, owned); rewrote != "" { + batch = append(batch, graph.EdgeReindex{Edge: e, OldTo: rewrote}) + } + } + for e := range r.graph.EdgesByKind(graph.EdgeValueFlow) { + if rewrote := r.tryBindBareName(e, owned); rewrote != "" { + batch = append(batch, graph.EdgeReindex{Edge: e, OldTo: rewrote}) + } + } + if len(batch) > 0 { + r.graph.ReindexEdges(batch) + } +} + +// tryBindBareName tries to rewrite e.To from `unresolved::` to a +// matching in-scope KindLocal/KindParam ID. Returns the original To +// value when a rewrite happened (caller batches it for ReindexEdges) +// or "" when the edge was left alone. +func (r *Resolver) tryBindBareName(e *graph.Edge, owned map[string][]scopeNode) string { + if e == nil || !graph.IsUnresolvedTarget(e.To) { + return "" + } + name := graph.UnresolvedName(e.To) + if name == "" || strings.ContainsAny(name, ".*:#") { + // Not a bare identifier — leave to other passes (qualified + // names, *.method, etc.). + return "" + } + ownerID := enclosingFunctionForBinding(e.From) + if ownerID == "" { + return "" + } + candidates := owned[ownerID] + if len(candidates) == 0 { + return "" + } + chosen := pickInScopeBinding(candidates, name, e.Line) + if chosen == "" || chosen == e.To { + return "" + } + oldTo := e.To + e.To = chosen + return oldTo +} + +// pickInScopeBinding implements the precedence rules: +// - prefer KindLocal over KindParam (Go shadowing), +// - among KindLocal, pick the latest StartLine that's still <= refLine, +// - if multiple candidates match the same maximum StartLine, return "" +// (ambiguous — leave the edge unresolved so the audit surfaces it). +// +// owned is the per-owner scope-node slice; name is the bare identifier +// from the edge target; refLine is the edge's line (the reference +// site). Returns the chosen ID, or "" when no unambiguous winner. 
+func pickInScopeBinding(owned []scopeNode, name string, refLine int) string { + var bestLocal struct { + id string + line int + dups int + } + var paramID string + for _, c := range owned { + if c.name != name { + continue + } + if c.kind == graph.KindLocal { + if refLine > 0 && c.startLine > refLine { + // Declared after the reference — can't be bound here. + continue + } + switch { + case c.startLine > bestLocal.line: + bestLocal.id = c.id + bestLocal.line = c.startLine + bestLocal.dups = 0 + case c.startLine == bestLocal.line && c.id != bestLocal.id: + bestLocal.dups++ + } + } else if c.kind == graph.KindParam { + if paramID != "" && paramID != c.id { + // Two params with the same name in the same function + // shouldn't happen but defensive — abstain. + paramID = "" + } else { + paramID = c.id + } + } + } + if bestLocal.id != "" && bestLocal.dups == 0 { + return bestLocal.id + } + return paramID +} + +// enclosingFunctionForBinding strips the per-binding suffix added by +// the Go extractor (`#local:`, `#param:`, `#closure`, `#tparam:`) to +// recover the owner function/method ID. If `id` has no suffix it's +// returned unchanged — the caller is already a function/method node +// directly (the per-edge From is the function itself for things like +// the `external::foo` import edge inside `func Foo()`). 
+func enclosingFunctionForBinding(id string) string { + if i := strings.Index(id, "#"); i > 0 { + return id[:i] + } + return id +} diff --git a/internal/resolver/bare_name_scope_bind_test.go b/internal/resolver/bare_name_scope_bind_test.go new file mode 100644 index 00000000..98db3f6b --- /dev/null +++ b/internal/resolver/bare_name_scope_bind_test.go @@ -0,0 +1,200 @@ +package resolver + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/zzet/gortex/internal/graph" +) + +// TestBindBareNameScopeRefs_LocalWins covers the headline case: a +// function declares a KindLocal `key1`; an EdgeReads to +// `unresolved::key1` originating from that function's body should be +// rewritten to point at the KindLocal node. +func TestBindBareNameScopeRefs_LocalWins(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + localID := owner + "#local:key1@+3" + g.AddNode(&graph.Node{ + ID: localID, Kind: graph.KindLocal, Name: "key1", + FilePath: "pkg/foo.go", StartLine: 3, EndLine: 3, Language: "go", + }) + g.AddEdge(&graph.Edge{From: localID, To: owner, Kind: graph.EdgeMemberOf, FilePath: "pkg/foo.go", Line: 3}) + + edge := &graph.Edge{ + From: owner, To: "unresolved::key1", + Kind: graph.EdgeReads, FilePath: "pkg/foo.go", Line: 5, + } + g.AddEdge(edge) + + r := New(g) + r.bindBareNameScopeRefs() + + assert.Equal(t, localID, edge.To, "EdgeReads must be rewritten to the in-scope KindLocal") +} + +// TestBindBareNameScopeRefs_FromBindingResolvesToOwner — the From of +// the edge is itself a per-binding ID (`#local:x@+N`); the +// pass should strip the suffix to recover the enclosing function and +// still bind correctly. 
+func TestBindBareNameScopeRefs_FromBindingResolvesToOwner(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + keyID := owner + "#local:key@+2" + g.AddNode(&graph.Node{ID: keyID, Kind: graph.KindLocal, Name: "key", FilePath: "pkg/foo.go", StartLine: 2, Language: "go"}) + g.AddEdge(&graph.Edge{From: keyID, To: owner, Kind: graph.EdgeMemberOf}) + + from := owner + "#local:out@+5" + g.AddNode(&graph.Node{ID: from, Kind: graph.KindLocal, Name: "out", FilePath: "pkg/foo.go", StartLine: 5, Language: "go"}) + g.AddEdge(&graph.Edge{From: from, To: owner, Kind: graph.EdgeMemberOf}) + + edge := &graph.Edge{From: from, To: "unresolved::key", Kind: graph.EdgeValueFlow, Line: 5} + g.AddEdge(edge) + + New(g).bindBareNameScopeRefs() + assert.Equal(t, keyID, edge.To, "From with #local: suffix must still resolve via enclosing function") +} + +// TestBindBareNameScopeRefs_ParamFallback covers the Go-shadowing +// fallback: when no local matches, the parameter with the same name +// wins. +func TestBindBareNameScopeRefs_ParamFallback(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + paramID := owner + "#param:req" + g.AddNode(&graph.Node{ID: paramID, Kind: graph.KindParam, Name: "req", FilePath: "pkg/foo.go", Language: "go"}) + g.AddEdge(&graph.Edge{From: paramID, To: owner, Kind: graph.EdgeParamOf}) + + edge := &graph.Edge{From: owner, To: "unresolved::req", Kind: graph.EdgeReads, Line: 3} + g.AddEdge(edge) + + New(g).bindBareNameScopeRefs() + assert.Equal(t, paramID, edge.To, "no matching local — param with same name must take over") +} + +// TestBindBareNameScopeRefs_LocalShadowsParam — both a param and a +// local share the same name; the local wins (Go shadowing). 
+func TestBindBareNameScopeRefs_LocalShadowsParam(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + paramID := owner + "#param:x" + g.AddNode(&graph.Node{ID: paramID, Kind: graph.KindParam, Name: "x", FilePath: "pkg/foo.go", Language: "go"}) + g.AddEdge(&graph.Edge{From: paramID, To: owner, Kind: graph.EdgeParamOf}) + + localID := owner + "#local:x@+4" + g.AddNode(&graph.Node{ID: localID, Kind: graph.KindLocal, Name: "x", FilePath: "pkg/foo.go", StartLine: 4, Language: "go"}) + g.AddEdge(&graph.Edge{From: localID, To: owner, Kind: graph.EdgeMemberOf}) + + edge := &graph.Edge{From: owner, To: "unresolved::x", Kind: graph.EdgeReads, Line: 6} + g.AddEdge(edge) + + New(g).bindBareNameScopeRefs() + assert.Equal(t, localID, edge.To, "KindLocal must shadow KindParam with the same name") +} + +// TestBindBareNameScopeRefs_RefBeforeDeclLeftAlone — a reference +// whose line is BEFORE the local's StartLine can't possibly bind to +// that local. The pass must leave the edge unresolved rather than +// reach backwards. 
+func TestBindBareNameScopeRefs_RefBeforeDeclLeftAlone(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + localID := owner + "#local:tmp@+10" + g.AddNode(&graph.Node{ID: localID, Kind: graph.KindLocal, Name: "tmp", FilePath: "pkg/foo.go", StartLine: 10, Language: "go"}) + g.AddEdge(&graph.Edge{From: localID, To: owner, Kind: graph.EdgeMemberOf}) + + edge := &graph.Edge{From: owner, To: "unresolved::tmp", Kind: graph.EdgeReads, Line: 3} + g.AddEdge(edge) + + New(g).bindBareNameScopeRefs() + assert.Equal(t, "unresolved::tmp", edge.To, "reference before declaration must not bind") +} + +// TestBindBareNameScopeRefs_LatestShadowWins covers the standard "last +// shadow in scope" rule when two locals share a name across scopes: +// the binding declared on the higher line (closer to the reference) +// wins. +func TestBindBareNameScopeRefs_LatestShadowWins(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + earlier := owner + "#local:err@+2" + g.AddNode(&graph.Node{ID: earlier, Kind: graph.KindLocal, Name: "err", FilePath: "pkg/foo.go", StartLine: 2, Language: "go"}) + g.AddEdge(&graph.Edge{From: earlier, To: owner, Kind: graph.EdgeMemberOf}) + + later := owner + "#local:err@+8" + g.AddNode(&graph.Node{ID: later, Kind: graph.KindLocal, Name: "err", FilePath: "pkg/foo.go", StartLine: 8, Language: "go"}) + g.AddEdge(&graph.Edge{From: later, To: owner, Kind: graph.EdgeMemberOf}) + + edge := &graph.Edge{From: owner, To: "unresolved::err", Kind: graph.EdgeReads, Line: 12} + g.AddEdge(edge) + + New(g).bindBareNameScopeRefs() + assert.Equal(t, later, edge.To, "the most recent shadow before the reference line must win") +} + +// TestBindBareNameScopeRefs_AmbiguousLeftAlone — two locals with the +// same 
name declared on the same line (shouldn't happen in valid Go +// but defensive): the pass must leave the edge unresolved rather +// than pick an arbitrary winner. +func TestBindBareNameScopeRefs_AmbiguousLeftAlone(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + a := owner + "#local:err@+5" + b := owner + "#local:err@+5#1" + g.AddNode(&graph.Node{ID: a, Kind: graph.KindLocal, Name: "err", FilePath: "pkg/foo.go", StartLine: 5, Language: "go"}) + g.AddNode(&graph.Node{ID: b, Kind: graph.KindLocal, Name: "err", FilePath: "pkg/foo.go", StartLine: 5, Language: "go"}) + g.AddEdge(&graph.Edge{From: a, To: owner, Kind: graph.EdgeMemberOf}) + g.AddEdge(&graph.Edge{From: b, To: owner, Kind: graph.EdgeMemberOf}) + + edge := &graph.Edge{From: owner, To: "unresolved::err", Kind: graph.EdgeReads, Line: 7} + g.AddEdge(edge) + + New(g).bindBareNameScopeRefs() + assert.Equal(t, "unresolved::err", edge.To, "ambiguous candidates on same line must leave the edge unresolved") +} + +// TestBindBareNameScopeRefs_QualifiedNotTouched ensures the pass only +// fires on bare names — qualified shapes (`*.Method`, `pkg.Name`, +// `unresolved::pyrel::...`) are left to other passes. +func TestBindBareNameScopeRefs_QualifiedNotTouched(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + // Even if a local matches the unqualified part, the qualified + // shapes must be left alone. 
+ g.AddNode(&graph.Node{ID: owner + "#local:Foo@+2", Kind: graph.KindLocal, Name: "Foo", FilePath: "pkg/foo.go", StartLine: 2, Language: "go"}) + g.AddEdge(&graph.Edge{From: owner + "#local:Foo@+2", To: owner, Kind: graph.EdgeMemberOf}) + + keep := []*graph.Edge{ + {From: owner, To: "unresolved::*.Foo", Kind: graph.EdgeReads, Line: 5}, + {From: owner, To: "unresolved::pkg.Foo", Kind: graph.EdgeReads, Line: 6}, + {From: owner, To: "unresolved::pyrel::./foo", Kind: graph.EdgeReads, Line: 7}, + } + for _, e := range keep { + g.AddEdge(e) + } + + New(g).bindBareNameScopeRefs() + for _, e := range keep { + assert.True(t, + e.To == "unresolved::*.Foo" || e.To == "unresolved::pkg.Foo" || e.To == "unresolved::pyrel::./foo", + "qualified shape %q must stay untouched", e.To, + ) + } +} diff --git a/internal/resolver/bench_test.go b/internal/resolver/bench_test.go index bbce2a3e..8ea93f6a 100644 --- a/internal/resolver/bench_test.go +++ b/internal/resolver/bench_test.go @@ -8,7 +8,7 @@ import ( ) // buildResolverGraph creates a graph with unresolved edges for benchmarking. -func buildResolverGraph(files, symsPerFile int) (*graph.Graph, *Resolver) { +func buildResolverGraph(files, symsPerFile int) (graph.Store, *Resolver) { g := graph.New() // Create file nodes with functions, types, and methods. diff --git a/internal/resolver/concurrent_test.go b/internal/resolver/concurrent_test.go index 682f33c1..b06ee542 100644 --- a/internal/resolver/concurrent_test.go +++ b/internal/resolver/concurrent_test.go @@ -98,7 +98,7 @@ func TestResolver_CrossRepoResolver_SerializeOnGraph(t *testing.T) { // one unresolved edge so the resolver actually has work to do during // the race test. The shape doesn't matter — only that buildDirIndexes // observes >0 file nodes and the resolveEdge inner loop runs. 
-func buildSmallGraph(t *testing.T) *graph.Graph { +func buildSmallGraph(t *testing.T) graph.Store { t.Helper() g := graph.New() for _, fp := range []string{"repo-a/lib/a.go", "repo-a/lib/b.go", "repo-b/main.go"} { diff --git a/internal/resolver/cross_pkg_call_guard_test.go b/internal/resolver/cross_pkg_call_guard_test.go index db98107c..080e8095 100644 --- a/internal/resolver/cross_pkg_call_guard_test.go +++ b/internal/resolver/cross_pkg_call_guard_test.go @@ -14,7 +14,7 @@ import ( // faithful end-to-end harness for the resolver tests below: a real // extractor produces the unresolved edges, then ResolveAll runs against // them exactly as it does on a live index. -func buildGraphFromSources(t *testing.T, files map[string]string) *graph.Graph { +func buildGraphFromSources(t *testing.T, files map[string]string) graph.Store { t.Helper() g := graph.New() ts := languages.NewTypeScriptExtractor() @@ -50,7 +50,7 @@ func buildGraphFromSources(t *testing.T, files map[string]string) *graph.Graph { // callEdgeTo returns the resolved To-end of the call/reference edge that // leaves fromID at the given 1-based line. Empty string when no such // edge exists. -func callEdgeTo(g *graph.Graph, fromID string, line int) string { +func callEdgeTo(g graph.Store, fromID string, line int) string { for _, e := range g.GetOutEdges(fromID) { if (e.Kind == graph.EdgeCalls || e.Kind == graph.EdgeReferences) && e.Line == line { return e.To diff --git a/internal/resolver/cross_pkg_guard.go b/internal/resolver/cross_pkg_guard.go index 060651ed..e3c6c8ae 100644 --- a/internal/resolver/cross_pkg_guard.go +++ b/internal/resolver/cross_pkg_guard.go @@ -44,7 +44,13 @@ func (r *Resolver) guardCrossPackageCallEdges(jobs []reindexJob, closure map[str if len(jobs) == 0 { return 0 } - reverted := 0 + // Collect both mutation lists across the whole pass and apply them + // via the batched Store methods at the end. 
Per-edge + // SetEdgeProvenance + ReindexEdge in the body would otherwise pay + // two ACID round-trips per reverted edge against disk backends — + // catastrophic on a 30k-job pass. + var provBatch []graph.EdgeProvenanceUpdate + var reindexBatch []graph.EdgeReindex for i := range jobs { j := &jobs[i] if !isCallLikeEdge(j.kind) { @@ -71,7 +77,7 @@ func (r *Resolver) guardCrossPackageCallEdges(jobs []reindexJob, closure map[str continue } callerFile := r.edgeCallerFile(j.edge) - target := r.graph.GetNode(j.newTo) + target := r.cachedGetNode(j.newTo) if callerFile == "" || target == nil { continue } @@ -80,19 +86,24 @@ func (r *Resolver) guardCrossPackageCallEdges(jobs []reindexJob, closure map[str } // Not reachable — revert to the unresolved placeholder and // re-index against the resolved target we are abandoning. - // Drop the resolution provenance through SetEdgeProvenance so - // the reverted edge's identity change is counted; the logical - // key still carries the resolved target at this point, which - // is fine — SetEdgeProvenance keys the revision on Origin - // alone. The target revert + re-bucket follows. + // SetEdgeProvenance("") drops the resolution provenance so + // the reverted edge's identity change is counted; the target + // revert + re-bucket follows. Both go in their respective + // batches so the whole pass commits in two chunks instead of + // 2×N per-edge transactions. 
oldResolved := j.edge.To - r.graph.SetEdgeProvenance(j.edge, "") + provBatch = append(provBatch, graph.EdgeProvenanceUpdate{Edge: j.edge, NewOrigin: ""}) j.edge.To = j.oldTo j.edge.Confidence = 0 - r.graph.ReindexEdge(j.edge, oldResolved) - reverted++ + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: j.edge, OldTo: oldResolved}) } - return reverted + if len(provBatch) > 0 { + r.graph.SetEdgeProvenanceBatch(provBatch) + } + if len(reindexBatch) > 0 { + r.graph.ReindexEdges(reindexBatch) + } + return len(reindexBatch) } // isBareNameCallTarget reports whether an unresolved edge target is a @@ -127,8 +138,13 @@ func isCallLikeEdge(k graph.EdgeKind) bool { // edgeCallerFile returns the file path of the node that owns the edge's // From end. Empty when the caller node is unknown. +// +// Hot path: called once per cross-package-guarded edge. The pre-warmed +// per-pass cache populated in ResolveAll holds every From ID across the +// pending slice, so this call is a map lookup during a ResolveAll pass +// and a direct store call elsewhere. func (r *Resolver) edgeCallerFile(e *graph.Edge) string { - if n := r.graph.GetNode(e.From); n != nil && n.FilePath != "" { + if n := r.cachedGetNode(e.From); n != nil && n.FilePath != "" { return n.FilePath } return e.FilePath @@ -180,26 +196,49 @@ func (r *Resolver) buildImportClosure() map[string]map[string]struct{} { } set[dir] = struct{}{} } - for _, n := range r.graph.AllNodes() { - if n.Kind == graph.KindFile && n.FilePath != "" { + for n := range r.graph.NodesByKind(graph.KindFile) { + if n.FilePath != "" { add(n.FilePath, filepath.Dir(n.FilePath)) } } - for _, e := range r.graph.AllEdges() { - if e.Kind != graph.EdgeImports { - continue - } + // Materialise the resolved import edges and batch-load their endpoints + // (caller file + target) in one GetNodesByIDs — a per-edge GetNode here + // is a query round-trip per import on a disk backend. Inlines + // edgeCallerFile's cached-node logic against the batch map. 
+ var imports []*graph.Edge + ids := make(map[string]struct{}) + for e := range r.graph.EdgesByKind(graph.EdgeImports) { // Skip imports still pointing at an unresolved placeholder or an // out-of-repo stub — neither names an in-repo directory that a // name-only call candidate could legitimately live in. if strings.HasPrefix(e.To, unresolvedPrefix) || strings.HasPrefix(e.To, "external::") || - strings.HasPrefix(e.To, "stdlib::") || + graph.IsStdlibStub(e.To) || strings.HasPrefix(e.To, "dep::") { continue } - callerFile := r.edgeCallerFile(e) - if target := r.graph.GetNode(e.To); target != nil && target.FilePath != "" { + imports = append(imports, e) + if e.From != "" { + ids[e.From] = struct{}{} + } + if e.To != "" { + ids[e.To] = struct{}{} + } + } + if len(imports) == 0 { + return closure + } + idList := make([]string, 0, len(ids)) + for id := range ids { + idList = append(idList, id) + } + nodes := r.graph.GetNodesByIDs(idList) + for _, e := range imports { + callerFile := e.FilePath + if n := nodes[e.From]; n != nil && n.FilePath != "" { + callerFile = n.FilePath + } + if target := nodes[e.To]; target != nil && target.FilePath != "" { add(callerFile, filepath.Dir(target.FilePath)) } } diff --git a/internal/resolver/cross_repo.go b/internal/resolver/cross_repo.go index 87edf078..07a44361 100644 --- a/internal/resolver/cross_repo.go +++ b/internal/resolver/cross_repo.go @@ -2,9 +2,14 @@ package resolver import ( "path/filepath" + "runtime" "sort" "strings" "sync" + "sync/atomic" + "time" + + "go.uber.org/zap" "github.com/zzet/gortex/internal/graph" ) @@ -62,9 +67,21 @@ type CrossWorkspaceDepLookup func(sourceWorkspaceID string) []CrossWorkspaceDepR // the target workspace via `cross_workspace_deps` AND, for import // edges, the import path has a declared-module prefix. 
type CrossRepoResolver struct { - graph *graph.Graph - dirIndex map[string][]*graph.Node - lastDirIndex map[string][]*graph.Node + graph graph.Store + // nodeByID / nodesByName: per-pass batched lookup cache, the + // cross-repo mirror of the fields on Resolver (resolver.go). + // Populated by warmLookupCache before the per-edge fan-out and + // cleared on return; cachedGetNode / cachedFindNodesByName consult + // them first. Without it the cross-repo pass fires one + // GetNode/FindNodesByName query per pending edge — across 200k+ + // unresolved edges that is a warmup hang on disk backends. + logger *zap.Logger + nodeByID map[string]*graph.Node + nodesByName map[string][]*graph.Node + nodesByNameRepo map[string]map[string][]*graph.Node + nodesByQualName map[string]*graph.Node + dirIndex map[string][]*graph.Node + lastDirIndex map[string][]*graph.Node // reachableReposByFile maps a caller file's ID to the set of repo // prefixes that file imports (derived from resolved EdgeImports // edges). It is the import-reachability evidence gate: a name-only @@ -98,8 +115,17 @@ type CrossRepoResolver struct { } // NewCrossRepo creates a CrossRepoResolver for the given graph. -func NewCrossRepo(g *graph.Graph) *CrossRepoResolver { - return &CrossRepoResolver{graph: g, mu: g.ResolveMutex()} +func NewCrossRepo(g graph.Store) *CrossRepoResolver { + return &CrossRepoResolver{graph: g, mu: g.ResolveMutex(), logger: zap.NewNop()} +} + +// SetLogger attaches a logger so ResolveAll emits pass progress (the +// cross-repo mirror of Resolver.SetLogger). A nil logger becomes a no-op. +func (cr *CrossRepoResolver) SetLogger(l *zap.Logger) { + if l == nil { + l = zap.NewNop() + } + cr.logger = l } // SetCrossWorkspaceDepLookup wires the boundary rule. After this @@ -116,7 +142,7 @@ func (cr *CrossRepoResolver) SetCrossWorkspaceDepLookup(lookup CrossWorkspaceDep // an edge. Falls back to RepoPrefix to match Contract.Effective- // Workspace's "missing → repo-name" rule. 
func (cr *CrossRepoResolver) callerWorkspaceID(e *graph.Edge) string { - from := cr.graph.GetNode(e.From) + from := cr.cachedGetNode(e.From) if from == nil { return "" } @@ -187,12 +213,112 @@ func (cr *CrossRepoResolver) ResolveAll() *CrossRepoStats { stats := &CrossRepoStats{ByRepo: make(map[string]int)} - edges := cr.graph.AllEdges() - for _, e := range edges { - if !strings.HasPrefix(e.To, unresolvedPrefix) { + // Predicate-shaped read: disk backends only enumerate the + // "unresolved::*" slice (the only one this pass mutates). Batch + // mutations to commit in chunks at the end. + // Materialise the pending slice once so warmLookupCache can batch + // the per-edge GetNode / FindNodesByName the cascade would otherwise + // fire serially (the cross-repo warmup storm on disk backends). + var pending []*graph.Edge + for e := range cr.graph.EdgesWithUnresolvedTarget() { + pending = append(pending, e) + } + cr.warmLookupCache(pending) + defer cr.clearLookupCache() + + passStart := time.Now() + cr.logger.Info("cross-repo resolve: pass start", zap.Int("pending", len(pending))) + var processed atomic.Int64 + progressDone := make(chan struct{}) + go func() { + t := time.NewTicker(3 * time.Second) + defer t.Stop() + for { + select { + case <-progressDone: + return + case <-t.C: + cr.logger.Info("cross-repo resolve: compute progress", + zap.Int64("processed", processed.Load()), + zap.Int("pending", len(pending)), + zap.Duration("elapsed", time.Since(passStart))) + } + } + }() + + // Resolve concurrently across NumCPU workers, mirroring the master + // Resolver's pool. Each edge is touched by exactly one worker (disjoint + // chunks); the per-pass caches/indexes are read-only here; each worker + // accumulates into its OWN batch + stats — so no shared mutable state is + // written concurrently. Batches are concatenated and applied once after + // the barrier (cr never reindexes per-edge mid-loop, so unlike the + // master pool no edge clone is needed); stats are summed. 
+ workers := runtime.NumCPU() + // Clamp to the work count BEFORE flooring at 1: an empty pending slice + // must leave workers >= 1 so the chunk division below can't divide by + // zero. With workers == 1 and len(pending) == 0 the chunk is 0 and every + // worker's [start,end) is empty, so the pass is a correct no-op. + if workers > len(pending) { + workers = len(pending) + } + if workers < 1 { + workers = 1 + } + perWorkerBatch := make([][]graph.EdgeReindex, workers) + perWorkerStats := make([]*CrossRepoStats, workers) + var wg sync.WaitGroup + chunk := (len(pending) + workers - 1) / workers + for w := 0; w < workers; w++ { + start := w * chunk + end := start + chunk + if end > len(pending) { + end = len(pending) + } + if start >= end { + continue + } + wg.Add(1) + go func(idx int, slice []*graph.Edge) { + defer wg.Done() + ws := &CrossRepoStats{ByRepo: make(map[string]int)} + var batch []graph.EdgeReindex + for _, e := range slice { + cr.resolveEdge(e, ws, &batch) + processed.Add(1) + } + perWorkerStats[idx] = ws + perWorkerBatch[idx] = batch + }(w, pending[start:end]) + } + wg.Wait() + close(progressDone) + + var reindexBatch []graph.EdgeReindex + for i := range perWorkerBatch { + reindexBatch = append(reindexBatch, perWorkerBatch[i]...) 
+ } + for _, ws := range perWorkerStats { + if ws == nil { continue } - cr.resolveEdge(e, stats) + stats.Resolved += ws.Resolved + stats.Unresolved += ws.Unresolved + stats.CrossRepoEdges += ws.CrossRepoEdges + for repo, n := range ws.ByRepo { + stats.ByRepo[repo] += n + } + } + cr.logger.Info("cross-repo resolve: compute done", + zap.Int("pending", len(pending)), + zap.Int("reindex_batch", len(reindexBatch)), + zap.Int("workers", workers), + zap.Duration("elapsed", time.Since(passStart))) + if len(reindexBatch) > 0 { + applyStart := time.Now() + cr.graph.ReindexEdges(reindexBatch) + cr.logger.Info("cross-repo resolve: apply done", + zap.Int("edges", len(reindexBatch)), + zap.Duration("elapsed", time.Since(applyStart))) } // Materialise the cross_repo_* edge layer over the freshly lifted // calls / implements / extends edges. @@ -205,7 +331,60 @@ func (cr *CrossRepoResolver) ResolveAll() *CrossRepoStats { func (cr *CrossRepoResolver) ResolveForRepo(repoPrefix string) *CrossRepoStats { cr.mu.Lock() defer cr.mu.Unlock() + // One backend query for every out-edge from this repo's nodes, + // instead of GetRepoNodes followed by GetOutEdges per node. On + // disk backends (SQLite, DuckDB) the per-node loop + // was O(repo_nodes) round-trips per pass — single-digit minutes + // of warmup on a multi-repo workspace where this method runs + // once per tracked repo. + return cr.resolveScopedLocked(cr.graph.GetRepoEdges(repoPrefix)) +} +// ResolveForFile is the watcher fast path: it re-resolves only the +// out-edges of the changed file, not the whole repo. The watcher fires +// after every single-file save, and the old ResolveForRepo path +// materialised the repo's ENTIRE edge set (hundreds of thousands of +// edges, each with its meta blob) on every keystroke-save — the +// dominant per-edit allocation flood and the cause of the +// "buffer pool is full" crash on a small resident pool. 
Scoping to the +// changed file's edges turns that into a GetFileNodes lookup plus one +// batched GetOutEdgesByNodeIDs, bounded by the file's size. +// +// relPath must be the repo-relative graph key — callers convert an +// absolute watcher path via Indexer.RelKey first. A path matching no +// nodes is a no-op. +// +// Scope note: this resolves edges the changed file OWNS. A new +// definition in this file that would resolve some OTHER file's pending +// unresolved edge (inbound resolution) is not re-checked here — that +// case is rare, self-heals when the referencing file is next touched, +// and is swept up by the periodic full ResolveAll. ResolveForRepo +// remains for warmup / global recompute. +func (cr *CrossRepoResolver) ResolveForFile(repoPrefix, relPath string) *CrossRepoStats { + cr.mu.Lock() + defer cr.mu.Unlock() + nodes := cr.graph.GetFileNodes(relPath) + if len(nodes) == 0 { + return &CrossRepoStats{ByRepo: make(map[string]int)} + } + ids := make([]string, 0, len(nodes)) + for _, n := range nodes { + if n != nil { + ids = append(ids, n.ID) + } + } + var edges []*graph.Edge + for _, es := range cr.graph.GetOutEdgesByNodeIDs(ids) { + edges = append(edges, es...) + } + return cr.resolveScopedLocked(edges) +} + +// resolveScopedLocked lifts every unresolved target among edges to its +// real cross-repo node, then materialises the cross_repo_* parallel-edge +// layer. Shared by ResolveForRepo (whole-repo edge set) and +// ResolveForFile (one changed file's out-edges). Caller holds cr.mu. 
+func (cr *CrossRepoResolver) resolveScopedLocked(edges []*graph.Edge) *CrossRepoStats { cr.buildDirIndexes() defer cr.clearDirIndexes() cr.buildDepModuleIndex() @@ -214,16 +393,15 @@ func (cr *CrossRepoResolver) ResolveForRepo(repoPrefix string) *CrossRepoStats { defer cr.clearReachableReposIndex() stats := &CrossRepoStats{ByRepo: make(map[string]int)} - - nodes := cr.graph.GetRepoNodes(repoPrefix) - for _, n := range nodes { - edges := cr.graph.GetOutEdges(n.ID) - for _, e := range edges { - if !strings.HasPrefix(e.To, unresolvedPrefix) { - continue - } - cr.resolveEdge(e, stats) + var reindexBatch []graph.EdgeReindex + for _, e := range edges { + if e == nil || !strings.HasPrefix(e.To, unresolvedPrefix) { + continue } + cr.resolveEdge(e, stats, &reindexBatch) + } + if len(reindexBatch) > 0 { + cr.graph.ReindexEdges(reindexBatch) } // Materialise the cross_repo_* edge layer. The pass is graph-wide // (cheap relative to a resolve pass) so an edge into repoPrefix @@ -246,13 +424,9 @@ func (cr *CrossRepoResolver) ResolveForRepo(repoPrefix string) *CrossRepoStats { // These maps are torn down via clearDirIndexes when the pass completes // so we don't keep ~N pointers alive between resolves. func (cr *CrossRepoResolver) buildDirIndexes() { - nodes := cr.graph.AllNodes() - cr.dirIndex = make(map[string][]*graph.Node, len(nodes)/4) - cr.lastDirIndex = make(map[string][]*graph.Node, len(nodes)/4) - for _, n := range nodes { - if n.Kind != graph.KindFile { - continue - } + cr.dirIndex = make(map[string][]*graph.Node, 128) + cr.lastDirIndex = make(map[string][]*graph.Node, 128) + for n := range cr.graph.NodesByKind(graph.KindFile) { dir := filepath.Dir(n.FilePath) cr.dirIndex[dir] = append(cr.dirIndex[dir], n) last := lastPathComponent(dir) @@ -267,12 +441,8 @@ func (cr *CrossRepoResolver) buildDirIndexes() { // by callerRepo, so the same dep node reachable here is the one in the // importing file's own go.mod. 
func (cr *CrossRepoResolver) buildDepModuleIndex() { - nodes := cr.graph.AllNodes() by := make(map[string][]depModuleEntry) - for _, n := range nodes { - if n.Kind != graph.KindContract { - continue - } + for n := range cr.graph.NodesByKind(graph.KindContract) { if !strings.HasPrefix(n.ID, "dep::") { continue } @@ -324,13 +494,32 @@ func (cr *CrossRepoResolver) clearDirIndexes() { // graph is settled enough to be trustworthy evidence. func (cr *CrossRepoResolver) buildReachableReposIndex() { idx := make(map[string]map[string]struct{}) - for _, e := range cr.graph.AllEdges() { - if e.Kind != graph.EdgeImports { - continue + // Materialise the import edges and batch-load their targets in one + // GetNodesByIDs — a per-edge GetNode(e.To) here is a query round-trip + // per import on a disk backend, which under the cross-repo pass's + // import population was a multi-minute cold-warmup stall (it runs + // before the pass even logs "pass start"). + var imports []*graph.Edge + ids := make(map[string]struct{}) + for e := range cr.graph.EdgesByKind(graph.EdgeImports) { + imports = append(imports, e) + if e.To != "" { + ids[e.To] = struct{}{} } + } + if len(imports) == 0 { + cr.reachableReposByFile = idx + return + } + idList := make([]string, 0, len(ids)) + for id := range ids { + idList = append(idList, id) + } + nodes := cr.graph.GetNodesByIDs(idList) + for _, e := range imports { // Only resolved imports carry evidence — an unresolved import // target tells us nothing about which repo the caller reaches. - to := cr.graph.GetNode(e.To) + to := nodes[e.To] if to == nil || to.RepoPrefix == "" { continue } @@ -348,26 +537,26 @@ func (cr *CrossRepoResolver) clearReachableReposIndex() { cr.reachableReposByFile = nil } -// repoReachable reports whether the caller of edge e is allowed to -// resolve to a candidate in targetRepo. Empty targetRepo (synthetic / -// stdlib node) is never a repo boundary. A candidate in the caller's -// own repo is always reachable. 
A candidate in a *different* repo is -// reachable only when the caller's file has a resolved import edge into -// that repo — the import-reachability evidence gate that stops -// name-only matches from crossing a repo line on a coincidence. -func (cr *CrossRepoResolver) repoReachable(e *graph.Edge, targetRepo string) bool { - if targetRepo == "" { - return true - } - if targetRepo == cr.callerRepoPrefix(e) { - return true - } - repos := cr.reachableReposByFile[cr.callerFileID(e)] - if repos == nil { - return false +// reachabilityChecker returns a per-edge closure that reports whether the +// caller of e may reach a candidate in targetRepo. It captures the caller's +// repo + import-reachability set ONCE; the per-call repoReachable re-derived +// both via cachedGetNode on every candidate, so a common cross-repo name +// with thousands of candidates paid O(candidates) redundant cache lookups +// per edge — the bulk of cr's compute wall time. Same semantics as +// repoReachable; only the per-candidate cost differs. +func (cr *CrossRepoResolver) reachabilityChecker(e *graph.Edge) func(targetRepo string) bool { + callerRepo := cr.callerRepoPrefix(e) + reachableRepos := cr.reachableReposByFile[cr.callerFileID(e)] + return func(targetRepo string) bool { + if targetRepo == "" || targetRepo == callerRepo { + return true + } + if reachableRepos == nil { + return false + } + _, ok := reachableRepos[targetRepo] + return ok } - _, ok := repos[targetRepo] - return ok } // callerFileID returns the graph ID of the file that owns the edge's @@ -376,7 +565,7 @@ func (cr *CrossRepoResolver) repoReachable(e *graph.Edge, targetRepo string) boo // reachableReposByFile. Falls back to the edge's own FilePath when the // From node can't be resolved. 
func (cr *CrossRepoResolver) callerFileID(e *graph.Edge) string { - if from := cr.graph.GetNode(e.From); from != nil { + if from := cr.cachedGetNode(e.From); from != nil { if from.Kind == graph.KindFile { return from.ID } @@ -387,9 +576,247 @@ func (cr *CrossRepoResolver) callerFileID(e *graph.Edge) string { return e.FilePath } -func (cr *CrossRepoResolver) resolveEdge(e *graph.Edge, stats *CrossRepoStats) { +// resolveEdge dispatches one unresolved edge through the cross-repo +// resolution paths and, when the resolution lifted the To target, +// appends a re-bind job to batch instead of committing a per-edge +// ReindexEdge transaction. The caller flushes the accumulated batch +// after the whole pass via ReindexEdges so disk backends amortise +// the commit cost. +// warmLookupCache batches the per-edge GetNode / FindNodesByName the +// cross-repo worker loop would otherwise fire serially — the mirror of +// Resolver.warmLookupCache (resolver.go). It includes the authoritative +// negative: a queried name with no node records an empty result, so the +// 200k+ external-call stubs return from the cache instead of each +// scanning the unindexed name column (the warmup hang). +func (cr *CrossRepoResolver) warmLookupCache(pending []*graph.Edge) { + if len(pending) == 0 { + return + } + idSet := make(map[string]struct{}, len(pending)) + nameSet := make(map[string]struct{}, len(pending)) + qualNameSet := make(map[string]struct{}) + for _, e := range pending { + if e == nil { + continue + } + if e.From != "" { + idSet[e.From] = struct{}{} + } + bare := graph.UnresolvedName(e.To) + if name := identifierFromTarget(bare); name != "" { + nameSet[name] = struct{}{} + } + // Seed the RAW unresolved name too. This is pure scan-avoidance and + // changes no resolution outcome: the legit cross-repo matches use the + // bare identifier (seeded above) and resolve fine. 
The problem is the + // EXTERNAL / unresolvable residual that dominates this pass (stdlib + + // out-of-tree "calls" that never match a node): resolveFunctionCall + // looks them up by their full target (e.g. "extern::pkg::Foo"), which + // the stripped pre-warm key ("Foo") didn't cover, so they missed the + // cache and fell through to a per-edge FindNodesByName scan — the + // parallel cross-repo storm. Seeding the raw form lets them hit the + // authoritative negative instead of scanning. + if bare != "" { + nameSet[bare] = struct{}{} + } + // Import targets: mirror resolveEdge's dispatch (TrimPrefix of the + // bare unresolved:: form) so the seeded qual-name matches what + // resolveImport looks up via GetNodeByQualName. + if t := strings.TrimPrefix(e.To, unresolvedPrefix); strings.HasPrefix(t, "import::") { + if qn := strings.TrimPrefix(t, "import::"); qn != "" { + qualNameSet[qn] = struct{}{} + } + } + } + ids := make([]string, 0, len(idSet)) + for id := range idSet { + ids = append(ids, id) + } + names := make([]string, 0, len(nameSet)) + for n := range nameSet { + names = append(names, n) + } + cr.nodeByID = cr.graph.GetNodesByIDs(ids) + cr.nodesByName = cr.graph.FindNodesByNames(names) + // Authoritative negatives: record an empty result for every queried + // name that has no node, so the cached lookup returns empty instead + // of falling through to a per-edge FindNodesByName scan. + if cr.nodesByName == nil { + cr.nodesByName = make(map[string][]*graph.Node, len(nameSet)) + } + for n := range nameSet { + if _, ok := cr.nodesByName[n]; !ok { + cr.nodesByName[n] = nil + } + } + // Fold every candidate node into the id cache too, so a downstream + // GetNode on a chosen target hits instead of going to the store. 
+ if cr.nodeByID == nil && len(cr.nodesByName) > 0 { + cr.nodeByID = make(map[string]*graph.Node, len(cr.nodesByName)) + } + for _, hits := range cr.nodesByName { + for _, n := range hits { + if n == nil || n.ID == "" { + continue + } + if _, ok := cr.nodeByID[n.ID]; !ok { + cr.nodeByID[n.ID] = n + } + } + } + // Index the name hits by repo so resolveFunctionCall / resolveMethodCall + // collect ONLY the caller's reachable-repo, same-language candidates + // instead of fetching every same-named node across all repos + languages + // and discarding the unreachable majority per edge (the cross-repo + // candidate-iteration cost). Every pre-warmed name gets an entry (empty + // for an authoritative negative) so scopedCandidates can distinguish + // "pre-warmed, no node" (return empty) from "not pre-warmed" (fall + // through to the flat cache). + cr.nodesByNameRepo = make(map[string]map[string][]*graph.Node, len(cr.nodesByName)) + for name, hits := range cr.nodesByName { + byRepo := make(map[string][]*graph.Node) + for _, n := range hits { + if n == nil { + continue + } + byRepo[n.RepoPrefix] = append(byRepo[n.RepoPrefix], n) + } + cr.nodesByNameRepo[name] = byRepo + } + // Pre-warm the import qual-name cache + authoritative negatives, so + // resolveImport's GetNodeByQualName hits instead of scanning the + // unindexed qual_name column per cross-repo import edge. 
+ if len(qualNameSet) > 0 { + qns := make([]string, 0, len(qualNameSet)) + for q := range qualNameSet { + qns = append(qns, q) + } + cr.nodesByQualName = cr.graph.GetNodesByQualNames(qns) + if cr.nodesByQualName == nil { + cr.nodesByQualName = make(map[string]*graph.Node, len(qualNameSet)) + } + for q := range qualNameSet { + if _, ok := cr.nodesByQualName[q]; !ok { + cr.nodesByQualName[q] = nil + } + } + } +} + +func (cr *CrossRepoResolver) clearLookupCache() { + cr.nodeByID = nil + cr.nodesByName = nil + cr.nodesByNameRepo = nil + cr.nodesByQualName = nil +} + +// scopedCandidates returns the candidates named `name` the caller of e could +// plausibly resolve to: nodes in the caller's own repo, a repo its file +// imports (reachableReposByFile), or no repo (synthetic) — AND of the +// caller's language (a Go call can't bind a same-named TypeScript symbol). +// This applies the import + language prune at the SOURCE: cachedFindNodesByName +// returns every same-named node across all repos and languages (thousands for +// a common name), which the per-edge loops then iterate and discard; the +// per-pass name→repo index collects only the relevant few. Names absent from +// the index (not pre-warmed) fall through to the flat cache, preserving the +// negative-cache + correctness contract. 
+func (cr *CrossRepoResolver) scopedCandidates(e *graph.Edge, name string) []*graph.Node { + byRepo, ok := cr.nodesByNameRepo[name] + if !ok { + return cr.cachedFindNodesByName(name) + } + if len(byRepo) == 0 { + return nil // pre-warmed, no node (authoritative negative) + } + caller := cr.cachedGetNode(e.From) + callerRepo, callerLang, callerFile := "", "", e.FilePath + if caller != nil { + callerRepo = caller.RepoPrefix + callerLang = caller.Language + if caller.Kind == graph.KindFile { + callerFile = caller.ID + } else if caller.FilePath != "" { + callerFile = caller.FilePath + } + } + reachableRepos := cr.reachableReposByFile[callerFile] + var out []*graph.Node + keep := func(repo string) { + for _, n := range byRepo[repo] { + if callerLang == "" || n.Language == "" || n.Language == callerLang { + out = append(out, n) + } + } + } + keep(callerRepo) + if callerRepo != "" { + keep("") // synthetic / no-repo nodes are always reachable + } + for r := range reachableRepos { + if r != callerRepo && r != "" { + keep(r) + } + } + return out +} + +// cachedGetNode consults the per-pass id cache first, falling through to +// the store on a miss (positive-only: absence means "not pre-warmed"). +func (cr *CrossRepoResolver) cachedGetNode(id string) *graph.Node { + if id == "" { + return nil + } + if cr.nodeByID != nil { + if n, ok := cr.nodeByID[id]; ok { + return n + } + } + return cr.graph.GetNode(id) +} + +// cachedFindNodesByName consults the per-pass name cache first. A +// pre-warmed name with no node returns empty (authoritative negative); +// a name absent from the cache falls through to the store. 
+func (cr *CrossRepoResolver) cachedFindNodesByName(name string) []*graph.Node { + if name == "" { + return nil + } + if cr.nodesByName != nil { + if hits, ok := cr.nodesByName[name]; ok { + return hits + } + } + return cr.graph.FindNodesByName(name) +} + +// cachedGetNodeByQualName serves resolveImport's qual-name lookup from the +// per-pass cache (authoritative negative for queried-but-absent import +// paths), mirroring Resolver.cachedGetNodeByQualName. +func (cr *CrossRepoResolver) cachedGetNodeByQualName(qualName string) *graph.Node { + if qualName == "" { + return nil + } + if cr.nodesByQualName != nil { + if n, ok := cr.nodesByQualName[qualName]; ok { + return n + } + } + return cr.graph.GetNodeByQualName(qualName) +} + +func (cr *CrossRepoResolver) resolveEdge(e *graph.Edge, stats *CrossRepoStats, batch *[]graph.EdgeReindex) { oldTo := e.To - target := strings.TrimPrefix(e.To, unresolvedPrefix) + // UnresolvedName handles BOTH the bare `unresolved::X` and the + // multi-repo `::unresolved::X` forms; a plain TrimPrefix only + // strips the bare form, leaving prefixed stubs (which fix-1's widened + // EdgesWithUnresolvedTarget now feeds this pass) with target=full-id — + // so the lookup key matched no node and missed the per-pass name cache, + // turning every prefixed stub into a futile per-edge FindNodesByName + // scan. Mirrors the master Resolver.resolveEdge. + target := graph.UnresolvedName(e.To) + if target == "" { + target = strings.TrimPrefix(e.To, unresolvedPrefix) + } switch { case strings.HasPrefix(target, "import::"): @@ -410,13 +837,13 @@ func (cr *CrossRepoResolver) resolveEdge(e *graph.Edge, stats *CrossRepoStats) { } if e.To != oldTo { - cr.graph.ReindexEdge(e, oldTo) + *batch = append(*batch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) } } // callerRepoPrefix returns the RepoPrefix of the node that owns the edge's From field. 
func (cr *CrossRepoResolver) callerRepoPrefix(e *graph.Edge) string { - fromNode := cr.graph.GetNode(e.From) + fromNode := cr.cachedGetNode(e.From) if fromNode != nil { return fromNode.RepoPrefix } @@ -424,7 +851,7 @@ func (cr *CrossRepoResolver) callerRepoPrefix(e *graph.Edge) string { } func (cr *CrossRepoResolver) resolveFunctionCall(e *graph.Edge, funcName string, stats *CrossRepoStats) { - candidates := cr.graph.FindNodesByName(funcName) + candidates := cr.scopedCandidates(e, funcName) if len(candidates) == 0 { stats.Unresolved++ return @@ -432,6 +859,7 @@ func (cr *CrossRepoResolver) resolveFunctionCall(e *graph.Edge, funcName string, callerRepo := cr.callerRepoPrefix(e) callerWS := cr.callerWorkspaceID(e) + reachable := cr.reachabilityChecker(e) // 1. Prefer same-repo match. for _, c := range candidates { @@ -455,7 +883,7 @@ func (cr *CrossRepoResolver) resolveFunctionCall(e *graph.Edge, funcName string, if c.Kind != graph.KindFunction && c.Kind != graph.KindMethod { continue } - if !cr.repoReachable(e, c.RepoPrefix) { + if !reachable(c.RepoPrefix) { continue } if !cr.crossWorkspaceEligible(callerWS, candidateWorkspaceID(c), "") { @@ -483,7 +911,7 @@ func (cr *CrossRepoResolver) resolveImport(e *graph.Edge, importPath string, sta importPath, npmAliased := rewriteNpmAliasImport(cr.npmAlias, e.FilePath, importPath) // Look for a package node with matching qualified name. 
- node := cr.graph.GetNodeByQualName(importPath) + node := cr.cachedGetNodeByQualName(importPath) if node != nil { // Workspace boundary check: if the candidate is in a // different workspace, allow only when an explicit @@ -563,10 +991,7 @@ func (cr *CrossRepoResolver) resolveImport(e *graph.Edge, importPath string, sta } } } else { - for _, n := range cr.graph.AllNodes() { - if n.Kind != graph.KindFile { - continue - } + for n := range cr.graph.NodesByKind(graph.KindFile) { dir := filepath.Dir(n.FilePath) if strings.HasSuffix(dir, lastPathComponent(importPath)) || dir == importPath { consider(n) @@ -616,7 +1041,7 @@ func (cr *CrossRepoResolver) resolveImport(e *graph.Edge, importPath string, sta // package node itself. See Resolver.resolveImport. if npmAliased { if pkg := npmPackagePrefix(importPath); pkg != "" { - if node := cr.graph.GetNodeByQualName(pkg); node != nil && + if node := cr.cachedGetNodeByQualName(pkg); node != nil && cr.crossWorkspaceEligible(callerWS, candidateWorkspaceID(node), pkg) { e.To = node.ID if node.RepoPrefix != callerRepo { @@ -636,7 +1061,7 @@ func (cr *CrossRepoResolver) resolveImport(e *graph.Edge, importPath string, sta } func (cr *CrossRepoResolver) resolveMethodCall(e *graph.Edge, methodName string, stats *CrossRepoStats) { - candidates := cr.graph.FindNodesByName(methodName) + candidates := cr.scopedCandidates(e, methodName) if len(candidates) == 0 { stats.Unresolved++ return @@ -645,6 +1070,7 @@ func (cr *CrossRepoResolver) resolveMethodCall(e *graph.Edge, methodName string, callerRepo := cr.callerRepoPrefix(e) callerWS := cr.callerWorkspaceID(e) receiverType := edgeReceiverType(e) + reachable := cr.reachabilityChecker(e) // If we have a type hint, try exact type match first. 
if receiverType != "" { @@ -665,7 +1091,7 @@ func (cr *CrossRepoResolver) resolveMethodCall(e *graph.Edge, methodName string, if c.Kind != graph.KindMethod || nodeReceiverType(c) != receiverType { continue } - if !cr.repoReachable(e, c.RepoPrefix) { + if !reachable(c.RepoPrefix) { continue } if !cr.crossWorkspaceEligible(callerWS, candidateWorkspaceID(c), "") { @@ -693,7 +1119,7 @@ func (cr *CrossRepoResolver) resolveMethodCall(e *graph.Edge, methodName string, if c.Kind != graph.KindMethod { continue } - if !cr.repoReachable(e, c.RepoPrefix) { + if !reachable(c.RepoPrefix) { continue } if !cr.crossWorkspaceEligible(callerWS, candidateWorkspaceID(c), "") { @@ -717,7 +1143,7 @@ func (cr *CrossRepoResolver) resolveMethodCall(e *graph.Edge, methodName string, if c.Kind != graph.KindFunction { continue } - if !cr.repoReachable(e, c.RepoPrefix) { + if !reachable(c.RepoPrefix) { continue } if !cr.crossWorkspaceEligible(callerWS, candidateWorkspaceID(c), "") { diff --git a/internal/resolver/cross_repo_edges.go b/internal/resolver/cross_repo_edges.go index aafaedcd..206bed30 100644 --- a/internal/resolver/cross_repo_edges.go +++ b/internal/resolver/cross_repo_edges.go @@ -25,12 +25,13 @@ import "github.com/zzet/gortex/internal/graph" // // Returns the count of cross-repo relationships found this pass — the // number of parallel edges that exist after it, modulo graph dedup. -func DetectCrossRepoEdges(g *graph.Graph) int { +func DetectCrossRepoEdges(g graph.Store) int { if g == nil { return 0 } emitted := 0 - for _, e := range g.AllEdges() { + for _, row := range crossRepoCandidates(g) { + e := row.Edge if e == nil { continue } @@ -38,21 +39,6 @@ func DetectCrossRepoEdges(g *graph.Graph) int { if !ok { continue } - from := g.GetNode(e.From) - to := g.GetNode(e.To) - if from == nil || to == nil { - // Unresolved / external / stdlib / dep stub targets never - // have a graph node — they cannot be cross-repo. 
- continue - } - if from.RepoPrefix == "" || to.RepoPrefix == "" { - // Single-repo graph (no prefixes) — nothing crosses a - // boundary. Also covers a node whose repo wasn't stamped. - continue - } - if from.RepoPrefix == to.RepoPrefix { - continue - } // Keep the bool flag on the base edge consistent with the // dedicated kind — existing consumers (smart_context's // cross_repo_dependencies, the Cypher / GraphML exporters) read @@ -71,11 +57,62 @@ func DetectCrossRepoEdges(g *graph.Graph) int { CrossRepo: true, Meta: map[string]any{ "base_kind": string(e.Kind), - "source_repo": from.RepoPrefix, - "target_repo": to.RepoPrefix, + "source_repo": row.FromRepo, + "target_repo": row.ToRepo, }, }) emitted++ } return emitted } + +// crossRepoCandidates returns every edge whose Kind has a parallel +// cross_repo_* kind AND whose endpoints carry two distinct, non-empty +// RepoPrefix values. Routed through the storage layer's +// CrossRepoCandidates capability when the backend implements it (one +// query — a join with the kind + repo-prefix filters in WHERE); falls +// back to the AllEdges + per-edge GetNode walk otherwise. +// +// The base-kind set is derived from graph.CrossRepoKindFor by +// iterating the in-process registry — the disk backend uses the same +// kind list verbatim so single-repo graphs return no rows without a +// whole-table scan. 
+func crossRepoCandidates(g graph.Store) []graph.CrossRepoCandidateRow { + baseKinds := graph.BaseKindsForCrossRepo() + if cap, ok := g.(graph.CrossRepoCandidates); ok { + return cap.CrossRepoCandidates(baseKinds) + } + if len(baseKinds) == 0 { + return nil + } + kset := make(map[graph.EdgeKind]struct{}, len(baseKinds)) + for _, k := range baseKinds { + kset[k] = struct{}{} + } + var out []graph.CrossRepoCandidateRow + for _, e := range g.AllEdges() { + if e == nil { + continue + } + if _, ok := kset[e.Kind]; !ok { + continue + } + from := g.GetNode(e.From) + to := g.GetNode(e.To) + if from == nil || to == nil { + continue + } + if from.RepoPrefix == "" || to.RepoPrefix == "" { + continue + } + if from.RepoPrefix == to.RepoPrefix { + continue + } + out = append(out, graph.CrossRepoCandidateRow{ + Edge: e, + FromRepo: from.RepoPrefix, + ToRepo: to.RepoPrefix, + }) + } + return out +} diff --git a/internal/resolver/cross_repo_edges_test.go b/internal/resolver/cross_repo_edges_test.go index 51e7961d..fac8519c 100644 --- a/internal/resolver/cross_repo_edges_test.go +++ b/internal/resolver/cross_repo_edges_test.go @@ -9,7 +9,7 @@ import ( // countOutEdgesByKind returns how many out-edges of the given kind the // node fromID has. -func countOutEdgesByKind(g *graph.Graph, fromID string, kind graph.EdgeKind) int { +func countOutEdgesByKind(g graph.Store, fromID string, kind graph.EdgeKind) int { n := 0 for _, e := range g.GetOutEdges(fromID) { if e.Kind == kind { @@ -21,7 +21,7 @@ func countOutEdgesByKind(g *graph.Graph, fromID string, kind graph.EdgeKind) int // firstOutEdgeByKind returns the first out-edge of fromID with the given // kind, or nil. 
-func firstOutEdgeByKind(g *graph.Graph, fromID string, kind graph.EdgeKind) *graph.Edge { +func firstOutEdgeByKind(g graph.Store, fromID string, kind graph.EdgeKind) *graph.Edge { for _, e := range g.GetOutEdges(fromID) { if e.Kind == kind { return e diff --git a/internal/resolver/cross_repo_test.go b/internal/resolver/cross_repo_test.go index cba906ff..b4d3407a 100644 --- a/internal/resolver/cross_repo_test.go +++ b/internal/resolver/cross_repo_test.go @@ -18,7 +18,7 @@ import ( // without it, a bare name like `Helper` could land on any repo that // happens to define a `Helper`, which is the exact name-collision // false-positive class this guards against. -func wireImport(g *graph.Graph, callerFile, targetRepo, targetFile string) { +func wireImport(g graph.Store, callerFile, targetRepo, targetFile string) { g.AddNode(&graph.Node{ ID: targetFile, Kind: graph.KindFile, Name: targetFile, FilePath: targetFile, Language: "go", RepoPrefix: targetRepo, diff --git a/internal/resolver/dep_module_test.go b/internal/resolver/dep_module_test.go index 54cc998f..511be7d2 100644 --- a/internal/resolver/dep_module_test.go +++ b/internal/resolver/dep_module_test.go @@ -10,7 +10,7 @@ import ( // addDepNode is a tiny helper to materialise a dep:: contract // node the way GoModExtractor + commitInlinedContractToGraph would. 
-func addDepNode(t *testing.T, g *graph.Graph, repoPrefix, modulePath string) { +func addDepNode(t *testing.T, g graph.Store, repoPrefix, modulePath string) { t.Helper() g.AddNode(&graph.Node{ ID: "dep::" + modulePath, diff --git a/internal/resolver/external_call_attribution.go b/internal/resolver/external_call_attribution.go new file mode 100644 index 00000000..9df41482 --- /dev/null +++ b/internal/resolver/external_call_attribution.go @@ -0,0 +1,221 @@ +package resolver + +import ( + "path" + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// attributeGoExternalCalls materialises a KindFunction node for every +// unique `stdlib::::` / `dep::::` +// / `external::::` edge target, plus a KindModule +// parent for each owning import path. Without this pass the targets +// are stubs in storage backends that enforce rel-table FK (the on-disk backend) +// and invisible nodes in the in-memory backend, so a query like +// `find_usages(stdlib::encoding/json::Marshal)` +// can't surface "every function in this codebase that calls +// json.Marshal" — the destination doesn't exist as a graph node. +// +// Mirrors the Python / Dart attributeNonGoModuleImports pass for Go. +// Runs after resolveExtern (which classifies extern targets into the +// three prefix buckets) so we materialise the post-classification +// state rather than the pre-classification `unresolved::extern::*` +// shape. +// +// ID conventions: +// - Module node: `module::go:` — shared across every +// repo that imports the same path. Carries +// Meta["ecosystem"]="go" and Meta["import_path"]=. +// Meta["role"]="stdlib" for stdlib paths. +// - Symbol node: the original `stdlib::*` / `dep::*` / +// `external::*` ID stays the symbol's ID so existing edges land +// on it without rewriting. Carries Meta["external"]=true and +// Meta["module_path"]=. +// - EdgeMemberOf: symbol → module so `get_callers` on the module +// surfaces every symbol used from that package. 
+// +// All AddNode / AddEdge calls are idempotent on ID, so a second run +// of this pass (incremental ResolveFile re-invocation) is a no-op. +func (r *Resolver) attributeGoExternalCalls() { + // Go-only pass: skip the external-prefix edge scan when the graph has + // no Go nodes. + if !r.graphHasLanguage("go") { + return + } + // Scan every edge whose target sits in one of the three external + // prefixes. Collect unique (repoPrefix, prefix, importPath, symbol) + // tuples so we materialise each one once even when many edges + // reference the same target. repoPrefix is included because + // stdlib stubs are per-repo (see internal/graph/stub.go) — two + // repos on different Go SDK versions emit semantically distinct + // `::stdlib::fmt::Errorf` and `::stdlib::fmt::Errorf` + // stubs that MUST round-trip through this attribution pass as + // distinct nodes, not collide into one. + type extKey struct { + repoPrefix, prefix, importPath, symbol string + } + seen := map[extKey]struct{}{} + depEdgesScan := func(kind graph.EdgeKind) { + for e := range r.graph.EdgesByKind(kind) { + if e.To == "" { + continue + } + prefix, importPath, symbol := splitGoExternalTarget(e.To) + if prefix == "" { + continue + } + seen[extKey{graph.StubRepoPrefix(e.To), prefix, importPath, symbol}] = struct{}{} + } + } + // Same edge-kind set as attributeGoBuiltins — anywhere an + // extern-prefixed target can show up. + for _, k := range []graph.EdgeKind{ + graph.EdgeCalls, + graph.EdgeReferences, + graph.EdgeReads, + graph.EdgeArgOf, + graph.EdgeValueFlow, + graph.EdgeReturnsTo, + graph.EdgeTypedAs, + graph.EdgeReturns, + graph.EdgeInstantiates, + graph.EdgeCaptures, + graph.EdgeThrows, + } { + depEdgesScan(k) + } + if len(seen) == 0 { + return + } + + // Materialise the parent KindModule for each unique import path, + // then the per-symbol KindFunction. Module-side dedupe is via + // the `modules` map; the per-symbol nodes are unique by (prefix, + // path, symbol) by construction. 
+ // Module IDs are also per-repo now — a module node carries the + // same SDK-version sensitivity its symbols do. Key includes the + // repo prefix so two repos importing the same path get distinct + // module nodes. + type modKey struct{ repoPrefix, importPath string } + modules := map[modKey]string{} + for k := range seen { + modKey := modKey{repoPrefix: k.repoPrefix, importPath: k.importPath} + moduleID, ok := modules[modKey] + if !ok { + // Ecosystem + path are ONE stub segment joined by a single + // colon (`go:`), matching the npm convention + // (`module::npm:`) and every module-id consumer + // (tools_analyze_external_calls). Passing them as two + // StubID parts would emit `module::go::` (double + // colon) — the form that broke the attribution tests. + moduleID = graph.StubID(k.repoPrefix, graph.StubKindModule, "go:"+k.importPath) + modules[modKey] = moduleID + role := "external" + switch k.prefix { + case "stdlib::": + role = "stdlib" + case "dep::": + role = "dep" + } + r.graph.AddNode(&graph.Node{ + ID: moduleID, + Kind: graph.KindModule, + Name: lastImportSegment(k.importPath), + Language: "go", + Meta: map[string]any{ + "ecosystem": "go", + "role": role, + "import_path": k.importPath, + }, + }) + } + var symbolID string + switch k.prefix { + case "stdlib::": + symbolID = graph.StubID(k.repoPrefix, graph.StubKindStdlib, k.importPath, k.symbol) + default: + // dep:: / external:: keep their legacy unprefixed form for + // now — they aren't covered by the stub-prefix migration + // (different module paths already provide repo-level + // distinction; same version pinning is enforced by go.mod + // per-repo). 
+ symbolID = k.prefix + k.importPath + "::" + k.symbol + } + r.graph.AddNode(&graph.Node{ + ID: symbolID, + Kind: graph.KindFunction, + Name: k.symbol, + Language: "go", + Meta: map[string]any{ + "external": true, + "module_path": k.importPath, + "module_role": map[string]string{ + "stdlib::": "stdlib", + "dep::": "dep", + "external::": "external", + }[k.prefix], + }, + }) + // EdgeMemberOf: symbol → module. AddEdge is idempotent on the + // edge-key tuple so a re-run doesn't duplicate. + r.graph.AddEdge(&graph.Edge{ + From: symbolID, + To: moduleID, + Kind: graph.EdgeMemberOf, + Origin: graph.OriginASTResolved, + }) + } +} + +// splitGoExternalTarget recognises the three external-target prefixes +// the resolver emits after resolveExtern. Returns the prefix +// (`stdlib::` / `dep::` / `external::`), the import path, and the +// symbol name. Returns ("", "", "") for any other shape so the pass +// can skip it cleanly. +// +// The stdlib case is matched via graph.IsStdlibStub so both the +// legacy `stdlib::fmt::Errorf` shape and the per-repo-prefixed +// `::stdlib::fmt::Errorf` shape (see internal/graph/stub.go) +// route the same way. The returned bucket label stays `stdlib::` for +// downstream `k.prefix == "stdlib::"` comparisons. +func splitGoExternalTarget(target string) (prefix, importPath, symbol string) { + var body string + switch { + case graph.IsStdlibStub(target): + prefix = "stdlib::" + body = graph.StubRest(target) + case strings.HasPrefix(target, "dep::"): + prefix = "dep::" + body = strings.TrimPrefix(target, prefix) + case strings.HasPrefix(target, "external::"): + prefix = "external::" + body = strings.TrimPrefix(target, prefix) + default: + return "", "", "" + } + // The body shape produced by resolveExtern is + // `::`. Split on the LAST `::` because import + // paths can include slashes but not `::`, so the rightmost + // separator is always between path and symbol. 
+ sep := strings.LastIndex(body, "::") + if sep < 0 { + // `external::os` style (just the package, no symbol — + // the resolveImport path). Treat the whole body as the path + // and leave symbol empty so we still materialise the module + // node but skip the symbol. + return prefix, body, "" + } + return prefix, body[:sep], body[sep+2:] +} + +// lastImportSegment returns the rightmost path component, used as +// the human-readable Name on the KindModule node. For +// `github.com/stretchr/testify/assert` the segment is `assert`; for +// `encoding/json` it's `json`; for `fmt` it's `fmt`. +func lastImportSegment(importPath string) string { + if importPath == "" { + return "" + } + return path.Base(importPath) +} diff --git a/internal/resolver/external_call_attribution_test.go b/internal/resolver/external_call_attribution_test.go new file mode 100644 index 00000000..473722fd --- /dev/null +++ b/internal/resolver/external_call_attribution_test.go @@ -0,0 +1,141 @@ +package resolver + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +func TestAttributeGoExternalCalls_StdlibFunctionMaterialised(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + // Post-resolveExtern shape: an edge directly to stdlib::fmt::Sprintf. + edge := &graph.Edge{From: owner, To: "stdlib::fmt::Sprintf", Kind: graph.EdgeCalls, FilePath: "pkg/foo.go", Line: 5} + g.AddEdge(edge) + + New(g).attributeGoExternalCalls() + + // The symbol becomes a KindFunction with the right metadata. 
+ sym := g.GetNode("stdlib::fmt::Sprintf") + require.NotNil(t, sym, "stdlib symbol must be materialised as a node") + assert.Equal(t, graph.KindFunction, sym.Kind) + assert.Equal(t, "Sprintf", sym.Name) + assert.Equal(t, "go", sym.Language) + assert.Equal(t, true, sym.Meta["external"]) + assert.Equal(t, "fmt", sym.Meta["module_path"]) + assert.Equal(t, "stdlib", sym.Meta["module_role"]) + + // And a KindModule parent under module::go:fmt with role=stdlib. + mod := g.GetNode("module::go:fmt") + require.NotNil(t, mod, "module parent must be materialised") + assert.Equal(t, graph.KindModule, mod.Kind) + assert.Equal(t, "fmt", mod.Name) + assert.Equal(t, "stdlib", mod.Meta["role"]) + assert.Equal(t, "go", mod.Meta["ecosystem"]) + + // EdgeMemberOf: symbol -> module. + var foundLink bool + for e := range g.EdgesByKind(graph.EdgeMemberOf) { + if e.From == "stdlib::fmt::Sprintf" && e.To == "module::go:fmt" { + foundLink = true + } + } + assert.True(t, foundLink, "symbol must be linked to its module via EdgeMemberOf") +} + +func TestAttributeGoExternalCalls_DepUsesFullImportPath(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + g.AddEdge(&graph.Edge{From: owner, To: "dep::github.com/stretchr/testify/assert::True", Kind: graph.EdgeCalls, FilePath: "pkg/foo.go", Line: 7}) + + New(g).attributeGoExternalCalls() + + sym := g.GetNode("dep::github.com/stretchr/testify/assert::True") + require.NotNil(t, sym) + assert.Equal(t, "True", sym.Name) + assert.Equal(t, "github.com/stretchr/testify/assert", sym.Meta["module_path"]) + assert.Equal(t, "dep", sym.Meta["module_role"]) + + mod := g.GetNode("module::go:github.com/stretchr/testify/assert") + require.NotNil(t, mod) + assert.Equal(t, "assert", mod.Name, "module name must be the last path segment, not the full import path") + assert.Equal(t, "dep", mod.Meta["role"]) +} + +func 
TestAttributeGoExternalCalls_ModuleNodeSharedAcrossSymbols(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + // Three different functions from the same stdlib package — all + // should attach to ONE module node, not three. + for _, sym := range []string{"Marshal", "Unmarshal", "RawMessage"} { + g.AddEdge(&graph.Edge{ + From: owner, To: "stdlib::encoding/json::" + sym, + Kind: graph.EdgeCalls, FilePath: "pkg/foo.go", Line: 1, + }) + } + + New(g).attributeGoExternalCalls() + + count := 0 + for n := range g.NodesByKind(graph.KindModule) { + if n.ID == "module::go:encoding/json" { + count++ + } + } + assert.Equal(t, 1, count, "exactly one KindModule per import path") +} + +func TestAttributeGoExternalCalls_IdempotentOnRerun(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + g.AddEdge(&graph.Edge{From: owner, To: "stdlib::os::Open", Kind: graph.EdgeCalls, FilePath: "pkg/foo.go", Line: 1}) + + r := New(g) + r.attributeGoExternalCalls() + r.attributeGoExternalCalls() // second run must not duplicate + + syms := 0 + for n := range g.NodesByKind(graph.KindFunction) { + if n.ID == "stdlib::os::Open" { + syms++ + } + } + assert.Equal(t, 1, syms, "second pass must not duplicate the symbol node") + + memberEdges := 0 + for e := range g.EdgesByKind(graph.EdgeMemberOf) { + if e.From == "stdlib::os::Open" && e.To == "module::go:os" { + memberEdges++ + } + } + assert.Equal(t, 1, memberEdges, "second pass must not duplicate the membership edge") +} + +func TestAttributeGoExternalCalls_NonExternEdgesIgnored(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + // Real intra-repo call — must not be touched. 
+ g.AddNode(&graph.Node{ID: "pkg/bar.go::Helper", Kind: graph.KindFunction, Name: "Helper", FilePath: "pkg/bar.go", Language: "go"}) + g.AddEdge(&graph.Edge{From: owner, To: "pkg/bar.go::Helper", Kind: graph.EdgeCalls, FilePath: "pkg/foo.go", Line: 1}) + // And an unresolved bare name — also not in scope for this pass. + g.AddEdge(&graph.Edge{From: owner, To: "unresolved::doSomething", Kind: graph.EdgeCalls, FilePath: "pkg/foo.go", Line: 2}) + + before := []string{} + for n := range g.NodesByKind(graph.KindModule) { + before = append(before, n.ID) + } + New(g).attributeGoExternalCalls() + after := []string{} + for n := range g.NodesByKind(graph.KindModule) { + after = append(after, n.ID) + } + assert.Equal(t, before, after, "no module nodes should be created when there are no extern-prefixed targets") +} diff --git a/internal/resolver/external_calls.go b/internal/resolver/external_calls.go index d776c8e5..91c81a4e 100644 --- a/internal/resolver/external_calls.go +++ b/internal/resolver/external_calls.go @@ -67,7 +67,7 @@ const externalCallPrefix = "external-call::" // the external hop visible. Enabled is the opt-in gate // (`.gortex.yaml::index::synthesize_external_calls`); when false the // pass is a no-op and the graph is untouched. -func SynthesizeExternalCalls(g *graph.Graph, enabled bool) int { +func SynthesizeExternalCalls(g graph.Store, enabled bool) int { if g == nil || !enabled { return 0 } @@ -81,8 +81,20 @@ func SynthesizeExternalCalls(g *graph.Graph, enabled bool) int { defer mu.Unlock() synthesized := 0 - for _, e := range g.AllEdges() { - if e == nil || !isCallLikeEdge(e.Kind) { + var reindexBatch []graph.EdgeReindex + // First sweep: collect every candidate edge and the From IDs we'll + // need to read Language off. Narrow to the call-like edge kinds + // server-side via EdgesByKinds — AllEdges scanned the whole bucket + // just to filter Kind Go-side. 
+ type candidate struct { + edge *graph.Edge + ecosystem, importPath string + } + var candidates []candidate + fromIDSet := map[string]struct{}{} + callKinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} + for e := range edgesByKinds(g, callKinds) { + if e == nil { continue } // Already pointing at a synthetic node — a prior run of this @@ -97,17 +109,35 @@ func SynthesizeExternalCalls(g *graph.Graph, enabled bool) int { if !ok { continue } - callerLang := edgeCallerLanguage(g, e) - if isLanguageStdlib(callerLang, importPath) { + candidates = append(candidates, candidate{edge: e, ecosystem: ecosystem, importPath: importPath}) + if e.From != "" { + fromIDSet[e.From] = struct{}{} + } + } + fromList := make([]string, 0, len(fromIDSet)) + for id := range fromIDSet { + fromList = append(fromList, id) + } + callerNodes := g.GetNodesByIDs(fromList) + + for _, c := range candidates { + e := c.edge + callerLang := "" + if from := callerNodes[e.From]; from != nil && from.Language != "" { + callerLang = from.Language + } else { + callerLang = langFamilyFromExt(e.FilePath) + } + if isLanguageStdlib(callerLang, c.importPath) { // Language built-in / standard library — noise. Leave the // edge on its bookkeeping-string terminal; a stdlib hop is // not a cross-system call worth a call-chain node. 
continue } - nodeID := externalCallNodeID(ecosystem, importPath) + nodeID := externalCallNodeID(c.ecosystem, c.importPath) if g.GetNode(nodeID) == nil { - g.AddNode(newExternalCallNode(nodeID, ecosystem, importPath, callerLang)) + g.AddNode(newExternalCallNode(nodeID, c.ecosystem, c.importPath, callerLang)) } oldTo := e.To @@ -124,9 +154,12 @@ func SynthesizeExternalCalls(g *graph.Graph, enabled bool) int { e.Meta = map[string]any{} } e.Meta["external_call"] = true - g.ReindexEdge(e, oldTo) + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) synthesized++ } + if len(reindexBatch) > 0 { + g.ReindexEdges(reindexBatch) + } return synthesized } @@ -155,8 +188,11 @@ func parseExternalCallTarget(target string) (ecosystem, importPath string, ok bo return "", "", false } return "dep", path, true - case strings.HasPrefix(target, "stdlib::"): - path := importPathOfExtern(strings.TrimPrefix(target, "stdlib::")) + case graph.IsStdlibStub(target): + // Handles both legacy `stdlib::::` and the + // per-repo-prefixed `::stdlib::::` shape + // (see internal/graph/stub.go). + path := importPathOfExtern(graph.StubRest(target)) if path == "" { return "", "", false } @@ -218,16 +254,6 @@ func newExternalCallNode(nodeID, ecosystem, importPath, callerLang string) *grap } } -// edgeCallerLanguage returns the source language of the node that owns -// the call edge's From end, falling back to the file extension of the -// edge's own FilePath when the caller node carries no Language. -func edgeCallerLanguage(g *graph.Graph, e *graph.Edge) string { - if from := g.GetNode(e.From); from != nil && from.Language != "" { - return from.Language - } - return langFamilyFromExt(e.FilePath) -} - // langFamilyFromExt maps a file extension to the coarse language label // stored on graph nodes. 
Distinct from builtins.go::langFromFilePath, // which collapses ts→ts/js→js for the built-in method tables; here we diff --git a/internal/resolver/external_calls_test.go b/internal/resolver/external_calls_test.go index f4afcd33..7af3d4d9 100644 --- a/internal/resolver/external_calls_test.go +++ b/internal/resolver/external_calls_test.go @@ -17,7 +17,7 @@ import ( // builder spans every ecosystem the external-call synthesis pass // classifies, so one table can exercise Go modules, pip packages, and // npm packages through the same real extract → resolve pipeline. -func buildMultiLangGraph(t *testing.T, files map[string]string) *graph.Graph { +func buildMultiLangGraph(t *testing.T, files map[string]string) graph.Store { t.Helper() g := graph.New() for path, src := range files { @@ -58,7 +58,7 @@ func buildMultiLangGraph(t *testing.T, files map[string]string) *graph.Graph { // with — and then the opt-in external-call synthesis pass. It mirrors // the indexer settle point: synthesis runs strictly after resolution + // guard, so the test exercises the same ordering the daemon uses. -func resolveAndSynthesize(g *graph.Graph, enabled bool) int { +func resolveAndSynthesize(g graph.Store, enabled bool) int { New(g).ResolveAll() return SynthesizeExternalCalls(g, enabled) } @@ -66,7 +66,7 @@ func resolveAndSynthesize(g *graph.Graph, enabled bool) int { // callTargetsFrom collects the To-end of every call/reference edge // leaving fromID, so a test can assert on the post-resolution shape of // a caller's outbound calls. 
-func callTargetsFrom(g *graph.Graph, fromID string) []string { +func callTargetsFrom(g graph.Store, fromID string) []string { var out []string for _, e := range g.GetOutEdges(fromID) { if e.Kind == graph.EdgeCalls || e.Kind == graph.EdgeReferences { diff --git a/internal/resolver/generic_param_bind.go b/internal/resolver/generic_param_bind.go new file mode 100644 index 00000000..18d8e0e9 --- /dev/null +++ b/internal/resolver/generic_param_bind.go @@ -0,0 +1,99 @@ +package resolver + +import ( + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// bindGenericParamRefs rewrites `unresolved::` edges where the +// name is a generic type parameter declared by the source's +// enclosing function. The Go extractor already materialises +// KindGenericParam nodes with IDs `#tparam:` and an +// EdgeMemberOf back to the owner — the resolver just hasn't been +// consulting them when an in-body reference (`var x T`, return type +// `T`, etc.) lands as `unresolved::T`. +// +// Side benefit beyond stub reduction: `find_usages` on a generic +// type parameter starts working — *"where in this generic function +// is T used?"* — which is a real refactoring query. +// +// Scope is per-function: a function's tparams are visible only +// inside its body. The owner-keyed index built here lets each edge +// resolve in O(1) without re-walking the graph. +func (r *Resolver) bindGenericParamRefs() { + // owner-function ID → set of tparam-name → tparam-node-id. + owned := map[string]map[string]string{} + for n := range r.graph.NodesByKind(graph.KindGenericParam) { + if n.Language != "go" || n.Name == "" { + continue + } + owner := enclosingFunctionForBinding(n.ID) + if owner == "" || owner == n.ID { + continue + } + set, ok := owned[owner] + if !ok { + set = map[string]string{} + owned[owner] = set + } + // Don't overwrite — two tparams with the same name in the + // same function shouldn't happen in valid Go, but be defensive. 
+ if _, dup := set[n.Name]; dup { + set[n.Name] = "" + continue + } + set[n.Name] = n.ID + } + if len(owned) == 0 { + return + } + + var batch []graph.EdgeReindex + // We don't know up front which edge kinds carry type-param refs: + // EdgeReferences for `var x T`, EdgeTypedAs for parameters typed + // as T, EdgeReturns for return signature, EdgeInstantiates for + // generic instantiation expressions. Walk the union. + for _, k := range []graph.EdgeKind{ + graph.EdgeReferences, + graph.EdgeTypedAs, + graph.EdgeReturns, + graph.EdgeInstantiates, + } { + for e := range r.graph.EdgesByKind(k) { + if old := r.tryBindGenericParam(e, owned); old != "" { + batch = append(batch, graph.EdgeReindex{Edge: e, OldTo: old}) + } + } + } + if len(batch) > 0 { + r.graph.ReindexEdges(batch) + } +} + +// tryBindGenericParam returns the old To value (for batched reindex) +// when the edge was rewritten, or "" when left alone. +func (r *Resolver) tryBindGenericParam(e *graph.Edge, owned map[string]map[string]string) string { + if e == nil || !strings.HasPrefix(e.To, "unresolved::") { + return "" + } + name := strings.TrimPrefix(e.To, "unresolved::") + if name == "" || strings.ContainsAny(name, ".*:#") { + return "" + } + ownerID := enclosingFunctionForBinding(e.From) + if ownerID == "" { + return "" + } + set := owned[ownerID] + if len(set) == 0 { + return "" + } + target, ok := set[name] + if !ok || target == "" || target == e.To { + return "" + } + oldTo := e.To + e.To = target + return oldTo +} diff --git a/internal/resolver/generic_param_bind_test.go b/internal/resolver/generic_param_bind_test.go new file mode 100644 index 00000000..2d41b6c6 --- /dev/null +++ b/internal/resolver/generic_param_bind_test.go @@ -0,0 +1,71 @@ +package resolver + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/zzet/gortex/internal/graph" +) + +func TestBindGenericParamRefs_RewritesTRefToTParam(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Map" + 
g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Map", FilePath: "pkg/foo.go", Language: "go"}) + + tparamID := owner + "#tparam:T" + g.AddNode(&graph.Node{ID: tparamID, Kind: graph.KindGenericParam, Name: "T", FilePath: "pkg/foo.go", Language: "go"}) + g.AddEdge(&graph.Edge{From: tparamID, To: owner, Kind: graph.EdgeMemberOf}) + + // `var x T` inside Map's body — EdgeTypedAs from a local-ish + // source to the unresolved-T target. + from := owner + "#local:x@+3" + g.AddNode(&graph.Node{ID: from, Kind: graph.KindLocal, Name: "x", FilePath: "pkg/foo.go", StartLine: 3, Language: "go"}) + edge := &graph.Edge{From: from, To: "unresolved::T", Kind: graph.EdgeTypedAs, Line: 3} + g.AddEdge(edge) + + New(g).bindGenericParamRefs() + assert.Equal(t, tparamID, edge.To, "var x T must bind to the function's KindGenericParam T") +} + +func TestBindGenericParamRefs_OtherFunctionsLeftAlone(t *testing.T) { + g := graph.New() + // Function A declares tparam T. + a := "pkg/a.go::A" + g.AddNode(&graph.Node{ID: a, Kind: graph.KindFunction, Name: "A", FilePath: "pkg/a.go", Language: "go"}) + g.AddNode(&graph.Node{ID: a + "#tparam:T", Kind: graph.KindGenericParam, Name: "T", FilePath: "pkg/a.go", Language: "go"}) + g.AddEdge(&graph.Edge{From: a + "#tparam:T", To: a, Kind: graph.EdgeMemberOf}) + + // Function B has its OWN body and references `T`, but doesn't + // declare it. Pass must NOT bind to A's tparam. 
+ b := "pkg/b.go::B" + g.AddNode(&graph.Node{ID: b, Kind: graph.KindFunction, Name: "B", FilePath: "pkg/b.go", Language: "go"}) + edge := &graph.Edge{From: b, To: "unresolved::T", Kind: graph.EdgeReferences, Line: 1} + g.AddEdge(edge) + + New(g).bindGenericParamRefs() + assert.Equal(t, "unresolved::T", edge.To, "must not cross-bind to another function's tparam") +} + +func TestBindGenericParamRefs_QualifiedShapesIgnored(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + g.AddNode(&graph.Node{ID: owner + "#tparam:T", Kind: graph.KindGenericParam, Name: "T", FilePath: "pkg/foo.go", Language: "go"}) + g.AddEdge(&graph.Edge{From: owner + "#tparam:T", To: owner, Kind: graph.EdgeMemberOf}) + + keep := []*graph.Edge{ + {From: owner, To: "unresolved::*.T", Kind: graph.EdgeReferences, Line: 1}, + {From: owner, To: "unresolved::pkg.T", Kind: graph.EdgeReferences, Line: 2}, + } + for _, e := range keep { + g.AddEdge(e) + } + New(g).bindGenericParamRefs() + for _, e := range keep { + assert.True(t, + e.To == "unresolved::*.T" || e.To == "unresolved::pkg.T", + "qualified shape %q must be left alone", e.To, + ) + } +} diff --git a/internal/resolver/go_builtins_attribution.go b/internal/resolver/go_builtins_attribution.go new file mode 100644 index 00000000..58f0a4e6 --- /dev/null +++ b/internal/resolver/go_builtins_attribution.go @@ -0,0 +1,186 @@ +package resolver + +import ( + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// goBuiltinFuncs is the complete set of pre-declared Go built-in +// functions. Source: https://pkg.go.dev/builtin (functions section). +// Kept in sync with the language spec — when a new builtin lands +// (e.g. clear / min / max in Go 1.21) add it here. 
+var goBuiltinFuncs = map[string]struct{}{ + "append": {}, "cap": {}, "clear": {}, "close": {}, "complex": {}, + "copy": {}, "delete": {}, "imag": {}, "len": {}, "make": {}, + "max": {}, "min": {}, "new": {}, "panic": {}, "print": {}, + "println": {}, "real": {}, "recover": {}, +} + +// goBuiltinTypes is the complete set of pre-declared Go built-in +// types. Source: https://pkg.go.dev/builtin (types section). +var goBuiltinTypes = map[string]struct{}{ + "any": {}, "bool": {}, "byte": {}, "comparable": {}, + "complex64": {}, "complex128": {}, "error": {}, + "float32": {}, "float64": {}, + "int": {}, "int8": {}, "int16": {}, "int32": {}, "int64": {}, + "rune": {}, "string": {}, + "uint": {}, "uint8": {}, "uint16": {}, "uint32": {}, "uint64": {}, + "uintptr": {}, +} + +// goBuiltinConsts is the set of pre-declared Go constants (true, +// false, iota, nil). Mostly emitted for completeness — `true` / +// `false` rarely show up as unresolved edge targets in practice +// because the parser handles them inline. +var goBuiltinConsts = map[string]struct{}{ + "true": {}, "false": {}, "iota": {}, "nil": {}, +} + +// attributeGoBuiltins rewrites `unresolved::` edges whose name +// is a Go language intrinsic onto the canonical `builtin::go::*` ID, +// and materialises a single KindBuiltin node per unique builtin so +// the rewritten edges land at a real graph node instead of a +// rel-table FK stub. Mirrors the existing builtin::py / builtin::ts +// classifier in internal/resolver/builtins.go but completes the +// pattern by also creating nodes for the targets — so +// `find_usages(builtin::go::type::float64)` answers "every variable +// typed as float64 in this codebase", and the on-disk-backend stub +// inflation drops by ~50k rows on a gortex-scale Go codebase. +// +// Three ID namespaces under `builtin::go::`: +// +// functions: builtin::go:: (append, len, make, ...) +// types: builtin::go::type:: (string, int, float64, ...) 
+// constants: builtin::go::const:: (true, false, iota, nil) +// +// Functions get the shortest namespace because their fan-in is the +// biggest and the shorter ID is what most downstream `find_usages` +// queries will type. +func (r *Resolver) attributeGoBuiltins() { + // Go-only pass: skip the multi-kind edge scan entirely when the graph + // has no Go nodes (e.g. a TS/Python repo). + if !r.graphHasLanguage("go") { + return + } + materialised := map[string]struct{}{} + var batch []graph.EdgeReindex + + // Every edge kind a builtin can be the target of. Type-system + // edges (typed_as / returns) carry type references; call / + // arg-of / value-flow carry function or const references. + for _, k := range []graph.EdgeKind{ + graph.EdgeCalls, + graph.EdgeReferences, + graph.EdgeReads, + graph.EdgeArgOf, + graph.EdgeValueFlow, + graph.EdgeReturnsTo, + graph.EdgeTypedAs, + graph.EdgeReturns, + graph.EdgeInstantiates, + graph.EdgeCaptures, + graph.EdgeThrows, + } { + for e := range r.graph.EdgesByKind(k) { + if old := r.tryAttributeGoBuiltin(e, materialised); old != "" { + batch = append(batch, graph.EdgeReindex{Edge: e, OldTo: old}) + } + } + } + if len(batch) > 0 { + r.graph.ReindexEdges(batch) + } +} + +// tryAttributeGoBuiltin checks if e.To is `unresolved::` +// where bareName is a Go builtin and the source language is Go (the +// source is inside a Go function / file). On a match it materialises +// the target node (once per unique ID), rewrites e.To, and returns +// the old To value for the batched reindex. Returns "" when the edge +// is left alone. +func (r *Resolver) tryAttributeGoBuiltin(e *graph.Edge, materialised map[string]struct{}) string { + if e == nil || !strings.HasPrefix(e.To, "unresolved::") { + return "" + } + name := strings.TrimPrefix(e.To, "unresolved::") + if name == "" || strings.ContainsAny(name, ".*:#") { + return "" + } + // Only attribute when the source is Go. 
Without this guard a + // Python reference to a local named `len` would get re-targeted + // at Go's builtin `len`, which would be obviously wrong. + if !r.fromIsGo(e.From) { + return "" + } + newID, kind, builtinKind := goBuiltinTarget(r.callerRepoPrefix(e), name) + if newID == "" { + return "" + } + if _, ok := materialised[newID]; !ok { + // AddNode is idempotent on ID, so even a second + // concurrent pass would not duplicate the row. + r.graph.AddNode(&graph.Node{ + ID: newID, + Kind: kind, + Name: name, + Language: "go", + Meta: map[string]any{ + "builtin": true, + "builtin_kind": builtinKind, + }, + }) + materialised[newID] = struct{}{} + } + oldTo := e.To + e.To = newID + return oldTo +} + +// goBuiltinTarget classifies a bare identifier as one of Go's +// intrinsics. Returns the canonical builtin::go:: ID (per-repo +// prefixed via graph.StubID — see internal/graph/stub.go for why +// two repos can disagree on what's a builtin), the NodeKind to +// materialise it under (always KindBuiltin), and a meta tag +// recording which subspace (func / type / const) it belongs to. +// Returns ("", "", "") when the name is not a Go builtin. +// repoPrefix is the owning repo's RepoPrefix (empty in +// single-repo / legacy callers). +func goBuiltinTarget(repoPrefix, name string) (id string, kind graph.NodeKind, builtinKind string) { + if _, ok := goBuiltinFuncs[name]; ok { + return graph.StubID(repoPrefix, graph.StubKindBuiltin, "go", name), graph.KindBuiltin, "func" + } + if _, ok := goBuiltinTypes[name]; ok { + return graph.StubID(repoPrefix, graph.StubKindBuiltin, "go", "type", name), graph.KindBuiltin, "type" + } + if _, ok := goBuiltinConsts[name]; ok { + return graph.StubID(repoPrefix, graph.StubKindBuiltin, "go", "const", name), graph.KindBuiltin, "const" + } + return "", "", "" +} + +// fromIsGo reports whether the source endpoint of an edge sits +// inside Go code. 
Uses the From's enclosing function (via the same +// suffix-stripping helper bare-name binding uses) — Go is the only +// language whose IDs follow the `file.go::Func` convention with a +// `.go` extension, so a path-based check is both cheap and reliable. +func (r *Resolver) fromIsGo(fromID string) bool { + owner := enclosingFunctionForBinding(fromID) + if owner == "" { + return false + } + if i := strings.Index(owner, "::"); i > 0 { + // `pkg/foo.go::Func` shape — peek at the file extension. + head := owner[:i] + if strings.HasSuffix(head, ".go") { + return true + } + } + // Fall back to looking up the owner node and checking its + // Language. More expensive but covers edge cases where the ID + // doesn't follow the `.go::Func` pattern. + if n := r.cachedGetNode(owner); n != nil && n.Language == "go" { + return true + } + return false +} diff --git a/internal/resolver/go_builtins_attribution_test.go b/internal/resolver/go_builtins_attribution_test.go new file mode 100644 index 00000000..48cc0f45 --- /dev/null +++ b/internal/resolver/go_builtins_attribution_test.go @@ -0,0 +1,115 @@ +package resolver + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +func TestAttributeGoBuiltins_FunctionCall(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Run" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Run", FilePath: "pkg/foo.go", Language: "go"}) + edge := &graph.Edge{From: owner, To: "unresolved::append", Kind: graph.EdgeArgOf, FilePath: "pkg/foo.go", Line: 5} + g.AddEdge(edge) + + New(g).attributeGoBuiltins() + + assert.Equal(t, "builtin::go::append", edge.To, + "call to `append` must retarget onto builtin::go::append") + n := g.GetNode("builtin::go::append") + require.NotNil(t, n, "KindBuiltin node must be materialised") + assert.Equal(t, graph.KindBuiltin, n.Kind) + assert.Equal(t, "append", n.Name) + assert.Equal(t, "go", n.Language) + 
assert.Equal(t, true, n.Meta["builtin"]) + assert.Equal(t, "func", n.Meta["builtin_kind"]) +} + +func TestAttributeGoBuiltins_Type(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + paramID := owner + "#param:s" + g.AddNode(&graph.Node{ID: paramID, Kind: graph.KindParam, Name: "s", FilePath: "pkg/foo.go", Language: "go"}) + edge := &graph.Edge{From: paramID, To: "unresolved::string", Kind: graph.EdgeTypedAs, FilePath: "pkg/foo.go", Line: 1} + g.AddEdge(edge) + + New(g).attributeGoBuiltins() + + assert.Equal(t, "builtin::go::type::string", edge.To, + "typed_as `string` must retarget onto builtin::go::type::string") + n := g.GetNode("builtin::go::type::string") + require.NotNil(t, n) + assert.Equal(t, graph.KindBuiltin, n.Kind) + assert.Equal(t, "type", n.Meta["builtin_kind"]) +} + +func TestAttributeGoBuiltins_DedupedAcrossManyEdges(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + + // Many calls to len from the same function. + for i := 1; i <= 5; i++ { + g.AddEdge(&graph.Edge{From: owner, To: "unresolved::len", Kind: graph.EdgeArgOf, FilePath: "pkg/foo.go", Line: i}) + } + + New(g).attributeGoBuiltins() + + // Exactly one KindBuiltin node should be created regardless of + // how many edges referenced it. + count := 0 + for n := range g.NodesByKind(graph.KindBuiltin) { + if n.ID == "builtin::go::len" { + count++ + } + } + assert.Equal(t, 1, count, "exactly one KindBuiltin per unique builtin") +} + +func TestAttributeGoBuiltins_NonGoLeftAlone(t *testing.T) { + g := graph.New() + // A Python source emitting a reference to `len` (Python builtin) + // — must NOT get attributed to Go's `builtin::go::len`. 
+ owner := "pkg/app.py::process" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "process", FilePath: "pkg/app.py", Language: "python"}) + edge := &graph.Edge{From: owner, To: "unresolved::len", Kind: graph.EdgeArgOf, FilePath: "pkg/app.py", Line: 1} + g.AddEdge(edge) + + New(g).attributeGoBuiltins() + + assert.Equal(t, "unresolved::len", edge.To, + "Python source must NOT cross-bind to Go's len builtin") +} + +func TestAttributeGoBuiltins_UnknownNameLeftAlone(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + edge := &graph.Edge{From: owner, To: "unresolved::myCustomFunc", Kind: graph.EdgeArgOf, FilePath: "pkg/foo.go", Line: 1} + g.AddEdge(edge) + + New(g).attributeGoBuiltins() + + assert.Equal(t, "unresolved::myCustomFunc", edge.To, + "non-builtin names must stay unresolved") +} + +func TestAttributeGoBuiltins_QualifiedShapeLeftAlone(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + + // `*.len` is qualified — leave to other passes. + edge := &graph.Edge{From: owner, To: "unresolved::*.len", Kind: graph.EdgeArgOf, FilePath: "pkg/foo.go", Line: 1} + g.AddEdge(edge) + + New(g).attributeGoBuiltins() + + assert.Equal(t, "unresolved::*.len", edge.To, "qualified `*.len` shape must be left alone") +} diff --git a/internal/resolver/grpc_stub_calls.go b/internal/resolver/grpc_stub_calls.go index cc4f2b2a..0b94a3b1 100644 --- a/internal/resolver/grpc_stub_calls.go +++ b/internal/resolver/grpc_stub_calls.go @@ -50,15 +50,25 @@ const grpcStubPrefix = unresolvedPrefix + "grpc::" // // Returns the number of grpc.stub edges pointing at a resolved handler // after the pass. 
-func ResolveGRPCStubCalls(g *graph.Graph) int { +func ResolveGRPCStubCalls(g graph.Store) int { if g == nil { return 0 } idx := buildGRPCHandlerIndex(g) resolved := 0 - for _, e := range g.AllEdges() { - if e == nil || e.Kind != graph.EdgeCalls || e.Meta == nil { + var reindexBatch []graph.EdgeReindex + // First pass: collect every grpc.stub edge plus the From IDs we'll + // need to read RepoPrefix off, so the per-edge GetNode below + // collapses to a single GetNodesByIDs batch on disk backends. + type stubEdge struct { + edge *graph.Edge + service, method string + } + var stubs []stubEdge + fromIDs := make(map[string]struct{}) + for e := range g.EdgesByKind(graph.EdgeCalls) { + if e == nil || e.Meta == nil { continue } if v, _ := e.Meta["via"].(string); v != "grpc.stub" { @@ -69,16 +79,28 @@ func ResolveGRPCStubCalls(g *graph.Graph) int { if service == "" || method == "" { continue } + stubs = append(stubs, stubEdge{edge: e, service: service, method: method}) + if e.From != "" { + fromIDs[e.From] = struct{}{} + } + } + fromList := make([]string, 0, len(fromIDs)) + for id := range fromIDs { + fromList = append(fromList, id) + } + callerNodes := g.GetNodesByIDs(fromList) + for _, s := range stubs { + e := s.edge callerRepo := "" - if from := g.GetNode(e.From); from != nil { + if from := callerNodes[e.From]; from != nil { callerRepo = from.RepoPrefix } - handlerID, origin, conf := idx.lookup(service, method, callerRepo) + handlerID, origin, conf := idx.lookup(s.service, s.method, callerRepo) want := handlerID if want == "" { - want = grpcStubPlaceholder(service, method) + want = grpcStubPlaceholder(s.service, s.method) } if e.To == want { if handlerID != "" { @@ -104,7 +126,10 @@ func ResolveGRPCStubCalls(g *graph.Graph) int { e.ConfidenceLabel = "" delete(e.Meta, "grpc_resolution") } - g.ReindexEdge(e, oldTo) + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) + } + if len(reindexBatch) > 0 { + g.ReindexEdges(reindexBatch) } return 
resolved } @@ -138,10 +163,11 @@ func (idx *grpcHandlerIndex) lookup(service, method, callerRepo string) (id, ori // buildGRPCHandlerIndex walks the graph once and indexes server-side // gRPC handler methods by service, via both discovery signals. -func buildGRPCHandlerIndex(g *graph.Graph) *grpcHandlerIndex { +func buildGRPCHandlerIndex(g graph.Store) *grpcHandlerIndex { typesByName := map[string][]*graph.Node{} ifacesByName := map[string][]*graph.Node{} - for _, n := range g.AllNodes() { + typeAndIfaceNodes := nodesByKindsOrAll(g, graph.KindType, graph.KindInterface) + for _, n := range typeAndIfaceNodes { switch n.Kind { case graph.KindType: typesByName[n.Name] = append(typesByName[n.Name], n) @@ -151,28 +177,42 @@ func buildGRPCHandlerIndex(g *graph.Graph) *grpcHandlerIndex { } // methodsByType: type node ID → its method nodes (via EdgeMemberOf). - // implementorsByIface: interface node ID → implementing type node IDs. + // Use the MemberMethodsByType capability — projects only the four + // columns we read (id/name/file/line) per row, no per-edge GetNode. + rawMembers := memberMethodInfosByType(g) methodsByType := map[string][]*graph.Node{} + for typeID, infos := range rawMembers { + nodes := make([]*graph.Node, 0, len(infos)) + for _, m := range infos { + nodes = append(nodes, &graph.Node{ + ID: m.MethodID, + Kind: graph.KindMethod, + Name: m.Name, + FilePath: m.FilePath, + StartLine: m.StartLine, + RepoPrefix: m.RepoPrefix, + }) + } + methodsByType[typeID] = nodes + } + + // implementorsByIface: interface node ID → implementing type node + // IDs. Pull only EdgeImplements; the From IDs are kept as-is for the + // later impl filter (Unimplemented*). 
implementorsByIface := map[string][]string{} var registrations []*graph.Edge - for _, e := range g.AllEdges() { + for e := range g.EdgesByKind(graph.EdgeImplements) { if e == nil { continue } - switch e.Kind { - case graph.EdgeMemberOf: - mn := g.GetNode(e.From) - if mn != nil && mn.Kind == graph.KindMethod { - methodsByType[e.To] = append(methodsByType[e.To], mn) - } - case graph.EdgeImplements: - implementorsByIface[e.To] = append(implementorsByIface[e.To], e.From) - case graph.EdgeCalls: - if e.Meta != nil { - if svc, _ := e.Meta["grpc_register_service"].(string); svc != "" { - registrations = append(registrations, e) - } - } + implementorsByIface[e.To] = append(implementorsByIface[e.To], e.From) + } + for e := range g.EdgesByKind(graph.EdgeCalls) { + if e == nil || e.Meta == nil { + continue + } + if svc, _ := e.Meta["grpc_register_service"].(string); svc != "" { + registrations = append(registrations, e) } } @@ -181,6 +221,17 @@ func buildGRPCHandlerIndex(g *graph.Graph) *grpcHandlerIndex { iface: map[string][]*graph.Node{}, } + // Prefetch the From nodes for every registration call so the + // per-registration repo / dir lookup collapses to a single batch + // GetNodesByIDs on disk backends. + regFromIDs := make([]string, 0, len(registrations)) + for _, e := range registrations { + if e.From != "" { + regFromIDs = append(regFromIDs, e.From) + } + } + regFromNodes := g.GetNodesByIDs(regFromIDs) + // Signal 1: registration calls. Resolve the impl type named by the // registration's second argument, then index its methods. 
for _, e := range registrations { @@ -190,7 +241,7 @@ func buildGRPCHandlerIndex(g *graph.Graph) *grpcHandlerIndex { continue } regRepo, regDir := "", "" - if from := g.GetNode(e.From); from != nil { + if from := regFromNodes[e.From]; from != nil { regRepo = from.RepoPrefix regDir = grpcParentDir(from.FilePath) } @@ -201,6 +252,29 @@ func buildGRPCHandlerIndex(g *graph.Graph) *grpcHandlerIndex { idx.registration[service] = append(idx.registration[service], methodsByType[typeNode.ID]...) } + // Prefetch every implementor type referenced by a `Server` + // interface so the per-implementor GetNode in Signal 2 collapses to + // a batch. + implTypeIDs := make(map[string]struct{}) + for name, ifaceNodes := range ifacesByName { + const sfx = "Server" + if len(name) <= len(sfx) || !strings.HasSuffix(name, sfx) { + continue + } + for _, ifn := range ifaceNodes { + for _, typeID := range implementorsByIface[ifn.ID] { + if typeID != "" { + implTypeIDs[typeID] = struct{}{} + } + } + } + } + implTypeList := make([]string, 0, len(implTypeIDs)) + for id := range implTypeIDs { + implTypeList = append(implTypeList, id) + } + implTypeNodes := g.GetNodesByIDs(implTypeList) + // Signal 2: the `Server` interface and the concrete types // that implement it. 
The generated `UnimplementedServer` // stub also implements the interface — skip it so the fallback @@ -213,7 +287,7 @@ func buildGRPCHandlerIndex(g *graph.Graph) *grpcHandlerIndex { service := name[:len(name)-len(sfx)] for _, ifn := range ifaceNodes { for _, typeID := range implementorsByIface[ifn.ID] { - tn := g.GetNode(typeID) + tn := implTypeNodes[typeID] if tn == nil || strings.HasPrefix(tn.Name, "Unimplemented") { continue } diff --git a/internal/resolver/grpc_stub_calls_test.go b/internal/resolver/grpc_stub_calls_test.go index 76cbcbf1..6bbb314e 100644 --- a/internal/resolver/grpc_stub_calls_test.go +++ b/internal/resolver/grpc_stub_calls_test.go @@ -14,7 +14,7 @@ import ( // grpc.stub call edge, and a server-side handler discoverable via // registration and/or interface satisfaction. type grpcTestGraph struct { - g *graph.Graph + g graph.Store } func newGRPCTestGraph() *grpcTestGraph { return &grpcTestGraph{g: graph.New()} } diff --git a/internal/resolver/language_gate.go b/internal/resolver/language_gate.go new file mode 100644 index 00000000..b95c3d87 --- /dev/null +++ b/internal/resolver/language_gate.go @@ -0,0 +1,42 @@ +package resolver + +import ( + "iter" + + "github.com/zzet/gortex/internal/graph" +) + +// graphHasLanguage reports whether the backing store contains any node of +// the given language. Cheap — a LIMIT-1 probe — on stores that implement +// it (the on-disk backend); conservatively returns true on stores that don't, so a +// language-gated pass still runs rather than being silently skipped. Lets +// the Go / Python attribution passes skip a graph that has none of their +// language instead of scanning + discarding the whole node/edge set. 
+func (r *Resolver) graphHasLanguage(lang string) bool { + if hl, ok := r.graph.(interface{ HasLanguage(string) bool }); ok { + return hl.HasLanguage(lang) + } + return true +} + +// nodesByKindLang yields nodes of the given kind AND language, pushed +// server-side when the store supports it (so only the matching language's +// nodes cross the cgo boundary), else NodesByKind + an in-Go language +// filter (memory / overlay are already in-memory, so there is no marshal +// cost to push down). +func (r *Resolver) nodesByKindLang(kind graph.NodeKind, lang string) iter.Seq[*graph.Node] { + if nl, ok := r.graph.(interface { + NodesByKindLang(graph.NodeKind, string) iter.Seq[*graph.Node] + }); ok { + return nl.NodesByKindLang(kind, lang) + } + return func(yield func(*graph.Node) bool) { + for n := range r.graph.NodesByKind(kind) { + if n != nil && n.Language == lang { + if !yield(n) { + return + } + } + } + } +} diff --git a/internal/resolver/method_receiver_rebind.go b/internal/resolver/method_receiver_rebind.go new file mode 100644 index 00000000..2d6fad92 --- /dev/null +++ b/internal/resolver/method_receiver_rebind.go @@ -0,0 +1,124 @@ +package resolver + +import ( + "path/filepath" + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// rebindGoMethodReceivers fixes Go EdgeMemberOf edges whose target is +// a phantom `::TypeName` ID — the artefact of the Go +// extractor building the receiver-type endpoint from the method's own +// file rather than the file the type is actually declared in. Methods +// spread across multiple files in the same package each emit a +// different `::Type` target even though they all logically +// belong to the single type node defined elsewhere. 
+// +// Without this pass: +// - the on-disk backend materialises phantom Node rows to satisfy the +// rel-table FK on every cross-file method-receiver edge; +// - InferImplements builds a typeID → method-set map keyed on the +// phantom IDs, so a type whose methods span N files appears as N +// partial types each with a fraction of the real method set, and +// interface satisfaction is under-detected; +// - find_implementations / get_class_hierarchy / get_callers over +// interface methods all return partial results for cross-file- +// method types (which is most of any non-trivial Go codebase). +// +// Algorithm: index every Go KindType / KindInterface node by +// (filepath.Dir(file), name); walk EdgeMemberOf; for each Go method +// whose To doesn't resolve, look up (its file's dir, type name); if +// exactly one match, rewrite edge.To to the canonical type ID via +// ReindexEdges (one batched commit instead of per-edge round-trips). +// +// Scope: Go only — other languages (Java / TS / Python) group methods +// inside the class body in the same file, so the cross-file pattern +// doesn't arise. The method node's Language gates the rebind. +func (r *Resolver) rebindGoMethodReceivers() { + type pkgKey struct{ pkg, name string } + typesIdx := make(map[pkgKey]string) + for _, kind := range []graph.NodeKind{graph.KindType, graph.KindInterface} { + // Server-side language scope: only Go type/interface nodes cross + // the cgo boundary. On a graph with few/no Go types (e.g. a TS + // repo) this avoids marshaling + meta-decoding every type node + // just to discard the non-Go majority — the bulk of this pass's + // cost on a large single-language graph. 
+ for n := range r.nodesByKindLang(kind, "go") { + if n.Name == "" || n.FilePath == "" { + continue + } + k := pkgKey{filepath.Dir(n.FilePath), n.Name} + if existing, ok := typesIdx[k]; ok && existing != n.ID { + // Two distinct type nodes with the same name in the + // same package directory shouldn't happen in valid Go, + // but guard against it — leave the edge alone rather + // than pick an arbitrary winner. + typesIdx[k] = "" + continue + } + typesIdx[k] = n.ID + } + } + if len(typesIdx) == 0 { + return + } + // Materialise the MemberOf edges and batch-load their endpoints in one + // GetNodesByIDs: a per-edge GetNode(e.From)+GetNode(e.To) here is two + // query round-trips per method on a disk backend — across tens of + // thousands of methods it was a multi-minute cold-warmup stall. + var memberOf []*graph.Edge + ids := make(map[string]struct{}) + for e := range r.graph.EdgesByKind(graph.EdgeMemberOf) { + memberOf = append(memberOf, e) + if e.From != "" { + ids[e.From] = struct{}{} + } + if e.To != "" { + ids[e.To] = struct{}{} + } + } + if len(memberOf) == 0 { + return + } + idList := make([]string, 0, len(ids)) + for id := range ids { + idList = append(idList, id) + } + nodes := r.graph.GetNodesByIDs(idList) + + var batch []graph.EdgeReindex + for _, e := range memberOf { + method := nodes[e.From] + if method == nil || method.Language != "go" || method.Kind != graph.KindMethod { + continue + } + // Already resolves to a real type node — same-file methods + // land here. Nothing to do. + if n := nodes[e.To]; n != nil && (n.Kind == graph.KindType || n.Kind == graph.KindInterface) { + continue + } + // Parse `::`. The split is on the LAST + // `::` so paths embedded in the ID (none in Go, but stay + // defensive) can't trip us up. 
+ i := strings.LastIndex(e.To, "::") + if i <= 0 { + continue + } + file := e.To[:i] + typeName := e.To[i+2:] + if file == "" || typeName == "" { + continue + } + canonicalID, ok := typesIdx[pkgKey{filepath.Dir(file), typeName}] + if !ok || canonicalID == "" || canonicalID == e.To { + continue + } + oldTo := e.To + e.To = canonicalID + batch = append(batch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) + } + if len(batch) > 0 { + r.graph.ReindexEdges(batch) + } +} diff --git a/internal/resolver/method_receiver_rebind_test.go b/internal/resolver/method_receiver_rebind_test.go new file mode 100644 index 00000000..9222bf5b --- /dev/null +++ b/internal/resolver/method_receiver_rebind_test.go @@ -0,0 +1,135 @@ +package resolver + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +// TestRebindGoMethodReceivers_CollapsesCrossFileMethods is the +// regression for the Go extractor emitting EdgeMemberOf targets as +// <method-file>::TypeName. When methods on the same type live in +// different files of the same package, the parser produces a phantom +// type ID per method-file; the rebind pass must collapse them onto +// the canonical <type-file>::TypeName node so InferImplements and the +// downstream MCP tools (find_implementations, class_hierarchy) see +// the consolidated method set. +func TestRebindGoMethodReceivers_CollapsesCrossFileMethods(t *testing.T) { + g := graph.New() + + // Type defined in indexer.go. + typeID := "internal/indexer/indexer.go::Indexer" + g.AddNode(&graph.Node{ + ID: typeID, Kind: graph.KindType, Name: "Indexer", + FilePath: "internal/indexer/indexer.go", Language: "go", + }) + + // Method declared in a *different* file in the same package — the + // parser emits a phantom receiver target. 
+ methodID := "internal/indexer/crash_isolation.go::Indexer.crashIsolationEnabled" + g.AddNode(&graph.Node{ + ID: methodID, Kind: graph.KindMethod, Name: "crashIsolationEnabled", + FilePath: "internal/indexer/crash_isolation.go", Language: "go", + }) + phantomTarget := "internal/indexer/crash_isolation.go::Indexer" + memberEdge := &graph.Edge{ + From: methodID, To: phantomTarget, Kind: graph.EdgeMemberOf, + FilePath: "internal/indexer/crash_isolation.go", Line: 23, + } + g.AddEdge(memberEdge) + + // Sanity: pre-pass the phantom target has no real node. + require.Nil(t, g.GetNode(phantomTarget), "phantom target must not exist as a real node") + + r := New(g) + r.rebindGoMethodReceivers() + + // Post-pass: the edge points at the canonical type node. + assert.Equal(t, typeID, memberEdge.To, + "EdgeMemberOf must be rewritten from ::Type to canonical ::Type") + + // And the same-file method on the type works too — covered by not + // breaking a control case: + g2 := graph.New() + g2.AddNode(&graph.Node{ + ID: "pkg/foo.go::Foo", Kind: graph.KindType, Name: "Foo", + FilePath: "pkg/foo.go", Language: "go", + }) + g2.AddNode(&graph.Node{ + ID: "pkg/foo.go::Foo.Bar", Kind: graph.KindMethod, Name: "Bar", + FilePath: "pkg/foo.go", Language: "go", + }) + sameFileEdge := &graph.Edge{ + From: "pkg/foo.go::Foo.Bar", To: "pkg/foo.go::Foo", + Kind: graph.EdgeMemberOf, FilePath: "pkg/foo.go", Line: 5, + } + g2.AddEdge(sameFileEdge) + + New(g2).rebindGoMethodReceivers() + assert.Equal(t, "pkg/foo.go::Foo", sameFileEdge.To, + "same-file method edge must be left unchanged") +} + +// TestRebindGoMethodReceivers_LanguageGated guards against the pass +// rewriting non-Go EdgeMemberOf edges. Java/TS/Python group methods +// in the class body so their EdgeMemberOf targets are already +// in-file; we don't want the pass touching them. 
+func TestRebindGoMethodReceivers_LanguageGated(t *testing.T) { + g := graph.New() + + // A type and a method in the same Go package — would normally be + // a rebind candidate. + g.AddNode(&graph.Node{ + ID: "pkg/types.go::Server", Kind: graph.KindType, Name: "Server", + FilePath: "pkg/types.go", Language: "go", + }) + // But the METHOD is declared as TypeScript (e.g. a TS extractor + // that emits the same EdgeMemberOf shape for some bridging + // reason). Pass must leave it alone. + tsMethod := &graph.Node{ + ID: "pkg/handler.ts::Server.serve", Kind: graph.KindMethod, Name: "serve", + FilePath: "pkg/handler.ts", Language: "typescript", + } + g.AddNode(tsMethod) + edge := &graph.Edge{ + From: tsMethod.ID, To: "pkg/handler.ts::Server", + Kind: graph.EdgeMemberOf, FilePath: "pkg/handler.ts", Line: 1, + } + g.AddEdge(edge) + + New(g).rebindGoMethodReceivers() + assert.Equal(t, "pkg/handler.ts::Server", edge.To, + "non-Go method edge must NOT be rewritten by the Go-only rebind pass") +} + +// TestRebindGoMethodReceivers_AmbiguousNameSkipped guards against the +// pass picking an arbitrary winner when two distinct types share the +// same name in the same package (shouldn't happen in valid Go, but +// the pass should leave the phantom alone rather than mis-bind). 
+func TestRebindGoMethodReceivers_AmbiguousNameSkipped(t *testing.T) { + g := graph.New() + g.AddNode(&graph.Node{ + ID: "pkg/a.go::Dup", Kind: graph.KindType, Name: "Dup", + FilePath: "pkg/a.go", Language: "go", + }) + g.AddNode(&graph.Node{ + ID: "pkg/b.go::Dup", Kind: graph.KindType, Name: "Dup", + FilePath: "pkg/b.go", Language: "go", + }) + g.AddNode(&graph.Node{ + ID: "pkg/c.go::Dup.M", Kind: graph.KindMethod, Name: "M", + FilePath: "pkg/c.go", Language: "go", + }) + edge := &graph.Edge{ + From: "pkg/c.go::Dup.M", To: "pkg/c.go::Dup", + Kind: graph.EdgeMemberOf, FilePath: "pkg/c.go", Line: 1, + } + g.AddEdge(edge) + + New(g).rebindGoMethodReceivers() + assert.Equal(t, "pkg/c.go::Dup", edge.To, + "ambiguous type name in same package must leave the edge phantom rather than guess") +} diff --git a/internal/resolver/module_attribution.go b/internal/resolver/module_attribution.go index 1b16f795..78f1ba4f 100644 --- a/internal/resolver/module_attribution.go +++ b/internal/resolver/module_attribution.go @@ -29,6 +29,12 @@ import ( // per-pass set so a second invocation in the same ResolveAll burst // emits no duplicate EdgeDependsOnModule edges. func (r *Resolver) attributeNonGoModuleImports() { + // Python/Dart-only attribution (nonGoImportToModuleID handles exactly + // those two ecosystems). Skip the EdgeImports scan when the graph has + // neither language. 
+ if !r.graphHasLanguage("python") && !r.graphHasLanguage("dart") { + return + } fileLang := r.collectFileLanguages() type pendingEdge struct { edge *graph.Edge @@ -39,10 +45,7 @@ func (r *Resolver) attributeNonGoModuleImports() { moduleSeeds := map[string]moduleSeed{} dependsSeen := map[string]map[string]struct{}{} // fileID → set of moduleIDs - for _, e := range r.graph.AllEdges() { - if e.Kind != graph.EdgeImports { - continue - } + for e := range r.graph.EdgesByKind(graph.EdgeImports) { if !strings.HasPrefix(e.To, "external::") { continue } @@ -72,21 +75,46 @@ func (r *Resolver) attributeNonGoModuleImports() { } // Materialise module nodes first; later loops assume the - // node exists when we add EdgeDependsOnModule. + // node exists when we add EdgeDependsOnModule. Batch the + // presence check via GetNodesByIDs so disk backends do one + // indexed SELECT IN (...) instead of one per-seed GetNode. + seedIDs := make([]string, 0, len(moduleSeeds)) + for id := range moduleSeeds { + seedIDs = append(seedIDs, id) + } + existing := r.graph.GetNodesByIDs(seedIDs) for _, seed := range moduleSeeds { - if r.graph.GetNode(seed.id) != nil { + if _, ok := existing[seed.id]; ok { continue } r.graph.AddNode(buildNonGoModuleNode(seed)) } - // Rewrite each EdgeImports target and re-bucket via - // ReindexEdge so find_usages on the new module sees the - // caller file. + // Pre-build a set of every (fileID, moduleID) pair the graph + // already has an EdgeDependsOnModule edge for. The old code + // called hasDependsOnModule per rewrite, which on a disk backend + // fans out to N per-file GetOutEdges queries (50k+ on a + // gortex-scale pass). One EdgesByKind scan is an indexed range + // read on every backend, plus a Go-side map build that turns + // the per-rewrite check into a constant-time lookup. 
+ existingDepends := make(map[string]map[string]struct{}) + for e := range r.graph.EdgesByKind(graph.EdgeDependsOnModule) { + set := existingDepends[e.From] + if set == nil { + set = make(map[string]struct{}) + existingDepends[e.From] = set + } + set[e.To] = struct{}{} + } + + // Rewrite each EdgeImports target and collect the re-bucket + // jobs into one batch so disk backends commit in chunks rather + // than once per import rewrite. + reindexBatch := make([]graph.EdgeReindex, 0, len(rewrites)) for _, p := range rewrites { p.edge.To = p.moduleID p.edge.Origin = graph.OriginASTResolved - r.graph.ReindexEdge(p.edge, p.oldTo) + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: p.edge, OldTo: p.oldTo}) set, ok := dependsSeen[p.edge.From] if !ok { @@ -99,9 +127,12 @@ func (r *Resolver) attributeNonGoModuleImports() { set[p.moduleID] = struct{}{} // Avoid emitting a duplicate EdgeDependsOnModule when an // earlier pass already wired one (e.g. cold + warm - // indexing of the same file). - if r.hasDependsOnModule(p.edge.From, p.moduleID) { - continue + // indexing of the same file). Constant-time map lookup + // against the pre-built existingDepends index. + if existing, ok := existingDepends[p.edge.From]; ok { + if _, dup := existing[p.moduleID]; dup { + continue + } } r.graph.AddEdge(&graph.Edge{ From: p.edge.From, @@ -114,31 +145,21 @@ func (r *Resolver) attributeNonGoModuleImports() { Origin: graph.OriginASTResolved, }) } + if len(reindexBatch) > 0 { + r.graph.ReindexEdges(reindexBatch) + } } // collectFileLanguages walks KindFile nodes once and returns // (file ID → language) for the per-edge dispatch above. 
func (r *Resolver) collectFileLanguages() map[string]string { out := map[string]string{} - for _, n := range r.graph.AllNodes() { - if n.Kind == graph.KindFile { - out[n.ID] = n.Language - } + for n := range r.graph.NodesByKind(graph.KindFile) { + out[n.ID] = n.Language } return out } -// hasDependsOnModule reports whether the file already has an -// outgoing EdgeDependsOnModule pointing at moduleID. -func (r *Resolver) hasDependsOnModule(fileID, moduleID string) bool { - for _, e := range r.graph.GetOutEdges(fileID) { - if e.Kind == graph.EdgeDependsOnModule && e.To == moduleID { - return true - } - } - return false -} - // nonGoImportToModuleID maps a (language, importPath) pair to its // canonical KindModule ID. The second return value is the module's // own language tag (used at materialisation time so a stdlib module @@ -261,106 +282,106 @@ type moduleSeed struct { // the list covers everything the typical app reaches into, and // false negatives at most degrade the audit's separation of concerns. 
var pythonStdlibTops = map[string]struct{}{ - "abc": {}, - "argparse": {}, - "array": {}, - "ast": {}, - "asyncio": {}, - "base64": {}, - "binascii": {}, - "bisect": {}, - "builtins": {}, - "calendar": {}, - "cmath": {}, - "collections": {}, - "concurrent": {}, - "configparser": {}, - "contextlib": {}, - "contextvars": {}, - "copy": {}, - "csv": {}, - "ctypes": {}, - "dataclasses": {}, - "datetime": {}, - "decimal": {}, - "difflib": {}, - "dis": {}, - "email": {}, - "enum": {}, - "errno": {}, - "fnmatch": {}, - "fractions": {}, - "functools": {}, - "gc": {}, - "getopt": {}, - "gettext": {}, - "glob": {}, - "gzip": {}, - "hashlib": {}, - "heapq": {}, - "hmac": {}, - "html": {}, - "http": {}, - "imaplib": {}, - "importlib": {}, - "inspect": {}, - "io": {}, - "ipaddress": {}, - "itertools": {}, - "json": {}, - "keyword": {}, - "linecache": {}, - "locale": {}, - "logging": {}, - "math": {}, - "mimetypes": {}, + "abc": {}, + "argparse": {}, + "array": {}, + "ast": {}, + "asyncio": {}, + "base64": {}, + "binascii": {}, + "bisect": {}, + "builtins": {}, + "calendar": {}, + "cmath": {}, + "collections": {}, + "concurrent": {}, + "configparser": {}, + "contextlib": {}, + "contextvars": {}, + "copy": {}, + "csv": {}, + "ctypes": {}, + "dataclasses": {}, + "datetime": {}, + "decimal": {}, + "difflib": {}, + "dis": {}, + "email": {}, + "enum": {}, + "errno": {}, + "fnmatch": {}, + "fractions": {}, + "functools": {}, + "gc": {}, + "getopt": {}, + "gettext": {}, + "glob": {}, + "gzip": {}, + "hashlib": {}, + "heapq": {}, + "hmac": {}, + "html": {}, + "http": {}, + "imaplib": {}, + "importlib": {}, + "inspect": {}, + "io": {}, + "ipaddress": {}, + "itertools": {}, + "json": {}, + "keyword": {}, + "linecache": {}, + "locale": {}, + "logging": {}, + "math": {}, + "mimetypes": {}, "multiprocessing": {}, - "numbers": {}, - "operator": {}, - "os": {}, - "pathlib": {}, - "pickle": {}, - "platform": {}, - "posixpath": {}, - "pprint": {}, - "queue": {}, - "random": {}, - "re": {}, - 
"secrets": {}, - "shutil": {}, - "signal": {}, - "smtplib": {}, - "socket": {}, - "sqlite3": {}, - "ssl": {}, - "stat": {}, - "statistics": {}, - "string": {}, - "struct": {}, - "subprocess": {}, - "sys": {}, - "sysconfig": {}, - "tarfile": {}, - "tempfile": {}, - "textwrap": {}, - "threading": {}, - "time": {}, - "timeit": {}, - "tokenize": {}, - "traceback": {}, - "types": {}, - "typing": {}, - "unicodedata": {}, - "unittest": {}, - "urllib": {}, - "uuid": {}, - "warnings": {}, - "weakref": {}, - "xml": {}, - "xmlrpc": {}, - "zipfile": {}, - "zlib": {}, - "zoneinfo": {}, + "numbers": {}, + "operator": {}, + "os": {}, + "pathlib": {}, + "pickle": {}, + "platform": {}, + "posixpath": {}, + "pprint": {}, + "queue": {}, + "random": {}, + "re": {}, + "secrets": {}, + "shutil": {}, + "signal": {}, + "smtplib": {}, + "socket": {}, + "sqlite3": {}, + "ssl": {}, + "stat": {}, + "statistics": {}, + "string": {}, + "struct": {}, + "subprocess": {}, + "sys": {}, + "sysconfig": {}, + "tarfile": {}, + "tempfile": {}, + "textwrap": {}, + "threading": {}, + "time": {}, + "timeit": {}, + "tokenize": {}, + "traceback": {}, + "types": {}, + "typing": {}, + "unicodedata": {}, + "unittest": {}, + "urllib": {}, + "uuid": {}, + "warnings": {}, + "weakref": {}, + "xml": {}, + "xmlrpc": {}, + "zipfile": {}, + "zlib": {}, + "zoneinfo": {}, } func isPythonStdlibTop(name string) bool { diff --git a/internal/resolver/module_attribution_test.go b/internal/resolver/module_attribution_test.go index f6b72d66..1a8f139d 100644 --- a/internal/resolver/module_attribution_test.go +++ b/internal/resolver/module_attribution_test.go @@ -11,7 +11,7 @@ import ( // seedFile adds a KindFile node with the given language to the // graph; tests use it to drive the language-aware attribution pass. 
-func seedFile(g *graph.Graph, fileID, language string) { +func seedFile(g graph.Store, fileID, language string) { g.AddNode(&graph.Node{ ID: fileID, Kind: graph.KindFile, Name: fileID, FilePath: fileID, Language: language, @@ -21,7 +21,7 @@ func seedFile(g *graph.Graph, fileID, language string) { // seedExternalImport drops in an EdgeImports edge that's already // landed at an `external::*` target — the post-pass inputs we want // to exercise. -func seedExternalImport(g *graph.Graph, fileID, importPath string) *graph.Edge { +func seedExternalImport(g graph.Store, fileID, importPath string) *graph.Edge { e := &graph.Edge{ From: fileID, To: "external::" + importPath, @@ -179,7 +179,7 @@ func TestAttributeNonGo_IdempotentOnSecondPass(t *testing.T) { // outEdgesOfKind is a small filter over Graph.GetOutEdges for the // assertions above; declared here to keep the test file self- // contained. -func outEdgesOfKind(g *graph.Graph, fileID string, kind graph.EdgeKind) []*graph.Edge { +func outEdgesOfKind(g graph.Store, fileID string, kind graph.EdgeKind) []*graph.Edge { var out []*graph.Edge for _, e := range g.GetOutEdges(fileID) { if e.Kind == kind { diff --git a/internal/resolver/relative_imports.go b/internal/resolver/relative_imports.go index b87b8419..efc23be6 100644 --- a/internal/resolver/relative_imports.go +++ b/internal/resolver/relative_imports.go @@ -21,11 +21,60 @@ import ( // target file is not in the graph stay as `external::*` so the // module-attribution pass can decide what to do with them. func (r *Resolver) resolveRelativeImports() { + // Python/Dart relative-import resolution only; skip the File-node + + // edge walk when the graph has neither language. + if !r.graphHasLanguage("python") && !r.graphHasLanguage("dart") { + return + } fileLang := r.collectFileLanguages() - for _, e := range r.graph.AllEdges() { - if e.Kind != graph.EdgeImports { - continue + var reindexBatch []graph.EdgeReindex + + // Pre-build a map of every KindFile node's ID. 
The relative- + // import resolvers below check 1-2 candidate IDs per edge to + // decide whether a target file exists; doing that as a per-edge + // GetNode (a per-edge round-trip on a disk backend) is what made + // this pass dominate disk-backed resolve time. One NodesByKind scan + // materialises the set once at indexed cost; lookups become + // O(1) map hits. + fileIDs := make(map[string]struct{}, 1024) + for n := range r.graph.NodesByKind(graph.KindFile) { + if n != nil && n.ID != "" { + fileIDs[n.ID] = struct{}{} + } + } + resolvePython := func(stem string) string { + if !strings.Contains(stem, "/") { + return "" + } + for _, cand := range []string{stem + ".py", stem + "/__init__.py"} { + if _, ok := fileIDs[cand]; ok { + return cand + } } + return "" + } + resolveDart := func(importingFile, uri string) string { + if uri == "" || strings.HasPrefix(uri, "dart:") || strings.HasPrefix(uri, "package:") { + return "" + } + dir := "" + if i := strings.LastIndex(importingFile, "/"); i >= 0 { + dir = importingFile[:i] + } + target := joinRelativePath(dir, uri) + if target == "" { + return "" + } + if _, ok := fileIDs[target]; ok { + return target + } + return "" + } + + // EdgesByKind pushes the "kind = imports" filter into the store; + // disk backends only enumerate import edges instead of every + // edge in the graph. + for e := range r.graph.EdgesByKind(graph.EdgeImports) { lang, ok := fileLang[e.From] if !ok { continue @@ -38,7 +87,7 @@ func (r *Resolver) resolveRelativeImports() { // Always resolvable via internal-file lookup. 
path = strings.TrimPrefix(e.To, "unresolved::pyrel::") if lang == "python" { - resolved = resolvePythonRelativeImport(r.graph, path) + resolved = resolvePython(path) } case strings.HasPrefix(e.To, "external::"): // Fallthrough path for Dart relative URIs the main @@ -48,9 +97,9 @@ func (r *Resolver) resolveRelativeImports() { path = strings.TrimPrefix(e.To, "external::") switch lang { case "python": - resolved = resolvePythonRelativeImport(r.graph, path) + resolved = resolvePython(path) case "dart": - resolved = resolveDartRelativeImport(r.graph, e.From, path) + resolved = resolveDart(e.From, path) } default: continue @@ -62,57 +111,18 @@ func (r *Resolver) resolveRelativeImports() { if strings.HasPrefix(e.To, "unresolved::pyrel::") { oldTo := e.To e.To = "external::" + path - r.graph.ReindexEdge(e, oldTo) + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) } continue } oldTo := e.To e.To = resolved e.Origin = graph.OriginASTResolved - r.graph.ReindexEdge(e, oldTo) - } -} - -// resolvePythonRelativeImport maps a project-rooted Python file-path -// stem ("app/util", "pkg/sub") to the matching `KindFile` node ID. -// Tries `.py` first, then `/__init__.py` (package). Returns -// "" if no candidate exists in the graph or if `stem` doesn't look like -// a relative-import stem (no slash separator — those are absolute -// module references handled by attributeNonGoModuleImports). -func resolvePythonRelativeImport(g *graph.Graph, stem string) string { - if !strings.Contains(stem, "/") { - return "" - } - for _, cand := range []string{stem + ".py", stem + "/__init__.py"} { - if n := g.GetNode(cand); n != nil && n.Kind == graph.KindFile { - return n.ID - } - } - return "" -} - -// resolveDartRelativeImport joins a relative Dart import URI against -// the importing file's directory and returns the matching `KindFile` -// node ID. 
Paths starting with `dart:` or `package:` are caller- -// validated to belong to the module-attribution pass and are skipped -// here. Returns "" when the resolved path escapes the repo root or -// when the target file is not in the graph. -func resolveDartRelativeImport(g *graph.Graph, importingFile, uri string) string { - if uri == "" || strings.HasPrefix(uri, "dart:") || strings.HasPrefix(uri, "package:") { - return "" - } - dir := "" - if i := strings.LastIndex(importingFile, "/"); i >= 0 { - dir = importingFile[:i] - } - target := joinRelativePath(dir, uri) - if target == "" { - return "" + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) } - if n := g.GetNode(target); n != nil && n.Kind == graph.KindFile { - return n.ID + if len(reindexBatch) > 0 { + r.graph.ReindexEdges(reindexBatch) } - return "" } // joinRelativePath joins a relative URI onto a directory and collapses diff --git a/internal/resolver/resolver.go b/internal/resolver/resolver.go index a99f79c1..b8cb90b3 100644 --- a/internal/resolver/resolver.go +++ b/internal/resolver/resolver.go @@ -1,11 +1,16 @@ package resolver import ( + "iter" "path/filepath" "runtime" "sort" "strings" "sync" + "sync/atomic" + "time" + + "go.uber.org/zap" "github.com/zzet/gortex/internal/graph" ) @@ -35,7 +40,8 @@ type ResolveStats struct { // Indexer.IndexFile) crash the daemon with "concurrent map writes" // in buildDirIndexes. type Resolver struct { - graph *graph.Graph + graph graph.Store + logger *zap.Logger dirIndex map[string][]*graph.Node lastDirIndex map[string][]*graph.Node // providesForIdx maps `provides_for: AbstractName` (from @Module @@ -68,7 +74,7 @@ type Resolver struct { // pass, torn down at the end. depModuleIndex map[string][]depModuleEntry // mu serialises resolution phases against the shared graph. 
- // Pointer so every Resolver built from the same *graph.Graph + // Pointer so every Resolver built from the same graph.Store // locks the same mutex — necessary for MultiIndexer's per-repo // goroutines, each of which spawns its own Resolver instance. // Without the shared lock, concurrent ResolveAll passes race on @@ -76,6 +82,20 @@ type Resolver struct { // goroutine iterates via graph.AllEdges()). mu *sync.Mutex + // lookupCache holds per-pass batched results from GetNodesByIDs / + // FindNodesByNames. Populated by ResolveAll/ResolveFile before + // the worker fan-out and cleared on return. Workers consult these + // maps first; misses fall through to the underlying Store. + // + // Without the cache, the resolver fires ~3-10 store point lookups + // per pending edge — across 10-30k unresolved edges that's 100k+ + // queries, each one a round trip on disk backends (~ms each). + // With the cache the same information lands in two batched + // queries per pass. + nodeByID map[string]*graph.Node + nodesByName map[string][]*graph.Node + nodesByQualName map[string]*graph.Node + // lspHelper, when non-nil, is consulted before falling back to // AST heuristics for cross-file dispatch in languages whose // helper-reported extensions match (today: TS/JS/JSX/TSX via @@ -121,12 +141,50 @@ type depModuleEntry struct { node *graph.Node } -// New creates a Resolver for the given graph. The returned Resolver -// shares graph.ResolveMutex() with every other Resolver built from -// the same Graph, so their ResolveAll / ResolveFile calls serialise -// end-to-end. -func New(g *graph.Graph) *Resolver { - return &Resolver{graph: g, mu: g.ResolveMutex()} +// New creates a Resolver for the given store. The returned Resolver +// shares store.ResolveMutex() with every other Resolver built from +// the same Store, so their ResolveAll / ResolveFile calls serialise +// end-to-end across cross-repo / temporal / external passes. 
+func New(g graph.Store) *Resolver { + return &Resolver{graph: g, mu: g.ResolveMutex(), logger: zap.NewNop()} +} + +// SetLogger attaches a logger so ResolveAll emits pass-progress +// (pending count, periodic compute progress, compute/apply elapsed). +// A nil logger is replaced with a no-op so the resolver never panics +// when constructed without one (every direct caller of New gets Nop). +func (r *Resolver) SetLogger(l *zap.Logger) { + if l == nil { + l = zap.NewNop() + } + r.logger = l +} + +// SetGraph retargets the Resolver at a different Store. The indexer's +// in-memory shadow-swap path needs this: the Resolver is constructed +// against the disk Store at indexer-New time, but during IndexCtx the +// indexer reassigns its own graph pointer to an in-memory shadow. +// Without SetGraph the Resolver kept reading the (empty) disk Store +// and short-circuited on len(pending) == 0, silently disabling every +// resolver pass for backends that opt into the shadow swap. +// +// Holds the resolve mutex so a concurrent ResolveAll / ResolveFile +// can't observe a half-rotated graph reference, and switches mu to +// the new store's resolve mutex so subsequent passes serialise +// against any Resolver built directly on the new Store. +func (r *Resolver) SetGraph(g graph.Store) { + if g == nil { + return + } + oldMu := r.mu + if oldMu != nil { + oldMu.Lock() + } + r.graph = g + r.mu = g.ResolveMutex() + if oldMu != nil { + oldMu.Unlock() + } } // ResolveAll resolves all unresolved edges in the graph. @@ -159,20 +217,79 @@ func (r *Resolver) ResolveAll() *ResolveStats { defer r.clearReachabilityIndex() defer r.clearLSPIndex() - edges := r.graph.AllEdges() - // Pre-filter to the unresolved subset so workers don't burn time - // re-walking the whole edge slice — ~95% of edges in a settled - // graph are already resolved. 
- pending := edges[:0:0] - for _, e := range edges { - if strings.HasPrefix(e.To, unresolvedPrefix) { - pending = append(pending, e) - } + // Backend-delegated resolution: when the store implements + // graph.BackendResolver, drain the bulk-tractable subset of the + // resolver's work via a sequence of queries that run + // inside the backend engine. ON BY DEFAULT — opt out with + // GORTEX_BACKEND_RESOLVER=0 (see backendResolverEnabled). ResolveAllBulk + // chains the per-rule methods (SameFile → SamePackage → ImportAware → …) + // in precision-descending order, so higher-precision rules bind first + // and unique-name fallback only resolves what nothing more specific + // covered. + // + // This is the disk-only / large-repo path: without it the Go worker + // pool's ~100k+ per-edge round trips dominate wall time. The bulk pass + // drains the name-equality-tractable edges in-engine before the Go pool + // runs on whatever's left. Errors are non-fatal — the Go resolver + // re-runs on the remainder. + if backendResolverEnabled() { + if br, ok := r.graph.(graph.BackendResolver); ok { + bulkStart := time.Now() + n, err := br.ResolveAllBulk() + r.logger.Info("resolver: backend bulk pass", + zap.Int("resolved", n), + zap.Duration("elapsed", time.Since(bulkStart)), + zap.Error(err)) + } + } + + // Use the predicate-shaped Store method so disk backends scan + // only the contiguous "unresolved::*" slice instead of pulling + // the whole edges table back to the client and filtering in Go. + // In-memory keeps the same cost as the old AllEdges()+prefix-check + // loop. 
+ var pending []*graph.Edge + for e := range r.graph.EdgesWithUnresolvedTarget() { + pending = append(pending, e) } if len(pending) == 0 { return &ResolveStats{} } + passStart := time.Now() + r.logger.Info("resolver: pass start", + zap.Int("pending", len(pending)), + zap.Bool("backend_bulk", backendResolverEnabled())) + var processed atomic.Int64 + progressDone := make(chan struct{}) + go func() { + t := time.NewTicker(3 * time.Second) + defer t.Stop() + for { + select { + case <-progressDone: + return + case <-t.C: + r.logger.Info("resolver: compute progress", + zap.Int64("processed", processed.Load()), + zap.Int("pending", len(pending)), + zap.Duration("elapsed", time.Since(passStart))) + } + } + }() + + // Pre-warm the per-pass lookup cache. The resolver workers below + // will call store.GetNode for endpoints and store.FindNodesByName + // for resolution candidates — across 10-30k pending edges that's + // 100k+ individual queries on a disk backend + // (hundreds of seconds wall time). Collecting the + // IDs / names upfront and batch-loading them collapses those + // queries to ~10 chunked SELECT IN statements. Cleared on return + // via defer so callers outside ResolveAll see the empty caches and + // fall through to the underlying store on every lookup. + r.warmLookupCache(pending) + defer r.clearLookupCache() + workers := runtime.NumCPU() if workers < 1 { workers = 1 @@ -206,6 +323,7 @@ func (r *Resolver) ResolveAll() *ResolveStats { for _, e := range slice { clone := cloneEdgeForResolve(e) oldTo, changed := r.resolveEdge(clone, ws) + processed.Add(1) if changed { jobs = append(jobs, reindexJob{ edge: e, @@ -228,6 +346,8 @@ func (r *Resolver) ResolveAll() *ResolveStats { }(w, pending[start:end]) } wg.Wait() + close(progressDone) + computeElapsed := time.Since(passStart) // Apply mutations + ReindexEdge serially. 
Mutating e.To inside // a worker would race with the bucket-maintenance reads inside @@ -237,6 +357,18 @@ func (r *Resolver) ResolveAll() *ResolveStats { // the race entirely; it costs ~5% of resolver wall time on a // 12k-edge vscode pass and buys a clean -race run plus simpler // reasoning. + // Collect every mutation across all workers into one slice and hand + // the whole batch to ReindexEdges. Disk-backed stores commit per + // chunk inside the implementation; the in-memory store loops + // through the existing per-edge code. Per-edge ReindexEdge was the + // resolver's bottleneck against bbolt (10k+ ACID round-trips); the + // batch form folds it to ≤(N/5000) commits without changing any + // observable semantics. + totalJobs := 0 + for i := range perWorkerJobs { + totalJobs += len(perWorkerJobs[i]) + } + reindexBatch := make([]graph.EdgeReindex, 0, totalJobs) for i := range perWorkerJobs { for _, j := range perWorkerJobs[i] { j.edge.To = j.newTo @@ -245,9 +377,18 @@ func (r *Resolver) ResolveAll() *ResolveStats { j.edge.Confidence = j.confidence j.edge.Origin = j.origin j.edge.Meta = j.meta - r.graph.ReindexEdge(j.edge, j.oldTo) + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: j.edge, OldTo: j.oldTo}) } } + r.logger.Info("resolver: compute done", + zap.Int("pending", len(pending)), + zap.Int("reindex_batch", len(reindexBatch)), + zap.Duration("elapsed", computeElapsed)) + applyStart := time.Now() + r.graph.ReindexEdges(reindexBatch) + r.logger.Info("resolver: apply done", + zap.Int("edges", len(reindexBatch)), + zap.Duration("elapsed", time.Since(applyStart))) // Cross-package name-match guard. The heuristic fallbacks above can // resolve a call by name alone to a candidate in a package the @@ -263,6 +404,54 @@ func (r *Resolver) ResolveAll() *ResolveStats { } } + // Rebind cross-file Go method receivers onto the canonical type + // node ID. 
The Go extractor builds the EdgeMemberOf target as + // `<method-file>::TypeName` because it parses one file at a time; + // methods declared in files other than the type's defining file + // point at a phantom ID until this pass collapses them onto the + // real `<type-file>::TypeName` node. See rebindGoMethodReceivers + // for the full rationale (InferImplements + find_implementations + // + class_hierarchy correctness all ride on this). + r.rebindGoMethodReceivers() + + // Scope-aware bare-name binding. Walks `unresolved::<name>` edges + // whose source is inside a function and rewrites them onto the + // matching KindLocal / KindParam node when exactly one in-scope + // binding wins under the Go shadowing rules. Without this pass + // the worker-pool fallback would scan FindNodesByName(name) + // across the whole graph and fall through to `unresolved::*` for + // every common identifier (err / data / src / ...). The bind + // uses #77's KindLocal nodes — pre-#77 there was nothing to + // bind to. + r.bindBareNameScopeRefs() + + // Bind in-body references to a function's own generic type + // parameters (`var x T`, `func F[T any]() T { ... }`) onto the + // pre-existing KindGenericParam nodes — without this pass they + // stayed as `unresolved::T` even though the parser had already + // materialised the tparam node. + r.bindGenericParamRefs() + + // Attribute Go language intrinsics (append / len / make / string + // / int / ...) to canonical `builtin::go::*` IDs and materialise + // one KindBuiltin node per unique builtin. Eliminates ~50k of + // the bare-name `unresolved::*` population on a Go-heavy + // codebase and turns the analytics queries that need these + // targets (`find_usages(builtin::go::type::float64)` for + // type-drift analysis) into one-hop lookups. 
+ r.attributeGoBuiltins() + + // Materialise stdlib / dep / external call targets as + // KindFunction nodes with KindModule parents so cross-package + // queries (`find_usages(stdlib::fmt::Sprintf)`, + // `get_callers(dep::github.com/stretchr/testify/assert::True)`, + // "what's our usage surface on encoding/json") become one-hop + // lookups. Must run AFTER resolveExtern (which classifies + // `unresolved::extern::*` into the stdlib/dep/external buckets) + // so we materialise the post-classification state, not the + // pre-classification shape. + r.attributeGoExternalCalls() + // Relative-import resolution for Python and Dart files. Runs // before module attribution so internal-target stems never get // mis-mapped to a phantom pypi/pub package. @@ -301,13 +490,11 @@ func (r *Resolver) ResolveAll() *ResolveStats { // - lastDirIndex keys on the last path component of that directory // so an import of "logger" matches any file under .../logger/. func (r *Resolver) buildDirIndexes() { - nodes := r.graph.AllNodes() - r.dirIndex = make(map[string][]*graph.Node, len(nodes)/4) - r.lastDirIndex = make(map[string][]*graph.Node, len(nodes)/4) - for _, n := range nodes { - if n.Kind != graph.KindFile { - continue - } + r.dirIndex = make(map[string][]*graph.Node, 128) + r.lastDirIndex = make(map[string][]*graph.Node, 128) + // NodesByKind pushes the file-kind filter into the store; disk + // backends iterate just the file nodes instead of every node. + for n := range r.graph.NodesByKind(graph.KindFile) { dir := filepath.Dir(n.FilePath) r.dirIndex[dir] = append(r.dirIndex[dir], n) last := lastPathComponent(dir) @@ -322,6 +509,206 @@ func (r *Resolver) clearDirIndexes() { r.lastDirIndex = nil } +// warmLookupCache batches the per-edge GetNode / FindNodesByName +// queries the worker loop would otherwise fire serially. 
We collect +// every From/To node ID across the pending slice and the bare +// identifier name embedded in each `unresolved::*` target, then issue +// the two batched queries the Store exposes. Workers consult the +// resulting maps via cachedGetNode / cachedFindNodesByName; misses +// fall through to the underlying store. +func (r *Resolver) warmLookupCache(pending []*graph.Edge) { + if len(pending) == 0 { + return + } + idSet := make(map[string]struct{}, len(pending)*2) + nameSet := make(map[string]struct{}, len(pending)) + qualNameSet := make(map[string]struct{}) + for _, e := range pending { + if e == nil { + continue + } + if e.From != "" { + idSet[e.From] = struct{}{} + } + // e.To still carries the "unresolved::" (or multi-repo + // "::unresolved::") prefix. Strip it with + // UnresolvedName, then reduce to the bare identifier the cascade + // resolvers actually look up ("*.m" -> "m", "extern::p::S" -> + // "S"). Seeding the embedded identifier — NOT the raw stub id, + // which matches no node — is what lets the worker's + // cachedFindNodesByName(InRepo) HIT instead of firing one + // FindNodesByName(InRepo) query per edge (the warmup storm). + if name := identifierFromTarget(graph.UnresolvedName(e.To)); name != "" { + nameSet[name] = struct{}{} + } + // Receiver types drive the method/field disambiguation passes + // (receiverIsInterface, same-receiver field/method preference); + // seed them too so those lookups hit the cache (or its + // authoritative negative) instead of falling through to a + // per-edge FindNodesByName. + if rt := edgeReceiverType(e); rt != "" { + nameSet[rt] = struct{}{} + } + // Import targets resolve by qualified name: resolveImport's first + // lookup is GetNodeByQualName(importPath), an unindexed scan per + // import edge on a disk backend. Seed the import path so it hits the + // qual-name cache (or its authoritative negative) instead. 
+ if t := graph.UnresolvedName(e.To); strings.HasPrefix(t, "import::") { + if qn := strings.TrimPrefix(t, "import::"); qn != "" { + qualNameSet[qn] = struct{}{} + } + } + } + ids := make([]string, 0, len(idSet)) + for id := range idSet { + ids = append(ids, id) + } + names := make([]string, 0, len(nameSet)) + for n := range nameSet { + names = append(names, n) + } + r.nodeByID = r.graph.GetNodesByIDs(ids) + r.nodesByName = r.graph.FindNodesByNames(names) + // Authoritative negatives: a name we queried that has NO node in the + // graph (stdlib / external method calls — *.QueryRow, *.Errorf, + // *.Fatalf, *.StringVar, … — dominate the pending set) must be + // recorded as an empty result, not left absent. Absence means "not + // pre-warmed" so the cached lookup falls through to a per-edge + // FindNodesByName scan of the unindexed name column; across 200k+ + // external-method stubs that fall-through IS the warmup hang. + // Backfilling the negative makes the pre-warmed name set + // authoritative — the lookup returns empty without touching the store. + if r.nodesByName == nil { + r.nodesByName = make(map[string][]*graph.Node, len(nameSet)) + } + for n := range nameSet { + if _, ok := r.nodesByName[n]; !ok { + r.nodesByName[n] = nil + } + } + // Fold every candidate node returned by the name lookup into the + // id cache too: when a worker picks a candidate and the + // downstream guard (cross_pkg / cross_repo) calls GetNode on the + // chosen target, the cache should hit instead of falling through + // to a per-id store call. 
+ if r.nodeByID == nil && len(r.nodesByName) > 0 { + r.nodeByID = make(map[string]*graph.Node, len(r.nodesByName)) + } + for _, hits := range r.nodesByName { + for _, n := range hits { + if n == nil || n.ID == "" { + continue + } + if _, ok := r.nodeByID[n.ID]; !ok { + r.nodeByID[n.ID] = n + } + } + } + // Pre-warm the import qual-name cache + record authoritative negatives, + // so resolveImport's GetNodeByQualName hits the cache instead of + // scanning the unindexed qual_name column once per import edge. + if len(qualNameSet) > 0 { + qns := make([]string, 0, len(qualNameSet)) + for q := range qualNameSet { + qns = append(qns, q) + } + r.nodesByQualName = r.graph.GetNodesByQualNames(qns) + if r.nodesByQualName == nil { + r.nodesByQualName = make(map[string]*graph.Node, len(qualNameSet)) + } + for q := range qualNameSet { + if _, ok := r.nodesByQualName[q]; !ok { + r.nodesByQualName[q] = nil + } + } + } +} + +func (r *Resolver) clearLookupCache() { + r.nodeByID = nil + r.nodesByName = nil + r.nodesByQualName = nil +} + +// cachedGetNode returns the node for id, consulting the per-pass +// lookup cache first and falling through to the underlying store on +// miss. The cache is a positive-only fast path — absence means "not +// pre-warmed", not "doesn't exist", so a miss still asks the store. +// Outside a ResolveAll pass the cache is nil and every call goes +// straight to the store. +func (r *Resolver) cachedGetNode(id string) *graph.Node { + if id == "" { + return nil + } + if r.nodeByID != nil { + if n, ok := r.nodeByID[id]; ok { + return n + } + } + return r.graph.GetNode(id) +} + +// cachedFindNodesByName returns the candidates for name, consulting +// the per-pass cache first and falling through to the store on miss. +// Returns the in-cache slice directly when hit — callers MUST treat +// the result as read-only. 
+func (r *Resolver) cachedFindNodesByName(name string) []*graph.Node { + if name == "" { + return nil + } + if r.nodesByName != nil { + if hits, ok := r.nodesByName[name]; ok { + return hits + } + } + return r.graph.FindNodesByName(name) +} + +// cachedGetNodeByQualName serves resolveImport's qual-name lookup from the +// per-pass cache. A pre-warmed qual_name with no node returns nil +// (authoritative negative — most import paths have no matching package +// node, and the unindexed per-edge GetNodeByQualName scan for them was a +// cold-warmup compute storm); a qual_name absent from the cache falls +// through to the store. +func (r *Resolver) cachedGetNodeByQualName(qualName string) *graph.Node { + if qualName == "" { + return nil + } + if r.nodesByQualName != nil { + if n, ok := r.nodesByQualName[qualName]; ok { + return n + } + } + return r.graph.GetNodeByQualName(qualName) +} + +// cachedFindNodesByNameInRepo is the repo-scoped twin of +// cachedFindNodesByName: name-matched candidates whose RepoPrefix == repo, +// served from the per-pass name cache (filtered in Go) so the +// method/function/type/field cascade doesn't fire one +// FindNodesByNameInRepo query per pending edge — the warmup storm that +// the multi-repo prefixed-stub population (100k+ edges) turned into a +// hang. Falls through to the store on a cache miss, preserving +// correctness; the cache is positive-only (absence means "not +// pre-warmed", not "doesn't exist"). 
+func (r *Resolver) cachedFindNodesByNameInRepo(name, repo string) []*graph.Node { + if name == "" { + return nil + } + if r.nodesByName != nil { + if hits, ok := r.nodesByName[name]; ok { + var out []*graph.Node + for _, n := range hits { + if n != nil && n.RepoPrefix == repo { + out = append(out, n) + } + } + return out + } + } + return r.graph.FindNodesByNameInRepo(name, repo) +} + // buildDepModuleIndex collects every dep:: contract node // (one per non-indirect `require` line in a tracked go.mod) and groups // them by the owning repo's prefix so resolveImport can bridge a Go @@ -335,12 +722,8 @@ func (r *Resolver) clearDirIndexes() { // repo — those resolve through the cross-repo file graph instead and // have no module path embedded in the ID. func (r *Resolver) buildDepModuleIndex() { - nodes := r.graph.AllNodes() by := make(map[string][]depModuleEntry) - for _, n := range nodes { - if n.Kind != graph.KindContract { - continue - } + for n := range r.graph.NodesByKind(graph.KindContract) { if !strings.HasPrefix(n.ID, "dep::") { continue } @@ -396,20 +779,24 @@ func (r *Resolver) ResolveFile(filePath string) *ResolveStats { stats := &ResolveStats{} // Get all nodes in the file, then check their outgoing edges. - // Single-threaded path — apply ReindexEdge inline as before. - // Resolved edges are also recorded as jobs so the cross-package - // guard can re-check (and, if needed, revert) the weak-tier ones. + // Single-threaded path — collect mutations into a batch and flush + // in one ReindexEdges call after the file's edges are walked, so a + // per-file ResolveFile pass produces one Tx commit on disk + // backends instead of one per resolved edge. Resolved edges are + // also recorded as jobs so the cross-package guard can re-check + // (and, if needed, revert) the weak-tier ones. 
var jobs []reindexJob + var reindexBatch []graph.EdgeReindex nodes := r.graph.GetFileNodes(filePath) for _, n := range nodes { edges := r.graph.GetOutEdges(n.ID) for _, e := range edges { - if !strings.HasPrefix(e.To, unresolvedPrefix) { + if !graph.IsUnresolvedTarget(e.To) { continue } oldTo, changed := r.resolveEdge(e, stats) if changed { - r.graph.ReindexEdge(e, oldTo) + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) jobs = append(jobs, reindexJob{ edge: e, oldTo: oldTo, @@ -421,6 +808,9 @@ func (r *Resolver) ResolveFile(filePath string) *ResolveStats { } } } + if len(reindexBatch) > 0 { + r.graph.ReindexEdges(reindexBatch) + } // Cross-package name-match guard — same contract as in ResolveAll. if len(jobs) > 0 { @@ -435,9 +825,129 @@ func (r *Resolver) ResolveFile(filePath string) *ResolveStats { } } } + + // Re-run the attribution passes that ResolveAll runs. ResolveFile + // handles incremental updates — a re-parse of one file emits + // fresh `unresolved::` edges that haven't been seen by these + // passes yet, so without re-running them the incremental graph + // diverges from a cold re-index (caught by + // TestIncrementalReindex_ConvergesToFullIndex). Each pass is + // idempotent on already-rewritten edges (the `unresolved::` + // prefix check makes a second sweep a no-op). + r.rebindGoMethodReceivers() + r.bindBareNameScopeRefs() + r.bindGenericParamRefs() + r.attributeGoBuiltins() + r.attributeGoExternalCalls() + + return stats +} + +// ResolveIncomingForFile is the reverse of ResolveFile: instead of +// resolving the file's own OUTGOING references, it binds pending +// `unresolved::` edges in OTHER files that reference a symbol +// (re)defined in this file. After a definition is added or re-indexed, +// callers elsewhere still point at an unresolved stub — either one +// emitted at their own extraction time, or one restubIncomingRefs +// re-created when this file's prior concrete node was evicted. 
This +// rebinds them, scoped to this file's symbol names, so it costs +// O(references to those names), not a whole-graph ResolveAll. It uses +// the same reachability / import gates as ResolveFile (via resolveEdge), +// so an ambiguous name binds no differently and unsafe matches stay +// pending for the periodic ResolveAll. +func (r *Resolver) ResolveIncomingForFile(filePath string) *ResolveStats { + r.mu.Lock() + defer r.mu.Unlock() + + r.buildDirIndexes() + defer r.clearDirIndexes() + r.buildDepModuleIndex() + defer r.clearDepModuleIndex() + r.buildProvidesForIndex() + defer r.clearProvidesForIndex() + r.buildReachabilityIndex() + defer r.clearReachabilityIndex() + defer r.clearLSPIndex() + + stats := &ResolveStats{} + r.resolveIncomingLocked(filePath, stats) return stats } +// resolveIncomingLocked is the core of the reverse pass. Caller holds +// r.mu and has built the per-pass indexes. For each distinct +// referenceable symbol name defined in filePath it looks up the pending +// edges parked under that name's unresolved-stub id — GetInEdges keyed +// by the `unresolved::` target, so no new index is needed: the +// stub id IS the in-edge bucket key — and runs the normal per-edge +// resolution against them. Both the bare and the `::` +// multi-repo stub forms are probed. 
+func (r *Resolver) resolveIncomingLocked(filePath string, stats *ResolveStats) { + defNodes := r.graph.GetFileNodes(filePath) + if len(defNodes) == 0 { + return + } + seen := make(map[string]struct{}, len(defNodes)) + var stubKeys []string + for _, n := range defNodes { + if n == nil || n.Name == "" || !graph.IsReferenceableSymbol(n.Kind) { + continue + } + if _, dup := seen[n.Name]; dup { + continue + } + seen[n.Name] = struct{}{} + stubKeys = append(stubKeys, graph.UnresolvedMarker+n.Name) + if n.RepoPrefix != "" { + stubKeys = append(stubKeys, n.RepoPrefix+"::"+graph.UnresolvedMarker+n.Name) + } + } + if len(stubKeys) == 0 { + return + } + + var reindexBatch []graph.EdgeReindex + var jobs []reindexJob + for _, key := range stubKeys { + for _, e := range r.graph.GetInEdges(key) { + if e == nil || !graph.IsUnresolvedTarget(e.To) { + continue + } + oldTo, changed := r.resolveEdge(e, stats) + if changed { + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) + jobs = append(jobs, reindexJob{ + edge: e, + oldTo: oldTo, + newTo: e.To, + kind: e.Kind, + confidence: e.Confidence, + origin: e.Origin, + }) + } + } + } + if len(reindexBatch) > 0 { + r.graph.ReindexEdges(reindexBatch) + } + + // Same cross-package name-match guard ResolveFile applies: revert a + // weak-tier call edge whose freshly-bound target lives in a package + // the caller never imports. + if len(jobs) > 0 { + if closure := r.buildImportClosure(); len(closure) > 0 { + if guarded := r.guardCrossPackageCallEdges(jobs, closure); guarded > 0 { + if stats.Resolved >= guarded { + stats.Resolved -= guarded + } else { + stats.Resolved = 0 + } + stats.Unresolved += guarded + } + } + } +} + // reindexJob captures the resolved state for an edge whose target // changed during a parallel resolution pass. // @@ -524,7 +1034,18 @@ func releaseResolverClone(clone *graph.Edge) { // ResolveAll). When nothing changed the returned bool is false. 
func (r *Resolver) resolveEdge(e *graph.Edge, stats *ResolveStats) (oldTo string, changed bool) { oldTo = e.To - target := strings.TrimPrefix(e.To, unresolvedPrefix) + // graph.UnresolvedName handles both `unresolved::Name` (legacy) + // and `::unresolved::Name` (multi-repo COPY rewrite). + // strings.TrimPrefix only stripped the bare form, leaving every + // multi-repo edge with target=full-id and no downstream pattern + // match — that was the root cause of find_usages returning zero + // callers across the whole gortex repo. + target := graph.UnresolvedName(e.To) + if target == "" { + // Not an unresolved stub at all — fall through with the raw + // id so the pattern dispatch below sees the original value. + target = strings.TrimPrefix(e.To, unresolvedPrefix) + } // Resolve-time LSP hot-path. Consulted for TS/JS/JSX/TSX files // (and any other languages a future helper claims via @@ -633,7 +1154,7 @@ func (r *Resolver) resolveEdge(e *graph.Edge, stats *ResolveStats) (oldTo string // every CLI-wired command and command-table entry looks // like dead code. if e.Kind == graph.EdgeReads && e.To != before { - if n := r.graph.GetNode(e.To); n != nil && (n.Kind == graph.KindFunction || n.Kind == graph.KindMethod) { + if n := r.cachedGetNode(e.To); n != nil && (n.Kind == graph.KindFunction || n.Kind == graph.KindMethod) { e.Kind = graph.EdgeReferences } } @@ -671,8 +1192,11 @@ func (r *Resolver) resolveExtern(e *graph.Edge, spec string, stats *ResolveStats // Pass 1: does the symbol live in a file under this import path? // Reuse dirIndex populated by buildDirIndexes — no extra scan. + // cachedFindNodesByName lands in the per-pass batch cache for + // the common worker hot path; falls through to the store when + // called outside ResolveAll. 
callerRepo := r.callerRepoPrefix(e) - candidates := r.graph.FindNodesByName(symbol) + candidates := r.cachedFindNodesByName(symbol) for _, c := range candidates { if c.Kind != graph.KindFunction && c.Kind != graph.KindMethod && c.Kind != graph.KindType && c.Kind != graph.KindInterface { continue @@ -702,12 +1226,15 @@ func (r *Resolver) resolveExtern(e *graph.Edge, spec string, stats *ResolveStats // Pass 2: classify the import path. "stdlib::" when the path looks // like a Go stdlib package (no dot in the first segment and not a // known module vendor prefix). "dep::" otherwise. Callers can treat - // both as external for edge-walk purposes. - prefix := "dep::" + // both as external for edge-walk purposes. The stdlib stub carries + // the caller's repo prefix (see internal/graph/stub.go) so two repos + // pinned to different Go SDK versions get distinct fmt::Errorf nodes + // instead of one shared, version-conflated terminal. if isStdlibLike(importPath) { - prefix = "stdlib::" + e.To = graph.StubID(callerRepo, graph.StubKindStdlib, importPath, symbol) + } else { + e.To = "dep::" + importPath + "::" + symbol } - e.To = prefix + importPath + "::" + symbol stats.External++ } @@ -737,7 +1264,7 @@ func (r *Resolver) resolveImport(e *graph.Edge, importPath string, stats *Resolv importPath, npmAliased := rewriteNpmAliasImport(r.npmAlias, e.FilePath, importPath) // Look for a package node with matching qualified name. 
- node := r.graph.GetNodeByQualName(importPath) + node := r.cachedGetNodeByQualName(importPath) if node != nil { e.To = node.ID if callerRepo != "" && node.RepoPrefix != "" && node.RepoPrefix != callerRepo { @@ -805,10 +1332,7 @@ func (r *Resolver) resolveImport(e *graph.Edge, importPath string, stats *Resolv } } } else { - for _, n := range r.graph.AllNodes() { - if n.Kind != graph.KindFile { - continue - } + for n := range r.graph.NodesByKind(graph.KindFile) { dir := filepath.Dir(n.FilePath) if strings.HasSuffix(dir, lastPathComponent(importPath)) || dir == importPath { consider(n) @@ -857,7 +1381,7 @@ func (r *Resolver) resolveImport(e *graph.Edge, importPath string, stats *Resolv // sub-module the importer reached for. if npmAliased { if pkg := npmPackagePrefix(importPath); pkg != "" { - if node := r.graph.GetNodeByQualName(pkg); node != nil { + if node := r.cachedGetNodeByQualName(pkg); node != nil { e.To = node.ID if callerRepo != "" && node.RepoPrefix != "" && node.RepoPrefix != callerRepo { e.CrossRepo = true @@ -875,7 +1399,7 @@ func (r *Resolver) resolveImport(e *graph.Edge, importPath string, stats *Resolv func (r *Resolver) resolveFunctionCall(e *graph.Edge, funcName string, stats *ResolveStats) { callerRepo := r.callerRepoPrefix(e) - candidates := r.graph.FindNodesByNameInRepo(funcName, callerRepo) + candidates := r.cachedFindNodesByNameInRepo(funcName, callerRepo) if len(candidates) == 0 { // No same-repo candidate. A genuine cross-repo callee is left // unresolved here for CrossRepoResolver — which alone carries the @@ -935,7 +1459,7 @@ func (r *Resolver) resolveFunctionCall(e *graph.Edge, funcName string, stats *Re // genuine cross-repo case with import-reachability evidence. 
func (r *Resolver) resolveTypeOrFunc(e *graph.Edge, name string, stats *ResolveStats) { callerRepo := r.callerRepoPrefix(e) - candidates := r.graph.FindNodesByNameInRepo(name, callerRepo) + candidates := r.cachedFindNodesByNameInRepo(name, callerRepo) if len(candidates) == 0 { stats.Unresolved++ return @@ -996,7 +1520,7 @@ func (r *Resolver) resolveTypeRef(e *graph.Edge, name string, stats *ResolveStat // the `*.` and resolve on the bare type name. name = strings.TrimPrefix(name, "*.") callerRepo := r.callerRepoPrefix(e) - candidates := r.graph.FindNodesByNameInRepo(name, callerRepo) + candidates := r.cachedFindNodesByNameInRepo(name, callerRepo) if len(candidates) == 0 { stats.Unresolved++ return @@ -1030,7 +1554,7 @@ func (r *Resolver) resolveTypeRef(e *graph.Edge, name string, stats *ResolveStat // write but the runtime target is actually a method/property). func (r *Resolver) resolveFieldRef(e *graph.Edge, fieldName string, stats *ResolveStats) bool { receiverType := edgeReceiverType(e) - candidates := r.graph.FindNodesByNameInRepo(fieldName, r.callerRepoPrefix(e)) + candidates := r.cachedFindNodesByNameInRepo(fieldName, r.callerRepoPrefix(e)) if len(candidates) == 0 { return false } @@ -1060,7 +1584,7 @@ func (r *Resolver) resolveFieldRef(e *graph.Edge, fieldName string, stats *Resol } // Pass 3: caller is a method on type T, prefer a same-T field. - if callerNode := r.graph.GetNode(e.From); callerNode != nil && callerNode.Kind == graph.KindMethod { + if callerNode := r.cachedGetNode(e.From); callerNode != nil && callerNode.Kind == graph.KindMethod { callerRecv := nodeReceiverType(callerNode) if callerRecv != "" { for _, c := range candidates { @@ -1092,7 +1616,7 @@ func (r *Resolver) resolveMethodCall(e *graph.Edge, methodName string, stats *Re // method call across a repo boundary by name. A cross-repo method // call is left unresolved for CrossRepoResolver, which carries the // import-reachability + workspace-boundary evidence. 
- rawCandidates := r.graph.FindNodesByNameInRepo(methodName, r.callerRepoPrefix(e)) + rawCandidates := r.cachedFindNodesByNameInRepo(methodName, r.callerRepoPrefix(e)) if len(rawCandidates) == 0 { if r.applyBuiltinIfKnown(e, methodName, stats) { return @@ -1183,7 +1707,7 @@ func (r *Resolver) resolveMethodCall(e *graph.Edge, methodName string, stats *Re // If the caller is a method on type X and there's a candidate method on // type X with the same name, prefer it. This handles e.extractFunctions() // where the type env doesn't have a hint for parameter-bound receivers. - callerNode := r.graph.GetNode(e.From) + callerNode := r.cachedGetNode(e.From) if callerNode != nil && callerNode.Kind == graph.KindMethod { callerRecv := nodeReceiverType(callerNode) if callerRecv != "" { @@ -1303,7 +1827,7 @@ func (r *Resolver) receiverIsInterface(receiverType string) bool { if receiverType == "" { return false } - for _, n := range r.graph.FindNodesByName(receiverType) { + for _, n := range r.cachedFindNodesByName(receiverType) { if n.Kind == graph.KindInterface { return true } @@ -1325,7 +1849,7 @@ func (r *Resolver) applyBuiltinIfKnown(e *graph.Edge, methodName string, stats * if !ok { return false } - e.To = "builtin::" + lang + "::" + category + "::" + methodName + e.To = graph.StubID(r.callerRepoPrefix(e), graph.StubKindBuiltin, lang, category, methodName) stats.External++ return true } @@ -1341,7 +1865,7 @@ func (r *Resolver) resolveTokenRef(e *graph.Edge, name string, stats *ResolveSta // repos ("TOKEN", "CONFIG", …); a cross-repo first-candidate pick // is a name-only guess. CrossRepoResolver handles genuine cross-repo // token references. 
- candidates := r.graph.FindNodesByNameInRepo(name, r.callerRepoPrefix(e)) + candidates := r.cachedFindNodesByNameInRepo(name, r.callerRepoPrefix(e)) if len(candidates) == 0 { stats.Unresolved++ return @@ -1372,8 +1896,8 @@ func (r *Resolver) resolveTokenRef(e *graph.Edge, name string, stats *ResolveSta // comparisons that found nothing (vscode has zero NestJS modules). func (r *Resolver) buildProvidesForIndex() { idx := make(map[string]map[string]struct{}) - for _, ed := range r.graph.AllEdges() { - if ed.Kind != graph.EdgeProvides || ed.Meta == nil { + for ed := range r.graph.EdgesByKind(graph.EdgeProvides) { + if ed.Meta == nil { continue } pf, _ := ed.Meta["provides_for"].(string) @@ -1385,8 +1909,8 @@ func (r *Resolver) buildProvidesForIndex() { } to := ed.To var name string - if strings.HasPrefix(to, "unresolved::") { - name = strings.TrimPrefix(to, "unresolved::") + if graph.IsUnresolvedTarget(to) { + name = graph.UnresolvedName(to) } else if cut := strings.LastIndex(to, "::"); cut >= 0 { name = to[cut+2:] } else { @@ -1430,21 +1954,15 @@ func (r *Resolver) buildReachabilityIndex() { } // Seed with each indexed file's own directory. 
- for _, n := range r.graph.AllNodes() { - if n.Kind != graph.KindFile { - continue - } + for n := range r.graph.NodesByKind(graph.KindFile) { addDir(n.ID, filepath.Dir(n.FilePath)) } - for _, e := range r.graph.AllEdges() { - if e.Kind != graph.EdgeImports { - continue - } + for e := range r.graph.EdgesByKind(graph.EdgeImports) { var importedDir string switch { - case strings.HasPrefix(e.To, "unresolved::import::"): - path := strings.TrimPrefix(e.To, "unresolved::import::") + case graph.IsUnresolvedTarget(e.To) && strings.HasPrefix(graph.UnresolvedName(e.To), "import::"): + path := strings.TrimPrefix(graph.UnresolvedName(e.To), "import::") if files := r.dirIndex[path]; len(files) > 0 { importedDir = filepath.Dir(files[0].FilePath) } else if last := lastPathComponent(path); last != "" { @@ -1531,6 +2049,202 @@ func nodeReceiverType(n *graph.Node) string { return "" } +// memberMethodInfosByType returns the storage layer's per-type member +// method projection verbatim. Routed through MemberMethodsByType when +// the backend implements it; falls back to an EdgesByKind + +// per-edge GetNode walk that synthesises matching info rows. +func memberMethodInfosByType(g graph.Store) map[string][]graph.MemberMethodInfo { + if cap, ok := g.(graph.MemberMethodsByType); ok { + return cap.MemberMethodsByType() + } + out := map[string][]graph.MemberMethodInfo{} + for e := range g.EdgesByKind(graph.EdgeMemberOf) { + method := g.GetNode(e.From) + if method == nil || method.Kind != graph.KindMethod { + continue + } + out[e.To] = append(out[e.To], graph.MemberMethodInfo{ + MethodID: method.ID, + Name: method.Name, + FilePath: method.FilePath, + StartLine: method.StartLine, + RepoPrefix: method.RepoPrefix, + }) + } + return out +} + +// edgesByKinds yields every edge whose Kind is in the given set, +// using the EdgesByKindsScanner capability when the backend +// implements it (one query — an IN-list scan) and falling back to a +// chain of per-kind EdgesByKind iterators otherwise. 
+func edgesByKinds(g graph.Store, kinds []graph.EdgeKind) iter.Seq[*graph.Edge] { + if scan, ok := g.(graph.EdgesByKindsScanner); ok { + return scan.EdgesByKinds(kinds) + } + return func(yield func(*graph.Edge) bool) { + for _, k := range kinds { + for e := range g.EdgesByKind(k) { + if !yield(e) { + return + } + } + } + } +} + +// nodesByKindsOrAll returns every node whose Kind is in the given +// set, using the NodesByKindsScanner capability when the backend +// implements it (a single kind-IN scan) and falling back to +// AllNodes + Go-side filter otherwise. +func nodesByKindsOrAll(g graph.Store, kinds ...graph.NodeKind) []*graph.Node { + if scan, ok := g.(graph.NodesByKindsScanner); ok { + return scan.NodesByKinds(kinds) + } + set := make(map[graph.NodeKind]struct{}, len(kinds)) + for _, k := range kinds { + set[k] = struct{}{} + } + var out []*graph.Node + for _, n := range g.AllNodes() { + if n == nil { + continue + } + if _, ok := set[n.Kind]; ok { + out = append(out, n) + } + } + return out +} + +// memberMethodsByType returns typeID → method-name-set for every +// EdgeMemberOf edge whose source is a KindMethod node. Routed through +// the storage layer's MemberMethodsByType capability when the backend +// implements it (one query — a join, server-side), falling back to the +// EdgesByKind + per-edge GetNode loop the resolver used before the +// capability landed. Used by InferImplements (and shaped to match its +// existing map[string]map[string]bool API). 
+func memberMethodsByType(g graph.Store) map[string]map[string]bool { + if cap, ok := g.(graph.MemberMethodsByType); ok { + raw := cap.MemberMethodsByType() + if len(raw) == 0 { + return nil + } + out := make(map[string]map[string]bool, len(raw)) + for typeID, methods := range raw { + set := make(map[string]bool, len(methods)) + for _, m := range methods { + set[m.Name] = true + } + out[typeID] = set + } + return out + } + out := map[string]map[string]bool{} + for e := range g.EdgesByKind(graph.EdgeMemberOf) { + methodNode := g.GetNode(e.From) + if methodNode == nil || methodNode.Kind != graph.KindMethod { + continue + } + if out[e.To] == nil { + out[e.To] = make(map[string]bool) + } + out[e.To][methodNode.Name] = true + } + return out +} + +// memberMethodNodesByType returns typeID → name → method-node for +// every EdgeMemberOf edge whose source is a KindMethod node. Routed +// through the storage layer's MemberMethodsByType capability when the +// backend implements it (the projection ships only the four columns +// the consumer reads — ID / Name / FilePath / StartLine — packed into +// a synthetic *Node that carries no Meta / QualName / Language); falls +// back to the EdgesByKind + per-edge GetNode loop otherwise. Used by +// InferOverrides which keys methods by name and reads ID/FilePath/ +// StartLine off the node when it emits an EdgeOverrides edge. 
+func memberMethodNodesByType(g graph.Store) map[string]map[string]*graph.Node { + if cap, ok := g.(graph.MemberMethodsByType); ok { + raw := cap.MemberMethodsByType() + if len(raw) == 0 { + return nil + } + out := make(map[string]map[string]*graph.Node, len(raw)) + for typeID, methods := range raw { + set := make(map[string]*graph.Node, len(methods)) + for _, m := range methods { + set[m.Name] = &graph.Node{ + ID: m.MethodID, + Kind: graph.KindMethod, + Name: m.Name, + FilePath: m.FilePath, + StartLine: m.StartLine, + RepoPrefix: m.RepoPrefix, + } + } + out[typeID] = set + } + return out + } + out := map[string]map[string]*graph.Node{} + for e := range g.EdgesByKind(graph.EdgeMemberOf) { + method := g.GetNode(e.From) + if method == nil || method.Kind != graph.KindMethod { + continue + } + set := out[e.To] + if set == nil { + set = make(map[string]*graph.Node) + out[e.To] = set + } + set[method.Name] = method + } + return out +} + +// structuralParentEdges returns every EdgeExtends / EdgeImplements / +// EdgeComposes edge whose endpoints are both KindType / KindInterface, +// projected as the (FromID, ToID, Origin) tuples InferOverrides +// consumes. Routed through the storage layer's StructuralParentEdges +// capability when the backend implements it (one query — a join with +// kind filters on both sides — no per-edge GetNode); falls back to +// the AllEdges + per-edge GetNode walk otherwise. 
+func structuralParentEdges(g graph.Store) []graph.StructuralParentEdgeRow { + if cap, ok := g.(graph.StructuralParentEdges); ok { + return cap.StructuralParentEdges() + } + parentKinds := map[graph.EdgeKind]bool{ + graph.EdgeExtends: true, + graph.EdgeImplements: true, + graph.EdgeComposes: true, + } + var out []graph.StructuralParentEdgeRow + for _, e := range g.AllEdges() { + if e == nil || !parentKinds[e.Kind] { + continue + } + from := g.GetNode(e.From) + to := g.GetNode(e.To) + if from == nil || to == nil { + continue + } + if from.Kind != graph.KindType && from.Kind != graph.KindInterface { + continue + } + if to.Kind != graph.KindType && to.Kind != graph.KindInterface { + continue + } + out = append(out, graph.StructuralParentEdgeRow{ + FromID: from.ID, + ToID: to.ID, + FromKind: from.Kind, + ToKind: to.Kind, + Origin: e.Origin, + }) + } + return out +} + // InferImplements detects structural interface satisfaction by comparing // method sets and adds EdgeImplements edges from types to interfaces. // Returns the number of edges added. @@ -1543,11 +2257,7 @@ func (r *Resolver) InferImplements() int { } var ifaces []ifaceInfo - allNodes := r.graph.AllNodes() - for _, n := range allNodes { - if n.Kind != graph.KindInterface { - continue - } + for n := range r.graph.NodesByKind(graph.KindInterface) { if n.Meta == nil { continue } @@ -1580,23 +2290,7 @@ func (r *Resolver) InferImplements() int { } // Step 2: Build map of type ID -> set of method names via EdgeMemberOf edges. 
- typeMethods := make(map[string]map[string]bool) - allEdges := r.graph.AllEdges() - for _, e := range allEdges { - if e.Kind != graph.EdgeMemberOf { - continue - } - // EdgeMemberOf: From=method, To=type - methodNode := r.graph.GetNode(e.From) - if methodNode == nil || methodNode.Kind != graph.KindMethod { - continue - } - typeID := e.To - if typeMethods[typeID] == nil { - typeMethods[typeID] = make(map[string]bool) - } - typeMethods[typeID][methodNode.Name] = true - } + typeMethods := memberMethodsByType(r.graph) // Step 3: For each type, check if its method set satisfies each interface. // @@ -1616,6 +2310,12 @@ func (r *Resolver) InferImplements() int { typeList = append(typeList, tid) } + // Prefetch every type node referenced by EdgeMemberOf in one batch + // before the workers spin up — on disk backends a per-worker + // GetNode(typeID) was an N+1 over cgo that the workers' parallelism + // could not hide. + typeNodes := r.graph.GetNodesByIDs(typeList) + workers := runtime.NumCPU() if workers < 1 { workers = 1 @@ -1645,7 +2345,7 @@ func (r *Resolver) InferImplements() int { var out []pair for _, typeID := range slice { methods := typeMethods[typeID] - typeNode := r.graph.GetNode(typeID) + typeNode := typeNodes[typeID] if typeNode == nil || (typeNode.Kind != graph.KindType && typeNode.Kind != graph.KindInterface) { continue } @@ -1723,22 +2423,7 @@ func (r *Resolver) InferOverrides() int { defer r.mu.Unlock() // Step 1: index methods by their owning type via EdgeMemberOf. 
- typeMembers := make(map[string]map[string]*graph.Node) // typeID → name → method node - for _, e := range r.graph.AllEdges() { - if e.Kind != graph.EdgeMemberOf { - continue - } - method := r.graph.GetNode(e.From) - if method == nil || method.Kind != graph.KindMethod { - continue - } - set := typeMembers[e.To] - if set == nil { - set = make(map[string]*graph.Node) - typeMembers[e.To] = set - } - set[method.Name] = method - } + typeMembers := memberMethodNodesByType(r.graph) // typeID → name → method node if len(typeMembers) == 0 { return 0 } @@ -1747,33 +2432,17 @@ func (r *Resolver) InferOverrides() int { // edge, walk the child's methods and emit EdgeOverrides where the // parent has a same-named method. Skip if the override edge // already exists. - parentKinds := map[graph.EdgeKind]bool{ - graph.EdgeExtends: true, - graph.EdgeImplements: true, - graph.EdgeComposes: true, - } type overridePair struct { from, to *graph.Node origin string } var pending []overridePair - for _, e := range r.graph.AllEdges() { - if !parentKinds[e.Kind] { + for _, row := range structuralParentEdges(r.graph) { + if row.FromID == row.ToID { continue } - child := r.graph.GetNode(e.From) - parent := r.graph.GetNode(e.To) - if child == nil || parent == nil || child.ID == parent.ID { - continue - } - if child.Kind != graph.KindType && child.Kind != graph.KindInterface { - continue - } - if parent.Kind != graph.KindType && parent.Kind != graph.KindInterface { - continue - } - childMethods := typeMembers[child.ID] - parentMethods := typeMembers[parent.ID] + childMethods := typeMembers[row.FromID] + parentMethods := typeMembers[row.ToID] if len(childMethods) == 0 || len(parentMethods) == 0 { continue } @@ -1781,10 +2450,10 @@ func (r *Resolver) InferOverrides() int { // the override edge so blast-radius queries can filter by // min_tier consistently. 
origin := graph.OriginASTInferred - if e.Origin == graph.OriginASTResolved { + if row.Origin == graph.OriginASTResolved { origin = graph.OriginASTResolved - } else if rank := graph.OriginRank(e.Origin); rank >= graph.OriginRank(graph.OriginLSPDispatch) { - origin = e.Origin + } else if rank := graph.OriginRank(row.Origin); rank >= graph.OriginRank(graph.OriginLSPDispatch) { + origin = row.Origin } for name, cm := range childMethods { pm, ok := parentMethods[name] @@ -1796,6 +2465,7 @@ func (r *Resolver) InferOverrides() int { } added := 0 + var provBatch []graph.EdgeProvenanceUpdate for _, p := range pending { // Skip when the edge already exists. dup := false @@ -1803,11 +2473,13 @@ func (r *Resolver) InferOverrides() int { if existing.Kind == graph.EdgeOverrides && existing.To == p.to.ID { dup = true // Upgrade the provenance of the existing override edge - // through SetEdgeProvenance so the identity change is - // counted — a bare existing.Origin write would bypass - // the revision counter. + // through SetEdgeProvenanceBatch so the identity change + // is counted — a bare existing.Origin write would + // bypass the revision counter. Batched so a large + // hierarchy pass commits its provenance bumps in + // chunks on disk backends. if graph.OriginRank(existing.Origin) < graph.OriginRank(p.origin) { - r.graph.SetEdgeProvenance(existing, p.origin) + provBatch = append(provBatch, graph.EdgeProvenanceUpdate{Edge: existing, NewOrigin: p.origin}) } break } @@ -1827,6 +2499,9 @@ func (r *Resolver) InferOverrides() int { }) added++ } + if len(provBatch) > 0 { + r.graph.SetEdgeProvenanceBatch(provBatch) + } return added } @@ -1859,7 +2534,10 @@ func dirMatchesImport(dir, importPath string) bool { // callerRepoPrefix returns the RepoPrefix of the node that owns the edge's From field. 
func (r *Resolver) callerRepoPrefix(e *graph.Edge) string { - fromNode := r.graph.GetNode(e.From) + // cachedGetNode: the pre-warm batch-loads every pending edge's From + // id, so this is a map hit during ResolveAll instead of one GetNode + // query per edge. + fromNode := r.cachedGetNode(e.From) if fromNode != nil { return fromNode.RepoPrefix } diff --git a/internal/resolver/resolver_cache_routing_test.go b/internal/resolver/resolver_cache_routing_test.go new file mode 100644 index 00000000..fe74e621 --- /dev/null +++ b/internal/resolver/resolver_cache_routing_test.go @@ -0,0 +1,50 @@ +package resolver_test + +// Guards the cache-routing fix: during ResolveAll the per-pass name +// cache (warmLookupCache) must serve the method/function/type/field +// cascade, so the worker pool issues ZERO per-edge FindNodesByNameInRepo +// store calls. Before the fix, warmLookupCache seeded names from the raw +// `unresolved::*.` stub id (never stripped), so every cascade +// lookup missed the cache and fell through to a per-edge +// FindNodesByNameInRepo — the warmup storm/hang on the 100k+ multi-repo +// prefixed-stub population. + +import ( + "testing" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/resolver" +) + +// countingStore wraps the in-memory graph and counts the repo-scoped +// per-edge lookup the cascade used to fire once per pending edge. +type countingStore struct { + *graph.Graph + findInRepoCalls int +} + +func (c *countingStore) FindNodesByNameInRepo(name, repo string) []*graph.Node { + c.findInRepoCalls++ + return c.Graph.FindNodesByNameInRepo(name, repo) +} + +func TestResolveAll_Cascade_ServedFromCache_NoPerEdgeLookup(t *testing.T) { + g := graph.New() + cs := &countingStore{Graph: g} + + // A method call (resolveMethodCall path) and a plain function call + // (resolveFunctionCall path) — both went through FindNodesByNameInRepo. 
+ g.AddNode(&graph.Node{ID: "r1/a.go::Caller", Name: "Caller", Kind: graph.KindFunction, FilePath: "r1/a.go", RepoPrefix: "r1"}) + g.AddNode(&graph.Node{ID: "r1/b.go::doThing", Name: "doThing", Kind: graph.KindMethod, FilePath: "r1/b.go", RepoPrefix: "r1", Meta: map[string]any{"receiver": "T"}}) + g.AddNode(&graph.Node{ID: "r1/c.go::helper", Name: "helper", Kind: graph.KindFunction, FilePath: "r1/c.go", RepoPrefix: "r1"}) + g.AddEdge(&graph.Edge{From: "r1/a.go::Caller", To: "unresolved::*.doThing", Kind: graph.EdgeCalls, FilePath: "r1/a.go", Line: 1}) + g.AddEdge(&graph.Edge{From: "r1/a.go::Caller", To: "unresolved::helper", Kind: graph.EdgeCalls, FilePath: "r1/a.go", Line: 2}) + + // graph.Graph is not a BackendResolver, so ResolveAll runs the pure + // Go worker-pool path — exactly the cascade under test. + resolver.New(cs).ResolveAll() + + if cs.findInRepoCalls != 0 { + t.Errorf("cascade issued %d per-edge FindNodesByNameInRepo calls; want 0 (cache should serve them)", cs.findInRepoCalls) + } +} diff --git a/internal/resolver/scope.go b/internal/resolver/scope.go index dc2da97b..2a1c837e 100644 --- a/internal/resolver/scope.go +++ b/internal/resolver/scope.go @@ -150,7 +150,7 @@ func scopeUseAliases(m map[string]any) map[string]string { // because legacy edges may not carry language). Returning nil keeps // the resolver behavior identical for unsupported languages. func (r *Resolver) preferScopeCandidate(e *graph.Edge, name string, candidates []*graph.Node) *graph.Node { - caller := r.graph.GetNode(e.From) + caller := r.cachedGetNode(e.From) if caller == nil { return nil } @@ -197,9 +197,9 @@ func (r *Resolver) preferCStaticCandidate(e *graph.Edge, caller *graph.Node, can // for an unqualified call `foo(a, b)`, if any of a's, b's argument // types name a class in namespace `N`, then `N::foo` is a candidate. // Implementation order: -// 1. Same-namespace function/method match (lexical scope). -// 2. ADL: walk each scope_arg_types entry's namespace. -// 3. 
Fall through to the generic cascade. +// 1. Same-namespace function/method match (lexical scope). +// 2. ADL: walk each scope_arg_types entry's namespace. +// 3. Fall through to the generic cascade. func (r *Resolver) preferCppScopeCandidate(e *graph.Edge, caller *graph.Node, name string, candidates []*graph.Node) *graph.Node { callerNs := scopeMetaString(caller.Meta, MetaScopeNamespace) if callerNs != "" { @@ -453,4 +453,3 @@ func splitQualifiedFunctionName(name string) (ns, base string) { } return "", name } - diff --git a/internal/resolver/scope_test.go b/internal/resolver/scope_test.go index bff68cca..c79b768e 100644 --- a/internal/resolver/scope_test.go +++ b/internal/resolver/scope_test.go @@ -25,7 +25,7 @@ func TestScope_CStaticPreference(t *testing.T) { g.AddNode(&graph.Node{ ID: "pkg/a.c::helper", Kind: graph.KindFunction, Name: "helper", FilePath: "pkg/a.c", Language: "c", - Meta: map[string]any{MetaScopeStatic: true}, + Meta: map[string]any{MetaScopeStatic: true}, }) g.AddNode(&graph.Node{ ID: "pkg/b.c::helper", Kind: graph.KindFunction, Name: "helper", @@ -53,17 +53,17 @@ func TestScope_CppSameNamespacePreference(t *testing.T) { g.AddNode(&graph.Node{ ID: "src/a.cpp::caller", Kind: graph.KindFunction, Name: "caller", FilePath: "src/a.cpp", Language: "cpp", - Meta: map[string]any{MetaScopeNamespace: "app"}, + Meta: map[string]any{MetaScopeNamespace: "app"}, }) g.AddNode(&graph.Node{ ID: "src/a.cpp::helper#app", Kind: graph.KindFunction, Name: "helper", FilePath: "src/a.cpp", Language: "cpp", - Meta: map[string]any{MetaScopeNamespace: "app"}, + Meta: map[string]any{MetaScopeNamespace: "app"}, }) g.AddNode(&graph.Node{ ID: "src/a.cpp::helper#util", Kind: graph.KindFunction, Name: "helper", FilePath: "src/a.cpp", Language: "cpp", - Meta: map[string]any{MetaScopeNamespace: "util"}, + Meta: map[string]any{MetaScopeNamespace: "util"}, }) e := &graph.Edge{ From: "src/a.cpp::caller", To: "unresolved::helper", @@ -89,7 +89,7 @@ func TestScope_CppADLViaArgType(t 
*testing.T) { g.AddNode(&graph.Node{ ID: "src/a.cpp::caller", Kind: graph.KindFunction, Name: "caller", FilePath: "src/a.cpp", Language: "cpp", - Meta: map[string]any{MetaScopeNamespace: "app"}, + Meta: map[string]any{MetaScopeNamespace: "app"}, }) // The only "process" candidate is in namespace `util` — same- // namespace lookup would miss it; ADL via the arg-type hint @@ -97,7 +97,7 @@ func TestScope_CppADLViaArgType(t *testing.T) { g.AddNode(&graph.Node{ ID: "src/b.cpp::process#util", Kind: graph.KindFunction, Name: "process", FilePath: "src/b.cpp", Language: "cpp", - Meta: map[string]any{MetaScopeNamespace: "util"}, + Meta: map[string]any{MetaScopeNamespace: "util"}, }) e := &graph.Edge{ From: "src/a.cpp::caller", To: "unresolved::process", @@ -126,17 +126,17 @@ func TestScope_JavaEnclosingClassPreference(t *testing.T) { g.AddNode(&graph.Node{ ID: "app/User.java::User.save", Kind: graph.KindMethod, Name: "save", FilePath: "app/User.java", Language: "java", - Meta: map[string]any{"receiver": "User", MetaScopeClass: "User"}, + Meta: map[string]any{"receiver": "User", MetaScopeClass: "User"}, }) g.AddNode(&graph.Node{ ID: "app/User.java::User.validate", Kind: graph.KindMethod, Name: "validate", FilePath: "app/User.java", Language: "java", - Meta: map[string]any{"receiver": "User", MetaScopeClass: "User"}, + Meta: map[string]any{"receiver": "User", MetaScopeClass: "User"}, }) g.AddNode(&graph.Node{ ID: "app/Other.java::Other.validate", Kind: graph.KindMethod, Name: "validate", FilePath: "app/Other.java", Language: "java", - Meta: map[string]any{"receiver": "Other", MetaScopeClass: "Other"}, + Meta: map[string]any{"receiver": "Other", MetaScopeClass: "Other"}, }) // User.save() calls validate() unqualified — must bind to User.validate. 
e := &graph.Edge{ @@ -166,23 +166,23 @@ func TestScope_JavaSuperChainWalk(t *testing.T) { g.AddNode(&graph.Node{ ID: "app/Child.java::Child", Kind: graph.KindType, Name: "Child", FilePath: "app/Child.java", Language: "java", - Meta: map[string]any{MetaScopeParentClass: "Base"}, + Meta: map[string]any{MetaScopeParentClass: "Base"}, }) g.AddNode(&graph.Node{ ID: "app/Base.java::Base.helper", Kind: graph.KindMethod, Name: "helper", FilePath: "app/Base.java", Language: "java", - Meta: map[string]any{"receiver": "Base", MetaScopeClass: "Base"}, + Meta: map[string]any{"receiver": "Base", MetaScopeClass: "Base"}, }) g.AddNode(&graph.Node{ ID: "app/Child.java::Child.run", Kind: graph.KindMethod, Name: "run", FilePath: "app/Child.java", Language: "java", - Meta: map[string]any{"receiver": "Child", MetaScopeClass: "Child"}, + Meta: map[string]any{"receiver": "Child", MetaScopeClass: "Child"}, }) // Decoy: another class has a same-name `helper`. g.AddNode(&graph.Node{ ID: "app/Other.java::Other.helper", Kind: graph.KindMethod, Name: "helper", FilePath: "app/Other.java", Language: "java", - Meta: map[string]any{"receiver": "Other", MetaScopeClass: "Other"}, + Meta: map[string]any{"receiver": "Other", MetaScopeClass: "Other"}, }) // Child.run() calls helper() — should walk to Base.helper. 
e := &graph.Edge{ @@ -211,22 +211,22 @@ func TestScope_PhpParentCall(t *testing.T) { g.AddNode(&graph.Node{ ID: "src/Child.php::Child", Kind: graph.KindType, Name: "Child", FilePath: "src/Child.php", Language: "php", - Meta: map[string]any{MetaScopeParentClass: "Base"}, + Meta: map[string]any{MetaScopeParentClass: "Base"}, }) g.AddNode(&graph.Node{ ID: "src/Base.php::Base.handle", Kind: graph.KindMethod, Name: "handle", FilePath: "src/Base.php", Language: "php", - Meta: map[string]any{"receiver": "Base", MetaScopeClass: "Base"}, + Meta: map[string]any{"receiver": "Base", MetaScopeClass: "Base"}, }) g.AddNode(&graph.Node{ ID: "src/Child.php::Child.handle", Kind: graph.KindMethod, Name: "handle", FilePath: "src/Child.php", Language: "php", - Meta: map[string]any{"receiver": "Child", MetaScopeClass: "Child"}, + Meta: map[string]any{"receiver": "Child", MetaScopeClass: "Child"}, }) g.AddNode(&graph.Node{ ID: "src/Other.php::Other.handle", Kind: graph.KindMethod, Name: "handle", FilePath: "src/Other.php", Language: "php", - Meta: map[string]any{"receiver": "Other", MetaScopeClass: "Other"}, + Meta: map[string]any{"receiver": "Other", MetaScopeClass: "Other"}, }) // Child.handle() calls parent::handle() — must bind to Base.handle. 
e := &graph.Edge{ @@ -251,17 +251,17 @@ func TestScope_PhpSelfCall(t *testing.T) { g.AddNode(&graph.Node{ ID: "src/Service.php::Service.boot", Kind: graph.KindMethod, Name: "boot", FilePath: "src/Service.php", Language: "php", - Meta: map[string]any{"receiver": "Service", MetaScopeClass: "Service"}, + Meta: map[string]any{"receiver": "Service", MetaScopeClass: "Service"}, }) g.AddNode(&graph.Node{ ID: "src/Service.php::Service.init", Kind: graph.KindMethod, Name: "init", FilePath: "src/Service.php", Language: "php", - Meta: map[string]any{"receiver": "Service", MetaScopeClass: "Service"}, + Meta: map[string]any{"receiver": "Service", MetaScopeClass: "Service"}, }) g.AddNode(&graph.Node{ ID: "src/Other.php::Other.init", Kind: graph.KindMethod, Name: "init", FilePath: "src/Other.php", Language: "php", - Meta: map[string]any{"receiver": "Other", MetaScopeClass: "Other"}, + Meta: map[string]any{"receiver": "Other", MetaScopeClass: "Other"}, }) e := &graph.Edge{ From: "src/Service.php::Service.boot", To: "unresolved::*.init", @@ -290,7 +290,7 @@ func TestScope_StampedAsScopeResolution(t *testing.T) { g.AddNode(&graph.Node{ ID: "pkg/a.c::helper", Kind: graph.KindFunction, Name: "helper", FilePath: "pkg/a.c", Language: "c", - Meta: map[string]any{MetaScopeStatic: true}, + Meta: map[string]any{MetaScopeStatic: true}, }) g.AddNode(&graph.Node{ ID: "pkg/b.c::helper", Kind: graph.KindFunction, Name: "helper", diff --git a/internal/resolver/temporal_calls.go b/internal/resolver/temporal_calls.go index af4b7ee7..188cc86c 100644 --- a/internal/resolver/temporal_calls.go +++ b/internal/resolver/temporal_calls.go @@ -72,7 +72,7 @@ const ( // // Returns the number of temporal.stub edges pointing at a resolved // handler after the pass. 
-func ResolveTemporalCalls(g *graph.Graph) int { +func ResolveTemporalCalls(g graph.Store) int { if g == nil { return 0 } @@ -87,8 +87,17 @@ func ResolveTemporalCalls(g *graph.Graph) int { defer mu.Unlock() idx := buildTemporalIndex(g) resolved := 0 - for _, e := range g.AllEdges() { - if e == nil || e.Kind != graph.EdgeCalls || e.Meta == nil { + var reindexBatch []graph.EdgeReindex + // First sweep: collect stub edges and the From IDs we need so the + // per-edge GetNode below collapses to one batch lookup. + type stubEdge struct { + edge *graph.Edge + kind, name string + } + var stubs []stubEdge + fromIDSet := map[string]struct{}{} + for e := range g.EdgesByKind(graph.EdgeCalls) { + if e == nil || e.Meta == nil { continue } if v, _ := e.Meta["via"].(string); v != "temporal.stub" { @@ -99,16 +108,28 @@ func ResolveTemporalCalls(g *graph.Graph) int { if kind == "" || name == "" { continue } + stubs = append(stubs, stubEdge{edge: e, kind: kind, name: name}) + if e.From != "" { + fromIDSet[e.From] = struct{}{} + } + } + fromList := make([]string, 0, len(fromIDSet)) + for id := range fromIDSet { + fromList = append(fromList, id) + } + callerNodes := g.GetNodesByIDs(fromList) + for _, s := range stubs { + e := s.edge callerRepo := "" - if from := g.GetNode(e.From); from != nil { + if from := callerNodes[e.From]; from != nil { callerRepo = from.RepoPrefix } - handlerID, origin, conf := idx.lookup(kind, name, callerRepo) + handlerID, origin, conf := idx.lookup(s.kind, s.name, callerRepo) want := handlerID if want == "" { - want = temporalStubPlaceholder(kind, name) + want = temporalStubPlaceholder(s.kind, s.name) } if e.To == want { if handlerID != "" { @@ -131,7 +152,10 @@ func ResolveTemporalCalls(g *graph.Graph) int { e.ConfidenceLabel = "" delete(e.Meta, "temporal_resolution") } - g.ReindexEdge(e, oldTo) + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) + } + if len(reindexBatch) > 0 { + g.ReindexEdges(reindexBatch) } return resolved } 
@@ -177,12 +201,24 @@ func (idx *temporalIndex) lookup(kind, name, callerRepo string) (id, origin stri // `@WorkflowInterface` annotations (propagated to interface // implementors), and (b) returns a name index the stub-call resolver // consults. -func buildTemporalIndex(g *graph.Graph) *temporalIndex { +func buildTemporalIndex(g graph.Store) *temporalIndex { idx := &temporalIndex{byKindName: map[string][]*graph.Node{}} // Phase 1 — Go side. Walk `temporal.register` edges and stamp the - // registered function's node. - for _, e := range g.AllEdges() { + // registered function's node. The "via" tag lives on EdgeCalls + // edges, so narrow with EdgesByKind before the Meta filter. + // + // Collect every register edge first so we can batch-fetch every + // caller node and resolve every Go target name in one pair of + // round-trips, instead of N AllNodes scans + N GetNode calls. + type goRegister struct { + edge *graph.Edge + kind, name string + } + var goRegisters []goRegister + registerCallerIDs := map[string]struct{}{} + registerNames := map[string]struct{}{} + for e := range g.EdgesByKind(graph.EdgeCalls) { if e == nil || e.Meta == nil { continue } @@ -194,48 +230,85 @@ func buildTemporalIndex(g *graph.Graph) *temporalIndex { if kind == "" || name == "" { continue } - caller := g.GetNode(e.From) + goRegisters = append(goRegisters, goRegister{edge: e, kind: kind, name: name}) + if e.From != "" { + registerCallerIDs[e.From] = struct{}{} + } + registerNames[name] = struct{}{} + } + callerList := make([]string, 0, len(registerCallerIDs)) + for id := range registerCallerIDs { + callerList = append(callerList, id) + } + registerCallers := g.GetNodesByIDs(callerList) + nameList := make([]string, 0, len(registerNames)) + for n := range registerNames { + nameList = append(nameList, n) + } + candidatesByName := g.FindNodesByNames(nameList) + + for _, r := range goRegisters { + caller := registerCallers[r.edge.From] if caller == nil { continue } - target := 
findGoTemporalTarget(g, caller, name) + target := pickGoTemporalTarget(candidatesByName[r.name], caller) if target == nil { continue } - stampTemporalRole(target, kind, name) - idx.byKindName[kind+"::"+name] = append(idx.byKindName[kind+"::"+name], target) + stampTemporalRole(g, target, r.kind, r.name) + idx.byKindName[r.kind+"::"+r.name] = append(idx.byKindName[r.kind+"::"+r.name], target) } // Phase 2 — Java side. Walk `EdgeAnnotated` edges to find - // temporal-tagged interfaces and methods. - type javaIfaceTag struct { - ifaceID string - role string // "activity_interface" / "workflow_interface" + // temporal-tagged interfaces and methods. As with Phase 1, collect + // every annotation edge and batch the From-side GetNode calls. + type javaAnno struct { + fromID string + ifaceRole, methodRole string } - var javaIfaces []javaIfaceTag - for _, e := range g.AllEdges() { - if e == nil || e.Kind != graph.EdgeAnnotated { + var javaAnnos []javaAnno + annoFromIDs := map[string]struct{}{} + for e := range g.EdgesByKind(graph.EdgeAnnotated) { + if e == nil { continue } role, methodRole := temporalRoleForJavaAnnotation(e.To) if role == "" && methodRole == "" { continue } - from := g.GetNode(e.From) + javaAnnos = append(javaAnnos, javaAnno{fromID: e.From, ifaceRole: role, methodRole: methodRole}) + if e.From != "" { + annoFromIDs[e.From] = struct{}{} + } + } + annoFromList := make([]string, 0, len(annoFromIDs)) + for id := range annoFromIDs { + annoFromList = append(annoFromList, id) + } + annoFromNodes := g.GetNodesByIDs(annoFromList) + + type javaIfaceTag struct { + ifaceID string + role string // "activity_interface" / "workflow_interface" + } + var javaIfaces []javaIfaceTag + for _, a := range javaAnnos { + from := annoFromNodes[a.fromID] if from == nil { continue } // Method-level annotation: stamp directly. 
- if methodRole != "" && (from.Kind == graph.KindMethod || from.Kind == graph.KindFunction) { - stampTemporalRole(from, methodRole, from.Name) - idx.byKindName[normaliseTemporalKind(methodRole)+"::"+from.Name] = append( - idx.byKindName[normaliseTemporalKind(methodRole)+"::"+from.Name], from) + if a.methodRole != "" && (from.Kind == graph.KindMethod || from.Kind == graph.KindFunction) { + stampTemporalRole(g, from, a.methodRole, from.Name) + idx.byKindName[normaliseTemporalKind(a.methodRole)+"::"+from.Name] = append( + idx.byKindName[normaliseTemporalKind(a.methodRole)+"::"+from.Name], from) continue } // Interface-level annotation: queue for the propagation pass. - if role != "" && from.Kind == graph.KindInterface { - stampTemporalRole(from, role, from.Name) - javaIfaces = append(javaIfaces, javaIfaceTag{ifaceID: from.ID, role: role}) + if a.ifaceRole != "" && from.Kind == graph.KindInterface { + stampTemporalRole(g, from, a.ifaceRole, from.Name) + javaIfaces = append(javaIfaces, javaIfaceTag{ifaceID: from.ID, role: a.ifaceRole}) } } @@ -243,14 +316,57 @@ func buildTemporalIndex(g *graph.Graph) *temporalIndex { // methods (flat nodes living in the same file, within the // interface's line range) and stamp them. Then walk EdgeImplements // from each implementor and tag its same-named methods. + // + // Build a single Java method index up front via NodesByKind, then + // project it into the two views the propagation needs: + // - methodsByFile: file path → []*method (used for interface + // methods, which the Java extractor emits as flat + // :: nodes whose StartLine sits inside the + // interface's line range). + // - methodsByReceiver: receiver class name → []*method (used for + // impl-class methods, which carry Meta["receiver"]). + // One pass beats AllNodes() per interface. 
+ javaMethodsByFile, javaMethodsByReceiver := buildJavaMethodViews(g, len(javaIfaces)) + + // Prefetch the interface nodes + the implementing-type nodes for + // the entire iface set so the propagation loop never issues an + // inline GetNode. + ifaceIDs := make([]string, 0, len(javaIfaces)) + for _, t := range javaIfaces { + ifaceIDs = append(ifaceIDs, t.ifaceID) + } + ifaceNodes := g.GetNodesByIDs(ifaceIDs) + implTypeIDSet := map[string]struct{}{} + implIDsByIface := map[string][]string{} + for _, t := range javaIfaces { + for _, ie := range g.GetInEdges(t.ifaceID) { + if ie == nil || ie.Kind != graph.EdgeImplements { + continue + } + implIDsByIface[t.ifaceID] = append(implIDsByIface[t.ifaceID], ie.From) + if ie.From != "" { + implTypeIDSet[ie.From] = struct{}{} + } + } + } + implTypeIDList := make([]string, 0, len(implTypeIDSet)) + for id := range implTypeIDSet { + implTypeIDList = append(implTypeIDList, id) + } + implTypeNodes := g.GetNodesByIDs(implTypeIDList) + for _, t := range javaIfaces { methodRole := "activity" if t.role == "workflow_interface" { methodRole = "workflow" } - ifaceMethods := collectJavaInterfaceMethods(g, t.ifaceID) + iface := ifaceNodes[t.ifaceID] + if iface == nil { + continue + } + ifaceMethods := collectJavaInterfaceMethodsFromIndex(iface, javaMethodsByFile) for _, m := range ifaceMethods { - stampTemporalRole(m, methodRole, m.Name) + stampTemporalRole(g, m, methodRole, m.Name) idx.byKindName[methodRole+"::"+m.Name] = append(idx.byKindName[methodRole+"::"+m.Name], m) } // Propagate to implementing classes' methods. 
@@ -258,19 +374,16 @@ func buildTemporalIndex(g *graph.Graph) *temporalIndex { for _, m := range ifaceMethods { implMethodNames[m.Name] = struct{}{} } - for _, ie := range g.GetInEdges(t.ifaceID) { - if ie == nil || ie.Kind != graph.EdgeImplements { - continue - } - implType := g.GetNode(ie.From) + for _, implTypeID := range implIDsByIface[t.ifaceID] { + implType := implTypeNodes[implTypeID] if implType == nil { continue } - for _, m := range methodsOfJavaType(g, implType) { + for _, m := range methodsOfJavaTypeFromIndex(implType, javaMethodsByReceiver) { if _, ok := implMethodNames[m.Name]; !ok { continue } - stampTemporalRole(m, methodRole, m.Name) + stampTemporalRole(g, m, methodRole, m.Name) idx.byKindName[methodRole+"::"+m.Name] = append(idx.byKindName[methodRole+"::"+m.Name], m) } } @@ -319,7 +432,7 @@ func normaliseTemporalKind(role string) string { // a previously-stamped node is re-stamped with a different role the // new role wins (the resolver runs as a full recompute, so this lets // the latest registration take precedence). -func stampTemporalRole(n *graph.Node, role, name string) { +func stampTemporalRole(g graph.Store, n *graph.Node, role, name string) { if n == nil || role == "" { return } @@ -330,22 +443,36 @@ func stampTemporalRole(n *graph.Node, role, name string) { if name != "" { n.Meta["temporal_name"] = name } + // Round-trip the stamp back through the store. On the in-memory + // backend n is canonical so this is an idempotent re-insert; on disk + // backends n is a per-call GetNode/AllNodes reconstruction, + // so without the write-back temporal_role/temporal_name would be + // discarded the moment this pass returns. ResolveTemporalCalls runs + // from RunGlobalGraphPasses, which can execute after the bulk-load + // buffer is flushed, so the in-place mutation is not otherwise + // captured. Matches reach / coverage / blame / releases / churn. 
+ g.AddNode(n) } -// findGoTemporalTarget locates the Go function or method that a -// `worker.Register*(F)` call refers to. The register call lives at -// `caller` (typically `main` or a worker setup function); the function -// `F` is either declared in the same file or imported. The search -// order is: +// pickGoTemporalTarget selects the Go function or method that a +// `worker.Register*(F)` call refers to from a name-matched candidate +// set. The register call lives at `caller`; the function `F` is +// either declared in the same file or imported. The search order is: // // 1. Same-file function whose name matches. // 2. Same-repo function whose name matches. // 3. Unique workspace-wide function whose name matches. // -// Returns nil when no unambiguous match exists. -func findGoTemporalTarget(g *graph.Graph, caller *graph.Node, name string) *graph.Node { +// Returns nil when no unambiguous match exists. The candidate list +// MUST be pre-filtered to Name == registered name (FindNodesByNames +// already does that); this helper applies the Go-kind and language +// gates plus the locality tie-break. +func pickGoTemporalTarget(candidates []*graph.Node, caller *graph.Node) *graph.Node { + if caller == nil { + return nil + } var sameFile, sameRepo, all []*graph.Node - for _, n := range g.AllNodes() { + for _, n := range candidates { if n == nil { continue } @@ -355,9 +482,6 @@ func findGoTemporalTarget(g *graph.Graph, caller *graph.Node, name string) *grap if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { continue } - if n.Name != name { - continue - } all = append(all, n) if caller.RepoPrefix != "" && n.RepoPrefix == caller.RepoPrefix { sameRepo = append(sameRepo, n) @@ -378,28 +502,47 @@ func findGoTemporalTarget(g *graph.Graph, caller *graph.Node, name string) *grap return nil } -// collectJavaInterfaceMethods returns the interface's method nodes. 
-// The Java extractor emits interface methods as flat -// `::` nodes (no class-membership edge), -// distinguished from class methods by the absence of a "receiver" -// Meta. We narrow to the interface's source-line range so multiple -// interfaces in one file don't bleed into each other. -func collectJavaInterfaceMethods(g *graph.Graph, ifaceID string) []*graph.Node { - iface := g.GetNode(ifaceID) - if iface == nil { - return nil +// buildJavaMethodViews materialises two indexes over every Java +// method node in the graph: methodsByFile groups nodes whose Meta has +// NO "receiver" (interface methods, per the Java extractor's +// convention); methodsByReceiver groups nodes whose Meta carries a +// non-empty receiver. One NodesByKind scan replaces the N AllNodes() +// passes the old collectJavaInterfaceMethods + methodsOfJavaType +// helpers ran inside the per-interface propagation loop. +// +// ifaceCount == 0 is a fast no-op; with no tagged interfaces the +// indexes are unused so we skip the scan. 
+func buildJavaMethodViews(g graph.Store, ifaceCount int) (map[string][]*graph.Node, map[string][]*graph.Node) { + if ifaceCount == 0 { + return nil, nil } - var out []*graph.Node - for _, n := range g.AllNodes() { - if n == nil || n.Kind != graph.KindMethod || n.Language != "java" { + methodsByFile := map[string][]*graph.Node{} + methodsByReceiver := map[string][]*graph.Node{} + for n := range g.NodesByKind(graph.KindMethod) { + if n == nil || n.Language != "java" { continue } - if n.FilePath != iface.FilePath { - continue - } - if _, hasReceiver := n.Meta["receiver"]; hasReceiver { - continue + recv, _ := n.Meta["receiver"].(string) + if recv == "" { + methodsByFile[n.FilePath] = append(methodsByFile[n.FilePath], n) + } else { + methodsByReceiver[recv] = append(methodsByReceiver[recv], n) } + } + return methodsByFile, methodsByReceiver +} + +// collectJavaInterfaceMethodsFromIndex returns the interface's method +// nodes — flat KindMethod nodes in the interface's file whose +// StartLine sits inside the interface's line range. Consumes the +// methodsByFile view built by buildJavaMethodViews so the scan is +// O(methods in this file) rather than O(every node). +func collectJavaInterfaceMethodsFromIndex(iface *graph.Node, methodsByFile map[string][]*graph.Node) []*graph.Node { + if iface == nil { + return nil + } + var out []*graph.Node + for _, n := range methodsByFile[iface.FilePath] { if n.StartLine < iface.StartLine || (iface.EndLine > 0 && n.StartLine > iface.EndLine) { continue } @@ -408,27 +551,28 @@ func collectJavaInterfaceMethods(g *graph.Graph, ifaceID string) []*graph.Node { return out } -// methodsOfJavaType returns the method nodes of a Java class — i.e. -// every KindMethod node whose Meta["receiver"] matches the type name. -// The Java extractor uses the receiver field for class membership. 
-func methodsOfJavaType(g *graph.Graph, t *graph.Node) []*graph.Node { +// methodsOfJavaTypeFromIndex returns the method nodes whose +// Meta["receiver"] matches the type's name (or the receiver-suffix +// shape on the class node's ID). Consumes the methodsByReceiver view +// built by buildJavaMethodViews so the scan is O(methods of this +// receiver) rather than O(every node). +func methodsOfJavaTypeFromIndex(t *graph.Node, methodsByReceiver map[string][]*graph.Node) []*graph.Node { if t == nil { return nil } - var out []*graph.Node - for _, n := range g.AllNodes() { - if n == nil || n.Kind != graph.KindMethod || n.Language != "java" { + out := methodsByReceiver[t.Name] + // Honour the legacy id-suffix tie-break: a class node's id is + // `::`; a method whose receiver matches that + // trailing component is still a member even when the receiver + // Meta carries a fully-qualified name. + for recv, candidates := range methodsByReceiver { + if recv == t.Name { continue } - recv, _ := n.Meta["receiver"].(string) - if recv == "" { + if !strings.HasSuffix(t.ID, "::"+recv) { continue } - // Java method node receiver is the class name; the class node's - // ID shape is `::` so match by suffix. - if recv == t.Name || strings.HasSuffix(t.ID, "::"+recv) { - out = append(out, n) - } + out = append(out, candidates...) } return out } diff --git a/internal/resolver/temporal_calls_test.go b/internal/resolver/temporal_calls_test.go index 7e2c4a9c..82c7922d 100644 --- a/internal/resolver/temporal_calls_test.go +++ b/internal/resolver/temporal_calls_test.go @@ -14,7 +14,7 @@ import ( // either a Go register-call edge or a Java @ActivityInterface + // EdgeImplements chain that names the activity. 
type temporalTestGraph struct { - g *graph.Graph + g graph.Store } func newTemporalTestGraph() *temporalTestGraph { return &temporalTestGraph{g: graph.New()} } diff --git a/internal/search/hybrid.go b/internal/search/hybrid.go index 13171e4b..61f63899 100644 --- a/internal/search/hybrid.go +++ b/internal/search/hybrid.go @@ -70,7 +70,7 @@ func (h *HybridBackend) Remove(id string) { // for natural-language queries (where semantic similarity catches // synonymous wording). func (h *HybridBackend) Search(query string, limit int) []SearchResult { - textResults, vecIDs := h.searchChannels(query, limit) + textResults, vecIDs, _ := h.searchChannels(query, limit) if len(vecIDs) == 0 { if len(textResults) > limit { return textResults[:limit] @@ -89,17 +89,93 @@ func (h *HybridBackend) Search(query string, limit int) []SearchResult { // contribute as a separate Signal instead of being collapsed into a // single RRF score upstream of the rerank. func (h *HybridBackend) SearchChannels(query string, limit int) (textResults []SearchResult, vectorIDs []string) { + textResults, vectorIDs, _ = h.searchChannels(query, limit) + return textResults, vectorIDs +} + +// ChannelTimings carries per-phase wall-clock numbers from one +// SearchChannelsTimed call. Zero fields = phase didn't run (e.g. +// VectorSearchMS=0 when the vector index is empty). +type ChannelTimings struct { + TextMS int64 + EmbedMS int64 + VectorSearchMS int64 +} + +// VectorChannelOnly returns the vector-channel IDs (embedder + ANN +// search) WITHOUT re-running the text BM25 path. Used by the engine +// when the text channel has already been satisfied via the bundle +// path — the bundle returns Nodes + edges + scores already, so +// re-running text Search would double-pay the FTS cost. Returns +// nil and a zero ChannelTimings when the vector index is empty. 
+func (h *HybridBackend) VectorChannelOnly(query string, limit int) ([]string, ChannelTimings) { + var stats ChannelTimings + if h == nil || h.vector == nil || h.vector.Count() == 0 { + return nil, stats + } + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + embedStart := time.Now() + queryVec, err := h.embedder.Embed(ctx, query) + stats.EmbedMS = time.Since(embedStart).Milliseconds() + if err != nil || queryVec == nil { + return nil, stats + } + fetch := limit * 2 + if h.vector.HasChunks() { + fetch = limit * 8 + } + vecStart := time.Now() + rawVecIDs := h.vector.Search(queryVec, fetch) + stats.VectorSearchMS = time.Since(vecStart).Milliseconds() + return h.dechunkVectorIDs(rawVecIDs, limit*2), stats +} + +// SearchChannelsTimed is SearchChannels with a per-phase timing +// breakdown so callers can prove which sub-step (text BM25 vs +// vector embed vs vector ANN) actually cost wall-clock time. +// Used by the MCP search_symbols handler's debug-log +// instrumentation; production callers that don't care just use +// SearchChannels. +func (h *HybridBackend) SearchChannelsTimed(query string, limit int) ([]SearchResult, []string, ChannelTimings) { return h.searchChannels(query, limit) } -func (h *HybridBackend) searchChannels(query string, limit int) ([]SearchResult, []string) { +// SearchSymbolBundles forwards to the text backend's bundle path when +// it implements SymbolBundleSearcherBackend. The vector channel does +// not participate — its IDs ride out through SearchChannels/Timed as +// before and the engine merges them with the bundle set. Returns nil +// when the text backend has no bundle support (no-op for the +// fallback path). 
+// +// HybridBackend wires both channels together in production, so the +// engine's bundle-detection step type-asserts on the outer +// HybridBackend through Swappable; this is what makes the bundle +// path available when the daemon's search is the BM25 + vector +// stack instead of a bare SymbolSearcherBackend. +func (h *HybridBackend) SearchSymbolBundles(query string, limit int) []SymbolBundle { + if h == nil || h.text == nil { + return nil + } + if bs, ok := h.text.(SymbolBundleSearcherBackend); ok { + return bs.SearchSymbolBundles(query, limit) + } + return nil +} + +func (h *HybridBackend) searchChannels(query string, limit int) ([]SearchResult, []string, ChannelTimings) { + var stats ChannelTimings + tStart := time.Now() textResults := h.text.Search(query, limit*2) + stats.TextMS = time.Since(tStart).Milliseconds() var vecIDs []string if h.vector.Count() > 0 { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() + embedStart := time.Now() queryVec, err := h.embedder.Embed(ctx, query) + stats.EmbedMS = time.Since(embedStart).Milliseconds() if err == nil && queryVec != nil { // When symbols are sub-chunked, one symbol owns several // vectors, so a fixed top-k under-counts distinct symbols. 
@@ -108,10 +184,13 @@ func (h *HybridBackend) searchChannels(query string, limit int) ([]SearchResult, if h.vector.HasChunks() { fetch = limit * 8 } - vecIDs = h.dechunkVectorIDs(h.vector.Search(queryVec, fetch), limit*2) + vecStart := time.Now() + rawVecIDs := h.vector.Search(queryVec, fetch) + stats.VectorSearchMS = time.Since(vecStart).Milliseconds() + vecIDs = h.dechunkVectorIDs(rawVecIDs, limit*2) } } - return textResults, vecIDs + return textResults, vecIDs, stats } // dechunkVectorIDs maps raw vector-search hits — which may be synthetic diff --git a/internal/search/rerank/context.go b/internal/search/rerank/context.go index 74426148..e28877d8 100644 --- a/internal/search/rerank/context.go +++ b/internal/search/rerank/context.go @@ -121,6 +121,131 @@ type Context struct { // runs once per file rather than once per candidate. Bounded by // the candidate set's file count. pathPenaltyCache map[string]float64 + + // outEdgeCache / inEdgeCache hold the per-candidate edge slices + // fetched in one batched round-trip from Graph at prepare() time. + // FanInSignal / FanOutSignal / MinHashSignal read from these + // instead of calling Graph.GetIn/OutEdges per-candidate, which on + // a disk backend collapses ~6N per-search round-trips + // (~150 calls × 14ms ≈ 2 s) into 2. Empty when Graph is nil. + // Callers must use the inEdges / outEdges accessors so signals + // stay graph-agnostic. + outEdgeCache map[string][]*graph.Edge + inEdgeCache map[string][]*graph.Edge + + // preparedCands is the candidate slice identity prepare() was last + // called against. Pipeline.Rerank skips re-prepare when the same + // slice header is seen back-to-back so callers that pre-call + // Prepare for per-phase timing do not pay for it twice. The check + // is identity-only (same slice, same length) — any mutation that + // reallocates resets it. 
+ preparedCands []*Candidate + + // cachePreSeeded is the caller's promise (via SeedEdgeCaches with + // preSeeded=true) that outEdgeCache / inEdgeCache already cover + // the candidate set the next Prepare call will see. When set, + // prepare() skips the batched edge fetch entirely — the bundle + // path's edges are authoritative and a second fetch is pure + // overhead. Reset by the caller (typically the engine, after each + // Search) to keep the flag from leaking across reranks. + cachePreSeeded bool +} + +// Prepare populates the internal scratch fields used by every signal +// once per Rerank call. Exposed so callers that want to time prepare +// separately (the search hot path) can call it explicitly; in that +// case the subsequent Rerank call detects the prepared state and +// skips the duplicate work. Safe to call multiple times against the +// same slice — it's a full reset on each call. +func (c *Context) Prepare(cands []*Candidate) { c.prepare(cands) } + +// SeedEdgeCaches installs pre-fetched in/out edge maps the caller +// already gathered (today: from the SymbolBundleSearcherBackend hot +// path). The maps are merged into the context — IDs already in the +// cache keep their existing entry, new IDs append. The accompanying +// flag tells prepare() the caches are authoritative for the +// candidate set so it can skip its own batched edge fetch on the +// next Prepare call. +// +// IDs missing from the caller's bundle (vector-channel hits, fallback +// substring matches) still get fetched the slow per-candidate way +// through the outEdges / inEdges accessors when a signal asks for +// them — the seed is a best-effort fast path, not a contract that +// every candidate's edges are present. Callers MUST set +// cachePreSeeded only when the seed covers the expected candidate set +// (i.e. when the bundle backend returned a result for every BM25 +// hit in the merged candidate slice). 
+func (c *Context) SeedEdgeCaches(inEdges, outEdges map[string][]*graph.Edge, preSeeded bool) { + if c.outEdgeCache == nil { + c.outEdgeCache = make(map[string][]*graph.Edge, len(outEdges)) + } + for id, es := range outEdges { + if _, dup := c.outEdgeCache[id]; dup { + continue + } + c.outEdgeCache[id] = es + } + if c.inEdgeCache == nil { + c.inEdgeCache = make(map[string][]*graph.Edge, len(inEdges)) + } + for id, es := range inEdges { + if _, dup := c.inEdgeCache[id]; dup { + continue + } + c.inEdgeCache[id] = es + } + if preSeeded { + c.cachePreSeeded = true + } +} + +// CachePreSeeded reports whether the caller has signaled (via +// SeedEdgeCaches with preSeeded=true) that the edge caches cover the +// candidate set the next Prepare call will see. Exposed so the +// MCP handler can report a cache-hit-rate / cache-pre-seeded boolean +// in its debug log without grepping internal state. +func (c *Context) CachePreSeeded() bool { return c.cachePreSeeded } + +// InheritEdgeCacheFrom shares the source context's edge caches + +// cachePreSeeded flag onto c. Used by the engine to give per-call +// inner reranks access to the handler-built bundle cache without +// inheriting the handler's session-aware signals (locality, combo, +// frecency, feedback). Cheap pointer-copy of the map references; the +// inner rerank's prepare() reads through them and any backfills it +// triggers land in the SHARED map so subsequent calls benefit. A nil +// src (or nil receiver) is a no-op. +func (c *Context) InheritEdgeCacheFrom(src *Context) { + if c == nil || src == nil { + return + } + c.outEdgeCache = src.outEdgeCache + c.inEdgeCache = src.inEdgeCache + c.cachePreSeeded = src.cachePreSeeded +} + +// EdgeCacheHitRate reports the fraction of nodeIDs that have an entry +// in BOTH the in- and out-edge caches. 0.0 when the caches are empty; 1.0 when +// every input id has a cache entry on both sides. Used by the +// MCP handler to surface "did the bundle path actually catch?"
on +// the search_symbols debug log without exposing internal state. +func (c *Context) EdgeCacheHitRate(ids []string) float64 { + if len(ids) == 0 { + return 0 + } + hits := 0 + for _, id := range ids { + // An id counts as a hit if BOTH the in-edge cache and the + // out-edge cache have an entry for it — that's the contract + // the bundle pre-seed promises. A half-seeded id (only one + // side cached) is a near-miss the prepare() pass would still + // have to satisfy by fetching the missing side. + _, hasOut := c.outEdgeCache[id] + _, hasIn := c.inEdgeCache[id] + if hasOut && hasIn { + hits++ + } + } + return float64(hits) / float64(len(ids)) } // now returns the active timestamp (test-injectable when Now != 0). @@ -133,7 +258,23 @@ func (c *Context) now() int64 { // prepare populates the internal scratch fields once per Rerank call. // Idempotent — safe to call again after mutating the candidate slice. +// +// Edge fetches happen in two batched round-trips (one inbound, one +// outbound) collected from every candidate's ID up front. On a disk +// backend each per-candidate GetInEdges / GetOutEdges call +// costs ~14ms; batching collapses ~150 round-trips per Rerank +// into 2. +// +// Bundle pre-seed fast path: when the caller has set cachePreSeeded +// (via SeedEdgeCaches with preSeeded=true), prepare keeps the existing +// caches in place and fetches edges only for ids they do not cover. The +// fanInMax / fanOutMax stats are computed from the already-cached +// maps — same numbers, no cgo. This is the load-bearing skip the +// SymbolBundleSearcherBackend path depends on: the bundle's edges +// were already gathered server-side; a second round-trip here would +// be pure overhead and erase the win.
func (c *Context) prepare(cands []*Candidate) { + c.preparedCands = cands c.communityCount = make(map[string]int, len(cands)) c.maxCommunityCount = 0 c.candidateIDs = make(map[string]struct{}, len(cands)) @@ -144,12 +285,23 @@ func (c *Context) prepare(cands []*Candidate) { c.fileScoreSum = make(map[string]float64, len(cands)) c.maxFileScoreSum = 0 c.pathPenaltyCache = make(map[string]float64, len(cands)) + // Preserve the seeded edge caches when the caller signaled + // cachePreSeeded; the legacy reset path below the candidate walk + // only runs when the caches are NOT authoritative. + if !c.cachePreSeeded { + c.outEdgeCache = nil + c.inEdgeCache = nil + } + // First pass: collect candidate IDs (the input to the batched edge + // fetch) and populate the non-edge scratch fields. + ids := make([]string, 0, len(cands)) for _, cand := range cands { if cand == nil || cand.Node == nil { continue } c.candidateIDs[cand.Node.ID] = struct{}{} + ids = append(ids, cand.Node.ID) if c.CommunityOf != nil { com := c.CommunityOf(cand.Node.ID) @@ -161,17 +313,6 @@ func (c *Context) prepare(cands []*Candidate) { } } - if c.Graph != nil { - fi := len(c.Graph.GetInEdges(cand.Node.ID)) - fo := len(c.Graph.GetOutEdges(cand.Node.ID)) - if fi > c.fanInMax { - c.fanInMax = fi - } - if fo > c.fanOutMax { - c.fanOutMax = fo - } - } - ch := c.churnFor(cand.Node) if ch > c.churnMax { c.churnMax = ch @@ -192,6 +333,102 @@ func (c *Context) prepare(cands []*Candidate) { } } } + + // Second pass: one batched in-edge + one out-edge round-trip + // against Graph, scoped to the IDs that are NOT yet cached. + // When cachePreSeeded covers every candidate (the bundle hot + // path's typical shape), the missing slice is empty and the + // round-trips are skipped entirely — pure cache-served fan-in / + // fan-out. When the bundle only covers some IDs (vector or + // fallback hits get appended without bundle edges), we fetch + // only the uncovered tail and merge into the existing cache. 
+ // Skipped when Graph is nil — fan signals contribute 0. + if c.Graph != nil && len(ids) > 0 { + missingOut := missingEdgeIDs(ids, c.outEdgeCache) + missingIn := missingEdgeIDs(ids, c.inEdgeCache) + // Backfill — when the cache already covers everything, both + // missing slices are empty and no cgo round-trip fires. + if len(missingOut) > 0 { + fetched := c.Graph.GetOutEdgesByNodeIDs(missingOut) + if c.outEdgeCache == nil { + c.outEdgeCache = make(map[string][]*graph.Edge, len(fetched)) + } + for id, es := range fetched { + c.outEdgeCache[id] = es + } + } + if len(missingIn) > 0 { + fetched := c.Graph.GetInEdgesByNodeIDs(missingIn) + if c.inEdgeCache == nil { + c.inEdgeCache = make(map[string][]*graph.Edge, len(fetched)) + } + for id, es := range fetched { + c.inEdgeCache[id] = es + } + } + } + for _, id := range ids { + if fi := len(c.inEdgeCache[id]); fi > c.fanInMax { + c.fanInMax = fi + } + if fo := len(c.outEdgeCache[id]); fo > c.fanOutMax { + c.fanOutMax = fo + } + } +} + +// missingEdgeIDs returns the subset of ids whose edge slice is NOT +// already in cache. Used by prepare's backfill: when the bundle path +// pre-seeded most candidates but not all (vector / fallback hits get +// appended without bundle edges), only the uncovered ids cross the +// engine boundary. An empty result means the cache is complete — the +// fetch round-trip can be skipped entirely. +func missingEdgeIDs(ids []string, cache map[string][]*graph.Edge) []string { + if cache == nil { + // No pre-seed at all — caller has to fetch the full set; return + // the input unchanged so the existing batched fetch path runs. + return ids + } + missing := make([]string, 0, len(ids)) + for _, id := range ids { + if _, ok := cache[id]; !ok { + missing = append(missing, id) + } + } + return missing +} + +// outEdges returns the prepared outgoing-edge slice for nodeID. 
Reads +// from the prepare()-populated cache when available; falls back to a +// direct Graph.GetOutEdges call when prepare did not cache the node +// (a signal calling outside the candidate set, or Graph was nil at +// prepare time but a later mutation set it). Signals must use this +// accessor instead of calling Graph directly so the batched-fetch +// invariant holds. +func (c *Context) outEdges(nodeID string) []*graph.Edge { + if c.outEdgeCache != nil { + if edges, ok := c.outEdgeCache[nodeID]; ok { + return edges + } + } + if c.Graph == nil { + return nil + } + return c.Graph.GetOutEdges(nodeID) +} + +// inEdges is the inbound sibling of outEdges. See that doc-comment +// for the contract. +func (c *Context) inEdges(nodeID string) []*graph.Edge { + if c.inEdgeCache != nil { + if edges, ok := c.inEdgeCache[nodeID]; ok { + return edges + } + } + if c.Graph == nil { + return nil + } + return c.Graph.GetInEdges(nodeID) } // churnFor consults the ChurnOf hook, then Node.Meta["churn"], then diff --git a/internal/search/rerank/pipeline.go b/internal/search/rerank/pipeline.go index 07dd335c..2094deab 100644 --- a/internal/search/rerank/pipeline.go +++ b/internal/search/rerank/pipeline.go @@ -98,7 +98,13 @@ func (p *Pipeline) Rerank(query string, cands []*Candidate, ctx *Context) []*Can if ctx.QueryClass == QueryClassUnknown { ctx.QueryClass = ClassifyQuery(query) } - ctx.prepare(cands) + // Skip prepare when the caller already invoked Context.Prepare + // for per-phase timing on this exact slice — avoids paying the + // batched edge fetch twice on the search hot path. Identity check + // is intentional: any mutation that reallocates resets it. 
+ if !sameSliceHeader(ctx.preparedCands, cands) { + ctx.prepare(cands) + } for _, c := range cands { if c.Signals == nil { @@ -143,6 +149,17 @@ func (p *Pipeline) Rerank(query string, cands []*Candidate, ctx *Context) []*Can return cands } +// sameSliceHeader reports whether a and b alias the same underlying +// candidate slice (same backing array, same length). Used by Rerank to +// detect "the caller already invoked Prepare on this exact slice" and +// skip the duplicate prepare pass. +func sameSliceHeader(a, b []*Candidate) bool { + if len(a) == 0 || len(b) == 0 || len(a) != len(b) { + return false + } + return &a[0] == &b[0] +} + // Nodes is a convenience that unwraps a result slice into the // underlying graph nodes in score order. func Nodes(cands []*Candidate) []*graph.Node { diff --git a/internal/search/rerank/retriever.go b/internal/search/rerank/retriever.go index afb12b20..28042e79 100644 --- a/internal/search/rerank/retriever.go +++ b/internal/search/rerank/retriever.go @@ -26,7 +26,7 @@ type Retriever interface { // The caller passes the graph (so retrievers can do graph // walks without owning a reference). ctx is honoured for // cancellation — long-running retrievers must respect it. - Retrieve(ctx context.Context, g *graph.Graph, query string, limit int) ([]*Candidate, error) + Retrieve(ctx context.Context, g graph.Store, query string, limit int) ([]*Candidate, error) } // GraphCompletion is a Retriever that uses an upstream Retriever for @@ -46,7 +46,7 @@ type Retriever interface { type GraphCompletion struct { // Seeder produces the initial candidate set the 1-hop expansion // will fan out from. Required. - Seeder func(ctx context.Context, g *graph.Graph, query string, limit int) ([]*Candidate, error) + Seeder func(ctx context.Context, g graph.Store, query string, limit int) ([]*Candidate, error) // MaxSeedExpansion caps the number of new candidates produced // per seed. 
Defaults to 8 — large enough to surface typical @@ -69,7 +69,7 @@ func (gc *GraphCompletion) Name() string { return "graph_completion" } // merged: the seed copy wins and keeps its rank fields. New nodes // added by expansion have TextRank=-1 / VectorRank=-1 so the // downstream rerank knows they came from graph expansion. -func (gc *GraphCompletion) Retrieve(ctx context.Context, g *graph.Graph, query string, limit int) ([]*Candidate, error) { +func (gc *GraphCompletion) Retrieve(ctx context.Context, g graph.Store, query string, limit int) ([]*Candidate, error) { if gc.Seeder == nil { return nil, errNilSeeder } @@ -91,6 +91,7 @@ func (gc *GraphCompletion) Retrieve(ctx context.Context, g *graph.Graph, query s out := make([]*Candidate, 0, len(seeds)*2) seen := make(map[string]*Candidate, len(seeds)*2) + seedIDs := make([]string, 0, len(seeds)) for _, c := range seeds { if c == nil || c.Node == nil { continue @@ -100,14 +101,38 @@ func (gc *GraphCompletion) Retrieve(ctx context.Context, g *graph.Graph, query s } seen[c.Node.ID] = c out = append(out, c) + seedIDs = append(seedIDs, c.Node.ID) } - for _, seed := range seeds { - if seed == nil || seed.Node == nil { - continue + // One batched out-edge round-trip across every seed instead of + // one query per seed. On the disk backend this drops ~30 + // round-trips into 1 for a typical search_symbols completion pass. + outEdges := g.GetOutEdgesByNodeIDs(seedIDs) + + // Collect every distinct target id, then materialise the target + // nodes in one batched GetNodesByIDs call — same shape, same win. 
+ toIDs := make([]string, 0, len(outEdges)*4) + toSeen := make(map[string]struct{}, len(outEdges)*4) + for _, seedID := range seedIDs { + for _, e := range outEdges[seedID] { + if !keepAll && !allowed[e.Kind] { + continue + } + if _, dup := seen[e.To]; dup { + continue + } + if _, dup := toSeen[e.To]; dup { + continue + } + toSeen[e.To] = struct{}{} + toIDs = append(toIDs, e.To) } + } + toNodes := g.GetNodesByIDs(toIDs) + + for _, seedID := range seedIDs { added := 0 - for _, e := range g.GetOutEdges(seed.Node.ID) { + for _, e := range outEdges[seedID] { if !keepAll && !allowed[e.Kind] { continue } @@ -117,7 +142,7 @@ func (gc *GraphCompletion) Retrieve(ctx context.Context, g *graph.Graph, query s if _, dup := seen[e.To]; dup { continue } - toNode := g.GetNode(e.To) + toNode := toNodes[e.To] if toNode == nil { continue } diff --git a/internal/search/rerank/retriever_test.go b/internal/search/rerank/retriever_test.go index 38ce449f..e4d9107d 100644 --- a/internal/search/rerank/retriever_test.go +++ b/internal/search/rerank/retriever_test.go @@ -24,7 +24,7 @@ func newRetrieverGraph(t *testing.T) *graph.Graph { return g } -func seedHub(_ context.Context, g *graph.Graph, _ string, _ int) ([]*Candidate, error) { +func seedHub(_ context.Context, g graph.Store, _ string, _ int) ([]*Candidate, error) { n := g.GetNode("h") if n == nil { return nil, nil @@ -102,7 +102,7 @@ func TestGraphCompletion_NilSeederErrors(t *testing.T) { func TestGraphCompletion_SeederErrorPropagates(t *testing.T) { g := newRetrieverGraph(t) gc := &GraphCompletion{ - Seeder: func(context.Context, *graph.Graph, string, int) ([]*Candidate, error) { + Seeder: func(context.Context, graph.Store, string, int) ([]*Candidate, error) { return nil, errors.New("seeder failed") }, } @@ -114,7 +114,7 @@ func TestGraphCompletion_SeederErrorPropagates(t *testing.T) { func TestGraphCompletion_DedupesSeedFromExpansion(t *testing.T) { g := newRetrieverGraph(t) // Two seeds, the second is reachable from the first. 
- multiSeed := func(_ context.Context, gr *graph.Graph, _ string, _ int) ([]*Candidate, error) { + multiSeed := func(_ context.Context, gr graph.Store, _ string, _ int) ([]*Candidate, error) { return []*Candidate{ {Node: gr.GetNode("h"), TextRank: 0}, {Node: gr.GetNode("a"), TextRank: 1}, // also reachable from h @@ -136,7 +136,7 @@ func TestGraphCompletion_DedupesSeedFromExpansion(t *testing.T) { func TestGraphCompletion_NilSeedsIgnored(t *testing.T) { g := newRetrieverGraph(t) gc := &GraphCompletion{ - Seeder: func(context.Context, *graph.Graph, string, int) ([]*Candidate, error) { + Seeder: func(context.Context, graph.Store, string, int) ([]*Candidate, error) { return []*Candidate{nil, {Node: nil}, {Node: g.GetNode("h")}}, nil }, } diff --git a/internal/search/rerank/signals_graph.go b/internal/search/rerank/signals_graph.go index 2f19e0c9..33c33dd8 100644 --- a/internal/search/rerank/signals_graph.go +++ b/internal/search/rerank/signals_graph.go @@ -13,7 +13,7 @@ func (FanInSignal) Contribute(_ string, c *Candidate, ctx *Context) float64 { if ctx.Graph == nil { return 0 } - count := len(ctx.Graph.GetInEdges(c.Node.ID)) + count := len(ctx.inEdges(c.Node.ID)) return normLog(count, ctx.fanInMax) } @@ -29,7 +29,7 @@ func (FanOutSignal) Contribute(_ string, c *Candidate, ctx *Context) float64 { if ctx.Graph == nil { return 0 } - count := len(ctx.Graph.GetOutEdges(c.Node.ID)) + count := len(ctx.outEdges(c.Node.ID)) return normLog(count, ctx.fanOutMax) } @@ -47,7 +47,7 @@ func (MinHashSignal) Contribute(_ string, c *Candidate, ctx *Context) float64 { return 0 } var total, n float64 - for _, e := range ctx.Graph.GetOutEdges(c.Node.ID) { + for _, e := range ctx.outEdges(c.Node.ID) { if e.Kind != graph.EdgeSimilarTo { continue } @@ -63,7 +63,7 @@ func (MinHashSignal) Contribute(_ string, c *Candidate, ctx *Context) float64 { } // Symmetric edge — also walk incoming (snapshots that omit // outgoing copies of similar_to don't lose recall). 
- for _, e := range ctx.Graph.GetInEdges(c.Node.ID) { + for _, e := range ctx.inEdges(c.Node.ID) { if e.Kind != graph.EdgeSimilarTo { continue } diff --git a/internal/search/swappable.go b/internal/search/swappable.go index fa24aaf2..0907687c 100644 --- a/internal/search/swappable.go +++ b/internal/search/swappable.go @@ -81,6 +81,59 @@ func (s *Swappable) SearchChannels(query string, limit int) (textResults []Searc return s.inner.Search(query, limit), nil } +// SearchChannelsTimed delegates to a backend that supports the +// per-phase timing breakdown (today only HybridBackend). Falls back +// to SearchChannels — and a zero-valued ChannelTimings — when the +// inner backend doesn't know how to split phases. +func (s *Swappable) SearchChannelsTimed(query string, limit int) ([]SearchResult, []string, ChannelTimings) { + s.mu.RLock() + defer s.mu.RUnlock() + type timer interface { + SearchChannelsTimed(query string, limit int) ([]SearchResult, []string, ChannelTimings) + } + if cst, ok := s.inner.(timer); ok { + return cst.SearchChannelsTimed(query, limit) + } + if cs, ok := s.inner.(ChannelSearcher); ok { + text, vec := cs.SearchChannels(query, limit) + return text, vec, ChannelTimings{} + } + return s.inner.Search(query, limit), nil, ChannelTimings{} +} + +// SearchSymbolBundles forwards to the inner backend when it implements +// SymbolBundleSearcherBackend (production wiring: a +// SymbolSearcherBackend whose store is the disk Store, or a +// HybridBackend whose text backend is the same). Returns nil when the +// inner backend doesn't expose bundles — the engine treats nil as +// "no bundle support" and falls back to the per-call Search + +// GetNodesByIDs + GetIn/OutEdgesByNodeIDs path. 
+func (s *Swappable) SearchSymbolBundles(query string, limit int) []SymbolBundle { + s.mu.RLock() + defer s.mu.RUnlock() + if bs, ok := s.inner.(SymbolBundleSearcherBackend); ok { + return bs.SearchSymbolBundles(query, limit) + } + return nil +} + +// VectorChannelOnly forwards to the inner backend when it implements +// the vector-only channel pull (today: HybridBackend). Lets the +// engine fetch the vector channel without re-running text BM25 — +// the bundle path already has the text hits. Returns (nil, zero +// timings) when the inner backend isn't vector-aware. +func (s *Swappable) VectorChannelOnly(query string, limit int) ([]string, ChannelTimings) { + s.mu.RLock() + defer s.mu.RUnlock() + type vco interface { + VectorChannelOnly(query string, limit int) ([]string, ChannelTimings) + } + if v, ok := s.inner.(vco); ok { + return v.VectorChannelOnly(query, limit) + } + return nil, ChannelTimings{} +} + func (s *Swappable) Count() int { s.mu.RLock() defer s.mu.RUnlock() diff --git a/internal/search/symbolsearcher_backend.go b/internal/search/symbolsearcher_backend.go new file mode 100644 index 00000000..68ca59e6 --- /dev/null +++ b/internal/search/symbolsearcher_backend.go @@ -0,0 +1,160 @@ +package search + +import ( + "strings" + "sync/atomic" + + "github.com/zzet/gortex/internal/graph" +) + +// SymbolSearcherBackend adapts a graph.SymbolSearcher into the +// search.Backend the daemon's search-symbols path consumes. +// Engine.gatherBackendCandidates and the rerank pipeline don't need +// to know whether the backend is BM25 / Bleve / native FTS — they +// see a plain search.Backend and call Search on it. +// +// Production wiring: when the indexer detects that the backing +// graph.Store also implements graph.SymbolSearcher, it constructs +// this adapter as the initial +// search.Backend wrapped by search.NewSwappable. The in-process +// Bleve / BM25 build path is then bypassed entirely. 
+// +// Add / Remove are no-ops on the adapter because the indexer +// already drives the SymbolSearcher writes directly: +// +// - cold-load: BulkUpsertSymbolFTS at shadow-drain commit (see +// internal/indexer.go IndexCtx defer) +// - incremental: UpsertSymbolFTS alongside the parallel +// idx.search.Add in the per-file path +// +// The adapter therefore only carries the read side. Callers that +// invoke Add / Remove still get the right behaviour because the +// indexer is the only entity that ever creates this adapter, and +// it doesn't rely on Add / Remove updating the FTS — those calls +// happen through the direct SymbolSearcher surface. +type SymbolSearcherBackend struct { + s graph.SymbolSearcher + + // count tracks the indexer's incremental Add / Remove deltas + // only — it does NOT report the actual size of the backend + // FTS index (which lives in the disk store and is queryable + // via the SymbolSearcher's own primitives). Used for the + // search.Backend.Count() contract by callers that just want a + // rough magnitude (no caller currently treats this as + // authoritative). + count atomic.Int64 +} + +// NewSymbolSearcherBackend wraps a SymbolSearcher in the +// search.Backend contract. The caller is responsible for keeping +// the underlying SymbolSearcher alive — Close on this adapter is +// a no-op and never touches the wrapped store. +func NewSymbolSearcherBackend(s graph.SymbolSearcher) *SymbolSearcherBackend { + return &SymbolSearcherBackend{s: s} +} + +// SymbolBundle re-exports graph.SymbolBundle so callers (the query +// engine, the rerank seed path) can construct + consume bundles +// without re-importing the graph package next to the search +// package import — symmetric with how SearchResult sits in +// search/. 
+type SymbolBundle = graph.SymbolBundle + +// SearchSymbolBundles is the bundled-search hot path: it forwards +// to the wrapped graph.SymbolBundleSearcher when the underlying +// store implements that capability, returning the matched node + +// score + in/out edges in one engine round-trip. When the store +// only implements SymbolSearcher (no Bundle support), this method +// returns nil (as it also does for a blank query or a backend error) — callers MUST check the result and fall back to the +// per-call Search → GetNodesByIDs → GetIn/OutEdgesByNodeIDs path. +// +// Exposed on SymbolSearcherBackend (the search.Backend +// adapter used in production) so the engine can type-assert through +// the search.Backend chain via SymbolBundleSearcherBackend without +// touching the daemon's wiring. +func (b *SymbolSearcherBackend) SearchSymbolBundles(query string, limit int) []SymbolBundle { + if b == nil || b.s == nil || strings.TrimSpace(query) == "" { + return nil + } + bs, ok := b.s.(graph.SymbolBundleSearcher) + if !ok { + return nil + } + bundles, err := bs.SearchSymbolBundles(query, limit) + if err != nil { + return nil + } + return bundles +} + +// SymbolBundleSearcherBackend is the interface the engine type-asserts +// on a search.Backend to detect bundle support. Both +// *SymbolSearcherBackend and *HybridBackend implement this; Swappable +// forwards. +type SymbolBundleSearcherBackend interface { + SearchSymbolBundles(query string, limit int) []SymbolBundle +} + +// Search forwards to SymbolSearcher.SearchSymbols and translates +// the per-hit (NodeID, Score) into search.SearchResult so callers +// don't see the graph package at all. +// +// An error from the backend is downgraded to an empty result — the +// daemon's search_symbols path already tolerates an empty primary +// hit set (it falls through to the exact-name / substring tiers in +// query.Engine.gatherBackendCandidates), so returning an error +// surface here would force every caller to grow its own fallback.
+func (b *SymbolSearcherBackend) Search(query string, limit int) []SearchResult { + if b == nil || b.s == nil || strings.TrimSpace(query) == "" { + return nil + } + hits, err := b.s.SearchSymbols(query, limit) + if err != nil || len(hits) == 0 { + return nil + } + out := make([]SearchResult, len(hits)) + for i, h := range hits { + out[i] = SearchResult{ID: h.NodeID, Score: h.Score} + } + return out +} + +// Add is a no-op — the indexer drives UpsertSymbolFTS on the wrapped +// SymbolSearcher directly. count is bumped so the Count() figure +// tracks the deltas-since-construction (best-effort, not +// authoritative — the disk index may be larger from a prior cold +// load). +func (b *SymbolSearcherBackend) Add(id string, _ ...string) { + if b == nil || id == "" { + return + } + b.count.Add(1) +} + +// Remove is a no-op for the same reason as Add — the per-call +// removal path (when one lands) routes through SymbolSearcher +// directly, not through the search.Backend contract. count is +// decremented so the Count() figure stays roughly consistent. +func (b *SymbolSearcherBackend) Remove(id string) { + if b == nil || id == "" { + return + } + b.count.Add(-1) +} + +// Count returns the running delta-since-construction. Used for +// observability / "is the index populated?" gates — never as a +// load-bearing decision input. The authoritative size lives in +// the disk FTS index, which is queryable via the +// SymbolSearcher's native primitives if needed. +func (b *SymbolSearcherBackend) Count() int { + if b == nil { + return 0 + } + return int(b.count.Load()) +} + +// Close is a no-op. The wrapped SymbolSearcher is owned by the +// graph.Store; closing it from the search adapter would race the +// indexer's own lifecycle. 
+func (b *SymbolSearcherBackend) Close() {} diff --git a/internal/search/vector.go b/internal/search/vector.go index 77ffc345..3bc129c4 100644 --- a/internal/search/vector.go +++ b/internal/search/vector.go @@ -9,6 +9,8 @@ import ( "sync" "github.com/coder/hnsw" + + "github.com/zzet/gortex/internal/graph" ) // vectorFrameMagic prefixes the framed VectorBackend.Save format: a @@ -18,7 +20,24 @@ import ( // map — so old snapshots keep working. var vectorFrameMagic = [4]byte{'G', 'V', 'X', '1'} +// VectorDelegate is the subset of graph.VectorSearcher the +// VectorBackend shim consults when it's been told to delegate +// instead of holding an in-process HNSW. Exported (with a +// graph.VectorHit return) so the indexer can install a delegate +// without writing a translation layer — search already depends on +// graph for SymbolHit, so the type sharing is free. +type VectorDelegate interface { + SimilarTo(vec []float32, limit int) ([]graph.VectorHit, error) +} + // VectorBackend stores and searches embedding vectors using HNSW index. +// +// When delegate is set (via SetDelegate), the in-process HNSW is +// bypassed entirely: Add becomes a no-op (the indexer drives the +// delegate's bulk-upsert directly), Search forwards to the +// delegate's SimilarTo. The dims and chunkMap stay live so callers +// that need them (HybridBackend.dechunkVectorIDs) keep working +// against the same VectorBackend surface. type VectorBackend struct { graph *hnsw.Graph[string] count int @@ -30,6 +49,16 @@ type VectorBackend struct { // returned twice and chunk IDs never leak to callers. chunkMap map[string]string mu sync.RWMutex + + // delegate is the optional engine-native vector searcher (today + // only graph.SymbolSearcher-implementing stores). Set means + // "don't build the in-process HNSW; route reads through here". 
+ // The wrapped delegateCount tracks Add-call deltas so Count() + // reports a non-zero figure once the indexer has finished its + // bulk upsert — HybridBackend gates the vector channel on + // Count() > 0. + delegate VectorDelegate + delegateCount int } // NewVector creates a vector search backend for the given embedding dimensions. @@ -75,6 +104,16 @@ func (v *VectorBackend) HasChunks() bool { func (v *VectorBackend) Add(id string, vector []float32) { v.mu.Lock() defer v.mu.Unlock() + if v.delegate != nil { + // Delegated mode: the indexer pushes vectors to the + // engine-native HNSW via the graph.VectorSearcher + // interface directly. Add here is a no-op so the + // in-process hnsw.Graph never allocates memory for what + // the delegate already owns; count tracks deltas so + // Count()'s "is the index populated" gate fires. + v.delegateCount++ + return + } v.graph.Add(hnsw.Node[string]{ Key: id, Value: hnsw.Vector(vector), @@ -82,8 +121,37 @@ func (v *VectorBackend) Add(id string, vector []float32) { v.count++ } +// SetDelegate routes Search / Count through an engine-native vector +// searcher (the disk store's graph.VectorSearcher). After +// the call: +// - Add is a no-op (the indexer talks to the delegate directly via +// graph.VectorSearcher.BulkUpsertEmbeddings / UpsertEmbedding), +// - Search forwards to delegate.SimilarTo, +// - Count reflects the delegate-delta count (not the in-process +// graph), so HybridBackend.searchChannels's `v.Count() > 0` gate +// fires once the indexer has populated the backend. +func (v *VectorBackend) SetDelegate(d VectorDelegate) { + v.mu.Lock() + defer v.mu.Unlock() + v.delegate = d +} + // Search returns the k nearest neighbors to the query vector. 
func (v *VectorBackend) Search(query []float32, k int) []string { + v.mu.RLock() + d := v.delegate + v.mu.RUnlock() + if d != nil { + hits, err := d.SimilarTo(query, k) + if err != nil || len(hits) == 0 { + return nil + } + ids := make([]string, len(hits)) + for i, h := range hits { + ids[i] = h.NodeID + } + return ids + } v.mu.RLock() defer v.mu.RUnlock() if v.count == 0 { @@ -101,6 +169,9 @@ func (v *VectorBackend) Search(query []float32, k int) []string { func (v *VectorBackend) Count() int { v.mu.RLock() defer v.mu.RUnlock() + if v.delegate != nil { + return v.delegateCount + } return v.count } diff --git a/internal/semantic/enricher.go b/internal/semantic/enricher.go index aa5727b0..c463a84f 100644 --- a/internal/semantic/enricher.go +++ b/internal/semantic/enricher.go @@ -20,13 +20,13 @@ func ConfirmEdge(e *graph.Edge, provider string) { // RefuteEdge removes a false-positive edge from the graph. // Returns true if the edge was removed. -func RefuteEdge(g *graph.Graph, e *graph.Edge) bool { +func RefuteEdge(g graph.Store, e *graph.Edge) bool { return g.RemoveEdge(e.From, e.To, e.Kind) } // AddSemanticEdge adds a new edge discovered by semantic analysis. Origin is // tagged LSP-grade (see ConfirmEdge). -func AddSemanticEdge(g *graph.Graph, from, to string, kind graph.EdgeKind, filePath string, line int, provider string) *graph.Edge { +func AddSemanticEdge(g graph.Store, from, to string, kind graph.EdgeKind, filePath string, line int, provider string) *graph.Edge { e := &graph.Edge{ From: from, To: to, @@ -66,7 +66,7 @@ func EnrichNodeMeta(n *graph.Node, key string, value any, provider string) { } // FindMatchingEdge searches for an existing edge between two nodes of a given kind. 
-func FindMatchingEdge(g *graph.Graph, from, to string, kind graph.EdgeKind) *graph.Edge { +func FindMatchingEdge(g graph.Store, from, to string, kind graph.EdgeKind) *graph.Edge { edges := g.GetOutEdges(from) for _, e := range edges { if e.To == to && e.Kind == kind { @@ -77,7 +77,7 @@ func FindMatchingEdge(g *graph.Graph, from, to string, kind graph.EdgeKind) *gra } // FindEdgeByTarget searches for an edge from a node to a target with any kind. -func FindEdgeByTarget(g *graph.Graph, from, to string) *graph.Edge { +func FindEdgeByTarget(g graph.Store, from, to string) *graph.Edge { edges := g.GetOutEdges(from) for _, e := range edges { if e.To == to { @@ -88,7 +88,7 @@ func FindEdgeByTarget(g *graph.Graph, from, to string) *graph.Edge { } // NodesByLanguage returns all nodes in the graph that match the given language. -func NodesByLanguage(g *graph.Graph, language string) []*graph.Node { +func NodesByLanguage(g graph.Store, language string) []*graph.Node { var result []*graph.Node for _, n := range g.AllNodes() { if n.Language == language { @@ -99,7 +99,7 @@ func NodesByLanguage(g *graph.Graph, language string) []*graph.Node { } // EdgesByLanguage returns all edges whose source node matches the given language. -func EdgesByLanguage(g *graph.Graph, language string) []*graph.Edge { +func EdgesByLanguage(g graph.Store, language string) []*graph.Edge { var result []*graph.Edge for _, e := range g.AllEdges() { fromNode := g.GetNode(e.From) diff --git a/internal/semantic/goanalysis/externals.go b/internal/semantic/goanalysis/externals.go index cae6dd10..363ee34c 100644 --- a/internal/semantic/goanalysis/externals.go +++ b/internal/semantic/goanalysis/externals.go @@ -39,12 +39,18 @@ const modulePathStdlib = "stdlib" // Statistics counters surface back through ExternalsResult so the caller // can report nodes/edges added. 
type externalsAttribution struct { - g *graph.Graph + g graph.Store pkgByPath map[string]*packages.Package moduleByPath map[string]string extByObj map[types.Object]string provider string + // repoPrefix is the owning repo's prefix, used to namespace stub + // IDs (graph.StubID). Empty when the caller doesn't supply one + // — in that case stub IDs are emitted in the legacy un-prefixed + // form, which graph.IsStdlibStub / friends still recognise. + repoPrefix string + nodesAdded int edgesAdded int edgesUpgraded int @@ -57,7 +63,7 @@ type externalsAttribution struct { // roots. Walking pkg.Imports collects every dep — stdlib and module-cache // alike — so resolveSymbol can find the owning *packages.Package for an // arbitrary types.Object. -func newExternalsAttribution(g *graph.Graph, roots []*packages.Package, provider string) *externalsAttribution { +func newExternalsAttribution(g graph.Store, roots []*packages.Package, provider string) *externalsAttribution { pkgByPath := make(map[string]*packages.Package) var visit func(p *packages.Package) visit = func(p *packages.Package) { @@ -81,9 +87,34 @@ func newExternalsAttribution(g *graph.Graph, roots []*packages.Package, provider moduleByPath: make(map[string]string), extByObj: make(map[types.Object]string), provider: provider, + repoPrefix: deriveRepoPrefix(g, roots), } } +// deriveRepoPrefix peeks at the first source file across the +// enrichment roots and reads its RepoPrefix from the graph. +// All files belonging to a single semantic.Provider.Enrich call +// share one repo, so a single sample suffices. Returns "" when no +// matching file node is found — stubs then fall back to the +// legacy un-prefixed form, which graph.IsStdlibStub still accepts. 
+func deriveRepoPrefix(g graph.Store, roots []*packages.Package) string { + for _, r := range roots { + if r == nil { + continue + } + for _, f := range r.GoFiles { + if nodes := g.GetFileNodes(f); len(nodes) > 0 { + for _, n := range nodes { + if n != nil && n.RepoPrefix != "" { + return n.RepoPrefix + } + } + } + } + } + return "" +} + // resolveSymbol returns the graph node ID for an external go/types object, // creating it (and the owning KindModule node, if not already present) // on first sight. Returns "" when the object is unsuitable for @@ -199,7 +230,7 @@ func (e *externalsAttribution) claimAndUpgradeStub(callerID string, importPath s // claimByExactStub handles the canonical resolver-shaped targets. Pulled // out so the fuzzy pass can layer on top. func (e *externalsAttribution) claimByExactStub(callerID string, importPath string, obj types.Object, newTarget string) *graph.Edge { - candidates := stubEdgeTargets(importPath, obj) + candidates := stubEdgeTargets(e.repoPrefix, importPath, obj) for _, target := range candidates { edge := semantic.FindEdgeByTarget(e.g, callerID, target) if edge == nil { @@ -276,9 +307,9 @@ func wantedEdgeKind(obj types.Object) graph.EdgeKind { // strings the resolver writes for unresolved or external lookups. func isStubTarget(to string) bool { switch { - case strings.HasPrefix(to, "unresolved::"), + case graph.IsUnresolvedTarget(to), strings.HasPrefix(to, "external::"), - strings.HasPrefix(to, "stdlib::"), + graph.IsStdlibStub(to), strings.HasPrefix(to, "dep::"): return true } @@ -393,7 +424,12 @@ func (e *externalsAttribution) ensureModuleNode(pkg *packages.Package) string { // written for an external obj. Order matches resolver precedence: // stdlib::/dep:: are produced post-resolve, unresolved::extern:: is the // raw form when resolveExtern wasn't run. 
-func stubEdgeTargets(importPath string, obj types.Object) []string { +// +// repoPrefix namespaces the stdlib stub form per-repo so two repos +// pinned to different Go SDK versions don't collide on a single +// `stdlib::fmt::Errorf` node. An empty repoPrefix yields the legacy +// un-prefixed form, which the resolver still emits today. +func stubEdgeTargets(repoPrefix, importPath string, obj types.Object) []string { if obj == nil { return nil } @@ -402,7 +438,7 @@ func stubEdgeTargets(importPath string, obj types.Object) []string { return nil } return []string{ - "stdlib::" + importPath + "::" + name, + graph.StubID(repoPrefix, graph.StubKindStdlib, importPath, name), "dep::" + importPath + "::" + name, "unresolved::extern::" + importPath + "::" + name, } diff --git a/internal/semantic/goanalysis/provider.go b/internal/semantic/goanalysis/provider.go index 0cebcc1a..66f2332b 100644 --- a/internal/semantic/goanalysis/provider.go +++ b/internal/semantic/goanalysis/provider.go @@ -65,7 +65,7 @@ func (p *Provider) Available() bool { return true } -func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResult, error) { +func (p *Provider) Enrich(g graph.Store, repoRoot string) (*semantic.EnrichResult, error) { start := time.Now() absRoot, err := filepath.Abs(repoRoot) @@ -245,6 +245,12 @@ func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResu result.EdgesAdded += p.addMissingImplements(g, pkgs, objToNode, absRoot) // Phase 4: Enrich node metadata with type info. + // EnrichNodeMeta mutates Node.Meta in place; on disk backends the + // node is a per-call GetNode reconstruction, so collect every stamped + // node and round-trip it through the store at the end (one AddBatch) + // or the semantic_type / return_type stamps are silently discarded on + // the disk backend. See semantic.EnrichNodeMeta. 
+ var stampedNodes []*graph.Node for _, pkg := range pkgs { if pkg.TypesInfo == nil { continue @@ -262,10 +268,12 @@ func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResu continue } + didStamp := false typeStr := types.TypeString(obj.Type(), nil) if typeStr != "" && typeStr != "invalid type" { semantic.EnrichNodeMeta(node, "semantic_type", typeStr, p.Name()) result.NodesEnriched++ + didStamp = true } // Add return type for functions. @@ -274,18 +282,25 @@ func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResu if ok && sig.Results().Len() > 0 { retType := types.TypeString(sig.Results(), nil) semantic.EnrichNodeMeta(node, "return_type", retType, p.Name()) + didStamp = true } } + if didStamp { + stampedNodes = append(stampedNodes, node) + } _ = ident // used in range } } + if len(stampedNodes) > 0 { + g.AddBatch(stampedNodes, nil) + } result.DurationMs = time.Since(start).Milliseconds() return result, nil } -func (p *Provider) EnrichFile(g *graph.Graph, repoRoot, filePath string) (*semantic.EnrichResult, error) { +func (p *Provider) EnrichFile(g graph.Store, repoRoot, filePath string) (*semantic.EnrichResult, error) { // go/types can do incremental loading per package, but for simplicity // we re-enrich the whole graph. The manager's debounce prevents thrashing. return nil, nil @@ -528,7 +543,7 @@ func (p *Provider) loadPackages(dir string) ([]*packages.Package, *token.FileSet } // enrichImplements confirms existing EdgeImplements edges using go/types. -func (p *Provider) enrichImplements(g *graph.Graph, pkgs []*packages.Package, objToNode map[types.Object]string) int { +func (p *Provider) enrichImplements(g graph.Store, pkgs []*packages.Package, objToNode map[types.Object]string) int { confirmed := 0 // Collect all interfaces from the loaded packages. 
@@ -565,7 +580,7 @@ func (p *Provider) enrichImplements(g *graph.Graph, pkgs []*packages.Package, ob } // addMissingImplements discovers interface implementations that tree-sitter missed. -func (p *Provider) addMissingImplements(g *graph.Graph, pkgs []*packages.Package, objToNode map[types.Object]string, absRoot string) int { +func (p *Provider) addMissingImplements(g graph.Store, pkgs []*packages.Package, objToNode map[types.Object]string, absRoot string) int { added := 0 // Collect interfaces and concrete types. @@ -619,7 +634,7 @@ func (p *Provider) addMissingImplements(g *graph.Graph, pkgs []*packages.Package } // findContainingFunc finds the Gortex function/method node that contains the given position. -func findContainingFunc(g *graph.Graph, pkgs []*packages.Package, fset *token.FileSet, absRoot string, pos token.Position) *graph.Node { +func findContainingFunc(g graph.Store, pkgs []*packages.Package, fset *token.FileSet, absRoot string, pos token.Position) *graph.Node { relPath := relativePath(pos.Filename, absRoot) if relPath == "" { return nil diff --git a/internal/semantic/lsp/provider.go b/internal/semantic/lsp/provider.go index e6b868fb..98201280 100644 --- a/internal/semantic/lsp/provider.go +++ b/internal/semantic/lsp/provider.go @@ -177,7 +177,7 @@ func (p *Provider) Close() error { return nil } -func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResult, error) { +func (p *Provider) Enrich(g graph.Store, repoRoot string) (*semantic.EnrichResult, error) { start := time.Now() absRoot, err := filepath.Abs(repoRoot) @@ -268,6 +268,11 @@ func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResu // Query hover info for nodes to enrich metadata. 
enrichedNodes := make(map[string]bool) + // EnrichNodeMeta mutates Node.Meta in place; on disk backends n is a + // per-call AllNodes reconstruction, so collect stamped nodes and + // round-trip them through the store at the end or the semantic_type + // stamp is discarded on the disk backend. See semantic.EnrichNodeMeta. + var stampedNodes []*graph.Node for _, n := range g.AllNodes() { if n.Kind == graph.KindFile || n.Kind == graph.KindImport { continue @@ -300,6 +305,7 @@ func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResu typeInfo := extractTypeFromHover(hoverResult.Contents.Value) if typeInfo != "" { semantic.EnrichNodeMeta(n, "semantic_type", typeInfo, p.Name()) + stampedNodes = append(stampedNodes, n) if !enrichedNodes[n.ID] { result.NodesEnriched++ result.SymbolsCovered++ @@ -307,6 +313,9 @@ func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResu } } } + if len(stampedNodes) > 0 { + g.AddBatch(stampedNodes, nil) + } // Query implementations for interface nodes. for _, n := range g.AllNodes() { @@ -406,7 +415,7 @@ func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResu return result, nil } -func (p *Provider) EnrichFile(g *graph.Graph, repoRoot, filePath string) (*semantic.EnrichResult, error) { +func (p *Provider) EnrichFile(g graph.Store, repoRoot, filePath string) (*semantic.EnrichResult, error) { // LSP supports incremental updates, but for simplicity we skip it. // The full Enrich pass handles this. return nil, nil @@ -1157,7 +1166,7 @@ func (p *Provider) Source(repoRoot, relPath string) []byte { // matching ast_inferred / text_matched EdgeCalls to lsp_resolved, or // add a fresh EdgeCalls when the AST extractor missed the link // (cross-file calls in languages without compile-unit info). 
-func (p *Provider) enrichCallHierarchy(g *graph.Graph, absRoot string, result *semantic.EnrichResult) { +func (p *Provider) enrichCallHierarchy(g graph.Store, absRoot string, result *semantic.EnrichResult) { for _, n := range g.AllNodes() { if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { continue @@ -1191,7 +1200,7 @@ func (p *Provider) enrichCallHierarchy(g *graph.Graph, absRoot string, result *s // asOutgoing=true means "this node calls other"; false means "other // calls this node" (incoming-calls direction). Existing edges get // promoted to lsp_resolved; missing edges get added. -func (p *Provider) recordHierarchyCall(g *graph.Graph, absRoot string, n *graph.Node, other CallHierarchyItem, asOutgoing bool, result *semantic.EnrichResult) { +func (p *Provider) recordHierarchyCall(g graph.Store, absRoot string, n *graph.Node, other CallHierarchyItem, asOutgoing bool, result *semantic.EnrichResult) { otherPath := uriToPath(other.URI, absRoot) if otherPath == "" { return @@ -1232,7 +1241,7 @@ func (p *Provider) recordHierarchyCall(g *graph.Graph, absRoot string, n *graph. // T → super when the super is an interface kind. // - subtypes(T) = the children of T. Emits EdgeImplements child // → T when T is an interface; EdgeExtends otherwise. -func (p *Provider) enrichTypeHierarchy(g *graph.Graph, absRoot string, result *semantic.EnrichResult) { +func (p *Provider) enrichTypeHierarchy(g graph.Store, absRoot string, result *semantic.EnrichResult) { for _, n := range g.AllNodes() { if n.Kind != graph.KindType && n.Kind != graph.KindInterface { continue @@ -1267,7 +1276,7 @@ func (p *Provider) enrichTypeHierarchy(g *graph.Graph, absRoot string, result *s // whose name matches a method on the parent — closing the // method-level half of the type hierarchy (Joern calls these // CONTAINS + OVERRIDES). 
-func (p *Provider) linkTypeHierarchy(g *graph.Graph, absRoot string, cur *graph.Node, other TypeHierarchyItem, asSupertype bool, result *semantic.EnrichResult) { +func (p *Provider) linkTypeHierarchy(g graph.Store, absRoot string, cur *graph.Node, other TypeHierarchyItem, asSupertype bool, result *semantic.EnrichResult) { otherPath := uriToPath(other.URI, absRoot) if otherPath == "" { return @@ -1313,7 +1322,7 @@ func (p *Provider) linkTypeHierarchy(g *graph.Graph, absRoot string, cur *graph. // origin lets the caller stamp the edges with lsp_dispatch (LSP- // confirmed parent), ast_resolved (AST-confirmed parent in the same // compilation unit), or ast_inferred (parent is a heuristic match). -func addOverrideEdges(g *graph.Graph, child, parent *graph.Node, provider, origin string, result *semantic.EnrichResult) { +func addOverrideEdges(g graph.Store, child, parent *graph.Node, provider, origin string, result *semantic.EnrichResult) { if child == nil || parent == nil || child.ID == parent.ID { return } diff --git a/internal/semantic/lsp/resolver_helper_integration_test.go b/internal/semantic/lsp/resolver_helper_integration_test.go new file mode 100644 index 00000000..5e327a4d --- /dev/null +++ b/internal/semantic/lsp/resolver_helper_integration_test.go @@ -0,0 +1,115 @@ +package lsp + +import ( + "os" + "os/exec" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap" +) + +// TestResolverHelper_RealTsserver_DefinitionAcrossFiles spins up a +// real typescript-language-server against a tiny on-disk TS fixture +// and asserts the helper resolves a cross-file method call to the +// correct declaration. Skips when typescript-language-server isn't +// on PATH (CI / dev machines without npm install). 
+// +// This is the load-bearing N5 integration check: the unit tests in +// resolver_registry_test.go cover dispatch logic with a scripted +// stub; this test verifies the underlying LSP-protocol wiring +// (initialize → didOpen → textDocument/definition → response) lands +// on a real graph file path. +func TestResolverHelper_RealTsserver_DefinitionAcrossFiles(t *testing.T) { + if _, err := exec.LookPath("typescript-language-server"); err != nil { + t.Skip("typescript-language-server not on PATH — skip integration test (run `npm i -g typescript-language-server typescript` to enable)") + } + + workspace := t.TempDir() + mustWrite(t, filepath.Join(workspace, "tsconfig.json"), `{"compilerOptions":{"target":"ES2020","module":"commonjs","strict":false}}`) + // Use a method on a class to avoid the import-binding ambiguity: + // tsserver's textDocument/definition on a method invocation + // reliably returns the method declaration, even with TS's + // declaration-merging. + mustWrite(t, filepath.Join(workspace, "lib.ts"), `export class Worker { + doWork(x: number): number { + return x + 1; + } +} +`) + mustWrite(t, filepath.Join(workspace, "caller.ts"), `import { Worker } from "./lib"; + +export function callIt(): number { + const w = new Worker(); + return w.doWork(42); +} +`) + + spec := SpecByName("typescript-language-server") + require.NotNil(t, spec, "TS spec must be in registry") + + provider := NewProviderFromSpec(spec, zap.NewNop()) + helper := NewResolverHelper(provider, workspace, 10*time.Second, zap.NewNop()) + defer func() { _ = helper.Close() }() + + // Warm tsserver up by asking once and discarding the result — + // the workspace project graph loads asynchronously and the first + // definition request often races the workspace warmup. A retry + // loop tolerates 1-2 cold attempts. 
+ var ( + defPath string + defLine int + ok bool + ) + deadline := time.Now().Add(8 * time.Second) + for { + defPath, defLine, ok = helper.Definition("caller.ts", 5, "doWork") + if ok && defPath == "lib.ts" { + break + } + if time.Now().After(deadline) { + break + } + time.Sleep(250 * time.Millisecond) + } + + require.True(t, ok, "tsserver should eventually resolve doWork across files") + assert.Equal(t, "lib.ts", defPath, "definition lives in lib.ts") + // lib.ts: line 1 = `export class Worker {`, line 2 = ` doWork(...) {` + assert.Equal(t, 2, defLine) +} + +// TestResolverHelper_RealTsserver_NoMatchReturnsFalse — when the +// identifier on the requested line doesn't resolve to anything +// (typo, missing import), the helper returns ok=false rather than +// inventing a location. +func TestResolverHelper_RealTsserver_NoMatchReturnsFalse(t *testing.T) { + if _, err := exec.LookPath("typescript-language-server"); err != nil { + t.Skip("typescript-language-server not on PATH") + } + + workspace := t.TempDir() + mustWrite(t, filepath.Join(workspace, "tsconfig.json"), `{"compilerOptions":{"target":"ES2020","module":"commonjs","strict":false}}`) + mustWrite(t, filepath.Join(workspace, "foo.ts"), `// no identifiers worth resolving here +const a = 1; +`) + + spec := SpecByName("typescript-language-server") + provider := NewProviderFromSpec(spec, zap.NewNop()) + helper := NewResolverHelper(provider, workspace, 5*time.Second, zap.NewNop()) + defer func() { _ = helper.Close() }() + + // "ghostFunction" doesn't appear on line 2 — tsserver should + // return an empty location set, the helper should report + // ok=false, the resolver falls through to heuristics. 
+ _, _, ok := helper.Definition("foo.ts", 2, "ghostFunction") + assert.False(t, ok) +} + +func mustWrite(t *testing.T, path, content string) { + t.Helper() + require.NoError(t, os.WriteFile(path, []byte(content), 0644)) +} diff --git a/internal/semantic/manager.go b/internal/semantic/manager.go index b12e8432..e251e15b 100644 --- a/internal/semantic/manager.go +++ b/internal/semantic/manager.go @@ -101,7 +101,7 @@ func (m *Manager) LSPRouter() LSPRouter { // EnrichAll runs all available providers against the graph. // For each language, only the highest-priority available provider runs. -func (m *Manager) EnrichAll(g *graph.Graph, roots map[string]string) ([]*EnrichResult, error) { +func (m *Manager) EnrichAll(g graph.Store, roots map[string]string) ([]*EnrichResult, error) { if !m.config.Enabled { return nil, nil } @@ -202,7 +202,7 @@ func (m *Manager) configPriorityFor(name string) (int, bool) { // repo root and appends the results. Extracted so EnrichAll can share // the logging + lastResults bookkeeping between eager and Router-backed // providers. -func (m *Manager) runEnrichForProvider(g *graph.Graph, roots map[string]string, lang string, provider Provider, results []*EnrichResult) []*EnrichResult { +func (m *Manager) runEnrichForProvider(g graph.Store, roots map[string]string, lang string, provider Provider, results []*EnrichResult) []*EnrichResult { for repoName, repoRoot := range roots { start := time.Now() m.logger.Info("semantic enrichment starting", @@ -245,7 +245,7 @@ func (m *Manager) runEnrichForProvider(g *graph.Graph, roots map[string]string, } // EnrichFile runs incremental enrichment for a single file change. 
-func (m *Manager) EnrichFile(g *graph.Graph, repoRoot, filePath string) (*EnrichResult, error) { +func (m *Manager) EnrichFile(g graph.Store, repoRoot, filePath string) (*EnrichResult, error) { if !m.config.Enabled || !m.config.EnrichOnWatch { return nil, nil } diff --git a/internal/semantic/manager_test.go b/internal/semantic/manager_test.go index 3a9cd906..26609c3e 100644 --- a/internal/semantic/manager_test.go +++ b/internal/semantic/manager_test.go @@ -15,7 +15,7 @@ type mockProvider struct { name string languages []string available bool - enrichFunc func(g *graph.Graph, root string) (*EnrichResult, error) + enrichFunc func(g graph.Store, root string) (*EnrichResult, error) closed bool } @@ -24,7 +24,7 @@ func (m *mockProvider) Languages() []string { return m.languages } func (m *mockProvider) Available() bool { return m.available } func (m *mockProvider) Close() error { m.closed = true; return nil } -func (m *mockProvider) Enrich(g *graph.Graph, repoRoot string) (*EnrichResult, error) { +func (m *mockProvider) Enrich(g graph.Store, repoRoot string) (*EnrichResult, error) { if m.enrichFunc != nil { return m.enrichFunc(g, repoRoot) } @@ -37,7 +37,7 @@ func (m *mockProvider) Enrich(g *graph.Graph, repoRoot string) (*EnrichResult, e }, nil } -func (m *mockProvider) EnrichFile(g *graph.Graph, repoRoot, filePath string) (*EnrichResult, error) { +func (m *mockProvider) EnrichFile(g graph.Store, repoRoot, filePath string) (*EnrichResult, error) { return nil, nil } @@ -87,7 +87,7 @@ func TestManager_PrioritySelection(t *testing.T) { name: "high-priority", languages: []string{"go"}, available: true, - enrichFunc: func(g *graph.Graph, root string) (*EnrichResult, error) { + enrichFunc: func(g graph.Store, root string) (*EnrichResult, error) { highCalled = true return &EnrichResult{Provider: "high-priority", Language: "go"}, nil }, @@ -96,7 +96,7 @@ func TestManager_PrioritySelection(t *testing.T) { name: "low-priority", languages: []string{"go"}, available: true, - 
enrichFunc: func(g *graph.Graph, root string) (*EnrichResult, error) { + enrichFunc: func(g graph.Store, root string) (*EnrichResult, error) { lowCalled = true return &EnrichResult{Provider: "low-priority", Language: "go"}, nil }, diff --git a/internal/semantic/matcher.go b/internal/semantic/matcher.go index f5a677e5..6d15c723 100644 --- a/internal/semantic/matcher.go +++ b/internal/semantic/matcher.go @@ -48,7 +48,7 @@ func (m *SymbolMap) Size() int { // MatchNodeByFileLine finds a Gortex node by file path and line number. // This is the primary matching strategy for SCIP and LSP results. // It finds the innermost (smallest range) non-file node containing the line. -func MatchNodeByFileLine(g *graph.Graph, filePath string, line int) *graph.Node { +func MatchNodeByFileLine(g graph.Store, filePath string, line int) *graph.Node { nodes := g.GetFileNodes(filePath) // First: find the innermost node containing this line (smallest range). @@ -89,12 +89,12 @@ func MatchNodeByFileLine(g *graph.Graph, filePath string, line int) *graph.Node } // MatchNodeByQualName finds a Gortex node by qualified name. -func MatchNodeByQualName(g *graph.Graph, qualName string) *graph.Node { +func MatchNodeByQualName(g graph.Store, qualName string) *graph.Node { return g.GetNodeByQualName(qualName) } // MatchNodeByNameInFile finds a Gortex node by name within a specific file. -func MatchNodeByNameInFile(g *graph.Graph, name, filePath string) *graph.Node { +func MatchNodeByNameInFile(g graph.Store, name, filePath string) *graph.Node { nodes := g.GetFileNodes(filePath) for _, n := range nodes { if n.Name == name { diff --git a/internal/semantic/provider.go b/internal/semantic/provider.go index 44bca818..20ff262f 100644 --- a/internal/semantic/provider.go +++ b/internal/semantic/provider.go @@ -20,12 +20,12 @@ type Provider interface { // Enrich performs a full enrichment pass over the graph for the given repo root. // It upgrades edge confidence, adds missing edges, and fills Node.Meta fields. 
// Called after tree-sitter indexing + resolver pass completes. - Enrich(g *graph.Graph, repoRoot string) (*EnrichResult, error) + Enrich(g graph.Store, repoRoot string) (*EnrichResult, error) // EnrichFile performs a targeted enrichment for a single file and its // immediate dependents. Used in watch mode for incremental updates. // Returns nil result if incremental enrichment is not supported. - EnrichFile(g *graph.Graph, repoRoot string, filePath string) (*EnrichResult, error) + EnrichFile(g graph.Store, repoRoot string, filePath string) (*EnrichResult, error) // Close releases any resources held by the provider (daemon processes, // temp files, connections). diff --git a/internal/semantic/scip/provider.go b/internal/semantic/scip/provider.go index 16c628c4..7877b4a1 100644 --- a/internal/semantic/scip/provider.go +++ b/internal/semantic/scip/provider.go @@ -61,7 +61,7 @@ func (p *Provider) Available() bool { return err == nil } -func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResult, error) { +func (p *Provider) Enrich(g graph.Store, repoRoot string) (*semantic.EnrichResult, error) { start := time.Now() // Run the SCIP indexer. @@ -86,7 +86,7 @@ func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResu return result, nil } -func (p *Provider) EnrichFile(g *graph.Graph, repoRoot, filePath string) (*semantic.EnrichResult, error) { +func (p *Provider) EnrichFile(g graph.Store, repoRoot, filePath string) (*semantic.EnrichResult, error) { // SCIP doesn't support incremental indexing well — re-run full enrichment. // For large repos, this should be gated by the watch debounce. return nil, nil @@ -142,7 +142,7 @@ func (p *Provider) runIndexer(repoRoot string) (string, error) { } // enrichFromIndex maps SCIP data to the Gortex graph. 
-func (p *Provider) enrichFromIndex(g *graph.Graph, index *SCIPIndex, repoRoot string) *semantic.EnrichResult { +func (p *Provider) enrichFromIndex(g graph.Store, index *SCIPIndex, repoRoot string) *semantic.EnrichResult { result := &semantic.EnrichResult{} symMap := semantic.NewSymbolMap() @@ -272,6 +272,11 @@ func (p *Provider) enrichFromIndex(g *graph.Graph, index *SCIPIndex, repoRoot st } // Phase 4: Enrich node metadata from symbol documentation. + // Collect stamped nodes and round-trip them through the store at the + // end — EnrichNodeMeta mutates Node.Meta in place, which does not + // persist on disk backends (GetNode returns a per-call copy). See + // semantic.EnrichNodeMeta. + var stampedNodes []*graph.Node for _, doc := range index.Documents { for _, sym := range doc.Symbols { nodeID, ok := symMap.GortexID(sym.Symbol) @@ -289,16 +294,20 @@ func (p *Provider) enrichFromIndex(g *graph.Graph, index *SCIPIndex, repoRoot st if typeInfo != "" { semantic.EnrichNodeMeta(node, "semantic_type", typeInfo, p.Name()) result.NodesEnriched++ + stampedNodes = append(stampedNodes, node) } } } } + if len(stampedNodes) > 0 { + g.AddBatch(stampedNodes, nil) + } return result } // findContainingNode finds the innermost Gortex node that contains the given line. 
-func findContainingNode(g *graph.Graph, filePath string, line int) *graph.Node { +func findContainingNode(g graph.Store, filePath string, line int) *graph.Node { nodes := g.GetFileNodes(filePath) var best *graph.Node bestSize := int(^uint(0) >> 1) diff --git a/internal/server/dashboard.go b/internal/server/dashboard.go index e10b09fa..77db06f0 100644 --- a/internal/server/dashboard.go +++ b/internal/server/dashboard.go @@ -175,7 +175,7 @@ func splitOwner(prefix string) (owner, name string) { return "", prefix } -func reposFromGraph(g *graph.Graph) []repoEntry { +func reposFromGraph(g graph.Store) []repoEntry { stats := g.RepoStats() out := make([]repoEntry, 0, len(stats)) for prefix, s := range stats { @@ -1326,7 +1326,7 @@ func (h *Handler) handleCaveats(w http.ResponseWriter, r *http.Request) { // graph. Entries with an unresolvable symbol (e.g. cycle placeholders // or stale IDs from a prior index) are left untouched so the caller can // detect the gap instead of rendering zeros that look like real data. -func enrichCaveats(g *graph.Graph, caveats []caveatEntry) { +func enrichCaveats(g graph.Store, caveats []caveatEntry) { if g == nil { return } diff --git a/internal/server/handler.go b/internal/server/handler.go index 67fdd3c0..ae61faea 100644 --- a/internal/server/handler.go +++ b/internal/server/handler.go @@ -50,7 +50,7 @@ import ( // SetConfigManager / SetEventHub after construction. type Handler struct { mcpServer *mcpserver.MCPServer - graph *graph.Graph + graph graph.Store version string logger *zap.Logger mux *http.ServeMux @@ -65,7 +65,7 @@ type Handler struct { } // NewHandler creates an HTTP handler that dispatches to MCP tools. 
-func NewHandler(mcpServer *mcpserver.MCPServer, g *graph.Graph, version string, logger *zap.Logger) *Handler { +func NewHandler(mcpServer *mcpserver.MCPServer, g graph.Store, version string, logger *zap.Logger) *Handler { h := &Handler{ mcpServer: mcpServer, graph: g, @@ -84,7 +84,7 @@ func NewHandler(mcpServer *mcpserver.MCPServer, g *graph.Graph, version string, func (h *Handler) Mux() *http.ServeMux { return h.mux } // Graph returns the graph instance for sub-handlers that need direct access. -func (h *Handler) Graph() *graph.Graph { return h.graph } +func (h *Handler) Graph() graph.Store { return h.graph } // SetEventHub wires the watch-mode event hub so /v1/events can stream // graph-change events to subscribers, and starts the activity-buffer diff --git a/internal/skills/build.go b/internal/skills/build.go index 966132eb..8284d2d2 100644 --- a/internal/skills/build.go +++ b/internal/skills/build.go @@ -19,7 +19,7 @@ type BuildOpts struct { // Returns (nil, "") when no community meets the MinSize threshold — // callers treat both outputs as opaque payloads and pass them through // to adapters via agents.Env. -func Build(g *graph.Graph, opts BuildOpts) ([]GeneratedSkill, string) { +func Build(g graph.Store, opts BuildOpts) ([]GeneratedSkill, string) { if g == nil { return nil, "" } diff --git a/internal/skills/generator.go b/internal/skills/generator.go index ef69be3c..0e4cf6df 100644 --- a/internal/skills/generator.go +++ b/internal/skills/generator.go @@ -16,7 +16,7 @@ import ( type Generator struct { communities *analysis.CommunityResult processes *analysis.ProcessResult - graph *graph.Graph + graph graph.Store minSize int maxSkills int } @@ -30,7 +30,7 @@ type GeneratedSkill struct { } // New creates a skill generator. 
-func New(communities *analysis.CommunityResult, processes *analysis.ProcessResult, g *graph.Graph) *Generator { +func New(communities *analysis.CommunityResult, processes *analysis.ProcessResult, g graph.Store) *Generator { return &Generator{ communities: communities, processes: processes, diff --git a/internal/sql/registry.go b/internal/sql/registry.go index 085d0b31..41aaaa5d 100644 --- a/internal/sql/registry.go +++ b/internal/sql/registry.go @@ -44,7 +44,7 @@ type RebuildStats struct { // Returns counts for telemetry; rebuilt edges idempotently replace // any existing edges with the same edgeKey, so a second call after // the first reports tablesCreated=0, emittersLinked=0. -func RebuildTablesFromStringRegistry(g *graph.Graph) RebuildStats { +func RebuildTablesFromStringRegistry(g graph.Store) RebuildStats { if g == nil { return RebuildStats{} } diff --git a/internal/tokens/cache.go b/internal/tokens/cache.go index 5720f7d7..c5c2adf7 100644 --- a/internal/tokens/cache.go +++ b/internal/tokens/cache.go @@ -41,7 +41,7 @@ type DiskCache struct { } // DefaultTokenCacheDir returns the default cache location: -// ~/.cache/gortex/token-counts (or the $XDG_CACHE_HOME equivalent). +// ~/.gortex/cache/token-counts (or the $XDG_CACHE_HOME equivalent). func DefaultTokenCacheDir() string { return filepath.Join(platform.CacheDir(), "token-counts") } diff --git a/internal/wiki/enhance_cache.go b/internal/wiki/enhance_cache.go index c293fc5a..3642e2e0 100644 --- a/internal/wiki/enhance_cache.go +++ b/internal/wiki/enhance_cache.go @@ -29,7 +29,7 @@ func NewEnhanceCache(root string) *EnhanceCache { } // DefaultEnhanceCacheDir returns the default cache location: -// ~/.cache/gortex/wiki-enhance (or $XDG_CACHE_HOME equivalent). +// ~/.gortex/cache/wiki-enhance (or $XDG_CACHE_HOME equivalent). 
func DefaultEnhanceCacheDir() string { return filepath.Join(platform.CacheDir(), "wiki-enhance") } diff --git a/internal/wiki/generator.go b/internal/wiki/generator.go index ac461abc..15dfa959 100644 --- a/internal/wiki/generator.go +++ b/internal/wiki/generator.go @@ -24,7 +24,7 @@ type SemanticProviderStatus struct { // Inputs is the dependency bundle the Generator needs. All fields are // optional except Graph (without a graph there is nothing to render). type Inputs struct { - Graph *graph.Graph + Graph graph.Store Communities *analysis.CommunityResult Processes *analysis.ProcessResult Hotspots []analysis.HotspotEntry @@ -51,7 +51,7 @@ type Result struct { // derives the supporting lookup maps; Generate writes the markdown // pages and flushes the writer. type Generator struct { - graph *graph.Graph + graph graph.Store communities *analysis.CommunityResult processes *analysis.ProcessResult hotspots []analysis.HotspotEntry diff --git a/internal/wiki/mermaid.go b/internal/wiki/mermaid.go index 3fee41fe..c4246291 100644 --- a/internal/wiki/mermaid.go +++ b/internal/wiki/mermaid.go @@ -42,7 +42,7 @@ func mermaidEscape(s string) string { // the cross-community calls between them. Each node is a community; // edge weights are the number of calls flowing across the boundary. // Used both on the index page and as the wiki//_assets file. -func RenderCommunityGraph(g *graph.Graph, communities *analysis.CommunityResult, opts CommunityGraphOpts) string { +func RenderCommunityGraph(g graph.Store, communities *analysis.CommunityResult, opts CommunityGraphOpts) string { if communities == nil || len(communities.Communities) == 0 { return "graph LR\n empty[\"No communities detected\"]\n" } @@ -235,7 +235,7 @@ func stepLabel(id string, nodeByID map[string]*graph.Node) string { // RenderArchitecture emits a Mermaid flowchart showing communities // grouped by parent (when present) plus cross-community arrows. // Mirrors the architecture overview page. 
-func RenderArchitecture(g *graph.Graph, communities *analysis.CommunityResult, opts CommunityGraphOpts) string { +func RenderArchitecture(g graph.Store, communities *analysis.CommunityResult, opts CommunityGraphOpts) string { if communities == nil || len(communities.Communities) == 0 { return "graph TB\n empty[\"No communities detected\"]\n" } diff --git a/scripts/install.ps1 b/scripts/install.ps1 index dfa0eee9..7d4fced6 100644 --- a/scripts/install.ps1 +++ b/scripts/install.ps1 @@ -4,7 +4,8 @@ .DESCRIPTION Downloads the signed Windows release archive, verifies its SHA-256 - checksum, installs the binary, and puts it on the user PATH. + checksum, installs the self-contained gortex.exe, and puts the install + directory on the user PATH. Usage: irm https://get.gortex.dev/install.ps1 | iex @@ -127,8 +128,9 @@ function Main { } Write-Info 'extracting' - Expand-Archive -Path $zipPath -DestinationPath $tmp -Force - $extracted = Join-Path $tmp $BinName + $staging = Join-Path $tmp 'extract' + Expand-Archive -Path $zipPath -DestinationPath $staging -Force + $extracted = Join-Path $staging $BinName if (-not (Test-Path $extracted)) { Die "archive did not contain a $BinName binary" } @@ -140,7 +142,10 @@ function Main { Write-Info "backing up existing binary to $backup" Move-Item -Path $target -Destination $backup -Force } - Move-Item -Path $extracted -Destination $target -Force + # gortex.exe is a single self-contained binary — the mingw C/C++ + # runtime is statically linked into it — so install is a one-file + # copy with nothing else to place beside it. + Copy-Item -Path $extracted -Destination $target -Force Write-Ok "installed $target" if (-not $env:GORTEX_NO_PATH) {