CodeBoarding · brovatten · Jun 10, 2026 · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026
diff --git a/action.yml b/action.yml
@@ -352,15 +352,24 @@ runs:
       working-directory: target-repo
       env:
         BASE_SHA: ${{ steps.guard.outputs.base_sha }}
+        ACTION_PATH: ${{ github.action_path }}
       run: |
         BASE_DIR="${RUNNER_TEMP}/cb-base"
         HEAD_DIR="${RUNNER_TEMP}/cb-head"
         mkdir -p "$BASE_DIR" "$HEAD_DIR"
         echo "base_dir=$BASE_DIR" >> $GITHUB_OUTPUT
         echo "head_dir=$HEAD_DIR" >> $GITHUB_OUTPUT
         if git show "${BASE_SHA}:.codeboarding/analysis.json" > "${BASE_DIR}/analysis.json" 2>/dev/null; then
-          echo "committed=true" >> $GITHUB_OUTPUT
-          echo "Using committed .codeboarding/analysis.json at ${BASE_SHA}."
+          if python3 "$ACTION_PATH/scripts/cb_engine.py" validate-base \
+              --analysis "${BASE_DIR}/analysis.json" \
+              --expected-sha "$BASE_SHA"; then
+            echo "committed=true" >> $GITHUB_OUTPUT
+            echo "Using committed .codeboarding/analysis.json at ${BASE_SHA}."
+          else
+            rm -f "${BASE_DIR}/analysis.json"
+            echo "committed=false" >> $GITHUB_OUTPUT
+            echo "Committed baseline at ${BASE_SHA} is stale; will generate a fresh base analysis."
+          fi
         else
           rm -f "${BASE_DIR}/analysis.json"
           echo "committed=false" >> $GITHUB_OUTPUT
@@ -373,7 +382,52 @@ runs:
       uses: actions/cache/restore@v4
       with:
         path: ${{ runner.temp }}/cb-base
-        key: cb-base-${{ runner.os }}-${{ steps.guard.outputs.base_sha }}-d${{ inputs.depth_level }}-${{ inputs.engine_ref }}-${{ inputs.llm_provider }}-${{ inputs.agent_model }}-${{ inputs.parsing_model }}
+        key: cb-base-v2-${{ runner.os }}-${{ steps.guard.outputs.base_sha }}-d${{ inputs.depth_level }}-${{ inputs.engine_ref }}-${{ inputs.llm_provider }}-${{ inputs.agent_model }}-${{ inputs.parsing_model }}
+
+    # A committed analysis.json gives the head analysis stable component ids,
+    # but the engine's incremental path ALSO needs the base static_analysis.pkl
+    # with its cluster baseline — which git can't provide (the pkl is never
+    # committed, by design). Build it here deterministically: LSP + Leiden
+    # clustering, no LLM key read. Fail-open: a failed seed degrades to exactly
+    # the previous behavior (the head run falls back to a full analysis).
+    - name: Seed base static-analysis cache (committed baseline, no cached pkl)
+      if: steps.guard.outputs.skip != 'true' && steps.base.outputs.committed == 'true' && steps.basecache.outputs.cache-hit != 'true'
+      id: seedbase
+      continue-on-error: true
+      shell: bash
+      working-directory: codeboarding-engine
+      env:
+        STATIC_ANALYSIS_CONFIG: ${{ github.workspace }}/codeboarding-engine/static_analysis_config.yml
+        PROJECT_ROOT: ${{ github.workspace }}/codeboarding-engine
+        CACHING_DOCUMENTATION: 'false'
+        ENABLE_MONITORING: 'false'
+        ACTION_PATH: ${{ github.action_path }}
+        TARGET: ${{ github.workspace }}/target-repo
+        BASE_DIR: ${{ steps.base.outputs.base_dir }}
+        BASE_SHA: ${{ steps.guard.outputs.base_sha }}
+      run: |
+        # Clean up any stale registration before re-adding (rm -rf alone leaves a
+        # dangling worktree entry that makes a retry's `worktree add` fail).
+        BASE_SRC="${RUNNER_TEMP}/base-src"
+        git -C "$TARGET" worktree remove --force "$BASE_SRC" 2>/dev/null || true
+        git -C "$TARGET" worktree prune
+        rm -rf "$BASE_SRC"
+        git -C "$TARGET" worktree add --detach "$BASE_SRC" "$BASE_SHA"
+        if uv run python "$ACTION_PATH/scripts/cb_engine.py" seed \
+             --repo "$BASE_SRC" \
+             --out "$BASE_DIR" \
+             --source-sha "$BASE_SHA" \
+           && [ -f "$BASE_DIR/static_analysis.pkl" ] && [ -f "$BASE_DIR/static_analysis.sha" ]; then
+          echo "seed_ok=true" >> "$GITHUB_OUTPUT"
+          echo "::notice::Seeded base static-analysis cache for ${BASE_SHA}; head analysis can run incrementally."
+        else
+          # Never leave a partial pkl/sha pair behind: the save step below would
+          # cache it under this base SHA's key and suppress every retry.
+          rm -f "$BASE_DIR/static_analysis.pkl" "$BASE_DIR/static_analysis.sha"
+          echo "seed_ok=false" >> "$GITHUB_OUTPUT"
+          echo "::warning::Base static-analysis seeding failed; head analysis will fall back to a full run."
+        fi
+        git -C "$TARGET" worktree remove --force "$BASE_SRC" 2>/dev/null || true
 
     - name: Generate base analysis (no committed baseline)
       if: steps.guard.outputs.skip != 'true' && steps.base.outputs.committed == 'false' && steps.basecache.outputs.cache-hit != 'true'
@@ -424,12 +478,16 @@ runs:
           exit 1
         fi
 
+    # Covers both base-artifact producers: the full analysis (no committed
+    # baseline) and the seeded pkl (committed baseline). The seed_ok gate is
+    # load-bearing — caching a pkl-less dir under this base SHA's key would
+    # permanently suppress seeding retries for it.
     - name: Save generated base artifacts
-      if: steps.guard.outputs.skip != 'true' && steps.base.outputs.committed == 'false' && steps.basecache.outputs.cache-hit != 'true'
+      if: steps.guard.outputs.skip != 'true' && steps.basecache.outputs.cache-hit != 'true' && (steps.base.outputs.committed == 'false' || steps.seedbase.outputs.seed_ok == 'true')
       uses: actions/cache/save@v4
       with:
         path: ${{ runner.temp }}/cb-base
-        key: cb-base-${{ runner.os }}-${{ steps.guard.outputs.base_sha }}-d${{ inputs.depth_level }}-${{ inputs.engine_ref }}-${{ inputs.llm_provider }}-${{ inputs.agent_model }}-${{ inputs.parsing_model }}
+        key: cb-base-v2-${{ runner.os }}-${{ steps.guard.outputs.base_sha }}-d${{ inputs.depth_level }}-${{ inputs.engine_ref }}-${{ inputs.llm_provider }}-${{ inputs.agent_model }}-${{ inputs.parsing_model }}
 
     - name: Analyze PR head (incremental from base)
       if: steps.guard.outputs.skip != 'true'

diff --git a/docs/COMMIT_STRATEGY.md b/docs/COMMIT_STRATEGY.md
@@ -21,7 +21,7 @@ The engine writes these under `.codeboarding/`:
 - ✅ `health/health_report.json` — required for warnings in the extension/webview. Small text.
 
 **Do NOT commit (binary, bloat):**
-- ❌ `static_analysis.pkl` — binary, MB-scale, noisy diffs, repo bloat. It is a *rebuildable speed cache*, not display data. Keep it in **`actions/cache` keyed by the base SHA** (or a backend). A cache miss just falls back to a cold (full) LSP pass — slower but correct, and the committed `analysis.json` still drives the diagram.
+- ❌ `static_analysis.pkl` — binary, MB-scale, noisy diffs, repo bloat. It is a *rebuildable speed cache*, not display data. Keep it in **`actions/cache` keyed by the base SHA** (or a backend). On a cache miss the review action **seeds it deterministically** (LSP + clustering, no LLM calls) — the pkl is not optional for incremental: the engine refuses to run incrementally without the cluster baseline stored inside it.
 - `static_analysis.sha` — commit **only** if the pkl is kept reachable (cache/backend); on its own it's harmless but unused.
 
 > **Principle:** version-control the *source-of-truth display data* (text, small); *cache* the *rebuildable speed artifacts* (binary, large). This is exactly what keeps the repo clean — the thing that bloats (`.pkl`) never enters git.
@@ -40,7 +40,13 @@ The engine writes these under `.codeboarding/`:
 
 ## Warm-start tradeoff (the `.pkl`)
 
-The warm-start needs the pkl **and** its `.sha`. When the review action has to generate a base analysis, it saves that generated base artifact directory in `actions/cache` keyed by base SHA / depth / engine ref, then seeds the head analysis from that directory. When a committed `analysis.json` already exists but no matching cache exists, the PR still diffs correctly but may run a cold LSP pass. This keeps the repo clean; the cache improves speed but is not required for correctness.
+The warm-start — and the engine's incremental path itself — needs the pkl **and** its `.sha`: the cluster baseline that drives incremental lives only inside the pkl, so a committed `analysis.json` alone forces the head run into a full (LLM) fallback. The review action therefore guarantees the pair exists for the base SHA:
+
+- **No committed baseline:** the generated base analysis writes the pkl as a side effect; the artifact dir is saved in `actions/cache` keyed by base SHA / depth / engine ref.
+- **Committed baseline:** the action first requires `analysis.json.metadata.commit_hash` to equal the PR base SHA. A stale committed diagram is treated like no baseline, so the base is regenerated at the PR base commit before diffing.
+- **Committed baseline, cache miss:** the action *seeds* the pkl deterministically (`cb_engine.py seed`: LSP indexing + the same clustering call a full run makes — **no LLM calls**), then saves it to the same cache. Seeding is fail-open: if it fails, the head run falls back to a full analysis.
+
+Either way the head analysis is seeded from that directory and runs incrementally. This keeps the repo clean — the pkl never enters git — while the cache + seeding make incremental work from the first PR run.
 
 ## Summary
 

diff --git a/scripts/cb_engine.py b/scripts/cb_engine.py
@@ -4,12 +4,16 @@
 Subcommands (all paths/refs come in as argv, never interpolated into source):
 
   base    --repo P --out D --name N --run-id ID --depth K --source-sha SHA
+  seed    --repo P --out D --source-sha SHA
   head    --repo P --out D --name N --run-id ID --depth K --base-ref B --target-ref T --source-sha SHA
   health  --artifact-dir D --repo P --name N --issues-out FILE
 
-``base`` runs a full analysis; ``head`` runs incremental, falling back to full on
-``IncrementalCacheMissingError``/``BaselineUnavailableError``; ``health`` writes the
-WARNING/CRITICAL finding count to ``--issues-out`` (and never fails the run).
+``base`` runs a full analysis; ``seed`` builds the SHA-tagged static-analysis
+pkl for a baseline that came from a committed analysis.json (LSP + clustering,
+no LLM) so ``head`` can run incrementally; ``head`` runs incremental, falling
+back to full on ``IncrementalCacheMissingError``/``BaselineUnavailableError``;
+``health`` writes the WARNING/CRITICAL finding count to ``--issues-out`` (and
+never fails the run).
 
 The engine (``codeboarding_workflows`` etc.) is imported lazily inside each
 function so this module imports without the engine venv present — the tests stub
@@ -38,6 +42,40 @@ def _clear_dir(path: Path) -> None:
             child.unlink()
 
 
+def validate_base_analysis(analysis_path: Path, expected_sha: str) -> tuple[bool, str]:
+    """Return whether ``analysis.json`` was generated for ``expected_sha``.
+
+    The PR action can only reuse a committed baseline when the diagram's own
+    source commit matches the PR base commit. Otherwise the diff would be
+    computed from the PR base while mutating an older diagram snapshot.
+    """
+    try:
+        data = json.loads(analysis_path.read_text(encoding="utf-8"))
+    except FileNotFoundError:
+        return False, f"Baseline analysis is missing: {analysis_path}"
+    except (OSError, json.JSONDecodeError) as exc:
+        return False, f"Baseline analysis is unreadable: {exc}"
+
+    if not isinstance(data, dict):
+        return False, "Baseline analysis root is not a JSON object."
+
+    metadata = data.get("metadata")
+    if not isinstance(metadata, dict):
+        return False, "Baseline analysis metadata is missing."
+
+    actual_sha = metadata.get("commit_hash")
+    if not isinstance(actual_sha, str) or not actual_sha:
+        return False, "Baseline analysis metadata.commit_hash is missing."
+
+    if actual_sha != expected_sha:
+        return (
+            False,
+            f"Baseline analysis was generated for {actual_sha}, expected PR base {expected_sha}.",
+        )
+
+    return True, f"Baseline analysis commit matches PR base {expected_sha}."
+
+
 def run_base(repo: str, out: str, name: str, run_id: str, depth: int, source_sha: str) -> None:
     from codeboarding_workflows.analysis import run_full
 
@@ -53,6 +91,36 @@ def run_base(repo: str, out: str, name: str, run_id: str, depth: int, source_sha
     print(f"Base analysis written: {res}")
 
 
+def run_seed(repo: str, out: str, source_sha: str) -> None:
+    """Build the SHA-tagged static-analysis artifact for *repo* with no LLM calls.
+
+    A committed analysis.json gives the head analysis its component ids, but
+    the engine's incremental path also needs the base ``static_analysis.pkl``
+    with a populated cluster cache — which ``git show`` of analysis.json can
+    never provide. LSP indexing plus Leiden clustering are deterministic and
+    cost no LLM spend, so the action seeds the pkl here instead of letting the
+    head run degrade to a full analysis.
+
+    ``build_all_cluster_results`` is the same call the full run's abstraction
+    agent makes, so the seeded cluster baseline matches a real full analysis.
+    The explicit ``save`` AFTER clustering matters: ``get_static_analysis``
+    persists the pkl on LSP teardown, before clustering — saving only there
+    would recreate the pkl-without-cluster-baseline state this fixes.
+
+    Errors propagate; the action step treats a failed seed as fail-open (the
+    head run falls back to a full analysis, today's behavior).
+    """
+    from static_analyzer import get_static_analysis
+    from static_analyzer.analysis_cache import StaticAnalysisCache
+    from static_analyzer.cluster_helpers import build_all_cluster_results
+
+    results = get_static_analysis(Path(repo), cache_dir=Path(out), source_sha=source_sha)
+    cluster_results = build_all_cluster_results(results)
+    StaticAnalysisCache(Path(out), Path(repo)).save(results, source_sha=source_sha)
+    summary = ", ".join(f"{lang}={len(cr.clusters)}" for lang, cr in sorted(cluster_results.items()))
+    print(f"Seeded static-analysis baseline in {out} (clusters: {summary or 'none'})")
+
+
 def run_head(repo: str, out: str, name: str, run_id: str, depth: int, base_ref: str, target_ref: str, source_sha: str) -> None:
     from codeboarding_workflows.analysis import BaselineUnavailableError, run_full, run_incremental
     from diagram_analysis.exceptions import IncrementalCacheMissingError
@@ -152,6 +220,10 @@ def main(argv=None) -> int:
         b.add_argument(a, required=True)
     b.add_argument("--depth", required=True, type=int, choices=range(1, 4))
 
+    s = sub.add_parser("seed")
+    for a in ("--repo", "--out", "--source-sha"):
+        s.add_argument(a, required=True)
+
     h = sub.add_parser("head")
     for a in ("--repo", "--out", "--name", "--run-id", "--base-ref", "--target-ref", "--source-sha"):
         h.add_argument(a, required=True)
@@ -161,13 +233,23 @@ def main(argv=None) -> int:
     for a in ("--artifact-dir", "--repo", "--name", "--issues-out"):
         hc.add_argument(a, required=True)
 
+    vb = sub.add_parser("validate-base")
+    vb.add_argument("--analysis", required=True)
+    vb.add_argument("--expected-sha", required=True)
+
     args = p.parse_args(argv)
     if args.cmd == "base":
         run_base(args.repo, args.out, args.name, args.run_id, args.depth, args.source_sha)
+    elif args.cmd == "seed":
+        run_seed(args.repo, args.out, args.source_sha)
     elif args.cmd == "head":
         run_head(args.repo, args.out, args.name, args.run_id, args.depth, args.base_ref, args.target_ref, args.source_sha)
     elif args.cmd == "health":
         Path(args.issues_out).write_text(str(run_health(args.artifact_dir, args.repo, args.name)))
+    elif args.cmd == "validate-base":
+        ok, message = validate_base_analysis(Path(args.analysis), args.expected_sha)
+        print(message)
+        return 0 if ok else 1
     return 0
 
 

diff --git a/scripts/run_local.sh b/scripts/run_local.sh
@@ -84,42 +84,60 @@ else
   [ -d "$ENGINE" ] || { echo "Engine not found at $ENGINE (set --engine or \$ENGINE)." >&2; exit 2; }
   [ -n "${OPENROUTER_API_KEY:-}" ] || { echo "Export OPENROUTER_API_KEY for the full pipeline." >&2; exit 2; }
   REPO="$(cd "$REPO" && pwd)"
+  BASE_SHA="$(git -C "$REPO" rev-parse "$BASE_REF^{commit}")"
+  HEAD_SHA="$(git -C "$REPO" rev-parse "$HEAD_REF^{commit}")"
   BASE_DIR="$OUT/base"; HEAD_DIR="$OUT/head"
   rm -rf "$BASE_DIR" "$HEAD_DIR"; mkdir -p "$BASE_DIR" "$HEAD_DIR"
 
-  echo "== Resolving base analysis at $BASE_REF =="
-  if git -C "$REPO" show "$BASE_REF:.codeboarding/analysis.json" > "$BASE_DIR/analysis.json" 2>/dev/null; then
+  echo "== Resolving base analysis at $BASE_SHA =="
+  if git -C "$REPO" show "$BASE_SHA:.codeboarding/analysis.json" > "$BASE_DIR/analysis.json" 2>/dev/null \
+     && run_engine validate-base --analysis "$BASE_DIR/analysis.json" --expected-sha "$BASE_SHA"; then
     echo "  using committed baseline"
+    # Mirror action.yml: a committed analysis.json alone can't drive incremental —
+    # the engine needs the base static_analysis.pkl with its cluster baseline.
+    # Seed it deterministically (LSP + clustering, no LLM); fail-open on error.
+    BASE_SRC="$OUT/base-src"
+    git -C "$REPO" worktree remove --force "$BASE_SRC" 2>/dev/null || true
+    git -C "$REPO" worktree prune
+    rm -rf "$BASE_SRC"
+    git -C "$REPO" worktree add --detach "$BASE_SRC" "$BASE_SHA" >/dev/null
+    if run_engine seed --repo "$BASE_SRC" --out "$BASE_DIR" --source-sha "$BASE_SHA"; then
+      echo "  seeded static-analysis baseline (no LLM)"
+    else
+      rm -f "$BASE_DIR/static_analysis.pkl" "$BASE_DIR/static_analysis.sha"
+      echo "  WARNING: seeding failed; head will fall back to a full run" >&2
+    fi
+    git -C "$REPO" worktree remove --force "$BASE_SRC" >/dev/null 2>&1 || true
   else
     rm -f "$BASE_DIR/analysis.json"
     echo "  no committed baseline; running FULL analysis on base (LLM)..."
     BASE_SRC="$OUT/base-src"
     git -C "$REPO" worktree remove --force "$BASE_SRC" 2>/dev/null || true
     git -C "$REPO" worktree prune
     rm -rf "$BASE_SRC"
-    git -C "$REPO" worktree add --detach "$BASE_SRC" "$BASE_REF" >/dev/null
+    git -C "$REPO" worktree add --detach "$BASE_SRC" "$BASE_SHA" >/dev/null
     run_engine base \
       --repo "$BASE_SRC" \
       --out "$BASE_DIR" \
       --name "$(basename "$REPO")" \
       --run-id local-base \
       --depth "$DEPTH" \
-      --source-sha "$BASE_REF"
+      --source-sha "$BASE_SHA"
     git -C "$REPO" worktree remove --force "$BASE_SRC" >/dev/null 2>&1 || true
     [ -f "$BASE_DIR/analysis.json" ] || { echo "Base full analysis ran but analysis.json is missing." >&2; exit 1; }
   fi
 
-  echo "== Analyzing head at $HEAD_REF (incremental from base) =="
+  echo "== Analyzing head at $HEAD_SHA (incremental from base) =="
   cp -a "$BASE_DIR"/. "$HEAD_DIR"/ 2>/dev/null || true
   run_engine head \
     --repo "$REPO" \
     --out "$HEAD_DIR" \
     --name "$(basename "$REPO")" \
     --run-id local-head \
     --depth "$DEPTH" \
-    --base-ref "$BASE_REF" \
-    --target-ref "$HEAD_REF" \
-    --source-sha "$HEAD_REF"
+    --base-ref "$BASE_SHA" \
+    --target-ref "$HEAD_SHA" \
+    --source-sha "$HEAD_SHA"
   [ -f "$HEAD_DIR/analysis.json" ] || { echo "Head analysis ran but analysis.json is missing." >&2; exit 1; }
   BASE_ANALYSIS="$BASE_DIR/analysis.json"
   HEAD_ANALYSIS="$HEAD_DIR/analysis.json"