Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 63 additions & 5 deletions action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -352,15 +352,24 @@ runs:
working-directory: target-repo
env:
BASE_SHA: ${{ steps.guard.outputs.base_sha }}
ACTION_PATH: ${{ github.action_path }}
run: |
BASE_DIR="${RUNNER_TEMP}/cb-base"
HEAD_DIR="${RUNNER_TEMP}/cb-head"
mkdir -p "$BASE_DIR" "$HEAD_DIR"
echo "base_dir=$BASE_DIR" >> $GITHUB_OUTPUT
echo "head_dir=$HEAD_DIR" >> $GITHUB_OUTPUT
if git show "${BASE_SHA}:.codeboarding/analysis.json" > "${BASE_DIR}/analysis.json" 2>/dev/null; then
echo "committed=true" >> $GITHUB_OUTPUT
echo "Using committed .codeboarding/analysis.json at ${BASE_SHA}."
if python3 "$ACTION_PATH/scripts/cb_engine.py" validate-base \
--analysis "${BASE_DIR}/analysis.json" \
--expected-sha "$BASE_SHA"; then
echo "committed=true" >> $GITHUB_OUTPUT
echo "Using committed .codeboarding/analysis.json at ${BASE_SHA}."
else
rm -f "${BASE_DIR}/analysis.json"
echo "committed=false" >> $GITHUB_OUTPUT
echo "Committed baseline at ${BASE_SHA} is stale; will generate a fresh base analysis."
fi
else
rm -f "${BASE_DIR}/analysis.json"
echo "committed=false" >> $GITHUB_OUTPUT
Expand All @@ -373,7 +382,52 @@ runs:
uses: actions/cache/restore@v4
with:
path: ${{ runner.temp }}/cb-base
key: cb-base-${{ runner.os }}-${{ steps.guard.outputs.base_sha }}-d${{ inputs.depth_level }}-${{ inputs.engine_ref }}-${{ inputs.llm_provider }}-${{ inputs.agent_model }}-${{ inputs.parsing_model }}
key: cb-base-v2-${{ runner.os }}-${{ steps.guard.outputs.base_sha }}-d${{ inputs.depth_level }}-${{ inputs.engine_ref }}-${{ inputs.llm_provider }}-${{ inputs.agent_model }}-${{ inputs.parsing_model }}

# A committed analysis.json gives the head analysis stable component ids,
# but the engine's incremental path ALSO needs the base static_analysis.pkl
# with its cluster baseline — which git can't provide (the pkl is never
# committed, by design). Build it here deterministically: LSP + Leiden
# clustering, no LLM key read. Fail-open: a failed seed degrades to exactly
# the previous behavior (the head run falls back to a full analysis).
- name: Seed base static-analysis cache (committed baseline, no cached pkl)
if: steps.guard.outputs.skip != 'true' && steps.base.outputs.committed == 'true' && steps.basecache.outputs.cache-hit != 'true'
id: seedbase
continue-on-error: true
shell: bash
working-directory: codeboarding-engine
env:
STATIC_ANALYSIS_CONFIG: ${{ github.workspace }}/codeboarding-engine/static_analysis_config.yml
PROJECT_ROOT: ${{ github.workspace }}/codeboarding-engine
CACHING_DOCUMENTATION: 'false'
ENABLE_MONITORING: 'false'
ACTION_PATH: ${{ github.action_path }}
TARGET: ${{ github.workspace }}/target-repo
BASE_DIR: ${{ steps.base.outputs.base_dir }}
BASE_SHA: ${{ steps.guard.outputs.base_sha }}
run: |
# Clean up any stale registration before re-adding (rm -rf alone leaves a
# dangling worktree entry that makes a retry's `worktree add` fail).
BASE_SRC="${RUNNER_TEMP}/base-src"
git -C "$TARGET" worktree remove --force "$BASE_SRC" 2>/dev/null || true
git -C "$TARGET" worktree prune
rm -rf "$BASE_SRC"
git -C "$TARGET" worktree add --detach "$BASE_SRC" "$BASE_SHA"
if uv run python "$ACTION_PATH/scripts/cb_engine.py" seed \
--repo "$BASE_SRC" \
--out "$BASE_DIR" \
--source-sha "$BASE_SHA" \
&& [ -f "$BASE_DIR/static_analysis.pkl" ] && [ -f "$BASE_DIR/static_analysis.sha" ]; then
echo "seed_ok=true" >> "$GITHUB_OUTPUT"
echo "::notice::Seeded base static-analysis cache for ${BASE_SHA}; head analysis can run incrementally."
else
# Never leave a partial pkl/sha pair behind: the save step below would
# cache it under this base SHA's key and suppress every retry.
rm -f "$BASE_DIR/static_analysis.pkl" "$BASE_DIR/static_analysis.sha"
echo "seed_ok=false" >> "$GITHUB_OUTPUT"
echo "::warning::Base static-analysis seeding failed; head analysis will fall back to a full run."
fi
git -C "$TARGET" worktree remove --force "$BASE_SRC" 2>/dev/null || true

- name: Generate base analysis (no committed baseline)
if: steps.guard.outputs.skip != 'true' && steps.base.outputs.committed == 'false' && steps.basecache.outputs.cache-hit != 'true'
Expand Down Expand Up @@ -424,12 +478,16 @@ runs:
exit 1
fi

# Covers both base-artifact producers: the full analysis (no committed
# baseline) and the seeded pkl (committed baseline). The seed_ok gate is
# load-bearing — caching a pkl-less dir under this base SHA's key would
# permanently suppress seeding retries for it.
- name: Save generated base artifacts
if: steps.guard.outputs.skip != 'true' && steps.base.outputs.committed == 'false' && steps.basecache.outputs.cache-hit != 'true'
if: steps.guard.outputs.skip != 'true' && steps.basecache.outputs.cache-hit != 'true' && (steps.base.outputs.committed == 'false' || steps.seedbase.outputs.seed_ok == 'true')
uses: actions/cache/save@v4
with:
path: ${{ runner.temp }}/cb-base
key: cb-base-${{ runner.os }}-${{ steps.guard.outputs.base_sha }}-d${{ inputs.depth_level }}-${{ inputs.engine_ref }}-${{ inputs.llm_provider }}-${{ inputs.agent_model }}-${{ inputs.parsing_model }}
key: cb-base-v2-${{ runner.os }}-${{ steps.guard.outputs.base_sha }}-d${{ inputs.depth_level }}-${{ inputs.engine_ref }}-${{ inputs.llm_provider }}-${{ inputs.agent_model }}-${{ inputs.parsing_model }}

- name: Analyze PR head (incremental from base)
if: steps.guard.outputs.skip != 'true'
Expand Down
10 changes: 8 additions & 2 deletions docs/COMMIT_STRATEGY.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ The engine writes these under `.codeboarding/`:
- ✅ `health/health_report.json` — required for warnings in the extension/webview. Small text.

**Do NOT commit (binary, bloat):**
- ❌ `static_analysis.pkl` — binary, MB-scale, noisy diffs, repo bloat. It is a *rebuildable speed cache*, not display data. Keep it in **`actions/cache` keyed by the base SHA** (or a backend). A cache miss just falls back to a cold (full) LSP pass — slower but correct, and the committed `analysis.json` still drives the diagram.
- ❌ `static_analysis.pkl` — binary, MB-scale, noisy diffs, repo bloat. It is a *rebuildable speed cache*, not display data. Keep it in **`actions/cache` keyed by the base SHA** (or a backend). On a cache miss the review action **seeds it deterministically** (LSP + clustering, no LLM calls) — the pkl is not optional for incremental: the engine refuses to run incrementally without the cluster baseline stored inside it.
- `static_analysis.sha` — commit **only** if the pkl is kept reachable (cache/backend); on its own it's harmless but unused.

> **Principle:** version-control the *source-of-truth display data* (text, small); *cache* the *rebuildable speed artifacts* (binary, large). This is exactly what keeps the repo clean — the thing that bloats (`.pkl`) never enters git.
Expand All @@ -40,7 +40,13 @@ The engine writes these under `.codeboarding/`:

## Warm-start tradeoff (the `.pkl`)

The warm-start needs the pkl **and** its `.sha`. When the review action has to generate a base analysis, it saves that generated base artifact directory in `actions/cache` keyed by base SHA / depth / engine ref, then seeds the head analysis from that directory. When a committed `analysis.json` already exists but no matching cache exists, the PR still diffs correctly but may run a cold LSP pass. This keeps the repo clean; the cache improves speed but is not required for correctness.
The warm-start — and the engine's incremental path itself — needs the pkl **and** its `.sha`: the cluster baseline that drives incremental lives only inside the pkl, so a committed `analysis.json` alone forces the head run into a full (LLM) fallback. The review action therefore guarantees the pair exists for the base SHA:

- **No committed baseline:** the generated base analysis writes the pkl as a side effect; the artifact dir is saved in `actions/cache` keyed by base SHA / depth / engine ref.
- **Committed baseline:** the action first requires `analysis.json.metadata.commit_hash` to equal the PR base SHA. A stale committed diagram is treated like no baseline, so the base is regenerated at the PR base commit before diffing.
- **Committed baseline, cache miss:** the action *seeds* the pkl deterministically (`cb_engine.py seed`: LSP indexing + the same clustering call a full run makes — **no LLM calls**), then saves it to the same cache. Seeding is fail-open: if it fails, the head run falls back to a full analysis.

Either way the head analysis is seeded from that directory and runs incrementally. This keeps the repo clean — the pkl never enters git — while the cache + seeding make incremental work from the first PR run.

## Summary

Expand Down
88 changes: 85 additions & 3 deletions scripts/cb_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@
Subcommands (all paths/refs come in as argv, never interpolated into source):

base --repo P --out D --name N --run-id ID --depth K --source-sha SHA
seed --repo P --out D --source-sha SHA
head --repo P --out D --name N --run-id ID --depth K --base-ref B --target-ref T --source-sha SHA
health --artifact-dir D --repo P --name N --issues-out FILE

``base`` runs a full analysis; ``head`` runs incremental, falling back to full on
``IncrementalCacheMissingError``/``BaselineUnavailableError``; ``health`` writes the
WARNING/CRITICAL finding count to ``--issues-out`` (and never fails the run).
``base`` runs a full analysis; ``seed`` builds the SHA-tagged static-analysis
pkl for a baseline that came from a committed analysis.json (LSP + clustering,
no LLM) so ``head`` can run incrementally; ``head`` runs incremental, falling
back to full on ``IncrementalCacheMissingError``/``BaselineUnavailableError``;
``health`` writes the WARNING/CRITICAL finding count to ``--issues-out`` (and
never fails the run).

The engine (``codeboarding_workflows`` etc.) is imported lazily inside each
function so this module imports without the engine venv present — the tests stub
Expand Down Expand Up @@ -38,6 +42,40 @@ def _clear_dir(path: Path) -> None:
child.unlink()


def validate_base_analysis(analysis_path: Path, expected_sha: str) -> tuple[bool, str]:
"""Return whether ``analysis.json`` was generated for ``expected_sha``.

The PR action can only reuse a committed baseline when the diagram's own
source commit matches the PR base commit. Otherwise the diff would be
computed from the PR base while mutating an older diagram snapshot.
"""
try:
data = json.loads(analysis_path.read_text(encoding="utf-8"))
except FileNotFoundError:
return False, f"Baseline analysis is missing: {analysis_path}"
except (OSError, json.JSONDecodeError) as exc:
return False, f"Baseline analysis is unreadable: {exc}"

if not isinstance(data, dict):
return False, "Baseline analysis root is not a JSON object."

metadata = data.get("metadata")
if not isinstance(metadata, dict):
return False, "Baseline analysis metadata is missing."

actual_sha = metadata.get("commit_hash")
if not isinstance(actual_sha, str) or not actual_sha:
return False, "Baseline analysis metadata.commit_hash is missing."

if actual_sha != expected_sha:
return (
False,
f"Baseline analysis was generated for {actual_sha}, expected PR base {expected_sha}.",
)

return True, f"Baseline analysis commit matches PR base {expected_sha}."


def run_base(repo: str, out: str, name: str, run_id: str, depth: int, source_sha: str) -> None:
from codeboarding_workflows.analysis import run_full

Expand All @@ -53,6 +91,36 @@ def run_base(repo: str, out: str, name: str, run_id: str, depth: int, source_sha
print(f"Base analysis written: {res}")


def run_seed(repo: str, out: str, source_sha: str) -> None:
"""Build the SHA-tagged static-analysis artifact for *repo* with no LLM calls.

A committed analysis.json gives the head analysis its component ids, but
the engine's incremental path also needs the base ``static_analysis.pkl``
with a populated cluster cache — which ``git show`` of analysis.json can
never provide. LSP indexing plus Leiden clustering are deterministic and
cost no LLM spend, so the action seeds the pkl here instead of letting the
head run degrade to a full analysis.

``build_all_cluster_results`` is the same call the full run's abstraction
agent makes, so the seeded cluster baseline matches a real full analysis.
The explicit ``save`` AFTER clustering matters: ``get_static_analysis``
persists the pkl on LSP teardown, before clustering — saving only there
would recreate the pkl-without-cluster-baseline state this fixes.

Errors propagate; the action step treats a failed seed as fail-open (the
head run falls back to a full analysis, today's behavior).
"""
from static_analyzer import get_static_analysis
from static_analyzer.analysis_cache import StaticAnalysisCache
from static_analyzer.cluster_helpers import build_all_cluster_results

results = get_static_analysis(Path(repo), cache_dir=Path(out), source_sha=source_sha)
cluster_results = build_all_cluster_results(results)
StaticAnalysisCache(Path(out), Path(repo)).save(results, source_sha=source_sha)
summary = ", ".join(f"{lang}={len(cr.clusters)}" for lang, cr in sorted(cluster_results.items()))
print(f"Seeded static-analysis baseline in {out} (clusters: {summary or 'none'})")


def run_head(repo: str, out: str, name: str, run_id: str, depth: int, base_ref: str, target_ref: str, source_sha: str) -> None:
from codeboarding_workflows.analysis import BaselineUnavailableError, run_full, run_incremental
from diagram_analysis.exceptions import IncrementalCacheMissingError
Expand Down Expand Up @@ -152,6 +220,10 @@ def main(argv=None) -> int:
b.add_argument(a, required=True)
b.add_argument("--depth", required=True, type=int, choices=range(1, 4))

s = sub.add_parser("seed")
for a in ("--repo", "--out", "--source-sha"):
s.add_argument(a, required=True)

h = sub.add_parser("head")
for a in ("--repo", "--out", "--name", "--run-id", "--base-ref", "--target-ref", "--source-sha"):
h.add_argument(a, required=True)
Expand All @@ -161,13 +233,23 @@ def main(argv=None) -> int:
for a in ("--artifact-dir", "--repo", "--name", "--issues-out"):
hc.add_argument(a, required=True)

vb = sub.add_parser("validate-base")
vb.add_argument("--analysis", required=True)
vb.add_argument("--expected-sha", required=True)

args = p.parse_args(argv)
if args.cmd == "base":
run_base(args.repo, args.out, args.name, args.run_id, args.depth, args.source_sha)
elif args.cmd == "seed":
run_seed(args.repo, args.out, args.source_sha)
elif args.cmd == "head":
run_head(args.repo, args.out, args.name, args.run_id, args.depth, args.base_ref, args.target_ref, args.source_sha)
elif args.cmd == "health":
Path(args.issues_out).write_text(str(run_health(args.artifact_dir, args.repo, args.name)))
elif args.cmd == "validate-base":
ok, message = validate_base_analysis(Path(args.analysis), args.expected_sha)
print(message)
return 0 if ok else 1
return 0


Expand Down
34 changes: 26 additions & 8 deletions scripts/run_local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -84,42 +84,60 @@ else
[ -d "$ENGINE" ] || { echo "Engine not found at $ENGINE (set --engine or \$ENGINE)." >&2; exit 2; }
[ -n "${OPENROUTER_API_KEY:-}" ] || { echo "Export OPENROUTER_API_KEY for the full pipeline." >&2; exit 2; }
REPO="$(cd "$REPO" && pwd)"
BASE_SHA="$(git -C "$REPO" rev-parse "$BASE_REF^{commit}")"
HEAD_SHA="$(git -C "$REPO" rev-parse "$HEAD_REF^{commit}")"
BASE_DIR="$OUT/base"; HEAD_DIR="$OUT/head"
rm -rf "$BASE_DIR" "$HEAD_DIR"; mkdir -p "$BASE_DIR" "$HEAD_DIR"

echo "== Resolving base analysis at $BASE_REF =="
if git -C "$REPO" show "$BASE_REF:.codeboarding/analysis.json" > "$BASE_DIR/analysis.json" 2>/dev/null; then
echo "== Resolving base analysis at $BASE_SHA =="
if git -C "$REPO" show "$BASE_SHA:.codeboarding/analysis.json" > "$BASE_DIR/analysis.json" 2>/dev/null \
&& run_engine validate-base --analysis "$BASE_DIR/analysis.json" --expected-sha "$BASE_SHA"; then
echo " using committed baseline"
# Mirror action.yml: a committed analysis.json alone can't drive incremental —
# the engine needs the base static_analysis.pkl with its cluster baseline.
# Seed it deterministically (LSP + clustering, no LLM); fail-open on error.
BASE_SRC="$OUT/base-src"
git -C "$REPO" worktree remove --force "$BASE_SRC" 2>/dev/null || true
git -C "$REPO" worktree prune
rm -rf "$BASE_SRC"
git -C "$REPO" worktree add --detach "$BASE_SRC" "$BASE_SHA" >/dev/null
if run_engine seed --repo "$BASE_SRC" --out "$BASE_DIR" --source-sha "$BASE_SHA"; then
echo " seeded static-analysis baseline (no LLM)"
else
rm -f "$BASE_DIR/static_analysis.pkl" "$BASE_DIR/static_analysis.sha"
echo " WARNING: seeding failed; head will fall back to a full run" >&2
fi
git -C "$REPO" worktree remove --force "$BASE_SRC" >/dev/null 2>&1 || true
else
rm -f "$BASE_DIR/analysis.json"
echo " no committed baseline; running FULL analysis on base (LLM)..."
BASE_SRC="$OUT/base-src"
git -C "$REPO" worktree remove --force "$BASE_SRC" 2>/dev/null || true
git -C "$REPO" worktree prune
rm -rf "$BASE_SRC"
git -C "$REPO" worktree add --detach "$BASE_SRC" "$BASE_REF" >/dev/null
git -C "$REPO" worktree add --detach "$BASE_SRC" "$BASE_SHA" >/dev/null
run_engine base \
--repo "$BASE_SRC" \
--out "$BASE_DIR" \
--name "$(basename "$REPO")" \
--run-id local-base \
--depth "$DEPTH" \
--source-sha "$BASE_REF"
--source-sha "$BASE_SHA"
git -C "$REPO" worktree remove --force "$BASE_SRC" >/dev/null 2>&1 || true
[ -f "$BASE_DIR/analysis.json" ] || { echo "Base full analysis ran but analysis.json is missing." >&2; exit 1; }
fi

echo "== Analyzing head at $HEAD_REF (incremental from base) =="
echo "== Analyzing head at $HEAD_SHA (incremental from base) =="
cp -a "$BASE_DIR"/. "$HEAD_DIR"/ 2>/dev/null || true
run_engine head \
--repo "$REPO" \
--out "$HEAD_DIR" \
--name "$(basename "$REPO")" \
--run-id local-head \
--depth "$DEPTH" \
--base-ref "$BASE_REF" \
--target-ref "$HEAD_REF" \
--source-sha "$HEAD_REF"
--base-ref "$BASE_SHA" \
--target-ref "$HEAD_SHA" \
--source-sha "$HEAD_SHA"
[ -f "$HEAD_DIR/analysis.json" ] || { echo "Head analysis ran but analysis.json is missing." >&2; exit 1; }
BASE_ANALYSIS="$BASE_DIR/analysis.json"
HEAD_ANALYSIS="$HEAD_DIR/analysis.json"
Expand Down
Loading
Loading