Merged

test #34
121 changes: 116 additions & 5 deletions .github/workflows/kilo-dispatch.yml
@@ -21,7 +21,7 @@ on:
timeout_minutes:
description: Timeout in minutes for job and Kilo run
required: false
default: "30"
default: "120"
type: string
model:
description: Gateway model ID for kilocode provider
@@ -57,6 +57,11 @@ jobs:

steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Fetch remote branches for baseline resolution
run: git fetch --no-tags origin +refs/heads/*:refs/remotes/origin/*

- name: Setup Node.js
uses: actions/setup-node@v4
@@ -66,6 +71,21 @@ jobs:
- name: Install Kilo CLI
run: npm install -g @kilocode/cli
SUGGESTION: Cache global Kilo CLI install

npm install -g @kilocode/cli on every run adds noticeable latency; consider caching ~/.npm (or using a local tool dir + npm ci) so subsequent runs reuse the tarballs and speed up the workflow.
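
A minimal sketch of the suggested caching step (assuming `actions/cache@v4` and the default `~/.npm` cache location; the key scheme is an assumption, since a global install has no lockfile to hash):

```yaml
      - name: Cache npm tarballs
        uses: actions/cache@v4
        with:
          path: ~/.npm
          # No lockfile exists for a global install, so key on the runner OS
          # and the package name; bump the suffix to invalidate manually.
          key: npm-global-${{ runner.os }}-kilocode-cli-v1

      - name: Install Kilo CLI
        run: npm install -g @kilocode/cli
```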


- name: Install performance dependencies
if: ${{ inputs.command == 'perf-review' }}
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
build-essential cmake ninja-build pkg-config \
python3 python3-pip \
python3-numpy python3-scipy \
libpfm4 libpfm4-dev
cmake --version
python3 --version
python3 -c "import numpy, scipy; print(f'numpy {numpy.__version__}'); print(f'scipy {scipy.__version__}')"
dpkg-query -W -f='${Package} ${Version}\n' libpfm4 libpfm4-dev
test -f /usr/include/perfmon/pfmlib.h

- name: Verify KILO_API_TOKEN secret
run: |
if [ -z "${{ secrets.KILO_API_TOKEN }}" ]; then
@@ -90,6 +110,85 @@ jobs:
owner: ${{ github.repository_owner }}
repositories: ${{ github.event.repository.name }}

- name: Post start status as GitHub App comment
if: ${{ always() && inputs.pr_number != '' && steps.app-token.outputs.token != '' }}
uses: actions/github-script@v7
with:
github-token: ${{ steps.app-token.outputs.token }}
script: |
const prNumber = Number(${{ toJSON(inputs.pr_number) }});
const command = ${{ toJSON(inputs.command) }};
const commandArgs = ${{ toJSON(inputs.command_args) }};
const prompt = ${{ toJSON(inputs.prompt) }};
const parentCommentIdRaw = ((${{ toJSON(inputs.parent_comment_id) }} || ${{ toJSON(inputs.review_comment_id) }} || '') + '').trim();
const parentCommentId = parentCommentIdRaw ? Number(parentCommentIdRaw) : null;
const runUrl = `${process.env.GITHUB_SERVER_URL}/${context.repo.owner}/${context.repo.repo}/actions/runs/${process.env.GITHUB_RUN_ID}`;

let requestSummary = '';
if ((command || '').trim()) {
requestSummary = `/${command}${(commandArgs || '').trim() ? ` ${commandArgs}` : ''}`;
} else {
requestSummary = (prompt || '').trim();
}

requestSummary = requestSummary.replace(/\s+/g, ' ').trim();
if (!requestSummary) {
requestSummary = '(no request text provided)';
}
if (requestSummary.length > 180) {
requestSummary = `${requestSummary.slice(0, 177)}...`;
}

const body = `Started Kilo run: ${requestSummary}\nRun: ${runUrl}`;

if (parentCommentId) {
try {
await github.rest.pulls.createReplyForReviewComment({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: prNumber,
comment_id: parentCommentId,
body
});
return;
} catch (error) {
const status = error?.status;
if (status !== 404 && status !== 422) {
throw error;
}

let prefix = '';
try {
const parent = await github.rest.issues.getComment({
SUGGESTION: Avoid extra API call on fallback

When the reply attempt fails, the fallback path fetches the parent comment only to @mention the author. If reducing runtime/API calls is a goal, consider skipping the lookup (post without the @mention) or passing the author login as an input, which removes an extra API round-trip on this path.
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: parentCommentId
});
const login = parent?.data?.user?.login;
if (login) {
prefix = `@${login} `;
}
} catch {
// Fallback to plain issue comment if parent lookup fails.
}

await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
body: `${prefix}${body}`
});
return;
}
}

await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
body
});

- name: Run Kilo CLI
env:
KILO_API_TOKEN: ${{ secrets.KILO_API_TOKEN }}
@@ -129,13 +228,23 @@ jobs:
"${kilo_args[@]}" 2>&1 | tee -a kilo-events.log | tee -a kilo-run.log
fi

node -e "const fs=require('fs');const strip=(s)=>s.replace(/\x1B\[[0-9;]*[A-Za-z]/g,'');const lines=fs.readFileSync('kilo-events.log','utf8').split(/\r?\n/);const texts=[];for(const line of lines){if(!line.trim())continue;try{const evt=JSON.parse(line);if(evt&&evt.type==='text'&&evt.part&&typeof evt.part.text==='string'){const t=evt.part.text.trim();if(t)texts.push(t);}}catch{}}let out='';if(texts.length){out=texts[texts.length-1];}else{const fallback=strip(fs.readFileSync('kilo-events.log','utf8')).split(/\r?\n/).map(x=>x.trim()).filter(Boolean);out=fallback.length?fallback[fallback.length-1]:'';}fs.writeFileSync('kilo-output.log',out+(out.endsWith('\n')?'':'\n'));"
node --experimental-strip-types scripts/kilo-postprocess.ts kilo-events.log kilo-output.log kilo-readable.log

- name: Post result as GitHub App comment
- name: Create GitHub App token for final comment
if: ${{ always() && inputs.pr_number != '' }}
id: app-token-final
uses: actions/create-github-app-token@v1
with:
app-id: ${{ secrets.NIKOLAY_REVIEWER_APP_ID }}
private-key: ${{ secrets.NIKOLAY_REVIEWER_PRIVATE_KEY }}
owner: ${{ github.repository_owner }}
repositories: ${{ github.event.repository.name }}

- name: Post result as GitHub App comment
if: ${{ always() && inputs.pr_number != '' && steps.app-token-final.outputs.token != '' }}
uses: actions/github-script@v7
with:
github-token: ${{ steps.app-token.outputs.token }}
github-token: ${{ steps.app-token-final.outputs.token }}
script: |
const fs = require('fs');
const prNumber = Number('${{ inputs.pr_number }}');
@@ -203,4 +312,6 @@ jobs:
uses: actions/upload-artifact@v4
with:
name: kilo-run-log
path: kilo-run.log
path: |
kilo-run.log
kilo-readable.log
60 changes: 46 additions & 14 deletions .kilo/command/perf-review.md
@@ -19,7 +19,10 @@ Non-negotiable requirements:
If arguments are omitted:
- Default target branch to PR base branch from `gh pr view --json baseRefName` when available.
- Fall back target branch to `main`.
- Default filter to empty (run full selected benchmark suites).
- Default filter must be **targeted**, not full-suite:
- Derive from changed files and changed symbols.
- If `include/pixie/bitvector.h` changed in the select path, default to `BM_Select` and add `BM_RankNonInterleaved` as a control.
- Run the full selected suites only as a last resort, when mapping fails.
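
The targeted-default rule above can be sketched as a small mapping helper. Only the bitvector mapping is stated in this document; `BM_Rmm` and the extra path pattern are illustrative assumptions:

```bash
# Sketch: derive a focused --benchmark_filter from changed file paths.
derive_filter() {
  local filter=""
  add() { filter="${filter:+${filter}|}$1"; }
  for f in "$@"; do
    case "$f" in
      include/pixie/bitvector*) add BM_Select; add BM_RankNonInterleaved ;;
      include/rmm*)             add BM_Rmm ;;   # assumed filter name
    esac
  done
  # Empty output means the mapping failed; the caller falls back to full suites.
  printf '%s\n' "$filter"
}

# In CI the argument list would come from:
#   git diff --name-only "origin/${TARGET_BRANCH}...HEAD"
derive_filter include/pixie/bitvector.h   # prints BM_Select|BM_RankNonInterleaved
```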

## Step 1 - Resolve Branches and Revisions

@@ -43,7 +46,7 @@ Map file paths to benchmark binaries:

| Changed path pattern | Benchmark binary | Coverage |
|---|---|---|
| `include/bit_vector*`, `include/interleaved*` | `benchmarks` | BitVector rank/select |
| `include/pixie/bitvector*`, `include/*bit_vector*`, `include/interleaved*` | `benchmarks` | BitVector rank/select |
| `include/rmm*` | `bench_rmm` | RmM tree operations |
| `include/louds*` | `louds_tree_benchmarks` | LOUDS traversal |
| `include/simd*`, `include/aligned*` | `alignment_comparison` | SIMD and alignment |
@@ -57,10 +60,15 @@ Available benchmark binaries:
- `louds_tree_benchmarks`
- `alignment_comparison`

If the mapping is ambiguous, run all benchmark binaries.
If the mapping is ambiguous, run all benchmark binaries but still apply a focused filter first.
If `--filter` is provided, pass it through as `--benchmark_filter`.
Print selected binaries and why they were selected.

Execution guardrails:
- Do not use background jobs (`nohup`, `&`) for benchmark runs in CI.
- Do not interleave multiple benchmark runs into one shell command stream.
- Run one benchmark command at a time and wait for completion.

## Step 3 - Build Both Revisions (Timing and Profiling Builds)

Use isolated build directories per short hash.
@@ -98,19 +106,38 @@ If a required binary is missing, report failure and stop with a blocked verdict.

## Step 5 - Run Timing Comparison (Primary Judgment)

Locate compare script from baseline timing build:

`build/benchmarks-all_bench_<baseline_hash>/_deps/googlebenchmark-src/tools/compare.py`

For each selected benchmark binary, run:

`python3 <compare.py> benchmarks <baseline_binary> <contender_binary> [--benchmark_filter="<filter>"]`

Capture full output for each binary and keep it for report details.
Use a deterministic JSON-first workflow. Do not rely on long-running `compare.py` binary-vs-binary mode.

1. Verify Python benchmark tooling once before runs:
- `python3 -c "import numpy, scipy"`
2. For each selected benchmark binary, run baseline then contender sequentially, each with explicit JSON out:
- `--benchmark_filter="<filter>"`
- `--benchmark_format=json`
- `--benchmark_out=<file>.json`
- `--benchmark_report_aggregates_only=true`
- `--benchmark_display_aggregates_only=true`
3. Suppress benchmark stdout/stderr noise when generating JSON artifacts so files stay valid:
- `> <file>.log 2>&1`
4. Validate both JSON files before comparison:
- `python3 -m json.tool <file>.json > /dev/null`
5. Compare using one of:
- `python3 <compare.py> -a benchmarks <baseline.json> <contender.json>`
- or a deterministic local Python diff script over aggregate means.
6. Keep raw JSON files and comparison output for auditability.

Timeout and retry policy:
- Use command timeouts that match benchmark scope.
- If a run times out once, narrow filter immediately and retry once.
- Maximum retry count per benchmark group: 1.
- If still timing out, produce a blocked/partial verdict with explicit scope limitations.
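
The retry policy can be sketched as a wrapper around coreutils `timeout` (the per-scope timeout value is left to the caller; in the real flow the caller narrows `--benchmark_filter` before retrying, while this sketch simply re-runs the same command):

```bash
# Run a benchmark command with a hard timeout; on timeout/failure, retry once.
run_with_retry() {
  local secs="$1"; shift
  if timeout "$secs" "$@"; then
    return 0
  fi
  echo "timed out or failed; retrying once" >&2
  timeout "$secs" "$@"   # a second failure propagates to the caller
}

run_with_retry 5 echo "benchmark run placeholder"
```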

## Step 6 - Collect Hardware Counter Profiles (Linux Only)

If Linux profiling build is available, run both baseline and contender diagnostic binaries with counter output:
Run a preflight first to avoid wasted attempts:
1. Execute one tiny benchmark with perf counters (e.g. one benchmark case) and inspect output for counter availability.
2. If output includes warnings like `Failed to get a file descriptor for performance counter`, mark counters unavailable and skip counter collection.
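
The preflight decision can be sketched as a predicate over the captured run output (the warning string is the one quoted above; the binary invocation in the comment is an assumption):

```bash
# Decide counter availability from the combined stdout/stderr of one tiny
# counter-enabled benchmark run.
counters_available() {
  ! grep -q 'Failed to get a file descriptor for performance counter' <<<"$1"
}

# In CI the output would come from something like:
#   out=$("$DIAG_BENCH" --benchmark_filter='<tiny case>' \
#         --benchmark_counters_tabular=true 2>&1)
if counters_available "BM_Tiny  12 ns  12 ns  1000000"; then
  echo "counters ok: proceed with counter collection"
fi
```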

If preflight passes and Linux profiling build is available, run both baseline and contender diagnostic binaries with counter output:

- `--benchmark_counters_tabular=true`
- `--benchmark_format=json`
@@ -128,7 +155,7 @@ Compute derived metrics when denominators are non-zero:
- Cache miss rate = cache-misses / cache-references
- Branch mispredict rate = branch-misses / branches
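
With the zero-denominator guard made explicit, the derived metrics can be computed as in this sketch (the counter values are made-up examples; in the real flow they come from the benchmark JSON):

```bash
# Example counter values standing in for parsed perf counters.
cache_misses=120000;  cache_references=4000000
branch_misses=45000;  branches=9000000

if [ "$cache_references" -gt 0 ]; then
  awk -v m="$cache_misses" -v r="$cache_references" \
    'BEGIN { printf "cache miss rate: %.4f\n", m / r }'
fi
if [ "$branches" -gt 0 ]; then
  awk -v m="$branch_misses" -v b="$branches" \
    'BEGIN { printf "branch mispredict rate: %.4f\n", m / b }'
fi
```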

If profiling is unavailable (non-Linux or libpfm not available), continue with timing-only review and explicitly mark profiling as unavailable in the report.
If profiling is unavailable (non-Linux, libpfm missing, or perf permissions blocked), continue with timing-only review and explicitly mark profiling as unavailable in the report.

## Step 7 - Analyze Timing and Counter Data

@@ -150,6 +177,10 @@ Judgment priority:
- Base verdict primarily on benchmark timing comparison.
- Use counter data as explanatory evidence and confidence signal.

Noise-control expectations:
- Include at least one control benchmark family expected to be unaffected by the code change.
- Treat isolated swings without pattern as noise unless reproduced across related sizes/fill ratios.

## Step 8 - Produce Final Markdown Report

Return a structured markdown report with this shape:
Expand Down Expand Up @@ -197,3 +228,4 @@ Verdict rules:

- If required builds fail or timing comparison cannot run, output a blocked review with exact failure points and no misleading verdict.
- If only profiling fails, continue with timing-based verdict and explicitly list profiling limitation.
- If JSON output is invalid/truncated, discard it and rerun that benchmark command once with tighter filter and explicit output redirection.
55 changes: 44 additions & 11 deletions .kilo/skills/benchmarks-compare-revisions/SKILL.md
@@ -39,30 +39,63 @@ git checkout ${CONTENDER}

## Step 2 — Compare using compare.py

Use Google Benchmark’s compare.py to run both binaries and compute a statistical comparison.
Use Google Benchmark compare tooling with a JSON-first flow to avoid long-running binary-vs-binary retries.

Locate compare.py from the Google Benchmark dependency (installed under the build tree):
```bash
COMPARE_PY=build/benchmarks-all_bench_${BASELINE}/_deps/googlebenchmark-src/tools/compare.py
```

Run the comparison (benchmarks mode):
Verify Python deps once (compare.py imports numpy/scipy):
```bash
python3 ${COMPARE_PY} benchmarks \
build/benchmarks-all_bench_${BASELINE}/benchmarks \
build/benchmarks-all_bench_${CONTENDER}/benchmarks
python3 -c "import numpy, scipy"
```

### Optional: filter to reduce noise
Generate baseline/contender JSON sequentially with explicit file outputs:
```bash
BASE_JSON=/tmp/bench_${BASELINE}.json
CONT_JSON=/tmp/bench_${CONTENDER}.json

build/benchmarks-all_bench_${BASELINE}/benchmarks \
--benchmark_report_aggregates_only=true \
--benchmark_display_aggregates_only=true \
--benchmark_format=json \
--benchmark_out=${BASE_JSON} > /tmp/bench_${BASELINE}.log 2>&1

build/benchmarks-all_bench_${CONTENDER}/benchmarks \
--benchmark_report_aggregates_only=true \
--benchmark_display_aggregates_only=true \
--benchmark_format=json \
--benchmark_out=${CONT_JSON} > /tmp/bench_${CONTENDER}.log 2>&1
```

Validate JSON before comparing:
```bash
python3 -m json.tool ${BASE_JSON} > /dev/null
python3 -m json.tool ${CONT_JSON} > /dev/null
```

Pass benchmark options after the binaries so compare.py forwards them:
Run the comparison:
```bash
python3 ${COMPARE_PY} benchmarks \
build/benchmarks-all_bench_${BASELINE}/benchmarks \
build/benchmarks-all_bench_${CONTENDER}/benchmarks \
--benchmark_filter="BM_Rank"
python3 ${COMPARE_PY} -a benchmarks ${BASE_JSON} ${CONT_JSON}
```

### Optional: filter to reduce noise and runtime

Pass filter when generating JSON files:
```bash
FILTER="BM_Rank"
build/benchmarks-all_bench_${BASELINE}/benchmarks --benchmark_filter="${FILTER}" --benchmark_report_aggregates_only=true --benchmark_display_aggregates_only=true ...
build/benchmarks-all_bench_${CONTENDER}/benchmarks --benchmark_filter="${FILTER}" --benchmark_report_aggregates_only=true --benchmark_display_aggregates_only=true ...
```

## Retry and Timeout Policy

1. Run benchmarks sequentially; do not background with `nohup`/`&`.
2. If a run times out, narrow filter and retry once.
3. Maximum retries per benchmark group: 1.
4. If still failing, emit blocked/partial findings instead of repeated attempts.

## Step 3 — Record findings

Capture the compare.py output (terminal transcript or redirected file) and note any statistically significant regressions or wins.
19 changes: 19 additions & 0 deletions .kilo/skills/benchmarks/SKILL.md
@@ -25,6 +25,8 @@ BUILD_SUFFIX=local

## Step 1 — Build

If the benchmarks affected by the changes can be identified easily, build only the related targets.

**Pure timing (benchmarks-all, Release):**
```bash
cmake -B build/benchmarks-all_${BUILD_SUFFIX} -DCMAKE_BUILD_TYPE=Release -DPIXIE_BENCHMARKS=ON
@@ -39,6 +41,13 @@ cmake --build build/benchmarks-diagnostic_${BUILD_SUFFIX} --config RelWithDebInfo

## Step 2 — Run

Prefer running benchmarks with a filter that targets only the benchmarks expected to be affected.

Execution guardrails:
- Run benchmark commands sequentially in CI.
- Avoid background jobs (`nohup`, `&`) for benchmark collection.
- Always write machine-readable results with `--benchmark_out` when data is later parsed.

### Available benchmark binaries

| Binary | What it covers |
@@ -103,10 +112,17 @@ build/benchmarks-diagnostic_${BUILD_SUFFIX}/RelWithDebInfo/benchmarks \
```bash
build/benchmarks-all_${BUILD_SUFFIX}/Release/benchmarks \
--benchmark_filter="${FILTER}" \
--benchmark_report_aggregates_only=true \
--benchmark_display_aggregates_only=true \
--benchmark_format=json \
--benchmark_out=results.json
```

Validate output before consuming:
```bash
python3 -m json.tool results.json > /dev/null
```
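
An interrupted run can still emit well-formed but empty JSON, so a slightly stricter check is a reasonable extra step (sketch; `benchmarks` is Google Benchmark's standard top-level array key):

```bash
# Check that the JSON parses AND contains at least one benchmark entry.
check_bench_json() {
  python3 -c '
import json, sys
data = json.load(open(sys.argv[1]))
assert data.get("benchmarks"), "no benchmark entries in " + sys.argv[1]
print("ok:", len(data["benchmarks"]), "entries")
' "$1"
}

# Usage: check_bench_json results.json
```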

## Step 3 — Profile with perf (Linux only)

Use when hardware counters alone are not enough and you need a full call-graph profile for post-processing.
@@ -157,3 +173,6 @@ perf script -F +pid > perf.data.txt
5. **Pin CPU frequency** before timing runs: `sudo cpupower frequency-set -g performance`
6. **Filter to reduce noise**: narrow the filter regex to the benchmark under investigation
7. **Save JSON output** when comparing before/after changes: use `--benchmark_out` and diff the files
8. **Fail fast on environment issues**: precheck Python deps used by compare tooling (`numpy`, `scipy`)
9. **Use explicit retry limits**: on timeout, narrow scope and retry once; avoid repeated full-suite attempts
10. **Preflight perf counters**: run a tiny counter-enabled benchmark first; if counters unavailable, skip counter workflow