Merged
158 changes: 152 additions & 6 deletions .github/workflows/kilo-dispatch.yml
@@ -4,14 +4,50 @@ on:
workflow_dispatch:
inputs:
prompt:
description: Fallback prompt when command is not provided
required: false
default: ""
type: string
command:
description: Kilo command to run (without leading slash), e.g. local-review
required: false
default: ""
type: string
command_args:
description: Optional arguments passed to command mode
required: false
default: ""
type: string
timeout_minutes:
description: Timeout in minutes for job and Kilo run
required: false
default: "30"
type: string
model:
description: Gateway model ID for kilocode provider
required: false
default: "openai/gpt-5.3-codex"
type: string
pr_number:
description: Pull request number to post result to
required: false
default: ""
type: string
review_comment_id:
description: Optional PR review comment ID to reply to
required: false
default: ""
type: string
parent_comment_id:
description: Optional parent comment ID (preferred over review_comment_id)
required: false
default: ""
type: string

permissions:
contents: read
issues: write
pull-requests: write

jobs:
run-kilo:
@@ -25,7 +61,7 @@ jobs:
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: 24

- name: Install Kilo CLI
run: npm install -g @kilocode/cli
Contributor comment:
SUGGESTION: Cache the npm global install to reduce workflow runtime

Installing @kilocode/cli on every run adds cold-start time. Consider enabling npm cache via actions/setup-node (cache: npm) or caching the global install directory to speed repeated dispatch runs.
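A minimal sketch of the first option the comment suggests, assuming a `package-lock.json` exists at the repository root (setup-node's `cache: npm` keys its cache on a lockfile and fails if none is found):

```yaml
- name: Setup Node.js
  uses: actions/setup-node@v4
  with:
    node-version: 24
    cache: npm
```

This caches the npm download cache between runs, so the `npm install -g @kilocode/cli` step avoids re-downloading packages; the install itself still runs.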

@@ -36,22 +72,132 @@ jobs:
echo "KILO_API_TOKEN secret is missing. Add it in repository settings before running this workflow."
exit 1
fi
if [ -z "${{ secrets.NIKOLAY_REVIEWER_APP_ID }}" ]; then
echo "NIKOLAY_REVIEWER_APP_ID secret is missing. Add it in repository settings before running this workflow."
exit 1
fi
if [ -z "${{ secrets.NIKOLAY_REVIEWER_PRIVATE_KEY }}" ]; then
echo "NIKOLAY_REVIEWER_PRIVATE_KEY secret is missing. Add it in repository settings before running this workflow."
exit 1
fi

- name: Create GitHub App token
id: app-token
uses: actions/create-github-app-token@v1
with:
app-id: ${{ secrets.NIKOLAY_REVIEWER_APP_ID }}
private-key: ${{ secrets.NIKOLAY_REVIEWER_PRIVATE_KEY }}
owner: ${{ github.repository_owner }}
repositories: ${{ github.event.repository.name }}

- name: Run Kilo CLI
env:
KILO_API_TOKEN: ${{ secrets.KILO_API_TOKEN }}
KILO_API_KEY: ${{ secrets.KILO_API_TOKEN }}
KILO_PROVIDER: kilocode
KILOCODE_MODEL: ${{ inputs.model }}
run: |
set -euo pipefail
: > kilo-run.log
: > kilo-events.log
: > kilo-output.log
kilo --version 2>&1 | tee -a kilo-run.log

has_timeout=false
if kilo run --help 2>&1 | grep -q -- '--timeout'; then
has_timeout=true
fi

kilo_args=(kilo run --auto --format json)

if [ -n "${{ inputs.command }}" ]; then
kilo_args+=(--command "${{ inputs.command }}")
if [ -n "${{ inputs.command_args }}" ]; then
kilo_args+=("${{ inputs.command_args }}")
fi
elif [ -n "${{ inputs.prompt }}" ]; then
kilo_args+=("${{ inputs.prompt }}")
else
echo "Either 'command' or 'prompt' must be provided." | tee -a kilo-run.log
exit 1
fi

if [ "$has_timeout" = true ]; then
"${kilo_args[@]}" --timeout "${{ inputs.timeout_minutes }}m" 2>&1 | tee -a kilo-events.log | tee -a kilo-run.log
else
echo "kilo run does not support --timeout; relying on job timeout-minutes." | tee -a kilo-run.log
"${kilo_args[@]}" 2>&1 | tee -a kilo-events.log | tee -a kilo-run.log
fi

node -e "const fs=require('fs');const strip=(s)=>s.replace(/\x1B\[[0-9;]*[A-Za-z]/g,'');const lines=fs.readFileSync('kilo-events.log','utf8').split(/\r?\n/);const texts=[];for(const line of lines){if(!line.trim())continue;try{const evt=JSON.parse(line);if(evt&&evt.type==='text'&&evt.part&&typeof evt.part.text==='string'){const t=evt.part.text.trim();if(t)texts.push(t);}}catch{}}let out='';if(texts.length){out=texts[texts.length-1];}else{const fallback=strip(fs.readFileSync('kilo-events.log','utf8')).split(/\r?\n/).map(x=>x.trim()).filter(Boolean);out=fallback.length?fallback[fallback.length-1]:'';}fs.writeFileSync('kilo-output.log',out+(out.endsWith('\n')?'':'\n'));"

- name: Post result as GitHub App comment
if: ${{ always() && inputs.pr_number != '' }}
uses: actions/github-script@v7
with:
github-token: ${{ steps.app-token.outputs.token }}
script: |
const fs = require('fs');
const prNumber = Number('${{ inputs.pr_number }}');
const parentCommentIdRaw = ('${{ inputs.parent_comment_id }}' || '${{ inputs.review_comment_id }}').trim();
const parentCommentId = parentCommentIdRaw ? Number(parentCommentIdRaw) : null;
let outputContent = '';
try {
outputContent = fs.readFileSync('kilo-output.log', 'utf8');
} catch {
outputContent = 'kilo-output.log not found.';
}

const body = outputContent.trim() || '(empty Kilo output)';

if (parentCommentId) {
try {
await github.rest.pulls.createReplyForReviewComment({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: prNumber,
comment_id: parentCommentId,
body
});
return;
} catch (error) {
const status = error?.status;
if (status !== 404 && status !== 422) {
throw error;
}

let prefix = '';
try {
const parent = await github.rest.issues.getComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: parentCommentId
});
const login = parent?.data?.user?.login;
if (login) {
prefix = `@${login} `;
}
} catch {
// Fallback to plain issue comment if parent lookup fails.
}

await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
body: `${prefix}${body}`
});
return;
}
}

await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
body
});

Contributor comment:

SUGGESTION: review_comment_id input is now ignored

The workflow still accepts review_comment_id, but the script always posts a top-level issue comment. If callers pass a review comment ID expecting a threaded reply, this will change behavior. Consider removing the input or restoring the reply path.

- name: Upload Kilo run log
if: always()
uses: actions/upload-artifact@v4
199 changes: 199 additions & 0 deletions .kilo/command/perf-review.md
@@ -0,0 +1,199 @@
---
description: Benchmark-driven PR performance review versus target branch
---

# Perf Review Workflow

You are performing a performance review for the current PR branch.

Non-negotiable requirements:
1. Benchmark timing plus profiling data is the highest-priority judgment tool.
2. Compare the source branch against the target branch and report relevant benchmark metric changes.
3. Provide analysis and a final verdict on whether the PR improves performance.

## Inputs

- Optional argument `--target <branch>`: target branch override.
- Optional argument `--filter <regex>`: benchmark filter regex.

If arguments are omitted:
- Default the target branch to the PR base branch from `gh pr view --json baseRefName` when available.
- Otherwise fall back to `main`.
- Default the filter to empty (run the full selected benchmark suites).

## Step 1 - Resolve Branches and Revisions

1. Identify contender branch and hash:
- Contender branch: current checked-out branch (or `HEAD` if detached).
- Contender hash: `git rev-parse --short HEAD`.
2. Identify baseline branch:
- Use `--target` if provided.
- Else use PR base branch from GitHub CLI when available.
- Else use `main`.
3. Resolve baseline hash with `git rev-parse --short <baseline-ref>`.
4. Print branch and hash mapping before running benchmarks.
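The resolution steps above can be sketched as a small shell helper. This is a non-authoritative sketch: `resolve_revisions` is a hypothetical name, and it assumes it runs inside a git checkout with the baseline ref already fetched.

```shell
#!/bin/sh
# Sketch of Step 1: resolve contender and baseline branch/hash pairs
# and print the mapping before any benchmarks run.
resolve_revisions() {
  baseline_ref="${1:-main}"
  # git prints the literal string "HEAD" when the checkout is detached.
  contender_branch=$(git rev-parse --abbrev-ref HEAD)
  contender_hash=$(git rev-parse --short HEAD)
  baseline_hash=$(git rev-parse --short "$baseline_ref")
  printf 'baseline : %s (%s)\n' "$baseline_ref" "$baseline_hash"
  printf 'contender: %s (%s)\n' "$contender_branch" "$contender_hash"
}
```

Calling `resolve_revisions main` prints both mappings, satisfying the "print before running" requirement.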

## Step 2 - Select Relevant Benchmark Binaries

Inspect changed files with:

`git diff --name-only <baseline-ref>...HEAD`

Map file paths to benchmark binaries:

| Changed path pattern | Benchmark binary | Coverage |
|---|---|---|
| `include/bit_vector*`, `include/interleaved*` | `benchmarks` | BitVector rank/select |
| `include/rmm*` | `bench_rmm` | RmM tree operations |
| `include/louds*` | `louds_tree_benchmarks` | LOUDS traversal |
| `include/simd*`, `include/aligned*` | `alignment_comparison` | SIMD and alignment |
| `include/misc/*` | all relevant | Differential helpers |
| `CMakeLists.txt`, benchmark infra, broad/unknown changes | all benchmarks | Conservative full run |

Available benchmark binaries:
- `benchmarks`
- `bench_rmm`
- `bench_rmm_sdsl`
- `louds_tree_benchmarks`
- `alignment_comparison`

If the mapping is ambiguous, run all benchmark binaries.
If `--filter` is provided, pass it through as `--benchmark_filter`.
Print selected binaries and why they were selected.
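The mapping table can be sketched as a shell function over the changed-file list. This is illustrative only: `select_benchmarks` is a hypothetical name, and the `ALL` sentinel (standing for the conservative full run on `misc/`, CMake, or unknown paths) is a convention introduced here, not part of the spec above.

```shell
#!/bin/sh
# Sketch of Step 2: map changed file paths (stdin, one per line) to
# benchmark binary names (stdout, deduplicated).
select_benchmarks() {
  while IFS= read -r path; do
    case "$path" in
      include/bit_vector*|include/interleaved*) echo "benchmarks" ;;
      include/rmm*)                             echo "bench_rmm" ;;
      include/louds*)                           echo "louds_tree_benchmarks" ;;
      include/simd*|include/aligned*)           echo "alignment_comparison" ;;
      *)                                        echo "ALL" ;;  # conservative full run
    esac
  done | sort -u
}
```

Typical use: `git diff --name-only <baseline-ref>...HEAD | select_benchmarks`.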

## Step 3 - Build Both Revisions (Timing and Profiling Builds)

Use isolated build directories per short hash.

1. Capture original ref (`git rev-parse --abbrev-ref HEAD` or detached `HEAD`).
2. If worktree is dirty, stash safely with untracked files:
- `git stash push -u -m "perf-review-auto-stash"`
3. Build baseline revision:
- `git checkout <baseline-hash-or-ref>`
- Timing build (required):
- `cmake -B build/benchmarks-all_bench_<baseline_hash> -DCMAKE_BUILD_TYPE=Release -DPIXIE_BENCHMARKS=ON`
- `cmake --build build/benchmarks-all_bench_<baseline_hash> --config Release -j`
- Profiling build (Linux only, recommended):
- `cmake -B build/benchmarks-diagnostic_bench_<baseline_hash> -DCMAKE_BUILD_TYPE=RelWithDebInfo -DPIXIE_BENCHMARKS=ON -DBENCHMARK_ENABLE_LIBPFM=ON -DPIXIE_DIAGNOSTICS=ON`
- `cmake --build build/benchmarks-diagnostic_bench_<baseline_hash> --config RelWithDebInfo -j`
4. Build contender revision:
- `git checkout <contender-hash-or-original-ref>`
- Repeat timing and profiling build with contender hash suffix.
5. Restore original ref and restore stashed state if a stash was created.

Critical guardrails:
- Never use Debug binaries for timing review.
- Timing comparisons must use `benchmarks-all` Release builds.
- Profiling counters should use `benchmarks-diagnostic` RelWithDebInfo builds.

## Step 4 - Resolve Binary Paths

Support both generator layouts:

- Multi-config: `build/<dir>/Release/<binary>` or `build/<dir>/RelWithDebInfo/<binary>`
- Single-config: `build/<dir>/<binary>`

For each needed binary, detect the existing executable path before running.
If a required binary is missing, report failure and stop with a blocked verdict.
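The two-layout probe can be sketched as follows; `find_binary` is a hypothetical helper name, and the directory convention is assumed to match the build commands in Step 3.

```shell
#!/bin/sh
# Sketch of Step 4: probe the multi-config layout first, then the
# single-config layout; fail loudly if neither contains the binary.
find_binary() {
  dir="$1"; config="$2"; name="$3"
  for candidate in "build/$dir/$config/$name" "build/$dir/$name"; do
    if [ -x "$candidate" ]; then
      echo "$candidate"
      return 0
    fi
  done
  echo "missing required binary: $name (looked under build/$dir)" >&2
  return 1
}
```

A non-zero return here is the trigger for the blocked verdict described above.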

## Step 5 - Run Timing Comparison (Primary Judgment)

Locate compare script from baseline timing build:

`build/benchmarks-all_bench_<baseline_hash>/_deps/googlebenchmark-src/tools/compare.py`

For each selected benchmark binary, run:

`python3 <compare.py> benchmarks <baseline_binary> <contender_binary> [--benchmark_filter="<filter>"]`

Capture full output for each binary and keep it for report details.
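Composing the invocation as a string before running it makes the exact command easy to log alongside the captured output. A minimal sketch, assuming the path arguments come from Steps 3-4; `compare_cmd` is a hypothetical name:

```shell
#!/bin/sh
# Sketch of Step 5: build the compare.py command line, appending the
# optional --benchmark_filter only when a filter was provided.
compare_cmd() {
  compare_py="$1"; baseline_bin="$2"; contender_bin="$3"; filter="${4:-}"
  cmd="python3 $compare_py benchmarks $baseline_bin $contender_bin"
  if [ -n "$filter" ]; then
    cmd="$cmd --benchmark_filter=\"$filter\""
  fi
  echo "$cmd"
}
```

The caller can `echo` the string into the report's details section and then `eval` or re-type it to run the comparison.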

## Step 6 - Collect Hardware Counter Profiles (Linux Only)

If Linux profiling build is available, run both baseline and contender diagnostic binaries with counter output:

- `--benchmark_counters_tabular=true`
- `--benchmark_format=json`
- `--benchmark_out=<output.json>`
- Include `--benchmark_filter` when provided.

Collect and compare at least these counter families when present:
- `instructions`, `cycles` (for IPC)
- `cache-misses`, `cache-references` (cache miss rate)
- `branch-misses`, `branches` (branch mispredict rate)
- `L1-dcache-load-misses` (L1 data cache pressure)

Compute derived metrics when denominators are non-zero:
- IPC = instructions / cycles
- Cache miss rate = cache-misses / cache-references
- Branch mispredict rate = branch-misses / branches

If profiling is unavailable (non-Linux or libpfm not available), continue with timing-only review and explicitly mark profiling as unavailable in the report.
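The derived metrics above are plain ratios with a zero-denominator guard; a minimal sketch (the `ratio` helper name is hypothetical):

```shell
#!/bin/sh
# Sketch of the Step 6 derived metrics: print num/den to four decimal
# places, or "n/a" when the denominator is zero.
ratio() {
  num="$1"; den="$2"
  awk -v n="$num" -v d="$den" 'BEGIN { if (d == 0) print "n/a"; else printf "%.4f\n", n / d }'
}
# IPC                  = ratio "$instructions" "$cycles"
# Cache miss rate      = ratio "$cache_misses" "$cache_references"
# Branch mispredict    = ratio "$branch_misses" "$branches"
```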

## Step 7 - Analyze Timing and Counter Data

Timing classification per benchmark entry:
- Improvement: time delta < -5%
- Regression: time delta > +5%
- Neutral: between -5% and +5%

Aggregate per binary:
- Number of improvements/regressions/neutral
- Net average percentage change
- Largest regression and largest improvement

Counter correlation:
- Use hardware counters to explain major timing changes.
- Flag anomalies (timing improves while key counters degrade, or the opposite).

Judgment priority:
- Base verdict primarily on benchmark timing comparison.
- Use counter data as explanatory evidence and confidence signal.
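The per-entry classification can be sketched directly from the ±5% thresholds; `classify` is a hypothetical name, and the input is the percentage time delta reported by compare.py (e.g. `-7.2` for a 7.2% speedup):

```shell
#!/bin/sh
# Sketch of the Step 7 classification: <-5% improvement, >+5% regression,
# everything between is neutral.
classify() {
  awk -v d="$1" 'BEGIN {
    if (d < -5)      print "Improvement"
    else if (d > 5)  print "Regression"
    else             print "Neutral"
  }'
}
```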

## Step 8 - Produce Final Markdown Report

Return a structured markdown report with this shape:

```markdown
## Performance Review: <contender_branch> vs <baseline_branch>

### Configuration
- Baseline: <branch> (<hash>)
- Contender: <branch> (<hash>)
- Platform: <os/arch>
- Benchmarks run: <binary list>
- Filter: <regex or none>
- Hardware counters: available / unavailable

### Timing Summary
| Binary | Improvements | Regressions | Neutral | Net Change |
|---|---:|---:|---:|---:|
| ... | N | N | N | +/-X% |

### Detailed Timing Results
<Annotated compare.py outputs by binary>

### Hardware Counter Profile (if available)
| Benchmark | IPC (base->new) | Cache Miss Rate (base->new) | Branch Mispredict (base->new) |
|---|---:|---:|---:|
| ... | X.XX -> Y.YY | A.A% -> B.B% | C.C% -> D.D% |

### Key Findings
- <Most important regressions/improvements>
- <Counter-based explanations for key timing shifts>

### Verdict
**[IMPROVES PERFORMANCE | REGRESSES PERFORMANCE | NO SIGNIFICANT CHANGE]**

<1-2 sentence justification grounded in benchmark metrics, with profiling context if available>
```

Verdict rules:
- `IMPROVES PERFORMANCE`: improvements outnumber regressions, no severe regression (>10%), and net average change is favorable.
- `REGRESSES PERFORMANCE`: any severe regression (>10%) or regressions dominate with net unfavorable average.
- `NO SIGNIFICANT CHANGE`: mostly neutral changes or mixed results that approximately cancel out.
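The verdict rules can be sketched as one decision function. This is a hedged reading of the rules above: `verdict` is a hypothetical name, the inputs come from the Step 7 aggregation, and a negative net percentage is treated as favorable (less time).

```shell
#!/bin/sh
# Sketch of the verdict rules: any >10% regression (or dominant
# regressions with an unfavorable net) regresses; clear wins improve;
# everything else is no significant change.
verdict() {
  improvements="$1"; regressions="$2"; worst_regression_pct="$3"; net_pct="$4"
  awk -v i="$improvements" -v r="$regressions" -v w="$worst_regression_pct" -v n="$net_pct" 'BEGIN {
    if (w > 10 || (r > i && n > 0))     print "REGRESSES PERFORMANCE"
    else if (i > r && w <= 10 && n < 0) print "IMPROVES PERFORMANCE"
    else                                print "NO SIGNIFICANT CHANGE"
  }'
}
```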

## Failure Handling

- If required builds fail or timing comparison cannot run, output a blocked review with exact failure points and no misleading verdict.
- If only profiling fails, continue with timing-based verdict and explicitly list profiling limitation.