-
Notifications
You must be signed in to change notification settings - Fork 0
Ci/GitHub actions kilo job #36
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,14 +4,50 @@ on: | |
| workflow_dispatch: | ||
| inputs: | ||
| prompt: | ||
| description: Prompt passed to Kilo CLI | ||
| required: true | ||
| description: Fallback prompt when command is not provided | ||
| required: false | ||
| default: "" | ||
| type: string | ||
| command: | ||
| description: Kilo command to run (without leading slash), e.g. local-review | ||
| required: false | ||
| default: "" | ||
| type: string | ||
| command_args: | ||
| description: Optional arguments passed to command mode | ||
| required: false | ||
| default: "" | ||
| type: string | ||
| timeout_minutes: | ||
| description: Timeout in minutes for job and Kilo run | ||
| required: false | ||
| default: "30" | ||
| type: string | ||
| model: | ||
| description: Gateway model ID for kilocode provider | ||
| required: false | ||
| default: "openai/gpt-5.3-codex" | ||
| type: string | ||
| pr_number: | ||
| description: Pull request number to post result to | ||
| required: false | ||
| default: "" | ||
| type: string | ||
| review_comment_id: | ||
| description: Optional PR review comment ID to reply to | ||
| required: false | ||
| default: "" | ||
| type: string | ||
| parent_comment_id: | ||
| description: Optional parent comment ID (preferred over review_comment_id) | ||
| required: false | ||
| default: "" | ||
| type: string | ||
|
|
||
| permissions: | ||
| contents: read | ||
| issues: write | ||
| pull-requests: write | ||
|
|
||
| jobs: | ||
| run-kilo: | ||
|
|
@@ -25,7 +61,7 @@ jobs: | |
| - name: Setup Node.js | ||
| uses: actions/setup-node@v4 | ||
| with: | ||
| node-version: 20 | ||
| node-version: 24 | ||
|
|
||
| - name: Install Kilo CLI | ||
| run: npm install -g @kilocode/cli | ||
|
|
@@ -36,22 +72,132 @@ jobs: | |
| echo "KILO_API_TOKEN secret is missing. Add it in repository settings before running this workflow." | ||
| exit 1 | ||
| fi | ||
| if [ -z "${{ secrets.NIKOLAY_REVIEWER_APP_ID }}" ]; then | ||
| echo "NIKOLAY_REVIEWER_APP_ID secret is missing. Add it in repository settings before running this workflow." | ||
| exit 1 | ||
| fi | ||
| if [ -z "${{ secrets.NIKOLAY_REVIEWER_PRIVATE_KEY }}" ]; then | ||
| echo "NIKOLAY_REVIEWER_PRIVATE_KEY secret is missing. Add it in repository settings before running this workflow." | ||
| exit 1 | ||
| fi | ||
|
|
||
| - name: Create GitHub App token | ||
| id: app-token | ||
| uses: actions/create-github-app-token@v1 | ||
| with: | ||
| app-id: ${{ secrets.NIKOLAY_REVIEWER_APP_ID }} | ||
| private-key: ${{ secrets.NIKOLAY_REVIEWER_PRIVATE_KEY }} | ||
| owner: ${{ github.repository_owner }} | ||
| repositories: ${{ github.event.repository.name }} | ||
|
|
||
| - name: Run Kilo CLI | ||
| env: | ||
| KILO_API_TOKEN: ${{ secrets.KILO_API_TOKEN }} | ||
| KILO_API_KEY: ${{ secrets.KILO_API_TOKEN }} | ||
| KILO_PROVIDER: kilocode | ||
| KILOCODE_MODEL: ${{ inputs.model }} | ||
| run: | | ||
| set -euo pipefail | ||
| : > kilo-run.log | ||
| : > kilo-events.log | ||
| : > kilo-output.log | ||
| kilo --version 2>&1 | tee -a kilo-run.log | ||
|
|
||
| has_timeout=false | ||
| if kilo run --help 2>&1 | grep -q -- '--timeout'; then | ||
| kilo run --auto "${{ inputs.prompt }}" --timeout "${{ inputs.timeout_minutes }}m" 2>&1 | tee -a kilo-run.log | ||
| has_timeout=true | ||
| fi | ||
|
|
||
| kilo_args=(kilo run --auto --format json) | ||
|
|
||
| if [ -n "${{ inputs.command }}" ]; then | ||
| kilo_args+=(--command "${{ inputs.command }}") | ||
| if [ -n "${{ inputs.command_args }}" ]; then | ||
| kilo_args+=("${{ inputs.command_args }}") | ||
| fi | ||
| elif [ -n "${{ inputs.prompt }}" ]; then | ||
| kilo_args+=("${{ inputs.prompt }}") | ||
| else | ||
| echo "Either 'command' or 'prompt' must be provided." | tee -a kilo-run.log | ||
| exit 1 | ||
| fi | ||
|
|
||
| if [ "$has_timeout" = true ]; then | ||
| "${kilo_args[@]}" --timeout "${{ inputs.timeout_minutes }}m" 2>&1 | tee -a kilo-events.log | tee -a kilo-run.log | ||
| else | ||
| echo "kilo run does not support --timeout; relying on job timeout-minutes." | tee -a kilo-run.log | ||
| kilo run --auto "${{ inputs.prompt }}" 2>&1 | tee -a kilo-run.log | ||
| echo "kilo run does not support --timeout; relying on job timeout-minutes." >> kilo-run.log | ||
| "${kilo_args[@]}" 2>&1 | tee -a kilo-events.log | tee -a kilo-run.log | ||
| fi | ||
|
|
||
| node -e "const fs=require('fs');const strip=(s)=>s.replace(/\x1B\[[0-9;]*[A-Za-z]/g,'');const lines=fs.readFileSync('kilo-events.log','utf8').split(/\r?\n/);const texts=[];for(const line of lines){if(!line.trim())continue;try{const evt=JSON.parse(line);if(evt&&evt.type==='text'&&evt.part&&typeof evt.part.text==='string'){const t=evt.part.text.trim();if(t)texts.push(t);}}catch{}}let out='';if(texts.length){out=texts[texts.length-1];}else{const fallback=strip(fs.readFileSync('kilo-events.log','utf8')).split(/\r?\n/).map(x=>x.trim()).filter(Boolean);out=fallback.length?fallback[fallback.length-1]:'';}fs.writeFileSync('kilo-output.log',out+(out.endsWith('\n')?'':'\n'));" | ||
|
|
||
| - name: Post result as GitHub App comment | ||
| if: ${{ always() && inputs.pr_number != '' }} | ||
| uses: actions/github-script@v7 | ||
| with: | ||
| github-token: ${{ steps.app-token.outputs.token }} | ||
| script: | | ||
| const fs = require('fs'); | ||
| const prNumber = Number('${{ inputs.pr_number }}'); | ||
| const parentCommentIdRaw = ('${{ inputs.parent_comment_id }}' || '${{ inputs.review_comment_id }}').trim(); | ||
| const parentCommentId = parentCommentIdRaw ? Number(parentCommentIdRaw) : null; | ||
| let outputContent = ''; | ||
| try { | ||
| outputContent = fs.readFileSync('kilo-output.log', 'utf8'); | ||
| } catch { | ||
| outputContent = 'kilo-output.log not found.'; | ||
| } | ||
|
|
||
| const body = outputContent.trim() || '(empty Kilo output)'; | ||
|
|
||
| if (parentCommentId) { | ||
| try { | ||
| await github.rest.pulls.createReplyForReviewComment({ | ||
| owner: context.repo.owner, | ||
| repo: context.repo.repo, | ||
| pull_number: prNumber, | ||
| comment_id: parentCommentId, | ||
| body | ||
| }); | ||
| return; | ||
| } catch (error) { | ||
| const status = error?.status; | ||
| if (status !== 404 && status !== 422) { | ||
| throw error; | ||
| } | ||
|
|
||
| let prefix = ''; | ||
| try { | ||
| const parent = await github.rest.issues.getComment({ | ||
| owner: context.repo.owner, | ||
| repo: context.repo.repo, | ||
| comment_id: parentCommentId | ||
| }); | ||
| const login = parent?.data?.user?.login; | ||
| if (login) { | ||
| prefix = `@${login} `; | ||
| } | ||
| } catch { | ||
| // Fallback to plain issue comment if parent lookup fails. | ||
| } | ||
|
|
||
| await github.rest.issues.createComment({ | ||
| owner: context.repo.owner, | ||
| repo: context.repo.repo, | ||
| issue_number: prNumber, | ||
| body: `${prefix}${body}` | ||
| }); | ||
| return; | ||
| } | ||
| } | ||
|
|
||
| await github.rest.issues.createComment({ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. SUGGESTION: The workflow still accepts |
||
| owner: context.repo.owner, | ||
| repo: context.repo.repo, | ||
| issue_number: prNumber, | ||
| body | ||
| }); | ||
|
|
||
| - name: Upload Kilo run log | ||
| if: always() | ||
| uses: actions/upload-artifact@v4 | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,199 @@ | ||
| --- | ||
| description: Benchmark-driven PR performance review versus target branch | ||
| --- | ||
|
|
||
| # Perf Review Workflow | ||
|
|
||
| You are performing a performance review for the current PR branch. | ||
|
|
||
| Non-negotiable requirements: | ||
| 1. Benchmark timing plus profiling data is the highest-priority judgment tool. | ||
| 2. Compare source branch versus target branch and report relevant benchmark metric changes. | ||
| 3. Provide analysis and a final verdict: does the PR improve performance or not. | ||
|
|
||
| ## Inputs | ||
|
|
||
| - Optional argument `--target <branch>`: target branch override. | ||
| - Optional argument `--filter <regex>`: benchmark filter regex. | ||
|
|
||
| If arguments are omitted: | ||
| - Default target branch to PR base branch from `gh pr view --json baseRefName` when available. | ||
| - Fall back target branch to `main`. | ||
| - Default filter to empty (run full selected benchmark suites). | ||
|
|
||
| ## Step 1 - Resolve Branches and Revisions | ||
|
|
||
| 1. Identify contender branch and hash: | ||
| - Contender branch: current checked-out branch (or `HEAD` if detached). | ||
| - Contender hash: `git rev-parse --short HEAD`. | ||
| 2. Identify baseline branch: | ||
| - Use `--target` if provided. | ||
| - Else use PR base branch from GitHub CLI when available. | ||
| - Else use `main`. | ||
| 3. Resolve baseline hash with `git rev-parse --short <baseline-ref>`. | ||
| 4. Print branch and hash mapping before running benchmarks. | ||
|
|
||
| ## Step 2 - Select Relevant Benchmark Binaries | ||
|
|
||
| Inspect changed files with: | ||
|
|
||
| `git diff --name-only <baseline-ref>...HEAD` | ||
|
|
||
| Map file paths to benchmark binaries: | ||
|
|
||
| | Changed path pattern | Benchmark binary | Coverage | | ||
| |---|---|---| | ||
| | `include/bit_vector*`, `include/interleaved*` | `benchmarks` | BitVector rank/select | | ||
| | `include/rmm*` | `bench_rmm` | RmM tree operations | | ||
| | `include/louds*` | `louds_tree_benchmarks` | LOUDS traversal | | ||
| | `include/simd*`, `include/aligned*` | `alignment_comparison` | SIMD and alignment | | ||
| | `include/misc/*` | all relevant | Differential helpers | | ||
| | `CMakeLists.txt`, benchmark infra, broad/unknown changes | all benchmarks | Conservative full run | | ||
|
|
||
| Available benchmark binaries: | ||
| - `benchmarks` | ||
| - `bench_rmm` | ||
| - `bench_rmm_sdsl` | ||
| - `louds_tree_benchmarks` | ||
| - `alignment_comparison` | ||
|
|
||
| If the mapping is ambiguous, run all benchmark binaries. | ||
| If `--filter` is provided, pass it through as `--benchmark_filter`. | ||
| Print selected binaries and why they were selected. | ||
|
|
||
| ## Step 3 - Build Both Revisions (Timing and Profiling Builds) | ||
|
|
||
| Use isolated build directories per short hash. | ||
|
|
||
| 1. Capture original ref (`git rev-parse --abbrev-ref HEAD` or detached `HEAD`). | ||
| 2. If worktree is dirty, stash safely with untracked files: | ||
| - `git stash push -u -m "perf-review-auto-stash"` | ||
| 3. Build baseline revision: | ||
| - `git checkout <baseline-hash-or-ref>` | ||
| - Timing build (required): | ||
| - `cmake -B build/benchmarks-all_bench_<baseline_hash> -DCMAKE_BUILD_TYPE=Release -DPIXIE_BENCHMARKS=ON` | ||
| - `cmake --build build/benchmarks-all_bench_<baseline_hash> --config Release -j` | ||
| - Profiling build (Linux only, recommended): | ||
| - `cmake -B build/benchmarks-diagnostic_bench_<baseline_hash> -DCMAKE_BUILD_TYPE=RelWithDebInfo -DPIXIE_BENCHMARKS=ON -DBENCHMARK_ENABLE_LIBPFM=ON -DPIXIE_DIAGNOSTICS=ON` | ||
| - `cmake --build build/benchmarks-diagnostic_bench_<baseline_hash> --config RelWithDebInfo -j` | ||
| 4. Build contender revision: | ||
| - `git checkout <contender-hash-or-original-ref>` | ||
| - Repeat timing and profiling build with contender hash suffix. | ||
| 5. Restore original ref and restore stashed state if a stash was created. | ||
|
|
||
| Critical guardrails: | ||
| - Never use Debug binaries for timing review. | ||
| - Timing comparisons must use `benchmarks-all` Release builds. | ||
| - Profiling counters should use `benchmarks-diagnostic` RelWithDebInfo builds. | ||
|
|
||
| ## Step 4 - Resolve Binary Paths | ||
|
|
||
| Support both generator layouts: | ||
|
|
||
| - Multi-config: `build/<dir>/Release/<binary>` or `build/<dir>/RelWithDebInfo/<binary>` | ||
| - Single-config: `build/<dir>/<binary>` | ||
|
|
||
| For each needed binary, detect the existing executable path before running. | ||
| If a required binary is missing, report failure and stop with a blocked verdict. | ||
|
|
||
| ## Step 5 - Run Timing Comparison (Primary Judgment) | ||
|
|
||
| Locate compare script from baseline timing build: | ||
|
|
||
| `build/benchmarks-all_bench_<baseline_hash>/_deps/googlebenchmark-src/tools/compare.py` | ||
|
|
||
| For each selected benchmark binary, run: | ||
|
|
||
| `python3 <compare.py> benchmarks <baseline_binary> <contender_binary> [--benchmark_filter="<filter>"]` | ||
|
|
||
| Capture full output for each binary and keep it for report details. | ||
|
|
||
| ## Step 6 - Collect Hardware Counter Profiles (Linux Only) | ||
|
|
||
| If Linux profiling build is available, run both baseline and contender diagnostic binaries with counter output: | ||
|
|
||
| - `--benchmark_counters_tabular=true` | ||
| - `--benchmark_format=json` | ||
| - `--benchmark_out=<output.json>` | ||
| - Include `--benchmark_filter` when provided. | ||
|
|
||
| Collect and compare at least these counter families when present: | ||
| - `instructions`, `cycles` (for IPC) | ||
| - `cache-misses`, `cache-references` (cache miss rate) | ||
| - `branch-misses`, `branches` (branch mispredict rate) | ||
| - `L1-dcache-load-misses` (L1 data cache pressure) | ||
|
|
||
| Compute derived metrics when denominators are non-zero: | ||
| - IPC = instructions / cycles | ||
| - Cache miss rate = cache-misses / cache-references | ||
| - Branch mispredict rate = branch-misses / branches | ||
|
|
||
| If profiling is unavailable (non-Linux or libpfm not available), continue with timing-only review and explicitly mark profiling as unavailable in the report. | ||
|
|
||
| ## Step 7 - Analyze Timing and Counter Data | ||
|
|
||
| Timing classification per benchmark entry: | ||
| - Improvement: time delta < -5% | ||
| - Regression: time delta > +5% | ||
| - Neutral: between -5% and +5% | ||
|
|
||
| Aggregate per binary: | ||
| - Number of improvements/regressions/neutral | ||
| - Net average percentage change | ||
| - Largest regression and largest improvement | ||
|
|
||
| Counter correlation: | ||
| - Use hardware counters to explain major timing changes. | ||
| - Flag anomalies (timing improves while key counters degrade, or opposite). | ||
|
|
||
| Judgment priority: | ||
| - Base verdict primarily on benchmark timing comparison. | ||
| - Use counter data as explanatory evidence and confidence signal. | ||
|
|
||
| ## Step 8 - Produce Final Markdown Report | ||
|
|
||
| Return a structured markdown report with this shape: | ||
|
|
||
| ```markdown | ||
| ## Performance Review: <contender_branch> vs <baseline_branch> | ||
|
|
||
| ### Configuration | ||
| - Baseline: <branch> (<hash>) | ||
| - Contender: <branch> (<hash>) | ||
| - Platform: <os/arch> | ||
| - Benchmarks run: <binary list> | ||
| - Filter: <regex or none> | ||
| - Hardware counters: available / unavailable | ||
|
|
||
| ### Timing Summary | ||
| | Binary | Improvements | Regressions | Neutral | Net Change | | ||
| |---|---:|---:|---:|---:| | ||
| | ... | N | N | N | +/-X% | | ||
|
|
||
| ### Detailed Timing Results | ||
| <Annotated compare.py outputs by binary> | ||
|
|
||
| ### Hardware Counter Profile (if available) | ||
| | Benchmark | IPC (base->new) | Cache Miss Rate (base->new) | Branch Mispredict (base->new) | | ||
| |---|---:|---:|---:| | ||
| | ... | X.XX -> Y.YY | A.A% -> B.B% | C.C% -> D.D% | | ||
|
|
||
| ### Key Findings | ||
| - <Most important regressions/improvements> | ||
| - <Counter-based explanations for key timing shifts> | ||
|
|
||
| ### Verdict | ||
| **[IMPROVES PERFORMANCE | REGRESSES PERFORMANCE | NO SIGNIFICANT CHANGE]** | ||
|
|
||
| <1-2 sentence justification grounded in benchmark metrics, with profiling context if available> | ||
| ``` | ||
|
|
||
| Verdict rules: | ||
| - `IMPROVES PERFORMANCE`: improvements outnumber regressions, no severe regression (>10%), and net average change is favorable. | ||
| - `REGRESSES PERFORMANCE`: any severe regression (>10%) or regressions dominate with net unfavorable average. | ||
| - `NO SIGNIFICANT CHANGE`: mostly neutral changes or mixed results that approximately cancel out. | ||
|
|
||
| ## Failure Handling | ||
|
|
||
| - If required builds fail or timing comparison cannot run, output a blocked review with exact failure points and no misleading verdict. | ||
| - If only profiling fails, continue with timing-based verdict and explicitly list profiling limitation. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SUGGESTION: Cache the npm global install to reduce workflow runtime
Installing `@kilocode/cli` on every run adds cold-start time. Consider enabling the npm cache via `actions/setup-node` (`cache: npm`), or caching the global install directory, to speed up repeated dispatch runs.