From 70cd851ad3544f566f97096febe3415be400e06d Mon Sep 17 00:00:00 2001 From: SkyFi Geek <45924209+mobileskyfi@users.noreply.github.com> Date: Sat, 27 Jun 2026 07:35:56 -0700 Subject: [PATCH] fix(ci): arm64 rollback smoke gate + dedicated PowerShell lint workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Post-merge Extended Verification (run 28281373854 on main) surfaced two reds. PowerShell lint: the `$using:` fix from PR #22 left a non-ASCII em-dash in a comment, tripping PSUseBOMForUnicodeEncodedFile (the rule examples/ PSScriptAnalyzerSettings.psd1 documents as the ASCII-clean guard). Made version-matrix.ps1 ASCII-clean again. Also closes #28: extracted the PSScriptAnalyzer job into a reusable lint-powershell.yml that runs on its own for examples/**/*.ps1 push/PR changes (early signal) and is `uses:`-d by verify-extended.yml, so .ps1 regressions no longer hide behind a manual dispatch. arm64 rollback smoke: the rollback example failed on macos/arm64 + linux/aarch64 while passing on every x86 accelerator. QEMU's internal savevm/loadvm snapshots don't restore a working aarch64 `virt` CHR — loadvm returns clean but the guest wedges and REST never returns (60s waitForBoot exhausted). Gated the example to arch: ["x64"] in the smoke matrix (new `arch` field mirroring the existing `os` gate); documented in DESIGN.md, the snapshot API docs, and the example header. Real fix tracked in #31. 
Co-Authored-By: Claude Opus 4.8 --- .github/instructions/ci.instructions.md | 5 +- .github/workflows/lint-powershell.yml | 53 ++++++++++++++++++++++ .github/workflows/verify-extended.yml | 20 ++------ BACKLOG.md | 1 + DESIGN.md | 1 + examples/rollback/rollback.ts | 4 ++ examples/version-matrix/version-matrix.ps1 | 2 +- src/lib/types.ts | 4 ++ test/integration/examples-smoke.test.ts | 14 +++++- 9 files changed, 83 insertions(+), 21 deletions(-) create mode 100644 .github/workflows/lint-powershell.yml diff --git a/.github/instructions/ci.instructions.md b/.github/instructions/ci.instructions.md index d76b6fa..3d444e7 100644 --- a/.github/instructions/ci.instructions.md +++ b/.github/instructions/ci.instructions.md @@ -6,12 +6,13 @@ applyTo: ".github/workflows/**" ## Workflow Overview -Three workflows, each with a distinct purpose: +Four workflows, each with a distinct purpose: | Workflow | File | Trigger | Purpose | |----------|------|---------|---------| | **CI** | `ci.yml` | push/PR to `main`, `workflow_dispatch` | Core quality gate — every push | | **Extended Verification** | `verify-extended.yml` | `workflow_dispatch` only | arm64, macOS, Windows integration | +| **PowerShell Lint** | `lint-powershell.yml` | push/PR touching `examples/**/*.ps1`, `workflow_call` | PSScriptAnalyzer over the `.ps1` example mirrors | | **Publish** | `publish.yml` | `push: tags: v*`, `workflow_dispatch` | NPM publish pipeline | ### CI pipeline (ci.yml) @@ -41,7 +42,7 @@ Triggered by `bun run release` (creates and pushes a `vX.Y.Z` tag) or via GitHub `workflow_dispatch` only — never runs on push/PR. One dispatch chooses **which platforms** (the five toggles), **what runs on them** (`run-integration` and/or `run-examples` — the examples smoke harness), against **which RouterOS** (`routeros-target`), on **which branch** (the ref picked in the "Run workflow" dropdown). 
So platform × mode is the matrix: the same dispatch can verify the integration suite AND the runnable examples across the selected OSes. Use `test-filter` to narrow integration to specific files and `example-filter` to narrow the smoke harness. Unlike `ci.yml`/`publish.yml` (which always boot the default/stable), this is how arm64/macOS/Windows — and, via the selectable `linux-x86` toggle, x86 — get exercised against long-term/testing/development or a pinned version. -Most jobs are independent, except the examples smoke matrix is built dynamically: `plan-smoke` reads the selected platform toggles and emits the `examples-smoke` matrix (one job per chosen OS). **Examples are held to the same bar as the code** — a broken example REDS the workflow on the gating platforms (linux KVM, macOS HVF); macOS/x86 and Windows (TCG) stay non-gating/informational, mirroring the integration jobs. `lint-powershell` (PSScriptAnalyzer, gating) runs whenever `run-examples` is on. (Integration-exercised coverage is **not** collected here yet — the standalone coverage job was removed because it re-ran the whole suite; reworking it as a byproduct of the integration jobs is tracked in [#30](https://github.com/tikoci/quickchr/issues/30).) +Most jobs are independent, except the examples smoke matrix is built dynamically: `plan-smoke` reads the selected platform toggles and emits the `examples-smoke` matrix (one job per chosen OS). **Examples are held to the same bar as the code** — a broken example REDS the workflow on the gating platforms (linux KVM, macOS HVF); macOS/x86 and Windows (TCG) stay non-gating/informational, mirroring the integration jobs. `lint-powershell` (PSScriptAnalyzer, gating) runs whenever `run-examples` is on — it `uses:` the reusable `lint-powershell.yml`, which **also runs on its own** for any push/PR touching `examples/**/*.ps1` so `.ps1` regressions surface in normal PR CI, not just on a manual dispatch ([#28](https://github.com/tikoci/quickchr/issues/28)). 
(Integration-exercised coverage is **not** collected here yet — the standalone coverage job was removed because it re-ran the whole suite; reworking it as a byproduct of the integration jobs is tracked in [#30](https://github.com/tikoci/quickchr/issues/30).) ## Release Process diff --git a/.github/workflows/lint-powershell.yml b/.github/workflows/lint-powershell.yml new file mode 100644 index 0000000..7ae6058 --- /dev/null +++ b/.github/workflows/lint-powershell.yml @@ -0,0 +1,53 @@ +name: PowerShell Lint + +# Lints the example PowerShell mirrors (examples/**/*.ps1) with PSScriptAnalyzer. +# +# Why a dedicated workflow (not a step in ci.yml or only in verify-extended.yml): +# - PSScriptAnalyzer needs a Windows runner + the module installed; folding it +# into the Linux core pipeline would slow every push for a niche check. +# - It used to live ONLY in verify-extended.yml (workflow_dispatch), so .ps1 +# regressions surfaced late — a PSUseUsingScopeModifierInNewRunspaces failure +# in #22 only appeared on a manual Extended Verification run (see issue #28). +# +# So: trigger on `.ps1` (and the analyzer settings) changes for fast PR feedback, +# AND expose `workflow_call` so Extended Verification reuses the same job — one +# source of truth for "are the .ps1 examples clean?". + +on: + push: + branches: [main] + paths: + - "examples/**/*.ps1" + - "examples/PSScriptAnalyzerSettings.psd1" + - ".github/workflows/lint-powershell.yml" + pull_request: + branches: [main] + paths: + - "examples/**/*.ps1" + - "examples/PSScriptAnalyzerSettings.psd1" + - ".github/workflows/lint-powershell.yml" + # Reusable: verify-extended.yml calls this so the dispatch path stays covered. 
+ workflow_call: + +permissions: + contents: read + +jobs: + lint-powershell: + name: Lint PowerShell examples + runs-on: windows-latest + steps: + - uses: actions/checkout@v5 + - name: PSScriptAnalyzer + shell: pwsh + run: | + Install-Module -Name PSScriptAnalyzer -Force -Scope CurrentUser -ErrorAction Stop + $settings = Join-Path $PWD 'examples/PSScriptAnalyzerSettings.psd1' + $issues = Get-ChildItem -Recurse -Path examples -Filter *.ps1 | + ForEach-Object { Invoke-ScriptAnalyzer -Path $_.FullName -Settings $settings } + $issues | Format-Table -AutoSize + if ($issues.Count -gt 0) { + Write-Error "PSScriptAnalyzer found $($issues.Count) issue(s) in example .ps1 scripts" + exit 1 + } + Write-Host "PSScriptAnalyzer: example .ps1 scripts are clean" diff --git a/.github/workflows/verify-extended.yml b/.github/workflows/verify-extended.yml index c4147f2..b868e3f 100644 --- a/.github/workflows/verify-extended.yml +++ b/.github/workflows/verify-extended.yml @@ -893,22 +893,10 @@ jobs: # Static-analyze the example .ps1 scripts so PowerShell can't silently bit-rot. # Uses examples/PSScriptAnalyzerSettings.psd1 (documents which rules are waived # and why). Gating: a flagged .ps1 reds the workflow. + # Reuses the dedicated PowerShell-lint workflow (also runs on its own for `.ps1` + # changes — see lint-powershell.yml / issue #28), so the dispatch path stays + # covered without duplicating the PSScriptAnalyzer invocation here. 
lint-powershell: name: Lint PowerShell examples if: ${{ inputs.run-examples == true }} - runs-on: windows-latest - steps: - - uses: actions/checkout@v5 - - name: PSScriptAnalyzer - shell: pwsh - run: | - Install-Module -Name PSScriptAnalyzer -Force -Scope CurrentUser -ErrorAction Stop - $settings = Join-Path $PWD 'examples/PSScriptAnalyzerSettings.psd1' - $issues = Get-ChildItem -Recurse -Path examples -Filter *.ps1 | - ForEach-Object { Invoke-ScriptAnalyzer -Path $_.FullName -Settings $settings } - $issues | Format-Table -AutoSize - if ($issues.Count -gt 0) { - Write-Error "PSScriptAnalyzer found $($issues.Count) issue(s) in example .ps1 scripts" - exit 1 - } - Write-Host "PSScriptAnalyzer: example .ps1 scripts are clean" + uses: ./.github/workflows/lint-powershell.yml diff --git a/BACKLOG.md b/BACKLOG.md index dc9bda8..f59bb43 100644 --- a/BACKLOG.md +++ b/BACKLOG.md @@ -74,6 +74,7 @@ - [x] **CI-checkable:** `bun run check` adds `lint:examples` (validator) + `lint:shell` (shellcheck `-s sh`); biome now includes `examples/**`. Extended verification gains `include-examples`/`example-filter` inputs → `examples-smoke` job (curated subset + one representative per language + an intentional failure-path case asserting teardown) and a `lint-powershell` job (PSScriptAnalyzer). `trial-license` excluded from CI (MikroTik rate limits). - [x] **CI hardening (PR #22 review pass, 2026-06-26):** examples smoke is now a **platform matrix** mirroring integration — `verify-extended.yml` replaces `include-examples` with mode toggles `run-integration`/`run-examples`; a `plan-smoke` job emits the smoke matrix from the platform toggles, so one dispatch runs integration and/or smoke across the chosen OSes against a chosen `routeros-target`. **A broken example gates** the workflow on KVM/HVF platforms (TCG stays informational). 
The harness picks per-OS representatives (`.ts` everywhere, `.sh`/`.py`-via-`uv` on POSIX, `.ps1` on Windows), runs Python via `uv run`, and **fails fast on a typo'd `EXAMPLE_FILTER`**. PowerShell `.ps1` made ASCII-clean + `examples/PSScriptAnalyzerSettings.psd1` (waives `PSAvoidUsingWriteHost` for interactive demos, with rationale); `Invoke-Qc` now fails on non-zero native exits. `ci.yml` Repo Checks installs shellcheck so `lint:shell` actually enforces POSIX-sh on every push. - [x] **PR #22 close-out (2026-06-26):** fixed CodeQL `js/insecure-randomness` by replacing `Math.random()` with `crypto.randomUUID()` in the example helpers (`examples/lib.ts`, `examples/grounding/grounding.test.ts`); the taint flowed through example-built machine names into credential sinks. Fixed a latent unit-test bug (`qemu-args.test.ts` "x86 HVF uses host CPU model") that only surfaced on KVM runners — KVM, like HVF, adds `-cpu host`, so the `else` branch's `cpuIdx == -1` assertion was wrong; it never ran on a KVM runner in normal CI (unit job has no KVM; integration job doesn't run `test/unit/`). Fixed PowerShell `PSUseUsingScopeModifierInNewRunspaces` in `version-matrix.ps1` (switched `Start-Job` from `param()`+`-ArgumentList` to `$using:`). **Removed the standalone `coverage` job** (it re-ran the whole suite) — reworking coverage as a byproduct of the integration jobs is tracked in #30. Opened follow-ups: #26 (rename `tzspGatewayIp`), #27 (troubleshooting capture example), #28 (PowerShell workflow org), #29 (release/verification reuse), #30 (coverage byproduct). +- [x] **PR #22 close-out, round 2 (2026-06-27):** post-merge Extended Verification turned up two more reds. (1) **PowerShell lint** — the `$using:` fix introduced a non-ASCII em-dash in a comment, tripping `PSUseBOMForUnicodeEncodedFile` (the rule the settings file documents as the ASCII-clean guard); made `version-matrix.ps1` ASCII-clean again. 
Also **closed #28**: extracted the PSScriptAnalyzer job into a dedicated reusable `lint-powershell.yml` that runs on its own for `examples/**/*.ps1` push/PR changes (early signal) and is `uses:`-d by `verify-extended.yml` (dispatch path still covered). (2) **arm64 rollback smoke** — `rollback` failed on macos/arm64 + linux/aarch64 while passing on every x86 accelerator. Root cause: QEMU internal `savevm`/`loadvm` snapshots don't restore a working aarch64 `virt` CHR (`loadvm` returns clean, guest wedges, REST never returns). Gated the example to `arch: ["x64"]` in the smoke matrix (new `arch` field mirroring the existing `os` gate); documented in `DESIGN.md` + the snapshot API docs + the example header; tracked the real fix in #31. diff --git a/DESIGN.md b/DESIGN.md index bffeb8c..71f8e07 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -31,6 +31,7 @@ Modules (src/lib/) ← qemu, images, versions, network, state, ... 3. **No shell scripts** — QEMU args built entirely in TypeScript. Enables Windows support and testability. 4. **Optional qcow2** — Default boot disk uses raw `.img` (MikroTik provides them). Users can opt into `qcow2` format for boot resize and QEMU snapshot/restore support. Requires `qemu-img` when enabled. + - **arm64 caveat** — QEMU's internal `savevm`/`loadvm` snapshots are **x86-only in practice**. `loadvm` returns clean on the aarch64 `virt` machine but the restored guest is wedged (REST never comes back), across both HVF and TCG. The examples smoke harness gates the `rollback` example to x64 for this reason. Tracked in issue #31. 5. **ARM64 VirtIO rule** — Never use `if=virtio` on aarch64 `virt` machine. Always explicit `-device virtio-blk-pci,drive=drive0`. diff --git a/examples/rollback/rollback.ts b/examples/rollback/rollback.ts index 1ac12b1..87a2db8 100755 --- a/examples/rollback/rollback.ts +++ b/examples/rollback/rollback.ts @@ -6,6 +6,10 @@ * change the config, then restore the snapshot and prove the change is gone. 
* Snapshots need a qcow2 boot disk (the quickchr default) — raw disks can't. * + * x86 only: QEMU's internal savevm/loadvm snapshots don't reliably restore an + * aarch64 `virt` CHR (loadvm returns clean but the guest wedges and REST never + * returns). See issue #31 — the examples smoke harness skips this on arm64. + * * Run: bun run examples/rollback/rollback.ts * Time: ~30–50 s. */ diff --git a/examples/version-matrix/version-matrix.ps1 b/examples/version-matrix/version-matrix.ps1 index 77da221..d612bcd 100644 --- a/examples/version-matrix/version-matrix.ps1 +++ b/examples/version-matrix/version-matrix.ps1 @@ -16,7 +16,7 @@ foreach ($ch in $channels) { Write-Host "-> starting $name (channel=$ch, port-base=$base)..." # $using: captures each iteration's loop values at Start-Job time (Start-Job # serializes them into the child runspace). This is the form PSScriptAnalyzer's - # PSUseUsingScopeModifierInNewRunspaces wants — it doesn't recognize the older + # PSUseUsingScopeModifierInNewRunspaces wants -- it doesn't recognize the older # param()+-ArgumentList pattern and flags those as missing the Using: scope. $jobs += Start-Job -ScriptBlock { $parts = $using:qc -split '\s+' diff --git a/src/lib/types.ts b/src/lib/types.ts index 7e6ab10..b6aa472 100644 --- a/src/lib/types.ts +++ b/src/lib/types.ts @@ -564,6 +564,10 @@ export interface ChrInstance { * **For stopped machines**, only `list` works (reads qcow2 metadata directly * via `qemu-img info`). Other operations require the machine to be running. * + * **x86 only in practice.** Internal `savevm`/`loadvm` snapshots do not + * reliably restore an aarch64 `virt` guest — `loadvm` returns clean but the + * restored CHR is wedged and never comes back on REST. See issue #31. 
+ * * @example * const snaps = await instance.snapshot.list(); * await instance.snapshot.save("before-upgrade"); diff --git a/test/integration/examples-smoke.test.ts b/test/integration/examples-smoke.test.ts index 21df0d0..a12b0cc 100644 --- a/test/integration/examples-smoke.test.ts +++ b/test/integration/examples-smoke.test.ts @@ -30,6 +30,9 @@ interface Runnable { env?: Record<string, string>; // Restrict to these platforms (process.platform). Omitted = all. os?: NodeJS.Platform[]; + // Restrict to these CPU arches (process.arch). Omitted = all. Used to gate + // examples that hit a genuine per-arch QEMU limitation (e.g. rollback, below). + arch?: NodeJS.Architecture[]; } // One representative per language, selected per OS so the CLI mirror that actually @@ -40,7 +43,12 @@ interface Runnable { // Kept small — each entry boots a real CHR. const RUNNABLE: Runnable[] = [ { name: "quickstart", lang: "ts", cmd: ["bun", "run", "examples/quickstart/quickstart.ts"] }, - { name: "rollback", lang: "ts", cmd: ["bun", "run", "examples/rollback/rollback.ts"] }, + // x64-only: QEMU's internal savevm/loadvm snapshots don't restore a working + // aarch64 `virt` CHR — loadvm returns clean but the guest is wedged and REST + // never comes back, so this fails on macos/arm64 + linux/aarch64 while passing + // on every x86 accelerator. Tracked in issue #31. The example itself is + // unchanged; it just isn't exercised where the snapshot round-trip can't work. 
+ { name: "rollback", lang: "ts", cmd: ["bun", "run", "examples/rollback/rollback.ts"], arch: ["x64"] }, { name: "quickstart-sh", lang: "sh", @@ -81,7 +89,9 @@ if (!SKIP && unknownFilter.length > 0) { } const want = (name: string) => FILTER.length === 0 || FILTER.includes(name); -const applies = (r: Runnable) => !r.os || r.os.includes(process.platform); +const applies = (r: Runnable) => + (!r.os || r.os.includes(process.platform)) && + (!r.arch || r.arch.includes(process.arch)); async function run(cmd: string[], env: Record<string, string> = {}) { const proc = Bun.spawn(cmd, {