From efa9c61f81c83ded503ca2255d6da86ea69c7348 Mon Sep 17 00:00:00 2001 From: Noah Gift Date: Mon, 20 Apr 2026 13:29:43 +0200 Subject: [PATCH] fix(sovereign-ci): chown sibling workspace back from root on container exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause (Five Whys — paiml/infra#69): 1. Security job fails in 15s with rm: EACCES on aprender/**/.git and aprender-present-widgets/src/*.rs in sibling workspace. 2. Files are owned by root:root; the security job runs as the runner user (uid 1000) and can't rewrite them. 3. Root-owned files come from the test/lint/coverage/bench jobs — those run inside a container whose process uid is 0. 4. The container mounts $GITHUB_WORKSPACE (the runner's _work tree) and the sibling checkout writes to $GITHUB_WORKSPACE/.., which is also bind-mounted via the host filesystem. Files written by root inside the container land on the host as root-owned. 5. Subsequent non-container jobs (security) — and every *future* run of the container jobs themselves, which start by `rm -rf`'ing the stale clones — cannot reclaim the tree. Fix: every container job chowns $GITHUB_WORKSPACE/.. back to uid 1000 in an `if: always()` tail step, so root-owned files never escape the container. Security job gets a defense-in-depth `sudo chown` at the top to recover from any pre-fix residue. Manually chowned 83838 files across the 16 runners on intel to unblock existing PRs; this patch prevents the poison from re-accumulating. 
Co-Authored-By: Claude Opus 4.7 --- .github/workflows/sovereign-ci.yml | 54 ++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/.github/workflows/sovereign-ci.yml b/.github/workflows/sovereign-ci.yml index b615d53..e398dca 100644 --- a/.github/workflows/sovereign-ci.yml +++ b/.github/workflows/sovereign-ci.yml @@ -216,6 +216,20 @@ jobs: "/var/log/ci-metrics/sccache-${{ github.run_id }}-${{ inputs.repo }}-test.json" \ 2>/dev/null || echo "::warning::sccache stats unavailable" + # FIVE-WHYS ROOT CAUSE (2026-04-20, paiml/infra#69): + # Container runs as root; sibling-clone dir $GITHUB_WORKSPACE/.. is + # bind-mounted from the runner host, so root-owned files leak out of + # the container onto the runner's _work tree. Subsequent non-container + # jobs (e.g. `security`) run as the runner user and can't `rm -rf` + # the stale clones → fast (15s) EACCES failures. + # Fix: chown back to runner uid:gid 1000:1000 before the container exits. + # `always()` so the chown runs even if tests fail, preventing poison + # from surviving red builds. + - name: Restore runner ownership of sibling workspace + if: always() + run: | + chown -R 1000:1000 "$GITHUB_WORKSPACE/.." 2>/dev/null || echo "::warning::could not restore runner ownership of sibling workspace" + lint: name: lint runs-on: [self-hosted, clean-room] @@ -345,6 +359,10 @@ jobs: else echo "::warning::No deny.toml — skipping supply chain audit" fi + - name: Restore runner ownership of sibling workspace + if: always() + run: | + chown -R 1000:1000 "$GITHUB_WORKSPACE/.." 2>/dev/null || echo "::warning::could not restore runner ownership of sibling workspace" coverage: name: coverage @@ -471,6 +489,19 @@ jobs: with: files: lcov.info continue-on-error: true + # FIVE-WHYS ROOT CAUSE (2026-04-20, paiml/infra#69): + # Container runs as root; sibling-clone dir $GITHUB_WORKSPACE/.. is + # bind-mounted from the runner host, so root-owned files leak out of + # the container onto the runner's _work tree. Subsequent non-container + # jobs (e.g. `security`) run as the runner user and can't `rm -rf` + # the stale clones → fast (15s) EACCES failures. 
+ # Fix: chown back to runner uid:gid 1000:1000 before the container exits. + # `always()` so the chown runs even if tests fail, preventing poison + # from surviving red builds. + - name: Restore runner ownership of sibling workspace + if: always() + run: | + chown -R 1000:1000 "$GITHUB_WORKSPACE/.." 2>/dev/null || echo "::warning::could not restore runner ownership of sibling workspace" bench: name: bench @@ -601,6 +632,19 @@ jobs: path: bench-results.txt retention-days: 90 continue-on-error: true + # FIVE-WHYS ROOT CAUSE (2026-04-20, paiml/infra#69): + # Container runs as root; sibling-clone dir $GITHUB_WORKSPACE/.. is + # bind-mounted from the runner host, so root-owned files leak out of + # the container onto the runner's _work tree. Subsequent non-container + # jobs (e.g. `security`) run as the runner user and can't `rm -rf` + # the stale clones → fast (15s) EACCES failures. + # Fix: chown back to runner uid:gid 1000:1000 before the container exits. + # `always()` so the chown runs even if tests fail, preventing poison + # from surviving red builds. + - name: Restore runner ownership of sibling workspace + if: always() + run: | + chown -R 1000:1000 "$GITHUB_WORKSPACE/.." 2>/dev/null || echo "::warning::could not restore runner ownership of sibling workspace" security: name: security @@ -619,6 +663,16 @@ jobs: run: | apt-get update -qq && apt-get install -y -qq ${{ inputs.extra_pkgs }} 2>/dev/null || \ sudo apt-get update -qq && sudo apt-get install -y -qq ${{ inputs.extra_pkgs }} 2>/dev/null || true + # FIVE-WHYS ROOT CAUSE RECOVERY (2026-04-20, paiml/infra#69): + # Container jobs upstream may have left root-owned files in the + # sibling workspace (bind-mount leak). Every container job now + # chowns back on exit, but for defense in depth — and to recover + # from runs that predated the fix — reclaim ownership before we + # touch the sibling tree. Without this, `rm -rf` on stale clones + # fails with EACCES and the job dies in 15s. + - name: Reclaim sibling workspace ownership (defense in depth) + run: | + sudo chown -R "$(id -u):$(id -g)" "$GITHUB_WORKSPACE/.." 
2>/dev/null || echo "::warning::could not reclaim sibling workspace ownership" - name: Checkout sibling repos (path deps) run: | cd "$GITHUB_WORKSPACE/.."