From b724c9db576169739befdbfc65c2f5fefca04939 Mon Sep 17 00:00:00 2001
From: Jim Huang <jserv@ccns.ncku.edu.tw>
Date: Sat, 30 May 2026 17:54:40 +0800
Subject: [PATCH] Snapshot shm via APFS fclonefileat for safe CoW

Fork previously sent the parent's live shm_fd via SCM_RIGHTS and the
child mapped it MAP_PRIVATE. The parent stayed on MAP_SHARED, so any
page the child had not yet COW'd reflected the parent's current bytes.
That was benign for typical aarch64 workloads but corrupted
x86_64-via-Rosetta guests: translator-internal structures (TLS slabs,
code caches, indirect-call tables, block lists) cross page boundaries,
so the child could read them while the parent was mid-update. Issue #45
tracked the resulting fallback to a per-region byte copy through the
IPC socket -- 10-16x slower per fork than the CoW path.

sys_clone now lifts the !g->is_rosetta gate and always asks
fork_snapshot_shm_via_clonefile() for an APFS fclonefileat snapshot of
the shm file. The clone shares blocks with the parent until either
side writes, so the parent's later writes never reach the child's
backing, and the existing guest_init_from_shm MAP_PRIVATE flow on the
child consumes the snapshot unchanged. The snapshot helper uses a mode
0700 mkdtemp directory (clone inside, then unlink + rmdir) rather than
an earlier mkstemp + unlink + fclonefileat sequence whose freed /tmp
basename gave a local-user TOCTOU window that could DoS the fast path
via EEXIST.

Fallback differs by guest. Rosetta drops use_shm on clonefile failure
and falls through to the legacy region-copy path; sending the live fd
would re-introduce the issue #45 corruption. Native guests keep
use_shm and send g->shm_fd directly, preserving the original CoW
behavior so a non-APFS /tmp does not silently slow forks down to
per-region copy cost.

Overlay sync (pwrite of file-backed MAP_SHARED overlay bytes into
shm_fd) moves before the IPC header so the cloned file picks up
overlay-backed bytes and the header has_shm field reflects the
post-clonefile outcome.

guest_init_from_shm now closes shm_fd on the compute_infra_layout
failure path so the take-ownership contract holds on every error,
not just the post-mmap ones.

tests/bench-fork-cost.sh is added as the regression baseline.
Per-fork wall-clock means from three back-to-back runs on M1
(subshell fork, no exec, per-fork numbers exclude startup via a
0-iter baseline subtraction):

  rss      aarch64 CoW    Rosetta (before / after)   ratio
  0 MiB    ~113 ms/fork   ~1058-1196 / ~113 ms       10x -> ~1x
  1 MiB    ~113 ms/fork   ~1090-1250 / ~117 ms       10x -> ~1x
  16 MiB   ~114 ms/fork   ~1125-1230 / ~120 ms       10x -> ~1x
  64 MiB   ~114 ms/fork   ~1400-1840 / ~220 ms       12-16x -> ~2x

The 64 MiB Rosetta residual is APFS clone metadata plus child-side
MAP_PRIVATE materialization, not byte-copy bandwidth. test-cow-fork
(5/5), make check, and the 71-test make test-rosetta-all suite stay
green.

Close #45
---
 src/core/guest.c         |   9 +-
 src/runtime/forkipc.c    | 237 ++++++++++++++++++++++++++++-----------
 tests/bench-fork-cost.sh | 218 +++++++++++++++++++++++++++++++++++
 3 files changed, 395 insertions(+), 69 deletions(-)
 create mode 100755 tests/bench-fork-cost.sh

diff --git a/src/core/guest.c b/src/core/guest.c
index bcbc3c2..21407d5 100644
--- a/src/core/guest.c
+++ b/src/core/guest.c
@@ -458,8 +458,15 @@ int guest_init_from_shm(guest_t *g,
     g->interp_base = size - 0x100000000ULL;
     g->mmap_limit = size - 0x200000000ULL;
     g->overflow_ipa_next = size;
-    if (compute_infra_layout(g) < 0)
+    if (compute_infra_layout(g) < 0) {
+        /* Layout computation may reject a malformed header (impossible
+         * guest_size / ipa_bits combination) before the mapping is set up;
+         * close the inherited shm fd here so the caller's contract -- this
+         * function takes ownership of shm_fd -- holds on every error path.
+         */
+        close(shm_fd);
         return -1;
+    }
     g->pt_pool_next = g->pt_pool_base;
 
     /* Map the shm fd MAP_PRIVATE: copy-on-write semantics. Reads see
diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c
index 3f8c4a5..435051a 100644
--- a/src/runtime/forkipc.c
+++ b/src/runtime/forkipc.c
@@ -24,6 +24,7 @@
 #include <sys/socket.h>
 #include <dirent.h> /* fdopendir, for DIR* reconstruction in child */
 #include <sys/wait.h>
+#include <sys/clonefile.h> /* fclonefileat for CoW shm snapshots */
 #include <mach-o/dyld.h>
 
 #include "hvutil.h"
@@ -1137,6 +1138,51 @@ static void *vm_clone_thread_run(void *arg)
     return NULL;
 }
 
+/* Snapshot src_fd as an APFS copy-on-write clone and hand back an open fd to
+ * it. fclonefileat shares the source's blocks until either file is written,
+ * so the clone costs O(metadata) to create and later writes to the source do
+ * not reach it.
+ *
+ * Returns the clone fd (opened O_RDWR | O_CLOEXEC) on success. On failure
+ * returns -1 with errno set by the step that failed: mkdtemp, fclonefileat
+ * (e.g. /tmp not on the source's APFS volume, ENOSYS, ENOSPC), or open.
+ * Fallback policy belongs to the caller; this helper only manages the
+ * lifetime of the temporary clone path.
+ */
+static int fork_snapshot_shm_via_clonefile(int src_fd)
+{
+    /* fclonefileat wants a destination path on the same APFS volume as the
+     * source, so the clone is created under /tmp. It is placed inside a
+     * fresh mkdtemp directory (mode 0700) rather than directly in
+     * world-writable /tmp: no other local user can then claim the
+     * destination name first and fail the clone with EEXIST.
+     */
+    char scratch_dir[] = "/tmp/elfuse-fork-XXXXXX";
+    char snap_path[64];
+    int snap_fd = -1;
+    int failure_errno;
+
+    if (!mkdtemp(scratch_dir))
+        return -1;
+    snprintf(snap_path, sizeof(snap_path), "%s/snap", scratch_dir);
+
+    if (fclonefileat(src_fd, AT_FDCWD, snap_path, 0) < 0) {
+        failure_errno = errno;
+    } else {
+        snap_fd = open(snap_path, O_RDWR | O_CLOEXEC);
+        failure_errno = errno;
+        /* The open fd keeps the inode alive, so the name can go right away;
+         * a failed unlink only leaks a directory entry.
+         */
+        (void) unlink(snap_path);
+    }
+    /* Best-effort: errno from the failing step is what the caller sees. */
+    (void) rmdir(scratch_dir);
+    if (snap_fd < 0)
+        errno = failure_errno;
+    return snap_fd < 0 ? -1 : snap_fd;
+}
+
 int64_t sys_clone(hv_vcpu_t vcpu,
                   guest_t *g,
                   uint64_t flags,
@@ -1163,13 +1209,13 @@ int64_t sys_clone(hv_vcpu_t vcpu,
                                 verbose);
     }
 
-    /* Rosetta fork takes the helper-process IPC path. The CoW shm fast-path
-     * is gated off in use_shm because HVF caches VA->PA at hv_vm_map time and
-     * the parent's MAP_SHARED mapping cannot be safely remapped under the
-     * running vCPU. The TTBR1 kbuf tree, translator image, and kbuf bytes
-     * ride along as primary-buffer used regions; the child restores
-     * TCR_EL1 / TTBR1_EL1 from ipc_registers_t and recomputes kbuf_base
-     * from kbuf_gpa.
+    /* Rosetta fork takes the helper-process IPC path. The parent cannot remap
+     * its live guest memory under the running vCPU because HVF caches VA->PA at
+     * hv_vm_map time; instead, the fork path snapshots shm with clonefile when
+     * available and otherwise falls back to region copy. The TTBR1 kbuf tree,
+     * translator image, and kbuf bytes ride along as primary-buffer used
+     * regions; the child restores TCR_EL1 / TTBR1_EL1 from ipc_registers_t and
+     * recomputes kbuf_base from kbuf_gpa.
      */
 
     /* elfuse only supports fork-like clone (SIGCHLD) and posix_spawn-like
@@ -1291,10 +1337,17 @@ int64_t sys_clone(hv_vcpu_t vcpu,
         return -LINUX_ENOMEM;
     }
 
-    /* The parent keeps only its end of the control channel. */
+    /* The parent keeps only its end of the control channel. Reset the closed
+     * write end to -1 so the fail_snapshot guarded close at the bottom of the
+     * function cannot double-close it. In a multithreaded guest, another vCPU
+     * could open a new fd between the two closes and get the same number,
+     * which the second close would then steal.
+     */
     close(sock_fds[1]);
-    if (vfork_notify_fds[1] >= 0)
+    if (vfork_notify_fds[1] >= 0) {
         close(vfork_notify_fds[1]);
+        vfork_notify_fds[1] = -1;
+    }
     int ipc_sock = sock_fds[0];
 
     /* Allocate guest PID before serialization so the child header carries its
@@ -1314,6 +1367,10 @@ int64_t sys_clone(hv_vcpu_t vcpu,
     mmap_fork_anon_shared_txn_t *anon_shared_txn = NULL;
     guest_region_t *regions_snapshot = NULL;
     guest_region_t preannounced_snapshot[GUEST_MAX_PREANNOUNCED];
+    /* APFS clone fd for the CoW snapshot sent to the child. Declared up front
+     * so early goto fail_snapshot exits do not read an uninitialized local.
+     */
+    int snapshot_shm_fd = -1;
 
     /* Convert MAP_SHARED|MAP_ANONYMOUS regions that have no backing fd
      * into memfd-backed overlay regions. The conversion seeds a private
@@ -1328,31 +1385,87 @@ int64_t sys_clone(hv_vcpu_t vcpu,
     if (mmap_fork_prepare_anon_shared(g, &anon_shared_txn) < 0)
         goto fail_snapshot;
 
-    /* Determine if elfuse can use the CoW (shm) fast path.
-     * If shm_fd >= 0, elfuse freezes a snapshot via MAP_PRIVATE and sends the
-     * shm fd to the child. Otherwise fall back to region-by-region copy.
+    /* CoW fast path: if shm_fd >= 0, send a snapshot of guest memory to the
+     * child instead of the per-region copy. The child maps that snapshot
+     * MAP_PRIVATE; subsequent writes on either side are private.
      *
-     * Rosetta guests are excluded from CoW even when shm-backed: rosetta's
-     * JIT state (TLS slabs, code caches, indirect-call tables, block lists)
-     * is process-local and corrupts when CoW-shared. The legacy region-copy
-     * path preserves the parent's JIT state independently per child.
-     */
-    bool use_shm = (g->shm_fd >= 0) && !g->is_rosetta;
-
-    /* elfuse does not remap the parent to MAP_PRIVATE here. The parent
-     * stays on MAP_SHARED; its vCPU continues writing to the shared file.
-     * The child maps MAP_PRIVATE, getting a CoW snapshot.
+     * The parent's own mapping cannot be flipped to MAP_PRIVATE here: hv_vm_map
+     * caches the host VA->PA mapping, and a MAP_FIXED remap invalidates it
+     * (the parent then reads stale memory and writev returns EFAULT). So the
+     * parent stays on MAP_SHARED and the snapshot is what isolates the child.
      *
-     * This is safe because the IPC is synchronous: the child maps MAP_PRIVATE
-     * before the parent's vCPU resumes. After that, the child's CoW pages are
-     * frozen (child writes are private, parent writes to MAP_SHARED do not
-     * affect CoW'd child pages).
+     * Two snapshot sources, in preference order (selected just below):
+     *   1. fclonefileat of g->shm_fd to an independent APFS clone. The clone
+     *      shares blocks with the parent until either side writes, so the
+     *      parent's subsequent writes never reach the child's backing.
+     *   2. The live g->shm_fd. Any page the child has not yet COW'd reads the
+     *      parent's current bytes -- benign for typical guest state, but
+     *      corrupts Rosetta's translator-internal structures (TLS slabs, code
+     *      caches, indirect-call tables) on mid-update reads. Issue #45.
      *
-     * an earlier implementation tried remapping the parent to MAP_PRIVATE here,
-     * but that breaks HVF: hv_vm_map caches the host VA->PA mapping, and
-     * MAP_FIXED remap invalidates it. The parent's vCPU then reads stale
-     * memory, causing corrupted syscall data (EFAULT on writev).
+     * Rosetta therefore requires path 1 and falls back to region copy if
+     * fclonefileat fails; native guests accept path 2 as a fallback so a
+     * non-APFS /tmp does not silently slow forks down to per-region copy cost.
      */
+    bool use_shm = (g->shm_fd >= 0);
+
+    /* Overlay sync runs before the snapshot so the cloned file picks up the
+     * overlay-backed bytes. The parent's host VA for each overlay region maps
+     * the overlay file, not shm_fd, so shm_fd's contents at those offsets are
+     * stale (typically zero) until the pwrite below copies them in. Both the
+     * clone-fd path and the live-shm_fd fallback consume this sync.
+     */
+    if (use_shm) {
+        for (int i = 0; i < g->nregions; i++) {
+            const guest_region_t *r = &g->regions[i];
+            if (!r->overlay_active)
+                continue;
+            uint64_t len = r->end - r->start;
+            const uint8_t *src = (const uint8_t *) g->host_base + r->start;
+            uint64_t off = r->start;
+            while (len > 0) {
+                size_t chunk = len > (uint64_t) SSIZE_MAX ? (size_t) SSIZE_MAX
+                                                          : (size_t) len;
+                ssize_t nw = pwrite(g->shm_fd, src, chunk, (off_t) off);
+                if (nw < 0) {
+                    if (errno == EINTR)
+                        continue;
+                    log_error("clone: shm overlay sync pwrite failed: %s",
+                              strerror(errno));
+                    goto fail_snapshot;
+                }
+                if (nw == 0) {
+                    log_error("clone: shm overlay sync pwrite returned 0");
+                    goto fail_snapshot;
+                }
+                src += nw;
+                off += (uint64_t) nw;
+                len -= (uint64_t) nw;
+            }
+        }
+        /* Attempt the APFS clone snapshot for every guest, not just Rosetta:
+         * the clone gives POSIX-style isolation at O(metadata) cost and avoids
+         * torn-snapshot reads in guests that snapshot their own state across
+         * fork (Redis BGSAVE, checkpointing runtimes). On failure the fallback
+         * differs per design above: Rosetta drops use_shm so the region-copy
+         * path runs; native guests keep use_shm and send the live g->shm_fd.
+         */
+        snapshot_shm_fd = fork_snapshot_shm_via_clonefile(g->shm_fd);
+        if (snapshot_shm_fd < 0) {
+            if (g->is_rosetta) {
+                log_warn(
+                    "clone: rosetta CoW snapshot via fclonefileat failed "
+                    "(%s); falling back to region-copy path",
+                    strerror(errno));
+                use_shm = false;
+            } else {
+                log_debug(
+                    "clone: CoW snapshot via fclonefileat failed (%s); "
+                    "sending live shm fd as fallback",
+                    strerror(errno));
+            }
+        }
+    }
 
     /* Snapshot of the semantic region array, populated after the memory dump
      * but before sibling vCPUs resume. Declared up front so all goto paths to
@@ -1401,46 +1514,13 @@ int64_t sys_clone(hv_vcpu_t vcpu,
         goto fail_snapshot;
     }
 
-    /* CoW path: sync MAP_SHARED file overlays back into shm_fd before
-     * sending it to the child. The parent's host VA at each overlay
-     * region maps the overlay file, not shm_fd, so shm_fd's content at
-     * those IPAs is stale (typically zero). The child's MAP_PRIVATE
-     * snapshot would expose that stale data at the overlay IPAs. Copy
-     * the live overlay bytes into shm_fd at the matching offsets so the
-     * child snapshot reflects the parent's view at fork time. Live
-     * cross-fork MAP_SHARED coherence (parent and child both seeing
-     * subsequent writes through the same file) is left to the cross-fork
-     * coherence TODO; this fix only avoids the stale-snapshot regression.
+    /* Send the snapshot fd if fclonefileat succeeded, otherwise the live
+     * g->shm_fd. The Rosetta-failure case already cleared use_shm above so it
+     * never reaches this branch with snapshot_shm_fd < 0.
      */
     if (use_shm) {
-        for (int i = 0; i < g->nregions; i++) {
-            const guest_region_t *r = &g->regions[i];
-            if (!r->overlay_active)
-                continue;
-            uint64_t len = r->end - r->start;
-            const uint8_t *src = (const uint8_t *) g->host_base + r->start;
-            uint64_t off = r->start;
-            while (len > 0) {
-                size_t chunk = len > (uint64_t) SSIZE_MAX ? (size_t) SSIZE_MAX
-                                                          : (size_t) len;
-                ssize_t nw = pwrite(g->shm_fd, src, chunk, (off_t) off);
-                if (nw < 0) {
-                    if (errno == EINTR)
-                        continue;
-                    log_error("clone: shm overlay sync pwrite failed: %s",
-                              strerror(errno));
-                    goto fail_snapshot;
-                }
-                if (nw == 0) {
-                    log_error("clone: shm overlay sync pwrite returned 0");
-                    goto fail_snapshot;
-                }
-                src += nw;
-                off += (uint64_t) nw;
-                len -= (uint64_t) nw;
-            }
-        }
-        if (fork_ipc_send_fds(ipc_sock, &g->shm_fd, 1) < 0) {
+        int fd_to_send = (snapshot_shm_fd >= 0) ? snapshot_shm_fd : g->shm_fd;
+        if (fork_ipc_send_fds(ipc_sock, &fd_to_send, 1) < 0) {
             log_error("clone: failed to send shm fd");
             goto fail_snapshot;
         }
@@ -1555,10 +1635,14 @@ int64_t sys_clone(hv_vcpu_t vcpu,
               child_host_pid);
 
     free(regions_snapshot);
+    if (snapshot_shm_fd >= 0)
+        close(snapshot_shm_fd);
     return child_guest_pid;
 
 fail_snapshot:
     free(regions_snapshot);
+    if (snapshot_shm_fd >= 0)
+        close(snapshot_shm_fd);
     /* Roll back the in-place anon-shared overlay conversion while
      * siblings are still parked. A partial rollback failure (e.g.,
      * region drift past the quiesce timeout) leaves the parent in a
@@ -1578,6 +1662,23 @@ int64_t sys_clone(hv_vcpu_t vcpu,
         close(vfork_notify_fds[0]);
     if (vfork_notify_fds[1] >= 0)
         close(vfork_notify_fds[1]);
+    /* posix_spawn at the top of sys_clone always succeeds before any goto
+     * fail_snapshot fires, so child_host_pid is a live process here. The
+     * IPC socket just closed; the child reads EOF on fork_ipc_read_all and
+     * returns nonzero from fork_child_main. Without an explicit waitpid the
+     * exited child becomes a zombie: proc_register_child only runs on the
+     * success path, so neither proc_reap_finished nor sys_wait4 will ever
+     * pick this PID up, and the guest's fork(2) already reported failure.
+     * Reap it here to keep host PIDs from accumulating across repeated
+     * failures.
+     */
+    pid_t reaped;
+    do {
+        reaped = waitpid(child_host_pid, NULL, 0);
+    } while (reaped < 0 && errno == EINTR);
+    if (reaped < 0)
+        log_warn("clone: failed to reap fork-child pid=%d: %s",
+                 (int) child_host_pid, strerror(errno));
     return -LINUX_ENOMEM;
 }
 
diff --git a/tests/bench-fork-cost.sh b/tests/bench-fork-cost.sh
new file mode 100755
index 0000000..73bcaf3
--- /dev/null
+++ b/tests/bench-fork-cost.sh
@@ -0,0 +1,218 @@
+#!/usr/bin/env bash
+# Per-fork wall-clock cost for aarch64 vs x86_64-via-Rosetta
+#
+# Copyright 2026 elfuse contributors
+# SPDX-License-Identifier: Apache-2.0
+#
+# Measures the per-fork wall-clock cost of clone(SIGCHLD) (subshell fork
+# without exec) at a few resident-memory levels, on the same host, comparing:
+#
+#   - aarch64-musl static busybox  (CoW shm fast path in forkipc.c)
+#   - x86_64-musl  static busybox  (Rosetta helper path; APFS clonefile-backed
+#                                   CoW snapshot when available, otherwise
+#                                   legacy region-copy fallback)
+#
+# Issue #45 tracks the fork-cost gap that this benchmark quantifies.
+#
+# Method:
+#   - Each fork-bench run is a single busybox sh that first inflates an
+#     environment variable to KB bytes (controls parent RSS at fork time)
+#     and then runs a tight subshell loop (:) which forks but does not
+#     exec. Subshell fork keeps the per-iteration cost dominated by the
+#     fork IPC (state transfer + posix_spawn + child bring-up) and stays
+#     free of execve / interpreter setup.
+#   - A baseline run with iter_count=0 captures elfuse and busybox-sh
+#     startup, which is subtracted to give the per_fork_excl_startup column.
+#   - Wall-clock is captured via python time.monotonic_ns, in line with
+#     bench-rosetta.sh.
+#
+# Usage: tests/bench-fork-cost.sh [path/to/elfuse] [iterations]
+#
+#   iterations is the per-arch fork count. aarch64 uses iterations as-is;
+#   x86_64-via-Rosetta uses max(2, iterations / 10) by default to keep the
+#   helper-process path in reasonable wall-clock. Override with
+#   ROSETTA_ITERATIONS=N.
+
+set -euo pipefail
+
+ELFUSE_INPUT="${1:-build/elfuse}"
+ITERS="${2:-50}"
+case "$ELFUSE_INPUT" in
+    /*) ELFUSE="$ELFUSE_INPUT" ;;
+    *) ELFUSE="$(pwd)/$ELFUSE_INPUT" ;;
+esac
+
+FIXTURES="${FIXTURES_DIR:-externals/test-fixtures}"
+AARCH64_STATICBIN="${FIXTURES}/aarch64-musl/staticbin/bin"
+X86_64_STATICBIN="${FIXTURES}/x86_64-musl/staticbin/bin"
+ROSETTA_PATH="${MATRIX_ROSETTA_TRANSLATOR:-/Library/Apple/usr/libexec/oah/RosettaLinux/rosetta}"
+
+VCPU_TIMEOUT="${VCPU_TIMEOUT:-90}"
+
+if [ ! -x "$ELFUSE" ]; then
+    printf 'elfuse not found: %s\n' "$ELFUSE" >&2
+    exit 1
+fi
+if [ ! -x "${AARCH64_STATICBIN}/busybox" ]; then
+    printf 'aarch64-musl fixture missing at %s\n' "$AARCH64_STATICBIN" >&2
+    printf 'stage via: bash tests/fetch-fixtures.sh\n' >&2
+    exit 77
+fi
+have_rosetta=1
+if [ ! -x "${X86_64_STATICBIN}/busybox" ]; then
+    printf 'x86_64-musl fixture missing at %s\n' "$X86_64_STATICBIN" >&2
+    printf 'stage via: INCLUDE_X86_64=1 bash tests/fetch-fixtures.sh\n' >&2
+    have_rosetta=0
+fi
+if [ ! -x "$ROSETTA_PATH" ]; then
+    printf 'rosetta translator not found at %s; skipping x86_64 side\n' \
+        "$ROSETTA_PATH" >&2
+    have_rosetta=0
+fi
+
+# Stage under a private mktemp directory instead of a fixed /tmp name so
+# concurrent runs cannot delete each other's tree, and arm the cleanup trap
+# before populating it so a failed ln below does not leave it behind.
+SHORTDIR="$(mktemp -d /tmp/elfuse-bfc.XXXXXX)"
+trap 'rm -rf "$SHORTDIR"' EXIT
+ARM_DIR="${SHORTDIR}/arm"
+X86_DIR="${SHORTDIR}/x86"
+mkdir -p "$ARM_DIR" "$X86_DIR"
+
+arm_busybox_abs="$(cd "$AARCH64_STATICBIN" && pwd)/busybox"
+ln -s "$arm_busybox_abs" "${ARM_DIR}/busybox"
+ln -s busybox "${ARM_DIR}/sh"
+ln -s busybox "${ARM_DIR}/true"
+
+if [ "$have_rosetta" = 1 ]; then
+    x86_busybox_abs="$(cd "$X86_64_STATICBIN" && pwd)/busybox"
+    ln -s "$x86_busybox_abs" "${X86_DIR}/busybox"
+    ln -s busybox "${X86_DIR}/sh"
+    ln -s busybox "${X86_DIR}/true"
+fi
+
+# Capture the best (minimum) wall-clock of up to BFC_RUNS (default 3) accepted
+# runs for one configuration, using at most BFC_MAX_TRIES (default 6) attempts
+# in total. A run is accepted only when it exits 0 AND prints the BFC_OK
+# sentinel, so a crash or out-of-memory bail never yields a phantom-best time.
+# Prints the best sample in nanoseconds, or "FAIL" when no run was accepted.
+# Args: <iter_count> <rss_bytes> <stagedir> [extra elfuse flags...]
+run_one()
+{
+    local iters="$1" rss="$2" stagedir="$3"
+    shift 3
+    local samples=() runs="${BFC_RUNS:-3}" tries="${BFC_MAX_TRIES:-6}"
+    local good=0 attempt=0
+    local stdout_buf
+    stdout_buf=$(mktemp -t bfc-stdout.XXXXXX)
+    trap 'rm -f "$stdout_buf"' RETURN
+
+    while [ "$good" -lt "$runs" ] && [ "$attempt" -lt "$tries" ]; do
+        attempt=$((attempt + 1))
+        local start_ns end_ns rc
+        start_ns=$(python3 -c 'import time; print(time.monotonic_ns())')
+        set +e
+        "$ELFUSE" "$@" "${stagedir}/sh" -c \
+            "x=\$(printf '%*s' ${rss} '');
+             i=0;
+             while [ \$i -lt ${iters} ]; do (:); i=\$((i+1)); done;
+             echo BFC_OK" \
+            > "$stdout_buf" 2> /dev/null
+        rc=$?
+        set -e
+        end_ns=$(python3 -c 'import time; print(time.monotonic_ns())')
+        if [ "$rc" -eq 0 ] && grep -q '^BFC_OK$' "$stdout_buf"; then
+            samples+=("$((end_ns - start_ns))")
+            good=$((good + 1))
+        fi
+    done
+
+    if [ "$good" -eq 0 ]; then
+        printf 'FAIL\n'
+        return
+    fi
+    printf '%s\n' "${samples[@]}" | sort -n | head -1
+}
+
+# Print a header plus one row per rss level for one architecture; columns are
+# rss, baseline, iter_total, per_fork, per_fork_excl_startup (or a FAIL row).
+# Args: <arch_label> <stagedir> <iter_count> [extra elfuse flags...]
+report()
+{
+    local arch_label="$1" stagedir="$2" iters="$3"
+    shift 3
+
+    printf '\n%s (iters=%d):\n' "$arch_label" "$iters"
+    printf '  %-12s  %-12s  %-12s  %-12s  %s\n' \
+        "rss" "baseline" "iter_total" "per_fork" "per_fork_excl_startup"
+
+    local spec rss_label rss base_ns iter_ns per_total_ms per_excl_ms
+    for spec in "0:0" "1MiB:1048576" "16MiB:16777216" "64MiB:67108864"; do
+        rss_label="${spec%%:*}"
+        rss="${spec##*:}"
+
+        base_ns=$(run_one 0 "$rss" "$stagedir" "$@")
+        iter_ns=$(run_one "$iters" "$rss" "$stagedir" "$@")
+
+        if [ "$base_ns" = "FAIL" ] || [ "$iter_ns" = "FAIL" ]; then
+            printf '  %-12s  %s\n' "$rss_label" \
+                "FAIL (all retries crashed or exhausted resources)"
+            continue
+        fi
+
+        per_total_ms=$(python3 -c \
+            "print(f'{($iter_ns) / 1e6 / $iters:.2f}')")
+        # Unit is appended in python so the fallback prints "n/a", not "n/ams".
+        per_excl_ms=$(python3 -c "
+diff = ($iter_ns) - ($base_ns)
+print(f'{diff / 1e6 / $iters:.2f}ms' if diff > 0 else 'n/a')")
+
+        printf '  %-12s  %8.1fms    %8.1fms    %8.2fms    %12s\n' \
+            "$rss_label" \
+            "$(python3 -c "print($base_ns / 1e6)")" \
+            "$(python3 -c "print($iter_ns / 1e6)")" \
+            "$per_total_ms" \
+            "$per_excl_ms"
+    done
+}
+
+printf 'elfuse:        %s\n' "$ELFUSE"
+printf 'aarch64 fxtr:  %s/busybox\n' "$AARCH64_STATICBIN"
+if [ "$have_rosetta" = 1 ]; then
+    printf 'x86_64 fxtr:   %s/busybox\n' "$X86_64_STATICBIN"
+    printf 'rosetta:       %s\n' "$ROSETTA_PATH"
+fi
+# Resolve the Rosetta iteration count once; the banner and the run share it.
+rosetta_iters="${ROSETTA_ITERATIONS:-$((ITERS / 10 > 2 ? ITERS / 10 : 2))}"
+printf 'iterations:    aarch64=%d  rosetta=%d  best-of-%d per cell\n' \
+    "$ITERS" "$rosetta_iters" "${BFC_RUNS:-3}"
+
+# Warm caches. Rosetta translates lazily and caches per-binary in
+# ~/.cache/elfuse-rosettad/; the first invocation pays the translation cost.
+printf '\nWarming caches...\n'
+"$ELFUSE" "${ARM_DIR}/sh" -c ':' > /dev/null 2>&1 || true
+if [ "$have_rosetta" = 1 ]; then
+    "$ELFUSE" --timeout "$VCPU_TIMEOUT" "${X86_DIR}/sh" -c ':' \
+        > /dev/null 2>&1 || true
+    "$ELFUSE" --timeout "$VCPU_TIMEOUT" "${X86_DIR}/sh" -c '(:)' \
+        > /dev/null 2>&1 || true
+fi
+
+report "aarch64 (CoW shm fast path)" "$ARM_DIR" "$ITERS"
+if [ "$have_rosetta" = 1 ]; then
+    report "x86_64-via-Rosetta (clonefile CoW when available)" "$X86_DIR" \
+        "$rosetta_iters" --timeout "$VCPU_TIMEOUT"
+fi
+
+printf '\nNotes:\n'
+printf '  - per_fork is total / iters (includes elfuse + sh startup).\n'
+printf '  - per_fork_excl_startup is (iter_total - baseline) / iters and\n'
+printf '    isolates the per-fork cost from elfuse + sh bring-up.\n'
+printf '  - rss is the size of an inflated parent-sh variable, which\n'
+printf '    sits in the parent guest brk and crosses the fork IPC.\n'
+printf '  - Both architecture columns should stay roughly flat against rss\n'
+printf '    when the CoW shm path succeeds. Rosetta forks first snapshot the\n'
+printf '    shm fd with APFS clonefile, then send that snapshot via SCM_RIGHTS.\n'
+printf '    If clonefile is unavailable, Rosetta falls back to the legacy\n'
+printf '    region-copy path and the x86_64 column should scale with rss.\n'
+printf '  - See issue #45 for context.\n'