diff --git a/src/core/guest.c b/src/core/guest.c index bcbc3c2..21407d5 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -458,8 +458,15 @@ int guest_init_from_shm(guest_t *g, g->interp_base = size - 0x100000000ULL; g->mmap_limit = size - 0x200000000ULL; g->overflow_ipa_next = size; - if (compute_infra_layout(g) < 0) + if (compute_infra_layout(g) < 0) { + /* Layout computation may reject a malformed header (impossible + * guest_size / ipa_bits combination) before the mapping is set up; + * close the inherited shm fd here so the caller's contract -- this + * function takes ownership of shm_fd -- holds on every error path. + */ + close(shm_fd); return -1; + } g->pt_pool_next = g->pt_pool_base; /* Map the shm fd MAP_PRIVATE: copy-on-write semantics. Reads see diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index 3f8c4a5..435051a 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -24,6 +24,7 @@ #include #include /* fdopendir, for DIR* reconstruction in child */ #include +#include /* fclonefileat for CoW shm snapshots */ #include #include "hvutil.h" @@ -1137,6 +1138,51 @@ static void *vm_clone_thread_run(void *arg) return NULL; } +/* Create an APFS block-level CoW clone of src_fd via fclonefileat (O(metadata), + * independent of the source once either side writes). Returns the clone fd on + * success, -1 with errno set on failure (non-APFS /tmp, ENOSYS, ENOSPC, ...). + * Callers that issue this snapshot are documented at the call site; the helper + * itself only owns the clone-path lifecycle: src_fd is left open, and the returned fd (opened O_RDWR | O_CLOEXEC) belongs to the caller, which must close it. + */ +static int fork_snapshot_shm_via_clonefile(int src_fd) +{ + /* fclonefileat needs a destination path on the same APFS volume as the + * source. /tmp is APFS on every shipped macOS Apple Silicon configuration; + * if a user has remapped /tmp to a different filesystem the call fails + * and the caller drops back to the legacy path. 
+ * + * The destination lives inside a fresh mkdtemp directory (mode 0700) so + * no other local user can race to claim the destination basename between + * path selection and fclonefileat: an earlier mkstemp + unlink + + * fclonefileat sequence left a window where /tmp was world-writable for + * that name and a concurrent process could DoS the fast path via EEXIST. + */ + char tmpdir[] = "/tmp/elfuse-fork-XXXXXX"; + if (mkdtemp(tmpdir) == NULL) + return -1; + char clone_path[64]; + snprintf(clone_path, sizeof(clone_path), "%s/snap", tmpdir); + if (fclonefileat(src_fd, AT_FDCWD, clone_path, 0) < 0) { + int saved_errno = errno; + rmdir(tmpdir); + errno = saved_errno; + return -1; + } + int clone_fd = open(clone_path, O_RDWR | O_CLOEXEC); + int saved_errno = errno; + /* Best-effort cleanup: the clone fd alone keeps the inode alive, so any + * unlink/rmdir failure here is a directory-leak nuisance, not a + * correctness issue. Caller still gets the open fd. errno from open() was saved above because unlink/rmdir may overwrite it before the clone_fd check. + */ + (void) unlink(clone_path); + (void) rmdir(tmpdir); + if (clone_fd < 0) { + errno = saved_errno; + return -1; + } + return clone_fd; +} + int64_t sys_clone(hv_vcpu_t vcpu, guest_t *g, uint64_t flags, @@ -1163,13 +1209,13 @@ int64_t sys_clone(hv_vcpu_t vcpu, verbose); } - /* Rosetta fork takes the helper-process IPC path. The CoW shm fast-path - * is gated off in use_shm because HVF caches VA->PA at hv_vm_map time and - * the parent's MAP_SHARED mapping cannot be safely remapped under the - * running vCPU. The TTBR1 kbuf tree, translator image, and kbuf bytes - * ride along as primary-buffer used regions; the child restores - * TCR_EL1 / TTBR1_EL1 from ipc_registers_t and recomputes kbuf_base - * from kbuf_gpa. + /* Rosetta fork takes the helper-process IPC path. The parent cannot remap + * its live guest memory under the running vCPU because HVF caches VA->PA at + * hv_vm_map time; instead, the fork path snapshots shm with clonefile when + * available and otherwise falls back to region copy. 
The TTBR1 kbuf tree, + * translator image, and kbuf bytes ride along as primary-buffer used + * regions; the child restores TCR_EL1 / TTBR1_EL1 from ipc_registers_t and + * recomputes kbuf_base from kbuf_gpa. */ /* elfuse only supports fork-like clone (SIGCHLD) and posix_spawn-like @@ -1291,10 +1337,17 @@ int64_t sys_clone(hv_vcpu_t vcpu, return -LINUX_ENOMEM; } - /* The parent keeps only its end of the control channel. */ + /* The parent keeps only its end of the control channel. Reset the closed + * write end to -1 so the fail_snapshot guarded close at the bottom of the + * function cannot double-close it. In a multithreaded guest, another vCPU + * could open a new fd between the two closes and get the same number, + * which the second close would then steal. + */ close(sock_fds[1]); - if (vfork_notify_fds[1] >= 0) + if (vfork_notify_fds[1] >= 0) { close(vfork_notify_fds[1]); + vfork_notify_fds[1] = -1; + } int ipc_sock = sock_fds[0]; /* Allocate guest PID before serialization so the child header carries its @@ -1314,6 +1367,10 @@ int64_t sys_clone(hv_vcpu_t vcpu, mmap_fork_anon_shared_txn_t *anon_shared_txn = NULL; guest_region_t *regions_snapshot = NULL; guest_region_t preannounced_snapshot[GUEST_MAX_PREANNOUNCED]; + /* APFS clone fd for the CoW snapshot sent to the child. Declared up front + * so early goto fail_snapshot exits do not read an uninitialized local. + */ + int snapshot_shm_fd = -1; /* Convert MAP_SHARED|MAP_ANONYMOUS regions that have no backing fd * into memfd-backed overlay regions. The conversion seeds a private @@ -1328,31 +1385,87 @@ int64_t sys_clone(hv_vcpu_t vcpu, if (mmap_fork_prepare_anon_shared(g, &anon_shared_txn) < 0) goto fail_snapshot; - /* Determine if elfuse can use the CoW (shm) fast path. - * If shm_fd >= 0, elfuse freezes a snapshot via MAP_PRIVATE and sends the - * shm fd to the child. Otherwise fall back to region-by-region copy. 
+ /* CoW fast path: if shm_fd >= 0, send a snapshot of guest memory to the + * child instead of the per-region copy. The child maps that snapshot + * MAP_PRIVATE; subsequent writes on either side are private. * - * Rosetta guests are excluded from CoW even when shm-backed: rosetta's - * JIT state (TLS slabs, code caches, indirect-call tables, block lists) - * is process-local and corrupts when CoW-shared. The legacy region-copy - * path preserves the parent's JIT state independently per child. - */ - bool use_shm = (g->shm_fd >= 0) && !g->is_rosetta; - - /* elfuse does not remap the parent to MAP_PRIVATE here. The parent - * stays on MAP_SHARED; its vCPU continues writing to the shared file. - * The child maps MAP_PRIVATE, getting a CoW snapshot. + * The parent's own mapping cannot be flipped to MAP_PRIVATE here: hv_vm_map + * caches the host VA->PA mapping, and a MAP_FIXED remap invalidates it + * (the parent then reads stale memory and writev returns EFAULT). So the + * parent stays on MAP_SHARED and the snapshot is what isolates the child. * - * This is safe because the IPC is synchronous: the child maps MAP_PRIVATE - * before the parent's vCPU resumes. After that, the child's CoW pages are - * frozen (child writes are private, parent writes to MAP_SHARED do not - * affect CoW'd child pages). + * Two snapshot sources, in preference order (selected just below): + * 1. fclonefileat of g->shm_fd to an independent APFS clone. The clone + * shares blocks with the parent until either side writes, so the + * parent's subsequent writes never reach the child's backing. + * 2. The live g->shm_fd. Any page the child has not yet COW'd reads the + * parent's current bytes -- benign for typical guest state, but + * corrupts Rosetta's translator-internal structures (TLS slabs, code + * caches, indirect-call tables) on mid-update reads. Issue #45. 
* - * an earlier implementation tried remapping the parent to MAP_PRIVATE here, - * but that breaks HVF: hv_vm_map caches the host VA->PA mapping, and - * MAP_FIXED remap invalidates it. The parent's vCPU then reads stale - * memory, causing corrupted syscall data (EFAULT on writev). + * Rosetta therefore requires path 1 and falls back to region copy if + * fclonefileat fails; native guests accept path 2 as a fallback so a + * non-APFS /tmp does not silently slow forks down to per-region copy cost. */ + bool use_shm = (g->shm_fd >= 0); + + /* Overlay sync runs before the snapshot so the cloned file picks up the + * overlay-backed bytes. The parent's host VA for each overlay region maps + * the overlay file, not shm_fd, so shm_fd's contents at those offsets are + * stale (typically zero) until the pwrite below copies them in. Both the + * clone-fd path and the live-shm_fd fallback consume this sync. + */ + if (use_shm) { + for (int i = 0; i < g->nregions; i++) { + const guest_region_t *r = &g->regions[i]; + if (!r->overlay_active) + continue; + uint64_t len = r->end - r->start; + const uint8_t *src = (const uint8_t *) g->host_base + r->start; + uint64_t off = r->start; + while (len > 0) { + size_t chunk = len > (uint64_t) SSIZE_MAX ? (size_t) SSIZE_MAX + : (size_t) len; + ssize_t nw = pwrite(g->shm_fd, src, chunk, (off_t) off); + if (nw < 0) { + if (errno == EINTR) + continue; + log_error("clone: shm overlay sync pwrite failed: %s", + strerror(errno)); + goto fail_snapshot; + } + if (nw == 0) { + log_error("clone: shm overlay sync pwrite returned 0"); + goto fail_snapshot; + } + src += nw; + off += (uint64_t) nw; + len -= (uint64_t) nw; + } + } + /* Attempt the APFS clone snapshot for every guest, not just Rosetta: + * the clone gives POSIX-style isolation at O(metadata) cost and avoids + * torn-snapshot reads in guests that snapshot their own state across + * fork (Redis BGSAVE, checkpointing runtimes). 
On failure the fallback + * differs per design above: Rosetta drops use_shm so the region-copy + * path runs; native guests keep use_shm and send the live g->shm_fd. + */ + snapshot_shm_fd = fork_snapshot_shm_via_clonefile(g->shm_fd); + if (snapshot_shm_fd < 0) { + if (g->is_rosetta) { + log_warn( + "clone: rosetta CoW snapshot via fclonefileat failed " + "(%s); falling back to region-copy path", + strerror(errno)); + use_shm = false; + } else { + log_debug( + "clone: CoW snapshot via fclonefileat failed (%s); " + "sending live shm fd as fallback", + strerror(errno)); + } + } + } /* Snapshot of the semantic region array, populated after the memory dump * but before sibling vCPUs resume. Declared up front so all goto paths to @@ -1401,46 +1514,13 @@ int64_t sys_clone(hv_vcpu_t vcpu, goto fail_snapshot; } - /* CoW path: sync MAP_SHARED file overlays back into shm_fd before - * sending it to the child. The parent's host VA at each overlay - * region maps the overlay file, not shm_fd, so shm_fd's content at - * those IPAs is stale (typically zero). The child's MAP_PRIVATE - * snapshot would expose that stale data at the overlay IPAs. Copy - * the live overlay bytes into shm_fd at the matching offsets so the - * child snapshot reflects the parent's view at fork time. Live - * cross-fork MAP_SHARED coherence (parent and child both seeing - * subsequent writes through the same file) is left to the cross-fork - * coherence TODO; this fix only avoids the stale-snapshot regression. + /* Send the snapshot fd if fclonefileat succeeded, otherwise the live + * g->shm_fd. The Rosetta-failure case already cleared use_shm above so it + * never reaches this branch with snapshot_shm_fd < 0. 
*/ if (use_shm) { - for (int i = 0; i < g->nregions; i++) { - const guest_region_t *r = &g->regions[i]; - if (!r->overlay_active) - continue; - uint64_t len = r->end - r->start; - const uint8_t *src = (const uint8_t *) g->host_base + r->start; - uint64_t off = r->start; - while (len > 0) { - size_t chunk = len > (uint64_t) SSIZE_MAX ? (size_t) SSIZE_MAX - : (size_t) len; - ssize_t nw = pwrite(g->shm_fd, src, chunk, (off_t) off); - if (nw < 0) { - if (errno == EINTR) - continue; - log_error("clone: shm overlay sync pwrite failed: %s", - strerror(errno)); - goto fail_snapshot; - } - if (nw == 0) { - log_error("clone: shm overlay sync pwrite returned 0"); - goto fail_snapshot; - } - src += nw; - off += (uint64_t) nw; - len -= (uint64_t) nw; - } - } - if (fork_ipc_send_fds(ipc_sock, &g->shm_fd, 1) < 0) { + int fd_to_send = (snapshot_shm_fd >= 0) ? snapshot_shm_fd : g->shm_fd; + if (fork_ipc_send_fds(ipc_sock, &fd_to_send, 1) < 0) { log_error("clone: failed to send shm fd"); goto fail_snapshot; } @@ -1555,10 +1635,14 @@ int64_t sys_clone(hv_vcpu_t vcpu, child_host_pid); free(regions_snapshot); + if (snapshot_shm_fd >= 0) + close(snapshot_shm_fd); return child_guest_pid; fail_snapshot: free(regions_snapshot); + if (snapshot_shm_fd >= 0) + close(snapshot_shm_fd); /* Roll back the in-place anon-shared overlay conversion while * siblings are still parked. A partial rollback failure (e.g., * region drift past the quiesce timeout) leaves the parent in a @@ -1578,6 +1662,23 @@ int64_t sys_clone(hv_vcpu_t vcpu, close(vfork_notify_fds[0]); if (vfork_notify_fds[1] >= 0) close(vfork_notify_fds[1]); + /* posix_spawn at the top of sys_clone always succeeds before any goto + * fail_snapshot fires, so child_host_pid is a live process here. The + * IPC socket just closed; the child reads EOF on fork_ipc_read_all and + * returns nonzero from fork_child_main. 
Without an explicit waitpid the + * exited child becomes a zombie: proc_register_child only runs on the + * success path, so neither proc_reap_finished nor sys_wait4 will ever + * pick this PID up, and the guest's fork(2) already reported failure. + * Reap it here to keep host PIDs from accumulating across repeated + * failures. + */ + pid_t reaped; + do { + reaped = waitpid(child_host_pid, NULL, 0); + } while (reaped < 0 && errno == EINTR); + if (reaped < 0) + log_warn("clone: failed to reap fork-child pid=%d: %s", + (int) child_host_pid, strerror(errno)); return -LINUX_ENOMEM; } diff --git a/tests/bench-fork-cost.sh b/tests/bench-fork-cost.sh new file mode 100755 index 0000000..73bcaf3 --- /dev/null +++ b/tests/bench-fork-cost.sh @@ -0,0 +1,218 @@ +#!/usr/bin/env bash +# Per-fork wall-clock cost for aarch64 vs x86_64-via-Rosetta +# +# Copyright 2026 elfuse contributors +# SPDX-License-Identifier: Apache-2.0 +# +# Measures the per-fork wall-clock cost of clone(SIGCHLD) (subshell fork +# without exec) at a few resident-memory levels, on the same host, comparing: +# +# - aarch64-musl static busybox (CoW shm fast path in forkipc.c) +# - x86_64-musl static busybox (Rosetta helper path; APFS clonefile-backed +# CoW snapshot when available, otherwise +# legacy region-copy fallback) +# +# Issue #45 tracks the fork-cost gap that this benchmark quantifies. +# +# Method: +# - Each fork-bench run is a single busybox sh that first inflates a +# shell variable to the requested byte count (controls parent RSS at fork time) +# and then runs a tight subshell loop (:) which forks but does not +# exec. Subshell fork keeps the per-iteration cost dominated by the +# fork IPC (state transfer + posix_spawn + child bring-up) and stays +# free of execve / interpreter setup. +# - A baseline run with iter_count=0 captures elfuse and busybox-sh +# startup, so the per_fork_excl_startup column can subtract that startup cost. 
+# - Wall-clock is captured via python time.monotonic_ns, in line with +# bench-rosetta.sh. +# +# Usage: tests/bench-fork-cost.sh [path/to/elfuse] [iterations] +# +# iterations is the per-arch fork count. aarch64 uses iterations as-is; +# x86_64-via-Rosetta uses max(2, iterations / 10) by default to keep the +# helper-process path in reasonable wall-clock. Override with +# ROSETTA_ITERATIONS=N. + +set -euo pipefail + +ELFUSE_INPUT="${1:-build/elfuse}" +ITERS="${2:-50}" +case "$ELFUSE_INPUT" in + /*) ELFUSE="$ELFUSE_INPUT" ;; + *) ELFUSE="$(pwd)/$ELFUSE_INPUT" ;; +esac + +FIXTURES="${FIXTURES_DIR:-externals/test-fixtures}" +AARCH64_STATICBIN="${FIXTURES}/aarch64-musl/staticbin/bin" +X86_64_STATICBIN="${FIXTURES}/x86_64-musl/staticbin/bin" +ROSETTA_PATH="${MATRIX_ROSETTA_TRANSLATOR:-/Library/Apple/usr/libexec/oah/RosettaLinux/rosetta}" + +SHORTDIR=/tmp/elfuse-bfc +ARM_DIR="${SHORTDIR}/arm" +X86_DIR="${SHORTDIR}/x86" + +VCPU_TIMEOUT="${VCPU_TIMEOUT:-90}" + +if [ ! -x "$ELFUSE" ]; then + printf 'elfuse not found: %s\n' "$ELFUSE" >&2 + exit 1 +fi +if [ ! -x "${AARCH64_STATICBIN}/busybox" ]; then + printf 'aarch64-musl fixture missing at %s\n' "$AARCH64_STATICBIN" >&2 + printf 'stage via: bash tests/fetch-fixtures.sh\n' >&2 + exit 77 +fi +have_rosetta=1 +if [ ! -x "${X86_64_STATICBIN}/busybox" ]; then + printf 'x86_64-musl fixture missing at %s\n' "$X86_64_STATICBIN" >&2 + printf 'stage via: INCLUDE_X86_64=1 bash tests/fetch-fixtures.sh\n' >&2 + have_rosetta=0 +fi +if [ ! 
-x "$ROSETTA_PATH" ]; then + printf 'rosetta translator not found at %s; skipping x86_64 side\n' \ + "$ROSETTA_PATH" >&2 + have_rosetta=0 +fi + +rm -rf "$SHORTDIR" +mkdir -p "$ARM_DIR" "$X86_DIR" + +arm_busybox_abs="$(cd "$AARCH64_STATICBIN" && pwd)/busybox" +ln -s "$arm_busybox_abs" "${ARM_DIR}/busybox" +ln -s busybox "${ARM_DIR}/sh" +ln -s busybox "${ARM_DIR}/true" + +if [ "$have_rosetta" = 1 ]; then + x86_busybox_abs="$(cd "$X86_64_STATICBIN" && pwd)/busybox" + ln -s "$x86_busybox_abs" "${X86_DIR}/busybox" + ln -s busybox "${X86_DIR}/sh" + ln -s busybox "${X86_DIR}/true" +fi + +trap 'rm -rf "$SHORTDIR"' EXIT + +# Capture best-of-N wall-clock for one configuration (N = BFC_RUNS, default 3). At most +# BFC_MAX_TRIES (default 6) attempts are made in total; only runs that exit 0 AND print the BFC_OK sentinel are +# accepted. Prints "FAIL" on stdout when no run succeeds, so the caller can +# mark the cell as unreliable rather than reporting a phantom-best time +# captured from a crash or out-of-memory bail. +# Args: iters rss_bytes stagedir [extra elfuse flags...] +run_one() +{ + local iters="$1" rss="$2" stagedir="$3" + shift 3 + local samples=() runs="${BFC_RUNS:-3}" tries="${BFC_MAX_TRIES:-6}" + local good=0 attempt=0 + local stdout_buf + stdout_buf=$(mktemp -t bfc-stdout.XXXXXX) + trap 'rm -f "$stdout_buf"' RETURN + + while [ "$good" -lt "$runs" ] && [ "$attempt" -lt "$tries" ]; do + attempt=$((attempt + 1)) + local start_ns end_ns rc + start_ns=$(python3 -c 'import time; print(time.monotonic_ns())') + set +e + "$ELFUSE" "$@" "${stagedir}/sh" -c \ + "x=\$(printf '%*s' ${rss} ''); + i=0; + while [ \$i -lt ${iters} ]; do (:); i=\$((i+1)); done; + echo BFC_OK" \ + > "$stdout_buf" 2> /dev/null + rc=$? 
+ set -e + end_ns=$(python3 -c 'import time; print(time.monotonic_ns())') + if [ "$rc" -eq 0 ] && grep -q '^BFC_OK$' "$stdout_buf"; then + samples+=("$((end_ns - start_ns))") + good=$((good + 1)) + fi + done + + if [ "$good" -eq 0 ]; then + printf 'FAIL\n' + return + fi + printf '%s\n' "${samples[@]}" | sort -n | head -1 +} + +# Args: arch_label stagedir iters [extra elfuse flags...] +# Prints one table row per rss level: rss label, baseline ms, iter_total ms, per_fork ms, and per_fork_excl_startup (a bare 'n/a' when the iterated run was not slower than the baseline) +report() +{ + local arch_label="$1" stagedir="$2" iters="$3" + shift 3 + + printf '\n%s (iters=%d):\n' "$arch_label" "$iters" + printf ' %-12s %-12s %-12s %-12s %s\n' \ + "rss" "baseline" "iter_total" "per_fork" "per_fork_excl_startup" + + local rss_label rss + for spec in "0:0" "1MiB:1048576" "16MiB:16777216" "64MiB:67108864"; do + rss_label="${spec%%:*}" + rss="${spec##*:}" + + local base_ns iter_ns + base_ns=$(run_one 0 "$rss" "$stagedir" "$@") + iter_ns=$(run_one "$iters" "$rss" "$stagedir" "$@") + + if [ "$base_ns" = "FAIL" ] || [ "$iter_ns" = "FAIL" ]; then + printf ' %-12s %s\n' "$rss_label" \ + "FAIL (all retries crashed or exhausted resources)" + continue + fi + + local per_total_ms per_excl_ms + per_total_ms=$(python3 -c \ + "print(f'{($iter_ns) / 1e6 / $iters:.2f}')") + per_excl_ms=$(python3 -c " +diff = ($iter_ns) - ($base_ns) +print(f'{diff / 1e6 / $iters:.2f}ms' if diff > 0 else 'n/a')") + + printf ' %-12s %8.1fms %8.1fms %8.2fms %12s\n' \ + "$rss_label" \ + "$(python3 -c "print($base_ns / 1e6)")" \ + "$(python3 -c "print($iter_ns / 1e6)")" \ + "$per_total_ms" \ + "$per_excl_ms" + done +} + +printf 'elfuse: %s\n' "$ELFUSE" +printf 'aarch64 fxtr: %s/busybox\n' "$AARCH64_STATICBIN" +if [ "$have_rosetta" = 1 ]; then + printf 'x86_64 fxtr: %s/busybox\n' "$X86_64_STATICBIN" + printf 'rosetta: %s\n' "$ROSETTA_PATH" +fi +printf 'iterations: aarch64=%d rosetta=%d best-of-%d per cell\n' \ + "$ITERS" "${ROSETTA_ITERATIONS:-$((ITERS / 10 > 2 ? ITERS / 10 : 2))}" "${BFC_RUNS:-3}" + +# Warm caches. 
Rosetta translates lazily and caches per-binary in +# ~/.cache/elfuse-rosettad/; the first invocation pays the translation cost. +printf '\nWarming caches...\n' +"$ELFUSE" "${ARM_DIR}/sh" -c ':' > /dev/null 2>&1 || true +if [ "$have_rosetta" = 1 ]; then + "$ELFUSE" --timeout "$VCPU_TIMEOUT" "${X86_DIR}/sh" -c ':' \ + > /dev/null 2>&1 || true + "$ELFUSE" --timeout "$VCPU_TIMEOUT" "${X86_DIR}/sh" -c '(:)' \ + > /dev/null 2>&1 || true +fi + +report "aarch64 (CoW shm fast path)" "$ARM_DIR" "$ITERS" + +if [ "$have_rosetta" = 1 ]; then + rosetta_iters="${ROSETTA_ITERATIONS:-$((ITERS / 10 > 2 ? ITERS / 10 : 2))}" + report "x86_64-via-Rosetta (clonefile CoW when available)" "$X86_DIR" \ + "$rosetta_iters" --timeout "$VCPU_TIMEOUT" +fi + +printf '\nNotes:\n' +printf ' - per_fork is total / iters (includes elfuse + sh startup).\n' +printf ' - per_fork_excl_startup is (iter_total - baseline) / iters and\n' +printf ' isolates the per-fork cost from elfuse + sh bring-up.\n' +printf ' - rss is the size of an inflated parent-sh variable, which\n' +printf ' sits in the parent guest brk and crosses the fork IPC.\n' +printf ' - Both architecture columns should stay roughly flat against rss\n' +printf ' when the CoW shm path succeeds. Rosetta forks first snapshot the\n' +printf ' shm fd with APFS clonefile, then send that snapshot via SCM_RIGHTS.\n' +printf ' If clonefile is unavailable, Rosetta falls back to the legacy\n' +printf ' region-copy path and the x86_64 column should scale with rss.\n' +printf ' - See issue #45 for context.\n'