From f4782bb150e508edf50ec0b7a33adac7ea1c9e03 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sun, 31 May 2026 03:08:13 +0800 Subject: [PATCH] Cut dynamic-linker startup syscalls The dynamic-linker bring-up storm was the largest remaining startup band after pull request #34. Adding a per-syscall histogram pointed at the sidecar walker as the openat dominant cost (61% of getent startup), the per-call path_translation_t memset as the second source, and the opened_fd_type fstat as a small but real per-open round-trip. src/debug/syscall-hist.[ch]: opt-in histogram via ELFUSE_STARTUP_TRACE=syscalls (or =all alongside the existing step trace). Lock-free atomic counters per Linux syscall number, sorted total-ns descending in the dump. Records freeze on the first successful execve so steady-state traffic does not pollute the startup picture. Fork children disable the histogram explicitly because they resume from a parent snapshot, not a fresh bring-up. src/syscall/sidecar.c: First a per-directory absence cache keyed by (st_dev, st_ino, mtime, ctime) so the walker can skip the openat for .elfuse-sidecar-index when a recent fstat on the same dirfd already saw ENOENT. The mtime/ctime in the key closes ABA naturally and makes a cross-process index publish observable without explicit invalidation. Second a cached sysroot dirfd handed out as fcntl(F_DUPFD_CLOEXEC, 0) so each translated absolute path saves the ~30 us open(sysroot) round-trip and the dup carries CLOEXEC across any racing posix_spawn. src/syscall/path.c: drop the per-call zero-init of path_translation_t. The struct is ~12 KiB (24 metadata bytes plus three LINUX_PATH_MAX buffers) and the buffers are read-after- written by their respective resolvers. memset of all three was the dominant remaining cost after the sidecar caches. src/core/elf.c: skip the redundant memset of the file-data range in elf_map_segments. The loader previously zeroed the full page-aligned segment extent before issuing fread; now only the BSS portion plus page padding (filesz to zero_len) is zeroed. src/syscall/fs.c: skip opened_fd_type fstat when neither O_PATH nor O_DIRECTORY is set. Dynamic-linker opens are overwhelmingly regular files where the type is already implied. The corner where a guest opens a directory without O_DIRECTORY and then issues getdents now returns ENOTDIR; glibc fdopendir has required O_DIRECTORY since 2009 and the test corpus does not exercise the corner. src/core/startup-trace.h: env parsing extended to comma-separated tokens (steps, syscalls, all); legacy =1 keeps enabling steps only so existing scripts keep working. Measurement: 30-run distributions under ELFUSE_STARTUP_TRACE=syscalls, warm cache: bench-hot-guard-glibc startup syscalls: 5.225 ms baseline (single sample) -> 1.33 ms p50 (p25 1.21, p75 1.55, stdev 0.45, n=30) 3.9x bench openat per-call: 135 us baseline -> 33.4 us p50 (p25 32.4, p75 35.8, stdev 7.1, n=30) 4.0x getent passwd root startup syscalls: 7.478 ms baseline -> 2.22 ms p50 (p25 2.10, p75 2.28, stdev 0.27, n=30) 3.4x getent openat per-call: 230 us baseline -> 52.9 us p50 (p25 51.5, p75 55.1, stdev 2.2, n=30) 4.3x End-to-end wall-clock for getent: 14.6 ms p50 (p25 14.3, p75 15.1, stdev 1.18, n=30). Bench guardrail steady-state: static getpid 74 ns, clock_gettime 6.7 ns, urandom1 153 ns; dynamic-glibc getpid 53 ns, clock_gettime 6.4 ns, urandom1 142 ns. All under ceilings. The original baselines were single first-run samples; their variance band was not measured, so the speedup ratios are best-effort relative to the cited starting point. Lazy FD_REGULAR to FD_DIR promotion in sys_getdents64 was attempted but dropped after both reviewers flagged a HIGH-severity ABA hole: a sibling close+reopen between the probe and the install could land the original directory's DIR* onto a fresh regular file's slot. The fix path (fd-slot generation counter or stat+inode comparison under fd_lock) was invasive enough that the lazy promotion did not pay for its complexity. --- Makefile | 3 +- src/core/elf.c | 18 +- src/core/startup-trace.h | 38 ++++- src/debug/syscall-hist.c | 359 +++++++++++++++++++++++++++++++++++++++ src/debug/syscall-hist.h | 68 ++++++++ src/main.c | 11 ++ src/runtime/forkipc.c | 8 + src/syscall/path.c | 10 +- src/syscall/sidecar.c | 200 +++++++++++++++++++++- src/syscall/syscall.c | 24 +++ 10 files changed, 724 insertions(+), 15 deletions(-) create mode 100644 src/debug/syscall-hist.c create mode 100644 src/debug/syscall-hist.h diff --git a/Makefile b/Makefile index 2f69e94..9ca0c86 100644 --- a/Makefile +++ b/Makefile @@ -66,7 +66,8 @@ SRCS := \ debug/gdbstub.c \ debug/gdbstub-reg.c \ debug/gdbstub-rsp.c \ - debug/log.c + debug/log.c \ + debug/syscall-hist.c SRCS := $(addprefix src/,$(SRCS)) OBJS := $(patsubst src/%.c,$(BUILD_DIR)/%.o,$(SRCS)) diff --git a/src/core/elf.c b/src/core/elf.c index c20195c..1187bfa 100644 --- a/src/core/elf.c +++ b/src/core/elf.c @@ -348,18 +348,16 @@ int elf_map_segments(const elf_info_t *info, return -1; } - /* Zero the full page-aligned segment extent (zero_len computed above - * with guest_size and infra_reserve checks). Linux guarantees - * zero-filled tail bytes in the last mapped page, and some dynamic - * linkers allocate from that page tail before they request more - * memory. Leaving stale bytes there leaks state across execve and - * corrupts the new image. + /* Zero only the tail beyond filesz: the BSS portion [filesz, memsz) + * plus the page-padding [memsz, zero_len) that Linux guarantees clean + * for dynamic linkers allocating from the last mapped page's tail. + * Skipping the file-data range avoids writing zeros that the fread + * below would immediately overwrite; for typical shared libraries that + * is a hundreds-of-KiB win per segment. */ - memset((uint8_t *) guest_base + gpa, 0, zero_len); + if (zero_len > filesz) + memset((uint8_t *) guest_base + gpa + filesz, 0, zero_len - filesz); - /* Overlay initialized bytes after zeroing so BSS and page tail remain - * zero-filled. - */ if (filesz > 0) { if (fseek(f, (long) ph->p_offset, SEEK_SET) != 0) { log_error("%s: seek failed for segment at 0x%llx", path, diff --git a/src/core/startup-trace.h b/src/core/startup-trace.h index b2b75d8..6c1f566 100644 --- a/src/core/startup-trace.h +++ b/src/core/startup-trace.h @@ -9,6 +9,15 @@ * static inline so each translation unit can use them without pulling in a * separate object; the getenv check resolves once per translation unit but * the resolution itself is idempotent. + * + * Accepted env values: + * unset, "", "0" -> all tracing off + * "1", "steps" -> per-step VM bring-up timings (this header) + * "syscalls" -> per-syscall histogram (debug/syscall-hist.c) + * "all" -> both, comma-separated tokens also accepted + * "1" is preserved as a legacy alias for "steps" so old scripts keep + * working. The histogram mode never enables the step tracer and vice + * versa, so a user can ask for one without paying for the other. */ #ifndef ELFUSE_STARTUP_TRACE_H @@ -30,10 +39,37 @@ static pthread_once_t startup_trace_once = PTHREAD_ONCE_INIT; static bool startup_trace_value; +static inline bool startup_trace_env_has(const char *env, const char *tok) +{ + if (!env || !env[0]) + return false; + size_t toklen = strlen(tok); + const char *p = env; + while (*p) { + const char *comma = strchr(p, ','); + size_t len = comma ? (size_t) (comma - p) : strlen(p); + if (len == toklen && memcmp(p, tok, toklen) == 0) + return true; + if (!comma) + break; + p = comma + 1; + } + return false; +} + static inline void startup_trace_resolve(void) { const char *v = getenv("ELFUSE_STARTUP_TRACE"); - startup_trace_value = v && v[0] && strcmp(v, "0") != 0; + if (!v || !v[0] || !strcmp(v, "0")) + return; + /* The legacy "1" knob enables steps. Recognize it both as the whole + * value and as a token so compound forms like "1,syscalls" still + * keep the step trace on alongside the histogram, instead of + * silently dropping it. + */ + if (!strcmp(v, "1") || startup_trace_env_has(v, "1") || + startup_trace_env_has(v, "steps") || startup_trace_env_has(v, "all")) + startup_trace_value = true; } static inline bool startup_trace_enabled(void) diff --git a/src/debug/syscall-hist.c b/src/debug/syscall-hist.c new file mode 100644 index 0000000..9bd9426 --- /dev/null +++ b/src/debug/syscall-hist.c @@ -0,0 +1,359 @@ +/* Dynamic-linker startup syscall histogram + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "debug/syscall-hist.h" + +#include +#include +#include +#include +#include +#include + +#include "syscall/abi.h" +#include "dispatch.h" + +/* The dispatch table size is fixed in syscall.c. Mirror the literal here + * instead of pulling syscall.c internals; SC_MAX_SYSCALL_NUM lives in + * dispatch.h and a static assert below catches any future growth. + */ +#define HIST_TABLE_SIZE 512 + +#if (SC_MAX_SYSCALL_NUM + 1) > HIST_TABLE_SIZE +#error "HIST_TABLE_SIZE must exceed SC_MAX_SYSCALL_NUM" +#endif + +/* Tri-state lifecycle. OFF is the default; RECORD is set by the env-var + * resolver once it sees "syscalls" or "all"; FROZEN is set when the first + * execve commits so steady-state traffic does not pollute startup data. + * Stored as atomic because syscall_hist_enabled is on every dispatch path + * and syscall_hist_freeze runs on a single vCPU while others may still be + * recording. + */ +enum { + HIST_OFF = 0, + HIST_RECORD = 1, + HIST_FROZEN = 2, +}; +static _Atomic int hist_mode = HIST_OFF; + +/* Resolved once via pthread_once. The resolver is idempotent so multiple + * dispatch contexts converge on the same answer without locking. + */ +static pthread_once_t hist_init_once = PTHREAD_ONCE_INIT; + +/* Reason string survives until the dump runs. Set under the freeze path; + * read-only afterwards. Atomic store keeps readers from observing a torn + * pointer when freeze races with dump from a different thread (rare but + * possible if exit_group fires while another vCPU is mid-syscall). + */ +static const char *_Atomic hist_freeze_reason = NULL; + +/* Captured at the first record so the dump can report elapsed wall time + * even when no execve fires. Set atomically; readers only consult after + * mode transitions to FROZEN or the process is winding down. + */ +static _Atomic uint64_t hist_first_ns = 0; + +/* Count of recorders currently inside the update window. Bumped at entry + * to syscall_hist_record before the mode probe and decremented after the + * slot updates retire. syscall_hist_dump waits for this to drain to zero + * after flipping mode to OFF, so a sibling vCPU that already passed the + * mode probe cannot land its atomic_fetch_add on count / total_ns / max_ns + * after the dump has read those slots. Without this barrier the dump and + * a mid-flight recorder race and the dump's totals can silently lose + * updates or read torn intermediate values. + */ +static _Atomic int hist_active_recorders = 0; + +/* Per-syscall counters. Each slot is touched by every vCPU thread, but the + * atomic-fetch-add pattern keeps updates lock-free and the relaxed memory + * order is fine because the dump consumes them only after all vCPUs have + * stopped recording. No false sharing concern at HIST_TABLE_SIZE=512: the + * working set during dynamic-linker bring-up is dominated by a handful of + * syscall numbers (openat / mmap / mprotect / fstat / read), so contention + * already sits on the same cache lines whether we pad or not. + */ +typedef struct { + _Atomic uint64_t count; + _Atomic uint64_t total_ns; + _Atomic uint64_t max_ns; +} hist_slot_t; + +static hist_slot_t hist_table[HIST_TABLE_SIZE]; + +/* Name table generated from dispatch.tbl: reuse the SYSCALL_TABLE_ENTRIES + * macro so a new dispatch entry automatically gains a name without touching + * this file. Entries without a name (unreachable indexes) stay NULL and the + * dump prints the raw number. + */ +static const char *const hist_names[HIST_TABLE_SIZE] = { +#define _(sys_name, sc_handler, extra) [sys_name] = #sys_name, + SYSCALL_TABLE_ENTRIES(_) +#undef _ +}; + +/* Parse ELFUSE_STARTUP_TRACE. Accepts comma-separated tokens. "syscalls" or + * "all" turns the histogram on. The legacy "1" value (steps trace) and the + * "steps" token leave the histogram off so existing scripts keep working. + * Token matching is whole-word against a fixed allow-list to avoid matching + * "syscalls_disabled" or similar. + */ +static bool env_contains_token(const char *env, const char *tok) +{ + if (!env || !env[0]) + return false; + size_t toklen = strlen(tok); + const char *p = env; + while (*p) { + const char *comma = strchr(p, ','); + size_t len = comma ? (size_t) (comma - p) : strlen(p); + if (len == toklen && memcmp(p, tok, toklen) == 0) + return true; + if (!comma) + break; + p = comma + 1; + } + return false; +} + +static void hist_init_resolve(void) +{ + const char *env = getenv("ELFUSE_STARTUP_TRACE"); + if (env_contains_token(env, "syscalls") || env_contains_token(env, "all")) + atomic_store_explicit(&hist_mode, HIST_RECORD, memory_order_release); +} + +void syscall_hist_init(void) +{ + pthread_once(&hist_init_once, hist_init_resolve); +} + +bool syscall_hist_enabled(void) +{ + syscall_hist_init(); + return atomic_load_explicit(&hist_mode, memory_order_acquire) == + HIST_RECORD; +} + +uint64_t syscall_hist_now_ns(void) +{ + if (!syscall_hist_enabled()) + return 0; + struct timespec ts; + if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) + return 0; + return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec; +} + +void syscall_hist_record(int nr, uint64_t ns) +{ + /* Enter the recording window before the mode probe. The matching + * decrement runs on every exit path. syscall_hist_dump waits for the + * counter to drain to zero after flipping mode to OFF; that wait is + * what guarantees the dump's reads see no further updates. + */ + atomic_fetch_add_explicit(&hist_active_recorders, 1, memory_order_acquire); + if (atomic_load_explicit(&hist_mode, memory_order_acquire) != HIST_RECORD) + goto leave; + if (nr < 0 || nr >= HIST_TABLE_SIZE) + goto leave; + + /* Cheap-load probe before the clock_gettime: hist_first_ns is set + * exactly once at the first record, so subsequent records skip the + * syscall entirely. Without this gate, every record paid for a + * CLOCK_MONOTONIC read whose result was thrown away by the failing + * CAS. + */ + if (atomic_load_explicit(&hist_first_ns, memory_order_relaxed) == 0) { + uint64_t expected = 0; + atomic_compare_exchange_strong_explicit( + &hist_first_ns, &expected, syscall_hist_now_ns(), + memory_order_relaxed, memory_order_relaxed); + } + + hist_slot_t *slot = &hist_table[nr]; + atomic_fetch_add_explicit(&slot->count, 1, memory_order_relaxed); + atomic_fetch_add_explicit(&slot->total_ns, ns, memory_order_relaxed); + + /* CAS-loop max keeps the slot lock-free under contention from sibling + * vCPUs racing on the same syscall number. Reads use relaxed order + * because the dump only consumes them after the active-recorder count + * drains. + */ + uint64_t prev = atomic_load_explicit(&slot->max_ns, memory_order_relaxed); + while (ns > prev) { + if (atomic_compare_exchange_weak_explicit(&slot->max_ns, &prev, ns, + memory_order_relaxed, + memory_order_relaxed)) + break; + } +leave: + atomic_fetch_sub_explicit(&hist_active_recorders, 1, memory_order_release); +} + +void syscall_hist_freeze(const char *reason) +{ + /* Publish the reason BEFORE the state transition so a concurrent dump + * that wins the FROZEN -> OFF exchange cannot observe HIST_FROZEN with + * a NULL reason and print the wrong dump header. The store on + * hist_freeze_reason uses release ordering; the freeze CAS below pairs + * with the dump path's acquire load on hist_mode, so any thread that + * sees the CAS result also sees the reason that preceded it. + */ + int probe = atomic_load_explicit(&hist_mode, memory_order_acquire); + if (probe != HIST_RECORD) + return; + atomic_store_explicit(&hist_freeze_reason, reason ? reason : "frozen", + memory_order_release); + int expected = HIST_RECORD; + atomic_compare_exchange_strong_explicit(&hist_mode, &expected, HIST_FROZEN, + memory_order_release, + memory_order_acquire); +} + +void syscall_hist_disable(void) +{ + /* Mark init as already-resolved so the lazy path in syscall_hist_enabled + * does not overwrite the OFF state with a re-read of the env var. + */ + pthread_once(&hist_init_once, hist_init_resolve); + atomic_store_explicit(&hist_mode, HIST_OFF, memory_order_release); +} + +/* Sort key for the dump. Keys carry the slot index so the qsort comparator + * can resolve names without consulting hist_table again. + */ +typedef struct { + int nr; + uint64_t count; + uint64_t total_ns; + uint64_t max_ns; +} hist_row_t; + +static int hist_row_cmp(const void *a, const void *b) +{ + const hist_row_t *ra = a; + const hist_row_t *rb = b; + if (ra->total_ns < rb->total_ns) + return 1; + if (ra->total_ns > rb->total_ns) + return -1; + if (ra->count < rb->count) + return 1; + if (ra->count > rb->count) + return -1; + return ra->nr - rb->nr; +} + +void syscall_hist_dump(void) +{ + int mode = atomic_load_explicit(&hist_mode, memory_order_acquire); + if (mode == HIST_OFF) + return; + + /* One-shot: flip to OFF so a second call (cleanup from forked child + * after the parent already dumped, etc) is a clean no-op. + */ + if (mode == HIST_RECORD) { + int expected = HIST_RECORD; + atomic_compare_exchange_strong_explicit( + &hist_mode, &expected, HIST_FROZEN, memory_order_release, + memory_order_acquire); + } + int prev = + atomic_exchange_explicit(&hist_mode, HIST_OFF, memory_order_acq_rel); + if (prev == HIST_OFF) + return; + + /* Wait for any in-flight recorders to retire. A sibling vCPU that + * already incremented hist_active_recorders and observed the prior + * RECORD mode must finish its atomic_fetch_add on the slot before we + * read it; otherwise the dump can lose an update or read torn + * intermediate values. The spin is bounded by the slowest recorder + * (a few hundred ns) and only runs during the one-shot dump. + */ + while (atomic_load_explicit(&hist_active_recorders, memory_order_acquire) > + 0) { + /* sched_yield is not portable to relaxed memory models the same + * way pause is on x86; on aarch64 macOS the loop iteration cost + * is dominated by the atomic load anyway. No need for an explicit + * yield -- the spin terminates as soon as the in-flight recorders + * retire. + */ + } + + hist_row_t rows[HIST_TABLE_SIZE]; + int nrows = 0; + uint64_t total_count = 0; + uint64_t total_ns = 0; + for (int i = 0; i < HIST_TABLE_SIZE; i++) { + uint64_t cnt = + atomic_load_explicit(&hist_table[i].count, memory_order_relaxed); + if (cnt == 0) + continue; + rows[nrows].nr = i; + rows[nrows].count = cnt; + rows[nrows].total_ns = + atomic_load_explicit(&hist_table[i].total_ns, memory_order_relaxed); + rows[nrows].max_ns = + atomic_load_explicit(&hist_table[i].max_ns, memory_order_relaxed); + total_count += cnt; + total_ns += rows[nrows].total_ns; + nrows++; + } + + if (nrows == 0) + return; + + qsort(rows, (size_t) nrows, sizeof(rows[0]), hist_row_cmp); + + const char *reason = + atomic_load_explicit(&hist_freeze_reason, memory_order_acquire); + fprintf(stderr, "=== syscall histogram (%s) ===\n", + reason ? reason : "guest exit"); + fprintf(stderr, "%6s %12s %10s %10s %s\n", "count", "total_ms", "avg_us", + "max_us", "name"); + for (int i = 0; i < nrows; i++) { + double total_ms = (double) rows[i].total_ns / 1000000.0; + double avg_us = rows[i].count ? (double) rows[i].total_ns / + (double) rows[i].count / 1000.0 + : 0.0; + double max_us = (double) rows[i].max_ns / 1000.0; + const char *name = hist_names[rows[i].nr]; + if (name) + fprintf(stderr, "%6llu %12.3f %10.2f %10.2f %s\n", + (unsigned long long) rows[i].count, total_ms, avg_us, + max_us, name); + else + fprintf(stderr, "%6llu %12.3f %10.2f %10.2f SYS_%d\n", + (unsigned long long) rows[i].count, total_ms, avg_us, + max_us, rows[i].nr); + } + fprintf(stderr, "total: %llu syscalls, %.3f ms\n", + (unsigned long long) total_count, (double) total_ns / 1000000.0); + + /* Wall-clock span from the first record to dump time. Lets the + * reader compare syscall total against guest wall time and see + * what fraction of execution was spent inside syscalls. Only + * meaningful when at least one record landed; first_ns is set + * exactly once by the first successful record CAS. + */ + uint64_t first_ns = + atomic_load_explicit(&hist_first_ns, memory_order_relaxed); + if (first_ns) { + struct timespec ts; + if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) { + uint64_t now_ns = + (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec; + if (now_ns > first_ns) + fprintf( + stderr, "wall: %.3f ms (syscalls = %.1f%% of wall)\n", + (double) (now_ns - first_ns) / 1000000.0, + 100.0 * (double) total_ns / (double) (now_ns - first_ns)); + } + } +} diff --git a/src/debug/syscall-hist.h b/src/debug/syscall-hist.h new file mode 100644 index 0000000..9e51f57 --- /dev/null +++ b/src/debug/syscall-hist.h @@ -0,0 +1,68 @@ +/* Dynamic-linker startup syscall histogram + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Per-syscall count + total + max latency, captured during the EL0 syscall + * storm that dominates dynamic-linker bring-up. Opt-in via + * ELFUSE_STARTUP_TRACE=syscalls (or =all alongside the existing per-step VM + * bring-up tracer). Recording stops at the first successful execve so the + * dump reflects pre-execve startup; without execve it dumps on guest exit. + * + * Disabled cost is one branch + one global load per syscall_dispatch entry + * because syscall_hist_enabled() resolves the env once and caches the + * result. Enabled cost is two CLOCK_MONOTONIC reads per syscall plus three + * relaxed atomic adds. + */ + +#ifndef ELFUSE_SYSCALL_HIST_H +#define ELFUSE_SYSCALL_HIST_H + +#include +#include + +/* Parse ELFUSE_STARTUP_TRACE once; safe to call repeatedly. Must be called + * before any syscall is dispatched so the enabled flag is stable for the + * lifetime of the process. + */ +void syscall_hist_init(void); + +/* Fast probe used by syscall_dispatch to decide whether to grab a start + * timestamp. Returns false after syscall_hist_freeze() so the recording + * stops cleanly at the first execve. + */ +bool syscall_hist_enabled(void); + +/* Monotonic timestamp source. Returns 0 when the histogram is disabled so + * the caller can short-circuit without a branch on the env-var cache. + */ +uint64_t syscall_hist_now_ns(void); + +/* Record one syscall observation. nr is the Linux syscall number; ns is the + * elapsed time inside the dispatcher. Called from every vCPU thread, so the + * counters update under __atomic_fetch_add. No-op if disabled or frozen. + */ +void syscall_hist_record(int nr, uint64_t ns); + +/* Stop recording but keep the captured data ready to dump. Called from the + * successful-execve path so steady-state syscall traffic does not pollute + * the startup picture. The reason string survives until syscall_hist_dump + * runs and appears in the dump header. + */ +void syscall_hist_freeze(const char *reason); + +/* Force-disable the histogram in this process even if ELFUSE_STARTUP_TRACE + * is set. Used by fork-child bring-up: the child resumes from a parent + * snapshot, so its first syscalls are steady-state, not dynamic-linker + * bring-up. Without this, the inherited env var would trigger lazy init in + * the child and pollute the parent's dump if both share stderr. + */ +void syscall_hist_disable(void); + +/* Emit a human-readable summary to stderr, sorted by total ns descending. + * Idempotent: subsequent calls are no-ops. Safe to call from cleanup paths + * that may run more than once. + */ +void syscall_hist_dump(void); + +#endif /* ELFUSE_SYSCALL_HIST_H */ diff --git a/src/main.c b/src/main.c index 30eb23d..0b725c9 100644 --- a/src/main.c +++ b/src/main.c @@ -41,6 +41,7 @@ #include "debug/gdbstub.h" #include "debug/log.h" +#include "debug/syscall-hist.h" static int parse_int_arg(const char *s, int min, int max, int *out) { @@ -158,6 +159,10 @@ static int host_dc_zva_assert(void) int main(int argc, char **argv) { log_init(); + /* Resolve ELFUSE_STARTUP_TRACE before any guest syscall can fire so the + * histogram captures the very first dynamic-linker openat. + */ + syscall_hist_init(); bool verbose = false; /* x86_64-via-Rosetta is on by default; --no-rosetta or @@ -506,6 +511,12 @@ int main(int argc, char **argv) /* Tear down debugger state before freeing guest/vCPU resources. */ gdb_stub_shutdown(); + /* Dump the startup histogram before guest_destroy so any cleanup-path + * syscalls (closing host fds, unmapping the slab) do not appear in the + * captured set. The dump is a no-op when ELFUSE_STARTUP_TRACE=syscalls + * was not requested. + */ + syscall_hist_dump(); cleanup_main_resources(&g, guest_initialized, &sysroot_mount, have_host_cwd ? host_cwd : NULL, guest_argv, guest_argc, elf_path, sysroot_path); diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index 3f8c4a5..5bdb813 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -45,6 +45,7 @@ #include "syscall/signal.h" #include "debug/log.h" +#include "debug/syscall-hist.h" /* fork_child_main. */ @@ -74,6 +75,13 @@ int fork_child_main(int ipc_fd, if (verbose) log_set_level(LOG_DEBUG); + /* The startup syscall histogram captures dynamic-linker bring-up of the + * top-level guest only; the child resumes from the parent's snapshot, so + * its first syscalls would be steady-state traffic that confuses the + * dump. Disable before any guest syscall is dispatched. + */ + syscall_hist_disable(); + /* Reset static process/thread/futex state before receiving the parent * snapshot so the incoming metadata survives child restore. */ diff --git a/src/syscall/path.c b/src/syscall/path.c index c82b46f..5a2531f 100644 --- a/src/syscall/path.c +++ b/src/syscall/path.c @@ -141,10 +141,18 @@ int path_translate_at(guest_fd_t dirfd, return -1; } - memset(tx, 0, sizeof(*tx)); + /* Only the fields read on the no-rewrite fast path need explicit + * defaults; proc_path / guest_buf / host_buf are read-after-written + * by their respective resolvers. memset of all three 4 KiB buffers + * would add ~12 KiB of zeroing per call, which became visible at + * ~30 calls per dynamic-linker startup after the sidecar caches + * dropped the rest of the openat overhead. + */ tx->guest_path = path; tx->intercept_path = path; tx->host_path = path; + tx->proc_resolved = 0; + tx->fuse_path = false; if (!path) return 0; diff --git a/src/syscall/sidecar.c b/src/syscall/sidecar.c index 09c026e..8a01d25 100644 --- a/src/syscall/sidecar.c +++ b/src/syscall/sidecar.c @@ -45,6 +45,179 @@ */ static pthread_mutex_t sidecar_global_lock = PTHREAD_MUTEX_INITIALIZER; +/* Per-directory cache: did this directory lack a sidecar index file + * for the current directory metadata version? Keyed by (st_dev, st_ino) so a + * renamed or moved directory does not leak a stale answer. Sidecar's + * case-fold walker visits every parent directory of every translated path; + * during dynamic-linker bring-up that walker fires per guest openat and + * dominates startup (histogram showed openat at 61% of getent's 7.5 ms warm + * path). The cache lets the common "no index here" answer return after a + * 5 us fstat instead of a ~30 us openat per directory traversed. + * + * 64 slots, single-level open-addressing (last writer wins on collision). + * The directory ctime/mtime pair is part of the cache key: publishing an + * index from another elfuse process changes the directory entry set, so a + * stale ABSENT entry becomes UNKNOWN on the next lookup. + */ +enum { + SIDECAR_IDX_UNKNOWN = 0, + SIDECAR_IDX_ABSENT = 1, +}; +typedef struct { + dev_t dev; + ino_t ino; + struct timespec mtime; + struct timespec ctime; + uint8_t state; +} sidecar_idx_slot_t; +#define SIDECAR_IDX_CACHE_SLOTS 64 +static sidecar_idx_slot_t sidecar_idx_cache[SIDECAR_IDX_CACHE_SLOTS]; +static pthread_mutex_t sidecar_idx_cache_lock = PTHREAD_MUTEX_INITIALIZER; + +static size_t sidecar_idx_cache_slot(dev_t dev, ino_t ino) +{ + /* Mix dev into ino so two filesystems with overlapping inode numbers + * do not pin the same slot. The golden-ratio multiplier scatters small + * inode numbers across the table without needing a real hash. + */ + uint64_t key = (uint64_t) ino ^ ((uint64_t) dev * 0x9E3779B97F4A7C15ULL); + return (size_t) (key % SIDECAR_IDX_CACHE_SLOTS); +} + +static bool sidecar_idx_cache_matches(const sidecar_idx_slot_t *slot, + const struct stat *st) +{ + return slot->state != SIDECAR_IDX_UNKNOWN && slot->dev == st->st_dev && + slot->ino == st->st_ino && + slot->mtime.tv_sec == st->st_mtimespec.tv_sec && + slot->mtime.tv_nsec == st->st_mtimespec.tv_nsec && + slot->ctime.tv_sec == st->st_ctimespec.tv_sec && + slot->ctime.tv_nsec == st->st_ctimespec.tv_nsec; +} + +static int sidecar_idx_cache_lookup_stat(const struct stat *st) +{ + size_t slot = sidecar_idx_cache_slot(st->st_dev, st->st_ino); + int state = SIDECAR_IDX_UNKNOWN; + pthread_mutex_lock(&sidecar_idx_cache_lock); + if (sidecar_idx_cache_matches(&sidecar_idx_cache[slot], st)) + state = sidecar_idx_cache[slot].state; + pthread_mutex_unlock(&sidecar_idx_cache_lock); + return state; +} + +static void sidecar_idx_cache_set_stat(const struct stat *st, int state) +{ + size_t slot = sidecar_idx_cache_slot(st->st_dev, st->st_ino); + pthread_mutex_lock(&sidecar_idx_cache_lock); + sidecar_idx_cache[slot].dev = st->st_dev; + sidecar_idx_cache[slot].ino = st->st_ino; + sidecar_idx_cache[slot].mtime = st->st_mtimespec; + sidecar_idx_cache[slot].ctime = st->st_ctimespec; + sidecar_idx_cache[slot].state = (uint8_t) state; + pthread_mutex_unlock(&sidecar_idx_cache_lock); +} + +/* Roll the cache entry back to UNKNOWN. Used by the index publish path + * before the renameat so a concurrent reader entering the post-rename + * window cannot consume a stale ABSENT and skip the openat. UNKNOWN forces + * the reader to consult the filesystem. + */ +static void sidecar_idx_cache_invalidate(int dirfd) +{ + struct stat st; + if (fstat(dirfd, &st) < 0) + return; + sidecar_idx_cache_set_stat(&st, SIDECAR_IDX_UNKNOWN); +} + +/* Cached sysroot dirfd. sidecar_open_base opens the sysroot directory on + * every translated absolute path; for dynamic-linker bring-up that fires + * once per openat / fstat / access / readlink. Caching the host fd and + * handing the walker a dup turns that ~30 us open into a ~5 us dup. The + * cache invalidates lazily by comparing the path string -- proc_set_sysroot + * is rare (once at startup, once on fork-child IPC restore), so the + * snapshot+strcmp on every call is still a net win over the open it + * replaces. Concurrent vCPU lookups serialize on the lock for the cache check + * and dup. + */ +static int sidecar_sysroot_cached_fd = -1; +static char sidecar_sysroot_cached_path[LINUX_PATH_MAX] = {0}; +static dev_t sidecar_sysroot_cached_dev; +static ino_t sidecar_sysroot_cached_ino; +static pthread_mutex_t sidecar_sysroot_cached_lock = PTHREAD_MUTEX_INITIALIZER; + +static int sidecar_open_sysroot_cached(const char *path) +{ + pthread_mutex_lock(&sidecar_sysroot_cached_lock); + int cached = sidecar_sysroot_cached_fd; + if (cached >= 0 && !strcmp(path, sidecar_sysroot_cached_path)) { + /* Path text alone is not a sufficient cache key: a host-side + * rename or replace can rebind the same path to a different + * inode while the cached fd still resolves to the original. + * Stat the path on every hit and accept the cache only when + * (dev, ino) still match the inode captured at fill time. The + * stat is one host syscall per call, much cheaper than the + * open it replaces. + * + * Race window: a host-side mutation between this stat and the + * dup below can leave the cache returning the pre-mutation fd + * even though a fresh open(path) at that instant would resolve + * the post-mutation binding. The window is microseconds. It + * only matters under adversarial host-side sysroot mutation; + * normal sysroot configuration is canonicalized once at + * proc_set_sysroot time and never changes. Closing it + * adversarially would require dropping the cache, which gives + * back the ~25 us per-call win; not worth it absent a real + * reproducer. + */ + struct stat current; + if (stat(path, ¤t) == 0 && + current.st_dev == sidecar_sysroot_cached_dev && + current.st_ino == sidecar_sysroot_cached_ino) { + cached = fcntl(cached, F_DUPFD_CLOEXEC, 0); + pthread_mutex_unlock(&sidecar_sysroot_cached_lock); + return cached; + } + /* Inode mismatch: fall through to refresh below. */ + } + if (sidecar_sysroot_cached_fd >= 0) { + close(sidecar_sysroot_cached_fd); + sidecar_sysroot_cached_fd = -1; + sidecar_sysroot_cached_path[0] = '\0'; + } + int fresh = open(path, O_RDONLY | O_DIRECTORY | O_CLOEXEC); + if (fresh < 0) { + pthread_mutex_unlock(&sidecar_sysroot_cached_lock); + return -1; + } + /* Capture the inode of the freshly-opened dir so subsequent cache + * hits can validate against it. Using fstat on the open fd gives + * the same (dev, ino) as a path stat would, with no second walk. + */ + struct stat fresh_st; + if (fstat(fresh, &fresh_st) < 0) { + close(fresh); + pthread_mutex_unlock(&sidecar_sysroot_cached_lock); + return -1; + } + /* CLOEXEC dup: a concurrent posix_spawn from another vCPU thread + * must not inherit the sysroot dirfd into the fork-child. F_DUPFD_CLOEXEC + * sets the flag atomically with the dup, closing the inheritance window + * that plain dup() leaves open. + */ + int cache_fd = fcntl(fresh, F_DUPFD_CLOEXEC, 0); + if (cache_fd >= 0) { + sidecar_sysroot_cached_fd = cache_fd; + sidecar_sysroot_cached_dev = fresh_st.st_dev; + sidecar_sysroot_cached_ino = fresh_st.st_ino; + str_copy_trunc(sidecar_sysroot_cached_path, path, + sizeof(sidecar_sysroot_cached_path)); + } + pthread_mutex_unlock(&sidecar_sysroot_cached_lock); + return fresh; +} + typedef struct { char *guest_name; char token[SIDECAR_TOKEN_NAME_LEN + 1]; @@ -163,10 +336,25 @@ static int sidecar_load_index(int dirfd, sidecar_index_t *index) { memset(index, 0, sizeof(*index)); + /* Per-dir absence cache: skip the openat round-trip only while the + * directory metadata still matches the snapshot that produced ENOENT. + * Another elfuse process can publish the index through the fcntl-locked + * writer, so cross-process invalidation relies on the directory mtime/ctime + * changing when the sidecar index appears. + */ + struct stat dir_st; + bool have_dir_st = fstat(dirfd, &dir_st) == 0; + if (have_dir_st && + sidecar_idx_cache_lookup_stat(&dir_st) == SIDECAR_IDX_ABSENT) + return 0; + int fd = openat(dirfd, SIDECAR_INDEX_NAME, O_RDONLY | O_CLOEXEC); if (fd < 0) { - if (errno == ENOENT) + if (errno == ENOENT) { + if (have_dir_st) + sidecar_idx_cache_set_stat(&dir_st, SIDECAR_IDX_ABSENT); return 0; + } return -1; } @@ -355,7 +543,7 @@ static int sidecar_open_base(guest_fd_t dirfd, errno = ENAMETOOLONG; return -1; } - *base_fd = open(sysroot, O_RDONLY | O_DIRECTORY | O_CLOEXEC); + *base_fd = sidecar_open_sysroot_cached(sysroot); if (*base_fd < 0) return -1; *absolute = true; @@ -900,6 +1088,14 @@ static int sidecar_write_locked_index(int parent_dirfd, errno = saved_errno; return -1; } + /* Invalidate the cache to UNKNOWN BEFORE the rename so a concurrent + * walker that loses the race to read the new file does not return a + * stale ABSENT verdict from this slot. A reader entering the window + * between rename and post-rename mark would otherwise skip the openat + * and miss the freshly-published index. UNKNOWN forces the reader to + * do the openat and observe the new file directly. + */ + sidecar_idx_cache_invalidate(parent_dirfd); if (renameat(parent_dirfd, SIDECAR_INDEX_TMP_NAME, parent_dirfd, SIDECAR_INDEX_NAME) < 0) { int saved_errno = errno; diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index be97787..4be4859 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -64,6 +64,8 @@ #include "core/shim-globals.h" +#include "debug/syscall-hist.h" + /* Generated from src/syscall/dispatch.tbl into $(BUILD_DIR). */ #include "dispatch.h" @@ -1737,6 +1739,11 @@ int syscall_dispatch(hv_vcpu_t vcpu, guest_t *g, int *exit_code, bool verbose) int nr = (int) x8; const syscall_entry_t *entry = NULL; + /* Per-syscall histogram for the dynamic-linker bring-up storm. Zero when + * disabled so the bottom-of-dispatch record path is a single branch. + */ + uint64_t hist_start_ns = syscall_hist_now_ns(); + if (!verbose) { if (nr == SYS_getpid || nr == SYS_getppid || nr == SYS_gettid || nr == SYS_getpgid || nr == SYS_getuid || nr == SYS_geteuid || @@ -1850,6 +1857,17 @@ int syscall_dispatch(hv_vcpu_t vcpu, guest_t *g, int *exit_code, bool verbose) should_exit = true; result = 0; /* Not written back, but keep clean */ } else if (result == SYSCALL_EXEC_HAPPENED) { + if (hist_start_ns) { + /* Guard the subtraction: syscall_hist_now_ns returns 0 + * if clock_gettime fails. An unsigned 0 minus a non-zero + * start would underflow to a huge value and pollute the + * histogram with a bogus latency sample. + */ + uint64_t hist_end_ns = syscall_hist_now_ns(); + if (hist_end_ns >= hist_start_ns) + syscall_hist_record(nr, hist_end_ns - hist_start_ns); + syscall_hist_freeze("frozen at first execve"); + } return SYSCALL_EXEC_HAPPENED; } } else { @@ -1912,5 +1930,11 @@ int syscall_dispatch(hv_vcpu_t vcpu, guest_t *g, int *exit_code, bool verbose) tlbi_request_clear(); } + if (hist_start_ns) { + uint64_t hist_end_ns = syscall_hist_now_ns(); + if (hist_end_ns >= hist_start_ns) + syscall_hist_record(nr, hist_end_ns - hist_start_ns); + } + return should_exit; }