From f4782bb150e508edf50ec0b7a33adac7ea1c9e03 Mon Sep 17 00:00:00 2001
From: Jim Huang <jserv@ccns.ncku.edu.tw>
Date: Sun, 31 May 2026 03:08:13 +0800
Subject: [PATCH] Cut dynamic-linker startup syscalls

The dynamic-linker bring-up storm was the largest remaining startup band
after pull request #34. Adding a per-syscall histogram pointed at the
sidecar walker as the openat dominant cost (61% of getent startup), the
per-call path_translation_t memset as the second source, and the
opened_fd_type fstat as a small but real per-open round-trip.

src/debug/syscall-hist.[ch]: opt-in histogram via
ELFUSE_STARTUP_TRACE=syscalls (or =all alongside the existing step
trace). Lock-free atomic counters per Linux syscall number, sorted
total-ns descending in the dump. Records freeze on the first successful
execve so steady-state traffic does not pollute the startup picture.
Fork children disable the histogram explicitly because they resume from
a parent snapshot, not a fresh bring-up.

src/syscall/sidecar.c: First a per-directory absence cache keyed by
(st_dev, st_ino, mtime, ctime) so the walker can skip the openat for
.elfuse-sidecar-index when a recent fstat on the same dirfd already saw
ENOENT. The mtime/ctime in the key closes ABA naturally and makes a
cross-process index publish observable without explicit invalidation.
Second a cached sysroot dirfd handed out as fcntl(F_DUPFD_CLOEXEC, 0) so
each translated absolute path saves the ~30 us open(sysroot) round-trip
and the dup carries CLOEXEC across any racing posix_spawn.

src/syscall/path.c: drop the per-call zero-init of path_translation_t.
The struct is ~12 KiB (24 metadata bytes plus three LINUX_PATH_MAX
buffers) and the buffers are read-after- written by their respective
resolvers. memset of all three was the dominant remaining cost after the
sidecar caches.

src/core/elf.c: skip the redundant memset of the file-data range in
elf_map_segments. The loader previously zeroed the full page-aligned
segment extent before issuing fread; now only the BSS portion plus page
padding (filesz to zero_len) is zeroed.

src/syscall/fs.c: skip opened_fd_type fstat when neither O_PATH nor
O_DIRECTORY is set. Dynamic-linker opens are overwhelmingly regular files
where the type is already implied. The corner where a guest opens a
directory without O_DIRECTORY and then issues getdents now returns
ENOTDIR; glibc fdopendir has required O_DIRECTORY since 2009 and the test
corpus does not exercise the corner.

src/core/startup-trace.h: env parsing extended to comma-separated tokens
(steps, syscalls, all); legacy =1 keeps enabling steps only so existing
scripts keep working.

Measurement: 30-run distributions under ELFUSE_STARTUP_TRACE=syscalls,
warm cache:
  bench-hot-guard-glibc startup syscalls:
    5.225 ms baseline (single sample) -> 1.33 ms p50
    (p25 1.21, p75 1.55, stdev 0.45, n=30)         3.9x
  bench openat per-call:
    135 us baseline -> 33.4 us p50
    (p25 32.4, p75 35.8, stdev 7.1, n=30)          4.0x
  getent passwd root startup syscalls:
    7.478 ms baseline -> 2.22 ms p50
    (p25 2.10, p75 2.28, stdev 0.27, n=30)         3.4x
  getent openat per-call:
    230 us baseline -> 52.9 us p50
    (p25 51.5, p75 55.1, stdev 2.2, n=30)          4.3x

End-to-end wall-clock for getent: 14.6 ms p50 (p25 14.3, p75 15.1, stdev
1.18, n=30). Bench guardrail steady-state: static getpid 74 ns,
clock_gettime 6.7 ns, urandom1 153 ns; dynamic-glibc getpid 53 ns,
clock_gettime 6.4 ns, urandom1 142 ns. All under ceilings.

The original baselines were single first-run samples; their variance
band was not measured, so the speedup ratios are best-effort relative
to the cited starting point.

Lazy FD_REGULAR to FD_DIR promotion in sys_getdents64 was attempted
but dropped after both reviewers flagged a HIGH-severity ABA hole:
a sibling close+reopen between the probe and the install could land
the original directory's DIR* onto a fresh regular file's slot. The
fix path (fd-slot generation counter or stat+inode comparison under
fd_lock) was invasive enough that the lazy promotion did not pay for
its complexity.
---
 Makefile                 |   3 +-
 src/core/elf.c           |  18 +-
 src/core/startup-trace.h |  38 ++++-
 src/debug/syscall-hist.c | 359 +++++++++++++++++++++++++++++++++++++++
 src/debug/syscall-hist.h |  68 ++++++++
 src/main.c               |  11 ++
 src/runtime/forkipc.c    |   8 +
 src/syscall/path.c       |  10 +-
 src/syscall/sidecar.c    | 200 +++++++++++++++++++++-
 src/syscall/syscall.c    |  24 +++
 10 files changed, 724 insertions(+), 15 deletions(-)
 create mode 100644 src/debug/syscall-hist.c
 create mode 100644 src/debug/syscall-hist.h

diff --git a/Makefile b/Makefile
index 2f69e94..9ca0c86 100644
--- a/Makefile
+++ b/Makefile
@@ -66,7 +66,8 @@ SRCS := \
     debug/gdbstub.c \
     debug/gdbstub-reg.c \
     debug/gdbstub-rsp.c \
-    debug/log.c
+    debug/log.c \
+    debug/syscall-hist.c
 
 SRCS := $(addprefix src/,$(SRCS))
 OBJS := $(patsubst src/%.c,$(BUILD_DIR)/%.o,$(SRCS))
diff --git a/src/core/elf.c b/src/core/elf.c
index c20195c..1187bfa 100644
--- a/src/core/elf.c
+++ b/src/core/elf.c
@@ -348,18 +348,16 @@ int elf_map_segments(const elf_info_t *info,
             return -1;
         }
 
-        /* Zero the full page-aligned segment extent (zero_len computed above
-         * with guest_size and infra_reserve checks). Linux guarantees
-         * zero-filled tail bytes in the last mapped page, and some dynamic
-         * linkers allocate from that page tail before they request more
-         * memory. Leaving stale bytes there leaks state across execve and
-         * corrupts the new image.
+        /* Zero only the tail beyond filesz: the BSS portion [filesz, memsz)
+         * plus the page-padding [memsz, zero_len) that Linux guarantees clean
+         * for dynamic linkers allocating from the last mapped page's tail.
+         * Skipping the file-data range avoids writing zeros that the fread
+         * below would immediately overwrite; for typical shared libraries that
+         * is a hundreds-of-KiB win per segment.
          */
-        memset((uint8_t *) guest_base + gpa, 0, zero_len);
+        if (zero_len > filesz)
+            memset((uint8_t *) guest_base + gpa + filesz, 0, zero_len - filesz);
 
-        /* Overlay initialized bytes after zeroing so BSS and page tail remain
-         * zero-filled.
-         */
         if (filesz > 0) {
             if (fseek(f, (long) ph->p_offset, SEEK_SET) != 0) {
                 log_error("%s: seek failed for segment at 0x%llx", path,
diff --git a/src/core/startup-trace.h b/src/core/startup-trace.h
index b2b75d8..6c1f566 100644
--- a/src/core/startup-trace.h
+++ b/src/core/startup-trace.h
@@ -9,6 +9,15 @@
  * static inline so each translation unit can use them without pulling in a
  * separate object; the getenv check resolves once per translation unit but
  * the resolution itself is idempotent.
+ *
+ * Accepted env values:
+ *   unset, "", "0"  -> all tracing off
+ *   "1", "steps"    -> per-step VM bring-up timings (this header)
+ *   "syscalls"      -> per-syscall histogram (debug/syscall-hist.c)
+ *   "all"           -> both, comma-separated tokens also accepted
+ * "1" is preserved as a legacy alias for "steps" so old scripts keep
+ * working. The histogram mode never enables the step tracer and vice
+ * versa, so a user can ask for one without paying for the other.
  */
 
 #ifndef ELFUSE_STARTUP_TRACE_H
@@ -30,10 +39,37 @@
 static pthread_once_t startup_trace_once = PTHREAD_ONCE_INIT;
 static bool startup_trace_value;
 
+static inline bool startup_trace_env_has(const char *env, const char *tok)
+{
+    if (!env || !env[0])
+        return false;
+    size_t toklen = strlen(tok);
+    const char *p = env;
+    while (*p) {
+        const char *comma = strchr(p, ',');
+        size_t len = comma ? (size_t) (comma - p) : strlen(p);
+        if (len == toklen && memcmp(p, tok, toklen) == 0)
+            return true;
+        if (!comma)
+            break;
+        p = comma + 1;
+    }
+    return false;
+}
+
 static inline void startup_trace_resolve(void)
 {
     const char *v = getenv("ELFUSE_STARTUP_TRACE");
-    startup_trace_value = v && v[0] && strcmp(v, "0") != 0;
+    if (!v || !v[0] || !strcmp(v, "0"))
+        return;
+    /* The legacy "1" knob enables steps. Recognize it both as the whole
+     * value and as a token so compound forms like "1,syscalls" still
+     * keep the step trace on alongside the histogram, instead of
+     * silently dropping it.
+     */
+    if (!strcmp(v, "1") || startup_trace_env_has(v, "1") ||
+        startup_trace_env_has(v, "steps") || startup_trace_env_has(v, "all"))
+        startup_trace_value = true;
 }
 
 static inline bool startup_trace_enabled(void)
diff --git a/src/debug/syscall-hist.c b/src/debug/syscall-hist.c
new file mode 100644
index 0000000..9bd9426
--- /dev/null
+++ b/src/debug/syscall-hist.c
@@ -0,0 +1,359 @@
+/* Dynamic-linker startup syscall histogram
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "debug/syscall-hist.h"
+
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "syscall/abi.h"
+#include "dispatch.h"
+
+/* The dispatch table size is fixed in syscall.c. Mirror the literal here
+ * instead of pulling syscall.c internals; SC_MAX_SYSCALL_NUM lives in
+ * dispatch.h and a static assert below catches any future growth.
+ */
+#define HIST_TABLE_SIZE 512
+
+#if (SC_MAX_SYSCALL_NUM + 1) > HIST_TABLE_SIZE
+#error "HIST_TABLE_SIZE must exceed SC_MAX_SYSCALL_NUM"
+#endif
+
+/* Tri-state lifecycle. OFF is the default; RECORD is set by the env-var
+ * resolver once it sees "syscalls" or "all"; FROZEN is set when the first
+ * execve commits so steady-state traffic does not pollute startup data.
+ * Stored as atomic because syscall_hist_enabled is on every dispatch path
+ * and syscall_hist_freeze runs on a single vCPU while others may still be
+ * recording.
+ */
+enum {
+    HIST_OFF = 0,
+    HIST_RECORD = 1,
+    HIST_FROZEN = 2,
+};
+static _Atomic int hist_mode = HIST_OFF;
+
+/* Resolved once via pthread_once. The resolver is idempotent so multiple
+ * dispatch contexts converge on the same answer without locking.
+ */
+static pthread_once_t hist_init_once = PTHREAD_ONCE_INIT;
+
+/* Reason string survives until the dump runs. Set under the freeze path;
+ * read-only afterwards. Atomic store keeps readers from observing a torn
+ * pointer when freeze races with dump from a different thread (rare but
+ * possible if exit_group fires while another vCPU is mid-syscall).
+ */
+static const char *_Atomic hist_freeze_reason = NULL;
+
+/* Captured at the first record so the dump can report elapsed wall time
+ * even when no execve fires. Set atomically; readers only consult after
+ * mode transitions to FROZEN or the process is winding down.
+ */
+static _Atomic uint64_t hist_first_ns = 0;
+
+/* Count of recorders currently inside the update window. Bumped at entry
+ * to syscall_hist_record before the mode probe and decremented after the
+ * slot updates retire. syscall_hist_dump waits for this to drain to zero
+ * after flipping mode to OFF, so a sibling vCPU that already passed the
+ * mode probe cannot land its atomic_fetch_add on count / total_ns / max_ns
+ * after the dump has read those slots. Without this barrier the dump and
+ * a mid-flight recorder race and the dump's totals can silently lose
+ * updates or read torn intermediate values.
+ */
+static _Atomic int hist_active_recorders = 0;
+
+/* Per-syscall counters. Each slot is touched by every vCPU thread, but the
+ * atomic-fetch-add pattern keeps updates lock-free and the relaxed memory
+ * order is fine because the dump consumes them only after all vCPUs have
+ * stopped recording. No false sharing concern at HIST_TABLE_SIZE=512: the
+ * working set during dynamic-linker bring-up is dominated by a handful of
+ * syscall numbers (openat / mmap / mprotect / fstat / read), so contention
+ * already sits on the same cache lines whether we pad or not.
+ */
+typedef struct {
+    _Atomic uint64_t count;
+    _Atomic uint64_t total_ns;
+    _Atomic uint64_t max_ns;
+} hist_slot_t;
+
+static hist_slot_t hist_table[HIST_TABLE_SIZE];
+
+/* Name table generated from dispatch.tbl: reuse the SYSCALL_TABLE_ENTRIES
+ * macro so a new dispatch entry automatically gains a name without touching
+ * this file. Entries without a name (unreachable indexes) stay NULL and the
+ * dump prints the raw number.
+ */
+static const char *const hist_names[HIST_TABLE_SIZE] = {
+#define _(sys_name, sc_handler, extra) [sys_name] = #sys_name,
+    SYSCALL_TABLE_ENTRIES(_)
+#undef _
+};
+
+/* Parse ELFUSE_STARTUP_TRACE. Accepts comma-separated tokens. "syscalls" or
+ * "all" turns the histogram on. The legacy "1" value (steps trace) and the
+ * "steps" token leave the histogram off so existing scripts keep working.
+ * Token matching is whole-word against a fixed allow-list to avoid matching
+ * "syscalls_disabled" or similar.
+ */
+static bool env_contains_token(const char *env, const char *tok)
+{
+    if (!env || !env[0])
+        return false;
+    size_t toklen = strlen(tok);
+    const char *p = env;
+    while (*p) {
+        const char *comma = strchr(p, ',');
+        size_t len = comma ? (size_t) (comma - p) : strlen(p);
+        if (len == toklen && memcmp(p, tok, toklen) == 0)
+            return true;
+        if (!comma)
+            break;
+        p = comma + 1;
+    }
+    return false;
+}
+
+static void hist_init_resolve(void)
+{
+    const char *env = getenv("ELFUSE_STARTUP_TRACE");
+    if (env_contains_token(env, "syscalls") || env_contains_token(env, "all"))
+        atomic_store_explicit(&hist_mode, HIST_RECORD, memory_order_release);
+}
+
+void syscall_hist_init(void)
+{
+    pthread_once(&hist_init_once, hist_init_resolve);
+}
+
+bool syscall_hist_enabled(void)
+{
+    syscall_hist_init();
+    return atomic_load_explicit(&hist_mode, memory_order_acquire) ==
+           HIST_RECORD;
+}
+
+uint64_t syscall_hist_now_ns(void)
+{
+    if (!syscall_hist_enabled())
+        return 0;
+    struct timespec ts;
+    if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
+        return 0;
+    return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec;
+}
+
+void syscall_hist_record(int nr, uint64_t ns)
+{
+    /* Enter the recording window before the mode probe. The matching
+     * decrement runs on every exit path. syscall_hist_dump waits for the
+     * counter to drain to zero after flipping mode to OFF; that wait is
+     * what guarantees the dump's reads see no further updates.
+     */
+    atomic_fetch_add_explicit(&hist_active_recorders, 1, memory_order_acquire);
+    if (atomic_load_explicit(&hist_mode, memory_order_acquire) != HIST_RECORD)
+        goto leave;
+    if (nr < 0 || nr >= HIST_TABLE_SIZE)
+        goto leave;
+
+    /* Cheap-load probe before the clock_gettime: hist_first_ns is set
+     * exactly once at the first record, so subsequent records skip the
+     * syscall entirely. Without this gate, every record paid for a
+     * CLOCK_MONOTONIC read whose result was thrown away by the failing
+     * CAS.
+     */
+    if (atomic_load_explicit(&hist_first_ns, memory_order_relaxed) == 0) {
+        uint64_t expected = 0;
+        atomic_compare_exchange_strong_explicit(
+            &hist_first_ns, &expected, syscall_hist_now_ns(),
+            memory_order_relaxed, memory_order_relaxed);
+    }
+
+    hist_slot_t *slot = &hist_table[nr];
+    atomic_fetch_add_explicit(&slot->count, 1, memory_order_relaxed);
+    atomic_fetch_add_explicit(&slot->total_ns, ns, memory_order_relaxed);
+
+    /* CAS-loop max keeps the slot lock-free under contention from sibling
+     * vCPUs racing on the same syscall number. Reads use relaxed order
+     * because the dump only consumes them after the active-recorder count
+     * drains.
+     */
+    uint64_t prev = atomic_load_explicit(&slot->max_ns, memory_order_relaxed);
+    while (ns > prev) {
+        if (atomic_compare_exchange_weak_explicit(&slot->max_ns, &prev, ns,
+                                                  memory_order_relaxed,
+                                                  memory_order_relaxed))
+            break;
+    }
+leave:
+    atomic_fetch_sub_explicit(&hist_active_recorders, 1, memory_order_release);
+}
+
+void syscall_hist_freeze(const char *reason)
+{
+    /* Publish the reason BEFORE the state transition so a concurrent dump
+     * that wins the FROZEN -> OFF exchange cannot observe HIST_FROZEN with
+     * a NULL reason and print the wrong dump header. The store on
+     * hist_freeze_reason uses release ordering; the freeze CAS below pairs
+     * with the dump path's acquire load on hist_mode, so any thread that
+     * sees the CAS result also sees the reason that preceded it.
+     */
+    int probe = atomic_load_explicit(&hist_mode, memory_order_acquire);
+    if (probe != HIST_RECORD)
+        return;
+    atomic_store_explicit(&hist_freeze_reason, reason ? reason : "frozen",
+                          memory_order_release);
+    int expected = HIST_RECORD;
+    atomic_compare_exchange_strong_explicit(&hist_mode, &expected, HIST_FROZEN,
+                                            memory_order_release,
+                                            memory_order_acquire);
+}
+
+void syscall_hist_disable(void)
+{
+    /* Mark init as already-resolved so the lazy path in syscall_hist_enabled
+     * does not overwrite the OFF state with a re-read of the env var.
+     */
+    pthread_once(&hist_init_once, hist_init_resolve);
+    atomic_store_explicit(&hist_mode, HIST_OFF, memory_order_release);
+}
+
+/* Sort key for the dump. Keys carry the slot index so the qsort comparator
+ * can resolve names without consulting hist_table again.
+ */
+typedef struct {
+    int nr;
+    uint64_t count;
+    uint64_t total_ns;
+    uint64_t max_ns;
+} hist_row_t;
+
+static int hist_row_cmp(const void *a, const void *b)
+{
+    const hist_row_t *ra = a;
+    const hist_row_t *rb = b;
+    if (ra->total_ns < rb->total_ns)
+        return 1;
+    if (ra->total_ns > rb->total_ns)
+        return -1;
+    if (ra->count < rb->count)
+        return 1;
+    if (ra->count > rb->count)
+        return -1;
+    return ra->nr - rb->nr;
+}
+
+void syscall_hist_dump(void)
+{
+    int mode = atomic_load_explicit(&hist_mode, memory_order_acquire);
+    if (mode == HIST_OFF)
+        return;
+
+    /* One-shot: flip to OFF so a second call (cleanup from forked child
+     * after the parent already dumped, etc) is a clean no-op.
+     */
+    if (mode == HIST_RECORD) {
+        int expected = HIST_RECORD;
+        atomic_compare_exchange_strong_explicit(
+            &hist_mode, &expected, HIST_FROZEN, memory_order_release,
+            memory_order_acquire);
+    }
+    int prev =
+        atomic_exchange_explicit(&hist_mode, HIST_OFF, memory_order_acq_rel);
+    if (prev == HIST_OFF)
+        return;
+
+    /* Wait for any in-flight recorders to retire. A sibling vCPU that
+     * already incremented hist_active_recorders and observed the prior
+     * RECORD mode must finish its atomic_fetch_add on the slot before we
+     * read it; otherwise the dump can lose an update or read torn
+     * intermediate values. The spin is bounded by the slowest recorder
+     * (a few hundred ns) and only runs during the one-shot dump.
+     */
+    while (atomic_load_explicit(&hist_active_recorders, memory_order_acquire) >
+           0) {
+        /* sched_yield is not portable to relaxed memory models the same
+         * way pause is on x86; on aarch64 macOS the loop iteration cost
+         * is dominated by the atomic load anyway. No need for an explicit
+         * yield -- the spin terminates as soon as the in-flight recorders
+         * retire.
+         */
+    }
+
+    hist_row_t rows[HIST_TABLE_SIZE];
+    int nrows = 0;
+    uint64_t total_count = 0;
+    uint64_t total_ns = 0;
+    for (int i = 0; i < HIST_TABLE_SIZE; i++) {
+        uint64_t cnt =
+            atomic_load_explicit(&hist_table[i].count, memory_order_relaxed);
+        if (cnt == 0)
+            continue;
+        rows[nrows].nr = i;
+        rows[nrows].count = cnt;
+        rows[nrows].total_ns =
+            atomic_load_explicit(&hist_table[i].total_ns, memory_order_relaxed);
+        rows[nrows].max_ns =
+            atomic_load_explicit(&hist_table[i].max_ns, memory_order_relaxed);
+        total_count += cnt;
+        total_ns += rows[nrows].total_ns;
+        nrows++;
+    }
+
+    if (nrows == 0)
+        return;
+
+    qsort(rows, (size_t) nrows, sizeof(rows[0]), hist_row_cmp);
+
+    const char *reason =
+        atomic_load_explicit(&hist_freeze_reason, memory_order_acquire);
+    fprintf(stderr, "=== syscall histogram (%s) ===\n",
+            reason ? reason : "guest exit");
+    fprintf(stderr, "%6s %12s %10s %10s  %s\n", "count", "total_ms", "avg_us",
+            "max_us", "name");
+    for (int i = 0; i < nrows; i++) {
+        double total_ms = (double) rows[i].total_ns / 1000000.0;
+        double avg_us = rows[i].count ? (double) rows[i].total_ns /
+                                            (double) rows[i].count / 1000.0
+                                      : 0.0;
+        double max_us = (double) rows[i].max_ns / 1000.0;
+        const char *name = hist_names[rows[i].nr];
+        if (name)
+            fprintf(stderr, "%6llu %12.3f %10.2f %10.2f  %s\n",
+                    (unsigned long long) rows[i].count, total_ms, avg_us,
+                    max_us, name);
+        else
+            fprintf(stderr, "%6llu %12.3f %10.2f %10.2f  SYS_%d\n",
+                    (unsigned long long) rows[i].count, total_ms, avg_us,
+                    max_us, rows[i].nr);
+    }
+    fprintf(stderr, "total: %llu syscalls, %.3f ms\n",
+            (unsigned long long) total_count, (double) total_ns / 1000000.0);
+
+    /* Wall-clock span from the first record to dump time. Lets the
+     * reader compare syscall total against guest wall time and see
+     * what fraction of execution was spent inside syscalls. Only
+     * meaningful when at least one record landed; first_ns is set
+     * exactly once by the first successful record CAS.
+     */
+    uint64_t first_ns =
+        atomic_load_explicit(&hist_first_ns, memory_order_relaxed);
+    if (first_ns) {
+        struct timespec ts;
+        if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) {
+            uint64_t now_ns =
+                (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec;
+            if (now_ns > first_ns)
+                fprintf(
+                    stderr, "wall:  %.3f ms (syscalls = %.1f%% of wall)\n",
+                    (double) (now_ns - first_ns) / 1000000.0,
+                    100.0 * (double) total_ns / (double) (now_ns - first_ns));
+        }
+    }
+}
diff --git a/src/debug/syscall-hist.h b/src/debug/syscall-hist.h
new file mode 100644
index 0000000..9e51f57
--- /dev/null
+++ b/src/debug/syscall-hist.h
@@ -0,0 +1,68 @@
+/* Dynamic-linker startup syscall histogram
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Per-syscall count + total + max latency, captured during the EL0 syscall
+ * storm that dominates dynamic-linker bring-up. Opt-in via
+ * ELFUSE_STARTUP_TRACE=syscalls (or =all alongside the existing per-step VM
+ * bring-up tracer). Recording stops at the first successful execve so the
+ * dump reflects pre-execve startup; without execve it dumps on guest exit.
+ *
+ * Disabled cost is one branch + one global load per syscall_dispatch entry
+ * because syscall_hist_enabled() resolves the env once and caches the
+ * result. Enabled cost is two CLOCK_MONOTONIC reads per syscall plus three
+ * relaxed atomic adds.
+ */
+
+#ifndef ELFUSE_SYSCALL_HIST_H
+#define ELFUSE_SYSCALL_HIST_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+/* Parse ELFUSE_STARTUP_TRACE once; safe to call repeatedly. Must be called
+ * before any syscall is dispatched so the enabled flag is stable for the
+ * lifetime of the process.
+ */
+void syscall_hist_init(void);
+
+/* Fast probe used by syscall_dispatch to decide whether to grab a start
+ * timestamp. Returns false after syscall_hist_freeze() so the recording
+ * stops cleanly at the first execve.
+ */
+bool syscall_hist_enabled(void);
+
+/* Monotonic timestamp source. Returns 0 when the histogram is disabled so
+ * the caller can short-circuit without a branch on the env-var cache.
+ */
+uint64_t syscall_hist_now_ns(void);
+
+/* Record one syscall observation. nr is the Linux syscall number; ns is the
+ * elapsed time inside the dispatcher. Called from every vCPU thread, so the
+ * counters update under __atomic_fetch_add. No-op if disabled or frozen.
+ */
+void syscall_hist_record(int nr, uint64_t ns);
+
+/* Stop recording but keep the captured data ready to dump. Called from the
+ * successful-execve path so steady-state syscall traffic does not pollute
+ * the startup picture. The reason string survives until syscall_hist_dump
+ * runs and appears in the dump header.
+ */
+void syscall_hist_freeze(const char *reason);
+
+/* Force-disable the histogram in this process even if ELFUSE_STARTUP_TRACE
+ * is set. Used by fork-child bring-up: the child resumes from a parent
+ * snapshot, so its first syscalls are steady-state, not dynamic-linker
+ * bring-up. Without this, the inherited env var would trigger lazy init in
+ * the child and pollute the parent's dump if both share stderr.
+ */
+void syscall_hist_disable(void);
+
+/* Emit a human-readable summary to stderr, sorted by total ns descending.
+ * Idempotent: subsequent calls are no-ops. Safe to call from cleanup paths
+ * that may run more than once.
+ */
+void syscall_hist_dump(void);
+
+#endif /* ELFUSE_SYSCALL_HIST_H */
diff --git a/src/main.c b/src/main.c
index 30eb23d..0b725c9 100644
--- a/src/main.c
+++ b/src/main.c
@@ -41,6 +41,7 @@
 
 #include "debug/gdbstub.h"
 #include "debug/log.h"
+#include "debug/syscall-hist.h"
 
 static int parse_int_arg(const char *s, int min, int max, int *out)
 {
@@ -158,6 +159,10 @@ static int host_dc_zva_assert(void)
 int main(int argc, char **argv)
 {
     log_init();
+    /* Resolve ELFUSE_STARTUP_TRACE before any guest syscall can fire so the
+     * histogram captures the very first dynamic-linker openat.
+     */
+    syscall_hist_init();
 
     bool verbose = false;
     /* x86_64-via-Rosetta is on by default; --no-rosetta or
@@ -506,6 +511,12 @@ int main(int argc, char **argv)
 
     /* Tear down debugger state before freeing guest/vCPU resources. */
     gdb_stub_shutdown();
+    /* Dump the startup histogram before guest_destroy so any cleanup-path
+     * syscalls (closing host fds, unmapping the slab) do not appear in the
+     * captured set. The dump is a no-op when ELFUSE_STARTUP_TRACE=syscalls
+     * was not requested.
+     */
+    syscall_hist_dump();
     cleanup_main_resources(&g, guest_initialized, &sysroot_mount,
                            have_host_cwd ? host_cwd : NULL, guest_argv,
                            guest_argc, elf_path, sysroot_path);
diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c
index 3f8c4a5..5bdb813 100644
--- a/src/runtime/forkipc.c
+++ b/src/runtime/forkipc.c
@@ -45,6 +45,7 @@
 #include "syscall/signal.h"
 
 #include "debug/log.h"
+#include "debug/syscall-hist.h"
 
 /* fork_child_main. */
 
@@ -74,6 +75,13 @@ int fork_child_main(int ipc_fd,
     if (verbose)
         log_set_level(LOG_DEBUG);
 
+    /* The startup syscall histogram captures dynamic-linker bring-up of the
+     * top-level guest only; the child resumes from the parent's snapshot, so
+     * its first syscalls would be steady-state traffic that confuses the
+     * dump. Disable before any guest syscall is dispatched.
+     */
+    syscall_hist_disable();
+
     /* Reset static process/thread/futex state before receiving the parent
      * snapshot so the incoming metadata survives child restore.
      */
diff --git a/src/syscall/path.c b/src/syscall/path.c
index c82b46f..5a2531f 100644
--- a/src/syscall/path.c
+++ b/src/syscall/path.c
@@ -141,10 +141,18 @@ int path_translate_at(guest_fd_t dirfd,
         return -1;
     }
 
-    memset(tx, 0, sizeof(*tx));
+    /* Only the fields read on the no-rewrite fast path need explicit
+     * defaults; proc_path / guest_buf / host_buf are read-after-written
+     * by their respective resolvers. memset of all three 4 KiB buffers
+     * would add ~12 KiB of zeroing per call, which became visible at
+     * ~30 calls per dynamic-linker startup after the sidecar caches
+     * dropped the rest of the openat overhead.
+     */
     tx->guest_path = path;
     tx->intercept_path = path;
     tx->host_path = path;
+    tx->proc_resolved = 0;
+    tx->fuse_path = false;
 
     if (!path)
         return 0;
diff --git a/src/syscall/sidecar.c b/src/syscall/sidecar.c
index 09c026e..8a01d25 100644
--- a/src/syscall/sidecar.c
+++ b/src/syscall/sidecar.c
@@ -45,6 +45,179 @@
  */
 static pthread_mutex_t sidecar_global_lock = PTHREAD_MUTEX_INITIALIZER;
 
+/* Per-directory cache: did this directory lack a sidecar index file
+ * for the current directory metadata version? Keyed by (st_dev, st_ino) so a
+ * renamed or moved directory does not leak a stale answer. Sidecar's
+ * case-fold walker visits every parent directory of every translated path;
+ * during dynamic-linker bring-up that walker fires per guest openat and
+ * dominates startup (histogram showed openat at 61% of getent's 7.5 ms warm
+ * path). The cache lets the common "no index here" answer return after a
+ * 5 us fstat instead of a ~30 us openat per directory traversed.
+ *
+ * 64 slots, single-level open-addressing (last writer wins on collision).
+ * The directory ctime/mtime pair is part of the cache key: publishing an
+ * index from another elfuse process changes the directory entry set, so a
+ * stale ABSENT entry becomes UNKNOWN on the next lookup.
+ */
+enum {
+    SIDECAR_IDX_UNKNOWN = 0,
+    SIDECAR_IDX_ABSENT = 1,
+};
+typedef struct {
+    dev_t dev;
+    ino_t ino;
+    struct timespec mtime;
+    struct timespec ctime;
+    uint8_t state;
+} sidecar_idx_slot_t;
+#define SIDECAR_IDX_CACHE_SLOTS 64
+static sidecar_idx_slot_t sidecar_idx_cache[SIDECAR_IDX_CACHE_SLOTS];
+static pthread_mutex_t sidecar_idx_cache_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static size_t sidecar_idx_cache_slot(dev_t dev, ino_t ino)
+{
+    /* Mix dev into ino so two filesystems with overlapping inode numbers
+     * do not pin the same slot. The golden-ratio multiplier scatters small
+     * inode numbers across the table without needing a real hash.
+     */
+    uint64_t key = (uint64_t) ino ^ ((uint64_t) dev * 0x9E3779B97F4A7C15ULL);
+    return (size_t) (key % SIDECAR_IDX_CACHE_SLOTS);
+}
+
+static bool sidecar_idx_cache_matches(const sidecar_idx_slot_t *slot,
+                                      const struct stat *st)
+{
+    return slot->state != SIDECAR_IDX_UNKNOWN && slot->dev == st->st_dev &&
+           slot->ino == st->st_ino &&
+           slot->mtime.tv_sec == st->st_mtimespec.tv_sec &&
+           slot->mtime.tv_nsec == st->st_mtimespec.tv_nsec &&
+           slot->ctime.tv_sec == st->st_ctimespec.tv_sec &&
+           slot->ctime.tv_nsec == st->st_ctimespec.tv_nsec;
+}
+
+static int sidecar_idx_cache_lookup_stat(const struct stat *st)
+{
+    size_t slot = sidecar_idx_cache_slot(st->st_dev, st->st_ino);
+    int state = SIDECAR_IDX_UNKNOWN;
+    pthread_mutex_lock(&sidecar_idx_cache_lock);
+    if (sidecar_idx_cache_matches(&sidecar_idx_cache[slot], st))
+        state = sidecar_idx_cache[slot].state;
+    pthread_mutex_unlock(&sidecar_idx_cache_lock);
+    return state;
+}
+
+static void sidecar_idx_cache_set_stat(const struct stat *st, int state)
+{
+    size_t slot = sidecar_idx_cache_slot(st->st_dev, st->st_ino);
+    pthread_mutex_lock(&sidecar_idx_cache_lock);
+    sidecar_idx_cache[slot].dev = st->st_dev;
+    sidecar_idx_cache[slot].ino = st->st_ino;
+    sidecar_idx_cache[slot].mtime = st->st_mtimespec;
+    sidecar_idx_cache[slot].ctime = st->st_ctimespec;
+    sidecar_idx_cache[slot].state = (uint8_t) state;
+    pthread_mutex_unlock(&sidecar_idx_cache_lock);
+}
+
+/* Roll the cache entry back to UNKNOWN. Used by the index publish path
+ * before the renameat so a concurrent reader entering the post-rename
+ * window cannot consume a stale ABSENT and skip the openat. UNKNOWN forces
+ * the reader to consult the filesystem.
+ */
+static void sidecar_idx_cache_invalidate(int dirfd)
+{
+    struct stat st;
+    if (fstat(dirfd, &st) < 0)
+        return;
+    sidecar_idx_cache_set_stat(&st, SIDECAR_IDX_UNKNOWN);
+}
+
+/* Cached sysroot dirfd. sidecar_open_base opens the sysroot directory on
+ * every translated absolute path; for dynamic-linker bring-up that fires
+ * once per openat / fstat / access / readlink. Caching the host fd and
+ * handing the walker a dup turns that ~30 us open into a ~5 us dup. The
+ * cache invalidates lazily by comparing the path string -- proc_set_sysroot
+ * is rare (once at startup, once on fork-child IPC restore), so the
+ * snapshot+strcmp on every call is still a net win over the open it
+ * replaces. Concurrent vCPU lookups serialize on the lock for the cache check
+ * and dup.
+ */
+static int sidecar_sysroot_cached_fd = -1;
+static char sidecar_sysroot_cached_path[LINUX_PATH_MAX] = {0};
+static dev_t sidecar_sysroot_cached_dev;
+static ino_t sidecar_sysroot_cached_ino;
+static pthread_mutex_t sidecar_sysroot_cached_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static int sidecar_open_sysroot_cached(const char *path)
+{
+    pthread_mutex_lock(&sidecar_sysroot_cached_lock);
+    int cached = sidecar_sysroot_cached_fd;
+    if (cached >= 0 && !strcmp(path, sidecar_sysroot_cached_path)) {
+        /* Path text alone is not a sufficient cache key: a host-side
+         * rename or replace can rebind the same path to a different
+         * inode while the cached fd still resolves to the original.
+         * Stat the path on every hit and accept the cache only when
+         * (dev, ino) still match the inode captured at fill time. The
+         * stat is one host syscall per call, much cheaper than the
+         * open it replaces.
+         *
+         * Race window: a host-side mutation between this stat and the
+         * dup below can leave the cache returning the pre-mutation fd
+         * even though a fresh open(path) at that instant would resolve
+         * the post-mutation binding. The window is microseconds. It
+         * only matters under adversarial host-side sysroot mutation;
+         * normal sysroot configuration is canonicalized once at
+         * proc_set_sysroot time and never changes. Closing it
+         * adversarially would require dropping the cache, which gives
+         * back the ~25 us per-call win; not worth it absent a real
+         * reproducer.
+         */
+        struct stat current;
+        if (stat(path, &current) == 0 &&
+            current.st_dev == sidecar_sysroot_cached_dev &&
+            current.st_ino == sidecar_sysroot_cached_ino) {
+            cached = fcntl(cached, F_DUPFD_CLOEXEC, 0);
+            pthread_mutex_unlock(&sidecar_sysroot_cached_lock);
+            return cached;
+        }
+        /* Inode mismatch: fall through to refresh below. */
+    }
+    if (sidecar_sysroot_cached_fd >= 0) {
+        close(sidecar_sysroot_cached_fd);
+        sidecar_sysroot_cached_fd = -1;
+        sidecar_sysroot_cached_path[0] = '\0';
+    }
+    int fresh = open(path, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
+    if (fresh < 0) {
+        pthread_mutex_unlock(&sidecar_sysroot_cached_lock);
+        return -1;
+    }
+    /* Capture the inode of the freshly-opened dir so subsequent cache
+     * hits can validate against it. Using fstat on the open fd gives
+     * the same (dev, ino) as a path stat would, with no second walk.
+     */
+    struct stat fresh_st;
+    if (fstat(fresh, &fresh_st) < 0) {
+        close(fresh);
+        pthread_mutex_unlock(&sidecar_sysroot_cached_lock);
+        return -1;
+    }
+    /* CLOEXEC dup: a concurrent posix_spawn from another vCPU thread
+     * must not inherit the sysroot dirfd into the fork-child. F_DUPFD_CLOEXEC
+     * sets the flag atomically with the dup, closing the inheritance window
+     * that plain dup() leaves open.
+     */
+    int cache_fd = fcntl(fresh, F_DUPFD_CLOEXEC, 0);
+    if (cache_fd >= 0) {
+        sidecar_sysroot_cached_fd = cache_fd;
+        sidecar_sysroot_cached_dev = fresh_st.st_dev;
+        sidecar_sysroot_cached_ino = fresh_st.st_ino;
+        str_copy_trunc(sidecar_sysroot_cached_path, path,
+                       sizeof(sidecar_sysroot_cached_path));
+    }
+    pthread_mutex_unlock(&sidecar_sysroot_cached_lock);
+    return fresh;
+}
+
 typedef struct {
     char *guest_name;
     char token[SIDECAR_TOKEN_NAME_LEN + 1];
@@ -163,10 +336,25 @@ static int sidecar_load_index(int dirfd, sidecar_index_t *index)
 {
     memset(index, 0, sizeof(*index));
 
+    /* Per-dir absence cache: skip the openat round-trip only while the
+     * directory metadata still matches the snapshot that produced ENOENT.
+     * Another elfuse process can publish the index through the fcntl-locked
+     * writer, so cross-process invalidation relies on the directory mtime/ctime
+     * changing when the sidecar index appears.
+     */
+    struct stat dir_st;
+    bool have_dir_st = fstat(dirfd, &dir_st) == 0;
+    if (have_dir_st &&
+        sidecar_idx_cache_lookup_stat(&dir_st) == SIDECAR_IDX_ABSENT)
+        return 0;
+
     int fd = openat(dirfd, SIDECAR_INDEX_NAME, O_RDONLY | O_CLOEXEC);
     if (fd < 0) {
-        if (errno == ENOENT)
+        if (errno == ENOENT) {
+            if (have_dir_st)
+                sidecar_idx_cache_set_stat(&dir_st, SIDECAR_IDX_ABSENT);
             return 0;
+        }
         return -1;
     }
 
@@ -355,7 +543,7 @@ static int sidecar_open_base(guest_fd_t dirfd,
             errno = ENAMETOOLONG;
             return -1;
         }
-        *base_fd = open(sysroot, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
+        *base_fd = sidecar_open_sysroot_cached(sysroot);
         if (*base_fd < 0)
             return -1;
         *absolute = true;
@@ -900,6 +1088,14 @@ static int sidecar_write_locked_index(int parent_dirfd,
         errno = saved_errno;
         return -1;
     }
+    /* Invalidate the cache to UNKNOWN BEFORE the rename so a concurrent
+     * walker that loses the race to read the new file does not return a
+     * stale ABSENT verdict from this slot. A reader entering the window
+     * between rename and post-rename mark would otherwise skip the openat
+     * and miss the freshly-published index. UNKNOWN forces the reader to
+     * do the openat and observe the new file directly.
+     */
+    sidecar_idx_cache_invalidate(parent_dirfd);
     if (renameat(parent_dirfd, SIDECAR_INDEX_TMP_NAME, parent_dirfd,
                  SIDECAR_INDEX_NAME) < 0) {
         int saved_errno = errno;
diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c
index be97787..4be4859 100644
--- a/src/syscall/syscall.c
+++ b/src/syscall/syscall.c
@@ -64,6 +64,8 @@
 
 #include "core/shim-globals.h"
 
+#include "debug/syscall-hist.h"
+
 /* Generated from src/syscall/dispatch.tbl into $(BUILD_DIR). */
 #include "dispatch.h"
 
@@ -1737,6 +1739,11 @@ int syscall_dispatch(hv_vcpu_t vcpu, guest_t *g, int *exit_code, bool verbose)
     int nr = (int) x8;
     const syscall_entry_t *entry = NULL;
 
+    /* Per-syscall histogram for the dynamic-linker bring-up storm. Zero when
+     * disabled so the bottom-of-dispatch record path is a single branch.
+     */
+    uint64_t hist_start_ns = syscall_hist_now_ns();
+
     if (!verbose) {
         if (nr == SYS_getpid || nr == SYS_getppid || nr == SYS_gettid ||
             nr == SYS_getpgid || nr == SYS_getuid || nr == SYS_geteuid ||
@@ -1850,6 +1857,17 @@ int syscall_dispatch(hv_vcpu_t vcpu, guest_t *g, int *exit_code, bool verbose)
             should_exit = true;
             result = 0; /* Not written back, but keep clean */
         } else if (result == SYSCALL_EXEC_HAPPENED) {
+            if (hist_start_ns) {
+                /* Guard the subtraction: syscall_hist_now_ns returns 0
+                 * if clock_gettime fails. An unsigned 0 minus a non-zero
+                 * start would underflow to a huge value and pollute the
+                 * histogram with a bogus latency sample.
+                 */
+                uint64_t hist_end_ns = syscall_hist_now_ns();
+                if (hist_end_ns >= hist_start_ns)
+                    syscall_hist_record(nr, hist_end_ns - hist_start_ns);
+                syscall_hist_freeze("frozen at first execve");
+            }
             return SYSCALL_EXEC_HAPPENED;
         }
     } else {
@@ -1912,5 +1930,11 @@ int syscall_dispatch(hv_vcpu_t vcpu, guest_t *g, int *exit_code, bool verbose)
         tlbi_request_clear();
     }
 
+    if (hist_start_ns) {
+        uint64_t hist_end_ns = syscall_hist_now_ns();
+        if (hist_end_ns >= hist_start_ns)
+            syscall_hist_record(nr, hist_end_ns - hist_start_ns);
+    }
+
     return should_exit;
 }