diff --git a/Makefile b/Makefile
index 7f4814f..6e612dd 100644
--- a/Makefile
+++ b/Makefile
@@ -23,6 +23,7 @@ SRCS := \
     core/elf.c \
     core/stack.c \
     core/vdso.c \
+    core/shim-globals.c \
     core/bootstrap.c \
     core/rosetta.c \
     core/sysroot.c \
@@ -160,6 +161,24 @@ $(BUILD_DIR)/test-pthread: tests/test-pthread.c | $(BUILD_DIR)
 	@echo "  CROSS   $< (with -lpthread)"
 	$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread
 
+# test-shim-cred-race spawns a pthread reader while the main thread
+# toggles setresuid; the reader spins on the identity fast path.
+$(BUILD_DIR)/test-shim-cred-race: tests/test-shim-cred-race.c | $(BUILD_DIR)
+	@echo "  CROSS   $< (with -lpthread)"
+	$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread
+
+# test-shim-urandom-smp spawns N pthreads racing on a shared FD_URANDOM
+# slot to exercise the shim's LDXR/STXR head-advance under contention.
+$(BUILD_DIR)/test-shim-urandom-smp: tests/test-shim-urandom-smp.c | $(BUILD_DIR)
+	@echo "  CROSS   $< (with -lpthread)"
+	$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread
+
+# test-shim-urandom-toctou races mprotect(PROT_NONE) against urandom
+# reads to exercise the EL1 data abort recovery path. Needs pthreads.
+$(BUILD_DIR)/test-shim-urandom-toctou: tests/test-shim-urandom-toctou.c | $(BUILD_DIR)
+	@echo "  CROSS   $< (with -lpthread)"
+	$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread
+
 # test-fuse-basic runs a guest daemon thread and consumer in one process
 $(BUILD_DIR)/test-fuse-basic: tests/test-fuse-basic.c | $(BUILD_DIR)
 	@echo "  CROSS   $< (with -lpthread)"
diff --git a/src/core/bootstrap.c b/src/core/bootstrap.c
index c6522df..23625f0 100644
--- a/src/core/bootstrap.c
+++ b/src/core/bootstrap.c
@@ -20,7 +20,9 @@
 
 #include "core/bootstrap.h"
 #include "core/rosetta.h"
+#include "core/shim-globals.h"
 #include "core/stack.h"
+#include "core/startup-trace.h"
 #include "core/vdso.h"
 
 #include "runtime/thread.h"
@@ -30,6 +32,7 @@
 #include "syscall/internal.h"
 #include "syscall/path.h"
 #include "syscall/proc.h"
+#include "syscall/signal.h"
 
 #include "debug/log.h"
 
@@ -94,20 +97,25 @@ static void register_elf_segment_regions(guest_t *g,
     }
 }
 
-/* Publish shim, shim-data, heap, stack-guard, and stack regions to the
+/* Publish shim, shim-data, heap, stack-guard, and stack regions to the
  * /proc/self/maps view, and invalidate the null page and stack-guard PTEs.
- * Shared by guest_bootstrap_prepare and guest_bootstrap_rosetta_post_reset;
- * the caller registers ELF or rosetta segments separately because those
- * differ between aarch64 and rosetta guests.
+ * Shared by guest_bootstrap_prepare and guest_bootstrap_rosetta_post_reset; the
+ * caller registers ELF or rosetta segments separately because those differ
+ * between aarch64 and rosetta guests.
  */
 static void register_runtime_regions(guest_t *g, size_t shim_bin_len)
 {
     guest_region_add(g, g->shim_base, g->shim_base + shim_bin_len,
                      LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0,
                      "[shim]");
+    /* shim_data is mapped privileged-only (AP[2:1]=00) in the page tables; the
+     * EL1 shim has full RW but EL0 cannot read or write. Report PROT_NONE in
+     * /proc/self/maps so guest tooling treats it as inaccessible, matching what
+     * dereferencing the GVA actually does (permission fault -> EL0 SIGSEGV
+     * path).
+     */
     guest_region_add(g, g->shim_data_base, g->shim_data_base + BLOCK_2MIB,
-                     LINUX_PROT_READ | LINUX_PROT_WRITE, LINUX_MAP_PRIVATE, 0,
-                     "[shim-data]");
+                     LINUX_PROT_NONE, LINUX_MAP_PRIVATE, 0, "[shim-data]");
 
     if (g->brk_base < g->brk_current) {
         guest_region_add(g, g->brk_base, g->brk_current,
@@ -246,8 +254,11 @@ static bool load_interpreter(guest_t *g,
     }
 
     boot->interp_base = g->interp_base;
+    uint64_t infra_lo = g->interp_base - INFRA_RESERVE;
+    uint64_t infra_hi = g->interp_base;
     if (elf_map_segments(&boot->interp_info, boot->interp_resolved,
-                         g->host_base, g->guest_size, boot->interp_base) < 0) {
+                         g->host_base, g->guest_size, boot->interp_base,
+                         infra_lo, infra_hi) < 0) {
         log_error("failed to map interpreter segments");
         if (interp_host_temp)
             unlink(boot->interp_resolved);
@@ -277,20 +288,28 @@ static bool build_boot_regions(mem_region_t *regions,
      */
     if (!append_boot_region(regions, nregions, g->shim_base,
                             g->shim_base + shim_bin_len, MEM_PERM_RX) ||
+        /* shim_data is EL1-only: the guest must not directly read or write the
+         * identity cache, attention flag, urandom bitmap, or ring, any of which
+         * would let it spoof its own syscall results. The EL1 shim itself has
+         * full RW. /proc/self/maps still lists [shim-data] (region tracking is
+         * independent of EL0 access), but EL0 dereferences fault to the SIGSEGV
+         * path.
+         */
         !append_boot_region(regions, nregions, g->shim_data_base,
-                            g->shim_data_base + BLOCK_2MIB, MEM_PERM_RW) ||
+                            g->shim_data_base + BLOCK_2MIB,
+                            MEM_PERM_RW_EL1_ONLY) ||
         !append_boot_region(regions, nregions, VDSO_BASE, VDSO_BASE + VDSO_SIZE,
                             MEM_PERM_RX)) {
         return false;
     }
 
-    /* Rosetta guests never load the x86_64 ELF or its interpreter into
-     * guest memory; rosetta itself reads the target via fd 3 once it is
-     * running. Adding those segments to the page-table builder would emit
-     * ghost L2/L3 entries at the binary's x86_64 link address (typically
-     * 0x400000) pointing into uninitialized primary-buffer GPAs. The
-     * rosetta image's own segments are registered by rosetta_prepare's
-     * separate region append in the bootstrap caller.
+    /* Rosetta guests never load the x86_64 ELF or its interpreter into guest
+     * memory; rosetta itself reads the target via fd 3 once it is running.
+     * Adding those segments to the page-table builder would emit ghost L2/L3
+     * entries at the binary's x86_64 link address (typically 0x400000) pointing
+     * into uninitialized primary-buffer GPAs. The rosetta image's own segments
+     * are registered by rosetta_prepare's separate region append in the
+     * bootstrap caller.
      */
     if (!g->is_rosetta) {
         if (!append_elf_segment_regions(regions, nregions, &boot->elf_info,
@@ -334,14 +353,17 @@ int guest_bootstrap_prepare(guest_t *g,
     mem_region_t regions[MAX_BOOT_REGIONS];
     int nregions = 0;
     uint64_t native_vdso;
+    uint64_t t0;
 
     memset(boot, 0, sizeof(*boot));
     *guest_initialized = false;
 
+    t0 = startup_trace_now_ns();
     if (elf_load(elf_host_path, &boot->elf_info) < 0) {
         log_error("failed to load ELF: %s", elf_host_path);
         return -1;
     }
+    startup_trace_step("elf_load", t0);
 
     bool want_rosetta = false;
     if (boot->elf_info.e_machine == EM_X86_64) {
@@ -366,18 +388,20 @@ int guest_bootstrap_prepare(guest_t *g,
         (unsigned long long) boot->elf_info.load_max,
         want_rosetta ? "x86_64-via-rosetta" : "aarch64");
 
-    /* Rosetta is statically linked at 0x800000000000 (128 TiB), beyond the
-     * 36 and 40-bit IPA ranges. Request 48-bit IPA up-front so the
-     * page-table builder can reach the rosetta segments. HVF clamps to its
-     * supported size; on M1 hosts the upstream hyper-linux audit confirms
-     * 48 is honoured even though the auto-detect default returns 36, so
-     * the request is non-fatal in either direction.
+    /* Rosetta is statically linked at 0x800000000000 (128 TiB), beyond the 36
+     * and 40-bit IPA ranges. Request 48-bit IPA up-front so the page-table
+     * builder can reach the rosetta segments. HVF clamps to its supported size;
+     * on M1 hosts the upstream hyper-linux audit confirms 48 is honoured even
+     * though the auto-detect default returns 36, so the request is non-fatal in
+     * either direction.
      */
     uint32_t req_ipa = want_rosetta ? 48 : 0;
+    t0 = startup_trace_now_ns();
     if (guest_init(g, 0, req_ipa) < 0) {
         log_error("failed to initialize guest");
         return -1;
     }
+    startup_trace_step("guest_init", t0);
     *guest_initialized = true;
     g->is_rosetta = want_rosetta;
     proc_set_rosetta_active(want_rosetta);
@@ -391,8 +415,8 @@ int guest_bootstrap_prepare(guest_t *g,
     if (want_rosetta) {
         /* Rosetta path: no x86_64 ELF segments are loaded into guest memory
          * (rosetta itself does that lazily once it starts running). brk and
-         * stack use the same defaults the aarch64 path falls back to when
-         * the binary sits at low VAs; the x86_64 binary's load_max would be
+         * stack use the same defaults the aarch64 path falls back to when the
+         * binary sits at low VAs; the x86_64 binary's load_max would be
          * meaningless here because nothing of it actually lives in primary
          * buffer GPA space.
          */
@@ -405,11 +429,16 @@ int guest_bootstrap_prepare(guest_t *g,
     } else {
         boot->elf_load_base =
             (boot->elf_info.e_type == ET_DYN) ? PIE_LOAD_BASE : 0;
+        t0 = startup_trace_now_ns();
+        uint64_t infra_lo = g->interp_base - INFRA_RESERVE;
+        uint64_t infra_hi = g->interp_base;
         if (elf_map_segments(&boot->elf_info, elf_host_path, g->host_base,
-                             g->guest_size, boot->elf_load_base) < 0) {
+                             g->guest_size, boot->elf_load_base, infra_lo,
+                             infra_hi) < 0) {
             log_error("failed to map ELF segments");
             return -1;
         }
+        startup_trace_step("elf_map_segments", t0);
 
         /* Track the lowest loaded ELF address so the legacy fork IPC path
          * copies low-linked ET_EXECs (e.g. linked at 0x200000) in full.
@@ -427,8 +456,10 @@ int guest_bootstrap_prepare(guest_t *g,
             g->stack_top = STACK_TOP_DEFAULT;
         g->stack_base = g->stack_top - STACK_SIZE;
 
+        t0 = startup_trace_now_ns();
         if (!load_interpreter(g, sysroot, boot))
             return -1;
+        startup_trace_step("load_interpreter", t0);
     }
 
     if (shim_bin_len > BLOCK_2MIB) {
@@ -436,6 +467,7 @@ int guest_bootstrap_prepare(guest_t *g,
         return -1;
     }
 
+    t0 = startup_trace_now_ns();
     memcpy((uint8_t *) g->host_base + g->shim_base, shim_bin, shim_bin_len);
     log_debug("shim loaded at offset 0x%llx (%zu bytes)",
               (unsigned long long) g->shim_base, shim_bin_len);
@@ -448,12 +480,15 @@ int guest_bootstrap_prepare(guest_t *g,
     }
     sys_icache_invalidate((uint8_t *) g->host_base + g->shim_base,
                           shim_bin_len);
+    startup_trace_step("shim_load_icache", t0);
 
+    t0 = startup_trace_now_ns();
     if (!build_boot_regions(regions, &nregions, g, boot, shim_bin_len)) {
         log_error("too many memory regions (%d >= %d)", nregions,
                   MAX_BOOT_REGIONS);
         return -1;
     }
+    startup_trace_step("build_boot_regions", t0);
 
     /* Rosetta path: append the rosetta image as a non-identity region so the
      * page-table builder maps VA 0x800000000000 -> primary buffer GPA.
@@ -461,24 +496,29 @@ int guest_bootstrap_prepare(guest_t *g,
      * from the same pool that guest_build_page_tables is about to consume).
      */
     if (want_rosetta) {
+        t0 = startup_trace_now_ns();
         if (rosetta_prepare(g, elf_host_path, regions, &nregions,
                             MAX_BOOT_REGIONS, verbose, &rr) < 0) {
             log_error("rosetta_prepare failed for %s", elf_guest_path);
             return -1;
         }
+        startup_trace_step("rosetta_prepare", t0);
     }
 
+    t0 = startup_trace_now_ns();
     boot->ttbr0 = guest_build_page_tables(g, regions, nregions);
     if (!boot->ttbr0) {
         log_error("failed to build page tables");
         return -1;
     }
+    startup_trace_step("guest_build_page_tables", t0);
     /* No TLBI request here: the shim's _start does TLBI VMALLE1IS before
      * enabling the MMU (src/core/shim.S), and the per-vCPU accumulator is the
      * wrong place to stage a bring-up flush -- bootstrap may run on a thread
      * whose slot is later consumed by an unrelated syscall.
      */
 
+    t0 = startup_trace_now_ns();
     if (want_rosetta) {
         /* /proc/self/maps for a rosetta guest reports the rosetta translator
          * as a single anonymous region covering [VA, VA+size). The original
@@ -505,12 +545,14 @@ int guest_bootstrap_prepare(guest_t *g,
     }
 
     register_runtime_regions(g, shim_bin_len);
+    startup_trace_step("register_regions", t0);
 
     log_debug("TTBR0=0x%llx, IPA base=0x%llx", (unsigned long long) boot->ttbr0,
               (unsigned long long) g->ipa_base);
     if (verbose)
         log_initial_page_tables(g, boot->ttbr0);
 
+    t0 = startup_trace_now_ns();
     syscall_init();
     proc_init();
 
@@ -526,6 +568,7 @@ int guest_bootstrap_prepare(guest_t *g,
     proc_set_elf_path(elf_guest_path);
     if (sysroot)
         proc_set_sysroot(sysroot);
+    startup_trace_step("runtime_init", t0);
 
     /* rosetta_finalize pre-opens the x86_64 binary at fd 3, constructs the
      * binfmt_misc argv ([ROSETTA_PATH, binary, original_argv[1..]]), refreshes
@@ -536,18 +579,22 @@ int guest_bootstrap_prepare(guest_t *g,
     int rosetta_argc = 0;
     const char **rosetta_argv = NULL;
     if (want_rosetta) {
+        t0 = startup_trace_now_ns();
         if (rosetta_finalize(g, 0, elf_host_path, elf_host_path_temp,
                              elf_guest_path, guest_argc, guest_argv, &rr,
                              verbose, &rosetta_argc, &rosetta_argv, NULL) < 0) {
             log_error("rosetta_finalize failed");
             return -1;
         }
+        startup_trace_step("rosetta_finalize", t0);
     } else {
         proc_set_cmdline(guest_argc, guest_argv);
     }
     proc_set_environ((const char **) environ);
 
+    t0 = startup_trace_now_ns();
     native_vdso = vdso_build(g);
+    startup_trace_step("vdso_build", t0);
     linux_stack_auxv_t auxv;
     const elf_info_t *stack_elf =
         want_rosetta ? &rr.rosetta_info : &boot->elf_info;
@@ -555,6 +602,7 @@ int guest_bootstrap_prepare(guest_t *g,
     uint64_t stack_interp_base = want_rosetta ? 0 : boot->interp_base;
     int stack_argc = want_rosetta ? rosetta_argc : guest_argc;
     const char **stack_argv = want_rosetta ? rosetta_argv : guest_argv;
+    t0 = startup_trace_now_ns();
     boot->stack_pointer = build_linux_stack(
         g, g->stack_top, stack_argc, stack_argv, (const char **) environ,
         stack_elf, stack_elf_load_base, stack_interp_base, native_vdso, -1,
@@ -564,6 +612,7 @@ int guest_bootstrap_prepare(guest_t *g,
         free(rosetta_argv);
         return -1;
     }
+    startup_trace_step("build_linux_stack", t0);
     /* rosetta_argv was copied into the guest stack; the host allocation is
      * no longer needed. The strings themselves are constants (ROSETTA_PATH)
      * or owned by the caller (binary_path, guest_argv entries) so freeing
@@ -599,6 +648,7 @@ int guest_bootstrap_create_vcpu(guest_t *g,
 {
     uint64_t sctlr;
     uint64_t sctlr_with_mmu;
+    uint64_t t0;
     /* Rosetta needs TTBR1 walks enabled and TBI1=1 so the kbuf window at
      * KBUF_VA_BASE (bits-63-set) resolves and TaggedPointer extraction keeps
      * working. Aarch64 guests stay on the EPD1=1 variant which keeps the
@@ -613,7 +663,9 @@ int guest_bootstrap_create_vcpu(guest_t *g,
     hv_vcpu_t vcpu;
     hv_vcpu_exit_t *vexit;
 
+    t0 = startup_trace_now_ns();
     HV_CHECK(hv_vcpu_create(&vcpu, &vexit, NULL));
+    startup_trace_step("hv_vcpu_create", t0);
     g->vcpu = vcpu;
     g->exit = vexit;
     *out_vcpu = vcpu;
@@ -621,6 +673,7 @@ int guest_bootstrap_create_vcpu(guest_t *g,
 
     thread_register_main(vcpu, vexit, proc_get_pid(), el1_sp);
 
+    t0 = startup_trace_now_ns();
     HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_VBAR_EL1, shim_ipa + 0x800));
     HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_MAIR_EL1, 0xFF00));
     HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TCR_EL1, tcr_value));
@@ -632,6 +685,52 @@ int guest_bootstrap_create_vcpu(guest_t *g,
     HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL0, sp_ipa));
     HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL1, el1_sp));
 
+    /* Round-trip a sentinel through TPIDR_EL1 before installing the real
+     * value. Validates only the hv_vcpu_{set,get}_sys_reg pre-run round
+     * trip, not preservation across hv_vcpu_run -- the test-shim-identity
+     * microbench is the end-to-end check for that.
+     */
+    if (shim_globals_self_test(vcpu) < 0)
+        return -1;
+    /* TPIDR_EL1 -> shim_globals base, CONTEXTIDR_EL1 -> tid (== pid for the
+     * initial main thread). gettid fast path reads CONTEXTIDR_EL1 directly.
+     */
+    if (shim_globals_install_per_vcpu(vcpu, g, proc_get_pid()) < 0)
+        return -1;
+
+    /* Zero the shim-globals region and publish the initial identity so the very
+     * first getpid / getuid / etc. SVC #0 hits the cache instead of returning
+     * the all-zero seed. Future setuid/setgid paths refresh creds via
+     * cred_publish_after; fork-child has its own publish on the inherited
+     * identity.
+     */
+    shim_globals_init(g);
+    shim_globals_set_trace_enabled(g, verbose);
+    shim_globals_publish_pid(g, proc_get_pid(), proc_get_ppid());
+    shim_globals_publish_creds(g, proc_get_uid(), proc_get_euid(),
+                               proc_get_gid(), proc_get_egid());
+    /* Pre-fill the entropy ring so the first read(/dev/urandom) from the guest
+     * is served by the shim fast path with no cold-start HVC for refill.
+     */
+    shim_globals_refill_urandom_ring(g);
+    /* Register the singleton guest pointer so signal_queue and the itimer
+     * setters can raise the attention flag without threading g through every
+     * call site. signal_init clears this defensively; the first registration
+     * must run after both proc_init and shim_globals_init.
+     */
+    signal_set_shim_globals_guest(g);
+    /* Same singleton pattern but for the fd-table hooks that update the urandom
+     * bitmap. Must run before any FD_URANDOM-typed slot is allocated; bootstrap
+     * finishes before any guest syscall runs.
+     */
+    shim_globals_set_singleton(g);
+
+    /* CNTKCTL_EL1.EL0VCTEN | EL0PCTEN: allow EL0 to read {CNTVCT,CNTPCT}_EL0.
+     * Required by the vDSO clock_gettime fast path (and is the default on
+     * native Linux), without which the guest gets 0 back from MRS.
+     */
+    HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_CNTKCTL_EL1, 0x3ULL));
+
     HV_CHECK(hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_SCTLR_EL1, &sctlr));
     log_debug("SCTLR_EL1 default=0x%llx", (unsigned long long) sctlr);
 
@@ -645,6 +744,7 @@ int guest_bootstrap_create_vcpu(guest_t *g,
     sctlr_with_mmu = SCTLR_RES1 | SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_DZE |
                      SCTLR_UCT | SCTLR_UCI;
     HV_CHECK(hv_vcpu_set_reg(vcpu, HV_REG_X0, sctlr_with_mmu));
+    startup_trace_step("hv_vcpu_configure", t0);
 
     log_debug(
         "vCPU configured: PC=0x%llx SCTLR=0x%llx VBAR=0x%llx TTBR0=0x%llx "
diff --git a/src/core/elf.c b/src/core/elf.c
index 316ad7c..c20195c 100644
--- a/src/core/elf.c
+++ b/src/core/elf.c
@@ -208,8 +208,16 @@ int elf_map_segments(const elf_info_t *info,
                      const char *path,
                      void *guest_base,
                      uint64_t guest_size,
-                     uint64_t load_base)
+                     uint64_t load_base,
+                     uint64_t infra_lo,
+                     uint64_t infra_hi)
 {
+    /* The infra reserve is the half-open range [infra_lo, infra_hi). An empty
+     * or inverted range (infra_lo >= infra_hi) means the caller opted out
+     * (early bring-up before guest_t is wired up); the host-side writes that
+     * follow still get the existing guest_size bound check.
+     */
+    bool infra_active = infra_lo < infra_hi;
     FILE *f = fopen(path, "rb");
     if (!f) {
         perror(path);
@@ -264,6 +272,17 @@ int elf_map_segments(const elf_info_t *info,
         fclose(f);
         return -1;
     }
+    if (infra_active && phdr_dest < infra_hi &&
+        phdr_dest + ph_total > infra_lo) {
+        log_error(
+            "%s: program headers at 0x%llx overlap infra reserve "
+            "[0x%llx, 0x%llx)",
+            path, (unsigned long long) phdr_dest, (unsigned long long) infra_lo,
+            (unsigned long long) infra_hi);
+        free(ph_buf);
+        fclose(f);
+        return -1;
+    }
     memcpy((uint8_t *) guest_base + phdr_dest, ph_buf, ph_total);
 
     /* Copy PT_LOAD contents after AT_PHDR is in place; ET_DYN segments are
@@ -308,15 +327,34 @@ int elf_map_segments(const elf_info_t *info,
             return -1;
         }
 
-        /* Zero the full page-aligned segment extent, not only p_memsz.
-         * Linux guarantees zero-filled tail bytes in the last mapped page,
-         * and some dynamic linkers allocate from that page tail before they
-         * request more memory. Leaving stale bytes there leaks state across
-         * execve and corrupts the new image.
+        /* The host memset zeros PAGE_ALIGN_UP(memsz) bytes, not just memsz,
+         * so the infra-overlap check has to use the same rounded extent.
+         * Without the rounding here, a segment that ends just below
+         * infra_lo passes the check and still spills up to PAGE_SIZE-1
+         * bytes of zero into the infra reserve via the page tail.
          */
         uint64_t zero_len = PAGE_ALIGN_UP(memsz);
         if (gpa + zero_len > guest_size)
             zero_len = guest_size - gpa;
+        if (infra_active && gpa < infra_hi && gpa + zero_len > infra_lo) {
+            log_error(
+                "%s: segment at 0x%llx+0x%llx (zero-extent 0x%llx) overlaps "
+                "infra reserve [0x%llx, 0x%llx)",
+                path, (unsigned long long) gpa, (unsigned long long) memsz,
+                (unsigned long long) zero_len, (unsigned long long) infra_lo,
+                (unsigned long long) infra_hi);
+            free(ph_buf);
+            fclose(f);
+            return -1;
+        }
+
+        /* Zero the full page-aligned segment extent (zero_len computed above
+         * with guest_size and infra_reserve checks). Linux guarantees
+         * zero-filled tail bytes in the last mapped page, and some dynamic
+         * linkers allocate from that page tail before they request more
+         * memory. Leaving stale bytes there leaks state across execve and
+         * corrupts the new image.
+         */
         memset((uint8_t *) guest_base + gpa, 0, zero_len);
 
         /* Overlay initialized bytes after zeroing so BSS and page tail remain
diff --git a/src/core/elf.h b/src/core/elf.h
index 6ff5fbc..33f4813 100644
--- a/src/core/elf.h
+++ b/src/core/elf.h
@@ -109,13 +109,20 @@ int elf_load(const char *path, elf_info_t *info);
  * Also copies program headers into guest memory for AT_PHDR.
  * load_base is added to all virtual addresses (0 for ET_EXEC at link addr,
  * non-zero for ET_DYN loaded at a chosen base).
+ * infra_lo and infra_hi delimit the runtime infra reserve (page-table pool,
+ * shim text, shim_data, vDSO). Any PT_LOAD or PT_PHDR copy whose destination
+ * intersects [infra_lo, infra_hi) is rejected: those writes go through
+ * host_base directly and would otherwise bypass the EL1-only page-table
+ * protection on shim_data. Pass 0,0 only when the guest_t is not yet built.
  * Returns 0 on success, -1 on failure.
  */
 int elf_map_segments(const elf_info_t *info,
                      const char *path,
                      void *guest_base,
                      uint64_t guest_size,
-                     uint64_t load_base);
+                     uint64_t load_base,
+                     uint64_t infra_lo,
+                     uint64_t infra_hi);
 
 /* Resolve a PT_INTERP path against a sysroot directory.
  * Tries three strategies:
diff --git a/src/core/guest.c b/src/core/guest.c
index 6393b00..fa2a8a6 100644
--- a/src/core/guest.c
+++ b/src/core/guest.c
@@ -38,6 +38,7 @@
 #include <unistd.h>
 
 #include "core/guest.h"
+#include "core/startup-trace.h"
 #include "debug/log.h"
 #include "utils.h"
 #include "runtime/thread.h" /* thread_destroy_all_vcpus */
@@ -60,6 +61,7 @@ static void guest_region_clear(guest_t *g);
 #define PT_UXN (1ULL << 54)      /* Unprivileged Execute Never */
 #define PT_PXN (1ULL << 53)      /* Privileged Execute Never */
 #define PT_AP_RW_EL0 (1ULL << 6) /* AP[2:1]=01: RW at EL1, RW at EL0 */
+#define PT_AP_RW_EL1 (0ULL << 6) /* AP[2:1]=00: RW at EL1, no access EL0 */
 #define PT_AP_RO (3ULL << 6)     /* AP[2:1]=11: RO at EL1, RO at EL0 */
 
 /* PAGE_SIZE / ALIGN_2MB_* live in utils.h; BLOCK_2MIB lives in core/guest.h. */
@@ -202,6 +204,8 @@ static uint64_t *pt_at(const guest_t *g, uint64_t gpa)
 
 int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
 {
+    uint64_t t0;
+
     memset(g, 0, sizeof(*g));
     g->shm_fd = -1;
     g->ipa_base = GUEST_IPA_BASE;
@@ -257,6 +261,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
      * seconds max wait) to handle this gracefully.
      */
     hv_return_t ret = HV_ERROR;
+    t0 = startup_trace_now_ns();
     for (int attempt = 0; attempt < 30; attempt++) {
         hv_vm_config_t config = hv_vm_config_create();
         hv_vm_config_set_ipa_size(config, vm_ipa);
@@ -266,6 +271,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
             break;
         usleep(500000); /* 500ms between attempts */
     }
+    startup_trace_step("hv_vm_create", t0);
     if (ret != HV_SUCCESS) {
         log_error("guest: hv_vm_create failed: %d (ipa_bits=%u)", (int) ret,
                   vm_ipa);
@@ -307,8 +313,10 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
          * physical memory. Do NOT memset because that would touch every
          * page and defeat demand paging.
          */
+        t0 = startup_trace_now_ns();
         g->host_base = mmap(NULL, try_size, PROT_READ | PROT_WRITE,
                             MAP_ANON | MAP_PRIVATE, -1, 0);
+        startup_trace_step("primary_mmap", t0);
         if (g->host_base == MAP_FAILED) {
             perror("guest: mmap");
             g->host_base = NULL;
@@ -320,6 +328,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
          * path instead of SCM_RIGHTS fd passing.
          */
         char tmppath[] = "/tmp/elfuse-XXXXXX";
+        t0 = startup_trace_now_ns();
         int sfd = mkstemp(tmppath);
         if (sfd >= 0) {
             unlink(tmppath); /* Unlink immediately; fd keeps file alive */
@@ -335,9 +344,12 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
                 close(sfd);
             }
         }
+        startup_trace_step("cow_shm_upgrade", t0);
 
+        t0 = startup_trace_now_ns();
         ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, try_size,
                         HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC);
+        startup_trace_step("hv_vm_map", t0);
         if (ret == HV_SUCCESS) {
             mapped_size = try_size;
             mapped = true;
@@ -380,6 +392,8 @@ int guest_init_from_shm(guest_t *g,
                         uint64_t size,
                         uint32_t ipa_bits)
 {
+    uint64_t t0;
+
     memset(g, 0, sizeof(*g));
     g->shm_fd = -1; /* Child does not own the shm */
     g->ipa_base = GUEST_IPA_BASE;
@@ -403,8 +417,10 @@ int guest_init_from_shm(guest_t *g,
      * the parent's frozen snapshot; writes are private to this process.
      * macOS CoW is page-granular: only modified pages are duplicated.
      */
+    t0 = startup_trace_now_ns();
     g->host_base =
         mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, shm_fd, 0);
+    startup_trace_step("shm_mmap", t0);
     if (g->host_base == MAP_FAILED) {
         perror("guest: mmap shm");
         g->host_base = NULL;
@@ -417,6 +433,7 @@ int guest_init_from_shm(guest_t *g,
 
     /* Create HVF VM with the same IPA width as the parent */
     hv_return_t ret = HV_ERROR;
+    t0 = startup_trace_now_ns();
     for (int attempt = 0; attempt < 30; attempt++) {
         hv_vm_config_t config = hv_vm_config_create();
         hv_vm_config_set_ipa_size(config, ipa_bits);
@@ -426,6 +443,7 @@ int guest_init_from_shm(guest_t *g,
             break;
         usleep(500000);
     }
+    startup_trace_step("hv_vm_create_shm", t0);
     if (ret != HV_SUCCESS) {
         log_error("guest: hv_vm_create (shm) failed: %d", (int) ret);
         munmap(g->host_base, size);
@@ -433,8 +451,10 @@ int guest_init_from_shm(guest_t *g,
         return -1;
     }
 
+    t0 = startup_trace_now_ns();
     ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, size,
                     HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC);
+    startup_trace_step("hv_vm_map_shm", t0);
     if (ret != HV_SUCCESS) {
         log_error("guest: hv_vm_map (shm) failed: %d", (int) ret);
         hv_vm_destroy();
@@ -1106,6 +1126,16 @@ static int gva_translate_perm(const guest_t *g,
             return -1;
 
         int perms = desc_to_perms(l3[l3_idx]);
+        /* EL1-only pages (shim_data) are inaccessible to guest EL0 in the
+         * page tables; the host accessors that act on a guest-supplied GVA
+         * must refuse them too, otherwise a guest could pass a shim_data
+         * GVA as a syscall buffer and have the host write into the identity
+         * cache or entropy ring on its behalf. The host's own publishers
+         * use direct host_base+shim_data_base arithmetic and bypass this
+         * walker entirely.
+         */
+        if (perms & MEM_PERM_EL1_ONLY)
+            return -1;
         if ((perms & required_perms) != required_perms)
             return -1;
 
@@ -1136,6 +1166,12 @@ static int gva_translate_perm(const guest_t *g,
 
     /* L2 block descriptor: 2MiB granularity. */
     int perms = desc_to_perms(l2[l2_idx]);
+    /* See the L3 page-descriptor branch above: EL1-only blocks are
+     * inaccessible to host-on-behalf-of-guest accesses for the same
+     * reason. shim_data is mapped as a 2MiB EL1-only block at boot.
+     */
+    if (perms & MEM_PERM_EL1_ONLY)
+        return -1;
     if ((perms & required_perms) != required_perms)
         return -1;
 
@@ -2079,10 +2115,20 @@ static uint64_t make_block_desc(uint64_t gpa, int perms)
     }
 
     /* Write permissions via AP bits:
+     * AP[2:1]=00 -> RW for EL1 only (no EL0 access)
      * AP[2:1]=01 -> RW for EL1 and EL0
      * AP[2:1]=11 -> RO for EL1 and EL0
+     * MEM_PERM_EL1_ONLY drops EL0 access entirely; used for shim_data
+     * so the guest cannot directly read or store to the cache, ring,
+     * bitmap, or attention flag.
      */
-    if (perms & MEM_PERM_W) {
+    if (perms & MEM_PERM_EL1_ONLY) {
+        desc |= PT_AP_RW_EL1;
+        /* EL1-only data: never executable at either EL (UXN|PXN already
+         * set above when MEM_PERM_X is unset; force them here defensively).
+         */
+        desc |= PT_UXN | PT_PXN;
+    } else if (perms & MEM_PERM_W) {
         desc |= PT_AP_RW_EL0;
     } else {
         desc |= PT_AP_RO;
@@ -2513,22 +2559,35 @@ static uint64_t make_page_desc(uint64_t pa, int perms)
     if (!(perms & MEM_PERM_X))
         desc |= PT_UXN | PT_PXN;
 
-    if (perms & MEM_PERM_W)
+    if (perms & MEM_PERM_EL1_ONLY) {
+        desc |= PT_AP_RW_EL1;
+        desc |= PT_UXN | PT_PXN; /* EL1-only data never executes */
+    } else if (perms & MEM_PERM_W) {
         desc |= PT_AP_RW_EL0;
-    else
+    } else {
         desc |= PT_AP_RO;
+    }
 
     return desc;
 }
 
-/* Extract MEM_PERM_* flags from a page table descriptor (block or page). */
+/* Extract MEM_PERM_* flags from a page table descriptor (block or page).
+ * The AP[2:1] field encodes the EL1/EL0 access matrix; map 00 to
+ * MEM_PERM_RW | MEM_PERM_EL1_ONLY so callers see the privileged-only
+ * shim_data slots correctly instead of treating them as read-only.
+ */
 static int desc_to_perms(uint64_t desc)
 {
     int perms = MEM_PERM_R;
     if (!(desc & PT_UXN))
         perms |= MEM_PERM_X;
-    if ((desc & (3ULL << 6)) == PT_AP_RW_EL0)
+    uint64_t ap = desc & (3ULL << 6);
+    if (ap == PT_AP_RW_EL0) {
         perms |= MEM_PERM_W;
+    } else if (ap == PT_AP_RW_EL1) {
+        perms |= MEM_PERM_W | MEM_PERM_EL1_ONLY;
+    }
+    /* PT_AP_RO (11) stays MEM_PERM_R only. */
     return perms;
 }
 
diff --git a/src/core/guest.h b/src/core/guest.h
index 5429392..11d05bf 100644
--- a/src/core/guest.h
+++ b/src/core/guest.h
@@ -127,20 +127,28 @@
 #define MEM_PERM_R (1 << 0)
 #define MEM_PERM_W (1 << 1)
 #define MEM_PERM_X (1 << 2)
+/* AP[2:1]=00: privileged-only (no EL0 read/write). Combine with MEM_PERM_R/W.
+ * Used for shim_data so the guest cannot directly read or store to the identity
+ * cache, urandom bitmap, ring, or attention flag. The EL1 shim still has full
+ * RW. EL0 reads/writes fault to the EL0-fault path (SIGSEGV in the guest),
+ * matching what Linux does for kernel-only pages exposed in /proc/self/maps.
+ */
+#define MEM_PERM_EL1_ONLY (1 << 3)
 #define MEM_PERM_RX (MEM_PERM_R | MEM_PERM_X)
 #define MEM_PERM_RW (MEM_PERM_R | MEM_PERM_W)
+#define MEM_PERM_RW_EL1_ONLY (MEM_PERM_R | MEM_PERM_W | MEM_PERM_EL1_ONLY)
 
 /* A contiguous region of guest memory to be mapped in page tables.
  *
- * Default mode (va_base == 0): identity-mapped, VA == GPA. Used by every
- * boot region (shim, vDSO, brk, stack) and every aarch64 ELF segment.
+ * Default mode (va_base == 0): identity-mapped, VA == GPA. Used by every boot
+ * region (shim, vDSO, brk, stack) and every aarch64 ELF segment.
  *
- * Rosetta segments use va_base != 0 to install a non-identity mapping:
- * the rosetta ELF is statically linked at 0x800000000000 (128 TiB) but its
- * bytes live in the primary buffer at a low GPA. Page-table entries are
- * indexed by va_base + (offset within region) and emit a block descriptor
- * whose output address is gpa_start + (offset within region). This is the
- * only place in elfuse where guest VA diverges from guest GPA.
+ * Rosetta segments use va_base != 0 to install a non-identity mapping: the
+ * rosetta ELF is statically linked at 0x800000000000 (128 TiB) but its bytes
+ * live in the primary buffer at a low GPA. Page-table entries are indexed by
+ * va_base + (offset within region) and emit a block descriptor whose output
+ * address is gpa_start + (offset within region). This is the only place in
+ * elfuse where guest VA diverges from guest GPA.
  */
 typedef struct {
     uint64_t gpa_start; /* Output GPA / IPA (2MiB aligned) */
diff --git a/src/core/rosetta.c b/src/core/rosetta.c
index 32588b4..4b3a986 100644
--- a/src/core/rosetta.c
+++ b/src/core/rosetta.c
@@ -268,8 +268,10 @@ int rosetta_prepare(guest_t *g,
          * binaries: uint64_t arithmetic, two's-complement intentional.
          */
         uint64_t load_base = guest_base - va_base;
+        uint64_t infra_lo = g->interp_base - INFRA_RESERVE;
+        uint64_t infra_hi = g->interp_base;
         if (elf_map_segments(ri, ROSETTA_PATH, g->host_base, g->guest_size,
-                             load_base) < 0) {
+                             load_base, infra_lo, infra_hi) < 0) {
             log_error("rosetta: elf_map_segments failed");
             return -1;
         }
@@ -316,8 +318,10 @@ int rosetta_prepare(guest_t *g,
          */
         guest_base = g->rosetta_guest_base;
         uint64_t load_base = guest_base - va_base;
+        uint64_t infra_lo = g->interp_base - INFRA_RESERVE;
+        uint64_t infra_hi = g->interp_base;
         if (elf_map_segments(ri, ROSETTA_PATH, g->host_base, g->guest_size,
-                             load_base) < 0) {
+                             load_base, infra_lo, infra_hi) < 0) {
             log_error("rosetta: re-entry elf_map_segments failed");
             return -1;
         }
@@ -469,7 +473,7 @@ int rosetta_finalize(guest_t *g,
      * goto fail must be introduced below, or the fail handler would
      * double-close it.
      */
-    int bin_guest_fd = fd_alloc_at(3, FD_REGULAR, bin_host_fd);
+    int bin_guest_fd = fd_alloc_at(3, FD_REGULAR, bin_host_fd, NULL);
     if (bin_guest_fd < 0) {
         log_error("rosetta_finalize: fd_alloc_at(3) failed");
         goto fail;
diff --git a/src/core/shim-globals.c b/src/core/shim-globals.c
new file mode 100644
index 0000000..eaf0bf9
--- /dev/null
+++ b/src/core/shim-globals.c
@@ -0,0 +1,361 @@
+/* EL1 shim globals -- host publisher.
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * See core/shim-globals.h for the cache layout, threat model, and
+ * memory-ordering rules. This file implements the host-side publish
+ * and TPIDR_EL1 setup helpers. The shim assembly side is in
+ * src/core/shim.S.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sched.h>
+
+#include "hvutil.h"
+#include "core/guest.h"
+#include "core/shim-globals.h"
+#include "core/vdso.h"
+#include "debug/log.h"
+#include "runtime/thread.h"
+#include "syscall/abi.h"
+#include "syscall/fd.h"
+#include "syscall/internal.h"
+#include "syscall/proc.h"
+#include "syscall/signal.h"
+
+#ifndef HV_SYS_REG_TPIDR_EL1
+/* Older SDKs (e.g., the Nix-pinned apple-sdk-14.4) may lack the
+ * enumerator. The encoding is stable: op0=3, op1=0, CRn=13, CRm=0,
+ * op2=4 -> 0xc684. Mirrors the existing ACTLR_EL1 workaround in
+ * src/syscall/syscall.c.
+ */
+#define HV_SYS_REG_TPIDR_EL1 ((hv_sys_reg_t) 0xc684)
+#endif
+
+#ifndef HV_SYS_REG_CONTEXTIDR_EL1
+/* op0=3, op1=0, CRn=13, CRm=0, op2=1 -> 0xc681. Same SDK-fallback
+ * pattern as TPIDR_EL1.
+ */
+#define HV_SYS_REG_CONTEXTIDR_EL1 ((hv_sys_reg_t) 0xc681)
+#endif
+
+/* shim.S hard-codes these offsets and sizes in its urandom-read
+ * fast path; if they drift here the shim reads from the wrong
+ * place. Catch the drift at compile time.
+ */
+_Static_assert(SHIM_URANDOM_OFF_BITMAP == 0x38,
+               "shim.S urandom fast path hard-codes BITMAP off 0x38");
+_Static_assert(SHIM_URANDOM_OFF_RING_HEAD == 0xB8,
+               "shim.S urandom fast path hard-codes RING_HEAD off 0xB8");
+_Static_assert(SHIM_URANDOM_OFF_RING_TAIL == 0xBC,
+               "shim.S urandom fast path hard-codes RING_TAIL off 0xBC");
+_Static_assert(SHIM_URANDOM_OFF_RING == 0xC0,
+               "shim.S urandom fast path hard-codes RING off 0xC0");
+_Static_assert(SHIM_URANDOM_RING_SIZE == 4096,
+               "shim.S urandom fast path hard-codes RING_SIZE 4096");
+_Static_assert(SHIM_URANDOM_OFF_RING_LOCK == 0x10C0,
+               "shim.S urandom fast path hard-codes RING_LOCK off 0x10C0");
+_Static_assert(FD_TABLE_SIZE == 1024,
+               "shim.S urandom fast path hard-codes FD_TABLE_SIZE 1024");
+
+static uint8_t *cache_base(const guest_t *g)
+{
+    /* The cache lives at the start of the shim_data block, which is
+     * mapped into the host buffer at host_base + shim_data_base.
+     * Direct buffer access bypasses the guest-page-table walk used by
+     * guest_ptr, which is intentional: the host owns shim_data
+     * unconditionally.
+     */
+    return (uint8_t *) g->host_base + g->shim_data_base;
+}
+
+static void store_u64(uint8_t *page, uint32_t off, uint64_t value)
+{
+    uint64_t *slot = (uint64_t *) (page + off);
+    __atomic_store_n(slot, value, __ATOMIC_RELEASE);
+}
+
+static void urandom_ring_lock(uint32_t *lock_p)
+{
+    while (__atomic_exchange_n(lock_p, 1, __ATOMIC_ACQUIRE) != 0)
+        sched_yield();
+}
+
+static void urandom_ring_unlock(uint32_t *lock_p)
+{
+    __atomic_store_n(lock_p, 0, __ATOMIC_RELEASE);
+}
+
+void shim_globals_init(guest_t *g)
+{
+    memset(cache_base(g), 0, SHIM_GLOBALS_SIZE);
+}
+
+void shim_globals_publish_pid(guest_t *g, int64_t pid, int64_t ppid)
+{
+    uint8_t *page = cache_base(g);
+    store_u64(page, SHIM_IDENTITY_OFF_PID, (uint64_t) pid);
+    store_u64(page, SHIM_IDENTITY_OFF_PPID, (uint64_t) ppid);
+}
+
+void shim_globals_publish_creds(guest_t *g,
+                                uint32_t uid,
+                                uint32_t euid,
+                                uint32_t gid,
+                                uint32_t egid)
+{
+    uint8_t *page = cache_base(g);
+    store_u64(page, SHIM_IDENTITY_OFF_UID, uid);
+    store_u64(page, SHIM_IDENTITY_OFF_EUID, euid);
+    store_u64(page, SHIM_IDENTITY_OFF_GID, gid);
+    store_u64(page, SHIM_IDENTITY_OFF_EGID, egid);
+}
+
+uint64_t shim_globals_gva(const guest_t *g)
+{
+    return g->shim_data_base;
+}
+
+int shim_globals_self_test(hv_vcpu_t vcpu)
+{
+    const uint64_t sentinel = 0xCAFEBABEDEADBEEFULL;
+    hv_return_t r = hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL1, sentinel);
+    if (r != HV_SUCCESS) {
+        log_error("shim_globals: TPIDR_EL1 set failed (hv_return=0x%x)", r);
+        return -1;
+    }
+    uint64_t probe = 0;
+    r = hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL1, &probe);
+    if (r != HV_SUCCESS) {
+        log_error("shim_globals: TPIDR_EL1 get failed (hv_return=0x%x)", r);
+        return -1;
+    }
+    if (probe != sentinel) {
+        log_error(
+            "shim_globals: TPIDR_EL1 round-trip mismatch: wrote 0x%llx, "
+            "read 0x%llx",
+            (unsigned long long) sentinel, (unsigned long long) probe);
+        return -1;
+    }
+    return 0;
+}
+
+int shim_globals_install_tpidr(hv_vcpu_t vcpu, const guest_t *g)
+{
+    hv_return_t r =
+        hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL1, shim_globals_gva(g));
+    if (r != HV_SUCCESS) {
+        log_error("shim_globals: install TPIDR_EL1 failed (hv_return=0x%x)", r);
+        return -1;
+    }
+    return 0;
+}
+
+int shim_globals_install_tid(hv_vcpu_t vcpu, int64_t tid)
+{
+    hv_return_t r =
+        hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_CONTEXTIDR_EL1, (uint64_t) tid);
+    if (r != HV_SUCCESS) {
+        log_error(
+            "shim_globals: install CONTEXTIDR_EL1 (tid=%lld) failed "
+            "(hv_return=0x%x)",
+            (long long) tid, r);
+        return -1;
+    }
+    return 0;
+}
+
+int shim_globals_install_per_vcpu(hv_vcpu_t vcpu, const guest_t *g, int64_t tid)
+{
+    if (shim_globals_install_tpidr(vcpu, g) < 0)
+        return -1;
+    return shim_globals_install_tid(vcpu, tid);
+}
+
+/* Singleton guest pointer for the urandom-bitmap hooks called from
+ * the fd table. elfuse runs one VM per process so a single global is
+ * correct; the NULL-or-same-g assertion catches a lifecycle bug.
+ * Mirrors the pattern signal.c uses for the attention-flag singleton.
+ */
+static guest_t *singleton_g;
+
+void shim_globals_set_singleton(guest_t *g)
+{
+    if (g != NULL && singleton_g != NULL && singleton_g != g) {
+        log_error(
+            "shim_globals: singleton already registered to %p, "
+            "refusing to re-register with %p",
+            (void *) singleton_g, (void *) g);
+        return;
+    }
+    singleton_g = g;
+}
+
+void shim_globals_reset_singleton(void)
+{
+    singleton_g = NULL;
+}
+
+static uint64_t *urandom_bitmap_word(int fd)
+{
+    if (!singleton_g)
+        return NULL;
+    if (fd < 0 || fd >= FD_TABLE_SIZE)
+        return NULL;
+    uint8_t *base = cache_base(singleton_g) + SHIM_URANDOM_OFF_BITMAP;
+    return (uint64_t *) base + (fd / 64);
+}
+
+void shim_globals_mark_urandom_fd(int fd, bool is_urandom)
+{
+    uint64_t *word = urandom_bitmap_word(fd);
+    if (!word)
+        return;
+    uint64_t mask = (uint64_t) 1 << (fd & 63);
+    if (is_urandom)
+        __atomic_fetch_or(word, mask, __ATOMIC_RELEASE);
+    else
+        __atomic_fetch_and(word, ~mask, __ATOMIC_RELEASE);
+}
+
+void shim_globals_rebuild_urandom_bitmap(void)
+{
+    if (!singleton_g)
+        return;
+    /* Wipe the bitmap region first; concurrent fd_alloc / close from
+     * other vCPUs is impossible during fork-child init (the child has
+     * not yet started executing guest code), so a non-atomic memset
+     * is safe here.
+     */
+    memset(cache_base(singleton_g) + SHIM_URANDOM_OFF_BITMAP, 0,
+           SHIM_URANDOM_BITMAP_BYTES);
+    /* Walk the fd table; mark every readable FD_URANDOM slot. Reuses
+     * the atomic-OR setter so the visible memory order matches the
+     * normal fd_alloc path.
+     */
+    for (int fd = 0; fd < FD_TABLE_SIZE; fd++) {
+        fd_refresh_urandom_bitmap(fd);
+    }
+}
+
+/* arc4random_buf is documented as deadlock-free and re-entrant. Used
+ * by both the initial fill at bootstrap and by the slow-path refill
+ * that runs from sys_read when the shim's fast path falls through due
+ * to an empty ring.
+ */
+void shim_globals_refill_urandom_ring(guest_t *g)
+{
+    uint8_t *base = cache_base(g);
+    uint32_t *head_p = (uint32_t *) (base + SHIM_URANDOM_OFF_RING_HEAD);
+    uint32_t *tail_p = (uint32_t *) (base + SHIM_URANDOM_OFF_RING_TAIL);
+    uint32_t *lock_p = (uint32_t *) (base + SHIM_URANDOM_OFF_RING_LOCK);
+    uint8_t *ring = base + SHIM_URANDOM_OFF_RING;
+
+    urandom_ring_lock(lock_p);
+
+    uint32_t head = __atomic_load_n(head_p, __ATOMIC_ACQUIRE);
+    uint32_t tail = __atomic_load_n(tail_p, __ATOMIC_RELAXED);
+    uint32_t fill = tail - head;
+    if (fill >= SHIM_URANDOM_RING_SIZE)
+        goto out; /* already full */
+    uint32_t to_fill = SHIM_URANDOM_RING_SIZE - fill;
+
+    /* Producer writes from ring[tail & (SIZE-1)] forward, wrapping
+     * once when needed. Two memcpys at most.
+     */
+    uint32_t pos = tail & (SHIM_URANDOM_RING_SIZE - 1);
+    uint32_t first = SHIM_URANDOM_RING_SIZE - pos;
+    if (first > to_fill)
+        first = to_fill;
+    arc4random_buf(ring + pos, first);
+    if (to_fill > first)
+        arc4random_buf(ring, to_fill - first);
+
+    /* Release-store the new tail so any fast-path consumer that loads
+     * tail with an acquiring read sees the bytes already in the ring.
+     */
+    __atomic_store_n(tail_p, tail + to_fill, __ATOMIC_RELEASE);
+
+out:
+    urandom_ring_unlock(lock_p);
+}
+
+/* Bitmask helpers. The slot lives at SHIM_GLOBALS_OFF_ATTN as a
+ * uint32; ATTN_BIT_SIGTIMER and ATTN_BIT_CRED partition ownership so
+ * the signal/timer lane and the cred-publish lane cannot clobber
+ * each other.
+ */
+void shim_globals_attn_or(guest_t *g, uint32_t bits)
+{
+    uint32_t *slot = (uint32_t *) (cache_base(g) + SHIM_GLOBALS_OFF_ATTN);
+    /* SEQ_CST, not ACQ_REL. The CRED_BRACKETED invariant is the
+     * contrapositive of release-acquire: 'if a sibling vCPU LDAR-loads
+     * attn and sees 0, that sibling also does not yet observe any of
+     * the post-OR publish_creds stores.' Acquire-release only guarantees
+     * the forward direction (if you see the OR, you see prior stores);
+     * the contrapositive needs a total order across atomics, which on
+     * ARM64 SEQ_CST provides via DMB ISH. The OR runs only on rare
+     * setuid/setgid/etc paths so the extra barrier is not a hot-path
+     * cost. shim_globals_attn_and stays RELEASE because it runs after
+     * publish_creds and only needs to order those prior stores before
+     * the clear.
+     */
+    __atomic_fetch_or(slot, bits, __ATOMIC_SEQ_CST);
+    vdso_attention_or(g, bits);
+}
+
+void shim_globals_attn_and(guest_t *g, uint32_t mask)
+{
+    uint32_t *slot = (uint32_t *) (cache_base(g) + SHIM_GLOBALS_OFF_ATTN);
+    /* RELEASE is sufficient for the clear path: the bracket runs
+     * publish_creds BEFORE this clear, and RELEASE here pairs with the
+     * shim's LDAR so any sibling that observes the cleared bit also sees
+     * the published cred slots.
+     */
+    __atomic_fetch_and(slot, mask, __ATOMIC_RELEASE);
+    vdso_attention_and(g, mask);
+}
+
+void shim_globals_raise_attention(guest_t *g)
+{
+    /* Signal/timer/exit-group lane. OR-only update so a concurrent
+     * cred publish's ATTN_BIT_CRED stays set. The SEQ_CST fetch-or in
+     * shim_globals_attn_or pairs with the shim's LDAR on the same slot.
+     */
+    shim_globals_attn_or(g, ATTN_BIT_SIGTIMER);
+
+    /* Kick any vCPU spinning in EL0 on the identity fast path. Without
+     * the exit, the spinning vCPU never traps into EL1 and never
+     * reads the new attention value, so a SIGALRM queued for it
+     * waits until its host-thread timeslice ends. Reusing the
+     * existing signal-preemption helper (which iterates the live
+     * vCPU set under thread_lock) avoids duplicating the iteration
+     * logic; on a single-vCPU guest the loop is essentially a no-op.
+     */
+    thread_interrupt_all();
+}
+
+void shim_globals_recompute_attention(guest_t *g)
+{
+    /* Only owns the SIGTIMER lane; CRED and TRACE stay untouched so a
+     * concurrent setuid/setgid bracket or persistent verbose-tracing gate
+     * cannot be undone by the HVC #5 epilogue dropping signal attention.
+     * Set or clear ATTN_BIT_SIGTIMER atomically.
+     */
+    bool need = proc_exit_group_requested() || signal_attention_needed();
+    if (need)
+        shim_globals_attn_or(g, ATTN_BIT_SIGTIMER);
+    else
+        shim_globals_attn_and(g, ~ATTN_BIT_SIGTIMER);
+}
+
+void shim_globals_set_trace_enabled(guest_t *g, bool enabled)
+{
+    if (enabled)
+        shim_globals_attn_or(g, ATTN_BIT_TRACE);
+    else
+        shim_globals_attn_and(g, ~ATTN_BIT_TRACE);
+}
diff --git a/src/core/shim-globals.h b/src/core/shim-globals.h
new file mode 100644
index 0000000..8e1a389
--- /dev/null
+++ b/src/core/shim-globals.h
@@ -0,0 +1,308 @@
+/* EL1 shim globals (identity cache + attention flag)
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * A small struct of host-published values that the EL1 shim consumes
+ * to serve identity syscalls (getpid 172, getppid 173, getuid 174,
+ * geteuid 175, getgid 176, getegid 177) without an HVC round-trip.
+ *
+ * The cache lives at the start of the shim_data block (high IPA,
+ * inside the infra reserve). Three layered protections keep guest
+ * EL0 code from MAP_FIXED / MREMAP / MADVISE-spoofing the cache:
+ *
+ *   - sys_mmap MAP_FIXED rejects ranges hitting infra
+ *   - sys_munmap and sys_mprotect reject infra ranges
+ *   - sys_mremap (all variants) and sys_madvise reject infra ranges
+ *
+ * Direct EL0 access to the cache GVA is also blocked: the shim_data
+ * block is mapped with MEM_PERM_EL1_ONLY (AP[2:1]=00, RW at EL1, no
+ * EL0 access), so a guest load or store to the cache base faults
+ * even though /proc/self/maps exposes [shim-data]. The host's
+ * GVA-translating accessors refuse EL1-only pages as well, so the
+ * guest cannot hand a shim_data address to a syscall as a buffer and
+ * have the host write the cache on its behalf. The elfuse threat
+ * model treats the guest as the user's own binary, not adversarial,
+ * so these checks are defense in depth.
+ *
+ * The shim addresses the cache via TPIDR_EL1, which the host sets at
+ * every vCPU init point (bootstrap, fork-child, CLONE_THREAD, exec
+ * re-init). TPIDR_EL1 is unused by elfuse aside from this and is not
+ * trapped under default HCR_EL2 settings at EL1.
+ *
+ * Memory ordering: each publish uses __ATOMIC_RELEASE. The shim reads
+ * the attention flag with LDAR (acquire) to pair with the release.
+ * Identity slot reads stay plain LDR -- each is independent and
+ * naturally-aligned 64-bit loads are single-copy atomic on AArch64.
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "core/guest.h"
+
+/* Layout within shim_data_base (offsets are bytes from the cache base
+ * which equals shim_data_base; the shim's TPIDR_EL1 holds exactly this
+ * address).
+ *
+ * Attention flag sits at offset 0 so the shim's LDAR (which only
+ * supports a register base with no immediate offset) can load it via
+ * 'ldar w_, [x12]' where x12 = mrs tpidr_el1. Identity slots follow
+ * 8-byte-aligned, with PID at offset 0x08; the shim adds 8 to the
+ * base and then indexes by (X8 - 172) * 8 to land on the requested
+ * slot. Attention=0 takes the fast path; nonzero forces HVC.
+ *
+ * Slice A ships attention as always-zero (the setter API exists but
+ * is only called from cred publish in Slice B). The fast path is
+ * gated already so Slice B can wire signal_queue / setitimer / exit-
+ * group setters without further shim changes.
+ */
+#define SHIM_GLOBALS_OFF_ATTN 0x00
+
+/* Attention is a bitmask, not a boolean. Splitting it by owner lets the
+ * HVC #5 epilogue's recompute (which polls signal/itimer state) coexist
+ * with the cred-publish bracket without clobbering it. The shim still
+ * does a single cbnz on the whole word: any bit set forces the slow
+ * path. Bit ownership keeps recompute and cred bracket independent.
+ *
+ *   ATTN_BIT_SIGTIMER   owned by signal_queue / setitimer / exit_group
+ *                       and signal_check_timer's recompute. Set when
+ *                       a signal is pending or an itimer is armed.
+ *   ATTN_BIT_CRED       owned by CRED_BRACKETED in setuid/setgid
+ *                       wrappers. Set across the four-slot publish
+ *                       window so concurrent shim readers fall back
+ *                       to HVC and see _Atomic-coherent host values.
+ *   ATTN_BIT_TRACE      owned by --verbose syscall tracing. Set for the
+ *                       lifetime of a verbose run so EL1 shim fast paths
+ *                       fall back to HVC and syscall_dispatch can log them.
+ *
+ * Earlier revisions used a single boolean: a sibling vCPU's recompute
+ * dropping it to zero mid-publish reopened the torn-cred window the
+ * bracket was meant to close.
+ */
+#define ATTN_BIT_SIGTIMER 0x00000001u
+#define ATTN_BIT_CRED 0x00000002u
+#define ATTN_BIT_TRACE 0x00000004u
+
+#define SHIM_IDENTITY_BASE 0x08
+#define SHIM_IDENTITY_OFF_PID 0x08
+#define SHIM_IDENTITY_OFF_PPID 0x10
+#define SHIM_IDENTITY_OFF_UID 0x18
+#define SHIM_IDENTITY_OFF_EUID 0x20
+#define SHIM_IDENTITY_OFF_GID 0x28
+#define SHIM_IDENTITY_OFF_EGID 0x30
+
+/* Urandom fast path (Slice D / P3): closes the /dev/urandom 1B read
+ * band PR #48 left at the HVF round-trip floor.
+ *
+ * Layout (continues from the identity section):
+ *   0x38 .. 0xB7   URANDOM_FD_BITMAP   128 bytes = 1024 bits = FD_TABLE_SIZE
+ *   0xB8 .. 0xBB   URANDOM_RING_HEAD   uint32, consumer cursor (atomic)
+ *   0xBC .. 0xBF   URANDOM_RING_TAIL   uint32, producer cursor (host-only)
+ *   0xC0 .. 0x10BF URANDOM_RING        4096-byte CSPRNG ring
+ *   0x10C0..0x10C3 URANDOM_RING_LOCK   uint32, producer/consumer lock
+ *
+ * The bitmap is bit N == 1 iff guest fd N currently refers to an
+ * FD_URANDOM-typed entry. The shim's read fast path consults this
+ * before serving from the ring; any other fd type falls through to
+ * HVC. Host maintains the bitmap from fd_alloc / fd_mark_closed.
+ *
+ * Ring head/tail are byte counters that grow monotonically (uint32);
+ * fill = tail - head (uint32 subtract) is the available byte count,
+ * pos = head & (URANDOM_RING_SIZE - 1) is the index in the ring.
+ * Both cursors are atomic. The shim advances head via LDXR/STXR; the
+ * host advances tail via release-store after writing fresh entropy.
+ * The producer and shim consumer also take RING_LOCK while touching the
+ * ring so the host cannot overwrite a slice after the shim reserves it
+ * but before the EL1 copy has loaded it.
+ *
+ * Size must be a power of two so the index mask is AND of (SIZE - 1).
+ */
+#define SHIM_URANDOM_OFF_BITMAP 0x0038
+#define SHIM_URANDOM_BITMAP_BYTES 128
+#define SHIM_URANDOM_OFF_RING_HEAD 0x00B8
+#define SHIM_URANDOM_OFF_RING_TAIL 0x00BC
+#define SHIM_URANDOM_OFF_RING 0x00C0
+#define SHIM_URANDOM_RING_SIZE 4096
+#define SHIM_URANDOM_OFF_RING_LOCK 0x10C0
+
+#define SHIM_GLOBALS_SIZE 0x10C4
+
+/* Initialize the cache region to all-zero. Called once per process at
+ * the same time the shim_data block is set up (initial bootstrap and
+ * fork-child). The initial attention=0 means the shim takes the fast
+ * path until a setter raises it.
+ */
+void shim_globals_init(guest_t *g);
+
+/* Publish pid and ppid with one 64-bit release-store per slot; the pair
+ * is not updated as a unit. Called at process init, after fork-child
+ * identity is installed, and after any future PID/PPID mutation. Both are
+ * int64 to match proc_get_pid/proc_get_ppid and are stored as raw 64 bits.
+ */
+void shim_globals_publish_pid(guest_t *g, int64_t pid, int64_t ppid);
+
+/* Publish all four credential slots. Slot writes are independent
+ * 64-bit atomic stores; concurrent shim reads on another vCPU may
+ * see partial updates. Slice B's attention bracket eliminates that
+ * race; until then, callers must accept that a concurrent
+ * getuid+geteuid sequence on a different vCPU can witness a torn
+ * cred set across a setresuid moment. Linux semantics require an
+ * atomic cred swap; bracket via attention closes that gap.
+ */
+void shim_globals_publish_creds(guest_t *g,
+                                uint32_t uid,
+                                uint32_t euid,
+                                uint32_t gid,
+                                uint32_t egid);
+
+/* GVA of the cache base. Equal to g->shim_data_base. Exposed so the
+ * TPIDR_EL1 setup site and tests can reference one source of truth.
+ */
+uint64_t shim_globals_gva(const guest_t *g);
+
+/* Pre-flight validation that hv_vcpu_set_sys_reg + hv_vcpu_get_sys_reg
+ * round-trip on TPIDR_EL1. Writes a sentinel and reads it back via
+ * the same HVF accessors the bootstrap uses; aborts (log_error + -1)
+ * on mismatch. ARM documents TPIDR_EL1 as ordinary EL1 thread/CPU
+ * pointer storage with no HCR trap on the EL1-side MRS/MSR.
+ *
+ * Note: this test runs BEFORE the first hv_vcpu_run; it does not
+ * verify that HVF preserves the register across vCPU run/exit
+ * boundaries. The existing test-shim-identity microbench is the
+ * end-to-end check for that property -- if HVF clobbered TPIDR_EL1,
+ * every identity-class fast path would observe a stale base and
+ * test-shim-identity would fail on the first iteration after
+ * remap_vdso_page.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int shim_globals_self_test(hv_vcpu_t vcpu);
+
+/* Install TPIDR_EL1 = shim_globals_gva(g) on a vCPU. Called from the
+ * four vCPU init sites listed in the file header.
+ */
+int shim_globals_install_tpidr(hv_vcpu_t vcpu, const guest_t *g);
+
+/* Install CONTEXTIDR_EL1 = tid for the gettid shim fast path. The
+ * register is per-vCPU and unused elsewhere in elfuse (HVF preserves
+ * it across hv_vcpu_run alongside the rest of EL1 state). The shim
+ * answers SVC #0 with X8 == 178 (gettid) by emitting a single
+ * 'mrs x0, CONTEXTIDR_EL1' and an 'eret', skipping the HVC #5
+ * round-trip the same way the identity slot loads do for syscalls
+ * 172-177. Caller passes the Linux tid; it is zero/sign-extended
+ * into the 64-bit sysreg slot.
+ *
+ * Setup sites:
+ *   bootstrap.c                  initial main thread (tid == pid)
+ *   forkipc.c fork-child main    tid == child pid
+ *   forkipc.c CLONE_THREAD       tid == thread's allocated guest_tid
+ *   forkipc.c CLONE_VM           tid == child's guest_tid
+ *
+ * sys_execve reuses the vCPU and the main thread's tid does not
+ * change across exec, so no re-set is required there.
+ */
+int shim_globals_install_tid(hv_vcpu_t vcpu, int64_t tid);
+
+/* Combined install: TPIDR_EL1 = shim_globals base, CONTEXTIDR_EL1 = tid.
+ * Used by every vCPU init site (bootstrap, fork-child main, CLONE_THREAD
+ * worker, CLONE_VM child). Returns 0 on success, -1 on either failure.
+ * sys_execve uses install_tpidr alone because the tid is unchanged
+ * across exec.
+ */
+int shim_globals_install_per_vcpu(hv_vcpu_t vcpu,
+                                  const guest_t *g,
+                                  int64_t tid);
+
+/* Attention flag setters (Slice B).
+ *
+ * The shim's identity fast path reads the attention flag with LDAR
+ * before doing anything else. When nonzero, the shim falls back to
+ * HVC #5 so the host's post-syscall epilogue can deliver any pending
+ * signal or itimer expiry.
+ *
+ * shim_globals_raise_attention ORs ATTN_BIT_SIGTIMER into the flag
+ * atomically and then calls thread_interrupt_all() so any vCPU
+ * already spinning in EL0 drops out of hv_vcpu_run and re-checks the
+ * flag on the next entry. Without the exit, a tight identity loop on
+ * one vCPU could ignore an attention raise on another vCPU until its
+ * timeslice ended.
+ *
+ * shim_globals_recompute_attention re-derives ATTN_BIT_SIGTIMER from
+ * (exit_group requested OR signal_attention_needed()). Called from
+ * the HVC #5 epilogue after signal_check_timer to clear that bit once
+ * the slow-path workload has drained; other bits are left untouched.
+ *
+ * The g pointer in both is necessary because the cache is per-guest.
+ * Slice B's signal.c hooks call these via a singleton guest pointer
+ * registered at process init (see signal_set_shim_globals_guest in
+ * src/syscall/signal.h).
+ */
+void shim_globals_raise_attention(guest_t *g);
+void shim_globals_recompute_attention(guest_t *g);
+void shim_globals_set_trace_enabled(guest_t *g, bool enabled);
+
+/* OR / AND specific attention bits without disturbing the others. Used
+ * by the CRED_BRACKETED macro to set ATTN_BIT_CRED before mutating
+ * host credentials and clear it after publish. signal_queue and the
+ * itimer setters take the ATTN_BIT_SIGTIMER lane via raise_attention
+ * and recompute_attention; --verbose owns ATTN_BIT_TRACE. The lanes do not
+ * collide.
+ */
+void shim_globals_attn_or(guest_t *g, uint32_t bits);
+void shim_globals_attn_and(guest_t *g, uint32_t mask);
+
+/* Urandom bitmap maintenance (Slice D / P3).
+ *
+ * The fd-type bitmap is updated by the fd table whenever an FD_URANDOM
+ * slot opens or closes (including dup, fork-IPC restore, etc.). The
+ * shim's read-fast-path consults the bitmap with a single 64-bit load
+ * and a bit test to decide whether the requested fd should hit the
+ * urandom ring or fall through to HVC.
+ *
+ * Updates use atomic OR/AND on the affected 64-bit word so concurrent
+ * dup races (sibling vCPU dup'ing into a freshly-opened slot) cannot
+ * lose either bit. Storing as uint64 rather than per-bit-CAS keeps
+ * the host hook trivial.
+ *
+ * shim_globals_set_singleton publishes the live guest_t * so the
+ * fd-table hooks can update the bitmap without threading g through
+ * every fd_alloc / fd_mark_closed call site. Same NULL-or-same
+ * lifecycle assertion as the signal.c singleton. Call from bootstrap
+ * (initial) and fork-child (after guest_init).
+ */
+void shim_globals_set_singleton(guest_t *g);
+
+/* Reset the singleton to NULL. Called from syscall_init() at process
+ * start so a stale parent-process pointer cannot survive across a
+ * posix_spawn fork-child re-init and silently drop bitmap updates.
+ * Mirrors signal_init()'s attention_guest=NULL reset.
+ */
+void shim_globals_reset_singleton(void);
+
+void shim_globals_mark_urandom_fd(int fd, bool is_urandom);
+
+/* Rebuild the urandom bitmap from the current fd table state. Used by
+ * the fork-child path: the inherited fd table holds the parent's
+ * FD_URANDOM slots but the child just zeroed its shim-globals via
+ * shim_globals_init, so the bitmap must be re-populated to reflect
+ * what the child actually has open. Acquires fd_lock internally.
+ */
+void shim_globals_rebuild_urandom_bitmap(void);
+
+/* Refill the entropy ring with fresh CSPRNG bytes from arc4random_buf.
+ * Called from the host's sys_read slow path when a FD_URANDOM read
+ * encounters an empty (or low-water) ring. The fill always brings tail
+ * up to head + URANDOM_RING_SIZE so the ring is full after refill.
+ *
+ * The initial fill is NOT done by shim_globals_init (which only zeros the
+ * cache). Every bring-up path that uses the urandom fast path must call
+ * this explicitly after shim_globals_init: bootstrap.c does it during VM
+ * bring-up, src/syscall/exec.c does it on execve, and src/runtime/forkipc.c
+ * does it on the fork-child receive path. Any future init site that forgets
+ * this call leaves the ring empty, so the first urandom read in that guest
+ * falls through to the host's HVC slow path.
+ */
+void shim_globals_refill_urandom_ring(guest_t *g);
diff --git a/src/core/shim.S b/src/core/shim.S
index 7c1dbd2..a2613c3 100644
--- a/src/core/shim.S
+++ b/src/core/shim.S
@@ -272,11 +272,220 @@ svc_handler:
     /* Extract SVC immediate (bits [15:0]) */
     and x11, x9, #0xFFFF
 
-    cmp x11, #0               /* SVC #0 = Linux syscall? */
-    b.eq handle_svc_0
+    /* Inverted from "b.eq handle_svc_0" so the SVC #0 fast-path
+     * dispatch can fall through without an extra branch.
+     */
+    cmp x11, #0
+    b.ne restore_and_bad
+
+    /* Identity-class fast path. X8 in [172, 179) is one of getpid (172),
+     * getppid (173), getuid (174), geteuid (175), getgid (176), getegid (177),
+     * or gettid (178). The first six read from the shim-globals cache
+     * (TPIDR_EL1 base, host-published scalar slots starting at offset 8);
+     * gettid reads its per-vCPU tid from CONTEXTIDR_EL1 directly. Layout-wise
+     * the cache has the attention flag at offset 0 (LDAR'd here to enforce the
+     * slow-path gate from Slice B and the --verbose trace gate) and six
+     * identity slots after it.
+     *
+     * Saved X8 is at [sp+64] per SAVE_GPRS. Scratch X10..X13 are restored from
+     * the frame by RESTORE_GPRS_KEEP_X0 at the named tail (svc_restore_eret),
+     * so the Linux ABI guarantee (X1..X30 preserved across SVC) is intact even
+     * on the fast path.
+     */
+    ldr x10, [sp, #64]               /* saved X8 (syscall nr) */
+    sub x11, x10, #172
+    cmp x11, #7                       /* in identity-class range? */
+    b.lo identity_class_fast          /* 172..178 -> identity / gettid */
+    cmp x10, #63                       /* SYS_read? */
+    b.eq urandom_read_fast
+    b handle_svc_0
+
+identity_class_fast:
+    mrs x12, tpidr_el1               /* shim-globals base */
+    ldar w13, [x12]                  /* attention flag, acquire */
+    cbnz w13, handle_svc_0           /* slow-path required */
+    cmp x11, #6                       /* bias == 6 ==> gettid (178) */
+    b.eq gettid_fast
+    add x12, x12, #8                 /* skip attention -> identity[0] */
+    ldr x0, [x12, x11, lsl #3]       /* identity[bias] for 172..177 */
+    b svc_restore_eret
+
+gettid_fast:
+    mrs x0, contextidr_el1            /* per-vCPU tid */
+    b svc_restore_eret
+
+    /* Urandom-read fast path (Slice D / P3). Serves
+     * read(urandom_fd, buf, len) with len in [1, 64] by popping
+     * len bytes from the shim-globals entropy ring (TPIDR_EL1 base +
+     * 0xC0) into the guest-supplied buffer (X1), advancing the ring
+     * head atomically. If the requested fd is not FD_URANDOM, or
+     * the ring is low, or the read would cross a ring-wrap boundary,
+     * falls through to handle_svc_0 so the host serves the read and
+     * refills the ring.
+     *
+     * Layout offsets (match core/shim-globals.h SHIM_URANDOM_OFF_*):
+     *   0x0038  URANDOM_FD_BITMAP   1024 bits = 128 bytes
+     *   0x00B8  RING_HEAD            uint32, atomic consumer cursor
+     *   0x00BC  RING_TAIL            uint32, host-released tail
+     *   0x00C0  RING                  4096 bytes
+     *   0x10C0  RING_LOCK             uint32
+     */
+urandom_read_fast:
+    mrs x12, tpidr_el1
+    ldar w13, [x12]                  /* attention flag */
+    cbnz w13, handle_svc_0
+
+    ldr x14, [sp, #0]                /* saved X0 = fd */
+    cmp x14, #1024                    /* FD_TABLE_SIZE */
+    b.hs handle_svc_0
+    ldr x15, [sp, #16]               /* saved X2 = len */
+    cbz x15, handle_svc_0            /* host handles len == 0 */
+    cmp x15, #64                      /* URANDOM_INLINE_LIMIT */
+    b.hi handle_svc_0
+
+    /* Bitmap test: word = fd >> 6, bit = fd & 63. */
+    add x16, x12, #0x38              /* SHIM_URANDOM_OFF_BITMAP */
+    lsr x17, x14, #6
+    ldr x17, [x16, x17, lsl #3]
+    and x18, x14, #63
+    lsr x17, x17, x18
+    tbz w17, #0, handle_svc_0
+
+    ldr x20, [sp, #8]                /* saved X1 = buf */
+    /* Probe the guest buffer for stage-1 EL0-write translations before
+     * doing any EL1 store. PROT_NONE or unmapped pages bail to the
+     * slow path here; the host's sys_read returns -EFAULT.
+     *
+     * The probe handles the STATIC case (buffer already unmapped at
+     * entry). The DYNAMIC case where a sibling vCPU munmaps the buffer
+     * in the window between probe and strb is caught later by the
+     * EL1 data abort vector routing into handle_el1_data_abort_recover
+     * (which restores ELR/SPSR and releases the lock but does NOT roll
+     * back the ring head, then returns -EFAULT). Without that recovery
+     * the EL1 strb would fault into BAD_VEC and halt the VM.
+     *
+     * len is in [1, 64]. Probing the first and last byte covers every page
+     * the inline copy can touch on Linux/AArch64, whose base page size is
+     * much larger than the inline limit.
+     */
+    at s1e0w, x20
+    isb
+    mrs x16, par_el1
+    tbnz x16, #0, urandom_slow_no_clrex
+    sub x16, x15, #1
+    adds x16, x20, x16
+    b.cs urandom_slow_no_clrex
+    at s1e0w, x16
+    isb
+    mrs x16, par_el1
+    tbnz x16, #0, urandom_slow_no_clrex
 
-    /* Unrecognized SVC; restore and report as bad exception */
-    b restore_and_bad
+    /* Serialize host refill against the shim's reserve-then-copy window.
+     * Lock word lives after the 4096-byte ring at offset 0x10C0.
+     */
+    add x19, x12, #0x1, lsl #12      /* base + 0x1000 */
+    add x19, x19, #0xC0              /* &ring_lock */
+    mov w18, #1
+urandom_lock_spin:
+    ldaxr w17, [x19]
+    cbnz w17, urandom_lock_busy
+    stxr w17, w18, [x19]
+    cbnz w17, urandom_lock_spin
+    b urandom_locked
+urandom_lock_busy:
+    clrex
+    yield
+    b urandom_lock_spin
+
+urandom_locked:                      /* ring_lock held: reserve len bytes */
+    add x21, x12, #0xB8              /* &ring_head */
+    add x22, x12, #0xBC              /* &ring_tail */
+0:  ldar w24, [x22]                  /* tail first (host release-store) */
+    ldxr w23, [x21]                  /* head; no mem access until STXR */
+    sub  w25, w24, w23                /* fill = tail - head */
+    cmp  w25, w15
+    b.lo urandom_clrex_slow           /* ring too low */
+    and  w26, w23, #(4096 - 1)        /* pos = head & (RING_SIZE - 1) */
+    add  w27, w26, w15
+    cmp  w27, #4096
+    b.hi urandom_clrex_slow           /* would wrap: let slow path serve */
+    add  w27, w23, w15                /* new head = head + len */
+    stxr w28, w27, [x21]              /* w28 != 0: reservation lost */
+    cbnz w28, 0b                      /* retry, reloading tail then head */
+
+    /* Head reserved; lock held. Snapshot ELR_EL1 + SPSR_EL1 into a
+     * recovery slot below the EL1 stack frame so the EL1 data abort
+     * recovery handler (handle_el1_data_abort_recover, below) can
+     * restore them if a subsequent strb faults. A sibling vCPU can
+     * munmap the guest buffer in the window between the AT probe and
+     * the byte copy; without this slot the resulting EL1 data abort
+     * overwrites ELR_EL1 with the strb PC and there is no way to
+     * resume EL0 at the post-SVC instruction. Both success exits
+     * pop the slot before svc_restore_eret.
+     */
+    mrs x29, elr_el1
+    mrs x30, spsr_el1
+    stp x29, x30, [sp, #-16]!
+
+    /* Copy bytes from ring[pos] to buf. len is in [1, 64].
+     * w26 holds pos in [0, 4096); writing to w26 above zero-extends
+     * into x26, so a plain reg add (no extension) is correct.
+     */
+    add  x16, x12, #0xC0             /* ring base */
+    add  x16, x16, x26                /* ring + pos */
+    cmp  x15, #1
+    b.ne urandom_copy_loop
+
+    /* Common case: 1-byte read. Single byte transfer. */
+.globl urandom_strb_1byte_start
+.globl urandom_strb_1byte_end
+urandom_strb_1byte_start:
+    ldrb w0, [x16]
+    strb w0, [x20]
+urandom_strb_1byte_end:
+    add  sp, sp, #16                 /* pop ELR/SPSR recovery slot */
+    mov  x0, #1
+    stlr wzr, [x19]                  /* release ring_lock */
+    b svc_restore_eret
+
+urandom_copy_loop:
+    /* Byte-wise copy for len in [2, 64]. Unrolling would help but
+     * the slow path is the realistic target for large reads. The
+     * loop runs at most 64 times; total cost is dwarfed by the EL0
+     * entry/exit transitions.
+     */
+    mov  x29, #0
+.globl urandom_strb_loop_start
+.globl urandom_strb_loop_end
+urandom_strb_loop_start:
+1:  ldrb w0, [x16, x29]
+    strb w0, [x20, x29]
+    add  x29, x29, #1
+    cmp  x29, x15
+    b.ne 1b
+urandom_strb_loop_end:
+    add  sp, sp, #16                 /* pop ELR/SPSR recovery slot */
+    mov  x0, x15
+    stlr wzr, [x19]                  /* release ring_lock */
+    b svc_restore_eret
+
+urandom_clrex_slow:
+    /* LDXR opened an exclusive monitor that the slow path will not
+     * release on its own. CLREX drops the monitor so subsequent
+     * LDXR/STXR sequences (including this shim's own next entry)
+     * are not poisoned. Release ring_lock before handing the read to
+     * the host; the host may need the same lock to refill the ring.
+     */
+    clrex
+    stlr wzr, [x19]                  /* release ring_lock */
+
+urandom_slow_no_clrex:
+    /* Reached on probe failure (no exclusive monitor open yet) or
+     * via the above clrex path. Both route into handle_svc_0 which
+     * runs the regular HVC #5 sys_read. The host then returns
+     * -EFAULT for the bad pointer or fulfills the read normally.
+     */
+    b handle_svc_0
 
 not_svc:
     /* EC=0x18: MSR/MRS / system instruction trap. */
@@ -305,6 +514,16 @@ not_svc:
     cmp x10, #0x3C
     b.eq handle_brk
 
+    /* EC=0x25: Data abort taken without a change in Exception level.
+     * The only legitimate source today is the urandom fast path: a
+     * sibling vCPU can munmap the guest buffer between the AT probe
+     * and the EL1 strb, faulting the store. Recover by returning
+     * -EFAULT to the guest instead of halting the VM (which is what
+     * the EL1-from-EL1 default below would do).
+     */
+    cmp x10, #0x25
+    b.eq handle_el1_data_abort_recover
+
     /* Unrecognized EC. Check if from EL0 (deliver signal) or EL1 (shim bug).
      * X11 is saved on stack, safe to use as scratch.
      */
@@ -313,6 +532,61 @@ not_svc:
     cbnz x11, restore_and_bad   /* EL1 (M>=4): genuine shim bug */
     b handle_el0_fault           /* EL0 (M=0): forward for signal delivery */
 
+/* handle_el1_data_abort_recover: tag a data abort whose faulting PC sits
+ * inside the urandom-copy strb region as a recoverable EFAULT.
+ *
+ * Layout invariants exploited here:
+ *   - SAVE_GPRS allocated 256 bytes for THIS (inner) entry on top of
+ *     whatever the outer SVC entry already pushed. Dropping that 256
+ *     puts SP at the recovery slot urandom_read_fast pushed before
+ *     entering the strb region.
+ *   - Recovery slot is 16 bytes: [sp+0]=ELR_EL1, [sp+8]=SPSR_EL1.
+ *   - The lock at TPIDR_EL1 + 0x10C0 is held by this vCPU at the
+ *     time the strb faulted, so release before transferring out.
+ *
+ * If the fault PC is outside the urandom strb ranges this is a genuine
+ * shim bug; fall back to restore_and_bad.
+ */
+handle_el1_data_abort_recover:
+    mrs x11, elr_el1
+    adr x12, urandom_strb_1byte_start
+    cmp x11, x12
+    b.lo 2f
+    adr x12, urandom_strb_1byte_end
+    cmp x11, x12
+    b.lo 1f                      /* in [1byte_start, 1byte_end) */
+2:  adr x12, urandom_strb_loop_start
+    cmp x11, x12
+    b.lo restore_and_bad
+    adr x12, urandom_strb_loop_end
+    cmp x11, x12
+    b.hs restore_and_bad
+1:
+    /* Drop the inner SAVE_GPRS frame; SP back at the urandom recovery slot. */
+    add sp, sp, #256
+    /* Pop the saved EL0 return state. ldp/post-index restores SP to the
+     * outer SAVE_GPRS frame top so svc_restore_eret's RESTORE_GPRS_KEEP_X0
+     * pulls the original EL0 GPR values.
+     */
+    ldp x9, x10, [sp], #16
+    msr elr_el1, x9
+    msr spsr_el1, x10
+    /* Release the urandom ring_lock at TPIDR_EL1 + 0x10C0 (held by this
+     * vCPU since urandom_locked acquired it before the byte copy).
+     */
+    mrs x11, tpidr_el1
+    add x11, x11, #0x1, lsl #12
+    add x11, x11, #0xC0
+    stlr wzr, [x11]
+    /* Drop any open exclusive monitor (the byte loop does not hold one
+     * after the head STXR retired, but CLREX is cheap and removes a
+     * latent footgun for future readers of this code).
+     */
+    clrex
+    /* Linux EFAULT = 14; the kernel returns -14 to userspace. */
+    mov x0, #-14
+    b svc_restore_eret
+
 /* handle_sysreg_trap: EC=0x18: MSR/MRS / system instruction
  *
  * MRS reads (Direction=1): forward to host via HVC #7 to read the system
@@ -626,11 +900,19 @@ tlbi_selective:
     ic iallu
     dsb ish
     isb
-    b 1f
+    b svc_restore_eret
 
+svc_restore_eret:
 1:
     /* Restore all guest registers except X0, which now holds the syscall
      * return value.  Linux preserves X1-X30, including X8.
+     *
+     * Named alias for the cross-function jump from the identity fast
+     * path in svc_handler. A bare 'b 1f' from up there would resolve
+     * to the next forward '1:' -- which sits inside handle_inst_abort
+     * -- and silently re-route the identity result into a W^X toggle.
+     * Caught the hard way during the prior P2 attempt; keep the
+     * named symbol to make the intent explicit.
      */
     RESTORE_GPRS_KEEP_X0
 
diff --git a/src/core/startup-trace.h b/src/core/startup-trace.h
new file mode 100644
index 0000000..b2b75d8
--- /dev/null
+++ b/src/core/startup-trace.h
@@ -0,0 +1,66 @@
+/* Startup tracing helpers
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Lightweight per-step wall-time tracer for VM bring-up. Gated by the
+ * ELFUSE_STARTUP_TRACE environment variable so a release-build run pays
+ * only a pthread_once check and a couple of branches per step when disabled.
+ * The helpers are static inline so each translation unit can use them without
+ * pulling in a separate object; getenv runs once per translation unit (each
+ * keeps its own cached flag), and the resolution itself is idempotent.
+ */
+
+#ifndef ELFUSE_STARTUP_TRACE_H
+#define ELFUSE_STARTUP_TRACE_H
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+/* File-scope cache (one copy per translation unit including this header).
+ * pthread_once serializes concurrent first callers and supplies the
+ * memory ordering that makes the cached value safely visible to all
+ * subsequent readers without explicit atomics.
+ */
+static pthread_once_t startup_trace_once = PTHREAD_ONCE_INIT;
+static bool startup_trace_value;
+
+static inline void startup_trace_resolve(void)
+{
+    const char *v = getenv("ELFUSE_STARTUP_TRACE");
+    startup_trace_value = v && v[0] && strcmp(v, "0") != 0;
+}
+
+static inline bool startup_trace_enabled(void)
+{
+    pthread_once(&startup_trace_once, startup_trace_resolve);
+    return startup_trace_value;
+}
+
+static inline uint64_t startup_trace_now_ns(void)
+{
+    if (!startup_trace_enabled())
+        return 0;
+    struct timespec ts;
+    if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
+        return 0;
+    return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec;
+}
+
+static inline void startup_trace_step(const char *label, uint64_t start_ns)
+{
+    if (start_ns == 0)
+        return;
+    uint64_t end_ns = startup_trace_now_ns();
+    if (end_ns < start_ns)
+        return;
+    fprintf(stderr, "startup %-28s %8.3f ms\n", label,
+            (double) (end_ns - start_ns) / 1000000.0);
+}
+
+#endif /* ELFUSE_STARTUP_TRACE_H */
diff --git a/src/core/vdso.c b/src/core/vdso.c
index 444be88..6cf8f6f 100644
--- a/src/core/vdso.c
+++ b/src/core/vdso.c
@@ -4,16 +4,25 @@
  * Copyright 2025 Moritz Angermann, zw3rk pte. ltd.
  * SPDX-License-Identifier: Apache-2.0
  *
- * Builds a minimal vDSO ELF image in guest memory exposing
- * __kernel_{rt_sigreturn,clock_getres,clock_gettime,gettimeofday}. Each entry
- * point is an SVC trampoline that traps back to the host for the actual work.
+ * Builds a minimal vDSO ELF image in guest memory exposing versioned
+ * __kernel_{rt_sigreturn,clock_getres,clock_gettime,gettimeofday}.
+ * __kernel_clock_gettime is a CNTVCT-based fast-path trampoline that serves
+ * CLOCK_MONOTONIC (clockid 1) and CLOCK_REALTIME (clockid 0) inline without
+ * trapping; rt_sigreturn / clock_getres / gettimeofday remain 12-byte SVC
+ * trampolines that fall back to the host syscall implementations.
  *
- * An earlier revision had a CNTVCT-based fast path for clock_gettime backed by
- * a host-updated vvar page. That path was incorrect under HVF: the host writes
- * CNTVCT_EL0 from the macOS frame of reference while the guest reads it through
- * HVF's CNTVOFF_EL2 virtualization, so the seqlock interpolation produced bogus
- * times (year 26382). The fast path is gone; SVC is correct and the trap cost
- * is negligible compared to the work clock_gettime callers tend to do anyway.
+ * The fast path reads CNTVCT_EL0 at EL0 (enabled via CNTKCTL_EL1.EL0VCTEN in
+ * the bootstrap), looks up the host-published anchor in the vvar (initialized,
+ * anchor_cntvct, anchor_mono_sec/nsec, anchor_real_sec/nsec), and interpolates
+ * the requested clock from the CNTVCT delta. The vvar is seeded on the first
+ * clock_gettime SVC fallback, gated on ELR_EL1 == svc_fallback_pc + 4 so an
+ * unrelated raw syscall(SYS_clock_gettime, ...) cannot poison the anchor from
+ * an arbitrary X9 value. A three-state CAS (0 -> 2 -> 1) keeps concurrent
+ * first-callers from tearing anchor fields.
+ *
+ * Wall-clock anchors are not refreshed if macOS NTP steps host time; long-
+ * running daemons can observe drift relative to a fresh REALTIME SVC. The
+ * SVC path remains correct in all cases for callers that bypass the vDSO.
  */
 
 #include <stdint.h>
@@ -44,11 +53,28 @@ typedef struct {
     uint64_t st_value, st_size;
 } elf64_sym_t;
 
+typedef struct {
+    uint16_t vd_version;
+    uint16_t vd_flags;
+    uint16_t vd_ndx;
+    uint16_t vd_cnt;
+    uint32_t vd_hash;
+    uint32_t vd_aux;
+    uint32_t vd_next;
+} elf64_verdef_t;
+
+typedef struct {
+    uint32_t vda_name;
+    uint32_t vda_next;
+} elf64_verdaux_t;
+
 /* ELF constants */
 #define SHT_STRTAB 3
 #define SHT_HASH 5
 #define SHT_DYNAMIC 6
 #define SHT_DYNSYM 11
+#define SHT_GNU_VERDEF 0x6ffffffd
+#define SHT_GNU_VERSYM 0x6fffffff
 #define SHF_ALLOC (1ULL << 1)
 #define SHF_EXECINSTR (1ULL << 2)
 #define DT_NULL 0
@@ -57,10 +83,26 @@ typedef struct {
 #define DT_SYMTAB 6
 #define DT_STRSZ 10
 #define DT_SYMENT 11
+#define DT_VERSYM 0x6ffffff0
+#define DT_VERDEF 0x6ffffffc
+#define DT_VERDEFNUM 0x6ffffffd
 #define STB_GLOBAL 1
 #define STT_FUNC 2
+#define VER_DEF_CURRENT 1
+#define VDSO_LINUX_VERSION_INDEX 2
 #define ELF_ST_INFO(bind, type) (((bind) << 4) | ((type) & 0xf))
 
+/* Host-owned vDSO page accessor. The vDSO is mapped RX to EL0, so guest
+ * permission walkers cannot write here; route every host build/seed/attention
+ * mutation through this bounds-checked direct host_base+VDSO_BASE pointer.
+ */
+static uint8_t *vdso_host_page(guest_t *g)
+{
+    if (VDSO_BASE + VDSO_SIZE > g->guest_size)
+        return NULL;
+    return (uint8_t *) g->host_base + VDSO_BASE;
+}
+
 /* Layout.
  *
  * Symbol layout (all entries are 12-byte SVC trampolines):
@@ -75,50 +117,137 @@ typedef struct {
 #define VDSO_OFF_PHDR 0x040
 #define VDSO_OFF_PHDR1 0x078
 
-/* .text trampolines (each 12 bytes: mov x8, #N; svc #0; ret). */
-#define TEXT_OFF_SIGRET 0x0B0
-#define TEXT_OFF_GETRES 0x0BC
-#define TEXT_OFF_GETTIME 0x0C8
-#define TEXT_OFF_GETTOD 0x0D4
-#define TEXT_END 0x0E0
+/* vvar at fixed offset; host writes the wall-clock anchor on first
+ * clock_gettime SVC, after the guest trampoline has stored its own
+ * CNTVCT_EL0 read into X9. Layout:
+ *   +0   uint32 initialized (host sets 1 after the anchor fields)
+ *   +4   uint32 attention (host mirrors shim attention bits; nonzero -> SVC)
+ *   +8   uint64 anchor_cntvct (guest frame, written by host from X9)
+ *   +16  uint64 anchor_mono_sec  (CLOCK_MONOTONIC anchor)
+ *   +24  uint64 anchor_mono_nsec
+ *   +32  uint64 anchor_real_sec  (CLOCK_REALTIME anchor)
+ *   +40  uint64 anchor_real_nsec
+ *
+ * Both anchor pairs are seeded together at the first vDSO-mediated
+ * clock_gettime SVC. The trampoline interpolates either pair from the
+ * shared CNTVCT delta; the picking of MONO vs REAL is done by adding
+ * VVAR_OFF_ANCHOR_MONO_SEC or VVAR_OFF_ANCHOR_REAL_SEC to the vvar base
+ * and LDPing the two-doubleword anchor.
+ *
+ * Wall-clock anchors are not refreshed on macOS NTP steps; long-running
+ * processes that observe sub-second wall-clock movements will see drift
+ * relative to a fresh clock_gettime(REALTIME) syscall. This matches the
+ * existing CNTVCT-based design and the standard tradeoff for vDSO time
+ * routines that lack a kernel-driven seqlock.
+ */
+#define VDSO_OFF_VVAR 0x0B0
+#define VVAR_OFF_INITIALIZED 0x00
+#define VVAR_OFF_ATTENTION 0x04
+#define VVAR_OFF_ANCHOR_CNTVCT 0x08
+#define VVAR_OFF_ANCHOR_MONO_SEC 0x10
+#define VVAR_OFF_ANCHOR_MONO_NSEC 0x18
+#define VVAR_OFF_ANCHOR_REAL_SEC 0x20
+#define VVAR_OFF_ANCHOR_REAL_NSEC 0x28
+#define VVAR_SIZE 0x30
+
+/* .text trampolines. rt_sigreturn / clock_getres / gettimeofday are 12-byte
+ * SVC trampolines. clock_gettime is the CNTVCT-based fast-path trampoline
+ * (140 bytes = 35 instructions including the svc_fallback tail). The
+ * trampoline uses LDAR on the vvar initialized flag, treats both states
+ * 0 (unseeded) and 2 (host-side reservation in vdso_seed_anchor) as
+ * fall-back, also falls back while attention is pending, and guards the
+ * CNTVCT-anchor subtraction against unsigned underflow via SUBS + B.LO. The
+ * fast path now serves both clockid 0 (CLOCK_REALTIME) and clockid 1
+ * (CLOCK_MONOTONIC); other clockids fall back to SVC.
+ */
+#define TEXT_OFF_SIGRET 0x0E0
+#define TEXT_OFF_GETRES 0x0EC
+#define TEXT_OFF_GETTIME 0x0F8
+#define TEXT_GETTIME_SIZE 0x8C
+#define TEXT_OFF_GETTOD (TEXT_OFF_GETTIME + TEXT_GETTIME_SIZE)
+#define TEXT_END (TEXT_OFF_GETTOD + 12)
+/* Offset of the SVC instruction inside __kernel_clock_gettime's svc_fallback
+ * (svc_fallback opens at instruction 33 of 35, i.e. byte 0x80; the SVC is
+ * the second instruction of the fallback, at byte 0x84). The host's
+ * sys_clock_gettime uses this value to gate vvar seeding: only a trap whose
+ * ELR_EL1 equals SVC_PC + 4 came from the trampoline and may carry a
+ * trustworthy CNTVCT in X9.
+ */
+#define VDSO_CLOCK_GETTIME_SVC_PC (TEXT_OFF_GETTIME + 0x84)
+
+/* dynstr, dynsym, hash, GNU version metadata, dynamic, shdr follow.
+ * TEXT_END is 0x190 after the attention-check expansion.
+ */
+#define VDSO_OFF_DYNSTR 0x190
+
+/* Padded to 8-byte align: 0x190 + 103 = 0x1F7, pad to 0x1F8 */
+#define VDSO_OFF_DYNSYM 0x1F8
 
-/* dynstr, dynsym, hash, dynamic, shdr follow */
-#define VDSO_OFF_DYNSTR 0x0E0
-#define DYNSTR_SIZE 90
+/* 5 * 24 = 120, 0x1F8 + 120 = 0x270 */
+#define VDSO_OFF_HASH 0x270
 
-/* Padded to 4-byte align: 0x0E0 + 90 = 0x13A, pad to 0x13C */
-#define VDSO_OFF_DYNSYM 0x13C
+/* 2+1+5 = 8 words * 4 = 32, 0x270 + 32 = 0x290 */
+#define VDSO_OFF_VERSYM 0x290
 
-/* 5 * 24 = 120, 0x13C + 120 = 0x1B4 */
-#define VDSO_OFF_HASH 0x1B4
+/* 5 * 2 = 10, 0x290 + 10 = 0x29A, pad to 0x2A0 */
+#define VDSO_OFF_VERDEF 0x2A0
 
-/* 2+1+5 = 8 words * 4 = 32, 0x1B4 + 32 = 0x1D4, pad to 0x1D8 */
-#define VDSO_OFF_DYNAMIC 0x1D8
+/* Verdef + verdaux = 28, 0x2A0 + 28 = 0x2BC, pad to 0x2C0 */
+#define VDSO_OFF_DYNAMIC 0x2C0
 
-/* 6 * 16 = 96, 0x1D8 + 96 = 0x238 */
-#define VDSO_OFF_SHDR 0x238
+/* 9 * 16 = 144, 0x2C0 + 144 = 0x350 */
+#define VDSO_OFF_SHDR 0x350
+
+/* 8 * 64 = 512, 0x350 + 512 = 0x550 (fits in 4 KiB) */
 
-/* 6 * 64 = 384, 0x238 + 384 = 0x3B8 (fits in 4KiB) */
 #define VDSO_NUM_SYMS 4
 #define HASH_NCHAIN (VDSO_NUM_SYMS + 1)
 #define HASH_NBUCKET 1
 #define HASH_SIZE ((2 + HASH_NBUCKET + HASH_NCHAIN) * sizeof(uint32_t))
+#define VERSYM_SIZE ((VDSO_NUM_SYMS + 1) * sizeof(uint16_t))
+#define VERDEF_SIZE (sizeof(elf64_verdef_t) + sizeof(elf64_verdaux_t))
+#define VDSO_NUM_DYN 9
 
 /* .dynstr data */
 static const char dynstr_data[] =
     "\0__kernel_rt_sigreturn"
     "\0__kernel_clock_getres"
     "\0__kernel_clock_gettime"
-    "\0__kernel_gettimeofday";
-
-/* Symbol name offsets */
-static const uint32_t sym_name_offsets[VDSO_NUM_SYMS] = {1, 23, 45, 68};
+    "\0__kernel_gettimeofday"
+    "\0LINUX_2.6.39";
+#define DYNSTR_SIZE sizeof(dynstr_data)
+
+/* Symbol name offsets, derived from preceding string-literal lengths so a
+ * future edit to dynstr_data shifts them in lockstep instead of silently
+ * breaking the version lookup (sizeof("\0X") - 1 == bytes contributed when
+ * X is concatenated into dynstr_data; only the very last literal's trailing
+ * NUL survives concatenation).
+ */
+#define DYNSTR_BYTES_RT_SIGRETURN (sizeof("\0__kernel_rt_sigreturn") - 1)
+#define DYNSTR_BYTES_CLOCK_GETRES (sizeof("\0__kernel_clock_getres") - 1)
+#define DYNSTR_BYTES_CLOCK_GETTIME (sizeof("\0__kernel_clock_gettime") - 1)
+#define DYNSTR_BYTES_GETTIMEOFDAY (sizeof("\0__kernel_gettimeofday") - 1)
+
+static const uint32_t sym_name_offsets[VDSO_NUM_SYMS] = {
+    1,
+    DYNSTR_BYTES_RT_SIGRETURN + 1,
+    DYNSTR_BYTES_RT_SIGRETURN + DYNSTR_BYTES_CLOCK_GETRES + 1,
+    DYNSTR_BYTES_RT_SIGRETURN + DYNSTR_BYTES_CLOCK_GETRES +
+        DYNSTR_BYTES_CLOCK_GETTIME + 1,
+};
+/* Skip the leading \0 of "\0LINUX_2.6.39" to land on 'L'. */
+#define VDSO_LINUX_VERSION_NAME_OFF                          \
+    (DYNSTR_BYTES_RT_SIGRETURN + DYNSTR_BYTES_CLOCK_GETRES + \
+     DYNSTR_BYTES_CLOCK_GETTIME + DYNSTR_BYTES_GETTIMEOFDAY + 1)
+
+_Static_assert(sizeof(dynstr_data) <= 104,
+               "dynstr_data outgrew the DYNSYM padding window");
 
 /* Symbol text offsets and sizes */
 static const uint32_t sym_text_off[VDSO_NUM_SYMS] = {
     TEXT_OFF_SIGRET, TEXT_OFF_GETRES, TEXT_OFF_GETTIME, TEXT_OFF_GETTOD};
-static const uint32_t sym_text_size[VDSO_NUM_SYMS] = {
-    12, 12, TEXT_OFF_GETTOD - TEXT_OFF_GETTIME, 12};
+static const uint32_t sym_text_size[VDSO_NUM_SYMS] = {12, 12, TEXT_GETTIME_SIZE,
+                                                      12};
 
 /* Emit a 12-byte SVC trampoline: mov x8, #syscall_nr; svc #0; ret. */
 static void emit_svc_trampoline(uint32_t *code, unsigned syscall_nr)
@@ -129,9 +258,265 @@ static void emit_svc_trampoline(uint32_t *code, unsigned syscall_nr)
     code[2] = 0xD65F03C0U; /* ret    */
 }
 
+/* CNTVCT-based fast-path trampoline for __kernel_clock_gettime. The guest
+ * always reads CNTVCT_EL0 into X9 first, then either falls through to a
+ * full SVC (unsupported clockids, pending attention, vvar uninitialized) or
+ * interpolates wall_clock from the vvar anchor. The host's
+ * sys_clock_gettime handler reads X9 on the first SVC and seeds the vvar
+ * (anchor_cntvct = X9, anchor_sec/nsec = wall_clock), so subsequent calls
+ * skip the trap while attention remains clear. CNTKCTL_EL1.EL0VCTEN is set
+ * in bootstrap to allow the MRS at EL0; without that the trampoline gets
+ * 0 back and the math collapses.
+ *
+ * The svc_fallback tail lives in __kernel_clock_gettime's slot too so a
+ * single RET ends the function in either path.
+ */
+
+/* AArch64 instruction encoders (only the ones used here). */
+static uint32_t enc_movz_x(unsigned rd, uint16_t imm)
+{
+    return 0xD2800000U | ((uint32_t) imm << 5) | (rd & 0x1F);
+}
+
+static uint32_t enc_movk_x_lsl16(unsigned rd, uint16_t imm)
+{
+    return 0xF2A00000U | ((uint32_t) imm << 5) | (rd & 0x1F);
+}
+
+static uint32_t enc_adr(unsigned rd, int32_t pc_rel)
+{
+    uint32_t immlo = (uint32_t) (pc_rel & 0x3);
+    uint32_t immhi = (uint32_t) ((pc_rel >> 2) & 0x7FFFF);
+    return 0x10000000U | (immlo << 29) | (immhi << 5) | (rd & 0x1F);
+}
+
+/* B.cond imm19. cond is the 4-bit AArch64 condition (NE=0x1, LO=0x3, etc.). */
+#define COND_NE 0x1
+#define COND_LO 0x3
+static uint32_t enc_bcond_imm19(unsigned cond, int32_t pc_rel)
+{
+    uint32_t imm19 = (uint32_t) ((pc_rel >> 2) & 0x7FFFF);
+    return 0x54000000U | (imm19 << 5) | (cond & 0xF);
+}
+
+static uint32_t enc_ldr_x_imm12(unsigned rt, unsigned rn, uint32_t off_bytes)
+{
+    return 0xF9400000U | ((off_bytes / 8) << 10) | ((rn & 0x1F) << 5) |
+           (rt & 0x1F);
+}
+
+static uint32_t enc_add_x(unsigned rd, unsigned rn, unsigned rm)
+{
+    return 0x8B000000U | ((rm & 0x1F) << 16) | ((rn & 0x1F) << 5) | (rd & 0x1F);
+}
+
+static uint32_t enc_add_x_imm12(unsigned rd, unsigned rn, uint16_t imm)
+{
+    return 0x91000000U | (((uint32_t) imm & 0xFFF) << 10) | ((rn & 0x1F) << 5) |
+           (rd & 0x1F);
+}
+
+static uint32_t enc_mul_x(unsigned rd, unsigned rn, unsigned rm)
+{
+    return 0x9B007C00U | ((rm & 0x1F) << 16) | ((rn & 0x1F) << 5) | (rd & 0x1F);
+}
+
+static uint32_t enc_udiv_x(unsigned rd, unsigned rn, unsigned rm)
+{
+    return 0x9AC00800U | ((rm & 0x1F) << 16) | ((rn & 0x1F) << 5) | (rd & 0x1F);
+}
+
+static uint32_t enc_msub_x(unsigned rd, unsigned rn, unsigned rm, unsigned ra)
+{
+    return 0x9B008000U | ((rm & 0x1F) << 16) | ((ra & 0x1F) << 10) |
+           ((rn & 0x1F) << 5) | (rd & 0x1F);
+}
+
+static uint32_t enc_stp_x_imm7(unsigned rt1,
+                               unsigned rt2,
+                               unsigned rn,
+                               int32_t off_bytes)
+{
+    int32_t imm7 = (off_bytes / 8) & 0x7F;
+    return 0xA9000000U | ((uint32_t) imm7 << 15) | ((rt2 & 0x1F) << 10) |
+           ((rn & 0x1F) << 5) | (rt1 & 0x1F);
+}
+
+static uint32_t enc_cmp_w_imm12(unsigned rn, uint32_t imm12)
+{
+    /* SUBS WZR, Wn, #imm12 */
+    return 0x7100001FU | ((imm12 & 0xFFF) << 10) | ((rn & 0x1F) << 5);
+}
+
+/* LDAR Wt, [Xn] -- acquire load of a 32-bit word. Pairs with the host's
+ * __atomic_store_n(initialized, ..., __ATOMIC_RELEASE) so that observing
+ * initialized != 0 also makes the prior anchor stores visible.
+ */
+static uint32_t enc_ldar_w(unsigned rt, unsigned rn)
+{
+    return 0x88DFFC00U | ((rn & 0x1F) << 5) | (rt & 0x1F);
+}
+
+/* SUBS Xd, Xn, Xm (set flags). */
+static uint32_t enc_subs_x(unsigned rd, unsigned rn, unsigned rm)
+{
+    return 0xEB000000U | ((rm & 0x1F) << 16) | ((rn & 0x1F) << 5) | (rd & 0x1F);
+}
+
+/* CBZ Wt, imm19 (byte-relative; encoder shifts >>2 internally). */
+static uint32_t enc_cbz_w(unsigned rt, int32_t pc_rel)
+{
+    uint32_t imm19 = (uint32_t) ((pc_rel >> 2) & 0x7FFFF);
+    return 0x34000000U | (imm19 << 5) | (rt & 0x1F);
+}
+
+static uint32_t enc_cbnz_w(unsigned rt, int32_t pc_rel)
+{
+    uint32_t imm19 = (uint32_t) ((pc_rel >> 2) & 0x7FFFF);
+    return 0x35000000U | (imm19 << 5) | (rt & 0x1F);
+}
+
+/* B imm26 unconditional branch (byte-relative). */
+static uint32_t enc_b(int32_t pc_rel)
+{
+    uint32_t imm26 = (uint32_t) ((pc_rel >> 2) & 0x3FFFFFF);
+    return 0x14000000U | imm26;
+}
+
+/* LDP Xt1, Xt2, [Xn, #off_bytes] (signed 7-bit imm, multiple of 8). */
+static uint32_t enc_ldp_x_imm7(unsigned rt1,
+                               unsigned rt2,
+                               unsigned rn,
+                               int32_t off_bytes)
+{
+    int32_t imm7 = (off_bytes / 8) & 0x7F;
+    return 0xA9400000U | ((uint32_t) imm7 << 15) | ((rt2 & 0x1F) << 10) |
+           ((rn & 0x1F) << 5) | (rt1 & 0x1F);
+}
+
+/* Emit the CNTVCT fast-path clock_gettime trampoline into code[]. pc_off
+ * and vvar_off are the page offsets of code[0] and the vvar (used for the ADR
+ * displacement). Exactly TEXT_GETTIME_SIZE bytes; static_assert guards drift.
+ *
+ * Layout (35 instructions, 0x8c bytes):
+ *
+ *   0x00  mrs  x9, cntvct_el0           ; always read first
+ *   0x04  cbz  w0, .Lreal               ; clockid==0 -> CLOCK_REALTIME
+ *   0x08  cmp  w0, #1                   ; clockid==1 -> CLOCK_MONOTONIC
+ *   0x0C  b.ne svc_fallback              ; other clockid -> SVC
+ *   0x10  mov  x7, #ANCHOR_MONO_SEC      ; offset within vvar of MONO sec
+ *   0x14  b    .Linit
+ *   0x18  .Lreal: mov x7, #ANCHOR_REAL_SEC
+ *   0x1C  .Linit: adr x2, vvar
+ *   0x20  add  x10, x2, #ATTENTION
+ *   0x24  ldar w3, [x10]                 ; load attention flag (acquire)
+ *   0x28  cbnz w3, svc_fallback          ; timers/signals need epilogue
+ *   0x2C  ldar w3, [x2]                  ; load initialized flag (acquire)
+ *   0x30  cmp  w3, #1
+ *   0x34  b.ne svc_fallback              ; not seeded yet
+ *   0x38  ldr  x3, [x2, #ANCHOR_CNTVCT]
+ *   0x3C  add  x8, x2, x7                ; x8 = vvar base + sec_offset
+ *   0x40  ldp  x4, x5, [x8]              ; x4=anchor_sec, x5=anchor_nsec
+ *   0x44  subs x6, x9, x3                ; cntvct delta
+ *   0x48  b.lo svc_fallback              ; underflow -> SVC
+ *   0x4C..0x70  nsec += delta * 125 / 3 (ns for 24 MHz ticks); carry into sec
+ *   0x74  stp  x4, x5, [x1]              ; store {sec, nsec}
+ *   0x78  mov  x0, #0
+ *   0x7C  ret
+ *   0x80  svc_fallback: mov x8, #113
+ *   0x84  svc  #0
+ *   0x88  ret
+ *
+ * Both clockids share the same CNTVCT delta math; only the anchor pair
+ * loaded via LDP changes. Picking via a runtime offset register avoids
+ * duplicating the entire math block per clockid.
+ */
+static void emit_clock_gettime_trampoline(uint32_t *code,
+                                          uint32_t pc_off,
+                                          uint32_t vvar_off)
+{
+    /* Branch targets within the trampoline. */
+    int32_t real_off = 0x18;         /* .Lreal */
+    int32_t init_off = 0x1C;         /* .Linit (common path entry) */
+    int32_t svc_fallback_off = 0x80; /* svc_fallback */
+    int32_t adr_pc_off = 0x1C;       /* offset of 'adr x2, vvar' */
+    int32_t vvar_rel = (int32_t) vvar_off - (int32_t) (pc_off + adr_pc_off);
+
+    code[0] = 0xD53BE049U;                   /* mrs  x9, cntvct_el0           */
+    code[1] = enc_cbz_w(0, real_off - 0x04); /* cbz w0, .Lreal     */
+    code[2] = enc_cmp_w_imm12(0, 1);         /* cmp  w0, #1        */
+    code[3] = enc_bcond_imm19(COND_NE, svc_fallback_off - 0x0C);
+    /* b.ne svc_fallback  */
+    code[4] = enc_movz_x(7, VVAR_OFF_ANCHOR_MONO_SEC); /* x7 = MONO off */
+    code[5] = enc_b(init_off - 0x14);                  /* b .Linit           */
+    code[6] = enc_movz_x(7, VVAR_OFF_ANCHOR_REAL_SEC); /* .Lreal       */
+    code[7] = enc_adr(2, vvar_rel);                    /* .Linit: adr x2,vv  */
+    code[8] = enc_add_x_imm12(10, 2, VVAR_OFF_ATTENTION); /* x10 = &attention */
+    code[9] = enc_ldar_w(3, 10);                       /* ldar w3, [x10]     */
+    code[10] = enc_cbnz_w(3, svc_fallback_off - 0x28); /* attention != 0 */
+    code[11] = enc_ldar_w(3, 2);      /* ldar w3, [x2]      */
+    code[12] = enc_cmp_w_imm12(3, 1); /* cmp  w3, #1        */
+    code[13] = enc_bcond_imm19(COND_NE, svc_fallback_off - 0x34);
+    /* b.ne svc_fallback  */
+    code[14] = enc_ldr_x_imm12(3, 2, VVAR_OFF_ANCHOR_CNTVCT); /* x3 = anchor */
+    code[15] = enc_add_x(8, 2, 7);         /* add x8, x2, x7     */
+    code[16] = enc_ldp_x_imm7(4, 5, 8, 0); /* ldp x4, x5, [x8]   */
+    code[17] = enc_subs_x(6, 9, 3);        /* subs x6, x9, x3    */
+    code[18] = enc_bcond_imm19(COND_LO, svc_fallback_off - 0x48);
+    /* b.lo svc_fallback  */
+    code[19] = enc_movz_x(7, 125); /* x7 = 125           */
+    code[20] = enc_mul_x(6, 6, 7); /* delta * 125        */
+    code[21] = enc_movz_x(7, 3); /* x7 = 3             */
+    code[22] = enc_udiv_x(6, 6, 7); /* delta_ns           */
+    code[23] = enc_add_x(5, 5, 6);  /* nsec += delta_ns   */
+    code[24] = enc_movz_x(7, 0xCA00); /* low 16 bits of 1e9 */
+    code[25] = enc_movk_x_lsl16(7, 0x3B9A); /* x7 = 1e9           */
+    code[26] = enc_udiv_x(8, 5, 7);         /* sec_carry          */
+    code[27] = enc_msub_x(5, 8, 7, 5);      /* nsec %= 1e9        */
+    code[28] = enc_add_x(4, 4, 8);          /* sec += carry       */
+    code[29] = enc_stp_x_imm7(4, 5, 1, 0);  /* stp x4, x5, [x1]   */
+    code[30] = enc_movz_x(0, 0);            /* mov x0, #0         */
+    code[31] = 0xD65F03C0U;                 /* ret                */
+    /* svc_fallback at offset 0x80 (instruction 32) */
+    code[32] = enc_movz_x(8, 113); /* mov x8, #113       */
+    code[33] = 0xD4000001U;        /* svc #0             */
+    code[34] = 0xD65F03C0U;        /* ret                */
+}
+
+_Static_assert(TEXT_GETTIME_SIZE == 35 * sizeof(uint32_t),
+               "clock_gettime trampoline size must match emitter");
+
+/* The public sigret offset declared in core/vdso.h must match the
+ * internal layout above; signal.c sets X30 to VDSO_BASE + VDSO_OFF_SIGRET
+ * as the return-from-handler target.
+ */
+_Static_assert(VDSO_OFF_SIGRET == TEXT_OFF_SIGRET,
+               "VDSO_OFF_SIGRET in core/vdso.h must equal TEXT_OFF_SIGRET");
+
+/* SysV ELF hash of a NUL-terminated name; used here for Verdef vd_hash. */
+static uint32_t elf_hash(const char *name)
+{
+    uint32_t hash = 0;
+
+    for (const unsigned char *p = (const unsigned char *) name; *p; p++) {
+        hash = (hash << 4) + *p;
+        uint32_t top = hash & 0xf0000000U;
+        hash ^= top >> 24; /* no-op when top == 0 */
+        hash &= ~top;
+    }
+    return hash;
+}
+
 uint64_t vdso_build(guest_t *g)
 {
-    uint8_t *page = (uint8_t *) guest_ptr(g, VDSO_BASE);
+    /* The vDSO page is host-built into the guest backing buffer before any
+     * page-table entry covers it, so route through vdso_host_page which
+     * just bounds-checks against guest_size. The earlier guest_ptr walk
+     * worked by coincidence (the slot happened to be reachable) but tied
+     * host construction to whatever EL0 permission walker state existed
+     * at the time -- a fragile coupling for host-owned memory.
+     */
+    uint8_t *page = vdso_host_page(g);
     if (!page) {
         log_error("vdso: VDSO_BASE 0x%llx out of guest memory",
                   (unsigned long long) VDSO_BASE);
@@ -160,7 +545,7 @@ uint64_t vdso_build(guest_t *g)
     ehdr->e_phentsize = sizeof(elf64_phdr_t);
     ehdr->e_phnum = 2;
     ehdr->e_shentsize = sizeof(elf64_shdr_t);
-    ehdr->e_shnum = 6;
+    ehdr->e_shnum = 8;
     ehdr->e_shstrndx = 2;
 
     /* Program header 0: PT_LOAD. */
@@ -181,8 +566,8 @@ uint64_t vdso_build(guest_t *g)
     phdr1->p_offset = VDSO_OFF_DYNAMIC;
     phdr1->p_vaddr = VDSO_OFF_DYNAMIC;
     phdr1->p_paddr = VDSO_OFF_DYNAMIC;
-    phdr1->p_filesz = 6 * sizeof(elf64_dyn_t);
-    phdr1->p_memsz = 6 * sizeof(elf64_dyn_t);
+    phdr1->p_filesz = VDSO_NUM_DYN * sizeof(elf64_dyn_t);
+    phdr1->p_memsz = VDSO_NUM_DYN * sizeof(elf64_dyn_t);
     phdr1->p_align = 8;
 
     /* Text trampolines.  Each entry is the same 12-byte mov/svc/ret pattern
@@ -190,9 +575,14 @@ uint64_t vdso_build(guest_t *g)
      */
     emit_svc_trampoline((uint32_t *) (page + TEXT_OFF_SIGRET), 139);
     emit_svc_trampoline((uint32_t *) (page + TEXT_OFF_GETRES), 114);
-    emit_svc_trampoline((uint32_t *) (page + TEXT_OFF_GETTIME), 113);
+    emit_clock_gettime_trampoline((uint32_t *) (page + TEXT_OFF_GETTIME),
+                                  TEXT_OFF_GETTIME, VDSO_OFF_VVAR);
     emit_svc_trampoline((uint32_t *) (page + TEXT_OFF_GETTOD), 169);
 
+    /* vvar starts zero (initialized==0). The first __kernel_clock_gettime
+     * SVC fallback will let the host populate the anchor.
+     */
+
     /* Dynamic string table. */
     memcpy(page + VDSO_OFF_DYNSTR, dynstr_data, DYNSTR_SIZE);
 
@@ -221,6 +611,27 @@ uint64_t vdso_build(guest_t *g)
     }
     hash[2] = first_sym;
 
+    /* GNU symbol versioning. glibc's aarch64 vDSO resolver asks for
+     * LINUX_2.6.39 and ignores unversioned helpers.
+     */
+    uint16_t *versym = (uint16_t *) (page + VDSO_OFF_VERSYM);
+    versym[0] = 0;
+    for (int i = 1; i <= VDSO_NUM_SYMS; i++)
+        versym[i] = VDSO_LINUX_VERSION_INDEX;
+
+    elf64_verdef_t *verdef = (elf64_verdef_t *) (page + VDSO_OFF_VERDEF);
+    elf64_verdaux_t *verdaux =
+        (elf64_verdaux_t *) (page + VDSO_OFF_VERDEF + sizeof(*verdef));
+    verdef->vd_version = VER_DEF_CURRENT;
+    verdef->vd_flags = 0;
+    verdef->vd_ndx = VDSO_LINUX_VERSION_INDEX;
+    verdef->vd_cnt = 1;
+    verdef->vd_hash = elf_hash("LINUX_2.6.39");
+    verdef->vd_aux = sizeof(*verdef);
+    verdef->vd_next = 0;
+    verdaux->vda_name = VDSO_LINUX_VERSION_NAME_OFF;
+    verdaux->vda_next = 0;
+
     /* Dynamic table. */
     elf64_dyn_t *dyn = (elf64_dyn_t *) (page + VDSO_OFF_DYNAMIC);
     dyn[0] = (elf64_dyn_t) {DT_HASH, VDSO_OFF_HASH};
@@ -228,7 +639,10 @@ uint64_t vdso_build(guest_t *g)
     dyn[2] = (elf64_dyn_t) {DT_STRTAB, VDSO_OFF_DYNSTR};
     dyn[3] = (elf64_dyn_t) {DT_STRSZ, DYNSTR_SIZE};
     dyn[4] = (elf64_dyn_t) {DT_SYMENT, sizeof(elf64_sym_t)};
-    dyn[5] = (elf64_dyn_t) {DT_NULL, 0};
+    dyn[5] = (elf64_dyn_t) {DT_VERSYM, VDSO_OFF_VERSYM};
+    dyn[6] = (elf64_dyn_t) {DT_VERDEF, VDSO_OFF_VERDEF};
+    dyn[7] = (elf64_dyn_t) {DT_VERDEFNUM, 1};
+    dyn[8] = (elf64_dyn_t) {DT_NULL, 0};
 
     /* Section headers. */
     elf64_shdr_t *shdr = (elf64_shdr_t *) (page + VDSO_OFF_SHDR);
@@ -276,10 +690,134 @@ uint64_t vdso_build(guest_t *g)
     shdr[5].sh_flags = SHF_ALLOC;
     shdr[5].sh_addr = VDSO_OFF_DYNAMIC;
     shdr[5].sh_offset = VDSO_OFF_DYNAMIC;
-    shdr[5].sh_size = 6 * sizeof(elf64_dyn_t);
+    shdr[5].sh_size = VDSO_NUM_DYN * sizeof(elf64_dyn_t);
     shdr[5].sh_link = 2;
     shdr[5].sh_addralign = 8;
     shdr[5].sh_entsize = sizeof(elf64_dyn_t);
 
+    shdr[6].sh_name = 0;
+    shdr[6].sh_type = SHT_GNU_VERSYM;
+    shdr[6].sh_flags = SHF_ALLOC;
+    shdr[6].sh_addr = VDSO_OFF_VERSYM;
+    shdr[6].sh_offset = VDSO_OFF_VERSYM;
+    shdr[6].sh_size = VERSYM_SIZE;
+    shdr[6].sh_link = 3;
+    shdr[6].sh_addralign = 2;
+    shdr[6].sh_entsize = sizeof(uint16_t);
+
+    shdr[7].sh_name = 0;
+    shdr[7].sh_type = SHT_GNU_VERDEF;
+    shdr[7].sh_flags = SHF_ALLOC;
+    shdr[7].sh_addr = VDSO_OFF_VERDEF;
+    shdr[7].sh_offset = VDSO_OFF_VERDEF;
+    shdr[7].sh_size = VERDEF_SIZE;
+    shdr[7].sh_link = 2;
+    shdr[7].sh_info = 1;
+    shdr[7].sh_addralign = 4;
+
     return VDSO_BASE;
 }
+
+void vdso_seed_anchor(guest_t *g,
+                      uint64_t guest_cntvct,
+                      int64_t mono_sec,
+                      int64_t mono_nsec,
+                      int64_t real_sec,
+                      int64_t real_nsec)
+{
+    /* Seed the vvar anchor once. Like vdso_attention_or, host-owned vvar
+     * writes use the direct host accessor (vdso_host_page), not the guest
+     * permission walker. The vDSO is RX to EL0 so guest_ptr_w would silently
+     * bail here; guest_ptr happens to work because it only requires read
+     * perm, but that inconsistency is brittle.
+     */
+    uint8_t *page = vdso_host_page(g);
+    if (!page)
+        return;
+    uint32_t *initialized = (uint32_t *) (page + VDSO_OFF_VVAR);
+    uint8_t *vvar = page + VDSO_OFF_VVAR;
+
+    /* Three-state CAS reservation: 0 = unseeded, 2 = reserving (one host
+     * thread owns the anchor stores), 1 = ready. Multiple host threads can
+     * concurrently take the SVC fallback on the first guest call; without
+     * the reservation they race on the plain anchor stores. The CAS winner
+     * writes the fields and releases 1; losers bail. The guest trampoline
+     * loads initialized with LDAR and only takes the fast path on
+     * initialized == 1, so state 2 still routes to the SVC fallback.
+     *
+     * Both MONO and REAL anchor pairs are written together so a fast-path
+     * caller for either clockid sees a consistent pair after observing
+     * initialized == 1. The two pairs share anchor_cntvct (the trampoline's
+     * X9 at first call); macOS clock_gettime for MONO and REAL was issued
+     * by the host between then and now, so the anchor wall-clock values
+     * were sampled slightly after X9, a small constant offset that
+     * propagates unchanged into every fast-path result.
+     */
+    uint32_t expected = 0;
+    if (!__atomic_compare_exchange_n(initialized, &expected, 2,
+                                     /* weak */ false, __ATOMIC_ACQUIRE,
+                                     __ATOMIC_RELAXED))
+        return;
+
+    *(uint64_t *) (vvar + VVAR_OFF_ANCHOR_CNTVCT) = guest_cntvct;
+    *(uint64_t *) (vvar + VVAR_OFF_ANCHOR_MONO_SEC) = (uint64_t) mono_sec;
+    *(uint64_t *) (vvar + VVAR_OFF_ANCHOR_MONO_NSEC) = (uint64_t) mono_nsec;
+    *(uint64_t *) (vvar + VVAR_OFF_ANCHOR_REAL_SEC) = (uint64_t) real_sec;
+    *(uint64_t *) (vvar + VVAR_OFF_ANCHOR_REAL_NSEC) = (uint64_t) real_nsec;
+
+    /* Publish: this release store of 1 pairs with the trampoline's LDAR
+     * load of the same word, so a guest that observes 1 also observes the
+     * anchor fields written above.
+     */
+    __atomic_store_n(initialized, 1, __ATOMIC_RELEASE);
+}
+
+uint64_t vdso_clock_gettime_svc_pc(void)
+{
+    return VDSO_BASE + VDSO_CLOCK_GETTIME_SVC_PC; /* GVA = base + offset */
+}
+
+bool vdso_anchor_is_seeded(guest_t *g)
+{
+    uint8_t *host_page = vdso_host_page(g);
+    if (!host_page)
+        return false;
+    /* Acquire load pairs with the release store in vdso_seed_anchor that
+     * publishes the anchor fields. Only state 1 (ready) counts as seeded;
+     * state 2 means a host thread is still reserving.
+     */
+    uint32_t state = __atomic_load_n((uint32_t *) (host_page + VDSO_OFF_VVAR),
+                                     __ATOMIC_ACQUIRE);
+    return state == 1;
+}
+
+void vdso_attention_or(guest_t *g, uint32_t bits)
+{
+    /* OR bits into the vvar attention word. The vDSO is mapped RX to EL0,
+     * but the host owns the embedded vvar and must still be able to mirror
+     * shim attention into it, so this goes through vdso_host_page rather
+     * than the guest-permission walker.
+     */
+    uint8_t *host_page = vdso_host_page(g);
+    if (host_page) {
+        uint8_t *attn_word = host_page + VDSO_OFF_VVAR + VVAR_OFF_ATTENTION;
+        /* SEQ_CST mirrors shim_globals_attn_or. EL0 vDSO fast paths
+         * (libc time/getcpu/etc.) read this word without going through
+         * HVC, so a reader that LDAR-loads attn=0 must not observe
+         * later publish_creds stores. ACQ_REL alone does not provide
+         * that: release-acquire only orders the forward direction.
+         */
+        __atomic_fetch_or((uint32_t *) attn_word, bits, __ATOMIC_SEQ_CST);
+    }
+}
+
+
+void vdso_attention_and(guest_t *g, uint32_t mask)
+{
+    /* AND mask into the vvar attention word via the host view of the page. */
+    uint8_t *host_page = vdso_host_page(g);
+    if (host_page) {
+        uint8_t *attn_word = host_page + VDSO_OFF_VVAR + VVAR_OFF_ATTENTION;
+        __atomic_fetch_and((uint32_t *) attn_word, mask, __ATOMIC_RELEASE);
+    }
+}
diff --git a/src/core/vdso.h b/src/core/vdso.h
index e3a41d5..0986ab5 100644
--- a/src/core/vdso.h
+++ b/src/core/vdso.h
@@ -12,17 +12,63 @@
 
 #pragma once
 
+#include <stdbool.h>
+#include <stdint.h>
+
 #include "core/guest.h"
 
 /* Guest address where the vDSO is placed (one 4KiB page, below PT pool) */
 #define VDSO_BASE 0x0000F000ULL
 #define VDSO_SIZE 0x00001000ULL /* 4KiB */
-#define VDSO_OFF_TEXT 0x0B0     /* Offset of .text (trampoline code) */
+/* Offset of __kernel_rt_sigreturn (the signal trampoline glibc/musl jumps
+ * to via X30/LR after the handler returns). Must match TEXT_OFF_SIGRET in
+ * src/core/vdso.c; kept here so signal.c can target it without including
+ * the vDSO internals.
+ */
+#define VDSO_OFF_SIGRET 0x0E0
 
 /* Build a minimal vDSO ELF image at VDSO_BASE in guest memory.
  * The image contains a valid ELF header, one LOAD program header, SHT_DYNSYM
- * and SHT_STRTAB sections, and a __kernel_rt_sigreturn symbol pointing to
- * a small trampoline (mov x8, #139; svc #0).
+ * and SHT_STRTAB sections, and a __kernel_rt_sigreturn symbol pointing to a
+ * small trampoline (mov x8, #139; svc #0).
  * Returns the GVA of the ELF header (== VDSO_BASE), or 0 on failure.
  */
 uint64_t vdso_build(guest_t *g);
+
+/* If the vvar anchor has not been seeded yet, install the supplied cntvct as
+ * the guest-frame anchor paired with the given monotonic and realtime
+ * wall-clock values. Idempotent: calls made once seeding has started or
+ * completed (initialized != 0) are no-ops. Used by sys_clock_gettime to
+ * upgrade the first __kernel_clock_gettime SVC fallback into a permanent
+ * vvar fast path that serves both CLOCK_MONOTONIC and CLOCK_REALTIME.
+ */
+void vdso_seed_anchor(guest_t *g,
+                      uint64_t guest_cntvct,
+                      int64_t mono_sec,
+                      int64_t mono_nsec,
+                      int64_t real_sec,
+                      int64_t real_nsec);
+
+/* GVA at which the trampoline's svc_fallback issues its SVC. Used by
+ * sys_clock_gettime to verify a clock_gettime trap actually came from the vDSO
+ * fallback path (and thus carries a guest-frame CNTVCT in X9) versus an
+ * unrelated raw syscall(SYS_clock_gettime, ...). The trap returns to SVC_PC
+ * + 4, so callers compare ELR_EL1 against that.
+ */
+uint64_t vdso_clock_gettime_svc_pc(void);
+
+/* Returns true once the vvar anchor has been published (initialized==1) and
+ * the fast path can never be reseeded. Lets the post-SVC handler in
+ * sys_clock_gettime skip the ELR_EL1 + X9 HVF reads it otherwise needs for
+ * the seeding gate, since the second-call onward gate is moot once seeded.
+ * Uses acquire ordering paired with vdso_seed_anchor's release store.
+ */
+bool vdso_anchor_is_seeded(guest_t *g);
+
+/* Mirror the shim attention bitmask into the vvar page. The vDSO
+ * clock_gettime fast path reads this word and falls back to SVC whenever
+ * it is nonzero, preserving the normal post-HVC timer/signal epilogue while
+ * guest attention is pending.
+ */
+void vdso_attention_or(guest_t *g, uint32_t bits);
+void vdso_attention_and(guest_t *g, uint32_t mask);
diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c
index f9746cd..a36a673 100644
--- a/src/runtime/fork-state.c
+++ b/src/runtime/fork-state.c
@@ -21,6 +21,7 @@
 #include "debug/log.h"
 #include "syscall/abi.h"
 #include "syscall/internal.h"
+#include "syscall/io.h"
 #include "syscall/mem.h"
 #include "syscall/proc.h"
 
@@ -34,6 +35,15 @@ int fork_ipc_write_all(int fd, const void *buf, size_t len)
                 continue;
             return -1;
         }
+        if (n == 0) {
+            /* Defensive: an unexpected zero return on a blocking socket
+             * would otherwise spin forever, since p and len stay at the
+             * same offset. Treat it as an IO failure so the parent and
+             * child both bail rather than wedge.
+             */
+            errno = EIO;
+            return -1;
+        }
         p += n;
         len -= n;
     }
@@ -249,9 +259,19 @@ int fork_ipc_send_fd_table(int ipc_sock)
         if (fd_table[i].type == FD_CLOSED)
             continue;
 
+        /* Synthetic-fd types are filtered here; see fd_type_is_synthetic
+         * in syscall/internal.h for the rationale (kqueue cannot cross
+         * SCM_RIGHTS on macOS, per-class side tables are not serialized).
+         * The child sees these slots as FD_CLOSED and recreates them via
+         * the appropriate syscall.
+         */
+        int t = fd_table[i].type;
+        if (fd_type_is_synthetic(t))
+            continue;
+
         int host_fd;
         bool was_duped = false;
-        if (fd_table[i].type != FD_STDIO) {
+        if (t != FD_STDIO) {
             int duped = dup(fd_table[i].host_fd);
             if (duped < 0)
                 continue;
@@ -315,8 +335,11 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g)
         return -1;
     }
 
-    if (num_fds == 0)
+    if (num_fds == 0) {
+        for (int fd = 0; fd < 3; fd++)
+            fd_mark_closed(fd);
         return 0;
+    }
 
     ipc_fd_entry_t *fd_entries = calloc(num_fds, sizeof(ipc_fd_entry_t));
     if (!fd_entries)
@@ -328,6 +351,16 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g)
         return -1;
     }
 
+    bool low_fd_present[3] = {false, false, false};
+    for (uint32_t i = 0; i < num_fds; i++) {
+        int gfd = fd_entries[i].guest_fd;
+        if (RANGE_CHECK(gfd, 0, 3) && !fd_type_is_synthetic(fd_entries[i].type))
+            low_fd_present[gfd] = true;
+    }
+    for (int fd = 0; fd < 3; fd++)
+        if (!low_fd_present[fd])
+            fd_mark_closed(fd);
+
     int *host_fds = calloc(num_fds, sizeof(int));
     if (!host_fds) {
         free(fd_entries);
@@ -361,15 +394,35 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g)
         if (fd_entries[i].type == FD_STDIO) {
             close(host_fds[i]);
             fd_table[gfd].linux_flags = fd_entries[i].linux_flags;
+            fd_refresh_urandom_bitmap(gfd);
             memcpy(fd_table[gfd].proc_path, fd_entries[i].proc_path,
                    sizeof(fd_table[gfd].proc_path));
             fd_table[gfd].seals = fd_entries[i].seals;
+        } else if (fd_type_is_synthetic(fd_entries[i].type)) {
+            /* Defense in depth: the parent's fork_ipc_send_fd_table
+             * already filters synthetic types out of the SCM_RIGHTS
+             * payload (see fd_type_is_synthetic in syscall/internal.h).
+             * If anything still arrives here, drop the inherited host
+             * fd and leave the slot FD_CLOSED so the child must
+             * recreate the fd via the appropriate syscall.
+             */
+            log_debug(
+                "fork-child: dropping unexpected synthetic-type fd %d (type "
+                "%d)",
+                gfd, fd_entries[i].type);
+            close(host_fds[i]);
+            fd_mark_closed(gfd);
+            continue;
         } else {
-            fd_alloc_at(gfd, fd_entries[i].type, host_fds[i]);
+            void (*cleanup)(int) = fd_cleanup_for_type(fd_entries[i].type);
+            fd_alloc_at(gfd, fd_entries[i].type, host_fds[i], cleanup);
             fd_table[gfd].linux_flags = fd_entries[i].linux_flags;
+            fd_refresh_urandom_bitmap(gfd);
             memcpy(fd_table[gfd].proc_path, fd_entries[i].proc_path,
                    sizeof(fd_table[gfd].proc_path));
             fd_table[gfd].seals = fd_entries[i].seals;
+            if (fd_entries[i].type == FD_URANDOM)
+                urandom_fd_reset_cache(gfd);
 
             if (fd_entries[i].type != FD_DIR)
                 continue;
@@ -656,15 +709,25 @@ int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig)
         log_error("fork-child: failed to read region count");
         return -1;
     }
-    if (num_guest_regions > GUEST_MAX_REGIONS)
-        num_guest_regions = GUEST_MAX_REGIONS;
-    if (num_guest_regions > 0 &&
+    uint32_t recv_regions = num_guest_regions;
+    if (recv_regions > GUEST_MAX_REGIONS)
+        recv_regions = GUEST_MAX_REGIONS;
+    if (recv_regions > 0 &&
         fork_ipc_read_all(ipc_fd, g->regions,
-                          num_guest_regions * sizeof(guest_region_t)) < 0) {
+                          recv_regions * sizeof(guest_region_t)) < 0) {
         log_error("fork-child: failed to read regions");
         return -1;
     }
-    g->nregions = (int) num_guest_regions;
+    /* Drain any excess records the parent serialized beyond the local cap.
+     * Without this drain, the next read (num_preannounced) consumes stale
+     * region bytes and desynchronizes the rest of the IPC payload. Mirrors
+     * the preannounced-region drain below.
+     */
+    if (num_guest_regions > recv_regions &&
+        fork_ipc_drain_bytes(ipc_fd, (num_guest_regions - recv_regions) *
+                                         sizeof(guest_region_t)) < 0)
+        return -1;
+    g->nregions = (int) recv_regions;
 
     uint32_t num_preannounced = 0;
     if (fork_ipc_read_all(ipc_fd, &num_preannounced, sizeof(num_preannounced)) <
diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c
index 963cb61..3f8c4a5 100644
--- a/src/runtime/forkipc.c
+++ b/src/runtime/forkipc.c
@@ -29,6 +29,8 @@
 #include "hvutil.h"
 #include "utils.h"
 
+#include "core/shim-globals.h"
+
 #include "runtime/forkipc.h"
 #include "runtime/fork-state.h"
 #include "runtime/futex.h"
@@ -299,6 +301,20 @@ int fork_child_main(int ipc_fd,
     HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL1, regs.sp_el1));
     HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL0, regs.tpidr_el0));
 
+    /* TPIDR_EL1 is set by the host (never inherited from the parent's
+     * register snapshot) because it must point at the child's own
+     * shim_globals base in the child's IPA; shim_data_base happens to
+     * be the same value in both processes (layout derives from
+     * guest_size + ipa_bits which match across fork), but installing
+     * it explicitly keeps the child consistent with the bootstrap path.
+     * CONTEXTIDR_EL1 holds the per-vCPU tid (== child pid for the
+     * single-threaded child at this point).
+     */
+    if (shim_globals_install_per_vcpu(vcpu, &g, hdr.child_pid) < 0) {
+        guest_destroy(&g);
+        return 1;
+    }
+
     /* Enable MMU directly (page tables already in guest memory from IPC).
      * SCTLR must include MMU-enable (M), caches (C, I), RES1 bits, and EL0
      * cache maintenance access (UCI, UCT) for JIT translators.
@@ -333,6 +349,39 @@ int fork_child_main(int ipc_fd,
      */
     thread_register_main(vcpu, vexit, hdr.child_pid, regs.sp_el1);
 
+    /* Re-publish identity into the child's shim-globals cache: the
+     * CoW / region copy inherits the parent's pid/uid values, and the
+     * shim's identity fast path would otherwise return the parent's
+     * pid to the child. Identity is now committed via the same path
+     * the bootstrap uses.
+     */
+    shim_globals_init(&g);
+    shim_globals_set_trace_enabled(&g, verbose);
+    shim_globals_publish_pid(&g, hdr.child_pid, hdr.parent_pid);
+    shim_globals_publish_creds(&g, hdr.uid, hdr.euid, hdr.gid, hdr.egid);
+    /* Fresh entropy for the child. Linux's vDSO getrandom epoch-bumps
+     * across fork; here we just re-fill the ring from arc4random_buf
+     * which seeds from the host kernel's RNG, so parent and child do
+     * not share future urandom output.
+     */
+    shim_globals_refill_urandom_ring(&g);
+    /* Register the singleton for the child's signal.c so its
+     * attention setters know which guest to update.
+     */
+    signal_set_shim_globals_guest(&g);
+    /* Same for the fd-table hooks. Must precede any fd_alloc the
+     * child performs (the fd-table-restore step has already run
+     * above, but those slots are populated via direct memcpy of the
+     * parent's entries; subsequent open/dup/close in the child rely
+     * on this registration to keep the bitmap in sync).
+     */
+    shim_globals_set_singleton(&g);
+    /* shim_globals_init above zeroed the urandom bitmap. Walk the inherited fd
+     * table and re-mark every readable FD_URANDOM slot so the shim's read fast
+     * path sees the correct state from the first syscall onward.
+     */
+    shim_globals_rebuild_urandom_bitmap();
+
     /* Now that current_thread is set, apply signal state. This must happen
      * after thread_register_main() so the per-thread blocked mask and altstack
      * are properly restored to the thread entry.
@@ -669,6 +718,14 @@ static void *thread_create_and_run(void *arg)
     WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TTBR0_EL1, tca->ttbr0));
     WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_CPACR_EL1, tca->cpacr));
 
+    /* All worker vCPUs in the process share the same shim_globals base
+     * (one VM per process); a fresh TPIDR_EL1 set is still required
+     * because HVF created this vCPU empty. CONTEXTIDR_EL1 holds the
+     * per-thread tid that the gettid shim fast path returns.
+     */
+    if (shim_globals_install_per_vcpu(vcpu, tca->guest, t->guest_tid) < 0)
+        goto startup_failed;
+
     /* MMU already on, so set SCTLR with M=1 directly (page tables exist) */
     WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SCTLR_EL1, tca->sctlr));
 
@@ -980,6 +1037,11 @@ static void *vm_clone_thread_run(void *arg)
     HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SCTLR_EL1, tca->sctlr));
     HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL1, tca->sp_el1));
     HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL0, tca->child_stack));
+    if (shim_globals_install_per_vcpu(vcpu, tca->guest, t->guest_tid) < 0) {
+        thread_deactivate(t);
+        free(tca);
+        return NULL;
+    }
 
     /* TLS pointer */
     if (tca->flags & LINUX_CLONE_SETTLS) {
@@ -1272,7 +1334,7 @@ int64_t sys_clone(hv_vcpu_t vcpu,
      *
      * Rosetta guests are excluded from CoW even when shm-backed: rosetta's
      * JIT state (TLS slabs, code caches, indirect-call tables, block lists)
-     * is process-local and corrupts when COW-shared. The legacy region-copy
+     * is process-local and corrupts when CoW-shared. The legacy region-copy
      * path preserves the parent's JIT state independently per child.
      */
     bool use_shm = (g->shm_fd >= 0) && !g->is_rosetta;
diff --git a/src/syscall/abi.h b/src/syscall/abi.h
index eda9bc7..b87848e 100644
--- a/src/syscall/abi.h
+++ b/src/syscall/abi.h
@@ -364,6 +364,11 @@ typedef struct {
 #define LINUX_O_RDONLY 0x0000
 #define LINUX_O_WRONLY 0x0001
 #define LINUX_O_RDWR 0x0002
+/* O_ACCMODE is the mask covering O_RDONLY, O_WRONLY, O_RDWR. The urandom
+ * read fast-path bitmap and the dup-alias metadata both need this mask to
+ * isolate the access-mode bits from the other LINUX_O_* flags.
+ */
+#define LINUX_O_ACCMODE 0x0003
 #define LINUX_O_CREAT 0x0040
 #define LINUX_O_EXCL 0x0080
 #define LINUX_O_NOCTTY 0x0100
@@ -639,6 +644,7 @@ typedef struct {
 #define FD_FUSE_DEV 14
 #define FD_FUSE_FILE 15
 #define FD_FUSE_DIR 16
+#define FD_URANDOM 17
 #define FD_VIRTUAL_PATH_MAX 64
 
 /* File sealing flags (F_SEAL_*) for memfd_create. Tracked per-FD. */
diff --git a/src/syscall/exec.c b/src/syscall/exec.c
index cecfcb2..6d8ca2e 100644
--- a/src/syscall/exec.c
+++ b/src/syscall/exec.c
@@ -25,6 +25,7 @@
 #include "core/bootstrap.h"
 #include "core/elf.h"
 #include "core/rosetta.h"
+#include "core/shim-globals.h"
 #include "core/stack.h"
 #include "core/vdso.h"
 
@@ -61,6 +62,37 @@ static void exec_sync_vcpu_regs(hv_vcpu_t vcpu)
     (void) vcpu_get_reg(vcpu, HV_REG_X8);
 }
 
+static void exec_republish_shim_globals_or_die(hv_vcpu_t vcpu,
+                                               guest_t *g,
+                                               bool verbose)
+{
+    /* guest_reset zeros shim_data. Reinitialize the host-owned fast-path
+     * state before returning to either native aarch64 code or the Rosetta
+     * runtime, otherwise identity and urandom fast paths observe all-zero
+     * cache state after exec.
+     */
+    shim_globals_init(g);
+    shim_globals_set_trace_enabled(g, verbose);
+
+    /* TPIDR_EL1 carries the shim_globals base. Past the point of no return,
+     * failure leaves the replacement image unable to use the EL1 shim
+     * safely, so exit(128) like the other post-reset fatal errors.
+     */
+    if (shim_globals_install_tpidr(vcpu, g) < 0) {
+        log_fatal(
+            "execve failed after point of no return: "
+            "shim_globals_install_tpidr");
+        exit(128);
+    }
+    /* Re-publish identity, urandom state, and attention after the reset. */
+    shim_globals_publish_pid(g, proc_get_pid(), proc_get_ppid());
+    shim_globals_publish_creds(g, proc_get_uid(), proc_get_euid(),
+                               proc_get_gid(), proc_get_egid());
+    shim_globals_rebuild_urandom_bitmap();
+    shim_globals_refill_urandom_ring(g);
+    shim_globals_recompute_attention(g);
+}
+
 /* Release the buffers and temporary host-side files that sys_execve allocates
  * before crossing the point of no return. Used by both the Rosetta and the
  * aarch64 success paths.
@@ -728,6 +760,7 @@ int64_t sys_execve(hv_vcpu_t vcpu,
                 path);
             exit(128);
         }
+        exec_republish_shim_globals_or_die(vcpu, g, verbose);
 
         /* I-cache for the (possibly re-mapped) rosetta segments has already
          * been invalidated inside rosetta_prepare; only the shim needs an
@@ -760,8 +793,10 @@ int64_t sys_execve(hv_vcpu_t vcpu,
     }
 
     /* Load the executable image that was validated before guest_reset(). */
+    uint64_t infra_lo = g->interp_base - INFRA_RESERVE;
+    uint64_t infra_hi = g->interp_base;
     if (elf_map_segments(&elf_info, path_host, g->host_base, g->guest_size,
-                         elf_load_base) < 0) {
+                         elf_load_base, infra_lo, infra_hi) < 0) {
         log_fatal(
             "execve failed after point of no return: "
             "failed to map ELF segments for %s",
@@ -782,7 +817,8 @@ int64_t sys_execve(hv_vcpu_t vcpu,
     if (elf_info.interp_path[0] != '\0') {
         interp_base = g->interp_base;
         if (elf_map_segments(&interp_info, interp_resolved, g->host_base,
-                             g->guest_size, interp_base) < 0) {
+                             g->guest_size, interp_base, infra_lo,
+                             infra_hi) < 0) {
             log_fatal(
                 "execve failed after point of no return: "
                 "failed to map interpreter segments");
@@ -851,13 +887,18 @@ int64_t sys_execve(hv_vcpu_t vcpu,
                                           .gpa_end = g->shim_base + shim_size,
                                           .perms = MEM_PERM_RX};
 
-    /* EL1 exception handlers use this block for stack and scratch state. */
+    /* EL1 exception handlers use this block for stack and scratch state.
+     * EL1-only so EL0 cannot read or store directly to the identity cache,
+     * urandom ring, or attention word that the shim fast paths consult.
+     * Matches bootstrap.c; if this regresses to plain RW, execve quietly
+     * defeats the protection on every new image.
+     */
     if (nregions >= MAX_REGIONS)
         goto too_many_regions;
     regions[nregions++] =
         (mem_region_t) {.gpa_start = g->shim_data_base,
                         .gpa_end = g->shim_data_base + BLOCK_2MIB,
-                        .perms = MEM_PERM_RW};
+                        .perms = MEM_PERM_RW_EL1_ONLY};
 
     /* The vDSO sits in the same 2MiB block as the shim. The page-table builder
      * splits the block into 4KiB L3 pages when its regions don't fully cover
@@ -943,9 +984,12 @@ int64_t sys_execve(hv_vcpu_t vcpu,
     guest_region_add(g, g->shim_base, g->shim_base + shim_size,
                      LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0,
                      "[shim]");
+    /* Report PROT_NONE for [shim-data] to match the EL1-only mapping (see
+     * matching bootstrap.c registration). EL0 dereferences fault, so user
+     * tooling reading /proc/self/maps should see the same access state.
+     */
     guest_region_add(g, g->shim_data_base, g->shim_data_base + BLOCK_2MIB,
-                     LINUX_PROT_READ | LINUX_PROT_WRITE, LINUX_MAP_PRIVATE, 0,
-                     "[shim-data]");
+                     LINUX_PROT_NONE, LINUX_MAP_PRIVATE, 0, "[shim-data]");
     for (int i = 0; i < elf_info.num_segments; i++) {
         guest_region_add(g, elf_info.segments[i].gpa + elf_load_base,
                          elf_info.segments[i].gpa + elf_info.segments[i].memsz +
@@ -991,6 +1035,7 @@ int64_t sys_execve(hv_vcpu_t vcpu,
          * omit sa_restorer.
          */
         uint64_t exec_vdso = vdso_build(g);
+        exec_republish_shim_globals_or_die(vcpu, g, verbose);
 
         sp = build_linux_stack(g, g->stack_top, argc, argv_const, envp_const,
                                &elf_info, elf_load_base, interp_base, exec_vdso,
diff --git a/src/syscall/fd.c b/src/syscall/fd.c
index c1f828f..f06b0d2 100644
--- a/src/syscall/fd.c
+++ b/src/syscall/fd.c
@@ -104,6 +104,7 @@ void timerfd_init(void)
 {
     for (int i = 0; i < TIMERFD_MAX; i++)
         timerfd_state[i].guest_fd = -1;
+    fd_register_cleanup(FD_TIMERFD, timerfd_close);
 }
 
 static int timerfd_find(int guest_fd)
@@ -514,10 +515,20 @@ static void timerfd_close(int guest_fd)
 #define LINUX_EFD_NONBLOCK 0x800  /* Same as O_NONBLOCK */
 #define LINUX_EFD_SEMAPHORE 1
 
-/* Per-eventfd state */
+/* Per-eventfd state. A slot is shared by every guest_fd bound to it (via
+ * dup/dup2/fcntl F_DUPFD), matching the Linux contract that dup'd eventfd fds
+ * share one kernel object. eventfd_owner[gfd] maps a guest_fd to its slot, and
+ * several guest_fds may map to the same slot. The slot owns its own read end
+ * for readiness/drain/blocking operations, so it does not depend on any one
+ * guest fd remaining open, and it is freed when refcount drops to zero. The
+ * guest_fd field is kept for sfd_alloc_slot's "free if guest_fd == -1"
+ * convention. It is set when sys_eventfd2 creates the slot and reset on free,
+ * so it may name an fd that has since closed; use eventfd_owner for lookups.
+ */
 #define EVENTFD_MAX 32
 static struct {
-    int guest_fd;     /* Guest fd (-1 if unused) */
+    int guest_fd;     /* Primary guest fd, -1 when slot is free */
+    int refcount;     /* Number of guest_fds bound to this slot */
     int pipe_rd;      /* Read end of self-pipe (for poll/epoll readiness) */
     int pipe_wr;      /* Write end of self-pipe */
     uint64_t counter; /* Accumulated event counter */
@@ -525,16 +536,22 @@ static struct {
     int nonblock;     /* O_NONBLOCK */
 } eventfd_state[EVENTFD_MAX];
 
+static int eventfd_owner[FD_TABLE_SIZE]; /* guest_fd -> slot, or -1 */
+
 void eventfd_init(void)
 {
     for (int i = 0; i < EVENTFD_MAX; i++)
         eventfd_state[i].guest_fd = -1;
+    for (int i = 0; i < FD_TABLE_SIZE; i++)
+        eventfd_owner[i] = -1;
+    fd_register_cleanup(FD_EVENTFD, eventfd_close);
 }
 
 static int eventfd_find(int guest_fd)
 {
-    return sfd_find_slot(eventfd_state, EVENTFD_MAX, sizeof(eventfd_state[0]),
-                         guest_fd);
+    if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE))
+        return -1;
+    return eventfd_owner[guest_fd];
 }
 
 static int eventfd_slot_alloc(void)
@@ -542,6 +559,19 @@ static int eventfd_slot_alloc(void)
     return sfd_alloc_slot(eventfd_state, EVENTFD_MAX, sizeof(eventfd_state[0]));
 }
 
+/* Drop one ref with sfd_lock held; the last ref closes and frees the slot. */
+static void eventfd_release_ref_locked(int slot)
+{
+    if (--eventfd_state[slot].refcount > 0)
+        return;
+    close(eventfd_state[slot].pipe_rd);
+    close(eventfd_state[slot].pipe_wr);
+    eventfd_state[slot].pipe_rd = eventfd_state[slot].pipe_wr = -1;
+    eventfd_state[slot].refcount = 0;
+    eventfd_state[slot].counter = 0;
+    eventfd_state[slot].guest_fd = -1; /* slot is reusable from here */
+}
+
 int64_t sys_eventfd2(unsigned int initval, int flags)
 {
     if (flags & ~(LINUX_EFD_CLOEXEC | LINUX_EFD_NONBLOCK | LINUX_EFD_SEMAPHORE))
@@ -564,9 +594,22 @@ int64_t sys_eventfd2(unsigned int initval, int flags)
         return linux_errno();
     }
 
+    int state_rd = dup(pipefd[0]);
+    if (state_rd < 0 || fd_set_nonblock(state_rd) < 0 ||
+        fd_set_cloexec(state_rd) < 0) {
+        int saved_errno = errno;
+        if (state_rd >= 0)
+            close(state_rd);
+        close(pipefd[0]);
+        close(pipefd[1]);
+        errno = saved_errno;
+        return linux_errno();
+    }
+
     /* Allocate guest fd: use read end as the host fd so epoll/poll sees it */
     int gfd = fd_alloc(FD_EVENTFD, pipefd[0], eventfd_close);
     if (gfd < 0) {
+        close(state_rd);
         close(pipefd[0]);
         close(pipefd[1]);
         return -LINUX_EMFILE;
@@ -577,17 +620,20 @@ int64_t sys_eventfd2(unsigned int initval, int flags)
     if (slot < 0) {
         pthread_mutex_unlock(&sfd_lock);
         fd_mark_closed(gfd);
+        close(state_rd);
         close(pipefd[0]);
         close(pipefd[1]);
         return -LINUX_ENOMEM;
     }
 
     eventfd_state[slot].guest_fd = gfd;
-    eventfd_state[slot].pipe_rd = pipefd[0];
+    eventfd_state[slot].refcount = 1;
+    eventfd_state[slot].pipe_rd = state_rd;
     eventfd_state[slot].pipe_wr = pipefd[1];
     eventfd_state[slot].counter = (uint64_t) initval;
     eventfd_state[slot].semaphore = (flags & LINUX_EFD_SEMAPHORE) ? 1 : 0;
     eventfd_state[slot].nonblock = (flags & LINUX_EFD_NONBLOCK) ? 1 : 0;
+    eventfd_owner[gfd] = slot;
     pthread_mutex_unlock(&sfd_lock);
 
     fd_table[gfd].linux_flags =
@@ -610,14 +656,117 @@ static void eventfd_close(int guest_fd)
     pthread_mutex_lock(&sfd_lock);
     int slot = eventfd_find(guest_fd);
     if (slot >= 0) {
-        close(eventfd_state[slot].pipe_wr);
-        /* pipe_rd is closed by sys_close() as host_fd */
-        eventfd_state[slot].guest_fd = -1;
-        eventfd_state[slot].counter = 0;
+        eventfd_owner[guest_fd] = -1;
+        eventfd_release_ref_locked(slot);
     }
     pthread_mutex_unlock(&sfd_lock);
 }
 
+/* Bind an additional guest_fd to the same slot as src_fd, sharing the
+ * counter and pipe state. Two races to defeat:
+ *
+ *   - Source identity. duplicate_guest_fd() snapshots src_fd under
+ *     fd_lock, releases it, then calls us. Between those points src_fd
+ *     could be closed and rebound to a different eventfd. We carry the
+ *     caller's snapshot of fd_table[src_fd].host_fd as src_host_fd and verify
+ *     under fd_lock + sfd_lock that the source fd still has that host fd and
+ *     still maps to a live eventfd slot.
+ *
+ *   - Destination close. fd_alloc_*_relaxed publishes the new guest_fd
+ *     with eventfd_close as cleanup before we install the owner mapping.
+ *     A racing close would run eventfd_close, see owner == -1, skip the
+ *     refcount decrement, and leak the slot. We defeat this by reserving a
+ *     slot ref before publishing the destination, then holding fd_lock +
+ *     sfd_lock together while we verify fd_table[new] is still FD_EVENTFD with
+ *     the host_fd we allocated and set eventfd_owner. Any close that already
+ *     ran is observed here as FD_CLOSED, and we abandon the bind cleanly with
+ *     no leak.
+ */
+int eventfd_dup_fd(int src_fd,
+                   int src_host_fd,
+                   int min_guest_fd,
+                   int fixed_guest_fd,
+                   bool fixed_slot,
+                   int linux_flags)
+{
+    /* Pin the source under fd_lock + sfd_lock and dup the slot-owned
+     * readiness fd. The slot fd is independent of any guest alias, so closing
+     * the source later cannot invalidate eventfd_state[slot].pipe_rd.
+     */
+    pthread_mutex_lock(&fd_lock);
+    pthread_mutex_lock(&sfd_lock);
+    int slot = eventfd_find(src_fd);
+    if (slot < 0 || fd_table[src_fd].type != FD_EVENTFD ||
+        fd_table[src_fd].host_fd != src_host_fd ||
+        eventfd_state[slot].refcount <= 0) {
+        pthread_mutex_unlock(&sfd_lock);
+        pthread_mutex_unlock(&fd_lock);
+        errno = EBADF;
+        return -1;
+    }
+    eventfd_state[slot].refcount++;
+    int new_host_fd = dup(eventfd_state[slot].pipe_rd);
+    int original_pipe_rd = eventfd_state[slot].pipe_rd;
+    if (new_host_fd < 0)
+        eventfd_release_ref_locked(slot);
+    pthread_mutex_unlock(&sfd_lock);
+    pthread_mutex_unlock(&fd_lock);
+    if (new_host_fd < 0)
+        return -1;
+
+    /* Publish the destination fd with eventfd_close as cleanup. The
+     * eventfd_owner mapping is still -1, so a racing close here observes
+     * owner == -1 and does nothing; we detect that below.
+     */
+    int new_guest_fd = fixed_slot
+                           ? fd_alloc_at_relaxed(fixed_guest_fd, FD_EVENTFD,
+                                                 new_host_fd, eventfd_close)
+                           : fd_alloc_from_relaxed(min_guest_fd, FD_EVENTFD,
+                                                   new_host_fd, eventfd_close);
+    if (new_guest_fd < 0) {
+        int saved_errno = fixed_slot ? EBADF : errno;
+        close(new_host_fd);
+        pthread_mutex_lock(&sfd_lock);
+        eventfd_release_ref_locked(slot);
+        pthread_mutex_unlock(&sfd_lock);
+        errno = saved_errno; /* close() above must not mask the cause */
+        return -1;
+    }
+
+    /* Commit the bind under both locks in the documented order
+     * (fd_lock then sfd_lock). The reserved ref keeps the slot alive, so
+     * the slot checks below are defensive; the check that matters is
+     * whether new_guest_fd is still the entry published above. If it is
+     * not, a racing close (or close + reopen) already retired it.
+     */
+    pthread_mutex_lock(&fd_lock);
+    pthread_mutex_lock(&sfd_lock);
+    bool dst_ours = fd_table[new_guest_fd].type == FD_EVENTFD &&
+                    fd_table[new_guest_fd].host_fd == new_host_fd;
+    if (!dst_ours || eventfd_state[slot].refcount <= 0 ||
+        eventfd_state[slot].pipe_rd != original_pipe_rd) {
+        pthread_mutex_unlock(&sfd_lock);
+        pthread_mutex_unlock(&fd_lock);
+        /* Tear down the destination only while it is still our entry. If
+         * it was closed, new_host_fd is already released; if it was
+         * rebound, closing it here would destroy another thread's fd.
+         */
+        fd_entry_t snap;
+        if (dst_ours && fd_snapshot_and_close(new_guest_fd, &snap))
+            fd_cleanup_entry(new_guest_fd, &snap);
+        pthread_mutex_lock(&sfd_lock);
+        eventfd_release_ref_locked(slot);
+        pthread_mutex_unlock(&sfd_lock);
+        errno = EBADF;
+        return -1;
+    }
+    eventfd_owner[new_guest_fd] = slot;
+    fd_table[new_guest_fd].linux_flags = linux_flags;
+    pthread_mutex_unlock(&sfd_lock);
+    pthread_mutex_unlock(&fd_lock);
+    return new_guest_fd;
+}
+
 /* Read from eventfd: return 8-byte counter value, then reset to 0.
  * In EFD_SEMAPHORE mode, return 1 and decrement counter by 1.
  */
@@ -657,8 +806,12 @@ int64_t eventfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count)
             return linux_errno();
 
         pthread_mutex_lock(&sfd_lock);
-        /* Re-validate: slot may have been freed by eventfd_close() */
-        if (eventfd_state[slot].guest_fd != guest_fd) {
+        /* Re-validate via the owner table, not eventfd_state[slot].guest_fd:
+         * dup'd aliases bind multiple guest_fds to the same slot, so a
+         * legitimate caller's guest_fd may not equal the primary owner.
+         */
+        if (eventfd_owner[guest_fd] != slot ||
+            eventfd_state[slot].refcount <= 0) {
             pthread_mutex_unlock(&sfd_lock);
             return -LINUX_EBADF;
         }
@@ -809,6 +962,7 @@ void signalfd_init(void)
 {
     for (int i = 0; i < SIGNALFD_MAX; i++)
         signalfd_state[i].guest_fd = -1;
+    fd_register_cleanup(FD_SIGNALFD, signalfd_close);
 }
 
 static int signalfd_find(int guest_fd)
diff --git a/src/syscall/fd.h b/src/syscall/fd.h
index e087ed4..faaf958 100644
--- a/src/syscall/fd.h
+++ b/src/syscall/fd.h
@@ -33,6 +33,21 @@ int64_t sys_timerfd_gettime(guest_t *g, int fd, uint64_t curr_value_gva);
 /* eventfd (emulated via pipe + counter) */
 int64_t sys_eventfd2(unsigned int initval, int flags);
 
+/* Duplicate an eventfd into a new guest_fd that shares the counter and pipe
+ * state of src_fd, mirroring the Linux contract that dup'd eventfds share one
+ * kernel object. src_host_fd must be the caller's snapshot of
+ * fd_table[src_fd].host_fd; it is rechecked under fd_lock + sfd_lock so a
+ * source that was closed or rebound since the snapshot fails with EBADF.
+ * fixed_slot selects the destination: fixed_guest_fd when true, otherwise the
+ * lowest free fd >= min_guest_fd. Returns the new fd, or -1 with errno set.
+ */
+int eventfd_dup_fd(int src_fd,
+                   int src_host_fd,
+                   int min_guest_fd,
+                   int fixed_guest_fd,
+                   bool fixed_slot,
+                   int linux_flags);
+
 /* signalfd (emulated via synthetic signal reads) */
 int64_t sys_signalfd4(guest_t *g,
                       int fd,
diff --git a/src/syscall/fdtable.c b/src/syscall/fdtable.c
index 5455f41..ff62307 100644
--- a/src/syscall/fdtable.c
+++ b/src/syscall/fdtable.c
@@ -20,6 +20,7 @@
 
 #include "utils.h"
 
+#include "core/shim-globals.h"
 #include "syscall/abi.h"
 #include "syscall/internal.h"
 
@@ -82,6 +83,33 @@ static inline void fd_init_entry(int fd,
     fd_table[fd].seals = 0;
     sock_opt_clear(&fd_table[fd]);
     fd_table[fd].cleanup = cleanup;
+    /* Start conservative. Callers that set linux_flags after allocation
+     * republish the readable-urandom state once the access mode is known.
+     */
+    shim_globals_mark_urandom_fd(fd, false);
+}
+
+/* Recompute fd's bit in the shim urandom bitmap from its fd_table entry:
+ * set only for an FD_URANDOM entry not opened O_WRONLY. Bad fds are ignored.
+ */
+void fd_refresh_urandom_bitmap(int fd)
+{
+    if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE))
+        return;
+
+    /* Hold fd_lock across both the read of (type, linux_flags) and the
+     * bitmap publish. If the lock were dropped in between, a concurrent
+     * sys_close could flip the slot to FD_CLOSED in the gap and this call
+     * would then set a stale 'readable urandom' bit on a freed slot, which
+     * the EL1 fast path honors.
+     */
+    pthread_mutex_lock(&fd_lock);
+    int type = fd_table[fd].type;
+    int linux_flags = fd_table[fd].linux_flags;
+    bool readable_urandom =
+        type == FD_URANDOM && (linux_flags & LINUX_O_ACCMODE) != LINUX_O_WRONLY;
+    shim_globals_mark_urandom_fd(fd, readable_urandom);
+    pthread_mutex_unlock(&fd_lock);
+}
 
 /* Find the lowest free FD >= minfd using the bitmap.
@@ -169,26 +197,29 @@ int fd_alloc(int type, int host_fd, void (*cleanup)(int))
 /* Allocate the lowest available FD >= minfd. Returns -1 if none available
  * or RLIMIT_NOFILE would be exceeded.
  */
-int fd_alloc_from(int minfd, int type, int host_fd)
+int fd_alloc_from(int minfd, int type, int host_fd, void (*cleanup)(int))
 {
     pthread_mutex_lock(&fd_lock);
-    int fd = fd_alloc_locked(minfd, type, host_fd, NULL);
+    int fd = fd_alloc_locked(minfd, type, host_fd, cleanup);
     pthread_mutex_unlock(&fd_lock);
     return fd;
 }
 
-int fd_alloc_from_relaxed(int minfd, int type, int host_fd)
+int fd_alloc_from_relaxed(int minfd,
+                          int type,
+                          int host_fd,
+                          void (*cleanup)(int))
 {
     if (!thread_is_single_active())
-        return fd_alloc_from(minfd, type, host_fd);
-    return fd_alloc_locked(minfd, type, host_fd, NULL);
+        return fd_alloc_from(minfd, type, host_fd, cleanup);
+    return fd_alloc_locked(minfd, type, host_fd, cleanup);
 }
 
 /* Allocate a specific FD slot. Enforces RLIMIT_NOFILE. Properly cleans up any
  * existing entry (including DIR* for directory FDs) before overwriting. Returns
  * -1 if out of range.
  */
-int fd_alloc_at(int fd, int type, int host_fd)
+int fd_alloc_at(int fd, int type, int host_fd, void (*cleanup)(int))
 {
     if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE))
         return -1;
@@ -204,7 +235,7 @@ int fd_alloc_at(int fd, int type, int host_fd)
     pthread_mutex_lock(&fd_lock);
     if (fd_table[fd].type != FD_CLOSED)
         old = fd_table[fd];
-    fd_init_entry(fd, type, host_fd, NULL);
+    fd_init_entry(fd, type, host_fd, cleanup);
     pthread_mutex_unlock(&fd_lock);
 
     /* Clean up old resources outside fd_lock */
@@ -214,19 +245,19 @@ int fd_alloc_at(int fd, int type, int host_fd)
     return fd;
 }
 
-int fd_alloc_at_relaxed(int fd, int type, int host_fd)
+int fd_alloc_at_relaxed(int fd, int type, int host_fd, void (*cleanup)(int))
 {
     if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE))
         return -1;
     if (fd >= rlimit_nofile_cur)
         return -1;
     if (!thread_is_single_active())
-        return fd_alloc_at(fd, type, host_fd);
+        return fd_alloc_at(fd, type, host_fd, cleanup);
 
     if (fd_table[fd].type != FD_CLOSED)
-        return fd_alloc_at(fd, type, host_fd);
+        return fd_alloc_at(fd, type, host_fd, cleanup);
 
-    fd_init_entry(fd, type, host_fd, NULL);
+    fd_init_entry(fd, type, host_fd, cleanup);
     return fd;
 }
 
@@ -238,6 +269,11 @@ int fd_alloc_at_relaxed(int fd, int type, int host_fd)
  */
 void fd_mark_closed_unlocked(int fd)
 {
+    /* Clear before publishing FD_CLOSED/free. The EL1 urandom read fast path
+     * intentionally avoids fd_lock, so it must not observe a stale urandom
+     * bit after this slot has become invalid or reusable.
+     */
+    shim_globals_mark_urandom_fd(fd, false);
     fd_table[fd].type = FD_CLOSED;
     fd_table[fd].host_fd = -1;
     fd_table[fd].dir = NULL;
@@ -334,6 +370,53 @@ bool fd_snapshot(int guest_fd, fd_entry_t *out)
     return ok;
 }
 
+/* Snapshot guest_fd into *out and dup its host fd in one fd_lock hold. */
+int fd_snapshot_and_dup(int guest_fd, fd_entry_t *out)
+{
+    int host = -1;
+    out->type = FD_CLOSED;
+    if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE))
+        return host;
+    pthread_mutex_lock(&fd_lock);
+    /* host stays -1 if the snapshot fails or the entry has no host fd */
+    if (fd_snapshot_locked(guest_fd, out, false) && out->host_fd >= 0)
+        host = dup(out->host_fd);
+    pthread_mutex_unlock(&fd_lock);
+    return host;
+}
+
+int fd_get_type(int guest_fd)
+{
+    if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE))
+        return FD_CLOSED; /* out-of-range fds read as closed */
+    pthread_mutex_lock(&fd_lock);
+    int type = fd_table[guest_fd].type; /* read the entry under fd_lock */
+    pthread_mutex_unlock(&fd_lock);
+    return type;
+}
+
+/* Per-type cleanup registry indexed by FD_* type, sized to cover abi.h plus
+ * headroom; slots default to NULL. Owning modules call fd_register_cleanup()
+ * at init time; dup and fork-restore paths read back the binding via
+ * fd_cleanup_for_type(). Out-of-range types: register is a no-op, lookup NULL.
+ */
+#define FD_TYPE_REGISTRY_SIZE 32
+static void (*fd_type_cleanup[FD_TYPE_REGISTRY_SIZE])(int);
+
+void fd_register_cleanup(int type, void (*cleanup)(int))
+{
+    if (type < 0 || type >= FD_TYPE_REGISTRY_SIZE)
+        return; /* silently dropped: the table has no slot for this type */
+    fd_type_cleanup[type] = cleanup; /* replaces any earlier binding */
+}
+
+void (*fd_cleanup_for_type(int type))(int)
+{
+    if (type < 0 || type >= FD_TYPE_REGISTRY_SIZE)
+        return NULL; /* same answer as a type with no registered cleanup */
+    return fd_type_cleanup[type]; /* NULL until fd_register_cleanup sets it */
+}
+
 /* Look up a guest FD and return a dup'd host fd that the caller owns.
  * The dup is performed under fd_lock so that close() on another thread
  * cannot invalidate the host fd between lookup and dup. Caller must
diff --git a/src/syscall/fs.c b/src/syscall/fs.c
index ce951eb..45e1ef0 100644
--- a/src/syscall/fs.c
+++ b/src/syscall/fs.c
@@ -24,9 +24,12 @@
 #include "debug/log.h"
 #include "utils.h"
 
+#include "core/shim-globals.h" /* shim_globals_mark_urandom_fd */
+
 #include "runtime/procemu.h"
 
 #include "syscall/abi.h"
+#include "syscall/fd.h" /* eventfd_dup_fd */
 #include "syscall/fuse.h"
 #include "syscall/fs.h"
 #include "syscall/internal.h"
@@ -62,6 +65,16 @@ static int opened_fd_type(int host_fd, int linux_flags)
     return FD_REGULAR;
 }
 
+static int intercepted_fd_type(const char *path, int host_fd, int linux_flags)
+{
+    int type = opened_fd_type(host_fd, linux_flags);
+    if (type < 0)
+        return type; /* pass classification failure through unchanged */
+    if (type == FD_REGULAR && path && !strcmp(path, "/dev/urandom"))
+        return FD_URANDOM; /* exact path match only; NULL path never matches */
+    return type;
+}
+
 static const char *proc_virtual_dir_path(const char *path,
                                          char *buf,
                                          size_t bufsz);
@@ -168,16 +181,11 @@ static const char *proc_virtual_dir_path(const char *path,
     return virt;
 }
 
-static int dup_fd_type(int guest_fd)
-{
-    return fd_table[guest_fd].type == FD_STDIO ? FD_REGULAR
-                                               : fd_table[guest_fd].type;
-}
-
 static int fd_alloc_opened_host(int host_fd,
                                 int type,
                                 int linux_flags,
-                                int min_guest_fd)
+                                int min_guest_fd,
+                                void (*cleanup)(int))
 {
     DIR *dir = NULL;
 
@@ -193,9 +201,10 @@ static int fd_alloc_opened_host(int host_fd,
         }
     }
 
-    int guest_fd = min_guest_fd >= 0
-                       ? fd_alloc_from_relaxed(min_guest_fd, type, host_fd)
-                       : fd_alloc_from_relaxed(0, type, host_fd);
+    int guest_fd =
+        min_guest_fd >= 0
+            ? fd_alloc_from_relaxed(min_guest_fd, type, host_fd, cleanup)
+            : fd_alloc_from_relaxed(0, type, host_fd, cleanup);
     if (guest_fd < 0) {
         int saved_errno = errno;
         if (dir)
@@ -204,9 +213,35 @@ static int fd_alloc_opened_host(int host_fd,
         return -1;
     }
 
-    fd_table[guest_fd].linux_flags = linux_flags;
-    if (dir)
-        fd_table[guest_fd].dir = dir;
+    /* Publish linux_flags, dir, and the urandom bitmap bit atomically
+     * with respect to the slot's identity. fd_alloc_*_relaxed drops
+     * fd_lock before returning, so a sibling vCPU's pathological
+     * close(guest_fd) + open() could reuse the slot between alloc and
+     * the metadata install below. Re-acquire fd_lock and verify the
+     * (type, host_fd) tuple still matches what just got allocated;
+     * if it does not, the slot belongs to a different file now and
+     * any install would clobber the sibling's entry. The sibling's
+     * close path already cleaned up our host_fd via fd_cleanup_entry,
+     * so this side only owns dir, which gets closed below.
+     */
+    bool installed = false;
+    pthread_mutex_lock(&fd_lock);
+    if (fd_table[guest_fd].type == type &&
+        fd_table[guest_fd].host_fd == host_fd) {
+        fd_table[guest_fd].linux_flags = linux_flags;
+        if (dir)
+            fd_table[guest_fd].dir = dir;
+        bool readable_urandom =
+            type == FD_URANDOM &&
+            (linux_flags & LINUX_O_ACCMODE) != LINUX_O_WRONLY;
+        shim_globals_mark_urandom_fd(guest_fd, readable_urandom);
+        installed = true;
+    }
+    pthread_mutex_unlock(&fd_lock);
+
+    if (!installed && dir)
+        closedir(dir);
+
     return guest_fd;
 }
 
@@ -249,7 +284,7 @@ int64_t sys_openat_path(guest_t *g,
                 return linux_errno();
             }
             int guest_fd =
-                fd_alloc_opened_host(sidecar_fd, type, linux_flags, -1);
+                fd_alloc_opened_host(sidecar_fd, type, linux_flags, -1, NULL);
             if (guest_fd < 0) {
                 close_keep_errno(sidecar_fd);
                 return linux_errno();
@@ -278,7 +313,8 @@ int64_t sys_openat_path(guest_t *g,
             close_keep_errno(host_fd);
             return linux_errno();
         }
-        int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1);
+        int guest_fd =
+            fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL);
         if (guest_fd < 0) {
             close_keep_errno(host_fd);
             return linux_errno();
@@ -303,15 +339,17 @@ int64_t sys_openat_path(guest_t *g,
              * /proc files use fd_alloc_from(128) to avoid races with
              * concurrent GC finalizers that may close stale low-numbered fds.
              */
-            int type = opened_fd_type(intercepted, linux_flags);
+            int type = intercepted_fd_type(tx.intercept_path, intercepted,
+                                           linux_flags);
             if (type < 0) {
                 close_keep_errno(intercepted);
                 return linux_errno();
             }
             int min_guest_fd =
                 (!strncmp(tx.intercept_path, "/dev/", 5)) ? -1 : 128;
-            int guest_fd = fd_alloc_opened_host(intercepted, type, linux_flags,
-                                                min_guest_fd);
+            int guest_fd =
+                fd_alloc_opened_host(intercepted, type, linux_flags,
+                                     min_guest_fd, fd_cleanup_for_type(type));
             if (guest_fd < 0) {
                 close_keep_errno(intercepted);
                 return linux_errno();
@@ -336,7 +374,8 @@ int64_t sys_openat_path(guest_t *g,
             close_keep_errno(host_fd);
             return linux_errno();
         }
-        int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1);
+        int guest_fd =
+            fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL);
         if (guest_fd < 0) {
             close_keep_errno(host_fd);
             return linux_errno();
@@ -358,7 +397,7 @@ int64_t sys_openat_path(guest_t *g,
         close_keep_errno(host_fd);
         return linux_errno();
     }
-    int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1);
+    int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL);
     if (guest_fd < 0) {
         close_keep_errno(host_fd);
         return linux_errno();
@@ -410,41 +449,82 @@ int64_t sys_close(int fd)
 
 /* dup/fcntl. */
 
-static int clone_dir_stream_if_needed(int src_fd, int dst_fd, int dst_host_fd)
+static void discard_allocated_fd(int guest_fd)
 {
-    if (fd_table[src_fd].type != FD_DIR)
-        return 0;
+    fd_entry_t snap;
+    if (fd_snapshot_and_close(guest_fd, &snap))
+        fd_cleanup_entry(guest_fd, &snap);
+}
 
-    int dir_fd = dup(dst_host_fd);
-    if (dir_fd < 0)
-        return -1;
+/* Open a DIR stream over a dup of dst_host_fd if the source was an FD_DIR.
+ * Returns NULL both when no stream is needed (non-dir source) and when
+ * dup/fdopendir fails (errno preserved); *out_failed tells the two apart.
+ * Kept out of install_fd_alias_metadata_atomic's critical section because
+ * dup and fdopendir are slow calls that must not run under fd_lock.
+ */
+static DIR *clone_dir_stream(const fd_entry_t *src_snap,
+                             int dst_host_fd,
+                             bool *out_failed)
+{
+    *out_failed = false;
+    if (src_snap->type != FD_DIR)
+        return NULL;
 
+    int dir_fd = dup(dst_host_fd);
+    if (dir_fd < 0) {
+        *out_failed = true;
+        return NULL;
+    }
     DIR *dir = fdopendir(dir_fd);
     if (!dir) {
+        int saved_errno = errno;
         close(dir_fd);
-        return -1;
+        errno = saved_errno;
+        *out_failed = true;
+        return NULL;
     }
-
-    fd_table[dst_fd].dir = dir;
-    return 0;
+    return dir;
 }
 
-static void discard_allocated_fd(int guest_fd)
+/* Install dup-alias metadata atomically with the slot identity. The
+ * (type, host_fd) tuple is the proof that the slot still belongs to the
+ * in-flight duplicate_guest_fd call: if a sibling vCPU closed and reopened
+ * the fd between the relaxed allocator's lock release and this call, an
+ * unconditional install would clobber the sibling's fresh entry.
+ * Returns true on successful install, false if the slot was
+ * reallocated (caller must closedir any cloned dir to avoid a leak).
+ */
+static bool install_fd_alias_metadata_atomic(int dst_fd,
+                                             int expected_type,
+                                             int expected_host_fd,
+                                             const fd_entry_t *src_snap,
+                                             int linux_flags,
+                                             DIR *dir)
 {
-    fd_entry_t snap;
-    if (fd_snapshot_and_close(guest_fd, &snap))
-        fd_cleanup_entry(guest_fd, &snap);
-}
+    int preserved_flags =
+        src_snap->linux_flags &
+        (LINUX_O_ACCMODE | LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
+         LINUX_O_DIRECT | LINUX_O_LARGEFILE);
+    int final_flags = preserved_flags | linux_flags;
 
-static void copy_fd_alias_metadata(int src_fd, int dst_fd, int linux_flags)
-{
-    int preserved_flags = fd_table[src_fd].linux_flags &
-                          (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
-                           LINUX_O_DIRECT | LINUX_O_LARGEFILE);
-    fd_table[dst_fd].linux_flags = preserved_flags | linux_flags;
-    fd_table[dst_fd].seals = fd_table[src_fd].seals;
-    memcpy(fd_table[dst_fd].proc_path, fd_table[src_fd].proc_path,
-           sizeof(fd_table[dst_fd].proc_path));
+    bool installed = false;
+    pthread_mutex_lock(&fd_lock);
+    if (fd_table[dst_fd].type == expected_type &&
+        fd_table[dst_fd].host_fd == expected_host_fd) {
+        fd_table[dst_fd].linux_flags = final_flags;
+        fd_table[dst_fd].seals = src_snap->seals;
+        memcpy(fd_table[dst_fd].proc_path, src_snap->proc_path,
+               sizeof(fd_table[dst_fd].proc_path));
+        if (dir)
+            fd_table[dst_fd].dir = dir;
+        bool readable_urandom =
+            expected_type == FD_URANDOM &&
+            (final_flags & LINUX_O_ACCMODE) != LINUX_O_WRONLY;
+        shim_globals_mark_urandom_fd(dst_fd, readable_urandom);
+        installed = true;
+    }
+    pthread_mutex_unlock(&fd_lock);
+    return installed;
 }
 
 /* Duplicate a guest fd into either the next free slot >= min_guest_fd or a
@@ -457,28 +537,44 @@ static int duplicate_guest_fd(int src_fd,
                               bool fixed_slot,
                               int linux_flags)
 {
-    if (RANGE_CHECK(src_fd, 0, FD_TABLE_SIZE)) {
-        int t = fd_table[src_fd].type;
-        if (t == FD_FUSE_DEV || t == FD_FUSE_FILE || t == FD_FUSE_DIR)
-            return fuse_dup_fd(src_fd, min_guest_fd, fixed_guest_fd, fixed_slot,
-                               linux_flags);
-    }
-
-    host_fd_ref_t host_ref;
-    if (host_fd_ref_open(src_fd, &host_ref) < 0) {
+    /* Snapshot the source entry and dup its host fd in a single fd_lock
+     * critical section so the type, host fd, and metadata captured here
+     * cannot drift apart under a racing close + reopen.
+     */
+    fd_entry_t src_snap;
+    int new_host_fd = fd_snapshot_and_dup(src_fd, &src_snap);
+    if (new_host_fd < 0 && src_snap.type == FD_CLOSED) {
         errno = EBADF;
         return -1;
     }
-
-    int new_type = dup_fd_type(src_fd);
-    int new_host_fd = dup(host_ref.fd);
-    host_fd_ref_close(&host_ref);
+    if (src_snap.type == FD_FUSE_DEV || src_snap.type == FD_FUSE_FILE ||
+        src_snap.type == FD_FUSE_DIR) {
+        if (new_host_fd >= 0)
+            close_keep_errno(new_host_fd);
+        return fuse_dup_fd(src_fd, min_guest_fd, fixed_guest_fd, fixed_slot,
+                           linux_flags);
+    }
+    /* eventfd dup must share the underlying counter and pipe state across
+     * the source and destination fds (Linux contract). Pass src_snap's
+     * host_fd through so eventfd_dup_fd can verify the source fd still
+     * refers to the same live eventfd between the snapshot here and the
+     * bind there.
+     */
+    if (src_snap.type == FD_EVENTFD) {
+        if (new_host_fd >= 0)
+            close_keep_errno(new_host_fd);
+        return eventfd_dup_fd(src_fd, src_snap.host_fd, min_guest_fd,
+                              fixed_guest_fd, fixed_slot, linux_flags);
+    }
     if (new_host_fd < 0)
         return -1;
 
-    int guest_fd =
-        fixed_slot ? fd_alloc_at_relaxed(fixed_guest_fd, new_type, new_host_fd)
-                   : fd_alloc_from_relaxed(min_guest_fd, new_type, new_host_fd);
+    int new_type = (src_snap.type == FD_STDIO) ? FD_REGULAR : src_snap.type;
+    void (*cleanup)(int) = fd_cleanup_for_type(new_type);
+    int guest_fd = fixed_slot ? fd_alloc_at_relaxed(fixed_guest_fd, new_type,
+                                                    new_host_fd, cleanup)
+                              : fd_alloc_from_relaxed(min_guest_fd, new_type,
+                                                      new_host_fd, cleanup);
     if (guest_fd < 0) {
         if (fixed_slot)
             errno = EBADF;
@@ -486,14 +582,31 @@ static int duplicate_guest_fd(int src_fd,
         return -1;
     }
 
-    copy_fd_alias_metadata(src_fd, guest_fd, linux_flags);
-    if (clone_dir_stream_if_needed(src_fd, guest_fd, new_host_fd) < 0) {
+    /* Clone the DIR stream outside fd_lock (dup + fdopendir would block
+     * other fd ops), then install everything atomically under fd_lock
+     * with a tuple verification so a sibling close + reopen on the same
+     * guest_fd cannot make this install land on an unrelated slot.
+     */
+    bool dir_clone_failed = false;
+    DIR *dir = clone_dir_stream(&src_snap, new_host_fd, &dir_clone_failed);
+    if (dir_clone_failed) {
         int saved_errno = errno;
         discard_allocated_fd(guest_fd);
         errno = saved_errno;
         return -1;
     }
 
+    if (!install_fd_alias_metadata_atomic(guest_fd, new_type, new_host_fd,
+                                          &src_snap, linux_flags, dir)) {
+        /* Slot was reallocated by a sibling while metadata install was
+         * pending; the sibling's close path already cleaned up new_host_fd
+         * via fd_cleanup_entry, so the only resource this side still
+         * owns is the cloned DIR stream.
+         */
+        if (dir)
+            closedir(dir);
+    }
+
     return guest_fd;
 }
 
@@ -600,7 +713,7 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
             return linux_errno();
         int linux_fl = mac_to_linux_status_flags(mac_fl);
         if (snap.type == FD_REGULAR || snap.type == FD_DIR ||
-            snap.type == FD_PATH)
+            snap.type == FD_PATH || snap.type == FD_URANDOM)
             linux_fl = (linux_fl & ~O_ACCMODE) | (snap.linux_flags & 3);
         linux_fl |= snap.linux_flags &
                     (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
diff --git a/src/syscall/fuse.c b/src/syscall/fuse.c
index ae248e1..157191a 100644
--- a/src/syscall/fuse.c
+++ b/src/syscall/fuse.c
@@ -1281,6 +1281,9 @@ void fuse_init(void)
     memset(fuse_file_bindings, 0, sizeof(fuse_file_bindings));
     fuse_next_mount_id = 100;
     pthread_mutex_unlock(&fuse_lock);
+    fd_register_cleanup(FD_FUSE_DEV, fuse_fd_cleanup);
+    fd_register_cleanup(FD_FUSE_FILE, fuse_fd_cleanup);
+    fd_register_cleanup(FD_FUSE_DIR, fuse_fd_cleanup);
 }
 
 int fuse_proc_open(int linux_flags)
@@ -2540,9 +2543,15 @@ int fuse_dup_fd(int src_fd,
         return -1;
     }
 
-    int guest_fd = fixed_slot
-                       ? fd_alloc_at_relaxed(fixed_guest_fd, snap.type, -1)
-                       : fd_alloc_from_relaxed(min_guest_fd, snap.type, -1);
+    /* Install cleanup atomically with the type. Without this, a racing
+     * close between fd_alloc_*_relaxed publishing the slot and the later
+     * fd_table[guest_fd].cleanup assignment would skip fuse_fd_cleanup
+     * and leak the session or file ref.
+     */
+    int guest_fd = fixed_slot ? fd_alloc_at_relaxed(fixed_guest_fd, snap.type,
+                                                    -1, fuse_fd_cleanup)
+                              : fd_alloc_from_relaxed(min_guest_fd, snap.type,
+                                                      -1, fuse_fd_cleanup);
     if (guest_fd < 0) {
         if (fixed_slot)
             errno = EBADF;
@@ -2588,7 +2597,6 @@ int fuse_dup_fd(int src_fd,
                           (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
                            LINUX_O_DIRECT | LINUX_O_LARGEFILE);
     fd_table[guest_fd].linux_flags = preserved_flags | linux_flags;
-    fd_table[guest_fd].cleanup = fuse_fd_cleanup;
     pthread_mutex_unlock(&fuse_lock);
     return guest_fd;
 }
diff --git a/src/syscall/inotify.c b/src/syscall/inotify.c
index 7513e5c..d9b54dd 100644
--- a/src/syscall/inotify.c
+++ b/src/syscall/inotify.c
@@ -111,6 +111,7 @@ void inotify_init(void)
 {
     for (int i = 0; i < INOTIFY_MAX; i++)
         inotify_state[i].guest_fd = -1;
+    fd_register_cleanup(FD_INOTIFY, inotify_close);
 }
 
 static int inotify_find(int guest_fd)
diff --git a/src/syscall/internal.h b/src/syscall/internal.h
index 2760ce9..8534e6a 100644
--- a/src/syscall/internal.h
+++ b/src/syscall/internal.h
@@ -59,32 +59,84 @@ void fdtable_init(void);
  */
 int fd_alloc(int type, int host_fd, void (*cleanup)(int));
 
-/* Allocate the lowest available FD >= minfd. Returns -1 if none available. */
-int fd_alloc_from(int minfd, int type, int host_fd);
+/* Allocate the lowest available FD >= minfd. Returns -1 if none available.
+ * cleanup is set atomically under fd_lock (pass NULL for plain fds).
+ */
+int fd_alloc_from(int minfd, int type, int host_fd, void (*cleanup)(int));
 
 /* Allocate the lowest available FD >= minfd with a single-thread fast path.
  * Falls back to fd_alloc_from() when multiple guest threads are active.
  */
-int fd_alloc_from_relaxed(int minfd, int type, int host_fd);
+int fd_alloc_from_relaxed(int minfd,
+                          int type,
+                          int host_fd,
+                          void (*cleanup)(int));
 
-/* Allocate a specific FD slot. Returns -1 if out of range. */
-int fd_alloc_at(int fd, int type, int host_fd);
+/* Allocate a specific FD slot.
+ * Returns -1 if out of range.
+ * cleanup is set atomically under fd_lock (pass NULL for plain fds).
+ */
+int fd_alloc_at(int fd, int type, int host_fd, void (*cleanup)(int));
 
 /* Allocate a specific FD slot with a single-thread fast path.
  * Falls back to fd_alloc_at() when replacement/cleanup must stay serialized.
  */
-int fd_alloc_at_relaxed(int fd, int type, int host_fd);
+int fd_alloc_at_relaxed(int fd, int type, int host_fd, void (*cleanup)(int));
 
 /* Look up a guest FD. Returns host FD or -1 if invalid.
  * Unsafe for concurrent use; see fd_snapshot/fd_to_host_dup.
  */
 int fd_to_host(int guest_fd);
 
-/* Snapshot an fd entry under fd_lock. Thread-safe alternative to
- * direct fd_table[] access. Returns true on success, false if closed.
+/* Snapshot an fd entry under fd_lock. Thread-safe alternative to direct
+ * fd_table[] access.
+ * Returns true on success, false if closed.
  */
 bool fd_snapshot(int guest_fd, fd_entry_t *out);
 
+/* Snapshot an fd entry AND dup its host fd in a single fd_lock critical
+ * section. Eliminates the TOCTOU window between reading the type/metadata
+ * and duplicating the host fd in the dup(2) path. Returns the dup'd host
+ * fd (owned by the caller) on success, -1 on failure. On success *out is
+ * consistent with that fd. duplicate_guest_fd() also reads out->type on -1.
+ */
+int fd_snapshot_and_dup(int guest_fd, fd_entry_t *out);
+
+/* Read just the fd type under fd_lock. Returns FD_CLOSED for out-of-range or
+ * closed slots. Cheaper than fd_snapshot when only the type is needed for
+ * dispatch (sys_read/sys_readv/sys_writev fast paths).
+ */
+int fd_get_type(int guest_fd);
+
+/* Republish the EL1 urandom read fast-path bit for this fd from the current
+ * fd_table type and access mode. Only readable /dev/urandom descriptors are
+ * eligible for the bitmap.
+ */
+void fd_refresh_urandom_bitmap(int fd);
+
+/* Type -> cleanup registry. Modules that own a synthetic fd type register
+ * their cleanup at init time; dup and fork-restore paths look up the
+ * cleanup from the type so the binding stays consistent without each path
+ * re-deriving the dispatch table.
+ */
+void fd_register_cleanup(int type, void (*cleanup)(int));
+void (*fd_cleanup_for_type(int type))(int);
+
+/* True for fd types whose host backing (kqueue for timerfd/inotify, pipe
+ * halves for eventfd/signalfd/netlink/pidfd, epoll instance) cannot be
+ * meaningfully inherited across fork IPC: macOS SCM_RIGHTS rejects kqueue
+ * fds, and the per-class side-table state (eventfd counter, signalfd mask,
+ * pidfd target, epoll set, ...) is not serialized. The child must recreate
+ * such fds via the appropriate syscall, so the parent filters them from the
+ * SCM_RIGHTS payload and the receiver drops any that still arrive.
+ */
+static inline bool fd_type_is_synthetic(int type)
+{
+    return type == FD_EVENTFD || type == FD_SIGNALFD || type == FD_TIMERFD ||
+           type == FD_INOTIFY || type == FD_NETLINK || type == FD_PIDFD ||
+           type == FD_EPOLL;
+}
+
 /* Look up a guest FD and return a dup'd host fd owned by the caller.
  * Thread-safe: dup is performed under fd_lock. Returns -1 on failure.
  * Caller MUST close() the returned fd when done.
diff --git a/src/syscall/io.c b/src/syscall/io.c
index ee183dd..ef04d56 100644
--- a/src/syscall/io.c
+++ b/src/syscall/io.c
@@ -19,6 +19,7 @@
 #include <errno.h>
 #include <stdbool.h>
 #include <limits.h>
+#include <pthread.h>
 #include <sys/stat.h>
 #include <sys/uio.h>
 #include <sys/ioctl.h>
@@ -27,6 +28,7 @@
 #include "utils.h"
 
 #include "core/rosetta.h"
+#include "core/shim-globals.h"
 #include "hvutil.h"
 #include "runtime/procemu.h"
 #include "runtime/thread.h"
@@ -43,6 +45,7 @@
 
 #define SYSCALL_IOV_MAX 1024
 #define SYSCALL_IOV_STACK_MAX 64
+#define URANDOM_CACHE_SIZE 4096
 
 /* Linux terminal struct types. */
 
@@ -60,6 +63,27 @@ typedef struct {
     uint8_t c_cc[19];
 } linux_termios_t;
 
+/* Per-fd lock embedded in the cache so a urandom read on fd A does not
+ * serialize behind a concurrent urandom read on fd B. The previous design
+ * used a single global mutex covering the whole cache array, which made
+ * the per-fd cache pointless under any sibling-vCPU urandom traffic.
+ * The lock array is initialized at startup by io_init().
+ */
+typedef struct {
+    pthread_mutex_t lock;
+    uint8_t buf[URANDOM_CACHE_SIZE];
+    size_t off;
+    size_t len;
+} urandom_cache_t;
+
+static urandom_cache_t urandom_cache[FD_TABLE_SIZE];
+
+void io_init(void)
+{
+    for (int i = 0; i < FD_TABLE_SIZE; i++)
+        pthread_mutex_init(&urandom_cache[i].lock, NULL);
+}
+
 _Static_assert(sizeof(linux_termios_t) == 36,
                "aarch64 Linux TCGETS struct termios must be 36 bytes");
 
@@ -123,6 +147,136 @@ static int64_t io_return_zero(host_fd_ref_t *host_ref)
     return 0;
 }
 
+void urandom_fd_reset_cache(int guest_fd)
+{
+    if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE))
+        return;
+
+    /* Preserve the embedded lock; reset only the entropy fields. memset of
+     * the whole struct would clobber the mutex state.
+     */
+    urandom_cache_t *c = &urandom_cache[guest_fd];
+    pthread_mutex_lock(&c->lock);
+    memset(c->buf, 0, sizeof(c->buf));
+    c->off = 0;
+    c->len = 0;
+    pthread_mutex_unlock(&c->lock);
+}
+
+/* Drop the cached entropy for guest_fd's urandom slot. Range validation
+ * is left to urandom_fd_reset_cache(), which ignores out-of-range fds.
+ */
+void urandom_fd_cleanup(int guest_fd)
+{
+    urandom_fd_reset_cache(guest_fd);
+}
+
+/* Returns 0 if guest_fd is a readable FD_URANDOM slot, else -LINUX_EBADF. */
+static int64_t urandom_check_readable(int guest_fd)
+{
+    fd_entry_t snap;
+    if (!fd_snapshot(guest_fd, &snap) || snap.type != FD_URANDOM)
+        return -LINUX_EBADF;
+    bool wronly = (snap.linux_flags & LINUX_O_ACCMODE) == LINUX_O_WRONLY;
+    return wronly ? -LINUX_EBADF : 0;
+}
+
+/* Fill every iov segment with bytes from guest_fd's entropy cache, refilling
+ * the cache via arc4random_buf whenever it runs dry. The per-fd cache lock
+ * is held for the whole copy. Returns bytes written or a negative Linux errno.
+ */
+static int64_t urandom_fill_iov(int guest_fd,
+                                const struct iovec *iov,
+                                int iovcnt)
+{
+    int64_t err = urandom_check_readable(guest_fd);
+    if (err < 0)
+        return err;
+
+    size_t total = 0;
+    for (int i = 0; i < iovcnt; i++) {
+        if (iov[i].iov_len > (size_t) SSIZE_MAX - total)
+            return -LINUX_EINVAL;
+        total += iov[i].iov_len;
+    }
+    if (total == 0)
+        return 0;
+
+    urandom_cache_t *c = &urandom_cache[guest_fd];
+    pthread_mutex_lock(&c->lock);
+    size_t done = 0;
+    for (int i = 0; i < iovcnt && done < total; i++) {
+        uint8_t *dst = iov[i].iov_base;
+        size_t left = iov[i].iov_len;
+        while (left > 0) {
+            if (c->off == c->len) {
+                arc4random_buf(c->buf, sizeof(c->buf));
+                c->off = 0;
+                c->len = sizeof(c->buf);
+            }
+            size_t chunk = c->len - c->off < left ? c->len - c->off : left;
+            memcpy(dst, c->buf + c->off, chunk);
+            c->off += chunk;
+            dst += chunk;
+            left -= chunk;
+            done += chunk;
+        }
+    }
+    pthread_mutex_unlock(&c->lock);
+    return (int64_t) done;
+}
+
+static int64_t validate_iov_total(guest_t *g, uint64_t iov_gva, int iovcnt)
+{
+    if (iovcnt <= 0 || iovcnt > SYSCALL_IOV_MAX)
+        return -LINUX_EINVAL;
+
+    size_t total = 0;
+    for (int i = 0; i < iovcnt; i++) {
+        linux_iovec_t giov;
+        if (guest_read_small(g, iov_gva + (uint64_t) i * sizeof(giov), &giov,
+                             sizeof(giov)) < 0)
+            return -LINUX_EFAULT;
+        if (giov.iov_len > (uint64_t) SSIZE_MAX - total)
+            return -LINUX_EINVAL;
+        total += (size_t) giov.iov_len;
+    }
+    return 0;
+}
+
+static int64_t urandom_read(guest_t *g,
+                            int guest_fd,
+                            uint64_t buf_gva,
+                            uint64_t count)
+{
+    if (count > SSIZE_MAX)
+        count = SSIZE_MAX;
+    if (count == 0) {
+        struct iovec empty = {0};
+        return urandom_fill_iov(guest_fd, &empty, 1);
+    }
+
+    uint64_t avail = 0;
+    void *dst = guest_ptr_bound(g, buf_gva, &avail, MEM_PERM_W, count);
+    if (!dst)
+        return -LINUX_EFAULT;
+    if (count > avail)
+        count = avail;
+
+    struct iovec iov = {.iov_base = dst, .iov_len = (size_t) count};
+    int64_t rc = urandom_fill_iov(guest_fd, &iov, 1);
+
+    /* This slow path runs when the shim's identity-class fast path
+     * could not serve the read: either the request was larger than
+     * the shim's inline limit, or the ring was empty. Refill the
+     * shim's entropy ring before returning so a subsequent
+     * read(/dev/urandom) from the same vCPU sees a populated ring
+     * and stays on the fast path.
+     */
+    shim_globals_refill_urandom_ring(g);
+    return rc;
+}
+
 static bool rosetta_ioctl_target_fd(guest_t *g, int host_fd)
 {
     if (!g->is_rosetta)
@@ -689,12 +843,11 @@ static int64_t io_write_result(ssize_t ret)
 
 int64_t sys_write(guest_t *g, int fd, uint64_t buf_gva, uint64_t count)
 {
-    if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) {
-        if (fd_table[fd].type == FD_FUSE_DEV)
-            return fuse_dev_write(g, fd, buf_gva, count);
-        if (fd_table[fd].type == FD_EVENTFD)
-            return eventfd_write(fd, g, buf_gva, count);
-    }
+    int type = fd_get_type(fd);
+    if (type == FD_FUSE_DEV)
+        return fuse_dev_write(g, fd, buf_gva, count);
+    if (type == FD_EVENTFD)
+        return eventfd_write(fd, g, buf_gva, count);
 
     host_fd_ref_t host_ref;
     int64_t err = host_fd_ref_open_checked(fd, &host_ref, true);
@@ -741,21 +894,28 @@ int64_t sys_write(guest_t *g, int fd, uint64_t buf_gva, uint64_t count)
 
 int64_t sys_read(guest_t *g, int fd, uint64_t buf_gva, uint64_t count)
 {
-    if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) {
-        if (fd_table[fd].type == FD_FUSE_DEV)
-            return fuse_dev_read(fd, g, buf_gva, count);
-        if (fd_table[fd].type == FD_FUSE_FILE)
-            return fuse_read_fd(g, fd, buf_gva, count);
-        if (fd_table[fd].type == FD_EVENTFD)
-            return eventfd_read(fd, g, buf_gva, count);
-        if (fd_table[fd].type == FD_SIGNALFD)
-            return signalfd_read(fd, g, buf_gva, count);
-        if (fd_table[fd].type == FD_TIMERFD)
-            return timerfd_read(fd, g, buf_gva, count);
-        if (fd_table[fd].type == FD_INOTIFY)
-            return inotify_read(fd, g, buf_gva, count);
-        if (fd_table[fd].type == FD_NETLINK)
-            return netlink_read(fd, g, buf_gva, count);
+    /* Read the type once under fd_lock so a concurrent close/reopen cannot
+     * make different dispatch checks disagree. Each handler still
+     * re-validates internally and returns EBADF if its slot changed.
+     */
+    int type = fd_get_type(fd);
+    switch (type) {
+    case FD_FUSE_DEV:
+        return fuse_dev_read(fd, g, buf_gva, count);
+    case FD_FUSE_FILE:
+        return fuse_read_fd(g, fd, buf_gva, count);
+    case FD_EVENTFD:
+        return eventfd_read(fd, g, buf_gva, count);
+    case FD_SIGNALFD:
+        return signalfd_read(fd, g, buf_gva, count);
+    case FD_TIMERFD:
+        return timerfd_read(fd, g, buf_gva, count);
+    case FD_INOTIFY:
+        return inotify_read(fd, g, buf_gva, count);
+    case FD_NETLINK:
+        return netlink_read(fd, g, buf_gva, count);
+    case FD_URANDOM:
+        return urandom_read(g, fd, buf_gva, count);
     }
 
     host_fd_ref_t host_ref;
@@ -914,11 +1074,23 @@ static int64_t build_host_iov(guest_t *g,
                 free(guest_iov);
             return -LINUX_EFAULT;
         }
-        /* Cap to contiguous permitted bytes */
+        /* Cap to contiguous permitted bytes. When the guest iov entry
+         * spans a non-contiguous boundary (different mapping or
+         * permission), zero every subsequent host iov length so the
+         * host readv/writev returns a POSIX-compliant short I/O rather
+         * than silently packing the truncated tail of buffer i into
+         * buffer i+1 -- which corrupts the guest's data layout.
+         */
         uint64_t len = guest_iov[i].iov_len;
-        if (len > avail)
-            len = avail;
         host_iov[i].iov_base = base;
+        if (len > avail) {
+            host_iov[i].iov_len = avail;
+            for (int j = i + 1; j < iovcnt; j++) {
+                host_iov[j].iov_base = NULL;
+                host_iov[j].iov_len = 0;
+            }
+            break;
+        }
         host_iov[i].iov_len = len;
     }
     if (guest_iov != stack_giov)
@@ -981,29 +1153,55 @@ int64_t sys_readv(guest_t *g, int fd, uint64_t iov_gva, int iovcnt)
         int64_t err = single_guest_iov(g, iov_gva, &giov);
         if (err < 0)
             return err;
+        if (fd_get_type(fd) == FD_URANDOM &&
+            giov.iov_len > (uint64_t) SSIZE_MAX) {
+            err = urandom_check_readable(fd);
+            if (err < 0)
+                return err;
+            return -LINUX_EINVAL;
+        }
         return sys_read(g, fd, giov.iov_base, giov.iov_len);
     }
 
     /* Special FD types need their custom read handlers because glibc may use
      * readv() instead of read() for the same logical operation. Delegate
-     * to the first iov entry's buffer.  Use the first iov's length (not
-     * the sum of all iovs) because the data goes into giov[0].iov_base
-     * which is only giov[0].iov_len bytes long.
+     * scalar special fds to the first iov entry's buffer. Use the first iov's
+     * length (not the sum of all iovs) because the data goes into
+     * giov[0].iov_base which is only giov[0].iov_len bytes long.
      */
-    if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) {
-        int type = fd_table[fd].type;
-        if (type == FD_EVENTFD || type == FD_SIGNALFD || type == FD_TIMERFD ||
-            type == FD_INOTIFY) {
-            if (iovcnt <= 0)
-                return -LINUX_EINVAL;
-            /* Use guest_read for the iov array since guest_ptr alone is unsafe
-             * if the array spans a 2MiB block boundary.
-             */
-            linux_iovec_t giov;
-            if (guest_read_small(g, iov_gva, &giov, sizeof(giov)) < 0)
-                return -LINUX_EFAULT;
-            return sys_read(g, fd, giov.iov_base, giov.iov_len);
-        }
+    int type = fd_get_type(fd);
+    if (type == FD_URANDOM) {
+        int64_t err = urandom_check_readable(fd);
+        if (err < 0)
+            return err;
+        err = validate_iov_total(g, iov_gva, iovcnt);
+        if (err < 0)
+            return err;
+        host_iov_buf_t host_iov;
+        err = host_iov_prepare(g, iov_gva, iovcnt, MEM_PERM_W, &host_iov);
+        if (err < 0)
+            return err;
+        int64_t ret = urandom_fill_iov(fd, host_iov.iov, iovcnt);
+        host_iov_free(&host_iov);
+        /* Mirror sys_read's slow-path refill so a readv consumer that
+         * drains the shim ring leaves it ready for the next call,
+         * instead of forcing every subsequent EL1 fast-path attempt
+         * back through HVC until some other path triggers a refill.
+         */
+        shim_globals_refill_urandom_ring(g);
+        return ret;
+    }
+    if (type == FD_EVENTFD || type == FD_SIGNALFD || type == FD_TIMERFD ||
+        type == FD_INOTIFY) {
+        if (iovcnt <= 0)
+            return -LINUX_EINVAL;
+        /* Use guest_read for the iov array since guest_ptr alone is unsafe
+         * if the array spans a 2MiB block boundary.
+         */
+        linux_iovec_t giov;
+        if (guest_read_small(g, iov_gva, &giov, sizeof(giov)) < 0)
+            return -LINUX_EFAULT;
+        return sys_read(g, fd, giov.iov_base, giov.iov_len);
     }
 
     host_fd_ref_t host_ref;
@@ -1051,7 +1249,7 @@ int64_t sys_writev(guest_t *g, int fd, uint64_t iov_gva, int iovcnt)
      * sum of all iovs) because the data is at giov.iov_base which is only
      * giov.iov_len bytes.  eventfd expects exactly 8 bytes.
      */
-    if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE) && fd_table[fd].type == FD_EVENTFD) {
+    if (fd_get_type(fd) == FD_EVENTFD) {
         if (iovcnt <= 0)
             return -LINUX_EINVAL;
         linux_iovec_t giov;
diff --git a/src/syscall/io.h b/src/syscall/io.h
index 05a3321..dde34a2 100644
--- a/src/syscall/io.h
+++ b/src/syscall/io.h
@@ -22,6 +22,13 @@
 /* read/write and their positional variants. */
 int64_t sys_write(guest_t *g, int fd, uint64_t buf_gva, uint64_t count);
 int64_t sys_read(guest_t *g, int fd, uint64_t buf_gva, uint64_t count);
+void urandom_fd_cleanup(int guest_fd);
+void urandom_fd_reset_cache(int guest_fd);
+/* Initialize the per-fd urandom cache locks. Must run before any guest
+ * thread enters sys_read or sys_readv on /dev/urandom. Called from
+ * syscall_init alongside the other subsystem init hooks.
+ */
+void io_init(void);
 int64_t sys_pread64(guest_t *g,
                     int fd,
                     uint64_t buf_gva,
diff --git a/src/syscall/mem.c b/src/syscall/mem.c
index 13a0157..bcdd48d 100644
--- a/src/syscall/mem.c
+++ b/src/syscall/mem.c
@@ -2458,6 +2458,16 @@ int64_t sys_mremap(guest_t *g,
     if (old_size > 0 && old_size > g->guest_size - old_off)
         return -LINUX_EFAULT;
 
+    /* Reject mremap whose source range touches VM infrastructure (page
+     * tables, shim code, shim data). Without this guard a guest can move
+     * the shim_data block out from under the EL1 stack or the shim-
+     * globals identity cache, since the move path issues raw memmove,
+     * memset, region removal and PTE invalidation. Matches the parallel
+     * guards in sys_mmap MAP_FIXED, sys_munmap and sys_mprotect.
+     */
+    if (guest_range_hits_infra(g, old_off, old_off + old_size))
+        return -LINUX_EINVAL;
+
     /* Verify the whole source range is covered by one tracked VMA. mremap()
      * must not copy holes or unrelated adjacent mappings.
      */
@@ -2500,6 +2510,14 @@ int64_t sys_mremap(guest_t *g,
         if (new_off > g->guest_size || new_size > g->guest_size - new_off)
             return -LINUX_ENOMEM;
 
+        /* Same infrastructure protection as the source range: the move
+         * tail removes any existing dest region and rewrites PTEs, which
+         * would corrupt page tables / shim text / shim data if the dest
+         * lands inside infra.
+         */
+        if (guest_range_hits_infra(g, new_off, new_off + new_size))
+            return -LINUX_EINVAL;
+
         /* Linux rejects MREMAP_FIXED when old and new ranges overlap */
         uint64_t old_end = old_off + old_size, new_end = new_off + new_size;
         if (old_off < new_end && new_off < old_end)
@@ -2706,6 +2724,14 @@ int64_t sys_mremap(guest_t *g,
     if (new_size > old_size) {
         uint64_t grow_off = old_off + old_size, grow_len = new_size - old_size;
 
+        /* Reject growing into infrastructure (page tables, shim text,
+         * shim data). The source-range infra guard above only covers
+         * [old_off, old_off+old_size); the grown tail can still spill
+         * into infra without it.
+         */
+        if (guest_range_hits_infra(g, grow_off, grow_off + grow_len))
+            return -LINUX_EINVAL;
+
         /* Check if the space after the old region is free (overflow-safe) */
         if (grow_off <= g->guest_size && grow_len <= g->guest_size - grow_off) {
             bool can_grow = true;
@@ -2974,6 +3000,16 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice)
     if (off > g->guest_size || length > g->guest_size - off)
         return -LINUX_ENOMEM;
 
+    /* Defensive guard against destructive advice on infrastructure
+     * ranges (page tables, shim text, shim data). MADV_DONTNEED would
+     * zero shim data via raw host_base+off arithmetic; MADV_FREE on a
+     * future flag change could do the same. Today the destructive
+     * advice paths happen to skip non-anonymous regions, but a future
+     * regression should not silently reopen the hole.
+     */
+    if (guest_range_hits_infra(g, off, off + length))
+        return -LINUX_EINVAL;
+
     switch (advice) {
     case LINUX_MADV_DONTNEED: {
         /* MADV_DONTNEED: zero anon pages so next access sees zero-fill,
diff --git a/src/syscall/net-msg.c b/src/syscall/net-msg.c
index ecc9f71..96221ff 100644
--- a/src/syscall/net-msg.c
+++ b/src/syscall/net-msg.c
@@ -98,7 +98,7 @@ static void recvmsg_close_host_rights(const void *data_src, size_t data_len)
 
 int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags)
 {
-    if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE) && fd_table[fd].type == FD_NETLINK)
+    if (fd_get_type(fd) == FD_NETLINK)
         return netlink_sendmsg(fd, g, msg_gva, linux_flags);
 
     host_fd_ref_t host_ref;
@@ -339,7 +339,7 @@ int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags)
 
 int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags)
 {
-    if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE) && fd_table[fd].type == FD_NETLINK)
+    if (fd_get_type(fd) == FD_NETLINK)
         return netlink_recvmsg(fd, g, msg_gva, flags);
 
     host_fd_ref_t host_ref;
diff --git a/src/syscall/net.c b/src/syscall/net.c
index b80ca18..05b0c76 100644
--- a/src/syscall/net.c
+++ b/src/syscall/net.c
@@ -215,7 +215,7 @@ int64_t sys_socketpair(guest_t *g,
 int64_t sys_bind(guest_t *g, int fd, uint64_t addr_gva, uint32_t addrlen)
 {
     /* Netlink sockets use synthetic fd; dispatch to netlink handler */
-    if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE) && fd_table[fd].type == FD_NETLINK)
+    if (fd_get_type(fd) == FD_NETLINK)
         return netlink_bind(fd, g, addr_gva, addrlen);
 
     host_fd_ref_t host_ref;
@@ -469,7 +469,7 @@ int64_t sys_connect(guest_t *g, int fd, uint64_t addr_gva, uint32_t addrlen)
             return linux_errno();
         }
 
-        if (fd_alloc_at(fd, FD_SOCKET, pair[0]) < 0) {
+        if (fd_alloc_at(fd, FD_SOCKET, pair[0], NULL) < 0) {
             close(pair[0]);
             close(pair[1]);
             host_fd_ref_close(&host_ref);
diff --git a/src/syscall/netlink.c b/src/syscall/netlink.c
index a1b555e..32c3ec3 100644
--- a/src/syscall/netlink.c
+++ b/src/syscall/netlink.c
@@ -396,6 +396,7 @@ static int nl_build_getaddr(netlink_state_t *ns)
 void netlink_init(void)
 {
     memset(nl_state, 0, sizeof(nl_state));
+    fd_register_cleanup(FD_NETLINK, netlink_close);
 }
 
 int64_t netlink_socket(int protocol, int type)
diff --git a/src/syscall/proc-pidfd.c b/src/syscall/proc-pidfd.c
index 62480f3..635eb88 100644
--- a/src/syscall/proc-pidfd.c
+++ b/src/syscall/proc-pidfd.c
@@ -50,6 +50,13 @@ static pidfd_entry_t *pidfd_find_guest_fd_entry(int guest_fd)
     return NULL;
 }
 
+static void pidfd_cleanup(int guest_fd);
+
+void pidfd_init(void)
+{
+    fd_register_cleanup(FD_PIDFD, pidfd_cleanup);
+}
+
 static void pidfd_cleanup(int guest_fd)
 {
     pthread_mutex_lock(&pidfd_lock);
diff --git a/src/syscall/proc-pidfd.h b/src/syscall/proc-pidfd.h
index 8d02df4..79e55e5 100644
--- a/src/syscall/proc-pidfd.h
+++ b/src/syscall/proc-pidfd.h
@@ -10,6 +10,7 @@
 
 #include "core/guest.h"
 
+void pidfd_init(void);
 int pidfd_create(guest_t *g, int64_t target_pid);
 void proc_pidfd_notify_exit(int64_t exited_pid);
 int64_t proc_pidfd_lookup_pid(int guest_fd);
diff --git a/src/syscall/proc.c b/src/syscall/proc.c
index 73bc39c..33cde52 100644
--- a/src/syscall/proc.c
+++ b/src/syscall/proc.c
@@ -33,6 +33,7 @@
 #include "hvutil.h"
 #include "utils.h"
 
+#include "core/shim-globals.h"
 #include "core/vdso.h"
 
 #include "runtime/futex.h"
@@ -1157,6 +1158,17 @@ int vcpu_run_loop(hv_vcpu_t vcpu,
                     /* Check guest ITIMER_REAL expiry (queues SIGALRM if due) */
                     signal_check_timer();
 
+                    /* Recompute the shim-globals attention flag now that
+                     * signal_check_timer has had a chance to drain pending
+                     * work. If nothing is pending and no itimer is armed, drop
+                     * the flag back to zero so the identity fast path
+                     * re-engages for the next getpid loop. Without this clear,
+                     * the attention flag set by signal_queue (e.g., on a
+                     * subprocess's SIGCHLD) would stick forever and
+                     * permanently disable the fast path.
+                     */
+                    shim_globals_recompute_attention(g);
+
                     /* Diagnostic: log signal state after exec/sigreturn
                      * to help debug signal delivery issues.
                      */
diff --git a/src/syscall/signal.c b/src/syscall/signal.c
index 2156638..6f6f4e1 100644
--- a/src/syscall/signal.c
+++ b/src/syscall/signal.c
@@ -28,6 +28,7 @@
 
 #include "hvutil.h"
 
+#include "core/shim-globals.h"
 #include "core/vdso.h"
 
 #include "runtime/thread.h"
@@ -255,14 +256,76 @@ static inline bool *thread_saved_valid_ptr(void)
 
 /* Public API. */
 
+/* Singleton guest pointer used by attention-flag setters in this file.
+ * elfuse runs one VM per process so a single global is correct. The
+ * setter (signal_set_shim_globals_guest) logs and refuses a second,
+ * different non-NULL guest to catch lifecycle bugs in a multi-VM design.
+ *
+ * Atomic because attention_raise runs on every signal queue from any
+ * thread without holding sig_lock, while signal_init clears it across
+ * the execve reset window. ARM64 aligned 64-bit pointer writes are
+ * single-copy atomic, but plain reads/writes have no ordering, so a
+ * concurrent attention_raise could observe a stale value or fail to
+ * see a fresh registration. The release-acquire pair seals the window.
+ */
+static _Atomic(guest_t *) attention_guest;
+
 void signal_init(void)
 {
     memset(&sig_state, 0, sizeof(sig_state));
+    /* Clear the attention singleton on every init pass. Bootstrap and
+     * the fork-child receive path both call this before
+     * signal_set_shim_globals_guest publishes the live g; the reset
+     * keeps the setter's NULL-or-same assertion from latching onto a
+     * stale parent pointer in the child process. Release-store so a
+     * sibling thread that ACQUIRE-loads the slot after init observes
+     * NULL and falls back to thread_interrupt_all instead of a stale
+     * parent pointer.
+     */
+    atomic_store_explicit(&attention_guest, NULL, memory_order_release);
     /* Altstack is now per-thread (in thread_entry_t), initialized to
      * SS_DISABLE by thread_register_main() and thread_alloc().
      */
 }
 
+void signal_set_shim_globals_guest(guest_t *g)
+{
+    guest_t *cur = atomic_load_explicit(&attention_guest, memory_order_acquire);
+    if (g != NULL && cur != NULL && cur != g) {
+        log_error(
+            "signal: shim-globals guest already registered to %p, "
+            "refusing to re-register with %p",
+            (void *) cur, (void *) g);
+        return;
+    }
+    atomic_store_explicit(&attention_guest, g, memory_order_release);
+}
+
+/* Raise the shim-globals attention flag if the singleton has been
+ * registered; otherwise fall back to a bare vCPU interrupt. Both paths
+ * end up running thread_interrupt_all (shim_globals_raise_attention
+ * issues it internally), so callers only need this single helper.
+ */
+static inline void attention_raise(void)
+{
+    guest_t *g = atomic_load_explicit(&attention_guest, memory_order_acquire);
+    if (g)
+        shim_globals_raise_attention(g);
+    else
+        thread_interrupt_all();
+}
+
+/* Predicate matches the deliverability gate used by signal_queue and
+ * signal_queue_info: SIGKILL/SIGSTOP are uncatchable and must always
+ * interrupt; other signals only interrupt when at least one active
+ * thread does not block them.
+ */
+static inline bool signal_should_interrupt(int signum)
+{
+    return sig_uncatchable(signum) ||
+           thread_signal_deliverable(sig_bit(signum));
+}
+
 void signal_reset_for_exec(void)
 {
     thread_entry_t *t = current_thread;
@@ -319,13 +382,14 @@ void signal_queue(int signum)
      */
     signalfd_notify(signum);
 
-    /* Only force vCPUs out of hv_vcpu_run() if the signal is actually
-     * deliverable to at least one thread. SIGKILL/SIGSTOP cannot be
-     * blocked and always need interruption. For other signals, check
-     * per-thread blocked masks to avoid spurious context switches --
-     * Go, JVM, and Node.js mask signals in worker threads, causing
-     * thousands of unnecessary ~1000ns VM exit+re-entry cycles per
-     * second if signal emulation interrupts unconditionally.
+    /* Only force vCPUs out of hv_vcpu_run(), and only force the shim's
+     * identity fast path off, if the signal is actually deliverable to
+     * at least one thread. SIGKILL/SIGSTOP cannot be blocked and always
+     * need interruption. For other signals, check per-thread blocked masks
+     * to avoid spurious context switches -- Go, JVM, and Node.js mask
+     * signals in worker threads, causing thousands of unnecessary ~1000ns
+     * VM exit+re-entry cycles per second if signal emulation interrupts
+     * unconditionally.
      *
      * Race: if a thread concurrently unblocks this signal via
      * rt_sigprocmask, the pending signal could be missed here.
@@ -333,8 +397,8 @@ void signal_queue(int signum)
      * signals after unblocking and interrupting the current thread
      * if delivery became possible.
      */
-    if (sig_uncatchable(signum) || thread_signal_deliverable(sig_bit(signum)))
-        thread_interrupt_all();
+    if (signal_should_interrupt(signum))
+        attention_raise();
 }
 
 void signal_queue_rt(int signum,
@@ -373,8 +437,11 @@ void signal_queue_info(int signum,
                           memory_order_release);
     pthread_mutex_unlock(&sig_lock);
     signalfd_notify(signum);
-    if (thread_signal_deliverable(sig_bit(signum)))
-        thread_interrupt_all();
+    /* Same shim-globals attention raise as signal_queue: force the fast
+     * path off only when the queued signal can reach signal_deliver.
+     */
+    if (signal_should_interrupt(signum))
+        attention_raise();
 }
 
 void signal_set_fault_info(int si_code, uint64_t addr, uint64_t esr)
@@ -407,6 +474,30 @@ int signal_pending(void)
     return result;
 }
 
+bool signal_attention_needed(void)
+{
+    /* Cheap atomic load on the sig-pending hint first; if a signal is
+     * queued and deliverable to at least one active thread, the shim should
+     * drop to the slow path even before we touch the itimer state. A pending
+     * signal blocked by every active thread is not useful slow-path work and
+     * should not keep identity syscalls out of the fast path indefinitely.
+     */
+    uint64_t hint =
+        atomic_load_explicit(&sig_pending_hint, memory_order_acquire);
+    if (hint != 0 && thread_signal_deliverable(hint))
+        return true;
+    /* Active guest itimers: even if no signal is queued YET, the
+     * timer can fire at any moment, and signal_check_timer needs an
+     * HVC #5 epilogue to notice it. Keep attention raised while any
+     * timer is armed.
+     */
+    if (__atomic_load_n(&guest_itimer.active, __ATOMIC_ACQUIRE) ||
+        __atomic_load_n(&guest_itimer_virt.active, __ATOMIC_ACQUIRE) ||
+        __atomic_load_n(&guest_itimer_prof.active, __ATOMIC_ACQUIRE))
+        return true;
+    return false;
+}
+
 bool signal_pending_interruption(bool *restart_out)
 {
     pthread_mutex_lock(&sig_lock);
@@ -752,15 +843,32 @@ void signal_set_itimer(const struct timeval *value,
         pthread_mutex_unlock(&sig_lock);
         return;
     }
-    if (value->tv_sec == 0 && value->tv_usec == 0) {
+    bool arm = (value->tv_sec != 0 || value->tv_usec != 0);
+    if (!arm) {
         /* Disarm */
         __atomic_store_n(&guest_itimer.active, 0, __ATOMIC_RELEASE);
     } else {
-        __atomic_store_n(&guest_itimer.active, 1, __ATOMIC_RELEASE);
+        /* Publish expiry and interval BEFORE the release-store of active.
+         * signal_check_timer and signal_attention_needed ACQUIRE-load
+         * active without holding sig_lock; if active is published before
+         * its associated fields, a consumer can observe active=1 with
+         * stale expiry/interval and decide an early or late SIGALRM.
+         * Matches the field order in signal_set_itimer_virt.
+         */
         guest_itimer.expiry = timeval_add(&now, value);
         guest_itimer.interval = interval ? *interval : (struct timeval) {0, 0};
+        __atomic_store_n(&guest_itimer.active, 1, __ATOMIC_RELEASE);
     }
     pthread_mutex_unlock(&sig_lock);
+
+    /* Arming any timer requires the shim's identity fast path to drop
+     * to the slow path so signal_check_timer can see the expiry. The
+     * disarm case is handled by signal_attention_needed returning
+     * false at the next HVC epilogue recompute -- no explicit clear
+     * here.
+     */
+    if (arm)
+        attention_raise();
 }
 
 void signal_get_itimer(struct timeval *value, struct timeval *interval)
@@ -850,8 +958,9 @@ void signal_set_itimer_virt(int which,
         else
             *old_value = (struct timeval) {0, 0};
     }
+    bool arm = value && (value->tv_sec != 0 || value->tv_usec != 0);
     if (value) {
-        if (value->tv_sec == 0 && value->tv_usec == 0) {
+        if (!arm) {
             __atomic_store_n(&timer->active, 0, __ATOMIC_RELEASE);
         } else {
             timer->expiry = timeval_add(&now, value);
@@ -860,6 +969,9 @@ void signal_set_itimer_virt(int which,
         }
     }
     pthread_mutex_unlock(&sig_lock);
+
+    if (arm)
+        attention_raise();
 }
 
 void signal_get_itimer_virt(int which,
@@ -1447,7 +1559,7 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code)
      * glibc leaves sa_restorer uninitialized (garbage); musl sets it to
      * __restore_rt.  Match the kernel: always use the vDSO trampoline.
      */
-    hv_vcpu_set_reg(vcpu, HV_REG_X30, VDSO_BASE + VDSO_OFF_TEXT);
+    hv_vcpu_set_reg(vcpu, HV_REG_X30, VDSO_BASE + VDSO_OFF_SIGRET);
 
     if (act->sa_flags & LINUX_SA_SIGINFO) {
         /* X1 = pointer to siginfo, X2 = pointer to ucontext */
diff --git a/src/syscall/signal.h b/src/syscall/signal.h
index 99602b0..f5792f5 100644
--- a/src/syscall/signal.h
+++ b/src/syscall/signal.h
@@ -244,6 +244,28 @@ void signal_set_fault_info(int si_code, uint64_t addr, uint64_t esr);
 int signal_pending(void);
 bool signal_pending_interruption(bool *restart_out);
 
+/* True if anything that would normally be drained by signal_check_timer is
+ * currently live: an unblocked pending signal, OR any of the three guest
+ * itimers is armed. The shim's identity fast path consults this (indirectly
+ * via shim_globals attention flag) to decide whether to skip the HVC #5
+ * round-trip. Whenever this returns true, the shim must take the slow path so
+ * the epilogue's signal_check_timer + queue drain runs.
+ */
+bool signal_attention_needed(void);
+
+/* Register the shim-globals guest pointer used by the attention setters in
+ * signal_queue / setitimer / proc_set_exit_group. Called from bootstrap and
+ * fork-child after guest_init. If a different non-NULL g is already
+ * registered, the call logs an error and keeps the existing registration;
+ * elfuse runs one VM per process and the singleton catches lifecycle bugs
+ * (multiple concurrent VMs in one process would violate this invariant).
+ *
+ * Passing NULL clears the registration (signal_init performs the same reset).
+ * While nothing is registered, the attention setters skip the shim-globals
+ * flag and fall back to a bare thread_interrupt_all().
+ */
+void signal_set_shim_globals_guest(guest_t *g);
+
 /* Deliver the highest-priority pending unblocked signal to the guest.
  * Builds an rt_sigframe on the guest stack and redirects vCPU to handler.
  * Returns: 1 if signal was delivered, 0 if nothing pending,
@@ -289,9 +311,9 @@ const signal_state_t *signal_get_state(void);
 void signal_set_state(const signal_state_t *state);
 
 /* Snapshot or consume pending signals for signalfd.
- * signal_peek_signalfd() snapshots up to max matching entries without
- * consuming them. signal_take_signalfd_exact() then consumes those exact
- * entries, preserving any matching signals that arrived later.
+ * signal_peek_signalfd() snapshots up to max matching entries without consuming
+ * them. signal_take_signalfd_exact() then consumes those exact entries,
+ * preserving any matching signals that arrived later.
  */
 size_t signal_peek_signalfd(uint64_t mask, signal_rt_info_t *out, size_t max);
 size_t signal_take_signalfd_exact(const signal_rt_info_t *expected, size_t max);
diff --git a/src/syscall/sys.c b/src/syscall/sys.c
index 9166850..1284090 100644
--- a/src/syscall/sys.c
+++ b/src/syscall/sys.c
@@ -178,12 +178,28 @@ int64_t sys_uname(guest_t *g, uint64_t buf_gva)
     return 0;
 }
 
+/* Linux getrandom(2) flags. arc4random_buf is always non-blocking and always
+ * seeded, so GRND_NONBLOCK / GRND_RANDOM / GRND_INSECURE behave identically
+ * here. Unknown bits, and GRND_RANDOM together with GRND_INSECURE, still
+ * return EINVAL as in the kernel (drivers/char/random.c) so callers do not
+ * silently fossilize wrong assumptions about the elfuse implementation.
+ */
+#define LINUX_GRND_NONBLOCK 0x0001
+#define LINUX_GRND_RANDOM 0x0002
+#define LINUX_GRND_INSECURE 0x0004
+#define LINUX_GRND_SUPPORTED_MASK \
+    (LINUX_GRND_NONBLOCK | LINUX_GRND_RANDOM | LINUX_GRND_INSECURE)
+
 int64_t sys_getrandom(guest_t *g,
                       uint64_t buf_gva,
                       uint64_t buflen,
                       unsigned int flags)
 {
-    (void) flags;
+    if (flags & ~LINUX_GRND_SUPPORTED_MASK)
+        return -LINUX_EINVAL;
+    if ((flags & (LINUX_GRND_RANDOM | LINUX_GRND_INSECURE)) ==
+        (LINUX_GRND_RANDOM | LINUX_GRND_INSECURE))
+        return -LINUX_EINVAL;
     if (buflen == 0)
         return 0;
     if (buf_gva > UINT64_MAX - buflen)
diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c
index 68cad6d..be97787 100644
--- a/src/syscall/syscall.c
+++ b/src/syscall/syscall.c
@@ -56,11 +56,14 @@
 #include "syscall/poll.h"
 #include "syscall/path.h"
 #include "syscall/proc.h"
+#include "syscall/proc-pidfd.h"
 #include "syscall/signal.h"
 #include "syscall/sys.h"
 #include "syscall/sysvipc.h"
 #include "syscall/time.h"
 
+#include "core/shim-globals.h"
+
 /* Generated from src/syscall/dispatch.tbl into $(BUILD_DIR). */
 #include "dispatch.h"
 
@@ -84,6 +87,11 @@ void syscall_init(void)
 {
     fdtable_init();
     signal_init();
+    /* Mirror signal_init's attention_guest reset for the fd/urandom
+     * bitmap singleton in shim-globals. Defends against a stale
+     * parent-process pointer surviving across posix_spawn re-init.
+     */
+    shim_globals_reset_singleton();
 
     /* Initialize special FD subsystems (eventfd, signalfd, timerfd, inotify).
      * Must happen before any guest code runs so that concurrent CLONE_THREAD
@@ -95,6 +103,9 @@ void syscall_init(void)
     inotify_init();
     netlink_init();
     fuse_init();
+    pidfd_init();
+    io_init();
+    fd_register_cleanup(FD_URANDOM, urandom_fd_cleanup);
     wakeup_pipe_init();
 }
 
@@ -163,6 +174,35 @@ typedef int64_t (*syscall_handler_t)(guest_t *g,
 
 #define SC_STUB(name, val) SC_FORWARD(name, (val))
 
+/* Bracket setuid/setgid family invocations so concurrent shim-fast-path
+ * readers cannot observe stale credentials. The host-side proc_sys_*
+ * mutators flip the _Atomic credential slots inside proc-identity.c;
+ * the shim cache must reflect that under the same atomic window.
+ *
+ * Sequence: OR ATTN_BIT_CRED -> mutator -> on success publish_creds ->
+ * AND ~ATTN_BIT_CRED. The OR-only update preserves whatever
+ * ATTN_BIT_SIGTIMER state the HVC #5 epilogue's recompute may have
+ * set or cleared in parallel; AND-only clear at the end leaves the
+ * SIGTIMER lane alone. Earlier revisions wrote the full word, which
+ * let a sibling's recompute drop the flag to zero mid-publish and
+ * reopened the torn-cred race the bracket was meant to close.
+ *
+ * Implemented as a statement-expression macro so the SC_FORWARD body
+ * stays a single expression and the mutator runs after the attention
+ * raise as part of normal C sequencing.
+ */
+#define CRED_BRACKETED(g_, mutator_)                                       \
+    __extension__({                                                        \
+        guest_t *_g = (g_);                                                \
+        shim_globals_attn_or(_g, ATTN_BIT_CRED);                           \
+        int64_t _rc = (mutator_);                                          \
+        if (_rc == 0)                                                      \
+            shim_globals_publish_creds(_g, proc_get_uid(), proc_get_euid(),\
+                                       proc_get_gid(), proc_get_egid());   \
+        shim_globals_attn_and(_g, ~ATTN_BIT_CRED);                         \
+        _rc;                                                               \
+    })
+
 /* sc_xxx forwarding wrappers: thin adapters that unpack the syscall ABI
  * argument tuple (x0..x5) into a sys_xxx() call.
  */
@@ -494,12 +534,12 @@ SC_FORWARD(sc_getuid,   (int64_t) proc_get_uid())
 SC_FORWARD(sc_geteuid,  (int64_t) proc_get_euid())
 SC_FORWARD(sc_getgid,   (int64_t) proc_get_gid())
 SC_FORWARD(sc_getegid,  (int64_t) proc_get_egid())
-SC_FORWARD(sc_setuid,   proc_sys_setuid((uint32_t) x0))
-SC_FORWARD(sc_setgid,   proc_sys_setgid((uint32_t) x0))
-SC_FORWARD(sc_setreuid,  proc_sys_setreuid((uint32_t) x0, (uint32_t) x1))
-SC_FORWARD(sc_setregid,  proc_sys_setregid((uint32_t) x0, (uint32_t) x1))
-SC_FORWARD(sc_setresuid, proc_sys_setresuid((uint32_t) x0, (uint32_t) x1, (uint32_t) x2))
-SC_FORWARD(sc_setresgid, proc_sys_setresgid((uint32_t) x0, (uint32_t) x1, (uint32_t) x2))
+SC_FORWARD(sc_setuid,   CRED_BRACKETED(g, proc_sys_setuid((uint32_t) x0)))
+SC_FORWARD(sc_setgid,   CRED_BRACKETED(g, proc_sys_setgid((uint32_t) x0)))
+SC_FORWARD(sc_setreuid,  CRED_BRACKETED(g, proc_sys_setreuid((uint32_t) x0, (uint32_t) x1)))
+SC_FORWARD(sc_setregid,  CRED_BRACKETED(g, proc_sys_setregid((uint32_t) x0, (uint32_t) x1)))
+SC_FORWARD(sc_setresuid, CRED_BRACKETED(g, proc_sys_setresuid((uint32_t) x0, (uint32_t) x1, (uint32_t) x2)))
+SC_FORWARD(sc_setresgid, CRED_BRACKETED(g, proc_sys_setresgid((uint32_t) x0, (uint32_t) x1, (uint32_t) x2)))
 SC_FORWARD(sc_setpgid, proc_sys_setpgid((int64_t) x0, (int64_t) x1))
 SC_STUB(sc_fadvise64,           0)
 SC_STUB(sc_sched_yield,         (sched_yield(), 0))
diff --git a/src/syscall/time.c b/src/syscall/time.c
index 8a76c4b..f584990 100644
--- a/src/syscall/time.c
+++ b/src/syscall/time.c
@@ -15,6 +15,7 @@
 
 #include "utils.h"
 
+#include "core/vdso.h"
 #include "runtime/thread.h" /* current_thread, guest_tid */
 #include "syscall/abi.h"
 #include "syscall/internal.h"
@@ -57,10 +58,9 @@ _Static_assert(sizeof(struct timespec) == sizeof(linux_timespec_t),
 _Static_assert(sizeof(struct timeval) == sizeof(linux_timeval_t),
                "host and guest timeval must match on LP64");
 
-static bool linux_timespec_valid(const linux_timespec_t *ts,
-                                 bool allow_negative_sec)
+static bool linux_timespec_valid(const linux_timespec_t *ts)
 {
-    if (!allow_negative_sec && ts->tv_sec < 0)
+    if (ts->tv_sec < 0)
         return false;
     return ts->tv_nsec >= 0 && ts->tv_nsec < NSEC_PER_SEC;
 }
@@ -243,16 +243,83 @@ int64_t sys_clock_getres(guest_t *g, int clockid, uint64_t tp_gva)
 
 int64_t sys_clock_gettime(guest_t *g, int clockid, uint64_t tp_gva)
 {
-    struct timespec ts;
     int mac_clockid = translate_clockid(clockid);
     if (mac_clockid < 0)
         return -LINUX_EINVAL;
+
+    /* If this trap came from the __kernel_clock_gettime vDSO svc_fallback,
+     * the trampoline parked the guest's CNTVCT_EL0 read in X9 before
+     * issuing SVC, and ELR_EL1 holds the address immediately after that
+     * SVC. Pair X9 with both the MONOTONIC and REALTIME wall_clocks and
+     * seed the vvar so subsequent calls hit the fast path for either
+     * clockid. Skip the seed for any other trap (raw
+     * syscall(SYS_clock_gettime, ...) from guest code, etc.): X9 is
+     * then arbitrary guest state, and seeding from it would poison the
+     * anchor and break every later fast-path call.
+     *
+     * Skip the gate entirely once the anchor is published: vdso_seed_anchor
+     * is a one-shot CAS that can never fire again, so the HVF reads of
+     * ELR_EL1 and X9 below would be pure waste on every subsequent trap.
+     * Both clockid 0 (REALTIME) and clockid 1 (MONOTONIC) take the vDSO
+     * fast path, so either may be the first caller; either way both
+     * anchor pairs are seeded from a single set of host clock_gettime
+     * calls.
+     *
+     * Order matters: read X9 first, then sample both host wall clocks
+     * back-to-back, then write to guest and seed. Sampling host clocks
+     * before checking X9 would bake a permanent positive bias (~50-200 ns)
+     * into the anchor because every host call ages the X9 timestamp by
+     * the seeding gate's HVF round-trip. The back-to-back wall-clock
+     * reads minimize MONO/REAL skew within the anchor.
+     */
+    bool seed_eligible = (clockid == 0 /* CLOCK_REALTIME */ ||
+                          clockid == 1 /* CLOCK_MONOTONIC */) &&
+                         current_thread && !vdso_anchor_is_seeded(g);
+
+    uint64_t guest_cntvct = 0;
+    if (seed_eligible) {
+        uint64_t elr = 0;
+        if (hv_vcpu_get_sys_reg(current_thread->vcpu, HV_SYS_REG_ELR_EL1,
+                                &elr) != HV_SUCCESS ||
+            elr != vdso_clock_gettime_svc_pc() + 4 ||
+            hv_vcpu_get_reg(current_thread->vcpu, HV_REG_X9, &guest_cntvct) !=
+                HV_SUCCESS ||
+            guest_cntvct == 0) {
+            /* Trap came from a path other than the vDSO trampoline; X9 is
+             * arbitrary, fall through to the non-seeding path.
+             */
+            seed_eligible = false;
+        }
+    }
+
+    struct timespec ts;
     if (clock_gettime(mac_clockid, &ts) < 0)
         return linux_errno();
 
+    /* For the seeding path, sample the OTHER clockid back-to-back so both
+     * anchor pairs reflect roughly the same host moment. If the second
+     * clock_gettime fails (unreachable on macOS but defensive), skip
+     * seeding rather than fail the user's request: the user already has
+     * the value they asked for.
+     */
+    struct timespec ts_other;
+    bool can_seed = false;
+    if (seed_eligible) {
+        int other_mac = (clockid == 1) ? CLOCK_REALTIME : CLOCK_MONOTONIC;
+        if (clock_gettime(other_mac, &ts_other) == 0)
+            can_seed = true;
+    }
+
     if (guest_write_small(g, tp_gva, &ts, sizeof(ts)) < 0)
         return -LINUX_EFAULT;
 
+    if (can_seed) {
+        const struct timespec *ts_mono = (clockid == 1) ? &ts : &ts_other;
+        const struct timespec *ts_real = (clockid == 0) ? &ts : &ts_other;
+        vdso_seed_anchor(g, guest_cntvct, ts_mono->tv_sec, ts_mono->tv_nsec,
+                         ts_real->tv_sec, ts_real->tv_nsec);
+    }
+
     return 0;
 }
 
@@ -268,7 +335,7 @@ int64_t sys_nanosleep(guest_t *g, uint64_t req_gva, uint64_t rem_gva)
     if (guest_read_small(g, req_gva, &lreq, sizeof(lreq)) < 0)
         return -LINUX_EFAULT;
 
-    if (!linux_timespec_valid(&lreq, false))
+    if (!linux_timespec_valid(&lreq))
         return -LINUX_EINVAL;
 
     return interruptible_sleep_ns(g, linux_timespec_to_ns_sat(&lreq), rem_gva,
@@ -287,7 +354,14 @@ int64_t sys_clock_nanosleep(guest_t *g,
 
     if (flags & ~TIMER_ABSTIME)
         return -LINUX_EINVAL;
-    if (!linux_timespec_valid(&lreq, (flags & TIMER_ABSTIME) != 0))
+    /* Linux's clock_nanosleep entry point (kernel/time/posix-timers.c)
+     * rejects the request with EINVAL when timespec64_valid() fails,
+     * before it decides whether an absolute deadline has already expired.
+     * A negative tv_sec therefore yields EINVAL even when TIMER_ABSTIME is
+     * set; it is not silently treated as 'already expired'. Reject negative
+     * tv_sec unconditionally so relative and absolute callers both match.
+     */
+    if (!linux_timespec_valid(&lreq))
         return -LINUX_EINVAL;
 
     int mac_clockid = translate_clockid(clockid);
@@ -297,9 +371,6 @@ int64_t sys_clock_nanosleep(guest_t *g,
     int64_t remaining_ns;
 
     if (flags & TIMER_ABSTIME) {
-        if (lreq.tv_sec < 0)
-            return 0;
-
         struct timespec now;
         if (clock_gettime(mac_clockid, &now) < 0)
             return linux_errno();
@@ -340,9 +411,15 @@ int64_t sys_setitimer(guest_t *g, int which, uint64_t new_gva, uint64_t old_gva)
     if (new_gva) {
         if (guest_read_small(g, new_gva, &lnew, sizeof(lnew)) < 0)
             return -LINUX_EFAULT;
-        /* Linux rejects tv_usec outside [0, 999999] for value and interval. */
+        /* Linux rejects tv_usec outside [0, 999999] AND negative tv_sec for
+         * both value and interval. Accepting a negative tv_sec would cast
+         * through (long) below and arm an expired timer instead of returning
+         * EINVAL, diverging from the kernel contract.
+         */
         if (!RANGE_CHECK(lnew.it_value.tv_usec, 0, 1000000) ||
-            !RANGE_CHECK(lnew.it_interval.tv_usec, 0, 1000000))
+            !RANGE_CHECK(lnew.it_interval.tv_usec, 0, 1000000) ||
+            (int64_t) lnew.it_value.tv_sec < 0 ||
+            (int64_t) lnew.it_interval.tv_sec < 0)
             return -LINUX_EINVAL;
         has_new = true;
     }
diff --git a/tests/bench-futex-pingpong.c b/tests/bench-futex-pingpong.c
new file mode 100644
index 0000000..707e898
--- /dev/null
+++ b/tests/bench-futex-pingpong.c
@@ -0,0 +1,110 @@
+/* Futex ping-pong microbenchmark
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Two threads handing off via FUTEX_WAIT and FUTEX_WAKE on private futexes.
+ * Measures the round-trip cost of the core wait/wake hot path. Reports total
+ * elapsed time in milliseconds for the configured handoff count.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "raw-syscall.h"
+
+#define HANDOFFS 20000
+
+static volatile int turn_a;
+static volatile int turn_b;
+static volatile int b_done;
+
+static int child_stack_storage[16384] __attribute__((aligned(16)));
+
+static int child_fn(void *arg)
+{
+    (void) arg;
+    for (int i = 0; i < HANDOFFS; i++) {
+        while (__atomic_load_n(&turn_b, __ATOMIC_ACQUIRE) == 0)
+            raw_futex_wait((int *) &turn_b, 0);
+        __atomic_store_n(&turn_b, 0, __ATOMIC_RELEASE);
+        __atomic_store_n(&turn_a, 1, __ATOMIC_RELEASE);
+        raw_futex_wake((int *) &turn_a, 1);
+    }
+    __atomic_store_n(&b_done, 1, __ATOMIC_RELEASE);
+    raw_futex_wake((int *) &b_done, 1);
+    raw_exit(0);
+    return 0;
+}
+
+int main(void)
+{
+    struct timeval start, end;
+
+    /* The child runs on a static, 16-byte-aligned array; pass clone the
+     * address one past the array's end as the child's initial stack top.
+     */
+    void *stack_top =
+        (char *) child_stack_storage + sizeof(child_stack_storage);
+
+    /* Thread-style clone: share address space, fs info, fd table and signal
+     * handlers with the parent. No *_SETTID / CHILD_CLEARTID bit is set, so
+     * per clone(2) the ctid pointer passed below is unused; the child
+     * reports completion through the b_done futex instead.
+     */
+    int ctid = 0;
+    const unsigned long flags = 0x00000100UL | /* CLONE_VM */
+                                0x00000200UL | /* CLONE_FS */
+                                0x00000400UL | /* CLONE_FILES */
+                                0x00000800UL | /* CLONE_SIGHAND */
+                                0x00010000UL;  /* CLONE_THREAD */
+
+    /* aarch64 clone ABI: x0=flags, x1=child_stack, x2=parent_tid,
+     * x3=tls, x4=child_tid. The child returns at the same site.
+     */
+    long rc = raw_clone(flags, stack_top, NULL, 0, &ctid);
+    if (rc == 0) {
+        child_fn(NULL);
+        /* unreachable */
+        return 0;
+    }
+    if (rc < 0) {
+        fprintf(stderr, "clone failed: %ld\n", rc);
+        return 1;
+    }
+
+    gettimeofday(&start, NULL);
+
+    /* Kick off the round-trip. */
+    __atomic_store_n(&turn_b, 1, __ATOMIC_RELEASE);
+    raw_futex_wake((int *) &turn_b, 1);
+
+    for (int i = 0; i < HANDOFFS; i++) {
+        while (__atomic_load_n(&turn_a, __ATOMIC_ACQUIRE) == 0)
+            raw_futex_wait((int *) &turn_a, 0);
+        __atomic_store_n(&turn_a, 0, __ATOMIC_RELEASE);
+        if (i + 1 < HANDOFFS) {
+            __atomic_store_n(&turn_b, 1, __ATOMIC_RELEASE);
+            raw_futex_wake((int *) &turn_b, 1);
+        }
+    }
+
+    /* Wait for child to finish so timing covers full handoff count. */
+    while (__atomic_load_n(&b_done, __ATOMIC_ACQUIRE) == 0)
+        raw_futex_wait((int *) &b_done, 0);
+
+    gettimeofday(&end, NULL);
+
+    long elapsed_us =
+        (end.tv_sec - start.tv_sec) * 1000000L + (end.tv_usec - start.tv_usec);
+    /* Print elapsed time in milliseconds (3 decimal places). */
+    long ms_int = elapsed_us / 1000;
+    long ms_frac = elapsed_us % 1000;
+    printf("elapsed_ms %ld.%03ld\n", ms_int, ms_frac);
+    return 0;
+}
diff --git a/tests/manifest.txt b/tests/manifest.txt
index ff9631b..19b1b27 100644
--- a/tests/manifest.txt
+++ b/tests/manifest.txt
@@ -45,10 +45,18 @@ test-file-ops
 test-sysinfo
 test-io-opt
 test-syscall-smoke
+test-vdso
+test-shim-identity
+test-shim-identity-attention
+test-shim-verbose-trace
+test-shim-data-el1
+test-shim-urandom-smp
+test-shim-urandom-toctou
 test-poll                      # diff=skip
 
 [section] I/O subsystem tests
 test-eventfd
+test-eventfd-dup
 test-signalfd
 test-signalfd-hardening
 test-epoll
@@ -83,8 +91,9 @@ test-clone3                    # diff=skip
 test-fork-exec $TESTDIR/echo-test
 test-fork-lowbase
 
-[section] COW fork isolation tests
+[section] CoW fork isolation tests
 test-cow-fork
+test-fork-synthetic-fd
 
 [section] O_CLOEXEC tests
 test-cloexec
@@ -102,6 +111,8 @@ test-lowbase-mem-300000
 
 [section] mremap tests
 test-mremap
+test-mremap-infra
+test-shim-cred-race
 
 [section] msync MAP_SHARED tests
 test-msync
diff --git a/tests/test-cow-fork.c b/tests/test-cow-fork.c
index 8770420..f7cc0c7 100644
--- a/tests/test-cow-fork.c
+++ b/tests/test-cow-fork.c
@@ -1,4 +1,4 @@
-/* COW fork memory isolation tests
+/* CoW fork memory isolation tests
  *
  * Copyright 2026 elfuse contributors
  * Copyright 2025 Moritz Angermann, zw3rk pte. ltd.
@@ -166,11 +166,11 @@ static void test_mmap_isolation(void)
     munmap(region, 4096);
 }
 
-/* Test 4: Large region COW (verify no corruption) */
+/* Test 4: Large region CoW (verify no corruption) */
 
 static void test_large_cow(void)
 {
-    TEST("fork: 1MiB COW integrity");
+    TEST("fork: 1MiB CoW integrity");
 
     int pipefd[2];
     if (pipe(pipefd) != 0) {
@@ -229,7 +229,7 @@ static void test_large_cow(void)
     int status;
     waitpid(pid, &status, 0);
 
-    EXPECT_TRUE(parent_ok && child_ok, "1MiB COW integrity failed");
+    EXPECT_TRUE(parent_ok && child_ok, "1MiB CoW integrity failed");
     munmap(buf, sz);
 }
 
@@ -302,7 +302,7 @@ static void test_brk_isolation(void)
 
 int main(void)
 {
-    printf("test-cow-fork: COW fork memory isolation tests\n");
+    printf("test-cow-fork: CoW fork memory isolation tests\n");
 
     test_stack_isolation();
     test_heap_isolation();
diff --git a/tests/test-eventfd-dup.c b/tests/test-eventfd-dup.c
new file mode 100644
index 0000000..484c2d7
--- /dev/null
+++ b/tests/test-eventfd-dup.c
@@ -0,0 +1,65 @@
+/* test-eventfd-dup.c -- dup of eventfd shares state (Linux contract)
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Linux dup of an eventfd produces a second descriptor that points at the
+ * same kernel object; reads and writes on either fd see the same counter.
+ * elfuse used to give each dup'd guest_fd a fresh side-table slot, so
+ * dup'd eventfds diverged, breaking programs that signal across the
+ * pair. This test pins the contract by:
+ *   - duping an eventfd initialised with counter=7, reading via the dup,
+ *     verifying the dup observes the source's initial value
+ *   - writing via the source, reading via the dup, verifying state shares
+ *   - closing one end of the alias and continuing to operate on the other
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <poll.h>
+#include <sys/eventfd.h>
+#include <unistd.h>
+
+static int failures = 0;
+
+#define EXPECT(cond, msg)                       \
+    do {                                        \
+        if (!(cond)) {                          \
+            fprintf(stderr, "FAIL: %s\n", msg); \
+            failures++;                         \
+        }                                       \
+    } while (0)
+
+int main(void)
+{
+    int src = eventfd(7, EFD_CLOEXEC);
+    EXPECT(src >= 0, "eventfd(7) returned valid fd");
+    int alias = dup(src);
+    EXPECT(alias >= 0, "dup(a) returned valid fd");
+
+    uint64_t seen = 0;
+    EXPECT(read(alias, &seen, 8) == 8, "read 8 bytes from dup'd fd");
+    EXPECT(seen == 7, "dup'd fd observes source initial counter (7)");
+
+    uint64_t val = 42;
+    EXPECT(write(src, &val, 8) == 8, "write 42 to source fd");
+    EXPECT(read(alias, &seen, 8) == 8, "read counter from dup'd fd");
+    EXPECT(seen == 42, "dup'd fd observes source write (42)");
+    /* Closing the source must leave the alias fully usable. */
+    close(src);
+    val = 99;
+    EXPECT(write(alias, &val, 8) == 8,
+           "write 99 to alias after closing source");
+    EXPECT(read(alias, &seen, 8) == 8, "read after partial close");
+    EXPECT(seen == 99, "alias still functional after partial close");
+    struct pollfd probe = {.fd = alias, .events = POLLIN};
+    EXPECT(poll(&probe, 1, 0) == 0, "alias is not readable after drain");
+    close(alias);
+    if (failures == 0) {
+        puts("test-eventfd-dup: PASS");
+        return 0;
+    }
+    printf("test-eventfd-dup: %d FAIL\n", failures);
+    return 1;
+}
diff --git a/tests/test-fork-synthetic-fd.c b/tests/test-fork-synthetic-fd.c
new file mode 100644
index 0000000..1e89a46
--- /dev/null
+++ b/tests/test-fork-synthetic-fd.c
@@ -0,0 +1,218 @@
+/* test-fork-synthetic-fd.c -- fork inheritance contract for synthetic fds
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The fork-IPC handoff does NOT serialize per-class side tables for
+ * eventfd/signalfd/timerfd/inotify/netlink/pidfd. Restoring the
+ * inherited host fd without that state leaves a half-functional slot,
+ * so fork-state.c explicitly drops these in the child. This test pins
+ * that contract:
+ *   - urandom IS inherited (no per-class state to lose; cache is fresh
+ *     in the child and arc4random_buf works)
+ *   - eventfd / signalfd / timerfd / inotify are NOT inherited; the
+ *     child sees EBADF and can recreate the fd at the same slot
+ *   - the inherited host fd does not leak in the child
+ *
+ * Once a subsystem grows a serialize/restore path, the corresponding
+ * EBADF expectation here flips to a positive inheritance check.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/eventfd.h>
+#include <sys/inotify.h>
+#include <sys/signalfd.h>
+#include <sys/timerfd.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+static int failures = 0;
+
+#define EXPECT(cond, msg)                       \
+    do {                                        \
+        if (!(cond)) {                          \
+            fprintf(stderr, "FAIL: %s\n", msg); \
+            failures++;                         \
+        }                                       \
+    } while (0)
+
+static int run_child(int (*fn)(int), int fd)
+{
+    int wstatus = 0;
+    pid_t child = fork();
+    if (child == 0)
+        _exit(fn(fd)); /* child: fn's verdict becomes the exit code */
+    if (child < 0 || waitpid(child, &wstatus, 0) < 0)
+        return -1;
+    if (!WIFEXITED(wstatus))
+        return -1;
+    return WEXITSTATUS(wstatus);
+}
+
+static int child_urandom_read(int fd)
+{
+    unsigned char sample[8];
+    if (read(fd, sample, sizeof(sample)) != (ssize_t) sizeof(sample))
+        return 1;
+    /* OR the bytes together; a zero result means every byte was 0. */
+    unsigned acc = 0;
+    for (size_t k = 0; k < sizeof(sample); k++)
+        acc |= sample[k];
+    return acc != 0 ? 0 : 2;
+}
+
+static int child_ebadf_read(int fd)
+{
+    char scratch[8] = {0};
+    errno = 0;
+    /* Exit code 1: the read unexpectedly did not fail;
+     * exit code 2: it failed with an errno other than EBADF.
+     */
+    if (read(fd, scratch, sizeof(scratch)) != -1)
+        return 1;
+    return errno == EBADF ? 0 : 2;
+}
+
+static int child_ebadf_reusable_at_same_fd(int fd)
+{
+    int verdict = child_ebadf_read(fd);
+    if (verdict != 0)
+        return verdict;
+    /* open() hands out the lowest free descriptor, so landing on fd
+     * again shows the dropped slot was really released.
+     */
+    int reopened = open("/dev/null", O_RDONLY | O_CLOEXEC);
+    if (reopened < 0)
+        return 3;
+    int same_slot = (reopened == fd);
+    close(reopened);
+    return same_slot ? 0 : 4;
+}
+
+static int child_eventfd_recreate(int fd)
+{
+    /* Step 1: the slot inherited from the parent must already be gone
+     * (fork-state.c drops it), so a read on it has to fail with EBADF.
+     */
+    char scratch[8];
+    errno = 0;
+    if (read(fd, scratch, sizeof(scratch)) != -1 || errno != EBADF)
+        return 1;
+    close(fd); /* harmless on a closed slot */
+    /* Step 2: a brand-new eventfd must round-trip a counter of 1. */
+    int fresh = eventfd(0, EFD_CLOEXEC);
+    if (fresh < 0)
+        return 2;
+    int rc = 0;
+    uint64_t sent = 1;
+    uint64_t back = 0;
+    if (write(fresh, &sent, sizeof(sent)) != (ssize_t) sizeof(sent))
+        rc = 3;
+    else if (read(fresh, &back, sizeof(back)) != (ssize_t) sizeof(back))
+        rc = 4;
+    else if (back != 1)
+        rc = 4;
+    close(fresh);
+    return rc;
+}
+
+static void test_urandom_inherited(void)
+{
+    int rnd = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
+    EXPECT(rnd >= 0, "open /dev/urandom");
+    if (rnd >= 0) {
+        EXPECT(run_child(child_urandom_read, rnd) == 0,
+               "child can read inherited /dev/urandom");
+        close(rnd);
+    }
+}
+
+static void test_synthetic_dropped(const char *label, int (*opener)(void))
+{
+    int synth = opener();
+    EXPECT(synth >= 0, label);
+    if (synth < 0)
+        return;
+    /* Build the message first, then let the child probe the slot. */
+    char what[80];
+    snprintf(what, sizeof(what), "child sees EBADF on inherited %s", label);
+    EXPECT(run_child(child_ebadf_read, synth) == 0, what);
+    close(synth);
+}
+
+static void test_eventfd_recreate(void)
+{
+    int efd = eventfd(0, EFD_CLOEXEC);
+    EXPECT(efd >= 0, "open eventfd");
+    if (efd >= 0) {
+        EXPECT(run_child(child_eventfd_recreate, efd) == 0,
+               "child can recreate eventfd after drop");
+        close(efd);
+    }
+}
+
+static void test_low_synthetic_dropped(void)
+{
+    /* Park stdin on a spare descriptor so slot 0 can be borrowed. */
+    int parked = dup(STDIN_FILENO);
+    EXPECT(parked >= 0, "save stdin");
+    if (parked < 0)
+        return;
+
+    EXPECT(close(STDIN_FILENO) == 0, "close stdin");
+    int low = eventfd(0, EFD_CLOEXEC);
+    EXPECT(low == STDIN_FILENO, "eventfd reuses fd 0");
+    if (low == STDIN_FILENO) {
+        int verdict = run_child(child_ebadf_reusable_at_same_fd, low);
+        EXPECT(verdict == 0, "child sees EBADF on low inherited eventfd");
+    }
+    if (low >= 0)
+        close(low);
+
+    EXPECT(dup2(parked, STDIN_FILENO) == STDIN_FILENO, "restore stdin");
+    close(parked);
+}
+
+static int open_eventfd(void) /* opener: eventfd, initial counter 0 */
+{
+    return eventfd(0, EFD_CLOEXEC);
+}
+static int open_timerfd(void) /* opener: unarmed CLOCK_MONOTONIC timerfd */
+{
+    return timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC);
+}
+static int open_signalfd(void) /* opener: signalfd watching SIGUSR1 only */
+{
+    sigset_t s;
+    sigemptyset(&s);
+    sigaddset(&s, SIGUSR1);
+    return signalfd(-1, &s, SFD_CLOEXEC);
+}
+static int open_inotify(void) /* opener: inotify instance, no watches */
+{
+    return inotify_init1(IN_CLOEXEC);
+}
+
+int main(void)
+{
+    printf("test-fork-synthetic-fd: synthetic fd fork inheritance contract\n");
+    test_urandom_inherited();
+    /* Classes without a serialize/restore path must be dropped. */
+    test_synthetic_dropped("eventfd", open_eventfd);
+    test_synthetic_dropped("timerfd", open_timerfd);
+    test_synthetic_dropped("signalfd", open_signalfd);
+    test_synthetic_dropped("inotify", open_inotify);
+    test_eventfd_recreate();
+    test_low_synthetic_dropped();
+    if (failures)
+        printf("test-fork-synthetic-fd: %d FAIL\n", failures);
+    else
+        puts("test-fork-synthetic-fd: PASS");
+    return failures ? 1 : 0;
+}
diff --git a/tests/test-large-io-boundary.c b/tests/test-large-io-boundary.c
index 28b76e7..c50adf8 100644
--- a/tests/test-large-io-boundary.c
+++ b/tests/test-large-io-boundary.c
@@ -182,12 +182,88 @@ static void test_large_read_from_split_block(void)
     EXPECT_TRUE(ok, "read returned short count or corrupted data");
 }
 
+static void test_urandom_read_crosses_boundary(void)
+{
+    TEST("/dev/urandom partial read at mapping boundary");
+
+    /* Reserve two pages, then unmap the upper one so that a valid
+     * page is followed directly by a hole.
+     */
+    const size_t page = (size_t) sysconf(_SC_PAGESIZE);
+    unsigned char *bytes = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
+                                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (bytes == MAP_FAILED) {
+        FAIL("mmap failed");
+        return;
+    }
+    if (munmap(bytes + page, page) != 0) {
+        munmap(bytes, page);
+        FAIL("munmap guard failed");
+        return;
+    }
+    memset(bytes, 0, page);
+
+    int rnd = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
+    if (rnd < 0) {
+        munmap(bytes, page);
+        FAIL("open failed");
+        return;
+    }
+
+    /* Ask for both pages; only the mapped first page can be filled. */
+    ssize_t ret = read(rnd, bytes, 2 * page);
+    close(rnd);
+
+    size_t idx = 0;
+    while (idx < page && bytes[idx] == 0)
+        idx++;
+    bool any_nonzero = idx < page;
+
+    munmap(bytes, page);
+    EXPECT_TRUE(ret == (ssize_t) page && any_nonzero,
+                "urandom read did not preserve partial boundary result");
+}
+
+static void test_urandom_small_read_crosses_boundary(void)
+{
+    TEST("/dev/urandom small read at mapping boundary");
+
+    const size_t page = (size_t) sysconf(_SC_PAGESIZE);
+    unsigned char *bytes = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
+                                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (bytes == MAP_FAILED) {
+        FAIL("mmap failed");
+        return;
+    }
+    if (munmap(bytes + page, page) != 0) {
+        munmap(bytes, page);
+        FAIL("munmap guard failed");
+        return;
+    }
+
+    int rnd = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
+    if (rnd < 0) {
+        munmap(bytes, page);
+        FAIL("open failed");
+        return;
+    }
+
+    /* Two bytes starting at the last mapped byte: only one can land. */
+    ssize_t ret = read(rnd, bytes + page - 1, 2);
+    close(rnd);
+    munmap(bytes, page);
+    EXPECT_TRUE(ret == 1,
+                "urandom small boundary read did not fall back safely");
+}
+
 int main(void)
 {
     printf("large I/O boundary tests\n\n");
 
     test_large_write();
     test_large_read_from_split_block();
+    test_urandom_read_crosses_boundary();
+    test_urandom_small_read_crosses_boundary();
 
     SUMMARY("test-large-io-boundary");
     return fails ? 1 : 0;
diff --git a/tests/test-matrix.sh b/tests/test-matrix.sh
index e6a6140..ad6921b 100755
--- a/tests/test-matrix.sh
+++ b/tests/test-matrix.sh
@@ -494,7 +494,7 @@ run_unit_tests()
     printf "\nNegative tests\n"
     test_check "$runner" "test-negative" "0 failed" "$bindir/test-negative"
 
-    printf "\nCOW fork isolation\n"
+    printf "\nCoW fork isolation\n"
     test_check "$runner" "test-cow-fork" "PASS" "$bindir/test-cow-fork"
 
     printf "\nGuard page / mmap edge cases\n"
diff --git a/tests/test-mremap-infra.c b/tests/test-mremap-infra.c
new file mode 100644
index 0000000..a06a65f
--- /dev/null
+++ b/tests/test-mremap-infra.c
@@ -0,0 +1,152 @@
+/* test-mremap-infra.c -- mremap/madvise must reject ranges hitting infra
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The infrastructure reserve at the top of guest IPA holds page tables,
+ * the EL1 shim's code, and the shim's data block. None of these are
+ * legal targets for guest memory management. sys_mmap MAP_FIXED,
+ * sys_munmap and sys_mprotect already enforce this via
+ * guest_range_hits_infra; sys_mremap and sys_madvise did not, leaving a
+ * spoofing/corruption vector for code that knows the infra GVA.
+ *
+ * This test exercises the four guarded variants:
+ *   1. mremap source range hits infra
+ *   2. mremap MREMAP_FIXED destination hits infra
+ *   3. mremap grow-in-place tail spills into infra
+ *   4. madvise(MADV_DONTNEED) on an infra range
+ *
+ * All four must fail with EINVAL. The infra base is read at runtime
+ * from /proc/self/maps so the test stays portable across the 36-bit
+ * and 40-bit IPA configurations.
+ */
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#ifndef MREMAP_MAYMOVE
+#define MREMAP_MAYMOVE 1
+#endif
+#ifndef MREMAP_FIXED
+#define MREMAP_FIXED 2
+#endif
+
+#define PAGE_SIZE 4096
+
+static int failures = 0;
+
+#define EXPECT(cond, msg)                       \
+    do {                                        \
+        if (!(cond)) {                          \
+            fprintf(stderr, "FAIL: %s\n", msg); \
+            failures++;                         \
+        }                                       \
+    } while (0)
+
+/* Parse the lower bound of the named region from /proc/self/maps.
+ * /proc/self/maps lines look like:
+ *   ffffffc00000-ffffffc00400 r-xp 00000000 00:00 0          [shim]
+ * Returns 0 if not found.
+ */
+static uint64_t find_region_base(const char *name)
+{
+    FILE *maps = fopen("/proc/self/maps", "r");
+    if (maps == NULL)
+        return 0;
+    uint64_t found = 0;
+    char row[512];
+    while (fgets(row, sizeof(row), maps) != NULL) {
+        if (strstr(row, name) == NULL)
+            continue;
+        unsigned long long start = 0;
+        if (sscanf(row, "%llx-", &start) != 1)
+            continue;
+        found = start;
+        break;
+    }
+    fclose(maps);
+    return found;
+}
+
+int main(void)
+{
+    printf("test-mremap-infra: mremap/madvise reject infra-range targets\n");
+
+    /* Locate [shim-data]; if absent, [shim] is also acceptable as the
+     * infra reserve covers both. The test only needs ANY infra GVA.
+     */
+    uint64_t infra = find_region_base("[shim-data]");
+    if (!infra)
+        infra = find_region_base("[shim]");
+    if (!infra) {
+        fprintf(stderr,
+                "FAIL: could not locate infra region in /proc/self/maps\n");
+        return 1;
+    }
+    printf("infra base = 0x%llx\n", (unsigned long long) infra);
+
+    /* Allocate a scratch mapping to use as the source for mremap variants. */
+    void *src = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
+                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (src == MAP_FAILED) {
+        fprintf(stderr, "FAIL: scratch mmap: %s\n", strerror(errno));
+        return 1;
+    }
+
+    /* Case 1: mremap source range hits infra. The source must be a
+     * legal VMA for mremap to consider it, but pointing the call
+     * directly at the infra base is enough to make the kernel try.
+     */
+    errno = 0;
+    void *r = mremap((void *) (uintptr_t) infra, PAGE_SIZE, PAGE_SIZE,
+                     MREMAP_MAYMOVE);
+    EXPECT(r == MAP_FAILED && errno == EINVAL,
+           "mremap source==infra rejected with EINVAL");
+
+    /* Case 2: MREMAP_FIXED destination hits infra. */
+    errno = 0;
+    r = mremap(src, PAGE_SIZE, PAGE_SIZE, MREMAP_MAYMOVE | MREMAP_FIXED,
+               (void *) (uintptr_t) infra);
+    EXPECT(r == MAP_FAILED && errno == EINVAL,
+           "mremap MREMAP_FIXED dest==infra rejected with EINVAL");
+
+    /* Case 3: grow-in-place tail spills into infra. Map a one-page
+     * region immediately below the infra base. NOTE(review): MAP_FIXED
+     * replaces any existing mapping there rather than failing, so this
+     * assumes the hole is unused -- TODO confirm for all layouts.
+     */
+    void *base = (void *) (uintptr_t) (infra - PAGE_SIZE);
+    void *p = mmap(base, PAGE_SIZE, PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+    if (p != MAP_FAILED) {
+        errno = 0;
+        r = mremap(p, PAGE_SIZE, 2 * PAGE_SIZE, 0);
+        EXPECT(r == MAP_FAILED && errno == EINVAL,
+               "mremap grow-in-place into infra rejected with EINVAL");
+        munmap(p, PAGE_SIZE);
+    } else {
+        printf(
+            "SKIP grow-in-place: cannot place sentinel mapping (already "
+            "taken)\n");
+    }
+
+    /* Case 4: madvise(MADV_DONTNEED) on an infra range. */
+    errno = 0;
+    int rc = madvise((void *) (uintptr_t) infra, PAGE_SIZE, MADV_DONTNEED);
+    EXPECT(rc == -1 && errno == EINVAL,
+           "madvise(MADV_DONTNEED) on infra rejected with EINVAL");
+
+    munmap(src, PAGE_SIZE);
+
+    if (failures) {
+        fprintf(stderr, "FAIL: %d check(s) failed\n", failures);
+        return 1;
+    }
+    printf("OK\n");
+    return 0;
+}
diff --git a/tests/test-shim-cred-race.c b/tests/test-shim-cred-race.c
new file mode 100644
index 0000000..1c747d3
--- /dev/null
+++ b/tests/test-shim-cred-race.c
@@ -0,0 +1,106 @@
+/* test-shim-cred-race.c -- shim identity cache stays consistent under
+ * concurrent setuid traffic.
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * elfuse's permission model rejects setuid/setresuid to any value not
+ * already in {real, effective, saved}, which means a guest binary
+ * cannot legally toggle between two distinct uids without privileged
+ * setup. This test therefore exercises the no-op-publish path: the
+ * mutator calls setresuid(uid, uid, uid) in a tight loop while the
+ * reader spins on geteuid via the shim's identity fast path.
+ *
+ * What it pins:
+ *
+ *   - cred_publish_after runs without corrupting the cache: every
+ *     reader observation must equal the initial euid.
+ *   - The publish path is wired into the SC_FORWARD setuid family
+ *     (a regression that bypasses publish would still pass because
+ *     values don't change, but a regression that crashes during the
+ *     atomic store would surface as a SIGSEGV or hang).
+ *
+ * What it does NOT pin (deferred to Slice B's attention bracket):
+ *
+ *   - True cred-tearing during a multi-field publish. Demonstrating
+ *     that requires a setuid path that mutates {uid, euid, gid,
+ *     egid} as a coherent group; elfuse's permission model does not
+ *     support such a state transition from the guest side.
+ */
+
+#include <errno.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "raw-syscall.h"
+
+#ifndef __NR_geteuid
+#define __NR_geteuid 175
+#endif
+#ifndef __NR_setresuid
+#define __NR_setresuid 147
+#endif
+
+static atomic_int stop;
+static atomic_long observed_other;
+static long expected_euid;
+
+static void *reader(void *arg)
+{
+    (void) arg; /* unused: state is shared via file-scope atomics */
+    while (atomic_load_explicit(&stop, memory_order_relaxed) == 0) {
+        long euid = raw_syscall0(__NR_geteuid);
+        if (euid != expected_euid)
+            atomic_store_explicit(&observed_other, euid, memory_order_relaxed);
+    }
+    return NULL;
+}
+
+int main(void)
+{
+    expected_euid = raw_syscall0(__NR_geteuid);
+    if (expected_euid < 0) {
+        fprintf(stderr, "FAIL: initial geteuid returned %ld\n", expected_euid);
+        return 1;
+    }
+    atomic_store(&observed_other, -1);
+    atomic_store(&stop, 0);
+
+    pthread_t tid;
+    if (pthread_create(&tid, NULL, reader, NULL) != 0) {
+        fprintf(stderr, "FAIL: pthread_create for reader thread failed\n");
+        return 1;
+    }
+
+    /* 50_000 no-op setresuid calls. Each triggers cred_publish_after
+     * on the elfuse side, racing the reader thread.
+     */
+    for (int i = 0; i < 50000; i++) {
+        long r = raw_syscall3(__NR_setresuid, (long) expected_euid,
+                              (long) expected_euid, (long) expected_euid);
+        if (r != 0) {
+            fprintf(stderr, "FAIL setresuid(%ld,%ld,%ld) iter %d: errno %ld\n",
+                    expected_euid, expected_euid, expected_euid, i, -r);
+            atomic_store(&stop, 1);
+            pthread_join(tid, NULL);
+            return 1;
+        }
+    }
+    atomic_store(&stop, 1);
+    pthread_join(tid, NULL);
+    long bad = atomic_load(&observed_other);
+    if (bad != -1) {
+        fprintf(stderr, "FAIL: reader observed euid %ld (expected %ld)\n", bad,
+                expected_euid);
+        return 1;
+    }
+    printf("OK (50000 no-op publishes, no torn read)\n");
+    return 0;
+}
diff --git a/tests/test-shim-data-el1.c b/tests/test-shim-data-el1.c
new file mode 100644
index 0000000..b3d31c5
--- /dev/null
+++ b/tests/test-shim-data-el1.c
@@ -0,0 +1,224 @@
+/* test-shim-data-el1.c -- guest EL0 cannot read or write shim_data.
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The shim_data block holds the identity cache, attention flag,
+ * urandom bitmap, ring, and lock. Mapping it with AP[2:1]=00
+ * (privileged-only) prevents a guest from spoofing its own identity
+ * by storing directly to the cache GVA, or from observing the bytes
+ * the urandom fast path will hand out next.
+ *
+ * This test:
+ *   1. Parses /proc/self/maps to find [shim-data].
+ *   2. Verifies the permission string is "---p" (PROT_NONE).
+ *   3. Installs SIGSEGV handler + sigsetjmp; loads the first byte
+ *      from the [shim-data] base; expects SIGSEGV.
+ *   4. Same with a store; expects SIGSEGV.
+ *   5. Verifies the identity and urandom fast paths still work
+ *      AFTER the EL0 access attempts (no shim corruption).
+ *   6. execve's self with argv[1]='post-exec' and reruns the perms
+ *      and fault checks against the new image. Catches the
+ *      regression where the execve mapping path forgets to apply
+ *      EL1-only and silently downgrades shim_data to plain RW.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+static sigjmp_buf segv_jmp;
+
+static void on_sigsegv(int sig)
+{
+    (void) sig;
+    siglongjmp(segv_jmp, 1); /* resume at the probe's sigsetjmp with 1 */
+}
+
+static uint64_t find_shim_data_base(void)
+{
+    FILE *maps = fopen("/proc/self/maps", "r");
+    if (maps == NULL)
+        return 0;
+    uint64_t found = 0;
+    char row[512];
+    while (fgets(row, sizeof(row), maps) != NULL) {
+        if (strstr(row, "[shim-data]") == NULL)
+            continue;
+        /* Only the first matching line is examined, parsed or not.
+         * Anything but "---p" is reported and yields 0.
+         */
+        unsigned long long start = 0;
+        char mode[8] = {0};
+        if (sscanf(row, "%llx-%*llx %7s", &start, mode) != 2)
+            break;
+        printf("[shim-data] base=0x%llx perms=%s\n", start, mode);
+        if (strcmp(mode, "---p") == 0)
+            found = start;
+        else
+            fprintf(stderr, "FAIL: [shim-data] perms %s, expected ---p\n",
+                    mode);
+        break;
+    }
+    fclose(maps);
+    return found;
+}
+
+static int probe_read(uint64_t addr)
+{
+    if (sigsetjmp(segv_jmp, 1) != 0)
+        return -1; /* SIGSEGV caught */
+    volatile uint8_t *p = (volatile uint8_t *) (uintptr_t) addr;
+    volatile uint8_t v = *p; /* the one-byte load under test */
+    (void) v;
+    return 0; /* load completed without jumping back via the handler */
+}
+
+static int probe_write(uint64_t addr)
+{
+    if (sigsetjmp(segv_jmp, 1) != 0)
+        return -1; /* re-entered via siglongjmp from on_sigsegv */
+    volatile uint8_t *p = (volatile uint8_t *) (uintptr_t) addr;
+    *p = 0xA5; /* the one-byte store under test */
+    return 0; /* store completed without jumping back via the handler */
+}
+
+/* Phase 2 (post-execve): only the perm-string and fault checks. The
+ * identity and urandom sanity is already exercised in phase 1; here
+ * the goal is to catch a regression where execve maps shim_data with
+ * plain RW instead of RW_EL1_ONLY.
+ */
+static int run_post_exec_checks(void)
+{
+    uint64_t shim = find_shim_data_base();
+    if (shim == 0) {
+        fprintf(stderr, "FAIL post-exec: shim-data missing or wrong perms\n");
+        return 1;
+    }
+    struct sigaction act = {0};
+    act.sa_flags = SA_NODEFER;
+    act.sa_handler = on_sigsegv;
+    sigemptyset(&act.sa_mask);
+    sigaction(SIGSEGV, &act, NULL);
+    sigaction(SIGBUS, &act, NULL);
+    if (probe_read(shim) != -1) {
+        fprintf(stderr,
+                "FAIL post-exec: read at 0x%llx did not fault "
+                "(execve mapped shim_data RW instead of RW_EL1_ONLY)\n",
+                (unsigned long long) shim);
+        return 1;
+    }
+    if (probe_write(shim) != -1) {
+        fprintf(stderr, "FAIL post-exec: write at 0x%llx did not fault\n",
+                (unsigned long long) shim);
+        return 1;
+    }
+    printf("OK post-exec [shim-data] still EL1-only\n");
+    printf("OK\n");
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+    if (argc > 1 && strcmp(argv[1], "post-exec") == 0)
+        return run_post_exec_checks();
+
+    uint64_t base = find_shim_data_base();
+    if (!base)
+        return 1;
+
+    struct sigaction sa = {0};
+    sa.sa_handler = on_sigsegv;
+    sigemptyset(&sa.sa_mask);
+    sa.sa_flags = SA_NODEFER;
+    sigaction(SIGSEGV, &sa, NULL);
+    sigaction(SIGBUS, &sa, NULL);
+
+    /* Read of [shim-data] must fault. */
+    if (probe_read(base) != -1) {
+        fprintf(stderr, "FAIL: read at 0x%llx did not fault\n",
+                (unsigned long long) base);
+        return 1;
+    }
+    printf("OK read fault at base\n");
+
+    /* Read further into the region (urandom ring area, offset 0x100). */
+    if (probe_read(base + 0x100) != -1) {
+        fprintf(stderr, "FAIL: read at 0x%llx did not fault\n",
+                (unsigned long long) (base + 0x100));
+        return 1;
+    }
+    printf("OK read fault at base+0x100\n");
+
+    /* Store attempt must fault too. */
+    if (probe_write(base) != -1) {
+        fprintf(stderr, "FAIL: write at 0x%llx did not fault\n",
+                (unsigned long long) base);
+        return 1;
+    }
+    printf("OK write fault at base\n");
+
+    /* After the fault attempts, identity fast path must still work. */
+    register long x0 __asm__("x0");
+    register long x8 __asm__("x8") = 172; /* getpid */
+    __asm__ volatile("svc #0" : "=r"(x0) : "r"(x8) : "memory", "cc");
+    long raw_pid = x0; /* copy out now: getpid() below clobbers x0 */
+    if (raw_pid != getpid()) {
+        fprintf(stderr,
+                "FAIL: identity fast path broken after faults: "
+                "raw getpid=%ld libc=%d\n",
+                raw_pid, getpid());
+        return 1;
+    }
+    printf("OK identity fast path still works (pid=%ld)\n", raw_pid);
+
+    /* Urandom fast path too. */
+    int fd = open("/dev/urandom", O_RDONLY);
+    if (fd < 0) {
+        perror("open /dev/urandom");
+        return 1;
+    }
+    char b;
+    if (read(fd, &b, 1) != 1) {
+        fprintf(stderr, "FAIL: urandom read broken after faults\n");
+        close(fd);
+        return 1;
+    }
+    printf("OK urandom fast path still works\n");
+
+    /* The host syscall handlers must also refuse to act on a guest-
+     * supplied [shim-data] GVA; otherwise a guest could spoof the
+     * identity cache via read(fd, shim_data_gva, n) instead of a direct
+     * EL0 store. The host's gva_translate_perm rejects EL1-only
+     * descriptors before any host write fires; read returns EFAULT.
+     */
+    errno = 0;
+    ssize_t rc = read(fd, (void *) (uintptr_t) base, 1);
+    if (rc != -1 || errno != EFAULT) {
+        fprintf(stderr,
+                "FAIL: read(fd, [shim-data], 1) = %zd errno=%d "
+                "(expected -1/EFAULT, attacker could have spoofed cache)\n",
+                rc, errno);
+        close(fd);
+        return 1;
+    }
+    printf("OK host-side spoofing attempt via read returned EFAULT\n");
+    close(fd);
+
+    /* Phase 2: re-exec self with argv[1]='post-exec' to exercise the
+     * post-execve shim_data mapping. If exec.c forgets RW_EL1_ONLY, the
+     * new image sees 'rw-p' and probe_read does not fault, failing the
+     * regression. Flush first: execv discards unwritten stdio buffers.
+     */
+    char *exec_argv[] = {argv[0], "post-exec", NULL};
+    fflush(stdout);
+    execv("/proc/self/exe", exec_argv);
+    perror("execv");
+    return 1;
+}
diff --git a/tests/test-shim-identity-attention.c b/tests/test-shim-identity-attention.c
new file mode 100644
index 0000000..d123304
--- /dev/null
+++ b/tests/test-shim-identity-attention.c
@@ -0,0 +1,136 @@
+/* test-shim-identity-attention.c -- SIGALRM survives fast paths.
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Slice A of the identity fast-path optimization routes getpid (and the
+ * other five identity syscalls) through the EL1 shim without HVC #5.
+ * That skips the post-HVC signal_check_timer epilogue in vcpu_run_loop,
+ * which is what normally notices a fired guest ITIMER_REAL and queues
+ * SIGALRM. Without Slice B's attention flag, a vCPU stuck in a tight
+ * getpid loop would never re-enter EL1 and SIGALRM would arrive late
+ * (worst case: not until the per-iteration vCPU alarm timeout fires,
+ * potentially hundreds of milliseconds).
+ *
+ * This test arms an ITIMER_REAL for 100 ms, then spins for ~1 second
+ * OR until SIGALRM fires. It covers both getpid via the raw SVC and a
+ * seeded CLOCK_REALTIME vDSO loop, because both fast paths otherwise
+ * bypass the HVC epilogue that runs signal_check_timer().
+ */
+
+#include <errno.h>
+#include <signal.h>
+#include <stdatomic.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "raw-syscall.h"
+
+#ifndef __NR_getpid
+#define __NR_getpid 172
+#endif
+
+static volatile sig_atomic_t alarm_fired;
+static struct timespec alarm_ts;
+
+static void on_sigalrm(int sig)
+{
+    (void) sig;
+    clock_gettime(CLOCK_MONOTONIC, &alarm_ts);
+    alarm_fired = 1; /* set last: the spin loop polls this flag */
+}
+
+static long ns_diff(const struct timespec *a, const struct timespec *b)
+{
+    long long whole = (a->tv_sec - b->tv_sec) * 1000000000LL; /* s -> ns */
+    return (long) (whole + (a->tv_nsec - b->tv_nsec));
+}
+
+static int run_alarm_spin(const char *name, int use_realtime_vdso)
+{
+    struct timespec armed_at;
+    alarm_ts = (struct timespec) {0};
+    alarm_fired = 0;
+
+    /* Seed call: one CLOCK_REALTIME read before the timer is armed. */
+    if (use_realtime_vdso) {
+        struct timespec warmup;
+        clock_gettime(CLOCK_REALTIME, &warmup);
+    }
+
+    clock_gettime(CLOCK_MONOTONIC, &armed_at);
+
+    struct itimerval one_shot = {0};
+    one_shot.it_value.tv_usec = 100 * 1000; /* 100 ms */
+    if (setitimer(ITIMER_REAL, &one_shot, NULL) < 0) {
+        fprintf(stderr, "FAIL %s setitimer: %s\n", name, strerror(errno));
+        return 1;
+    }
+
+    /* Arming the timer is expected to raise the attention flag so the
+     * fast paths fall back to HVC and signal_check_timer can notice the
+     * 100 ms expiry. The spin is capped at 1 s (checked every 65536
+     * iterations) so a broken attention path fails instead of hanging.
+     */
+    long spins = 0;
+    while (!alarm_fired) {
+        if (use_realtime_vdso) {
+            struct timespec rt_now;
+            clock_gettime(CLOCK_REALTIME, &rt_now);
+        } else {
+            (void) raw_syscall0(__NR_getpid);
+        }
+        spins++;
+        if ((spins & 0xFFFF) != 0)
+            continue;
+        struct timespec mono_now;
+        clock_gettime(CLOCK_MONOTONIC, &mono_now);
+        if (ns_diff(&mono_now, &armed_at) > 1000000000L)
+            break;
+    }
+
+    if (!alarm_fired) {
+        fprintf(stderr,
+                "FAIL %s: SIGALRM did not fire within 1 s (iterations=%ld)\n",
+                name, spins);
+        return 1;
+    }
+
+    /* Nominal delivery is the 100 ms timer period; up to 300 ms is
+     * tolerated to absorb host scheduling jitter under load.
+     */
+    long latency_ns = ns_diff(&alarm_ts, &armed_at);
+    if (latency_ns > 300 * 1000 * 1000L) {
+        fprintf(stderr, "FAIL %s: SIGALRM delivered after %ld ns (>300 ms)\n",
+                name, latency_ns);
+        return 1;
+    }
+
+    printf("OK %s: SIGALRM after %ld ns (iterations=%ld)\n", name, latency_ns,
+           spins);
+    return 0;
+}
+
+int main(void)
+{
+    struct sigaction act = {0};
+    act.sa_handler = on_sigalrm;
+    sigemptyset(&act.sa_mask);
+    if (sigaction(SIGALRM, &act, NULL) < 0) {
+        fprintf(stderr, "FAIL sigaction: %s\n", strerror(errno));
+        return 1;
+    }
+
+    /* Second argument picks the spin body: 0 = raw getpid SVC,
+     * 1 = CLOCK_REALTIME clock_gettime. Stop at the first failure.
+     */
+    int failed = run_alarm_spin("getpid", 0) != 0 ||
+                 run_alarm_spin("clock_realtime_vdso", 1) != 0;
+    return failed ? 1 : 0;
+}
diff --git a/tests/test-shim-identity.c b/tests/test-shim-identity.c
new file mode 100644
index 0000000..db529df
--- /dev/null
+++ b/tests/test-shim-identity.c
@@ -0,0 +1,170 @@
+/* test-shim-identity.c -- verify identity syscalls do not trust vDSO memory
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * A guest can legally unmap or replace its vDSO page. getpid/getppid/getuid/
+ * geteuid/getgid/getegid must still be sourced from host-side process state,
+ * not from guest-remappable vDSO contents.
+ */
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "raw-syscall.h"
+
+#ifndef __NR_getpid
+#define __NR_getpid 172
+#endif
+#ifndef __NR_getppid
+#define __NR_getppid 173
+#endif
+#ifndef __NR_getuid
+#define __NR_getuid 174
+#endif
+#ifndef __NR_geteuid
+#define __NR_geteuid 175
+#endif
+#ifndef __NR_getgid
+#define __NR_getgid 176
+#endif
+#ifndef __NR_getegid
+#define __NR_getegid 177
+#endif
+#ifndef __NR_mmap
+#define __NR_mmap 222
+#endif
+
+#define VDSO_BASE ((void *) (uintptr_t) 0x0000F000UL)
+#define VDSO_SIZE 0x1000UL
+
+static int failures = 0;
+/* Compare two values as long; on mismatch print both and bump `failures`. */
+#define EXPECT_EQ_LONG(a_expr, b_expr, label)                        \
+    do {                                                             \
+        long _a = (long) (a_expr);                                   \
+        long _b = (long) (b_expr);                                   \
+        if (_a != _b) {                                              \
+            fprintf(stderr, "FAIL %s: %ld != %ld\n", label, _a, _b); \
+            failures++;                                              \
+        }                                                            \
+    } while (0)
+
+static long parse_status_field(const char *key)
+{
+    /* Value of the first "<key>:" line in /proc/self/status, or -1. */
+    FILE *status = fopen("/proc/self/status", "r");
+    if (status == NULL)
+        return -1;
+    long result = -1;
+    char buf[256];
+    for (size_t key_len = strlen(key); fgets(buf, sizeof(buf), status);) {
+        if (strncmp(buf, key, key_len) != 0 || buf[key_len] != ':')
+            continue;
+        result = strtol(buf + key_len + 1, NULL, 10);
+        break;
+    }
+    fclose(status);
+    return result;
+}
+
+static void check_self(const char *phase)
+{
+    long pid = raw_syscall0(__NR_getpid);
+    long ppid = raw_syscall0(__NR_getppid);
+    long uid = raw_syscall0(__NR_getuid);
+    long euid = raw_syscall0(__NR_geteuid);
+    long gid = raw_syscall0(__NR_getgid);
+    long egid = raw_syscall0(__NR_getegid);
+    /* Only pid and ppid are cross-checked against /proc/self/status. */
+    EXPECT_EQ_LONG(pid, parse_status_field("Pid"), "getpid vs /proc");
+    EXPECT_EQ_LONG(ppid, parse_status_field("PPid"), "getppid vs /proc");
+
+    /* Repeated calls must be stable. */
+    EXPECT_EQ_LONG(pid, raw_syscall0(__NR_getpid), "getpid repeat");
+    EXPECT_EQ_LONG(uid, raw_syscall0(__NR_getuid), "getuid repeat");
+    EXPECT_EQ_LONG(euid, raw_syscall0(__NR_geteuid), "geteuid repeat");
+    EXPECT_EQ_LONG(gid, raw_syscall0(__NR_getgid), "getgid repeat");
+    EXPECT_EQ_LONG(egid, raw_syscall0(__NR_getegid), "getegid repeat");
+
+    printf("%s: pid=%ld ppid=%ld uid=%ld euid=%ld gid=%ld egid=%ld\n", phase,
+           pid, ppid, uid, euid, gid, egid);
+}
+
+static void remap_vdso_page(void)
+{
+    long p = raw_syscall6(__NR_mmap, (long) VDSO_BASE, VDSO_SIZE,
+                          PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+    if (p < 0) { /* a negative return is treated as -errno */
+        fprintf(stderr, "FAIL mmap(MAP_FIXED vDSO): %s\n", strerror((int) -p));
+        failures++;
+        return;
+    }
+    if ((void *) (uintptr_t) p != VDSO_BASE) {
+        fprintf(stderr, "FAIL mmap(MAP_FIXED vDSO): got %p\n",
+                (void *) (uintptr_t) p);
+        failures++;
+        return;
+    }
+    /* Overwrite the replacement page with 0xA5 filler bytes. */
+    memset(VDSO_BASE, 0xA5, VDSO_SIZE);
+}
+
+static void check_fork_child(void)
+{
+    /* A forked child must report a fresh pid and the parent's pid as ppid. */
+    const long parent_pid = raw_syscall0(__NR_getpid);
+    const pid_t kid = fork();
+    if (kid < 0) {
+        fprintf(stderr, "FAIL fork: %s\n", strerror(errno));
+        failures++;
+        return;
+    }
+    if (kid == 0) {
+        const long self = raw_syscall0(__NR_getpid);
+        const long parent = raw_syscall0(__NR_getppid);
+        if (self == parent_pid) {
+            fprintf(stderr,
+                    "FAIL fork-child: getpid==parent_pid (stale vvar)\n");
+            _exit(2);
+        }
+        if (parent != parent_pid) {
+            fprintf(stderr, "FAIL fork-child: getppid=%ld parent_pid=%ld\n",
+                    parent, parent_pid);
+            _exit(3);
+        }
+        _exit(0);
+    }
+    int wstatus = 0;
+    if (waitpid(kid, &wstatus, 0) != kid) {
+        fprintf(stderr, "FAIL fork-child waitpid: %s\n", strerror(errno));
+    } else if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) {
+        return;
+    } else {
+        fprintf(stderr, "FAIL fork-child exit status %d\n", wstatus);
+    }
+    failures++;
+}
+
+int main(void)
+{
+    printf("test-shim-identity: identity syscalls ignore remapped vDSO\n");
+    check_self("before-remap");
+    remap_vdso_page(); /* on success the vDSO page now holds 0xA5 filler */
+    check_self("after-remap");
+    check_fork_child();
+    if (failures == 0) {
+        printf("OK\n");
+        return 0;
+    }
+    fprintf(stderr, "FAIL: %d check(s) failed\n", failures);
+    return 1;
+}
diff --git a/tests/test-shim-urandom-smp.c b/tests/test-shim-urandom-smp.c
new file mode 100644
index 0000000..f8d657c
--- /dev/null
+++ b/tests/test-shim-urandom-smp.c
@@ -0,0 +1,149 @@
+/* test-shim-urandom-smp.c -- multi-thread urandom-read stress.
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The shim's urandom-read fast path advances a shared ring head via
+ * LDXR/STXR. Concurrent vCPUs reading /dev/urandom must:
+ *   1. Never see a torn or zero-filled byte (host always refills
+ *      with arc4random_buf output).
+ *   2. Never observe the same byte sequence as a sibling thread
+ *      (each thread's atomic head-advance reserves its own slice
+ *      of the ring).
+ *   3. Keep the head from overflowing or underflowing the ring.
+ *
+ * Each thread reads N 1-byte samples and records them in a private
+ * histogram. After the run we check:
+ *   - Total bytes consumed across all threads equals N * threads.
+ *   - No thread's per-byte distribution is degenerate (all-zero or
+ *     all-one buckets indicate the fast path served stale memory).
+ *   - The sums across threads differ from each other (a hard test
+ *     that the threads are actually getting independent bytes).
+ *
+ * The test runs only under elfuse, where the urandom fast path is
+ * live; on native Linux the read() goes straight to the kernel and
+ * the same invariants hold trivially.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#define NTHREADS 4
+#define NSAMPLES 16384
+
+typedef struct {
+    int fd;             /* descriptor every worker reads from */
+    int tid_index;      /* worker number, used in failure messages */
+    int histogram[256]; /* occurrences of each byte value this worker read */
+    uint64_t sum;       /* running sum of all byte values read */
+} worker_arg_t;
+
+static void *worker(void *arg)
+{
+    /* Pull NSAMPLES single bytes; a non-NULL return marks a failed read. */
+    worker_arg_t *self = arg;
+    for (int iter = 0; iter < NSAMPLES; iter++) {
+        unsigned char byte;
+        ssize_t got = read(self->fd, &byte, 1);
+        if (got != 1) {
+            fprintf(stderr,
+                    "FAIL thread %d iter %d: read returned %zd "
+                    "(errno=%d)\n",
+                    self->tid_index, iter, got, errno);
+            return (void *) (uintptr_t) 1;
+        }
+        self->histogram[byte]++;
+        self->sum += byte;
+    }
+    return NULL;
+}
+
+int main(void)
+{
+    /* One shared fd: every thread shares the same FD_URANDOM slot,
+     * so the shim's fast path is exercised on the same bitmap bit
+     * by all threads simultaneously.
+     */
+    int fd = open("/dev/urandom", O_RDONLY);
+    if (fd < 0) {
+        perror("open /dev/urandom");
+        return 1;
+    }
+
+    worker_arg_t workers[NTHREADS] = {0};
+    pthread_t threads[NTHREADS];
+    for (int i = 0; i < NTHREADS; i++) {
+        workers[i].fd = fd;
+        workers[i].tid_index = i;
+        /* pthread_create returns the error code; it does not set errno. */
+        int err = pthread_create(&threads[i], NULL, worker, &workers[i]);
+        if (err != 0) {
+            fprintf(stderr, "FAIL pthread_create %d: %s\n", i, strerror(err));
+            return 1;
+        }
+    }
+
+    int failures = 0;
+    for (int i = 0; i < NTHREADS; i++) {
+        void *rc = NULL;
+        pthread_join(threads[i], &rc);
+        if (rc != NULL) {
+            failures++;
+            continue;
+        }
+        /* Per-thread distribution sanity: each bucket should be
+         * roughly NSAMPLES / 256 = 64 with stddev about 8. Flag any
+         * thread whose distribution is wildly off.
+         */
+        int min = NSAMPLES, max = 0, zeros = 0;
+        for (int b = 0; b < 256; b++) {
+            int c = workers[i].histogram[b];
+            min = c < min ? c : min;
+            max = c > max ? c : max;
+            zeros += (c == 0);
+        }
+        printf("thread %d: sum=%llu min=%d max=%d zero-buckets=%d\n", i,
+               (unsigned long long) workers[i].sum, min, max, zeros);
+        if (max == NSAMPLES) {
+            fprintf(stderr, "FAIL thread %d: all bytes identical\n", i);
+            failures++;
+        }
+        if (zeros > 32) {
+            fprintf(stderr, "FAIL thread %d: %d unused buckets (degenerate)\n",
+                    i, zeros);
+            failures++;
+        }
+    }
+    close(fd);
+
+    /* Threads must not have consumed identical byte streams. Independent
+     * streams collide on `sum` alone roughly 1 time in 30000 per pair, so
+     * a sum match only counts when the full histograms match as well.
+     */
+    for (int i = 0; i < NTHREADS; i++) {
+        for (int j = i + 1; j < NTHREADS; j++) {
+            if (workers[i].sum == workers[j].sum &&
+                memcmp(workers[i].histogram, workers[j].histogram,
+                       sizeof(workers[i].histogram)) == 0) {
+                fprintf(stderr,
+                        "FAIL threads %d and %d have identical sum=%llu\n", i,
+                        j, (unsigned long long) workers[i].sum);
+                failures++;
+            }
+        }
+    }
+
+    if (failures) {
+        fprintf(stderr, "FAIL: %d issue(s)\n", failures);
+        return 1;
+    }
+    printf("OK: %d threads x %d 1B reads each, ring stayed consistent\n",
+           NTHREADS, NSAMPLES);
+    return 0;
+}
diff --git a/tests/test-shim-urandom-toctou.c b/tests/test-shim-urandom-toctou.c
new file mode 100644
index 0000000..1cb0ce9
--- /dev/null
+++ b/tests/test-shim-urandom-toctou.c
@@ -0,0 +1,124 @@
+/* test-shim-urandom-toctou.c -- urandom EL1 fault recovery survives
+ * concurrent mprotect(PROT_NONE) of the read buffer.
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The urandom-read shim fast path probes the guest buffer (AT s1e0w) and
+ * then performs an EL1 strb into it. A sibling vCPU can mprotect or
+ * munmap the buffer between the probe and the store, faulting the EL1
+ * write. Without handle_el1_data_abort_recover, that fault routes to
+ * BAD_VEC and halts the VM.
+ *
+ * This test runs a tight loop of read(/dev/urandom, buf, 1) while a
+ * sibling thread continuously flips the buffer between PROT_READ|WRITE
+ * and PROT_NONE via mprotect. Expected behavior:
+ *   - read returns 1 (success) when the buffer is RW
+ *   - read returns -1 with errno=EFAULT when the buffer is PROT_NONE
+ *   - elfuse never halts
+ *
+ * If the recovery handler is missing or wrong, the VM crashes mid-run
+ * and the test process never returns; the make-check timeout catches
+ * that as a failure.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#define PAGE_SIZE 4096
+#define ITERATIONS 20000
+
+static atomic_int stop;
+static atomic_int flips;
+static atomic_int efault_count;
+static atomic_int success_count;
+static void *shared_buf;
+
+static void *protect_flipper(void *arg)
+{
+    (void) arg; /* unused */
+    int prot = PROT_READ | PROT_WRITE;
+    while (!atomic_load_explicit(&stop, memory_order_relaxed)) {
+        prot ^= (PROT_READ | PROT_WRITE); /* toggles between RW and 0 */
+        if (mprotect(shared_buf, PAGE_SIZE, prot) != 0) {
+            fprintf(stderr, "mprotect failed: %s\n", strerror(errno));
+            return (void *) (uintptr_t) 1; /* non-NULL marks the failure */
+        }
+        atomic_fetch_add(&flips, 1);
+    }
+    /* Leave the buffer accessible at exit. */
+    mprotect(shared_buf, PAGE_SIZE, PROT_READ | PROT_WRITE);
+    return NULL;
+}
+
+int main(void)
+{
+    shared_buf = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
+                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (shared_buf == MAP_FAILED) {
+        perror("mmap");
+        return 1;
+    }
+    int fd = open("/dev/urandom", O_RDONLY);
+    if (fd < 0) {
+        perror("open /dev/urandom");
+        return 1;
+    }
+
+    atomic_store(&stop, 0);
+    pthread_t flipper;
+    if ((errno = pthread_create(&flipper, NULL, protect_flipper, NULL)) != 0) {
+        perror("pthread_create"); /* errno now holds the returned code */
+        return 1;
+    }
+
+    /* Reader: every read() targets shared_buf, the page the flipper is
+     * toggling, so the store can land while it is PROT_NONE. Success and
+     * EFAULT are both fine; anything else (or a VM halt) is a failure.
+     */
+    for (int i = 0; i < ITERATIONS; i++) {
+        errno = 0;
+        ssize_t r = read(fd, shared_buf, 1);
+        if (r == 1) {
+            atomic_fetch_add(&success_count, 1);
+        } else if (r == -1 && errno == EFAULT) {
+            atomic_fetch_add(&efault_count, 1);
+        } else {
+            fprintf(stderr, "FAIL iter %d: unexpected read rc=%zd errno=%d\n",
+                    i, r, errno);
+            atomic_store(&stop, 1);
+            pthread_join(flipper, NULL);
+            return 1;
+        }
+    }
+
+    atomic_store(&stop, 1);
+    void *flip_rc = NULL; /* non-NULL when the flipper quit on mprotect error */
+    pthread_join(flipper, &flip_rc);
+    close(fd);
+    munmap(shared_buf, PAGE_SIZE);
+
+    int s = atomic_load(&success_count);
+    int e = atomic_load(&efault_count);
+    int f = atomic_load(&flips);
+    printf("iters=%d success=%d efault=%d mprotect_flips=%d\n", ITERATIONS, s,
+           e, f);
+    if (flip_rc != NULL || s + e != ITERATIONS) {
+        fprintf(stderr, "FAIL: %s\n",
+                flip_rc ? "flipper aborted" : "success+efault != iterations");
+        return 1;
+    }
+    if (e == 0)
+        printf("WARN: no EFAULT observed; race window may be too short on "
+               "this host. VM did not crash, which is the primary check.\n");
+    printf("OK\n");
+    return 0;
+}
diff --git a/tests/test-shim-verbose-trace.c b/tests/test-shim-verbose-trace.c
new file mode 100644
index 0000000..372facd
--- /dev/null
+++ b/tests/test-shim-verbose-trace.c
@@ -0,0 +1,50 @@
+/* test-shim-verbose-trace.c -- fixture for verbose tracing of shim fast paths
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <stdio.h>
+
+#include "raw-syscall.h"
+
+#ifndef __NR_openat
+#define __NR_openat 56
+#endif
+#ifndef __NR_close
+#define __NR_close 57
+#endif
+#ifndef __NR_read
+#define __NR_read 63
+#endif
+#ifndef __NR_getpid
+#define __NR_getpid 172
+#endif
+
+#define AT_FDCWD -100
+
+int main(void)
+{
+    /* One getpid plus one 1-byte /dev/urandom read, all via raw syscalls. */
+    const long pid = raw_syscall0(__NR_getpid);
+    if (pid <= 0) {
+        fprintf(stderr, "getpid failed: %ld\n", pid);
+        return 1;
+    }
+
+    long fd = raw_syscall4(__NR_openat, AT_FDCWD, (long) "/dev/urandom", 0, 0);
+    if (fd < 0) {
+        fprintf(stderr, "openat /dev/urandom failed: %ld\n", fd);
+        return 1;
+    }
+
+    unsigned char byte = 0;
+    const long n = raw_syscall3(__NR_read, fd, (long) &byte, 1);
+    const long close_rc = raw_syscall1(__NR_close, fd);
+    if (n == 1 && close_rc >= 0) {
+        printf("pid=%ld byte=%u\n", pid, (unsigned) byte);
+        return 0;
+    }
+    fprintf(stderr, "read/close failed: n=%ld close=%ld\n", n, close_rc);
+    return 1;
+}
diff --git a/tests/test-syscall-smoke.c b/tests/test-syscall-smoke.c
index 809998f..a59925f 100644
--- a/tests/test-syscall-smoke.c
+++ b/tests/test-syscall-smoke.c
@@ -6,6 +6,7 @@
 
 #include <errno.h>
 #include <fcntl.h>
+#include <limits.h>
 #include <netinet/in.h>
 #include <poll.h>
 #include <signal.h>
@@ -57,6 +58,10 @@
 #define SYS_sigaltstack 132
 #endif
 
+#ifndef O_PATH
+#define O_PATH 010000000
+#endif
+
 #ifndef SYS_set_tid_address
 #define SYS_set_tid_address 96
 #endif
@@ -623,6 +628,262 @@ static void test_sysv_semaphore_ops(void)
     }
 }
 
+static void test_urandom_byte_reads(void)
+{
+    TEST("/dev/urandom byte reads");
+    int fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
+    if (fd < 0) {
+        FAIL("open");
+        return;
+    }
+
+    /* Fill the sample with 1-byte read() calls, stopping at the first
+     * one that does not return exactly 1.
+     */
+    unsigned char sample[32];
+    size_t filled = 0;
+    while (filled < sizeof(sample) && read(fd, &sample[filled], 1) == 1)
+        filled++;
+    close(fd);
+    if (filled != sizeof(sample)) {
+        FAIL("read");
+        return;
+    }
+
+    /* Find the first byte differing from sample[0]; none means the 32
+     * bytes were all identical. */
+    size_t idx = 1;
+    while (idx < sizeof(sample) && sample[idx] == sample[0])
+        idx++;
+    if (idx == sizeof(sample)) {
+        FAIL("entropy stream did not vary");
+        return;
+    }
+    PASS();
+}
+
+static void test_urandom_open_flags(void)
+{
+    TEST("/dev/urandom open flags");
+    /* O_DIRECTORY on a non-directory must fail with ENOTDIR. */
+    errno = 0;
+    int dirfd = open("/dev/urandom", O_RDONLY | O_DIRECTORY);
+    if (dirfd >= 0) {
+        close(dirfd);
+        FAIL("O_DIRECTORY open succeeded");
+        return;
+    }
+    if (errno != ENOTDIR) {
+        FAIL("O_DIRECTORY errno");
+        return;
+    }
+    /* An O_PATH descriptor is not readable: read() must give EBADF. */
+    int pathfd = open("/dev/urandom", O_PATH | O_CLOEXEC);
+    if (pathfd < 0) {
+        FAIL("O_PATH open");
+        return;
+    }
+    unsigned char b = 0;
+    errno = 0;
+    ssize_t n = read(pathfd, &b, 1);
+    int saved_errno = errno;
+    close(pathfd);
+    if (n != -1 || saved_errno != EBADF) {
+        FAIL("O_PATH read");
+        return;
+    }
+    /* O_WRONLY must open, report O_WRONLY via F_GETFL, and refuse read(). */
+    int wfd = open("/dev/urandom", O_WRONLY | O_CLOEXEC);
+    if (wfd < 0) {
+        FAIL("O_WRONLY open");
+        return;
+    }
+    int fl = fcntl(wfd, F_GETFL);
+    errno = 0;
+    n = read(wfd, &b, 1);
+    saved_errno = errno;
+    close(wfd);
+    if (fl < 0 || (fl & O_ACCMODE) != O_WRONLY) {
+        FAIL("O_WRONLY F_GETFL");
+        return;
+    }
+    if (n != -1 || saved_errno != EBADF) {
+        FAIL("O_WRONLY read");
+        return;
+    }
+    /* A dup() of the write-only descriptor must refuse read() as well. */
+    wfd = open("/dev/urandom", O_WRONLY | O_CLOEXEC);
+    if (wfd < 0) {
+        FAIL("O_WRONLY open dup");
+        return;
+    }
+    int dupfd = dup(wfd);
+    close(wfd);
+    if (dupfd < 0) {
+        FAIL("O_WRONLY dup");
+        return;
+    }
+    errno = 0;
+    n = read(dupfd, &b, 1);
+    saved_errno = errno;
+    close(dupfd);
+    if (n != -1 || saved_errno != EBADF) {
+        FAIL("O_WRONLY dup read");
+        return;
+    }
+    /* readv() on a write-only descriptor must also give EBADF. */
+    wfd = open("/dev/urandom", O_WRONLY | O_CLOEXEC);
+    if (wfd < 0) {
+        FAIL("O_WRONLY open readv");
+        return;
+    }
+    struct iovec wv[2] = {{&b, 1}, {&b, 1}};
+    errno = 0;
+    n = readv(wfd, wv, 2);
+    saved_errno = errno;
+    close(wfd);
+    if (n != -1 || saved_errno != EBADF) {
+        FAIL("O_WRONLY readv");
+        return;
+    }
+    /* EBADF is still expected when the iovec total exceeds SSIZE_MAX. */
+    wfd = open("/dev/urandom", O_WRONLY | O_CLOEXEC);
+    if (wfd < 0) {
+        FAIL("O_WRONLY open oversized readv");
+        return;
+    }
+    struct iovec huge_wv[2] = {{&b, SSIZE_MAX}, {&b, 1}};
+    errno = 0;
+    n = readv(wfd, huge_wv, 2);
+    saved_errno = errno;
+    close(wfd);
+    if (n != -1 || saved_errno != EBADF) {
+        FAIL("O_WRONLY oversized readv");
+        return;
+    }
+    /* ...and when a single iovec length alone exceeds SSIZE_MAX. */
+    wfd = open("/dev/urandom", O_WRONLY | O_CLOEXEC);
+    if (wfd < 0) {
+        FAIL("O_WRONLY open oversized single readv");
+        return;
+    }
+    struct iovec huge_one_wv = {&b, (size_t) SSIZE_MAX + 1};
+    errno = 0;
+    n = readv(wfd, &huge_one_wv, 1);
+    saved_errno = errno;
+    close(wfd);
+    if (n != -1 || saved_errno != EBADF) {
+        FAIL("O_WRONLY oversized single readv");
+        return;
+    }
+    /* Readable descriptor: a two-element readv() must return both bytes. */
+    int rfd = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
+    if (rfd < 0) {
+        FAIL("O_RDONLY open readv");
+        return;
+    }
+    unsigned char rb[2] = {0};
+    struct iovec rv[2] = {{&rb[0], 1}, {&rb[1], 1}};
+    n = readv(rfd, rv, 2);
+    if (n != 2) {
+        close(rfd);
+        FAIL("O_RDONLY readv");
+        return;
+    }
+    /* On the readable descriptor an oversized iovec total is EINVAL. */
+    struct iovec huge[2] = {{&b, SSIZE_MAX}, {&b, 1}};
+    errno = 0;
+    n = readv(rfd, huge, 2);
+    saved_errno = errno;
+    if (n != -1 || saved_errno != EINVAL) {
+        close(rfd);
+        FAIL("oversized readv");
+        return;
+    }
+    /* Same for a single iovec longer than SSIZE_MAX. */
+    struct iovec huge_one = {&b, (size_t) SSIZE_MAX + 1};
+    errno = 0;
+    n = readv(rfd, &huge_one, 1);
+    saved_errno = errno;
+    if (n != -1 || saved_errno != EINVAL) {
+        close(rfd);
+        FAIL("oversized single readv");
+        return;
+    }
+    /* A forked child must be able to read through the inherited fd. */
+    pid_t pid = fork();
+    if (pid < 0) {
+        close(rfd);
+        FAIL("fork inherited urandom");
+        return;
+    }
+    if (pid == 0) {
+        unsigned char child_b = 0;
+        _exit(read(rfd, &child_b, 1) == 1 ? 0 : 1);
+    }
+    int status = 0;
+    pid_t reaped = waitpid(pid, &status, 0);
+    if (reaped != pid || !WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+        close(rfd);
+        FAIL("inherited urandom read");
+        return;
+    }
+    /* Post-fork, parent and child must not read identical 64-byte blocks. */
+    int p[2];
+    if (pipe(p) != 0) {
+        close(rfd);
+        FAIL("urandom fork pipe");
+        return;
+    }
+    unsigned char seed = 0;
+    if (read(rfd, &seed, 1) != 1) {
+        close(rfd);
+        close(p[0]);
+        close(p[1]);
+        FAIL("prime urandom cache before fork");
+        return;
+    }
+    pid = fork();
+    if (pid < 0) {
+        close(rfd);
+        close(p[0]);
+        close(p[1]);
+        FAIL("fork urandom cache isolation");
+        return;
+    }
+    if (pid == 0) {
+        close(p[0]);
+        unsigned char child_buf[64];
+        ssize_t got = read(rfd, child_buf, sizeof(child_buf));
+        ssize_t put = got == (ssize_t) sizeof(child_buf)
+                          ? write(p[1], child_buf, sizeof(child_buf))
+                          : -1;
+        close(p[1]);
+        _exit(put == (ssize_t) sizeof(child_buf) ? 0 : 1);
+    }
+    close(p[1]);
+    unsigned char parent_buf[64];
+    unsigned char child_buf[64];
+    ssize_t parent_n = read(rfd, parent_buf, sizeof(parent_buf));
+    ssize_t child_n = read(p[0], child_buf, sizeof(child_buf));
+    close(p[0]);
+    status = 0;
+    reaped = waitpid(pid, &status, 0); /* a failed wait must not read as exit 0 */
+    close(rfd);
+    if (reaped != pid || parent_n != (ssize_t) sizeof(parent_buf) ||
+        child_n != (ssize_t) sizeof(child_buf) || !WIFEXITED(status) ||
+        WEXITSTATUS(status) != 0) {
+        FAIL("urandom fork cache isolation read");
+        return;
+    }
+    if (memcmp(parent_buf, child_buf, sizeof(parent_buf)) == 0) {
+        FAIL("urandom fork duplicated cached bytes");
+        return;
+    }
+
+    PASS();
+}
+
 int main(int argc, char **argv)
 {
     printf("test-syscall-smoke: direct syscall smoke coverage\n\n");
@@ -642,6 +903,8 @@ int main(int argc, char **argv)
     test_memory_stubs();
     test_accept4();
     test_sysv_semaphore_ops();
+    test_urandom_byte_reads();
+    test_urandom_open_flags();
 
     SUMMARY("test-syscall-smoke");
     return fails > 0 ? 1 : 0;
diff --git a/tests/test-vdso.c b/tests/test-vdso.c
new file mode 100644
index 0000000..4d32d44
--- /dev/null
+++ b/tests/test-vdso.c
@@ -0,0 +1,269 @@
+/* test-vdso.c -- vDSO ELF correctness and symbol-resolution probe
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Confirms the synthetic vDSO emitted by src/core/vdso.c:
+ *   1. is published via AT_SYSINFO_EHDR
+ *   2. parses as a valid ELF shared object
+ *   3. exports the four __kernel_* symbols at addresses inside the page
+ *   4. carries GNU symbol versioning naming LINUX_2.6.39 so glibc/musl
+ *      dl_vdso_vsym() can resolve unversioned lookups
+ *   5. trampolines actually execute (call __kernel_clock_gettime and
+ *      compare the result against a direct SVC clock_gettime)
+ *
+ * Static binary so the standard test driver runs it under elfuse with
+ * no sysroot. The probe walks the vDSO's dynamic linker structure
+ * itself rather than relying on dlsym (which is unavailable in static
+ * builds anyway), so a regression in the elf layout fails this test
+ * regardless of which libc would later consume it.
+ */
+
+#include <elf.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/auxv.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <unistd.h>
+
+static int failures = 0;
+
+#define FAIL(msg)                           \
+    do {                                    \
+        fprintf(stderr, "FAIL: %s\n", msg); \
+        failures++;                         \
+    } while (0)
+
+#define EXPECT(cond, msg) \
+    do {                  \
+        if (!(cond))      \
+            FAIL(msg);    \
+    } while (0)
+
+/* SysV ELF hash, matches the implementation in src/core/vdso.c. */
+static uint32_t elf_hash(const char *name)
+{
+    uint32_t hash = 0;
+    for (const unsigned char *p = (const unsigned char *) name; *p; p++) {
+        hash = (hash << 4) + *p;
+        uint32_t top = hash & 0xf0000000U;
+        /* Fold the top nibble into bits 4..7, then clear it; both steps
+         * are no-ops when the nibble is zero. */
+        hash = (hash ^ (top >> 24)) & ~top;
+    }
+    return hash;
+}
+
+static const Elf64_Sym *lookup_sym(const Elf64_Ehdr *ehdr,
+                                   const Elf64_Sym *symtab,
+                                   const char *strtab,
+                                   const uint32_t *hash,
+                                   const char *name)
+{
+    const uint32_t nbucket = hash[0], nchain = hash[1];
+    const uint32_t *bucket = &hash[2];
+    const uint32_t *chain = &bucket[nbucket];
+    (void) ehdr;
+    /* An empty table yields "not found" instead of a modulo-by-zero trap. */
+    uint32_t i = nbucket ? bucket[elf_hash(name) % nbucket] : 0;
+    for (; i && i < nchain; i = chain[i]) {
+        if (strcmp(&strtab[symtab[i].st_name], name) == 0)
+            return &symtab[i];
+    }
+    return NULL;
+}
+
+typedef struct {
+    const Elf64_Sym *symtab;    /* from DT_SYMTAB */
+    const char *strtab;         /* from DT_STRTAB */
+    const uint32_t *hash;       /* from DT_HASH */
+    const uint16_t *versym;     /* from DT_VERSYM; NULL when absent */
+    const Elf64_Verdef *verdef; /* from DT_VERDEF; NULL when absent */
+    size_t strsz;               /* from DT_STRSZ */
+    int verdef_count;           /* from DT_VERDEFNUM */
+} vdso_t;
+
+static int parse_vdso(const Elf64_Ehdr *ehdr, vdso_t *v)
+{
+    memset(v, 0, sizeof(*v)); /* tags that never appear stay NULL / 0 */
+    const Elf64_Phdr *phdr =
+        (const Elf64_Phdr *) ((const uint8_t *) ehdr + ehdr->e_phoff);
+    const Elf64_Dyn *dyn = NULL;
+    for (int i = 0; i < ehdr->e_phnum; i++) {
+        if (phdr[i].p_type == PT_DYNAMIC) {
+            dyn =
+                (const Elf64_Dyn *) ((const uint8_t *) ehdr + phdr[i].p_offset);
+            break;
+        }
+    }
+    if (!dyn)
+        return -1; /* no PT_DYNAMIC segment */
+    for (; dyn->d_tag != DT_NULL; dyn++) { /* d_ptr taken as offset from ehdr */
+        const uint8_t *p = (const uint8_t *) ehdr + dyn->d_un.d_ptr;
+        switch (dyn->d_tag) { /* p is used only by the pointer-valued tags */
+        case DT_SYMTAB:
+            v->symtab = (const Elf64_Sym *) p;
+            break;
+        case DT_STRTAB:
+            v->strtab = (const char *) p;
+            break;
+        case DT_STRSZ:
+            v->strsz = (size_t) dyn->d_un.d_val;
+            break;
+        case DT_HASH:
+            v->hash = (const uint32_t *) p;
+            break;
+        case DT_VERSYM:
+            v->versym = (const uint16_t *) p;
+            break;
+        case DT_VERDEF:
+            v->verdef = (const Elf64_Verdef *) p;
+            break;
+        case DT_VERDEFNUM:
+            v->verdef_count = (int) dyn->d_un.d_val;
+            break;
+        default:
+            break;
+        }
+    }
+    return (v->symtab && v->strtab && v->hash) ? 0 : -1; /* versioning optional */
+}
+
+static const char *verdef_name_for_ndx(const vdso_t *v, uint16_t ndx)
+{
+    /* Bit 15 of a versym entry is the "hidden" flag, not part of the index. */
+    const Elf64_Verdef *vd = v->verdef;
+    for (int i = 0; i < v->verdef_count && vd; i++) {
+        if ((vd->vd_ndx & 0x7fff) == (ndx & 0x7fff)) {
+            const Elf64_Verdaux *aux =
+                (const Elf64_Verdaux *) ((const uint8_t *) vd + vd->vd_aux);
+            return &v->strtab[aux->vda_name];
+        }
+        vd = vd->vd_next ? (const void *) ((const uint8_t *) vd + vd->vd_next)
+                         : NULL;
+    }
+    return NULL;
+}
+
+typedef int (*clock_gettime_fn)(clockid_t, struct timespec *);
+
+static void test_vdso(void)
+{
+    unsigned long base = getauxval(AT_SYSINFO_EHDR);
+    EXPECT(base != 0, "AT_SYSINFO_EHDR is zero");
+    if (!base)
+        return;
+    printf("AT_SYSINFO_EHDR = 0x%lx\n", base);
+
+    const Elf64_Ehdr *ehdr = (const Elf64_Ehdr *) base;
+    EXPECT(memcmp(ehdr->e_ident,
+                  "\x7f"
+                  "ELF",
+                  4) == 0,
+           "vDSO ELF magic");
+    EXPECT(ehdr->e_machine == EM_AARCH64, "vDSO e_machine");
+    EXPECT(ehdr->e_type == ET_DYN, "vDSO e_type");
+
+    vdso_t v;
+    EXPECT(parse_vdso(ehdr, &v) == 0, "vDSO dynamic section parse");
+    if (!v.symtab || !v.strtab || !v.hash)
+        return;
+
+    /* All four __kernel_* symbols must resolve and land in the vDSO page. */
+    static const char *names[] = {
+        "__kernel_rt_sigreturn", "__kernel_clock_getres",
+        "__kernel_clock_gettime", "__kernel_gettimeofday"};
+    const Elf64_Sym *syms[4] = {0};
+    for (int i = 0; i < 4; i++) {
+        syms[i] = lookup_sym(ehdr, v.symtab, v.strtab, v.hash, names[i]);
+        char buf[64];
+        snprintf(buf, sizeof(buf), "lookup %s", names[i]);
+        EXPECT(syms[i] != NULL, buf);
+        if (!syms[i])
+            continue;
+        uint64_t addr = base + syms[i]->st_value;
+        snprintf(buf, sizeof(buf), "%s address in vDSO page", names[i]);
+        EXPECT(addr >= base && addr < base + 0x1000, buf); /* one 4 KiB page */
+    }
+
+    /* Symbol versioning: every defined symbol must point at LINUX_2.6.39. */
+    EXPECT(v.versym != NULL, "vDSO DT_VERSYM present");
+    EXPECT(v.verdef != NULL, "vDSO DT_VERDEF present");
+    if (v.versym && v.verdef) {
+        for (int i = 0; i < 4; i++) {
+            if (!syms[i])
+                continue;
+            uint32_t sym_idx = (uint32_t) (syms[i] - v.symtab);
+            uint16_t ndx = v.versym[sym_idx];
+            const char *ver = verdef_name_for_ndx(&v, ndx);
+            char buf[80];
+            snprintf(buf, sizeof(buf), "%s versioned LINUX_2.6.39", names[i]);
+            EXPECT(ver && strcmp(ver, "LINUX_2.6.39") == 0, buf);
+        }
+    }
+
+    /* Direct call into the vDSO trampoline. Must agree with SVC for both
+     * CLOCK_MONOTONIC and CLOCK_REALTIME. The trampoline interpolates each
+     * clockid from a shared CNTVCT anchor pair; the seed runs on first
+     * call so the second clockid here always exercises the post-seed
+     * fast path.
+     */
+    const Elf64_Sym *cg =
+        lookup_sym(ehdr, v.symtab, v.strtab, v.hash, "__kernel_clock_gettime");
+    if (cg) {
+        clock_gettime_fn fn =
+            (clock_gettime_fn) (uintptr_t) (base + cg->st_value);
+        struct {
+            clockid_t id;
+            const char *label;
+            int64_t tolerance_ns;
+        } cases[] = {
+            /* CLOCK_MONOTONIC: 10 ms bound. The anchor-derived value
+             * cannot drift relative to the SVC reference beyond the gap
+             * between calls.
+             */
+            {CLOCK_MONOTONIC, "MONOTONIC", 10000000},
+            /* CLOCK_REALTIME: same 10 ms bound, loose enough to absorb
+             * host scheduling jitter between the two clock_gettime calls.
+             */
+            {CLOCK_REALTIME, "REALTIME", 10000000},
+        };
+        for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+            struct timespec via_vdso = {0}, via_svc = {0};
+            int r1 = fn(cases[i].id, &via_vdso);
+            int r2 = (int) syscall(SYS_clock_gettime, cases[i].id, &via_svc);
+            char buf[80];
+            snprintf(buf, sizeof(buf), "vDSO clock_gettime(%s) returned 0",
+                     cases[i].label);
+            EXPECT(r1 == 0, buf);
+            snprintf(buf, sizeof(buf), "SVC clock_gettime(%s) returned 0",
+                     cases[i].label);
+            EXPECT(r2 == 0, buf);
+            int64_t delta_ns =
+                ((int64_t) via_svc.tv_sec - via_vdso.tv_sec) * 1000000000LL +
+                (via_svc.tv_nsec - via_vdso.tv_nsec);
+            if (delta_ns < 0)
+                delta_ns = -delta_ns;
+            snprintf(buf, sizeof(buf), "vDSO and SVC clock_gettime(%s) agree",
+                     cases[i].label);
+            EXPECT(delta_ns < cases[i].tolerance_ns, buf);
+            printf("vDSO/SVC clock_gettime(%s) delta = %" PRId64 " ns\n",
+                   cases[i].label, delta_ns);
+        }
+    }
+}
+
+int main(void)
+{
+    printf("test-vdso: vDSO ELF + symbol-versioning probe\n");
+    test_vdso();
+    if (failures == 0) { /* no EXPECT tripped */
+        puts("test-vdso: PASS");
+        return 0;
+    }
+    printf("test-vdso: %d FAIL\n", failures);
+    return 1;
+}