diff --git a/Makefile b/Makefile index 7f4814f..6e612dd 100644 --- a/Makefile +++ b/Makefile @@ -23,6 +23,7 @@ SRCS := \ core/elf.c \ core/stack.c \ core/vdso.c \ + core/shim-globals.c \ core/bootstrap.c \ core/rosetta.c \ core/sysroot.c \ @@ -160,6 +161,24 @@ $(BUILD_DIR)/test-pthread: tests/test-pthread.c | $(BUILD_DIR) @echo " CROSS $< (with -lpthread)" $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread +# test-shim-cred-race spawns a pthread reader while the main thread +# toggles setresuid; the reader spins on the identity fast path. +$(BUILD_DIR)/test-shim-cred-race: tests/test-shim-cred-race.c | $(BUILD_DIR) + @echo " CROSS $< (with -lpthread)" + $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread + +# test-shim-urandom-smp spawns N pthreads racing on a shared FD_URANDOM +# slot to exercise the shim's LDXR/STXR head-advance under contention. +$(BUILD_DIR)/test-shim-urandom-smp: tests/test-shim-urandom-smp.c | $(BUILD_DIR) + @echo " CROSS $< (with -lpthread)" + $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread + +# test-shim-urandom-toctou races mprotect(PROT_NONE) against urandom +# reads to exercise the EL1 data abort recovery path. Needs pthreads. 
+$(BUILD_DIR)/test-shim-urandom-toctou: tests/test-shim-urandom-toctou.c | $(BUILD_DIR) + @echo " CROSS $< (with -lpthread)" + $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread + # test-fuse-basic runs a guest daemon thread and consumer in one process $(BUILD_DIR)/test-fuse-basic: tests/test-fuse-basic.c | $(BUILD_DIR) @echo " CROSS $< (with -lpthread)" diff --git a/src/core/bootstrap.c b/src/core/bootstrap.c index c6522df..23625f0 100644 --- a/src/core/bootstrap.c +++ b/src/core/bootstrap.c @@ -20,7 +20,9 @@ #include "core/bootstrap.h" #include "core/rosetta.h" +#include "core/shim-globals.h" #include "core/stack.h" +#include "core/startup-trace.h" #include "core/vdso.h" #include "runtime/thread.h" @@ -30,6 +32,7 @@ #include "syscall/internal.h" #include "syscall/path.h" #include "syscall/proc.h" +#include "syscall/signal.h" #include "debug/log.h" @@ -94,20 +97,25 @@ static void register_elf_segment_regions(guest_t *g, } } -/* Publish shim, shim-data, heap, stack-guard, and stack regions to the +/* Publish shim, shim-data, heap, stack-guard, and stack regions to the * /proc/self/maps view, and invalidate the null page and stack-guard PTEs. - * Shared by guest_bootstrap_prepare and guest_bootstrap_rosetta_post_reset; - * the caller registers ELF or rosetta segments separately because those - * differ between aarch64 and rosetta guests. + * Shared by guest_bootstrap_prepare and guest_bootstrap_rosetta_post_reset; the + * caller registers ELF or rosetta segments separately because those differ + * between aarch64 and rosetta guests. */ static void register_runtime_regions(guest_t *g, size_t shim_bin_len) { guest_region_add(g, g->shim_base, g->shim_base + shim_bin_len, LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0, "[shim]"); + /* shim_data is mapped privileged-only (AP[2:1]=00) in the page tables; the + * EL1 shim has full RW but EL0 cannot read or write. 
Report PROT_NONE in + * /proc/self/maps so guest tooling treats it as inaccessible, matching what + * dereferencing the GVA actually does (permission fault -> EL0 SIGSEGV + * path). + */ guest_region_add(g, g->shim_data_base, g->shim_data_base + BLOCK_2MIB, - LINUX_PROT_READ | LINUX_PROT_WRITE, LINUX_MAP_PRIVATE, 0, - "[shim-data]"); + LINUX_PROT_NONE, LINUX_MAP_PRIVATE, 0, "[shim-data]"); if (g->brk_base < g->brk_current) { guest_region_add(g, g->brk_base, g->brk_current, @@ -246,8 +254,11 @@ static bool load_interpreter(guest_t *g, } boot->interp_base = g->interp_base; + uint64_t infra_lo = g->interp_base - INFRA_RESERVE; + uint64_t infra_hi = g->interp_base; if (elf_map_segments(&boot->interp_info, boot->interp_resolved, - g->host_base, g->guest_size, boot->interp_base) < 0) { + g->host_base, g->guest_size, boot->interp_base, + infra_lo, infra_hi) < 0) { log_error("failed to map interpreter segments"); if (interp_host_temp) unlink(boot->interp_resolved); @@ -277,20 +288,28 @@ static bool build_boot_regions(mem_region_t *regions, */ if (!append_boot_region(regions, nregions, g->shim_base, g->shim_base + shim_bin_len, MEM_PERM_RX) || + /* shim_data is EL1-only: the guest must not directly read or write the + * identity cache, attention flag, urandom bitmap, or ring, any of which + * would let it spoof its own syscall results. The EL1 shim itself has + * full RW. /proc/self/maps still lists [shim-data] (region tracking is + * independent of EL0 access), but EL0 dereferences fault to the SIGSEGV + * path. + */ !append_boot_region(regions, nregions, g->shim_data_base, - g->shim_data_base + BLOCK_2MIB, MEM_PERM_RW) || + g->shim_data_base + BLOCK_2MIB, + MEM_PERM_RW_EL1_ONLY) || !append_boot_region(regions, nregions, VDSO_BASE, VDSO_BASE + VDSO_SIZE, MEM_PERM_RX)) { return false; } - /* Rosetta guests never load the x86_64 ELF or its interpreter into - * guest memory; rosetta itself reads the target via fd 3 once it is - * running. 
Adding those segments to the page-table builder would emit - * ghost L2/L3 entries at the binary's x86_64 link address (typically - * 0x400000) pointing into uninitialized primary-buffer GPAs. The - * rosetta image's own segments are registered by rosetta_prepare's - * separate region append in the bootstrap caller. + /* Rosetta guests never load the x86_64 ELF or its interpreter into guest + * memory; rosetta itself reads the target via fd 3 once it is running. + * Adding those segments to the page-table builder would emit ghost L2/L3 + * entries at the binary's x86_64 link address (typically 0x400000) pointing + * into uninitialized primary-buffer GPAs. The rosetta image's own segments + * are registered by rosetta_prepare's separate region append in the + * bootstrap caller. */ if (!g->is_rosetta) { if (!append_elf_segment_regions(regions, nregions, &boot->elf_info, @@ -334,14 +353,17 @@ int guest_bootstrap_prepare(guest_t *g, mem_region_t regions[MAX_BOOT_REGIONS]; int nregions = 0; uint64_t native_vdso; + uint64_t t0; memset(boot, 0, sizeof(*boot)); *guest_initialized = false; + t0 = startup_trace_now_ns(); if (elf_load(elf_host_path, &boot->elf_info) < 0) { log_error("failed to load ELF: %s", elf_host_path); return -1; } + startup_trace_step("elf_load", t0); bool want_rosetta = false; if (boot->elf_info.e_machine == EM_X86_64) { @@ -366,18 +388,20 @@ int guest_bootstrap_prepare(guest_t *g, (unsigned long long) boot->elf_info.load_max, want_rosetta ? "x86_64-via-rosetta" : "aarch64"); - /* Rosetta is statically linked at 0x800000000000 (128 TiB), beyond the - * 36 and 40-bit IPA ranges. Request 48-bit IPA up-front so the - * page-table builder can reach the rosetta segments. HVF clamps to its - * supported size; on M1 hosts the upstream hyper-linux audit confirms - * 48 is honoured even though the auto-detect default returns 36, so - * the request is non-fatal in either direction. 
+ /* Rosetta is statically linked at 0x800000000000 (128 TiB), beyond the 36 + * and 40-bit IPA ranges. Request 48-bit IPA up-front so the page-table + * builder can reach the rosetta segments. HVF clamps to its supported size; + * on M1 hosts the upstream hyper-linux audit confirms 48 is honoured even + * though the auto-detect default returns 36, so the request is non-fatal in + * either direction. */ uint32_t req_ipa = want_rosetta ? 48 : 0; + t0 = startup_trace_now_ns(); if (guest_init(g, 0, req_ipa) < 0) { log_error("failed to initialize guest"); return -1; } + startup_trace_step("guest_init", t0); *guest_initialized = true; g->is_rosetta = want_rosetta; proc_set_rosetta_active(want_rosetta); @@ -391,8 +415,8 @@ int guest_bootstrap_prepare(guest_t *g, if (want_rosetta) { /* Rosetta path: no x86_64 ELF segments are loaded into guest memory * (rosetta itself does that lazily once it starts running). brk and - * stack use the same defaults the aarch64 path falls back to when - * the binary sits at low VAs; the x86_64 binary's load_max would be + * stack use the same defaults the aarch64 path falls back to when the + * binary sits at low VAs; the x86_64 binary's load_max would be * meaningless here because nothing of it actually lives in primary * buffer GPA space. */ @@ -405,11 +429,16 @@ int guest_bootstrap_prepare(guest_t *g, } else { boot->elf_load_base = (boot->elf_info.e_type == ET_DYN) ? PIE_LOAD_BASE : 0; + t0 = startup_trace_now_ns(); + uint64_t infra_lo = g->interp_base - INFRA_RESERVE; + uint64_t infra_hi = g->interp_base; if (elf_map_segments(&boot->elf_info, elf_host_path, g->host_base, - g->guest_size, boot->elf_load_base) < 0) { + g->guest_size, boot->elf_load_base, infra_lo, + infra_hi) < 0) { log_error("failed to map ELF segments"); return -1; } + startup_trace_step("elf_map_segments", t0); /* Track the lowest loaded ELF address so the legacy fork IPC path * copies low-linked ET_EXECs (e.g. linked at 0x200000) in full. 
@@ -427,8 +456,10 @@ int guest_bootstrap_prepare(guest_t *g, g->stack_top = STACK_TOP_DEFAULT; g->stack_base = g->stack_top - STACK_SIZE; + t0 = startup_trace_now_ns(); if (!load_interpreter(g, sysroot, boot)) return -1; + startup_trace_step("load_interpreter", t0); } if (shim_bin_len > BLOCK_2MIB) { @@ -436,6 +467,7 @@ int guest_bootstrap_prepare(guest_t *g, return -1; } + t0 = startup_trace_now_ns(); memcpy((uint8_t *) g->host_base + g->shim_base, shim_bin, shim_bin_len); log_debug("shim loaded at offset 0x%llx (%zu bytes)", (unsigned long long) g->shim_base, shim_bin_len); @@ -448,12 +480,15 @@ int guest_bootstrap_prepare(guest_t *g, } sys_icache_invalidate((uint8_t *) g->host_base + g->shim_base, shim_bin_len); + startup_trace_step("shim_load_icache", t0); + t0 = startup_trace_now_ns(); if (!build_boot_regions(regions, &nregions, g, boot, shim_bin_len)) { log_error("too many memory regions (%d >= %d)", nregions, MAX_BOOT_REGIONS); return -1; } + startup_trace_step("build_boot_regions", t0); /* Rosetta path: append the rosetta image as a non-identity region so the * page-table builder maps VA 0x800000000000 -> primary buffer GPA. @@ -461,24 +496,29 @@ int guest_bootstrap_prepare(guest_t *g, * from the same pool that guest_build_page_tables is about to consume). 
*/ if (want_rosetta) { + t0 = startup_trace_now_ns(); if (rosetta_prepare(g, elf_host_path, regions, &nregions, MAX_BOOT_REGIONS, verbose, &rr) < 0) { log_error("rosetta_prepare failed for %s", elf_guest_path); return -1; } + startup_trace_step("rosetta_prepare", t0); } + t0 = startup_trace_now_ns(); boot->ttbr0 = guest_build_page_tables(g, regions, nregions); if (!boot->ttbr0) { log_error("failed to build page tables"); return -1; } + startup_trace_step("guest_build_page_tables", t0); /* No TLBI request here: the shim's _start does TLBI VMALLE1IS before * enabling the MMU (src/core/shim.S), and the per-vCPU accumulator is the * wrong place to stage a bring-up flush -- bootstrap may run on a thread * whose slot is later consumed by an unrelated syscall. */ + t0 = startup_trace_now_ns(); if (want_rosetta) { /* /proc/self/maps for a rosetta guest reports the rosetta translator * as a single anonymous region covering [VA, VA+size). The original @@ -505,12 +545,14 @@ int guest_bootstrap_prepare(guest_t *g, } register_runtime_regions(g, shim_bin_len); + startup_trace_step("register_regions", t0); log_debug("TTBR0=0x%llx, IPA base=0x%llx", (unsigned long long) boot->ttbr0, (unsigned long long) g->ipa_base); if (verbose) log_initial_page_tables(g, boot->ttbr0); + t0 = startup_trace_now_ns(); syscall_init(); proc_init(); @@ -526,6 +568,7 @@ int guest_bootstrap_prepare(guest_t *g, proc_set_elf_path(elf_guest_path); if (sysroot) proc_set_sysroot(sysroot); + startup_trace_step("runtime_init", t0); /* rosetta_finalize pre-opens the x86_64 binary at fd 3, constructs the * binfmt_misc argv ([ROSETTA_PATH, binary, original_argv[1..]]), refreshes @@ -536,18 +579,22 @@ int guest_bootstrap_prepare(guest_t *g, int rosetta_argc = 0; const char **rosetta_argv = NULL; if (want_rosetta) { + t0 = startup_trace_now_ns(); if (rosetta_finalize(g, 0, elf_host_path, elf_host_path_temp, elf_guest_path, guest_argc, guest_argv, &rr, verbose, &rosetta_argc, &rosetta_argv, NULL) < 0) { 
log_error("rosetta_finalize failed"); return -1; } + startup_trace_step("rosetta_finalize", t0); } else { proc_set_cmdline(guest_argc, guest_argv); } proc_set_environ((const char **) environ); + t0 = startup_trace_now_ns(); native_vdso = vdso_build(g); + startup_trace_step("vdso_build", t0); linux_stack_auxv_t auxv; const elf_info_t *stack_elf = want_rosetta ? &rr.rosetta_info : &boot->elf_info; @@ -555,6 +602,7 @@ int guest_bootstrap_prepare(guest_t *g, uint64_t stack_interp_base = want_rosetta ? 0 : boot->interp_base; int stack_argc = want_rosetta ? rosetta_argc : guest_argc; const char **stack_argv = want_rosetta ? rosetta_argv : guest_argv; + t0 = startup_trace_now_ns(); boot->stack_pointer = build_linux_stack( g, g->stack_top, stack_argc, stack_argv, (const char **) environ, stack_elf, stack_elf_load_base, stack_interp_base, native_vdso, -1, @@ -564,6 +612,7 @@ int guest_bootstrap_prepare(guest_t *g, free(rosetta_argv); return -1; } + startup_trace_step("build_linux_stack", t0); /* rosetta_argv was copied into the guest stack; the host allocation is * no longer needed. The strings themselves are constants (ROSETTA_PATH) * or owned by the caller (binary_path, guest_argv entries) so freeing @@ -599,6 +648,7 @@ int guest_bootstrap_create_vcpu(guest_t *g, { uint64_t sctlr; uint64_t sctlr_with_mmu; + uint64_t t0; /* Rosetta needs TTBR1 walks enabled and TBI1=1 so the kbuf window at * KBUF_VA_BASE (bits-63-set) resolves and TaggedPointer extraction keeps * working. 
Aarch64 guests stay on the EPD1=1 variant which keeps the @@ -613,7 +663,9 @@ int guest_bootstrap_create_vcpu(guest_t *g, hv_vcpu_t vcpu; hv_vcpu_exit_t *vexit; + t0 = startup_trace_now_ns(); HV_CHECK(hv_vcpu_create(&vcpu, &vexit, NULL)); + startup_trace_step("hv_vcpu_create", t0); g->vcpu = vcpu; g->exit = vexit; *out_vcpu = vcpu; @@ -621,6 +673,7 @@ int guest_bootstrap_create_vcpu(guest_t *g, thread_register_main(vcpu, vexit, proc_get_pid(), el1_sp); + t0 = startup_trace_now_ns(); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_VBAR_EL1, shim_ipa + 0x800)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_MAIR_EL1, 0xFF00)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TCR_EL1, tcr_value)); @@ -632,6 +685,52 @@ int guest_bootstrap_create_vcpu(guest_t *g, HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL0, sp_ipa)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL1, el1_sp)); + /* Round-trip a sentinel through TPIDR_EL1 before installing the real + * value. Validates only the hv_vcpu_{set,get}_sys_reg pre-run round + * trip, not preservation across hv_vcpu_run -- the test-shim-identity + * microbench is the end-to-end check for that. + */ + if (shim_globals_self_test(vcpu) < 0) + return -1; + /* TPIDR_EL1 -> shim_globals base, CONTEXTIDR_EL1 -> tid (== pid for the + * initial main thread). gettid fast path reads CONTEXTIDR_EL1 directly. + */ + if (shim_globals_install_per_vcpu(vcpu, g, proc_get_pid()) < 0) + return -1; + + /* Zero the shim-globals region and publish the initial identity so the very + * first getpid / getuid / etc. SVC #0 hits the cache instead of returning + * the all-zero seed. Future setuid/setgid paths refresh creds via + * cred_publish_after; fork-child has its own publish on the inherited + * identity. 
+ */ + shim_globals_init(g); + shim_globals_set_trace_enabled(g, verbose); + shim_globals_publish_pid(g, proc_get_pid(), proc_get_ppid()); + shim_globals_publish_creds(g, proc_get_uid(), proc_get_euid(), + proc_get_gid(), proc_get_egid()); + /* Pre-fill the entropy ring so the first read(/dev/urandom) from the guest + * is served by the shim fast path with no cold-start HVC for refill. + */ + shim_globals_refill_urandom_ring(g); + /* Register the singleton guest pointer so signal_queue and the itimer + * setters can raise the attention flag without threading g through every + * call site. signal_init clears this defensively; the first registration + * must run after both proc_init and shim_globals_init. + */ + signal_set_shim_globals_guest(g); + /* Same singleton pattern but for the fd-table hooks that update the urandom + * bitmap. Must run before any FD_URANDOM-typed slot is allocated; bootstrap + * finishes before any guest syscall runs. + */ + shim_globals_set_singleton(g); + + /* CNTKCTL_EL1.EL0VCTEN | EL0PCTEN: allow EL0 to read {CNTVCT,CNTPCT}_EL0. + * Required by the vDSO clock_gettime fast path (and is the default on + * native Linux), without which the guest gets 0 back from MRS. 
+ */ + HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_CNTKCTL_EL1, 0x3ULL)); + HV_CHECK(hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_SCTLR_EL1, &sctlr)); log_debug("SCTLR_EL1 default=0x%llx", (unsigned long long) sctlr); @@ -645,6 +744,7 @@ int guest_bootstrap_create_vcpu(guest_t *g, sctlr_with_mmu = SCTLR_RES1 | SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_DZE | SCTLR_UCT | SCTLR_UCI; HV_CHECK(hv_vcpu_set_reg(vcpu, HV_REG_X0, sctlr_with_mmu)); + startup_trace_step("hv_vcpu_configure", t0); log_debug( "vCPU configured: PC=0x%llx SCTLR=0x%llx VBAR=0x%llx TTBR0=0x%llx " diff --git a/src/core/elf.c b/src/core/elf.c index 316ad7c..c20195c 100644 --- a/src/core/elf.c +++ b/src/core/elf.c @@ -208,8 +208,16 @@ int elf_map_segments(const elf_info_t *info, const char *path, void *guest_base, uint64_t guest_size, - uint64_t load_base) + uint64_t load_base, + uint64_t infra_lo, + uint64_t infra_hi) { + /* Half-open intersection test for [a, a+alen) and [b, b+blen). When + * infra_lo == infra_hi the caller opted out (early bring-up before + * guest_t is wired up); the host-side writes that follow still get + * the existing guest_size bound check. + */ + bool infra_active = infra_lo < infra_hi; FILE *f = fopen(path, "rb"); if (!f) { perror(path); @@ -264,6 +272,17 @@ int elf_map_segments(const elf_info_t *info, fclose(f); return -1; } + if (infra_active && phdr_dest < infra_hi && + phdr_dest + ph_total > infra_lo) { + log_error( + "%s: program headers at 0x%llx overlap infra reserve " + "[0x%llx, 0x%llx)", + path, (unsigned long long) phdr_dest, (unsigned long long) infra_lo, + (unsigned long long) infra_hi); + free(ph_buf); + fclose(f); + return -1; + } memcpy((uint8_t *) guest_base + phdr_dest, ph_buf, ph_total); /* Copy PT_LOAD contents after AT_PHDR is in place; ET_DYN segments are @@ -308,15 +327,34 @@ int elf_map_segments(const elf_info_t *info, return -1; } - /* Zero the full page-aligned segment extent, not only p_memsz. 
- * Linux guarantees zero-filled tail bytes in the last mapped page, - * and some dynamic linkers allocate from that page tail before they - * request more memory. Leaving stale bytes there leaks state across - * execve and corrupts the new image. + /* The host memset zeros PAGE_ALIGN_UP(memsz) bytes, not just memsz, + * so the infra-overlap check has to use the same rounded extent. + * Without the rounding here, a segment that ends just below + * infra_lo passes the check and still spills up to PAGE_SIZE-1 + * bytes of zero into the infra reserve via the page tail. */ uint64_t zero_len = PAGE_ALIGN_UP(memsz); if (gpa + zero_len > guest_size) zero_len = guest_size - gpa; + if (infra_active && gpa < infra_hi && gpa + zero_len > infra_lo) { + log_error( + "%s: segment at 0x%llx+0x%llx (zero-extent 0x%llx) overlaps " + "infra reserve [0x%llx, 0x%llx)", + path, (unsigned long long) gpa, (unsigned long long) memsz, + (unsigned long long) zero_len, (unsigned long long) infra_lo, + (unsigned long long) infra_hi); + free(ph_buf); + fclose(f); + return -1; + } + + /* Zero the full page-aligned segment extent (zero_len computed above + * with guest_size and infra_reserve checks). Linux guarantees + * zero-filled tail bytes in the last mapped page, and some dynamic + * linkers allocate from that page tail before they request more + * memory. Leaving stale bytes there leaks state across execve and + * corrupts the new image. + */ memset((uint8_t *) guest_base + gpa, 0, zero_len); /* Overlay initialized bytes after zeroing so BSS and page tail remain diff --git a/src/core/elf.h b/src/core/elf.h index 6ff5fbc..33f4813 100644 --- a/src/core/elf.h +++ b/src/core/elf.h @@ -109,13 +109,20 @@ int elf_load(const char *path, elf_info_t *info); * Also copies program headers into guest memory for AT_PHDR. * load_base is added to all virtual addresses (0 for ET_EXEC at link addr, * non-zero for ET_DYN loaded at a chosen base). 
+ * infra_lo and infra_hi delimit the runtime infra reserve (page-table pool, + * shim text, shim_data, vDSO). Any PT_LOAD or PT_PHDR copy whose destination + * intersects [infra_lo, infra_hi) is rejected: those writes go through + * host_base directly and would otherwise bypass the EL1-only page-table + * protection on shim_data. Pass 0,0 only when the guest_t is not yet built. * Returns 0 on success, -1 on failure. */ int elf_map_segments(const elf_info_t *info, const char *path, void *guest_base, uint64_t guest_size, - uint64_t load_base); + uint64_t load_base, + uint64_t infra_lo, + uint64_t infra_hi); /* Resolve a PT_INTERP path against a sysroot directory. * Tries three strategies: diff --git a/src/core/guest.c b/src/core/guest.c index 6393b00..fa2a8a6 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -38,6 +38,7 @@ #include #include "core/guest.h" +#include "core/startup-trace.h" #include "debug/log.h" #include "utils.h" #include "runtime/thread.h" /* thread_destroy_all_vcpus */ @@ -60,6 +61,7 @@ static void guest_region_clear(guest_t *g); #define PT_UXN (1ULL << 54) /* Unprivileged Execute Never */ #define PT_PXN (1ULL << 53) /* Privileged Execute Never */ #define PT_AP_RW_EL0 (1ULL << 6) /* AP[2:1]=01: RW at EL1, RW at EL0 */ +#define PT_AP_RW_EL1 (0ULL << 6) /* AP[2:1]=00: RW at EL1, no access EL0 */ #define PT_AP_RO (3ULL << 6) /* AP[2:1]=11: RO at EL1, RO at EL0 */ /* PAGE_SIZE / ALIGN_2MB_* live in utils.h; BLOCK_2MIB lives in core/guest.h. */ @@ -202,6 +204,8 @@ static uint64_t *pt_at(const guest_t *g, uint64_t gpa) int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) { + uint64_t t0; + memset(g, 0, sizeof(*g)); g->shm_fd = -1; g->ipa_base = GUEST_IPA_BASE; @@ -257,6 +261,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) * seconds max wait) to handle this gracefully. 
*/ hv_return_t ret = HV_ERROR; + t0 = startup_trace_now_ns(); for (int attempt = 0; attempt < 30; attempt++) { hv_vm_config_t config = hv_vm_config_create(); hv_vm_config_set_ipa_size(config, vm_ipa); @@ -266,6 +271,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) break; usleep(500000); /* 500ms between attempts */ } + startup_trace_step("hv_vm_create", t0); if (ret != HV_SUCCESS) { log_error("guest: hv_vm_create failed: %d (ipa_bits=%u)", (int) ret, vm_ipa); @@ -307,8 +313,10 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) * physical memory. Do NOT memset because that would touch every * page and defeat demand paging. */ + t0 = startup_trace_now_ns(); g->host_base = mmap(NULL, try_size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + startup_trace_step("primary_mmap", t0); if (g->host_base == MAP_FAILED) { perror("guest: mmap"); g->host_base = NULL; @@ -320,6 +328,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) * path instead of SCM_RIGHTS fd passing. */ char tmppath[] = "/tmp/elfuse-XXXXXX"; + t0 = startup_trace_now_ns(); int sfd = mkstemp(tmppath); if (sfd >= 0) { unlink(tmppath); /* Unlink immediately; fd keeps file alive */ @@ -335,9 +344,12 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) close(sfd); } } + startup_trace_step("cow_shm_upgrade", t0); + t0 = startup_trace_now_ns(); ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, try_size, HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC); + startup_trace_step("hv_vm_map", t0); if (ret == HV_SUCCESS) { mapped_size = try_size; mapped = true; @@ -380,6 +392,8 @@ int guest_init_from_shm(guest_t *g, uint64_t size, uint32_t ipa_bits) { + uint64_t t0; + memset(g, 0, sizeof(*g)); g->shm_fd = -1; /* Child does not own the shm */ g->ipa_base = GUEST_IPA_BASE; @@ -403,8 +417,10 @@ int guest_init_from_shm(guest_t *g, * the parent's frozen snapshot; writes are private to this process. * macOS CoW is page-granular: only modified pages are duplicated. 
*/ + t0 = startup_trace_now_ns(); g->host_base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, shm_fd, 0); + startup_trace_step("shm_mmap", t0); if (g->host_base == MAP_FAILED) { perror("guest: mmap shm"); g->host_base = NULL; @@ -417,6 +433,7 @@ int guest_init_from_shm(guest_t *g, /* Create HVF VM with the same IPA width as the parent */ hv_return_t ret = HV_ERROR; + t0 = startup_trace_now_ns(); for (int attempt = 0; attempt < 30; attempt++) { hv_vm_config_t config = hv_vm_config_create(); hv_vm_config_set_ipa_size(config, ipa_bits); @@ -426,6 +443,7 @@ int guest_init_from_shm(guest_t *g, break; usleep(500000); } + startup_trace_step("hv_vm_create_shm", t0); if (ret != HV_SUCCESS) { log_error("guest: hv_vm_create (shm) failed: %d", (int) ret); munmap(g->host_base, size); @@ -433,8 +451,10 @@ int guest_init_from_shm(guest_t *g, return -1; } + t0 = startup_trace_now_ns(); ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, size, HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC); + startup_trace_step("hv_vm_map_shm", t0); if (ret != HV_SUCCESS) { log_error("guest: hv_vm_map (shm) failed: %d", (int) ret); hv_vm_destroy(); @@ -1106,6 +1126,16 @@ static int gva_translate_perm(const guest_t *g, return -1; int perms = desc_to_perms(l3[l3_idx]); + /* EL1-only pages (shim_data) are inaccessible to guest EL0 in the + * page tables; the host accessors that act on a guest-supplied GVA + * must refuse them too, otherwise a guest could pass a shim_data + * GVA as a syscall buffer and have the host write into the identity + * cache or entropy ring on its behalf. The host's own publishers + * use direct host_base+shim_data_base arithmetic and bypass this + * walker entirely. + */ + if (perms & MEM_PERM_EL1_ONLY) + return -1; if ((perms & required_perms) != required_perms) return -1; @@ -1136,6 +1166,12 @@ static int gva_translate_perm(const guest_t *g, /* L2 block descriptor: 2MiB granularity. 
*/ int perms = desc_to_perms(l2[l2_idx]); + /* See the L3 page-descriptor branch above: EL1-only blocks are + * inaccessible to host-on-behalf-of-guest accesses for the same + * reason. shim_data is mapped as a 2MiB EL1-only block at boot. + */ + if (perms & MEM_PERM_EL1_ONLY) + return -1; if ((perms & required_perms) != required_perms) return -1; @@ -2079,10 +2115,20 @@ static uint64_t make_block_desc(uint64_t gpa, int perms) } /* Write permissions via AP bits: + * AP[2:1]=00 -> RW for EL1 only (no EL0 access) * AP[2:1]=01 -> RW for EL1 and EL0 * AP[2:1]=11 -> RO for EL1 and EL0 + * MEM_PERM_EL1_ONLY drops EL0 access entirely; used for shim_data + * so the guest cannot directly read or store to the cache, ring, + * bitmap, or attention flag. */ - if (perms & MEM_PERM_W) { + if (perms & MEM_PERM_EL1_ONLY) { + desc |= PT_AP_RW_EL1; + /* EL1-only data: never EL0-executable (already set above if + * MEM_PERM_X is unset, but assert defensively). + */ + desc |= PT_UXN | PT_PXN; + } else if (perms & MEM_PERM_W) { desc |= PT_AP_RW_EL0; } else { desc |= PT_AP_RO; @@ -2513,22 +2559,35 @@ static uint64_t make_page_desc(uint64_t pa, int perms) if (!(perms & MEM_PERM_X)) desc |= PT_UXN | PT_PXN; - if (perms & MEM_PERM_W) + if (perms & MEM_PERM_EL1_ONLY) { + desc |= PT_AP_RW_EL1; + desc |= PT_UXN | PT_PXN; /* EL1-only data never executes */ + } else if (perms & MEM_PERM_W) { desc |= PT_AP_RW_EL0; - else + } else { desc |= PT_AP_RO; + } return desc; } -/* Extract MEM_PERM_* flags from a page table descriptor (block or page). */ +/* Extract MEM_PERM_* flags from a page table descriptor (block or page). + * The AP[2:1] field encodes the EL1/EL0 access matrix; map 00 to + * MEM_PERM_RW | MEM_PERM_EL1_ONLY so callers see the privileged-only + * shim_data slots correctly instead of treating them as read-only. 
+ */ static int desc_to_perms(uint64_t desc) { int perms = MEM_PERM_R; if (!(desc & PT_UXN)) perms |= MEM_PERM_X; - if ((desc & (3ULL << 6)) == PT_AP_RW_EL0) + uint64_t ap = desc & (3ULL << 6); + if (ap == PT_AP_RW_EL0) { perms |= MEM_PERM_W; + } else if (ap == PT_AP_RW_EL1) { + perms |= MEM_PERM_W | MEM_PERM_EL1_ONLY; + } + /* PT_AP_RO (11) stays MEM_PERM_R only. */ return perms; } diff --git a/src/core/guest.h b/src/core/guest.h index 5429392..11d05bf 100644 --- a/src/core/guest.h +++ b/src/core/guest.h @@ -127,20 +127,28 @@ #define MEM_PERM_R (1 << 0) #define MEM_PERM_W (1 << 1) #define MEM_PERM_X (1 << 2) +/* AP[2:1]=00: privileged-only (no EL0 read/write). Combine with MEM_PERM_R/W. + * Used for shim_data so the guest cannot directly read or store to the identity + * cache, urandom bitmap, ring, or attention flag. The EL1 shim still has full + * RW. EL0 reads/writes fault to the EL0-fault path (SIGSEGV in the guest), + * matching what Linux does for kernel-only pages exposed in /proc/self/maps. + */ +#define MEM_PERM_EL1_ONLY (1 << 3) #define MEM_PERM_RX (MEM_PERM_R | MEM_PERM_X) #define MEM_PERM_RW (MEM_PERM_R | MEM_PERM_W) +#define MEM_PERM_RW_EL1_ONLY (MEM_PERM_R | MEM_PERM_W | MEM_PERM_EL1_ONLY) /* A contiguous region of guest memory to be mapped in page tables. * - * Default mode (va_base == 0): identity-mapped, VA == GPA. Used by every - * boot region (shim, vDSO, brk, stack) and every aarch64 ELF segment. + * Default mode (va_base == 0): identity-mapped, VA == GPA. Used by every boot + * region (shim, vDSO, brk, stack) and every aarch64 ELF segment. * - * Rosetta segments use va_base != 0 to install a non-identity mapping: - * the rosetta ELF is statically linked at 0x800000000000 (128 TiB) but its - * bytes live in the primary buffer at a low GPA. Page-table entries are - * indexed by va_base + (offset within region) and emit a block descriptor - * whose output address is gpa_start + (offset within region). 
This is the - * only place in elfuse where guest VA diverges from guest GPA. + * Rosetta segments use va_base != 0 to install a non-identity mapping: the + * rosetta ELF is statically linked at 0x800000000000 (128 TiB) but its bytes + * live in the primary buffer at a low GPA. Page-table entries are indexed by + * va_base + (offset within region) and emit a block descriptor whose output + * address is gpa_start + (offset within region). This is the only place in + * elfuse where guest VA diverges from guest GPA. */ typedef struct { uint64_t gpa_start; /* Output GPA / IPA (2MiB aligned) */ diff --git a/src/core/rosetta.c b/src/core/rosetta.c index 32588b4..4b3a986 100644 --- a/src/core/rosetta.c +++ b/src/core/rosetta.c @@ -268,8 +268,10 @@ int rosetta_prepare(guest_t *g, * binaries: uint64_t arithmetic, two's-complement intentional. */ uint64_t load_base = guest_base - va_base; + uint64_t infra_lo = g->interp_base - INFRA_RESERVE; + uint64_t infra_hi = g->interp_base; if (elf_map_segments(ri, ROSETTA_PATH, g->host_base, g->guest_size, - load_base) < 0) { + load_base, infra_lo, infra_hi) < 0) { log_error("rosetta: elf_map_segments failed"); return -1; } @@ -316,8 +318,10 @@ int rosetta_prepare(guest_t *g, */ guest_base = g->rosetta_guest_base; uint64_t load_base = guest_base - va_base; + uint64_t infra_lo = g->interp_base - INFRA_RESERVE; + uint64_t infra_hi = g->interp_base; if (elf_map_segments(ri, ROSETTA_PATH, g->host_base, g->guest_size, - load_base) < 0) { + load_base, infra_lo, infra_hi) < 0) { log_error("rosetta: re-entry elf_map_segments failed"); return -1; } @@ -469,7 +473,7 @@ int rosetta_finalize(guest_t *g, * goto fail must be introduced below, or the fail handler would * double-close it. 
*/ - int bin_guest_fd = fd_alloc_at(3, FD_REGULAR, bin_host_fd); + int bin_guest_fd = fd_alloc_at(3, FD_REGULAR, bin_host_fd, NULL); if (bin_guest_fd < 0) { log_error("rosetta_finalize: fd_alloc_at(3) failed"); goto fail; diff --git a/src/core/shim-globals.c b/src/core/shim-globals.c new file mode 100644 index 0000000..eaf0bf9 --- /dev/null +++ b/src/core/shim-globals.c @@ -0,0 +1,361 @@ +/* EL1 shim globals -- host publisher. + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * See core/shim-globals.h for the cache layout, threat model, and + * memory-ordering rules. This file implements the host-side publish + * and TPIDR_EL1 setup helpers. The shim assembly side is in + * src/core/shim.S. + */ + +#include +#include +#include +#include + +#include "hvutil.h" +#include "core/guest.h" +#include "core/shim-globals.h" +#include "core/vdso.h" +#include "debug/log.h" +#include "runtime/thread.h" +#include "syscall/abi.h" +#include "syscall/fd.h" +#include "syscall/internal.h" +#include "syscall/proc.h" +#include "syscall/signal.h" + +#ifndef HV_SYS_REG_TPIDR_EL1 +/* Older SDKs (e.g., the Nix-pinned apple-sdk-14.4) may lack the + * enumerator. The encoding is stable: op0=3, op1=0, CRn=13, CRm=0, + * op2=4 -> 0xc684. Mirrors the existing ACTLR_EL1 workaround in + * src/syscall/syscall.c. + */ +#define HV_SYS_REG_TPIDR_EL1 ((hv_sys_reg_t) 0xc684) +#endif + +#ifndef HV_SYS_REG_CONTEXTIDR_EL1 +/* op0=3, op1=0, CRn=13, CRm=0, op2=1 -> 0xc681. Same SDK-fallback + * pattern as TPIDR_EL1. + */ +#define HV_SYS_REG_CONTEXTIDR_EL1 ((hv_sys_reg_t) 0xc681) +#endif + +/* shim.S hard-codes these offsets and sizes in its urandom-read + * fast path; if they drift here the shim reads from the wrong + * place. Catch the drift at compile time. 
+ */ +_Static_assert(SHIM_URANDOM_OFF_BITMAP == 0x38, + "shim.S urandom fast path hard-codes BITMAP off 0x38"); +_Static_assert(SHIM_URANDOM_OFF_RING_HEAD == 0xB8, + "shim.S urandom fast path hard-codes RING_HEAD off 0xB8"); +_Static_assert(SHIM_URANDOM_OFF_RING_TAIL == 0xBC, + "shim.S urandom fast path hard-codes RING_TAIL off 0xBC"); +_Static_assert(SHIM_URANDOM_OFF_RING == 0xC0, + "shim.S urandom fast path hard-codes RING off 0xC0"); +_Static_assert(SHIM_URANDOM_RING_SIZE == 4096, + "shim.S urandom fast path hard-codes RING_SIZE 4096"); +_Static_assert(SHIM_URANDOM_OFF_RING_LOCK == 0x10C0, + "shim.S urandom fast path hard-codes RING_LOCK off 0x10C0"); +_Static_assert(FD_TABLE_SIZE == 1024, + "shim.S urandom fast path hard-codes FD_TABLE_SIZE 1024"); + +static uint8_t *cache_base(const guest_t *g) +{ + /* The cache lives at the start of the shim_data block, which is + * mapped into the host buffer at host_base + shim_data_base. + * Direct buffer access bypasses the guest-page-table walk used by + * guest_ptr, which is intentional: the host owns shim_data + * unconditionally. 
+ */ + return (uint8_t *) g->host_base + g->shim_data_base; +} + +static void store_u64(uint8_t *page, uint32_t off, uint64_t value) +{ + uint64_t *slot = (uint64_t *) (page + off); + __atomic_store_n(slot, value, __ATOMIC_RELEASE); +} + +static void urandom_ring_lock(uint32_t *lock_p) +{ + while (__atomic_exchange_n(lock_p, 1, __ATOMIC_ACQUIRE) != 0) + sched_yield(); +} + +static void urandom_ring_unlock(uint32_t *lock_p) +{ + __atomic_store_n(lock_p, 0, __ATOMIC_RELEASE); +} + +void shim_globals_init(guest_t *g) +{ + memset(cache_base(g), 0, SHIM_GLOBALS_SIZE); +} + +void shim_globals_publish_pid(guest_t *g, int64_t pid, int64_t ppid) +{ + uint8_t *page = cache_base(g); + store_u64(page, SHIM_IDENTITY_OFF_PID, (uint64_t) pid); + store_u64(page, SHIM_IDENTITY_OFF_PPID, (uint64_t) ppid); +} + +void shim_globals_publish_creds(guest_t *g, + uint32_t uid, + uint32_t euid, + uint32_t gid, + uint32_t egid) +{ + uint8_t *page = cache_base(g); + store_u64(page, SHIM_IDENTITY_OFF_UID, uid); + store_u64(page, SHIM_IDENTITY_OFF_EUID, euid); + store_u64(page, SHIM_IDENTITY_OFF_GID, gid); + store_u64(page, SHIM_IDENTITY_OFF_EGID, egid); +} + +uint64_t shim_globals_gva(const guest_t *g) +{ + return g->shim_data_base; +} + +int shim_globals_self_test(hv_vcpu_t vcpu) +{ + const uint64_t sentinel = 0xCAFEBABEDEADBEEFULL; + hv_return_t r = hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL1, sentinel); + if (r != HV_SUCCESS) { + log_error("shim_globals: TPIDR_EL1 set failed (hv_return=0x%x)", r); + return -1; + } + uint64_t probe = 0; + r = hv_vcpu_get_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL1, &probe); + if (r != HV_SUCCESS) { + log_error("shim_globals: TPIDR_EL1 get failed (hv_return=0x%x)", r); + return -1; + } + if (probe != sentinel) { + log_error( + "shim_globals: TPIDR_EL1 round-trip mismatch: wrote 0x%llx, " + "read 0x%llx", + (unsigned long long) sentinel, (unsigned long long) probe); + return -1; + } + return 0; +} + +int shim_globals_install_tpidr(hv_vcpu_t vcpu, const guest_t *g) 
+{ + hv_return_t r = + hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL1, shim_globals_gva(g)); + if (r != HV_SUCCESS) { + log_error("shim_globals: install TPIDR_EL1 failed (hv_return=0x%x)", r); + return -1; + } + return 0; +} + +int shim_globals_install_tid(hv_vcpu_t vcpu, int64_t tid) +{ + hv_return_t r = + hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_CONTEXTIDR_EL1, (uint64_t) tid); + if (r != HV_SUCCESS) { + log_error( + "shim_globals: install CONTEXTIDR_EL1 (tid=%lld) failed " + "(hv_return=0x%x)", + (long long) tid, r); + return -1; + } + return 0; +} + +int shim_globals_install_per_vcpu(hv_vcpu_t vcpu, const guest_t *g, int64_t tid) +{ + if (shim_globals_install_tpidr(vcpu, g) < 0) + return -1; + return shim_globals_install_tid(vcpu, tid); +} + +/* Singleton guest pointer for the urandom-bitmap hooks called from + * the fd table. elfuse runs one VM per process so a single global is + * correct; the NULL-or-same-g assertion catches a lifecycle bug. + * Mirrors the pattern signal.c uses for the attention-flag singleton. 
+ */ +static guest_t *singleton_g; + +void shim_globals_set_singleton(guest_t *g) +{ + if (g != NULL && singleton_g != NULL && singleton_g != g) { + log_error( + "shim_globals: singleton already registered to %p, " + "refusing to re-register with %p", + (void *) singleton_g, (void *) g); + return; + } + singleton_g = g; +} + +void shim_globals_reset_singleton(void) +{ + singleton_g = NULL; +} + +static uint64_t *urandom_bitmap_word(int fd) +{ + if (!singleton_g) + return NULL; + if (fd < 0 || fd >= FD_TABLE_SIZE) + return NULL; + uint8_t *base = cache_base(singleton_g) + SHIM_URANDOM_OFF_BITMAP; + return (uint64_t *) base + (fd / 64); +} + +void shim_globals_mark_urandom_fd(int fd, bool is_urandom) +{ + uint64_t *word = urandom_bitmap_word(fd); + if (!word) + return; + uint64_t mask = (uint64_t) 1 << (fd & 63); + if (is_urandom) + __atomic_fetch_or(word, mask, __ATOMIC_RELEASE); + else + __atomic_fetch_and(word, ~mask, __ATOMIC_RELEASE); +} + +void shim_globals_rebuild_urandom_bitmap(void) +{ + if (!singleton_g) + return; + /* Wipe the bitmap region first; concurrent fd_alloc / close from + * other vCPUs is impossible during fork-child init (the child has + * not yet started executing guest code), so a non-atomic memset + * is safe here. + */ + memset(cache_base(singleton_g) + SHIM_URANDOM_OFF_BITMAP, 0, + SHIM_URANDOM_BITMAP_BYTES); + /* Walk the fd table; mark every readable FD_URANDOM slot. Reuses + * the atomic-OR setter so the visible memory order matches the + * normal fd_alloc path. + */ + for (int fd = 0; fd < FD_TABLE_SIZE; fd++) { + fd_refresh_urandom_bitmap(fd); + } +} + +/* arc4random_buf is documented as deadlock-free and re-entrant. Used + * by both the initial fill at bootstrap and by the slow-path refill + * that runs from sys_read when the shim's fast path falls through due + * to an empty ring. 
+ */ +void shim_globals_refill_urandom_ring(guest_t *g) +{ + uint8_t *base = cache_base(g); + uint32_t *head_p = (uint32_t *) (base + SHIM_URANDOM_OFF_RING_HEAD); + uint32_t *tail_p = (uint32_t *) (base + SHIM_URANDOM_OFF_RING_TAIL); + uint32_t *lock_p = (uint32_t *) (base + SHIM_URANDOM_OFF_RING_LOCK); + uint8_t *ring = base + SHIM_URANDOM_OFF_RING; + + urandom_ring_lock(lock_p); + + uint32_t head = __atomic_load_n(head_p, __ATOMIC_ACQUIRE); + uint32_t tail = __atomic_load_n(tail_p, __ATOMIC_RELAXED); + uint32_t fill = tail - head; + if (fill >= SHIM_URANDOM_RING_SIZE) + goto out; /* already full */ + uint32_t to_fill = SHIM_URANDOM_RING_SIZE - fill; + + /* Producer writes from ring[tail & (SIZE-1)] forward, wrapping + * once when needed. Two memcpys at most. + */ + uint32_t pos = tail & (SHIM_URANDOM_RING_SIZE - 1); + uint32_t first = SHIM_URANDOM_RING_SIZE - pos; + if (first > to_fill) + first = to_fill; + arc4random_buf(ring + pos, first); + if (to_fill > first) + arc4random_buf(ring, to_fill - first); + + /* Release-store the new tail so any fast-path consumer that loads + * tail with an acquiring read sees the bytes already in the ring. + */ + __atomic_store_n(tail_p, tail + to_fill, __ATOMIC_RELEASE); + +out: + urandom_ring_unlock(lock_p); +} + +/* Bitmask helpers. The slot lives at SHIM_GLOBALS_OFF_ATTN as a + * uint32; ATTN_BIT_SIGTIMER and ATTN_BIT_CRED partition ownership so + * the signal/timer lane and the cred-publish lane cannot clobber + * each other. + */ +void shim_globals_attn_or(guest_t *g, uint32_t bits) +{ + uint32_t *slot = (uint32_t *) (cache_base(g) + SHIM_GLOBALS_OFF_ATTN); + /* SEQ_CST, not ACQ_REL. The CRED_BRACKETED invariant is the + * contrapositive of release-acquire: 'if a sibling vCPU LDAR-loads + * attn and sees 0, that sibling also does not yet observe any of + * the post-OR publish_creds stores.' 
Acquire-release only guarantees + * the forward direction (if you see the OR, you see prior stores); + * the contrapositive needs a total order across atomics, which on + * ARM64 SEQ_CST provides via DMB ISH. The OR runs only on rare + * setuid/setgid/etc paths so the extra barrier is not a hot-path + * cost. shim_globals_attn_and stays RELEASE because it runs after + * publish_creds and only needs to order those prior stores before + * the clear. + */ + __atomic_fetch_or(slot, bits, __ATOMIC_SEQ_CST); + vdso_attention_or(g, bits); +} + +void shim_globals_attn_and(guest_t *g, uint32_t mask) +{ + uint32_t *slot = (uint32_t *) (cache_base(g) + SHIM_GLOBALS_OFF_ATTN); + /* RELEASE is sufficient for the clear path: the bracket runs + * publish_creds BEFORE this clear, and RELEASE here pairs with the + * shim's LDAR so any sibling that observes the cleared bit also sees + * the published cred slots. + */ + __atomic_fetch_and(slot, mask, __ATOMIC_RELEASE); + vdso_attention_and(g, mask); +} + +void shim_globals_raise_attention(guest_t *g) +{ + /* Signal/timer/exit-group lane. OR-only update so a concurrent + * cred publish's ATTN_BIT_CRED stays set. The SEQ_CST fetch-or + * pairs with the shim's LDAR on the same address. + */ + shim_globals_attn_or(g, ATTN_BIT_SIGTIMER); + + /* Kick any vCPU spinning in EL0 on the identity fast path. Without + * the exit, the spinning vCPU never traps into EL1 and never + * reads the new attention value, so a SIGALRM queued for it + * waits until its host-thread timeslice ends. Reusing the + * existing signal-preemption helper (which iterates the live + * vCPU set under thread_lock) avoids duplicating the iteration + * logic; on a single-vCPU guest the loop is essentially a no-op.
+ */ + thread_interrupt_all(); +} + +void shim_globals_recompute_attention(guest_t *g) +{ + /* Only owns the SIGTIMER lane; CRED and TRACE stay untouched so a + * concurrent setuid/setgid bracket or persistent verbose-tracing gate + * cannot be undone by the HVC #5 epilogue dropping signal attention. + * Set or clear ATTN_BIT_SIGTIMER atomically. + */ + bool need = proc_exit_group_requested() || signal_attention_needed(); + if (need) + shim_globals_attn_or(g, ATTN_BIT_SIGTIMER); + else + shim_globals_attn_and(g, ~ATTN_BIT_SIGTIMER); +} + +void shim_globals_set_trace_enabled(guest_t *g, bool enabled) +{ + if (enabled) + shim_globals_attn_or(g, ATTN_BIT_TRACE); + else + shim_globals_attn_and(g, ~ATTN_BIT_TRACE); +} diff --git a/src/core/shim-globals.h b/src/core/shim-globals.h new file mode 100644 index 0000000..8e1a389 --- /dev/null +++ b/src/core/shim-globals.h @@ -0,0 +1,308 @@ +/* EL1 shim globals (identity cache + attention flag) + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * A small struct of host-published values that the EL1 shim consumes + * to serve identity syscalls (getpid 172, getppid 173, getuid 174, + * geteuid 175, getgid 176, getegid 177) without an HVC round-trip. + * + * The cache lives at the start of the shim_data block (high IPA, + * inside the infra reserve). Three layered protections keep guest + * EL0 code from MAP_FIXED / MREMAP / MADVISE-spoofing the cache: + * + * - sys_mmap MAP_FIXED rejects ranges hitting infra + * - sys_munmap and sys_mprotect reject infra ranges + * - sys_mremap (all variants) and sys_madvise reject infra ranges + * + * Not yet defended: direct EL0 store to the cache GVA. The shim_data + * block is mapped PT_AP_RW_EL0 (RW at both ELs), and /proc/self/maps + * exposes [shim-data]. A guest that knows the layout can store the + * cache base into a register and write spoofed values directly. 
This + * is documented as out of scope; closing it requires a new AP[2:1]=00 + * permission level (RW at EL1, no EL0 access) which is a separate + * hardening item. The elfuse threat model treats the guest as the + * user's own binary, not adversarial, so direct-write spoofing is a + * defense-in-depth gap rather than an active vulnerability. + * + * The shim addresses the cache via TPIDR_EL1, which the host sets at + * every vCPU init point (bootstrap, fork-child, CLONE_THREAD, exec + * re-init). TPIDR_EL1 is unused by elfuse aside from this and is not + * trapped under default HCR_EL2 settings at EL1. + * + * Memory ordering: each publish uses __ATOMIC_RELEASE. The shim reads + * the attention flag with LDAR (acquire) to pair with the release. + * Identity slot reads stay plain LDR -- each is independent and + * naturally-aligned 64-bit loads are single-copy atomic on AArch64. + */ + +#pragma once + +#include +#include + +#include "core/guest.h" + +/* Layout within shim_data_base (offsets are bytes from the cache base + * which equals shim_data_base; the shim's TPIDR_EL1 holds exactly this + * address). + * + * Attention flag sits at offset 0 so the shim's LDAR (which only + * supports a register base with no immediate offset) can load it via + * 'ldar w_, [x12]' where x12 = mrs tpidr_el1. Identity slots follow + * 8-byte-aligned, with PID at offset 0x08; the shim adds 8 to the + * base and then indexes by (X8 - 172) * 8 to land on the requested + * slot. Attention=0 takes the fast path; nonzero forces HVC. + * + * Slice A ships attention as always-zero (the setter API exists but + * is only called from cred publish in Slice B). The fast path is + * gated already so Slice B can wire signal_queue / setitimer / exit- + * group setters without further shim changes. + */ +#define SHIM_GLOBALS_OFF_ATTN 0x00 + +/* Attention is a bitmask, not a boolean. 
Splitting it by owner lets the + * HVC #5 epilogue's recompute (which polls signal/itimer state) coexist + * with the cred-publish bracket without clobbering it. The shim still + * does a single cbnz on the whole word: any bit set forces the slow + * path. Bit ownership keeps recompute and cred bracket independent. + * + * ATTN_BIT_SIGTIMER owned by signal_queue / setitimer / exit_group + * and signal_check_timer's recompute. Set when + * a signal is pending or an itimer is armed. + * ATTN_BIT_CRED owned by CRED_BRACKETED in setuid/setgid + * wrappers. Set across the four-slot publish + * window so concurrent shim readers fall back + * to HVC and see _Atomic-coherent host values. + * ATTN_BIT_TRACE owned by --verbose syscall tracing. Set for the + * lifetime of a verbose run so EL1 shim fast paths + * fall back to HVC and syscall_dispatch can log them. + * + * Earlier revisions used a single boolean: a sibling vCPU's recompute + * dropping it to zero mid-publish reopened the torn-cred window the + * bracket was meant to close. + */ +#define ATTN_BIT_SIGTIMER 0x00000001u +#define ATTN_BIT_CRED 0x00000002u +#define ATTN_BIT_TRACE 0x00000004u + +#define SHIM_IDENTITY_BASE 0x08 +#define SHIM_IDENTITY_OFF_PID 0x08 +#define SHIM_IDENTITY_OFF_PPID 0x10 +#define SHIM_IDENTITY_OFF_UID 0x18 +#define SHIM_IDENTITY_OFF_EUID 0x20 +#define SHIM_IDENTITY_OFF_GID 0x28 +#define SHIM_IDENTITY_OFF_EGID 0x30 + +/* Urandom fast path (Slice D / P3): closes the /dev/urandom 1B read + * band PR #48 left at the HVF round-trip floor. + * + * Layout (continues from the identity section): + * 0x38 .. 0xB7 URANDOM_FD_BITMAP 128 bytes = 1024 bits = FD_TABLE_SIZE + * 0xB8 .. 0xBB URANDOM_RING_HEAD uint32, consumer cursor (atomic) + * 0xBC .. 0xBF URANDOM_RING_TAIL uint32, producer cursor (host-only) + * 0xC0 .. 
0x10BF URANDOM_RING 4096-byte CSPRNG ring + * 0x10C0..0x10C3 URANDOM_RING_LOCK uint32, producer/consumer lock + * + * The bitmap is bit N == 1 iff guest fd N currently refers to an + * FD_URANDOM-typed entry. The shim's read fast path consults this + * before serving from the ring; any other fd type falls through to + * HVC. Host maintains the bitmap from fd_alloc / fd_mark_closed. + * + * Ring head/tail are byte counters that grow monotonically (uint32); + * fill = tail - head (uint32 subtract) is the available byte count, + * pos = head & (URANDOM_RING_SIZE - 1) is the index in the ring. + * Both cursors are atomic. The shim advances head via LDXR/STXR; the + * host advances tail via release-store after writing fresh entropy. + * The producer and shim consumer also take RING_LOCK while touching the + * ring so the host cannot overwrite a slice after the shim reserves it + * but before the EL1 copy has loaded it. + * + * Size must be a power of two so the index mask is AND of (SIZE - 1). + */ +#define SHIM_URANDOM_OFF_BITMAP 0x0038 +#define SHIM_URANDOM_BITMAP_BYTES 128 +#define SHIM_URANDOM_OFF_RING_HEAD 0x00B8 +#define SHIM_URANDOM_OFF_RING_TAIL 0x00BC +#define SHIM_URANDOM_OFF_RING 0x00C0 +#define SHIM_URANDOM_RING_SIZE 4096 +#define SHIM_URANDOM_OFF_RING_LOCK 0x10C0 + +#define SHIM_GLOBALS_SIZE 0x10C4 + +/* Initialize the cache region to all-zero. Called once per process at + * the same time the shim_data block is set up (initial bootstrap and + * fork-child). The initial attention=0 means the shim takes the fast + * path until a setter raises it. + */ +void shim_globals_init(guest_t *g); + +/* Publish pid + ppid pair atomically (release-store per slot). Called + * at process init, after fork-child identity is installed, and after + * any future PID/PPID mutation. pid and ppid are int64 to match + * proc_get_pid/proc_get_ppid; values are stored zero/sign-extended. 
+ */ +void shim_globals_publish_pid(guest_t *g, int64_t pid, int64_t ppid); + +/* Publish all four credential slots. Slot writes are independent + * 64-bit atomic stores; concurrent shim reads on another vCPU may + * see partial updates. Slice B's attention bracket eliminates that + * race; until then, callers must accept that a concurrent + * getuid+geteuid sequence on a different vCPU can witness a torn + * cred set across a setresuid moment. Linux semantics require an + * atomic cred swap; bracket via attention closes that gap. + */ +void shim_globals_publish_creds(guest_t *g, + uint32_t uid, + uint32_t euid, + uint32_t gid, + uint32_t egid); + +/* GVA of the cache base. Equal to g->shim_data_base. Exposed so the + * TPIDR_EL1 setup site and tests can reference one source of truth. + */ +uint64_t shim_globals_gva(const guest_t *g); + +/* Pre-flight validation that hv_vcpu_set_sys_reg + hv_vcpu_get_sys_reg + * round-trip on TPIDR_EL1. Writes a sentinel and reads it back via + * the same HVF accessors the bootstrap uses; aborts (log_error + -1) + * on mismatch. ARM documents TPIDR_EL1 as ordinary EL1 thread/CPU + * pointer storage with no HCR trap on the EL1-side MRS/MSR. + * + * Note: this test runs BEFORE the first hv_vcpu_run; it does not + * verify that HVF preserves the register across vCPU run/exit + * boundaries. The existing test-shim-identity microbench is the + * end-to-end check for that property -- if HVF clobbered TPIDR_EL1, + * every identity-class fast path would observe a stale base and + * test-shim-identity would fail on the first iteration after + * remap_vdso_page. + * + * Returns 0 on success, -1 on failure. + */ +int shim_globals_self_test(hv_vcpu_t vcpu); + +/* Install TPIDR_EL1 = shim_globals_gva(g) on a vCPU. Called from the + * four vCPU init sites listed in the file header. + */ +int shim_globals_install_tpidr(hv_vcpu_t vcpu, const guest_t *g); + +/* Install CONTEXTIDR_EL1 = tid for the gettid shim fast path. 
The + * register is per-vCPU and unused elsewhere in elfuse (HVF preserves + * it across hv_vcpu_run alongside the rest of EL1 state). The shim + * answers SVC #0 with X8 == 178 (gettid) by emitting a single + * 'mrs x0, CONTEXTIDR_EL1' and an 'eret', skipping the HVC #5 + * round-trip the same way the identity slot loads do for syscalls + * 172-177. Caller passes the Linux tid; it is zero/sign-extended + * into the 64-bit sysreg slot. + * + * Setup sites: + * bootstrap.c initial main thread (tid == pid) + * forkipc.c fork-child main tid == child pid + * forkipc.c CLONE_THREAD tid == thread's allocated guest_tid + * forkipc.c CLONE_VM tid == child's guest_tid + * + * sys_execve reuses the vCPU and the main thread's tid does not + * change across exec, so no re-set is required there. + */ +int shim_globals_install_tid(hv_vcpu_t vcpu, int64_t tid); + +/* Combined install: TPIDR_EL1 = shim_globals base, CONTEXTIDR_EL1 = tid. + * Used by every vCPU init site (bootstrap, fork-child main, CLONE_THREAD + * worker, CLONE_VM child). Returns 0 on success, -1 on either failure. + * sys_execve uses install_tpidr alone because the tid is unchanged + * across exec. + */ +int shim_globals_install_per_vcpu(hv_vcpu_t vcpu, + const guest_t *g, + int64_t tid); + +/* Attention flag setters (Slice B). + * + * The shim's identity fast path reads the attention flag with LDAR + * before doing anything else. When nonzero, the shim falls back to + * HVC #5 so the host's post-syscall epilogue can deliver any pending + * signal or itimer expiry. + * + * shim_globals_raise_attention ORs ATTN_BIT_SIGTIMER into the flag (seq_cst) + * and also calls thread_interrupt_all() so any vCPU + * already spinning in EL0 drops out of hv_vcpu_run and re-checks the + * flag on the next entry. Without the exit, a tight identity loop on + * one vCPU could ignore an attention raise on another vCPU until its + * timeslice ended.
+ * + * shim_globals_recompute_attention re-derives ATTN_BIT_SIGTIMER from + * (signal_pending OR any guest_itimer active OR exit_group requested). + * Called from the HVC #5 epilogue after signal_check_timer to clear + * that bit once the slow-path workload has drained (CRED/TRACE stay). + * + * The g pointer in both is necessary because the cache is per-guest. + * Slice B's signal.c hooks call these via a singleton guest pointer + * registered at process init (see signal_set_shim_globals_guest in + * src/syscall/signal.h). + */ +void shim_globals_raise_attention(guest_t *g); +void shim_globals_recompute_attention(guest_t *g); +void shim_globals_set_trace_enabled(guest_t *g, bool enabled); + +/* OR / AND specific attention bits without disturbing the others. Used + * by the CRED_BRACKETED macro to set ATTN_BIT_CRED before mutating + * host credentials and clear it after publish. signal_queue and the + * itimer setters take the ATTN_BIT_SIGTIMER lane via raise_attention + * and recompute_attention; --verbose owns ATTN_BIT_TRACE. The lanes do not + * collide. + */ +void shim_globals_attn_or(guest_t *g, uint32_t bits); +void shim_globals_attn_and(guest_t *g, uint32_t mask); + +/* Urandom bitmap maintenance (Slice D / P3). + * + * The fd-type bitmap is updated by the fd table whenever an FD_URANDOM + * slot opens or closes (including dup, fork-IPC restore, etc.). The + * shim's read-fast-path consults the bitmap with a single 64-bit load + * and a bit test to decide whether the requested fd should hit the + * urandom ring or fall through to HVC. + * + * Updates use atomic OR/AND on the affected 64-bit word so concurrent + * dup races (sibling vCPU dup'ing into a freshly-opened slot) cannot + * lose either bit. Storing as uint64 rather than per-bit-CAS keeps + * the host hook trivial. + * + * shim_globals_set_singleton publishes the live guest_t * so the + * fd-table hooks can update the bitmap without threading g through + * every fd_alloc / fd_mark_closed call site.
Same NULL-or-same + * lifecycle assertion as the signal.c singleton. Call from bootstrap + * (initial) and fork-child (after guest_init). + */ +void shim_globals_set_singleton(guest_t *g); + +/* Reset the singleton to NULL. Called from syscall_init() at process + * start so a stale parent-process pointer cannot survive across a + * posix_spawn fork-child re-init and silently drop bitmap updates. + * Mirrors signal_init()'s attention_guest=NULL reset. + */ +void shim_globals_reset_singleton(void); + +void shim_globals_mark_urandom_fd(int fd, bool is_urandom); + +/* Rebuild the urandom bitmap from the current fd table state. Used by + * the fork-child path: the inherited fd table holds the parent's + * FD_URANDOM slots but the child just zeroed its shim-globals via + * shim_globals_init, so the bitmap must be re-populated to reflect + * what the child actually has open. Acquires fd_lock internally. + */ +void shim_globals_rebuild_urandom_bitmap(void); + +/* Refill the entropy ring with fresh CSPRNG bytes from arc4random_buf. + * Called from the host's sys_read slow path when a FD_URANDOM read + * encounters an empty (or low-water) ring. The fill always brings tail + * up to head + URANDOM_RING_SIZE so the ring is full after refill. + * + * The initial fill is NOT done by shim_globals_init (which only zeros the + * cache). Every bring-up path that uses the urandom fast path must call + * this explicitly after shim_globals_init: bootstrap.c does it during VM + * bring-up, src/syscall/exec.c does it on execve, and src/runtime/forkipc.c + * does it on the fork-child receive path. Any future init site that forgets + * this call leaves the ring empty, so the first urandom read on that vCPU is + * forced through the host SVC. 
+ */ +void shim_globals_refill_urandom_ring(guest_t *g); diff --git a/src/core/shim.S b/src/core/shim.S index 7c1dbd2..a2613c3 100644 --- a/src/core/shim.S +++ b/src/core/shim.S @@ -272,11 +272,220 @@ svc_handler: /* Extract SVC immediate (bits [15:0]) */ and x11, x9, #0xFFFF - cmp x11, #0 /* SVC #0 = Linux syscall? */ - b.eq handle_svc_0 + /* Inverted from "b.eq handle_svc_0" so the SVC #0 fast-path + * dispatch can fall through without an extra branch. + */ + cmp x11, #0 + b.ne restore_and_bad + + /* Identity-class fast path. X8 in [172, 179) is one of getpid (172), + * getppid (173), getuid (174), geteuid (175), getgid (176), getegid (177), + * or gettid (178). The first six read from the shim-globals cache + * (TPIDR_EL1 base, host-published scalar slots starting at offset 8); + * gettid reads its per-vCPU tid from CONTEXTIDR_EL1 directly. Layout-wise + * the cache has the attention flag at offset 0 (LDAR'd here to enforce the + * slow-path gate from Slice B and the --verbose trace gate) and six + * identity slots after it. + * + * Saved X8 is at [sp+64] per SAVE_GPRS. Scratch X10..X13 are restored from + * the frame by RESTORE_GPRS_KEEP_X0 at the named tail (svc_restore_eret), + * so the Linux ABI guarantee (X1..X30 preserved across SVC) is intact even + * on the fast path. + */ + ldr x10, [sp, #64] /* saved X8 (syscall nr) */ + sub x11, x10, #172 + cmp x11, #7 /* in identity-class range? */ + b.lo identity_class_fast /* 172..178 -> identity / gettid */ + cmp x10, #63 /* SYS_read? 
*/ + b.eq urandom_read_fast + b handle_svc_0 + +identity_class_fast: + mrs x12, tpidr_el1 /* shim-globals base */ + ldar w13, [x12] /* attention flag, acquire */ + cbnz w13, handle_svc_0 /* slow-path required */ + cmp x11, #6 /* bias == 6 ==> gettid (178) */ + b.eq gettid_fast + add x12, x12, #8 /* skip attention -> identity[0] */ + ldr x0, [x12, x11, lsl #3] /* identity[bias] for 172..177 */ + b svc_restore_eret + +gettid_fast: + mrs x0, contextidr_el1 /* per-vCPU tid */ + b svc_restore_eret + + /* Urandom-read fast path (Slice D / P3). Serves + * read(urandom_fd, buf, len) with len in [1, 64] by popping + * len bytes from the shim-globals entropy ring (TPIDR_EL1 base + + * 0xC0) into the guest-supplied buffer (X1), advancing the ring + * head atomically. If the requested fd is not FD_URANDOM, or + * the ring is low, or the read would cross a ring-wrap boundary, + * falls through to handle_svc_0 so the host serves the read and + * refills the ring. + * + * Layout offsets (match core/shim-globals.h SHIM_URANDOM_OFF_*): + * 0x0038 URANDOM_FD_BITMAP 1024 bits = 128 bytes + * 0x00B8 RING_HEAD uint32, atomic consumer cursor + * 0x00BC RING_TAIL uint32, host-released tail + * 0x00C0 RING 4096 bytes + * 0x10C0 RING_LOCK uint32 + */ +urandom_read_fast: + mrs x12, tpidr_el1 + ldar w13, [x12] /* attention flag */ + cbnz w13, handle_svc_0 + + ldr x14, [sp, #0] /* saved X0 = fd */ + cmp x14, #1024 /* FD_TABLE_SIZE */ + b.hs handle_svc_0 + ldr x15, [sp, #16] /* saved X2 = len */ + cbz x15, handle_svc_0 /* host handles len == 0 */ + cmp x15, #64 /* URANDOM_INLINE_LIMIT */ + b.hi handle_svc_0 + + /* Bitmap test: word = fd >> 6, bit = fd & 63. */ + add x16, x12, #0x38 /* SHIM_URANDOM_OFF_BITMAP */ + lsr x17, x14, #6 + ldr x17, [x16, x17, lsl #3] + and x18, x14, #63 + lsr x17, x17, x18 + tbz w17, #0, handle_svc_0 + + ldr x20, [sp, #8] /* saved X1 = buf */ + /* Probe the guest buffer for stage-1 EL0-write translations before + * doing any EL1 store. 
PROT_NONE or unmapped pages bail to the + * slow path here; the host's sys_read returns -EFAULT. + * + * The probe handles the STATIC case (buffer already unmapped at + * entry). The DYNAMIC case where a sibling vCPU munmaps the buffer + * in the window between probe and strb is caught later by the + * EL1 data abort vector routing into handle_el1_data_abort_recover + * (which rolls back the ring head, releases the lock, and returns + * -EFAULT). Without that recovery the EL1 strb would fault into + * BAD_VEC and halt the VM. + * + * len is in [1, 64]. Probing the first and last byte covers every page + * the inline copy can touch on Linux/AArch64, whose base page size is + * much larger than the inline limit. + */ + at s1e0w, x20 + isb + mrs x16, par_el1 + tbnz x16, #0, urandom_slow_no_clrex + sub x16, x15, #1 + adds x16, x20, x16 + b.cs urandom_slow_no_clrex + at s1e0w, x16 + isb + mrs x16, par_el1 + tbnz x16, #0, urandom_slow_no_clrex - /* Unrecognized SVC; restore and report as bad exception */ - b restore_and_bad + /* Serialize host refill against the shim's reserve-then-copy window. + * Lock word lives after the 4096-byte ring at offset 0x10C0. 
+ */ + add x19, x12, #0x1, lsl #12 /* base + 0x1000 */ + add x19, x19, #0xC0 /* &ring_lock */ + mov w18, #1 +urandom_lock_spin: + ldaxr w17, [x19] + cbnz w17, urandom_lock_busy + stxr w17, w18, [x19] + cbnz w17, urandom_lock_spin + b urandom_locked +urandom_lock_busy: + clrex + yield + b urandom_lock_spin + +urandom_locked: + add x21, x12, #0xB8 /* &ring_head */ + add x22, x12, #0xBC /* &ring_tail */ +0: ldxr w23, [x21] /* head */ + ldar w24, [x22] /* tail (host release-store) */ + sub w25, w24, w23 /* fill = tail - head */ + cmp w25, w15 + b.lo urandom_clrex_slow /* ring too low */ + and w26, w23, #(4096 - 1) /* pos = head & (RING_SIZE - 1) */ + add w27, w26, w15 + cmp w27, #4096 + b.hi urandom_clrex_slow /* would wrap: let slow path serve */ + add w27, w23, w15 /* new head = head + len */ + stxr w28, w27, [x21] + cbnz w28, 0b + + /* Head reserved; lock held. Snapshot ELR_EL1 + SPSR_EL1 into a + * recovery slot below the EL1 stack frame so the EL1 data abort + * recovery handler (handle_el1_data_abort_recover, below) can + * restore them if a subsequent strb faults. A sibling vCPU can + * munmap the guest buffer in the window between the AT probe and + * the byte copy; without this slot the resulting EL1 data abort + * overwrites ELR_EL1 with the strb PC and there is no way to + * resume EL0 at the post-SVC instruction. Both success exits + * pop the slot before svc_restore_eret. + */ + mrs x29, elr_el1 + mrs x30, spsr_el1 + stp x29, x30, [sp, #-16]! + + /* Copy bytes from ring[pos] to buf. len is in [1, 64]. + * w26 holds pos in [0, 4096); writing to w26 above zero-extends + * into x26, so a plain reg add (no extension) is correct. + */ + add x16, x12, #0xC0 /* ring base */ + add x16, x16, x26 /* ring + pos */ + cmp x15, #1 + b.ne urandom_copy_loop + + /* Common case: 1-byte read. Single byte transfer. 
*/ +.globl urandom_strb_1byte_start +.globl urandom_strb_1byte_end +urandom_strb_1byte_start: + ldrb w0, [x16] + strb w0, [x20] +urandom_strb_1byte_end: + add sp, sp, #16 /* pop ELR/SPSR recovery slot */ + mov x0, #1 + stlr wzr, [x19] /* release ring_lock */ + b svc_restore_eret + +urandom_copy_loop: + /* Byte-wise copy for len in [2, 64]. Unrolling would help but + * the slow path is the realistic target for large reads. The + * loop runs at most 64 times; total cost is dwarfed by the EL0 + * entry/exit transitions. + */ + mov x29, #0 +.globl urandom_strb_loop_start +.globl urandom_strb_loop_end +urandom_strb_loop_start: +1: ldrb w0, [x16, x29] + strb w0, [x20, x29] + add x29, x29, #1 + cmp x29, x15 + b.ne 1b +urandom_strb_loop_end: + add sp, sp, #16 /* pop ELR/SPSR recovery slot */ + mov x0, x15 + stlr wzr, [x19] /* release ring_lock */ + b svc_restore_eret + +urandom_clrex_slow: + /* LDXR opened an exclusive monitor that the slow path will not + * release on its own. CLREX drops the monitor so subsequent + * LDXR/STXR sequences (including this shim's own next entry) + * are not poisoned. Release ring_lock before handing the read to + * the host; the host may need the same lock to refill the ring. + */ + clrex + stlr wzr, [x19] /* release ring_lock */ + +urandom_slow_no_clrex: + /* Reached on probe failure (no exclusive monitor open yet) or + * via the above clrex path. Both route into handle_svc_0 which + * runs the regular HVC #5 sys_read. The host then returns + * -EFAULT for the bad pointer or fulfills the read normally. + */ + b handle_svc_0 not_svc: /* EC=0x18: MSR/MRS / system instruction trap. */ @@ -305,6 +514,16 @@ not_svc: cmp x10, #0x3C b.eq handle_brk + /* EC=0x25: Data abort taken without a change in Exception level. + * The only legitimate source today is the urandom fast path: a + * sibling vCPU can munmap the guest buffer between the AT probe + * and the EL1 strb, faulting the store. 
/* handle_el1_data_abort_recover: tag a data abort whose faulting PC sits
 * inside the urandom-copy strb region as a recoverable EFAULT.
 *
 * Reached from the EC=0x25 dispatch in not_svc (data abort taken without a
 * change in Exception level). On a match the guest sees the read() return
 * -EFAULT; on a miss control goes to restore_and_bad.
 *
 * Layout invariants exploited here:
 *   - SAVE_GPRS allocated 256 bytes for THIS (inner) entry on top of
 *     whatever the outer SVC entry already pushed. Dropping that 256
 *     puts SP at the recovery slot urandom_read_fast pushed before
 *     entering the strb region.
 *   - Recovery slot is 16 bytes: [sp+0]=ELR_EL1, [sp+8]=SPSR_EL1.
 *   - The lock at TPIDR_EL1 + 0x10C0 is held by this vCPU at the
 *     time the strb faulted, so release before transferring out.
 *
 * Two PC windows are accepted, each half-open [start, end):
 *   - urandom_strb_1byte_start .. urandom_strb_1byte_end
 *   - urandom_strb_loop_start  .. urandom_strb_loop_end
 * Both windows begin at the ldrb that precedes the strb, so a fault on
 * either instruction of the copy pair is treated the same way.
 *
 * x9-x12 are used as scratch. That is safe on the recovery path because
 * control leaves through svc_restore_eret, which reloads every guest GPR
 * except x0 from the outer frame.
 *
 * NOTE(review): ring_head is not touched here, so the bytes reserved by the
 * faulted read appear to stay consumed -- confirm that is intended.
 *
 * If the fault PC is outside the urandom strb ranges this is a genuine
 * shim bug; fall back to restore_and_bad.
 */
handle_el1_data_abort_recover:
    /* x11 = PC of the instruction that took the abort. */
    mrs     x11, elr_el1
    /* Window 1: the single-byte copy. */
    adr     x12, urandom_strb_1byte_start
    cmp     x11, x12
    b.lo    2f                      /* below window 1: try window 2 */
    adr     x12, urandom_strb_1byte_end
    cmp     x11, x12
    b.lo    1f                      /* in [1byte_start, 1byte_end) */
    /* Window 2: the byte-wise copy loop. Anything outside it is fatal. */
2:  adr     x12, urandom_strb_loop_start
    cmp     x11, x12
    b.lo    restore_and_bad
    adr     x12, urandom_strb_loop_end
    cmp     x11, x12
    b.hs    restore_and_bad
1:
    /* Drop the inner SAVE_GPRS frame; SP back at the urandom recovery slot. */
    add     sp, sp, #256
    /* Pop the saved EL0 return state. ldp/post-index restores SP to the
     * outer SAVE_GPRS frame top so svc_restore_eret's RESTORE_GPRS_KEEP_X0
     * pulls the original EL0 GPR values.
     */
    ldp     x9, x10, [sp], #16
    msr     elr_el1, x9             /* resume point: the guest's post-SVC PC */
    msr     spsr_el1, x10           /* guest PSTATE captured before the copy */
    /* Release the urandom ring_lock at TPIDR_EL1 + 0x10C0 (held by this
     * vCPU since urandom_locked acquired it before the byte copy).
     * The address is rebuilt from TPIDR_EL1 because the registers the fast
     * path used for it are not relied upon after the nested exception.
     */
    mrs     x11, tpidr_el1
    add     x11, x11, #0x1, lsl #12 /* base + 0x1000 */
    add     x11, x11, #0xC0         /* &ring_lock */
    stlr    wzr, [x11]              /* store-release: unlock */
    /* Drop any open exclusive monitor (the byte loop does not hold one
     * after the head STXR retired, but CLREX is cheap and removes a
     * latent footgun for future readers of this code).
     */
    clrex
    /* Linux EFAULT = 14; the kernel returns -14 to userspace. */
    mov     x0, #-14
    b       svc_restore_eret
The helpers are + * static inline so each translation unit can use them without pulling in a + * separate object; the getenv check resolves once per translation unit but + * the resolution itself is idempotent. + */ + +#ifndef ELFUSE_STARTUP_TRACE_H +#define ELFUSE_STARTUP_TRACE_H + +#include +#include +#include +#include +#include +#include +#include + +/* File-scope cache (one copy per translation unit including this header). + * pthread_once serializes concurrent first callers and supplies the + * memory ordering that makes the cached value safely visible to all + * subsequent readers without explicit atomics. + */ +static pthread_once_t startup_trace_once = PTHREAD_ONCE_INIT; +static bool startup_trace_value; + +static inline void startup_trace_resolve(void) +{ + const char *v = getenv("ELFUSE_STARTUP_TRACE"); + startup_trace_value = v && v[0] && strcmp(v, "0") != 0; +} + +static inline bool startup_trace_enabled(void) +{ + pthread_once(&startup_trace_once, startup_trace_resolve); + return startup_trace_value; +} + +static inline uint64_t startup_trace_now_ns(void) +{ + if (!startup_trace_enabled()) + return 0; + struct timespec ts; + if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) + return 0; + return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec; +} + +static inline void startup_trace_step(const char *label, uint64_t start_ns) +{ + if (start_ns == 0) + return; + uint64_t end_ns = startup_trace_now_ns(); + if (end_ns < start_ns) + return; + fprintf(stderr, "startup %-28s %8.3f ms\n", label, + (double) (end_ns - start_ns) / 1000000.0); +} + +#endif /* ELFUSE_STARTUP_TRACE_H */ diff --git a/src/core/vdso.c b/src/core/vdso.c index 444be88..6cf8f6f 100644 --- a/src/core/vdso.c +++ b/src/core/vdso.c @@ -4,16 +4,25 @@ * Copyright 2025 Moritz Angermann, zw3rk pte. ltd. * SPDX-License-Identifier: Apache-2.0 * - * Builds a minimal vDSO ELF image in guest memory exposing - * __kernel_{rt_sigreturn,clock_getres,clock_gettime,gettimeofday}. 
Each entry - * point is an SVC trampoline that traps back to the host for the actual work. + * Builds a minimal vDSO ELF image in guest memory exposing versioned + * __kernel_{rt_sigreturn,clock_getres,clock_gettime,gettimeofday}. + * __kernel_clock_gettime is a CNTVCT-based fast-path trampoline that serves + * CLOCK_MONOTONIC (clockid 1) and CLOCK_REALTIME (clockid 0) inline without + * trapping; rt_sigreturn / clock_getres / gettimeofday remain 12-byte SVC + * trampolines that fall back to the host syscall implementations. * - * An earlier revision had a CNTVCT-based fast path for clock_gettime backed by - * a host-updated vvar page. That path was incorrect under HVF: the host writes - * CNTVCT_EL0 from the macOS frame of reference while the guest reads it through - * HVF's CNTVOFF_EL2 virtualization, so the seqlock interpolation produced bogus - * times (year 26382). The fast path is gone; SVC is correct and the trap cost - * is negligible compared to the work clock_gettime callers tend to do anyway. + * The fast path reads CNTVCT_EL0 at EL0 (enabled via CNTKCTL_EL1.EL0VCTEN in + * the bootstrap), looks up the host-published anchor in the vvar (initialized, + * anchor_cntvct, anchor_mono_sec/nsec, anchor_real_sec/nsec), and interpolates + * the requested clock from the CNTVCT delta. The vvar is seeded on the first + * clock_gettime SVC fallback, gated on ELR_EL1 == svc_fallback_pc + 4 so an + * unrelated raw syscall(SYS_clock_gettime, ...) cannot poison the anchor from + * an arbitrary X9 value. A three-state CAS (0 -> 2 -> 1) keeps concurrent + * first-callers from tearing anchor fields. + * + * Wall-clock anchors are not refreshed if macOS NTP steps host time; long- + * running daemons can observe drift relative to a fresh REALTIME SVC. The + * SVC path remains correct in all cases for callers that bypass the vDSO. 
*/ #include @@ -44,11 +53,28 @@ typedef struct { uint64_t st_value, st_size; } elf64_sym_t; +typedef struct { + uint16_t vd_version; + uint16_t vd_flags; + uint16_t vd_ndx; + uint16_t vd_cnt; + uint32_t vd_hash; + uint32_t vd_aux; + uint32_t vd_next; +} elf64_verdef_t; + +typedef struct { + uint32_t vda_name; + uint32_t vda_next; +} elf64_verdaux_t; + /* ELF constants */ #define SHT_STRTAB 3 #define SHT_HASH 5 #define SHT_DYNAMIC 6 #define SHT_DYNSYM 11 +#define SHT_GNU_VERDEF 0x6ffffffd +#define SHT_GNU_VERSYM 0x6fffffff #define SHF_ALLOC (1ULL << 1) #define SHF_EXECINSTR (1ULL << 2) #define DT_NULL 0 @@ -57,10 +83,26 @@ typedef struct { #define DT_SYMTAB 6 #define DT_STRSZ 10 #define DT_SYMENT 11 +#define DT_VERSYM 0x6ffffff0 +#define DT_VERDEF 0x6ffffffc +#define DT_VERDEFNUM 0x6ffffffd #define STB_GLOBAL 1 #define STT_FUNC 2 +#define VER_DEF_CURRENT 1 +#define VDSO_LINUX_VERSION_INDEX 2 #define ELF_ST_INFO(bind, type) (((bind) << 4) | ((type) & 0xf)) +/* Host-owned vDSO page accessor. The vDSO is mapped RX to EL0, so guest + * permission walkers cannot write here; route every host build/seed/attention + * mutation through this bounds-checked direct host_base+VDSO_BASE pointer. + */ +static uint8_t *vdso_host_page(guest_t *g) +{ + if (VDSO_BASE + VDSO_SIZE > g->guest_size) + return NULL; + return (uint8_t *) g->host_base + VDSO_BASE; +} + /* Layout. * * Symbol layout (all entries are 12-byte SVC trampolines): @@ -75,50 +117,137 @@ typedef struct { #define VDSO_OFF_PHDR 0x040 #define VDSO_OFF_PHDR1 0x078 -/* .text trampolines (each 12 bytes: mov x8, #N; svc #0; ret). */ -#define TEXT_OFF_SIGRET 0x0B0 -#define TEXT_OFF_GETRES 0x0BC -#define TEXT_OFF_GETTIME 0x0C8 -#define TEXT_OFF_GETTOD 0x0D4 -#define TEXT_END 0x0E0 +/* vvar at fixed offset; host writes the wall-clock anchor on first + * clock_gettime SVC, after the guest trampoline has stored its own + * CNTVCT_EL0 read into X9. 
/* vvar at fixed offset; host writes the wall-clock anchor on first
 * clock_gettime SVC, after the guest trampoline has stored its own
 * CNTVCT_EL0 read into X9. Layout:
 *   +0   uint32 initialized      (0 = unseeded, 2 = host reserving,
 *                                 1 = ready; host sets 1 after the anchor
 *                                 fields)
 *   +4   uint32 attention        (host mirrors shim attention bits;
 *                                 nonzero -> SVC)
 *   +8   uint64 anchor_cntvct    (guest frame, written by host from X9)
 *   +16  uint64 anchor_mono_sec  (CLOCK_MONOTONIC anchor)
 *   +24  uint64 anchor_mono_nsec
 *   +32  uint64 anchor_real_sec  (CLOCK_REALTIME anchor)
 *   +40  uint64 anchor_real_nsec
 *
 * Both anchor pairs are seeded together at the first vDSO-mediated
 * clock_gettime SVC. The trampoline interpolates either pair from the
 * shared CNTVCT delta; the picking of MONO vs REAL is done by adding
 * VVAR_OFF_ANCHOR_MONO_SEC or VVAR_OFF_ANCHOR_REAL_SEC to the vvar base
 * and LDPing the two-doubleword anchor.
 *
 * Wall-clock anchors are not refreshed on macOS NTP steps; long-running
 * processes that observe sub-second wall-clock movements will see drift
 * relative to a fresh clock_gettime(REALTIME) syscall. This matches the
 * existing CNTVCT-based design and the standard tradeoff for vDSO time
 * routines that lack a kernel-driven seqlock.
 */
#define VDSO_OFF_VVAR 0x0B0
#define VVAR_OFF_INITIALIZED 0x00
#define VVAR_OFF_ATTENTION 0x04
#define VVAR_OFF_ANCHOR_CNTVCT 0x08
#define VVAR_OFF_ANCHOR_MONO_SEC 0x10
#define VVAR_OFF_ANCHOR_MONO_NSEC 0x18
#define VVAR_OFF_ANCHOR_REAL_SEC 0x20
#define VVAR_OFF_ANCHOR_REAL_NSEC 0x28
#define VVAR_SIZE 0x30

/* .text trampolines. rt_sigreturn / clock_getres / gettimeofday are 12-byte
 * SVC trampolines. clock_gettime is the CNTVCT-based fast-path trampoline
 * (140 bytes = 35 instructions including the svc_fallback tail). The
 * trampoline uses LDAR on the vvar initialized flag, treats both states
 * 0 (unseeded) and 2 (host-side reservation in vdso_seed_anchor) as
 * fall-back, also falls back while attention is pending, and guards the
 * CNTVCT-anchor subtraction against unsigned underflow via SUBS + B.LO. The
 * fast path serves both clockid 0 (CLOCK_REALTIME) and clockid 1
 * (CLOCK_MONOTONIC); other clockids fall back to SVC.
 */
#define TEXT_OFF_SIGRET 0x0E0
#define TEXT_OFF_GETRES 0x0EC
#define TEXT_OFF_GETTIME 0x0F8
#define TEXT_GETTIME_SIZE 0x8C
#define TEXT_OFF_GETTOD (TEXT_OFF_GETTIME + TEXT_GETTIME_SIZE)
#define TEXT_END (TEXT_OFF_GETTOD + 12)

/* Offset of the SVC instruction inside __kernel_clock_gettime's svc_fallback
 * (svc_fallback opens at instruction 33 of 35, i.e. byte 0x80; the SVC is
 * the second instruction of the fallback, at byte 0x84). The host's
 * sys_clock_gettime uses this value to gate vvar seeding: only a trap whose
 * ELR_EL1 equals SVC_PC + 4 came from the trampoline and may carry a
 * trustworthy CNTVCT in X9.
 */
#define VDSO_CLOCK_GETTIME_SVC_PC (TEXT_OFF_GETTIME + 0x84)

/* dynstr, dynsym, hash, GNU version metadata, dynamic, shdr follow.
 * TEXT_END is 0x190 after the attention-check expansion.
 */
#define VDSO_OFF_DYNSTR 0x190

/* Padded to 8-byte align: 0x190 + 103 = 0x1F7, pad to 0x1F8 */
#define VDSO_OFF_DYNSYM 0x1F8

/* 5 * 24 = 120, 0x1F8 + 120 = 0x270 */
#define VDSO_OFF_HASH 0x270

/* 2+1+5 = 8 words * 4 = 32, 0x270 + 32 = 0x290 */
#define VDSO_OFF_VERSYM 0x290

/* 5 * 2 = 10, 0x290 + 10 = 0x29A, pad to 0x2A0 */
#define VDSO_OFF_VERDEF 0x2A0

/* Verdef + verdaux = 28, 0x2A0 + 28 = 0x2BC, pad to 0x2C0 */
#define VDSO_OFF_DYNAMIC 0x2C0

/* 9 * 16 = 144, 0x2C0 + 144 = 0x350 */
#define VDSO_OFF_SHDR 0x350

/* 8 * 64 = 512, 0x350 + 512 = 0x550 (fits in 4 KiB) */

/* Compile-time guards for the hand-computed layout above. The offsets are
 * literals, so resizing the vvar or a trampoline would otherwise overlap a
 * neighbour silently; these turn such an edit into a build error.
 */
_Static_assert(VDSO_OFF_VVAR % 8 == 0,
               "vvar must be 8-byte aligned for the 64-bit anchor fields");
_Static_assert(VVAR_OFF_ANCHOR_REAL_NSEC + 8 == VVAR_SIZE,
               "VVAR_SIZE must cover exactly the last anchor field");
_Static_assert(VDSO_OFF_VVAR + VVAR_SIZE <= TEXT_OFF_SIGRET,
               "vvar overlaps the first trampoline");
_Static_assert(TEXT_OFF_SIGRET + 12 <= TEXT_OFF_GETRES &&
                   TEXT_OFF_GETRES + 12 <= TEXT_OFF_GETTIME,
               "12-byte SVC trampolines overlap");
/* svc_fallback is 'mov x8; svc; ret', so the SVC is followed by one RET. */
_Static_assert(VDSO_CLOCK_GETTIME_SVC_PC + 8 == TEXT_OFF_GETTOD,
               "SVC must be the second-to-last clock_gettime instruction");
_Static_assert(TEXT_END <= VDSO_OFF_DYNSTR, ".text overlaps .dynstr");
_Static_assert(VDSO_OFF_DYNSTR < VDSO_OFF_DYNSYM &&
                   VDSO_OFF_DYNSYM < VDSO_OFF_HASH &&
                   VDSO_OFF_HASH < VDSO_OFF_VERSYM &&
                   VDSO_OFF_VERSYM < VDSO_OFF_VERDEF &&
                   VDSO_OFF_VERDEF < VDSO_OFF_DYNAMIC &&
                   VDSO_OFF_DYNAMIC < VDSO_OFF_SHDR,
               "vDSO sections must be laid out in ascending order");
_Static_assert(VDSO_OFF_DYNSYM % 8 == 0 && VDSO_OFF_HASH % 4 == 0 &&
                   VDSO_OFF_VERSYM % 2 == 0 && VDSO_OFF_VERDEF % 4 == 0 &&
                   VDSO_OFF_DYNAMIC % 8 == 0 && VDSO_OFF_SHDR % 8 == 0,
               "vDSO section offset is misaligned");
/* 8 section headers of 64 bytes each must end inside the 4 KiB page. */
_Static_assert(VDSO_OFF_SHDR + 8 * 64 <= 0x1000,
               "section headers run past the vDSO page");
#define VDSO_NUM_SYMS 4
#define HASH_NCHAIN (VDSO_NUM_SYMS + 1)
#define HASH_NBUCKET 1
#define HASH_SIZE ((2 + HASH_NBUCKET + HASH_NCHAIN) * sizeof(uint32_t))
#define VERSYM_SIZE ((VDSO_NUM_SYMS + 1) * sizeof(uint16_t))
#define VERDEF_SIZE (sizeof(elf64_verdef_t) + sizeof(elf64_verdaux_t))
#define VDSO_NUM_DYN 9

/* .dynstr pieces. Each piece starts with the NUL that terminates the entry
 * before it (the first one supplies the mandatory empty string at offset
 * 0). Every name is spelled exactly once here; both dynstr_data and the
 * byte counts below are built from these macros, so renaming or resizing a
 * piece moves every derived offset with it.
 */
#define DYNSTR_PART_RT_SIGRETURN "\0__kernel_rt_sigreturn"
#define DYNSTR_PART_CLOCK_GETRES "\0__kernel_clock_getres"
#define DYNSTR_PART_CLOCK_GETTIME "\0__kernel_clock_gettime"
#define DYNSTR_PART_GETTIMEOFDAY "\0__kernel_gettimeofday"
#define DYNSTR_PART_LINUX_VERSION "\0LINUX_2.6.39"

/* .dynstr data */
static const char dynstr_data[] =
    DYNSTR_PART_RT_SIGRETURN
    DYNSTR_PART_CLOCK_GETRES
    DYNSTR_PART_CLOCK_GETTIME
    DYNSTR_PART_GETTIMEOFDAY
    DYNSTR_PART_LINUX_VERSION;
#define DYNSTR_SIZE sizeof(dynstr_data)

/* Bytes each piece contributes to dynstr_data: sizeof(piece) - 1, because
 * only the very last literal's trailing NUL survives concatenation.
 */
#define DYNSTR_BYTES_RT_SIGRETURN (sizeof(DYNSTR_PART_RT_SIGRETURN) - 1)
#define DYNSTR_BYTES_CLOCK_GETRES (sizeof(DYNSTR_PART_CLOCK_GETRES) - 1)
#define DYNSTR_BYTES_CLOCK_GETTIME (sizeof(DYNSTR_PART_CLOCK_GETTIME) - 1)
#define DYNSTR_BYTES_GETTIMEOFDAY (sizeof(DYNSTR_PART_GETTIMEOFDAY) - 1)

/* Symbol name offsets: start of each piece plus 1 to skip its leading NUL. */
static const uint32_t sym_name_offsets[VDSO_NUM_SYMS] = {
    1,
    DYNSTR_BYTES_RT_SIGRETURN + 1,
    DYNSTR_BYTES_RT_SIGRETURN + DYNSTR_BYTES_CLOCK_GETRES + 1,
    DYNSTR_BYTES_RT_SIGRETURN + DYNSTR_BYTES_CLOCK_GETRES +
        DYNSTR_BYTES_CLOCK_GETTIME + 1,
};

/* Skip the leading \0 of "\0LINUX_2.6.39" to land on 'L'. */
#define VDSO_LINUX_VERSION_NAME_OFF                           \
    (DYNSTR_BYTES_RT_SIGRETURN + DYNSTR_BYTES_CLOCK_GETRES +  \
     DYNSTR_BYTES_CLOCK_GETTIME + DYNSTR_BYTES_GETTIMEOFDAY + 1)

/* dynstr_data must consist of exactly the pieces the offsets are derived
 * from; a piece added to one list but not the other fails here.
 */
_Static_assert(sizeof(dynstr_data) ==
                   DYNSTR_BYTES_RT_SIGRETURN + DYNSTR_BYTES_CLOCK_GETRES +
                       DYNSTR_BYTES_CLOCK_GETTIME +
                       DYNSTR_BYTES_GETTIMEOFDAY +
                       sizeof(DYNSTR_PART_LINUX_VERSION),
               "dynstr_data and the DYNSTR_BYTES_* offsets disagree");

/* 104 bytes is the gap between the .dynstr and .dynsym offsets
 * (0x1F8 - 0x190).
 */
_Static_assert(sizeof(dynstr_data) <= 104,
               "dynstr_data outgrew the DYNSYM padding window");
/* AArch64 instruction encoders (only the ones used here).
 *
 * Every helper returns the 32-bit instruction word. Register numbers are
 * masked to 5 bits. PC-relative helpers take a signed BYTE displacement
 * from the instruction's own address; the displacement is converted to
 * uint32_t before it is shifted or masked, because right-shifting a
 * negative signed value is implementation-defined in C and backward
 * targets are legitimate inputs (the vvar sits below .text, so the ADR in
 * the clock_gettime trampoline is always negative).
 */

/* MOVZ Xd, #imm (no shift). */
static uint32_t enc_movz_x(unsigned rd, uint16_t imm)
{
    return 0xD2800000U | ((uint32_t) imm << 5) | (rd & 0x1F);
}

/* MOVK Xd, #imm, LSL #16. */
static uint32_t enc_movk_x_lsl16(unsigned rd, uint16_t imm)
{
    return 0xF2A00000U | ((uint32_t) imm << 5) | (rd & 0x1F);
}

/* ADR Xd, <pc + pc_rel>: 21-bit signed byte offset split into immlo (bits
 * 30:29) and immhi (bits 23:5).
 */
static uint32_t enc_adr(unsigned rd, int32_t pc_rel)
{
    uint32_t rel = (uint32_t) pc_rel;
    uint32_t immlo = rel & 0x3;
    uint32_t immhi = (rel >> 2) & 0x7FFFF;
    return 0x10000000U | (immlo << 29) | (immhi << 5) | (rd & 0x1F);
}

/* B.cond imm19. cond is the 4-bit AArch64 condition (NE=0x1, LO=0x3, etc.). */
#define COND_NE 0x1
#define COND_LO 0x3
static uint32_t enc_bcond_imm19(unsigned cond, int32_t pc_rel)
{
    uint32_t imm19 = ((uint32_t) pc_rel >> 2) & 0x7FFFF;
    return 0x54000000U | (imm19 << 5) | (cond & 0xF);
}

/* LDR Xt, [Xn, #off_bytes] (unsigned offset, multiple of 8, scaled into a
 * 12-bit field). The field is masked so an out-of-range offset cannot
 * corrupt the opcode bits.
 */
static uint32_t enc_ldr_x_imm12(unsigned rt, unsigned rn, uint32_t off_bytes)
{
    return 0xF9400000U | (((off_bytes / 8) & 0xFFF) << 10) |
           ((rn & 0x1F) << 5) | (rt & 0x1F);
}

/* ADD Xd, Xn, Xm. */
static uint32_t enc_add_x(unsigned rd, unsigned rn, unsigned rm)
{
    return 0x8B000000U | ((rm & 0x1F) << 16) | ((rn & 0x1F) << 5) | (rd & 0x1F);
}

/* ADD Xd, Xn, #imm12 (no shift). */
static uint32_t enc_add_x_imm12(unsigned rd, unsigned rn, uint16_t imm)
{
    return 0x91000000U | (((uint32_t) imm & 0xFFF) << 10) | ((rn & 0x1F) << 5) |
           (rd & 0x1F);
}

/* MUL Xd, Xn, Xm (MADD with XZR as the addend). */
static uint32_t enc_mul_x(unsigned rd, unsigned rn, unsigned rm)
{
    return 0x9B007C00U | ((rm & 0x1F) << 16) | ((rn & 0x1F) << 5) | (rd & 0x1F);
}

/* UDIV Xd, Xn, Xm. */
static uint32_t enc_udiv_x(unsigned rd, unsigned rn, unsigned rm)
{
    return 0x9AC00800U | ((rm & 0x1F) << 16) | ((rn & 0x1F) << 5) | (rd & 0x1F);
}

/* MSUB Xd, Xn, Xm, Xa: Xd = Xa - Xn * Xm. */
static uint32_t enc_msub_x(unsigned rd, unsigned rn, unsigned rm, unsigned ra)
{
    return 0x9B008000U | ((rm & 0x1F) << 16) | ((ra & 0x1F) << 10) |
           ((rn & 0x1F) << 5) | (rd & 0x1F);
}

/* STP Xt1, Xt2, [Xn, #off_bytes] (signed 7-bit imm, multiple of 8). */
static uint32_t enc_stp_x_imm7(unsigned rt1,
                               unsigned rt2,
                               unsigned rn,
                               int32_t off_bytes)
{
    uint32_t imm7 = (uint32_t) (off_bytes / 8) & 0x7F;
    return 0xA9000000U | (imm7 << 15) | ((rt2 & 0x1F) << 10) |
           ((rn & 0x1F) << 5) | (rt1 & 0x1F);
}

static uint32_t enc_cmp_w_imm12(unsigned rn, uint32_t imm12)
{
    /* SUBS WZR, Wn, #imm12 */
    return 0x7100001FU | ((imm12 & 0xFFF) << 10) | ((rn & 0x1F) << 5);
}

/* LDAR Wt, [Xn] -- acquire load of a 32-bit word. Pairs with the host's
 * __atomic_store_n(initialized, ..., __ATOMIC_RELEASE) so that observing
 * initialized != 0 also makes the prior anchor stores visible.
 */
static uint32_t enc_ldar_w(unsigned rt, unsigned rn)
{
    return 0x88DFFC00U | ((rn & 0x1F) << 5) | (rt & 0x1F);
}

/* SUBS Xd, Xn, Xm (set flags). */
static uint32_t enc_subs_x(unsigned rd, unsigned rn, unsigned rm)
{
    return 0xEB000000U | ((rm & 0x1F) << 16) | ((rn & 0x1F) << 5) | (rd & 0x1F);
}

/* CBZ Wt, imm19 (byte-relative; encoder shifts >>2 internally). */
static uint32_t enc_cbz_w(unsigned rt, int32_t pc_rel)
{
    uint32_t imm19 = ((uint32_t) pc_rel >> 2) & 0x7FFFF;
    return 0x34000000U | (imm19 << 5) | (rt & 0x1F);
}

/* CBNZ Wt, imm19 (byte-relative; encoder shifts >>2 internally). */
static uint32_t enc_cbnz_w(unsigned rt, int32_t pc_rel)
{
    uint32_t imm19 = ((uint32_t) pc_rel >> 2) & 0x7FFFF;
    return 0x35000000U | (imm19 << 5) | (rt & 0x1F);
}

/* B imm26 unconditional branch (byte-relative). */
static uint32_t enc_b(int32_t pc_rel)
{
    uint32_t imm26 = ((uint32_t) pc_rel >> 2) & 0x3FFFFFF;
    return 0x14000000U | imm26;
}

/* LDP Xt1, Xt2, [Xn, #off_bytes] (signed 7-bit imm, multiple of 8). */
static uint32_t enc_ldp_x_imm7(unsigned rt1,
                               unsigned rt2,
                               unsigned rn,
                               int32_t off_bytes)
{
    uint32_t imm7 = (uint32_t) (off_bytes / 8) & 0x7F;
    return 0xA9400000U | (imm7 << 15) | ((rt2 & 0x1F) << 10) |
           ((rn & 0x1F) << 5) | (rt1 & 0x1F);
}
+ * + * Layout (35 instructions, 0x8c bytes): + * + * 0x00 mrs x9, cntvct_el0 ; always read first + * 0x04 cbz w0, .Lreal ; clockid==0 -> CLOCK_REALTIME + * 0x08 cmp w0, #1 ; clockid==1 -> CLOCK_MONOTONIC + * 0x0C b.ne svc_fallback ; other clockid -> SVC + * 0x10 mov w7, #ANCHOR_MONO_SEC ; offset within vvar of MONO sec + * 0x14 b .Linit + * 0x18 .Lreal: mov w7, #ANCHOR_REAL_SEC + * 0x1C .Linit: adr x2, vvar + * 0x20 add x10, x2, #ATTENTION + * 0x24 ldar w3, [x10] ; load attention flag (acquire) + * 0x28 cbnz w3, svc_fallback ; timers/signals need epilogue + * 0x2C ldar w3, [x2] ; load initialized flag (acquire) + * 0x30 cmp w3, #1 + * 0x34 b.ne svc_fallback ; not seeded yet + * 0x38 ldr x3, [x2, #ANCHOR_CNTVCT] + * 0x3C add x8, x2, x7 ; x8 = vvar base + sec_offset + * 0x40 ldp x4, x5, [x8] ; x4=anchor_sec, x5=anchor_nsec + * 0x44 subs x6, x9, x3 ; cntvct delta + * 0x48 b.lo svc_fallback ; underflow -> SVC + * ... (math identical to original: delta*125/3 ns, +nsec, carry into sec) + * 0x74 stp x4, x5, [x1] ; store {sec, nsec} + * 0x78 mov x0, #0 + * 0x7C ret + * 0x80 svc_fallback: mov x8, #113 + * 0x84 svc #0 + * 0x88 ret + * + * Both clockids share the same CNTVCT delta math; only the anchor pair + * loaded via LDP changes. Picking via a runtime offset register avoids + * duplicating the entire math block per clockid. + */ +static void emit_clock_gettime_trampoline(uint32_t *code, + uint32_t pc_off, + uint32_t vvar_off) +{ + /* Branch targets within the trampoline. 
*/ + int32_t real_off = 0x18; /* .Lreal */ + int32_t init_off = 0x1C; /* .Linit (common path entry) */ + int32_t svc_fallback_off = 0x80; /* svc_fallback */ + int32_t adr_pc_off = 0x1C; /* offset of 'adr x2, vvar' */ + int32_t vvar_rel = (int32_t) vvar_off - (int32_t) (pc_off + adr_pc_off); + + code[0] = 0xD53BE049U; /* mrs x9, cntvct_el0 */ + code[1] = enc_cbz_w(0, real_off - 0x04); /* cbz w0, .Lreal */ + code[2] = enc_cmp_w_imm12(0, 1); /* cmp w0, #1 */ + code[3] = enc_bcond_imm19(COND_NE, svc_fallback_off - 0x0C); + /* b.ne svc_fallback */ + code[4] = enc_movz_x(7, VVAR_OFF_ANCHOR_MONO_SEC); + code[5] = enc_b(init_off - 0x14); /* b .Linit */ + code[6] = enc_movz_x(7, VVAR_OFF_ANCHOR_REAL_SEC); /* .Lreal */ + code[7] = enc_adr(2, vvar_rel); /* .Linit: adr x2,vv */ + code[8] = enc_add_x_imm12(10, 2, VVAR_OFF_ATTENTION); + code[9] = enc_ldar_w(3, 10); + code[10] = enc_cbnz_w(3, svc_fallback_off - 0x28); + code[11] = enc_ldar_w(3, 2); /* ldar w3, [x2] */ + code[12] = enc_cmp_w_imm12(3, 1); /* cmp w3, #1 */ + code[13] = enc_bcond_imm19(COND_NE, svc_fallback_off - 0x34); + /* b.ne svc_fallback */ + code[14] = enc_ldr_x_imm12(3, 2, VVAR_OFF_ANCHOR_CNTVCT); + code[15] = enc_add_x(8, 2, 7); /* add x8, x2, x7 */ + code[16] = enc_ldp_x_imm7(4, 5, 8, 0); /* ldp x4, x5, [x8] */ + code[17] = enc_subs_x(6, 9, 3); /* subs x6, x9, x3 */ + code[18] = enc_bcond_imm19(COND_LO, svc_fallback_off - 0x48); + /* b.lo svc_fallback */ + code[19] = enc_movz_x(7, 125); + code[20] = enc_mul_x(6, 6, 7); /* delta * 125 */ + code[21] = enc_movz_x(7, 3); + code[22] = enc_udiv_x(6, 6, 7); /* delta_ns */ + code[23] = enc_add_x(5, 5, 6); /* nsec += delta_ns */ + code[24] = enc_movz_x(7, 0xCA00); + code[25] = enc_movk_x_lsl16(7, 0x3B9A); /* x7 = 1e9 */ + code[26] = enc_udiv_x(8, 5, 7); /* sec_carry */ + code[27] = enc_msub_x(5, 8, 7, 5); /* nsec %= 1e9 */ + code[28] = enc_add_x(4, 4, 8); /* sec += carry */ + code[29] = enc_stp_x_imm7(4, 5, 1, 0); /* stp x4, x5, [x1] */ + code[30] = enc_movz_x(0, 0); 
/* Classic System V ELF hash of a NUL-terminated name.
 *
 * Each byte is folded in as an unsigned value: the running hash is shifted
 * left by four bits, the byte is added, and the top nibble (if any) is
 * XORed back into bits 4..7 and then cleared. The result therefore always
 * fits in 28 bits. Used for the vd_hash field of the version definition.
 */
static uint32_t elf_hash(const char *name)
{
    uint32_t hash = 0;

    for (const unsigned char *p = (const unsigned char *) name; *p != 0; p++) {
        uint32_t top;

        hash = (hash << 4) + *p;
        top = hash & 0xf0000000U;
        /* XOR with (top >> 24) is a no-op when the top nibble is clear. */
        hash = (hash ^ (top >> 24)) & ~top;
    }
    return hash;
}
*/ @@ -181,8 +566,8 @@ uint64_t vdso_build(guest_t *g) phdr1->p_offset = VDSO_OFF_DYNAMIC; phdr1->p_vaddr = VDSO_OFF_DYNAMIC; phdr1->p_paddr = VDSO_OFF_DYNAMIC; - phdr1->p_filesz = 6 * sizeof(elf64_dyn_t); - phdr1->p_memsz = 6 * sizeof(elf64_dyn_t); + phdr1->p_filesz = VDSO_NUM_DYN * sizeof(elf64_dyn_t); + phdr1->p_memsz = VDSO_NUM_DYN * sizeof(elf64_dyn_t); phdr1->p_align = 8; /* Text trampolines. Each entry is the same 12-byte mov/svc/ret pattern @@ -190,9 +575,14 @@ uint64_t vdso_build(guest_t *g) */ emit_svc_trampoline((uint32_t *) (page + TEXT_OFF_SIGRET), 139); emit_svc_trampoline((uint32_t *) (page + TEXT_OFF_GETRES), 114); - emit_svc_trampoline((uint32_t *) (page + TEXT_OFF_GETTIME), 113); + emit_clock_gettime_trampoline((uint32_t *) (page + TEXT_OFF_GETTIME), + TEXT_OFF_GETTIME, VDSO_OFF_VVAR); emit_svc_trampoline((uint32_t *) (page + TEXT_OFF_GETTOD), 169); + /* vvar starts zero (initialized==0). The first __kernel_clock_gettime + * SVC fallback will let the host populate the anchor. + */ + /* Dynamic string table. */ memcpy(page + VDSO_OFF_DYNSTR, dynstr_data, DYNSTR_SIZE); @@ -221,6 +611,27 @@ uint64_t vdso_build(guest_t *g) } hash[2] = first_sym; + /* GNU symbol versioning. glibc's aarch64 vDSO resolver asks for + * LINUX_2.6.39 and ignores unversioned helpers. + */ + uint16_t *versym = (uint16_t *) (page + VDSO_OFF_VERSYM); + versym[0] = 0; + for (int i = 1; i <= VDSO_NUM_SYMS; i++) + versym[i] = VDSO_LINUX_VERSION_INDEX; + + elf64_verdef_t *verdef = (elf64_verdef_t *) (page + VDSO_OFF_VERDEF); + elf64_verdaux_t *verdaux = + (elf64_verdaux_t *) (page + VDSO_OFF_VERDEF + sizeof(*verdef)); + verdef->vd_version = VER_DEF_CURRENT; + verdef->vd_flags = 0; + verdef->vd_ndx = VDSO_LINUX_VERSION_INDEX; + verdef->vd_cnt = 1; + verdef->vd_hash = elf_hash("LINUX_2.6.39"); + verdef->vd_aux = sizeof(*verdef); + verdef->vd_next = 0; + verdaux->vda_name = VDSO_LINUX_VERSION_NAME_OFF; + verdaux->vda_next = 0; + /* Dynamic table. 
*/ elf64_dyn_t *dyn = (elf64_dyn_t *) (page + VDSO_OFF_DYNAMIC); dyn[0] = (elf64_dyn_t) {DT_HASH, VDSO_OFF_HASH}; @@ -228,7 +639,10 @@ uint64_t vdso_build(guest_t *g) dyn[2] = (elf64_dyn_t) {DT_STRTAB, VDSO_OFF_DYNSTR}; dyn[3] = (elf64_dyn_t) {DT_STRSZ, DYNSTR_SIZE}; dyn[4] = (elf64_dyn_t) {DT_SYMENT, sizeof(elf64_sym_t)}; - dyn[5] = (elf64_dyn_t) {DT_NULL, 0}; + dyn[5] = (elf64_dyn_t) {DT_VERSYM, VDSO_OFF_VERSYM}; + dyn[6] = (elf64_dyn_t) {DT_VERDEF, VDSO_OFF_VERDEF}; + dyn[7] = (elf64_dyn_t) {DT_VERDEFNUM, 1}; + dyn[8] = (elf64_dyn_t) {DT_NULL, 0}; /* Section headers. */ elf64_shdr_t *shdr = (elf64_shdr_t *) (page + VDSO_OFF_SHDR); @@ -276,10 +690,134 @@ uint64_t vdso_build(guest_t *g) shdr[5].sh_flags = SHF_ALLOC; shdr[5].sh_addr = VDSO_OFF_DYNAMIC; shdr[5].sh_offset = VDSO_OFF_DYNAMIC; - shdr[5].sh_size = 6 * sizeof(elf64_dyn_t); + shdr[5].sh_size = VDSO_NUM_DYN * sizeof(elf64_dyn_t); shdr[5].sh_link = 2; shdr[5].sh_addralign = 8; shdr[5].sh_entsize = sizeof(elf64_dyn_t); + shdr[6].sh_name = 0; + shdr[6].sh_type = SHT_GNU_VERSYM; + shdr[6].sh_flags = SHF_ALLOC; + shdr[6].sh_addr = VDSO_OFF_VERSYM; + shdr[6].sh_offset = VDSO_OFF_VERSYM; + shdr[6].sh_size = VERSYM_SIZE; + shdr[6].sh_link = 3; + shdr[6].sh_addralign = 2; + shdr[6].sh_entsize = sizeof(uint16_t); + + shdr[7].sh_name = 0; + shdr[7].sh_type = SHT_GNU_VERDEF; + shdr[7].sh_flags = SHF_ALLOC; + shdr[7].sh_addr = VDSO_OFF_VERDEF; + shdr[7].sh_offset = VDSO_OFF_VERDEF; + shdr[7].sh_size = VERDEF_SIZE; + shdr[7].sh_link = 2; + shdr[7].sh_info = 1; + shdr[7].sh_addralign = 4; + return VDSO_BASE; } + +void vdso_seed_anchor(guest_t *g, + uint64_t guest_cntvct, + int64_t mono_sec, + int64_t mono_nsec, + int64_t real_sec, + int64_t real_nsec) +{ + /* Match vdso_attention_or: host-owned vvar writes go through the + * direct host_base + VDSO_BASE accessor, not the guest permission + * walker. 
The vDSO is RX to EL0 so guest_ptr_w would silently bail + * here; guest_ptr happens to work because it only requires read + * perm, but that inconsistency is brittle. + */ + uint8_t *page = vdso_host_page(g); + if (!page) + return; + uint32_t *initialized = (uint32_t *) (page + VDSO_OFF_VVAR); + uint8_t *vvar = page + VDSO_OFF_VVAR; + + /* Three-state CAS reservation: 0 = unseeded, 2 = reserving (one host + * thread owns the anchor stores), 1 = ready. Multiple host threads can + * concurrently take the SVC fallback on the first guest call; without + * the reservation they race on the plain anchor stores. The CAS winner + * writes the fields and releases 1; losers bail. The guest trampoline + * loads initialized with LDAR and only takes the fast path on + * initialized == 1, so state 2 still routes to the SVC fallback. + * + * Both MONO and REAL anchor pairs are written together so a fast-path + * caller for either clockid sees a consistent pair after observing + * initialized == 1. The two pairs share anchor_cntvct (the trampoline's + * X9 at first call); macOS clock_gettime for MONO and REAL was issued + * by the host between then and now, so the anchor wall_clock values + * trail X9 by a small constant offset that propagates unchanged into + * every fast-path result. + */ + uint32_t expected = 0; + if (!__atomic_compare_exchange_n(initialized, &expected, 2, + /* weak */ false, __ATOMIC_ACQUIRE, + __ATOMIC_RELAXED)) + return; + + *(uint64_t *) (vvar + VVAR_OFF_ANCHOR_CNTVCT) = guest_cntvct; + *(uint64_t *) (vvar + VVAR_OFF_ANCHOR_MONO_SEC) = (uint64_t) mono_sec; + *(uint64_t *) (vvar + VVAR_OFF_ANCHOR_MONO_NSEC) = (uint64_t) mono_nsec; + *(uint64_t *) (vvar + VVAR_OFF_ANCHOR_REAL_SEC) = (uint64_t) real_sec; + *(uint64_t *) (vvar + VVAR_OFF_ANCHOR_REAL_NSEC) = (uint64_t) real_nsec; + + /* The release-store on initialized pairs with the trampoline's LDAR + * load on the same address; observing 1 also makes the anchor fields + * visible to the guest. 
+ */ + __atomic_store_n(initialized, 1, __ATOMIC_RELEASE); +} + +uint64_t vdso_clock_gettime_svc_pc(void) +{ + return VDSO_BASE + VDSO_CLOCK_GETTIME_SVC_PC; +} + +bool vdso_anchor_is_seeded(guest_t *g) +{ + uint8_t *page = vdso_host_page(g); + if (!page) + return false; + uint32_t *initialized = (uint32_t *) (page + VDSO_OFF_VVAR); + /* Pairs with the release store in vdso_seed_anchor that publishes the + * anchor fields. Only state 1 (ready) qualifies; state 2 (one host + * thread reserving) still needs the seeding gate to run for any + * subsequent caller that wins after the reservation completes. + */ + return __atomic_load_n(initialized, __ATOMIC_ACQUIRE) == 1; +} + +void vdso_attention_or(guest_t *g, uint32_t bits) +{ + /* The vDSO is mapped RX to EL0, but the host owns the embedded vvar and + * must still be able to mirror shim attention into it. Bypass the + * guest-permission walker just like shim_globals does for shim_data. + */ + uint8_t *page = vdso_host_page(g); + if (!page) + return; + uint32_t *attention = + (uint32_t *) (page + VDSO_OFF_VVAR + VVAR_OFF_ATTENTION); + /* SEQ_CST mirrors shim_globals_attn_or. The vDSO attention word is + * read by EL0 vDSO fast paths (libc time/getcpu/etc.) without going + * through HVC, so the same contrapositive-style ordering claim + * applies: a reader that LDAR-loads attn=0 must not observe later + * publish_creds stores. ACQ_REL alone does not provide that + * (release-acquire only orders the forward direction). 
+ */ + __atomic_fetch_or(attention, bits, __ATOMIC_SEQ_CST); +} + +void vdso_attention_and(guest_t *g, uint32_t mask) +{ + uint8_t *page = vdso_host_page(g); + if (!page) + return; + uint32_t *attention = + (uint32_t *) (page + VDSO_OFF_VVAR + VVAR_OFF_ATTENTION); + __atomic_fetch_and(attention, mask, __ATOMIC_RELEASE); +} diff --git a/src/core/vdso.h b/src/core/vdso.h index e3a41d5..0986ab5 100644 --- a/src/core/vdso.h +++ b/src/core/vdso.h @@ -12,17 +12,63 @@ #pragma once +#include <stdbool.h> +#include <stdint.h> + #include "core/guest.h" /* Guest address where the vDSO is placed (one 4KiB page, below PT pool) */ #define VDSO_BASE 0x0000F000ULL #define VDSO_SIZE 0x00001000ULL /* 4KiB */ -#define VDSO_OFF_TEXT 0x0B0 /* Offset of .text (trampoline code) */ +/* Offset of __kernel_rt_sigreturn (the signal trampoline glibc/musl jumps + * to via X30/LR after the handler returns). Must match TEXT_OFF_SIGRET in + * src/core/vdso.c; kept here so signal.c can target it without including + * the vDSO internals. + */ +#define VDSO_OFF_SIGRET 0x0E0 /* Build a minimal vDSO ELF image at VDSO_BASE in guest memory. * The image contains a valid ELF header, one LOAD program header, SHT_DYNSYM - * and SHT_STRTAB sections, and a __kernel_rt_sigreturn symbol pointing to - * a small trampoline (mov x8, #139; svc #0). + * and SHT_STRTAB sections, and a __kernel_rt_sigreturn symbol pointing to a + * small trampoline (mov x8, #139; svc #0). * Returns the GVA of the ELF header (== VDSO_BASE), or 0 on failure. */ uint64_t vdso_build(guest_t *g); + +/* If the vvar anchor has not been seeded yet, install the supplied cntvct as + * the guest-frame anchor paired with the given monotonic and realtime + * wall_clock values. Idempotent: subsequent calls with initialized==1 are + * no-ops. Used by sys_clock_gettime to upgrade the first + * __kernel_clock_gettime SVC fallback into a permanent vvar fast path that + * serves both CLOCK_MONOTONIC and CLOCK_REALTIME. 
+ */ +void vdso_seed_anchor(guest_t *g, + uint64_t guest_cntvct, + int64_t mono_sec, + int64_t mono_nsec, + int64_t real_sec, + int64_t real_nsec); + +/* GVA at which the trampoline's svc_fallback issues its SVC. Used by + * sys_clock_gettime to verify a clock_gettime trap actually came from the vDSO + * fallback path (and thus carries a guest-frame CNTVCT in X9) versus an + * unrelated raw syscall(SYS_clock_gettime, ...). The trap returns to SVC_PC + * + 4, so callers compare ELR_EL1 against that. + */ +uint64_t vdso_clock_gettime_svc_pc(void); + +/* Returns true once the vvar anchor has been published (initialized==1) and + * the fast path can never be reseeded. Lets the post-SVC handler in + * sys_clock_gettime skip the ELR_EL1 + X9 HVF reads it otherwise needs for + * the seeding gate, since the second-call onward gate is moot once seeded. + * Uses acquire ordering paired with vdso_seed_anchor's release store. + */ +bool vdso_anchor_is_seeded(guest_t *g); + +/* Mirror the shim attention bitmask into the vvar page. The vDSO + * clock_gettime fast path reads this word and falls back to SVC whenever + * it is nonzero, preserving the normal post-HVC timer/signal epilogue while + * guest attention is pending. + */ +void vdso_attention_or(guest_t *g, uint32_t bits); +void vdso_attention_and(guest_t *g, uint32_t mask); diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c index f9746cd..a36a673 100644 --- a/src/runtime/fork-state.c +++ b/src/runtime/fork-state.c @@ -21,6 +21,7 @@ #include "debug/log.h" #include "syscall/abi.h" #include "syscall/internal.h" +#include "syscall/io.h" #include "syscall/mem.h" #include "syscall/proc.h" @@ -34,6 +35,15 @@ int fork_ipc_write_all(int fd, const void *buf, size_t len) continue; return -1; } + if (n == 0) { + /* Defensive: an unexpected zero return on a blocking socket + * would otherwise spin forever, since p and len stay at the + * same offset. 
Treat it as an IO failure so the parent and + * child both bail rather than wedge. + */ + errno = EIO; + return -1; + } p += n; len -= n; } @@ -249,9 +259,19 @@ int fork_ipc_send_fd_table(int ipc_sock) if (fd_table[i].type == FD_CLOSED) continue; + /* Synthetic-fd types are filtered here; see fd_type_is_synthetic + * in syscall/internal.h for the rationale (kqueue cannot cross + * SCM_RIGHTS on macOS, per-class side tables are not serialized). + * The child sees these slots as FD_CLOSED and recreates them via + * the appropriate syscall. + */ + int t = fd_table[i].type; + if (fd_type_is_synthetic(t)) + continue; + int host_fd; bool was_duped = false; - if (fd_table[i].type != FD_STDIO) { + if (t != FD_STDIO) { int duped = dup(fd_table[i].host_fd); if (duped < 0) continue; @@ -315,8 +335,11 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g) return -1; } - if (num_fds == 0) + if (num_fds == 0) { + for (int fd = 0; fd < 3; fd++) + fd_mark_closed(fd); return 0; + } ipc_fd_entry_t *fd_entries = calloc(num_fds, sizeof(ipc_fd_entry_t)); if (!fd_entries) @@ -328,6 +351,16 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g) return -1; } + bool low_fd_present[3] = {false, false, false}; + for (uint32_t i = 0; i < num_fds; i++) { + int gfd = fd_entries[i].guest_fd; + if (RANGE_CHECK(gfd, 0, 3) && !fd_type_is_synthetic(fd_entries[i].type)) + low_fd_present[gfd] = true; + } + for (int fd = 0; fd < 3; fd++) + if (!low_fd_present[fd]) + fd_mark_closed(fd); + int *host_fds = calloc(num_fds, sizeof(int)); if (!host_fds) { free(fd_entries); @@ -361,15 +394,35 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g) if (fd_entries[i].type == FD_STDIO) { close(host_fds[i]); fd_table[gfd].linux_flags = fd_entries[i].linux_flags; + fd_refresh_urandom_bitmap(gfd); memcpy(fd_table[gfd].proc_path, fd_entries[i].proc_path, sizeof(fd_table[gfd].proc_path)); fd_table[gfd].seals = fd_entries[i].seals; + } else if (fd_type_is_synthetic(fd_entries[i].type)) { + /* Defense in depth: the 
parent's fork_ipc_send_fd_table + * already filters synthetic types out of the SCM_RIGHTS + * payload (see fd_type_is_synthetic in syscall/internal.h). + * If anything still arrives here, drop the inherited host + * fd and leave the slot FD_CLOSED so the child must + * recreate the fd via the appropriate syscall. + */ + log_debug( + "fork-child: dropping unexpected synthetic-type fd %d (type " + "%d)", + gfd, fd_entries[i].type); + close(host_fds[i]); + fd_mark_closed(gfd); + continue; } else { - fd_alloc_at(gfd, fd_entries[i].type, host_fds[i]); + void (*cleanup)(int) = fd_cleanup_for_type(fd_entries[i].type); + fd_alloc_at(gfd, fd_entries[i].type, host_fds[i], cleanup); fd_table[gfd].linux_flags = fd_entries[i].linux_flags; + fd_refresh_urandom_bitmap(gfd); memcpy(fd_table[gfd].proc_path, fd_entries[i].proc_path, sizeof(fd_table[gfd].proc_path)); fd_table[gfd].seals = fd_entries[i].seals; + if (fd_entries[i].type == FD_URANDOM) + urandom_fd_reset_cache(gfd); if (fd_entries[i].type != FD_DIR) continue; @@ -656,15 +709,25 @@ int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig) log_error("fork-child: failed to read region count"); return -1; } - if (num_guest_regions > GUEST_MAX_REGIONS) - num_guest_regions = GUEST_MAX_REGIONS; - if (num_guest_regions > 0 && + uint32_t recv_regions = num_guest_regions; + if (recv_regions > GUEST_MAX_REGIONS) + recv_regions = GUEST_MAX_REGIONS; + if (recv_regions > 0 && fork_ipc_read_all(ipc_fd, g->regions, - num_guest_regions * sizeof(guest_region_t)) < 0) { + recv_regions * sizeof(guest_region_t)) < 0) { log_error("fork-child: failed to read regions"); return -1; } - g->nregions = (int) num_guest_regions; + /* Drain any excess records the parent serialized beyond the local cap. + * Without this drain, the next read (num_preannounced) consumes stale + * region bytes and desynchronizes the rest of the IPC payload. Mirrors + * the preannounced-region drain below. 
+ */ + if (num_guest_regions > recv_regions && + fork_ipc_drain_bytes(ipc_fd, (num_guest_regions - recv_regions) * + sizeof(guest_region_t)) < 0) + return -1; + g->nregions = (int) recv_regions; uint32_t num_preannounced = 0; if (fork_ipc_read_all(ipc_fd, &num_preannounced, sizeof(num_preannounced)) < diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index 963cb61..3f8c4a5 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -29,6 +29,8 @@ #include "hvutil.h" #include "utils.h" +#include "core/shim-globals.h" + #include "runtime/forkipc.h" #include "runtime/fork-state.h" #include "runtime/futex.h" @@ -299,6 +301,20 @@ int fork_child_main(int ipc_fd, HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL1, regs.sp_el1)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL0, regs.tpidr_el0)); + /* TPIDR_EL1 is set by the host (never inherited from the parent's + * register snapshot) because it must point at the child's own + * shim_globals base in the child's IPA; shim_data_base happens to + * be the same value in both processes (layout derives from + * guest_size + ipa_bits which match across fork), but installing + * it explicitly keeps the child consistent with the bootstrap path. + * CONTEXTIDR_EL1 holds the per-vCPU tid (== child pid for the + * single-threaded child at this point). + */ + if (shim_globals_install_per_vcpu(vcpu, &g, hdr.child_pid) < 0) { + guest_destroy(&g); + return 1; + } + /* Enable MMU directly (page tables already in guest memory from IPC). * SCTLR must include MMU-enable (M), caches (C, I), RES1 bits, and EL0 * cache maintenance access (UCI, UCT) for JIT translators. @@ -333,6 +349,39 @@ int fork_child_main(int ipc_fd, */ thread_register_main(vcpu, vexit, hdr.child_pid, regs.sp_el1); + /* Re-publish identity into the child's shim-globals cache: the + * CoW / region copy inherits the parent's pid/uid values, and the + * shim's identity fast path would otherwise return the parent's + * pid to the child. 
Identity is now committed via the same path + * the bootstrap uses. + */ + shim_globals_init(&g); + shim_globals_set_trace_enabled(&g, verbose); + shim_globals_publish_pid(&g, hdr.child_pid, hdr.parent_pid); + shim_globals_publish_creds(&g, hdr.uid, hdr.euid, hdr.gid, hdr.egid); + /* Fresh entropy for the child. Linux's vDSO getrandom epoch-bumps + * across fork; here we just re-fill the ring from arc4random_buf + * which seeds from the host kernel's RNG, so parent and child do + * not share future urandom output. + */ + shim_globals_refill_urandom_ring(&g); + /* Register the singleton for the child's signal.c so its + * attention setters know which guest to update. + */ + signal_set_shim_globals_guest(&g); + /* Same for the fd-table hooks. Must precede any fd_alloc the + * child performs (the fd-table-restore step has already run + * above, but those slots are populated via direct memcpy of the + * parent's entries; subsequent open/dup/close in the child rely + * on this registration to keep the bitmap in sync). + */ + shim_globals_set_singleton(&g); + /* shim_globals_init above zeroed the urandom bitmap. Walk the inherited fd + * table and re-mark every readable FD_URANDOM slot so the shim's read fast + * path sees the correct state from the first syscall onward. + */ + shim_globals_rebuild_urandom_bitmap(); + /* Now that current_thread is set, apply signal state. This must happen * after thread_register_main() so the per-thread blocked mask and altstack * are properly restored to the thread entry. @@ -669,6 +718,14 @@ static void *thread_create_and_run(void *arg) WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TTBR0_EL1, tca->ttbr0)); WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_CPACR_EL1, tca->cpacr)); + /* All worker vCPUs in the process share the same shim_globals base + * (one VM per process); a fresh TPIDR_EL1 set is still required + * because HVF created this vCPU empty. CONTEXTIDR_EL1 holds the + * per-thread tid that the gettid shim fast path returns. 
+ */ + if (shim_globals_install_per_vcpu(vcpu, tca->guest, t->guest_tid) < 0) + goto startup_failed; + /* MMU already on, so set SCTLR with M=1 directly (page tables exist) */ WORKER_HV(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SCTLR_EL1, tca->sctlr)); @@ -980,6 +1037,11 @@ static void *vm_clone_thread_run(void *arg) HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SCTLR_EL1, tca->sctlr)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL1, tca->sp_el1)); HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SP_EL0, tca->child_stack)); + if (shim_globals_install_per_vcpu(vcpu, tca->guest, t->guest_tid) < 0) { + thread_deactivate(t); + free(tca); + return NULL; + } /* TLS pointer */ if (tca->flags & LINUX_CLONE_SETTLS) { @@ -1272,7 +1334,7 @@ int64_t sys_clone(hv_vcpu_t vcpu, * * Rosetta guests are excluded from CoW even when shm-backed: rosetta's * JIT state (TLS slabs, code caches, indirect-call tables, block lists) - * is process-local and corrupts when COW-shared. The legacy region-copy + * is process-local and corrupts when CoW-shared. The legacy region-copy * path preserves the parent's JIT state independently per child. */ bool use_shm = (g->shm_fd >= 0) && !g->is_rosetta; diff --git a/src/syscall/abi.h b/src/syscall/abi.h index eda9bc7..b87848e 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -364,6 +364,11 @@ typedef struct { #define LINUX_O_RDONLY 0x0000 #define LINUX_O_WRONLY 0x0001 #define LINUX_O_RDWR 0x0002 +/* O_ACCMODE is the mask covering O_RDONLY, O_WRONLY, O_RDWR. The urandom + * read fast-path bitmap and the dup-alias metadata both need this mask to + * isolate the access-mode bits from the other LINUX_O_* flags. + */ +#define LINUX_O_ACCMODE 0x0003 #define LINUX_O_CREAT 0x0040 #define LINUX_O_EXCL 0x0080 #define LINUX_O_NOCTTY 0x0100 @@ -639,6 +644,7 @@ typedef struct { #define FD_FUSE_DEV 14 #define FD_FUSE_FILE 15 #define FD_FUSE_DIR 16 +#define FD_URANDOM 17 #define FD_VIRTUAL_PATH_MAX 64 /* File sealing flags (F_SEAL_*) for memfd_create. 
Tracked per-FD. */ diff --git a/src/syscall/exec.c b/src/syscall/exec.c index cecfcb2..6d8ca2e 100644 --- a/src/syscall/exec.c +++ b/src/syscall/exec.c @@ -25,6 +25,7 @@ #include "core/bootstrap.h" #include "core/elf.h" #include "core/rosetta.h" +#include "core/shim-globals.h" #include "core/stack.h" #include "core/vdso.h" @@ -61,6 +62,37 @@ static void exec_sync_vcpu_regs(hv_vcpu_t vcpu) (void) vcpu_get_reg(vcpu, HV_REG_X8); } +static void exec_republish_shim_globals_or_die(hv_vcpu_t vcpu, + guest_t *g, + bool verbose) +{ + /* guest_reset zeros shim_data. Reinitialize the host-owned fast-path + * state before returning to either native aarch64 code or the Rosetta + * runtime, otherwise identity and urandom fast paths observe all-zero + * cache state after exec. + */ + shim_globals_init(g); + shim_globals_set_trace_enabled(g, verbose); + + /* TPIDR_EL1 carries the shim_globals base. Past PNR, failure leaves the + * replacement image unable to use the EL1 shim safely, so abort in the + * same shape as other post-reset fatal errors. + */ + if (shim_globals_install_tpidr(vcpu, g) < 0) { + log_fatal( + "execve failed after point of no return: " + "shim_globals_install_tpidr"); + exit(128); + } + + shim_globals_publish_pid(g, proc_get_pid(), proc_get_ppid()); + shim_globals_publish_creds(g, proc_get_uid(), proc_get_euid(), + proc_get_gid(), proc_get_egid()); + shim_globals_rebuild_urandom_bitmap(); + shim_globals_refill_urandom_ring(g); + shim_globals_recompute_attention(g); +} + /* Release the buffers and temporary host-side files that sys_execve allocates * before crossing the point of no return. Used by both the Rosetta and the * aarch64 success paths. 
@@ -728,6 +760,7 @@ int64_t sys_execve(hv_vcpu_t vcpu, path); exit(128); } + exec_republish_shim_globals_or_die(vcpu, g, verbose); /* I-cache for the (possibly re-mapped) rosetta segments has already * been invalidated inside rosetta_prepare; only the shim needs an @@ -760,8 +793,10 @@ int64_t sys_execve(hv_vcpu_t vcpu, } /* Load the executable image that was validated before guest_reset(). */ + uint64_t infra_lo = g->interp_base - INFRA_RESERVE; + uint64_t infra_hi = g->interp_base; if (elf_map_segments(&elf_info, path_host, g->host_base, g->guest_size, - elf_load_base) < 0) { + elf_load_base, infra_lo, infra_hi) < 0) { log_fatal( "execve failed after point of no return: " "failed to map ELF segments for %s", @@ -782,7 +817,8 @@ int64_t sys_execve(hv_vcpu_t vcpu, if (elf_info.interp_path[0] != '\0') { interp_base = g->interp_base; if (elf_map_segments(&interp_info, interp_resolved, g->host_base, - g->guest_size, interp_base) < 0) { + g->guest_size, interp_base, infra_lo, + infra_hi) < 0) { log_fatal( "execve failed after point of no return: " "failed to map interpreter segments"); @@ -851,13 +887,18 @@ int64_t sys_execve(hv_vcpu_t vcpu, .gpa_end = g->shim_base + shim_size, .perms = MEM_PERM_RX}; - /* EL1 exception handlers use this block for stack and scratch state. */ + /* EL1 exception handlers use this block for stack and scratch state. + * EL1-only so EL0 cannot read or store directly to the identity cache, + * urandom ring, or attention word that the shim fast paths consult. + * Matches bootstrap.c; if this regresses to plain RW, execve quietly + * defeats the protection on every new image. + */ if (nregions >= MAX_REGIONS) goto too_many_regions; regions[nregions++] = (mem_region_t) {.gpa_start = g->shim_data_base, .gpa_end = g->shim_data_base + BLOCK_2MIB, - .perms = MEM_PERM_RW}; + .perms = MEM_PERM_RW_EL1_ONLY}; /* The vDSO sits in the same 2MiB block as the shim. 
The page-table builder * splits the block into 4KiB L3 pages when its regions don't fully cover @@ -943,9 +984,12 @@ int64_t sys_execve(hv_vcpu_t vcpu, guest_region_add(g, g->shim_base, g->shim_base + shim_size, LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0, "[shim]"); + /* Report PROT_NONE for [shim-data] to match the EL1-only mapping (see + * matching bootstrap.c registration). EL0 dereferences fault, so user + * tooling reading /proc/self/maps should see the same access state. + */ guest_region_add(g, g->shim_data_base, g->shim_data_base + BLOCK_2MIB, - LINUX_PROT_READ | LINUX_PROT_WRITE, LINUX_MAP_PRIVATE, 0, - "[shim-data]"); + LINUX_PROT_NONE, LINUX_MAP_PRIVATE, 0, "[shim-data]"); for (int i = 0; i < elf_info.num_segments; i++) { guest_region_add(g, elf_info.segments[i].gpa + elf_load_base, elf_info.segments[i].gpa + elf_info.segments[i].memsz + @@ -991,6 +1035,7 @@ int64_t sys_execve(hv_vcpu_t vcpu, * omit sa_restorer. */ uint64_t exec_vdso = vdso_build(g); + exec_republish_shim_globals_or_die(vcpu, g, verbose); sp = build_linux_stack(g, g->stack_top, argc, argv_const, envp_const, &elf_info, elf_load_base, interp_base, exec_vdso, diff --git a/src/syscall/fd.c b/src/syscall/fd.c index c1f828f..f06b0d2 100644 --- a/src/syscall/fd.c +++ b/src/syscall/fd.c @@ -104,6 +104,7 @@ void timerfd_init(void) { for (int i = 0; i < TIMERFD_MAX; i++) timerfd_state[i].guest_fd = -1; + fd_register_cleanup(FD_TIMERFD, timerfd_close); } static int timerfd_find(int guest_fd) @@ -514,10 +515,20 @@ static void timerfd_close(int guest_fd) #define LINUX_EFD_NONBLOCK 0x800 /* Same as O_NONBLOCK */ #define LINUX_EFD_SEMAPHORE 1 -/* Per-eventfd state */ +/* Per-eventfd state. The slot is shared across guest_fds that point at it (via + * dup/dup2/fcntl F_DUPFD), matching the Linux contract that dup'd eventfd fds + * share the same kernel object. eventfd_owner[gfd] maps a guest_fd to its slot; + * multiple guest_fds can map to the same slot. 
The slot owns its own read end + * for readiness/drain/blocking operations so it does not depend on any one + * guest fd remaining open. The slot is freed when refcount drops to zero. The + * slot's guest_fd field is retained for sfd_alloc_slot's + * "free if guest_fd == -1" convention and tracks the most recently allocated + * primary owner. + */ #define EVENTFD_MAX 32 static struct { - int guest_fd; /* Guest fd (-1 if unused) */ + int guest_fd; /* Primary guest fd, -1 when slot is free */ + int refcount; /* Number of guest_fds bound to this slot */ int pipe_rd; /* Read end of self-pipe (for poll/epoll readiness) */ int pipe_wr; /* Write end of self-pipe */ uint64_t counter; /* Accumulated event counter */ @@ -525,16 +536,22 @@ static struct { int nonblock; /* O_NONBLOCK */ } eventfd_state[EVENTFD_MAX]; +static int eventfd_owner[FD_TABLE_SIZE]; /* guest_fd -> slot, or -1 */ + void eventfd_init(void) { for (int i = 0; i < EVENTFD_MAX; i++) eventfd_state[i].guest_fd = -1; + for (int i = 0; i < FD_TABLE_SIZE; i++) + eventfd_owner[i] = -1; + fd_register_cleanup(FD_EVENTFD, eventfd_close); } static int eventfd_find(int guest_fd) { - return sfd_find_slot(eventfd_state, EVENTFD_MAX, sizeof(eventfd_state[0]), - guest_fd); + if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE)) + return -1; + return eventfd_owner[guest_fd]; } static int eventfd_slot_alloc(void) @@ -542,6 +559,19 @@ static int eventfd_slot_alloc(void) return sfd_alloc_slot(eventfd_state, EVENTFD_MAX, sizeof(eventfd_state[0])); } +static void eventfd_release_ref_locked(int slot) +{ + if (--eventfd_state[slot].refcount <= 0) { + close(eventfd_state[slot].pipe_rd); + close(eventfd_state[slot].pipe_wr); + eventfd_state[slot].guest_fd = -1; + eventfd_state[slot].counter = 0; + eventfd_state[slot].refcount = 0; + eventfd_state[slot].pipe_rd = -1; + eventfd_state[slot].pipe_wr = -1; + } +} + int64_t sys_eventfd2(unsigned int initval, int flags) { if (flags & ~(LINUX_EFD_CLOEXEC | LINUX_EFD_NONBLOCK | LINUX_EFD_SEMAPHORE)) 
@@ -564,9 +594,22 @@ int64_t sys_eventfd2(unsigned int initval, int flags) return linux_errno(); } + int state_rd = dup(pipefd[0]); + if (state_rd < 0 || fd_set_nonblock(state_rd) < 0 || + fd_set_cloexec(state_rd) < 0) { + int saved_errno = errno; + if (state_rd >= 0) + close(state_rd); + close(pipefd[0]); + close(pipefd[1]); + errno = saved_errno; + return linux_errno(); + } + /* Allocate guest fd: use read end as the host fd so epoll/poll sees it */ int gfd = fd_alloc(FD_EVENTFD, pipefd[0], eventfd_close); if (gfd < 0) { + close(state_rd); close(pipefd[0]); close(pipefd[1]); return -LINUX_EMFILE; @@ -577,17 +620,20 @@ int64_t sys_eventfd2(unsigned int initval, int flags) if (slot < 0) { pthread_mutex_unlock(&sfd_lock); fd_mark_closed(gfd); + close(state_rd); close(pipefd[0]); close(pipefd[1]); return -LINUX_ENOMEM; } eventfd_state[slot].guest_fd = gfd; - eventfd_state[slot].pipe_rd = pipefd[0]; + eventfd_state[slot].refcount = 1; + eventfd_state[slot].pipe_rd = state_rd; eventfd_state[slot].pipe_wr = pipefd[1]; eventfd_state[slot].counter = (uint64_t) initval; eventfd_state[slot].semaphore = (flags & LINUX_EFD_SEMAPHORE) ? 1 : 0; eventfd_state[slot].nonblock = (flags & LINUX_EFD_NONBLOCK) ? 1 : 0; + eventfd_owner[gfd] = slot; pthread_mutex_unlock(&sfd_lock); fd_table[gfd].linux_flags = @@ -610,14 +656,117 @@ static void eventfd_close(int guest_fd) pthread_mutex_lock(&sfd_lock); int slot = eventfd_find(guest_fd); if (slot >= 0) { - close(eventfd_state[slot].pipe_wr); - /* pipe_rd is closed by sys_close() as host_fd */ - eventfd_state[slot].guest_fd = -1; - eventfd_state[slot].counter = 0; + eventfd_owner[guest_fd] = -1; + eventfd_release_ref_locked(slot); } pthread_mutex_unlock(&sfd_lock); } +/* Bind an additional guest_fd to the same slot as src_fd, sharing the + * counter and pipe state. Two races to defeat: + * + * - Source identity. duplicate_guest_fd() snapshots src_fd under + * fd_lock, releases it, then calls us. 
Between those points src_fd + * could be closed and rebound to a different eventfd. We carry the + * caller's snapshot of fd_table[src_fd].host_fd as src_host_fd and verify + * under fd_lock + sfd_lock that the source fd still has that host fd and + * still maps to a live eventfd slot. + * + * - Destination close. fd_alloc_*_relaxed publishes the new guest_fd + * with eventfd_close as cleanup before we install the owner mapping. + * A racing close would run eventfd_close, see owner == -1, skip the + * refcount decrement, and leak the slot. We defeat this by reserving a + * slot ref before publishing the destination, then holding fd_lock + + * sfd_lock together while we verify fd_table[new] is still FD_EVENTFD with + * the host_fd we allocated and set eventfd_owner. Any close that already + * ran is observed here as FD_CLOSED, and we abandon the bind cleanly with + * no leak. + */ +int eventfd_dup_fd(int src_fd, + int src_host_fd, + int min_guest_fd, + int fixed_guest_fd, + bool fixed_slot, + int linux_flags) +{ + /* Pin the source under fd_lock + sfd_lock and dup the slot-owned + * readiness fd. The slot fd is independent of any guest alias, so closing + * the source later cannot invalidate eventfd_state[slot].pipe_rd. + */ + pthread_mutex_lock(&fd_lock); + pthread_mutex_lock(&sfd_lock); + int slot = eventfd_find(src_fd); + if (slot < 0 || fd_table[src_fd].type != FD_EVENTFD || + fd_table[src_fd].host_fd != src_host_fd || + eventfd_state[slot].refcount <= 0) { + pthread_mutex_unlock(&sfd_lock); + pthread_mutex_unlock(&fd_lock); + errno = EBADF; + return -1; + } + eventfd_state[slot].refcount++; + int new_host_fd = dup(eventfd_state[slot].pipe_rd); + int original_pipe_rd = eventfd_state[slot].pipe_rd; + if (new_host_fd < 0) + eventfd_release_ref_locked(slot); + pthread_mutex_unlock(&sfd_lock); + pthread_mutex_unlock(&fd_lock); + if (new_host_fd < 0) + return -1; + + /* Publish the destination fd with eventfd_close as cleanup. 
The + * eventfd_owner mapping is still -1, so a racing close here observes + * owner == -1 and does nothing; we detect that below. + */ + int new_guest_fd = fixed_slot + ? fd_alloc_at_relaxed(fixed_guest_fd, FD_EVENTFD, + new_host_fd, eventfd_close) + : fd_alloc_from_relaxed(min_guest_fd, FD_EVENTFD, + new_host_fd, eventfd_close); + if (new_guest_fd < 0) { + close(new_host_fd); + pthread_mutex_lock(&sfd_lock); + eventfd_release_ref_locked(slot); + pthread_mutex_unlock(&sfd_lock); + if (fixed_slot) + errno = EBADF; + return -1; + } + + /* Commit the bind under both locks in the documented order + * (fd_lock then sfd_lock). If a close already ran, fd_table[new].type + * is FD_CLOSED and we just bail with -EBADF; the host_fd is already + * gone via sys_close. Otherwise verify the source slot is still + * alive and unchanged, then install owner for the reserved ref. + */ + pthread_mutex_lock(&fd_lock); + pthread_mutex_lock(&sfd_lock); + if (fd_table[new_guest_fd].type != FD_EVENTFD || + fd_table[new_guest_fd].host_fd != new_host_fd || + eventfd_state[slot].refcount <= 0 || + eventfd_state[slot].pipe_rd != original_pipe_rd) { + pthread_mutex_unlock(&sfd_lock); + pthread_mutex_unlock(&fd_lock); + /* If the destination is still open but the source went away, + * tear it down. (If the destination already closed itself, the + * snapshot below sees FD_CLOSED and is a no-op.) + */ + fd_entry_t snap; + if (fd_snapshot_and_close(new_guest_fd, &snap)) + fd_cleanup_entry(new_guest_fd, &snap); + pthread_mutex_lock(&sfd_lock); + eventfd_release_ref_locked(slot); + pthread_mutex_unlock(&sfd_lock); + errno = EBADF; + return -1; + } + eventfd_owner[new_guest_fd] = slot; + fd_table[new_guest_fd].linux_flags = linux_flags; + pthread_mutex_unlock(&sfd_lock); + pthread_mutex_unlock(&fd_lock); + return new_guest_fd; +} + /* Read from eventfd: return 8-byte counter value, then reset to 0. * In EFD_SEMAPHORE mode, return 1 and decrement counter by 1. 
*/ @@ -657,8 +806,12 @@ int64_t eventfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count) return linux_errno(); pthread_mutex_lock(&sfd_lock); - /* Re-validate: slot may have been freed by eventfd_close() */ - if (eventfd_state[slot].guest_fd != guest_fd) { + /* Re-validate via the owner table, not eventfd_state[slot].guest_fd: + * dup'd aliases bind multiple guest_fds to the same slot, so a + * legitimate caller's guest_fd may not equal the primary owner. + */ + if (eventfd_owner[guest_fd] != slot || + eventfd_state[slot].refcount <= 0) { pthread_mutex_unlock(&sfd_lock); return -LINUX_EBADF; } @@ -809,6 +962,7 @@ void signalfd_init(void) { for (int i = 0; i < SIGNALFD_MAX; i++) signalfd_state[i].guest_fd = -1; + fd_register_cleanup(FD_SIGNALFD, signalfd_close); } static int signalfd_find(int guest_fd) diff --git a/src/syscall/fd.h b/src/syscall/fd.h index e087ed4..faaf958 100644 --- a/src/syscall/fd.h +++ b/src/syscall/fd.h @@ -33,6 +33,21 @@ int64_t sys_timerfd_gettime(guest_t *g, int fd, uint64_t curr_value_gva); /* eventfd (emulated via pipe + counter) */ int64_t sys_eventfd2(unsigned int initval, int flags); +/* Duplicate an eventfd into a new guest_fd slot, sharing the counter and + * pipe state with src_fd. Mirrors the Linux contract that dup'd eventfds + * share the same underlying kernel object. src_host_fd must be the host + * fd snapshotted from fd_table[src_fd].host_fd by the caller; the + * implementation uses it to verify under fd_lock + sfd_lock that the source + * fd still refers to the same live eventfd between the caller's snapshot and + * the dup commit. Returns the new guest_fd or -1 with errno set. 
+ */ +int eventfd_dup_fd(int src_fd, + int src_host_fd, + int min_guest_fd, + int fixed_guest_fd, + bool fixed_slot, + int linux_flags); + /* signalfd (emulated via synthetic signal reads) */ int64_t sys_signalfd4(guest_t *g, int fd, diff --git a/src/syscall/fdtable.c b/src/syscall/fdtable.c index 5455f41..ff62307 100644 --- a/src/syscall/fdtable.c +++ b/src/syscall/fdtable.c @@ -20,6 +20,7 @@ #include "utils.h" +#include "core/shim-globals.h" #include "syscall/abi.h" #include "syscall/internal.h" @@ -82,6 +83,33 @@ static inline void fd_init_entry(int fd, fd_table[fd].seals = 0; sock_opt_clear(&fd_table[fd]); fd_table[fd].cleanup = cleanup; + /* Start conservative. Callers that set linux_flags after allocation + * republish the readable-urandom state once the access mode is known. + */ + shim_globals_mark_urandom_fd(fd, false); +} + +void fd_refresh_urandom_bitmap(int fd) +{ + if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) + return; + + /* Hold fd_lock across both the read of (type, linux_flags) AND the + * shim_globals bitmap publish. Dropping the lock before the publish + * would let a concurrent sys_close flip the slot to FD_CLOSED in + * the gap; the subsequent mark would then stomp a stale 'readable + * urandom' bit onto a freed slot, and the EL1 fast path honors that + * bitmap. shim_globals_mark_urandom_fd is itself atomic on the + * bitmap word, but atomicity is meaningless without an in-lock + * source-to-publish window. + */ + pthread_mutex_lock(&fd_lock); + int type = fd_table[fd].type; + int linux_flags = fd_table[fd].linux_flags; + bool readable_urandom = + type == FD_URANDOM && (linux_flags & LINUX_O_ACCMODE) != LINUX_O_WRONLY; + shim_globals_mark_urandom_fd(fd, readable_urandom); + pthread_mutex_unlock(&fd_lock); } /* Find the lowest free FD >= minfd using the bitmap. @@ -169,26 +197,29 @@ int fd_alloc(int type, int host_fd, void (*cleanup)(int)) /* Allocate the lowest available FD >= minfd. 
Returns -1 if none available * or RLIMIT_NOFILE would be exceeded. */ -int fd_alloc_from(int minfd, int type, int host_fd) +int fd_alloc_from(int minfd, int type, int host_fd, void (*cleanup)(int)) { pthread_mutex_lock(&fd_lock); - int fd = fd_alloc_locked(minfd, type, host_fd, NULL); + int fd = fd_alloc_locked(minfd, type, host_fd, cleanup); pthread_mutex_unlock(&fd_lock); return fd; } -int fd_alloc_from_relaxed(int minfd, int type, int host_fd) +int fd_alloc_from_relaxed(int minfd, + int type, + int host_fd, + void (*cleanup)(int)) { if (!thread_is_single_active()) - return fd_alloc_from(minfd, type, host_fd); - return fd_alloc_locked(minfd, type, host_fd, NULL); + return fd_alloc_from(minfd, type, host_fd, cleanup); + return fd_alloc_locked(minfd, type, host_fd, cleanup); } /* Allocate a specific FD slot. Enforces RLIMIT_NOFILE. Properly cleans up any * existing entry (including DIR* for directory FDs) before overwriting. Returns * -1 if out of range. */ -int fd_alloc_at(int fd, int type, int host_fd) +int fd_alloc_at(int fd, int type, int host_fd, void (*cleanup)(int)) { if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) return -1; @@ -204,7 +235,7 @@ int fd_alloc_at(int fd, int type, int host_fd) pthread_mutex_lock(&fd_lock); if (fd_table[fd].type != FD_CLOSED) old = fd_table[fd]; - fd_init_entry(fd, type, host_fd, NULL); + fd_init_entry(fd, type, host_fd, cleanup); pthread_mutex_unlock(&fd_lock); /* Clean up old resources outside fd_lock */ @@ -214,19 +245,19 @@ int fd_alloc_at(int fd, int type, int host_fd) return fd; } -int fd_alloc_at_relaxed(int fd, int type, int host_fd) +int fd_alloc_at_relaxed(int fd, int type, int host_fd, void (*cleanup)(int)) { if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) return -1; if (fd >= rlimit_nofile_cur) return -1; if (!thread_is_single_active()) - return fd_alloc_at(fd, type, host_fd); + return fd_alloc_at(fd, type, host_fd, cleanup); if (fd_table[fd].type != FD_CLOSED) - return fd_alloc_at(fd, type, host_fd); + return fd_alloc_at(fd, type, 
host_fd, cleanup); - fd_init_entry(fd, type, host_fd, NULL); + fd_init_entry(fd, type, host_fd, cleanup); return fd; } @@ -238,6 +269,11 @@ int fd_alloc_at_relaxed(int fd, int type, int host_fd) */ void fd_mark_closed_unlocked(int fd) { + /* Clear before publishing FD_CLOSED/free. The EL1 urandom read fast path + * intentionally avoids fd_lock, so it must not observe a stale urandom + * bit after this slot has become invalid or reusable. + */ + shim_globals_mark_urandom_fd(fd, false); fd_table[fd].type = FD_CLOSED; fd_table[fd].host_fd = -1; fd_table[fd].dir = NULL; @@ -334,6 +370,53 @@ bool fd_snapshot(int guest_fd, fd_entry_t *out) return ok; } +int fd_snapshot_and_dup(int guest_fd, fd_entry_t *out) +{ + out->type = FD_CLOSED; + if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE)) + return -1; + pthread_mutex_lock(&fd_lock); + if (!fd_snapshot_locked(guest_fd, out, false)) { + pthread_mutex_unlock(&fd_lock); + return -1; + } + int host = (out->host_fd >= 0) ? dup(out->host_fd) : -1; + pthread_mutex_unlock(&fd_lock); + return host; +} + +int fd_get_type(int guest_fd) +{ + if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE)) + return FD_CLOSED; + pthread_mutex_lock(&fd_lock); + int type = fd_table[guest_fd].type; + pthread_mutex_unlock(&fd_lock); + return type; +} + +/* Sized to cover all FD_* constants in abi.h plus a small headroom. Indexed + * by type. Each slot defaults to NULL (no per-type cleanup). Modules that + * own a type call fd_register_cleanup() at init time; dup and fork-restore + * paths read back the binding via fd_cleanup_for_type(). 
+ */ +#define FD_TYPE_REGISTRY_SIZE 32 +static void (*fd_type_cleanup[FD_TYPE_REGISTRY_SIZE])(int); + +void fd_register_cleanup(int type, void (*cleanup)(int)) +{ + if (type < 0 || type >= FD_TYPE_REGISTRY_SIZE) + return; + fd_type_cleanup[type] = cleanup; +} + +void (*fd_cleanup_for_type(int type))(int) +{ + if (type < 0 || type >= FD_TYPE_REGISTRY_SIZE) + return NULL; + return fd_type_cleanup[type]; +} + /* Look up a guest FD and return a dup'd host fd that the caller owns. * The dup is performed under fd_lock so that close() on another thread * cannot invalidate the host fd between lookup and dup. Caller must diff --git a/src/syscall/fs.c b/src/syscall/fs.c index ce951eb..45e1ef0 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -24,9 +24,12 @@ #include "debug/log.h" #include "utils.h" +#include "core/shim-globals.h" /* shim_globals_mark_urandom_fd */ + #include "runtime/procemu.h" #include "syscall/abi.h" +#include "syscall/fd.h" /* eventfd_dup_fd */ #include "syscall/fuse.h" #include "syscall/fs.h" #include "syscall/internal.h" @@ -62,6 +65,16 @@ static int opened_fd_type(int host_fd, int linux_flags) return FD_REGULAR; } +static int intercepted_fd_type(const char *path, int host_fd, int linux_flags) +{ + int type = opened_fd_type(host_fd, linux_flags); + if (type < 0) + return type; + if (type == FD_REGULAR && path && !strcmp(path, "/dev/urandom")) + return FD_URANDOM; + return type; +} + static const char *proc_virtual_dir_path(const char *path, char *buf, size_t bufsz); @@ -168,16 +181,11 @@ static const char *proc_virtual_dir_path(const char *path, return virt; } -static int dup_fd_type(int guest_fd) -{ - return fd_table[guest_fd].type == FD_STDIO ? 
FD_REGULAR - : fd_table[guest_fd].type; -} - static int fd_alloc_opened_host(int host_fd, int type, int linux_flags, - int min_guest_fd) + int min_guest_fd, + void (*cleanup)(int)) { DIR *dir = NULL; @@ -193,9 +201,10 @@ static int fd_alloc_opened_host(int host_fd, } } - int guest_fd = min_guest_fd >= 0 - ? fd_alloc_from_relaxed(min_guest_fd, type, host_fd) - : fd_alloc_from_relaxed(0, type, host_fd); + int guest_fd = + min_guest_fd >= 0 + ? fd_alloc_from_relaxed(min_guest_fd, type, host_fd, cleanup) + : fd_alloc_from_relaxed(0, type, host_fd, cleanup); if (guest_fd < 0) { int saved_errno = errno; if (dir) @@ -204,9 +213,35 @@ static int fd_alloc_opened_host(int host_fd, return -1; } - fd_table[guest_fd].linux_flags = linux_flags; - if (dir) - fd_table[guest_fd].dir = dir; + /* Publish linux_flags, dir, and the urandom bitmap bit atomically + * with respect to the slot's identity. fd_alloc_*_relaxed drops + * fd_lock before returning, so a sibling vCPU's pathological + * close(guest_fd) + open() could reuse the slot between alloc and + * the metadata install below. Re-acquire fd_lock and verify the + * (type, host_fd) tuple still matches what just got allocated; + * if it does not, the slot belongs to a different file now and + * any install would clobber the sibling's entry. The sibling's + * close path already cleaned up our host_fd via fd_cleanup_entry, + * so this side only owns dir, which gets closed below. 
+ */ + bool installed = false; + pthread_mutex_lock(&fd_lock); + if (fd_table[guest_fd].type == type && + fd_table[guest_fd].host_fd == host_fd) { + fd_table[guest_fd].linux_flags = linux_flags; + if (dir) + fd_table[guest_fd].dir = dir; + bool readable_urandom = + type == FD_URANDOM && + (linux_flags & LINUX_O_ACCMODE) != LINUX_O_WRONLY; + shim_globals_mark_urandom_fd(guest_fd, readable_urandom); + installed = true; + } + pthread_mutex_unlock(&fd_lock); + + if (!installed && dir) + closedir(dir); + return guest_fd; } @@ -249,7 +284,7 @@ int64_t sys_openat_path(guest_t *g, return linux_errno(); } int guest_fd = - fd_alloc_opened_host(sidecar_fd, type, linux_flags, -1); + fd_alloc_opened_host(sidecar_fd, type, linux_flags, -1, NULL); if (guest_fd < 0) { close_keep_errno(sidecar_fd); return linux_errno(); @@ -278,7 +313,8 @@ int64_t sys_openat_path(guest_t *g, close_keep_errno(host_fd); return linux_errno(); } - int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1); + int guest_fd = + fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL); if (guest_fd < 0) { close_keep_errno(host_fd); return linux_errno(); @@ -303,15 +339,17 @@ int64_t sys_openat_path(guest_t *g, * /proc files use fd_alloc_from(128) to avoid races with * concurrent GC finalizers that may close stale low-numbered fds. */ - int type = opened_fd_type(intercepted, linux_flags); + int type = intercepted_fd_type(tx.intercept_path, intercepted, + linux_flags); if (type < 0) { close_keep_errno(intercepted); return linux_errno(); } int min_guest_fd = (!strncmp(tx.intercept_path, "/dev/", 5)) ? 
-1 : 128; - int guest_fd = fd_alloc_opened_host(intercepted, type, linux_flags, - min_guest_fd); + int guest_fd = + fd_alloc_opened_host(intercepted, type, linux_flags, + min_guest_fd, fd_cleanup_for_type(type)); if (guest_fd < 0) { close_keep_errno(intercepted); return linux_errno(); @@ -336,7 +374,8 @@ int64_t sys_openat_path(guest_t *g, close_keep_errno(host_fd); return linux_errno(); } - int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1); + int guest_fd = + fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL); if (guest_fd < 0) { close_keep_errno(host_fd); return linux_errno(); @@ -358,7 +397,7 @@ int64_t sys_openat_path(guest_t *g, close_keep_errno(host_fd); return linux_errno(); } - int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1); + int guest_fd = fd_alloc_opened_host(host_fd, type, linux_flags, -1, NULL); if (guest_fd < 0) { close_keep_errno(host_fd); return linux_errno(); @@ -410,41 +449,82 @@ int64_t sys_close(int fd) /* dup/fcntl. */ -static int clone_dir_stream_if_needed(int src_fd, int dst_fd, int dst_host_fd) +static void discard_allocated_fd(int guest_fd) { - if (fd_table[src_fd].type != FD_DIR) - return 0; + fd_entry_t snap; + if (fd_snapshot_and_close(guest_fd, &snap)) + fd_cleanup_entry(guest_fd, &snap); +} - int dir_fd = dup(dst_host_fd); - if (dir_fd < 0) - return -1; +/* Open a DIR stream over a dup of dst_host_fd if the source was an + * FD_DIR. Returns NULL on success-but-no-stream-needed (non-dir source) + * or on dup/fdopendir failure with errno preserved. Pulled out of the + * critical section in install_fd_alias_metadata_atomic because dup and + * fdopendir are slow syscalls that must not hold fd_lock. 
+ */ +static DIR *clone_dir_stream(const fd_entry_t *src_snap, + int dst_host_fd, + bool *out_failed) +{ + *out_failed = false; + if (src_snap->type != FD_DIR) + return NULL; + int dir_fd = dup(dst_host_fd); + if (dir_fd < 0) { + *out_failed = true; + return NULL; + } DIR *dir = fdopendir(dir_fd); if (!dir) { + int saved_errno = errno; close(dir_fd); - return -1; + errno = saved_errno; + *out_failed = true; + return NULL; } - - fd_table[dst_fd].dir = dir; - return 0; + return dir; } -static void discard_allocated_fd(int guest_fd) +/* Install dup-alias metadata atomically with the slot identity. Uses + * the (type, host_fd) tuple as proof that the slot still belongs to + * the in-flight duplicate_guest_fd call; a sibling vCPU's pathological + * close + open between the relaxed allocator's lock release and this + * call could otherwise clobber the sibling's freshly-installed entry. + * Returns true on successful install, false if the slot was + * reallocated (caller must closedir any cloned dir to avoid a leak). 
+ */ +static bool install_fd_alias_metadata_atomic(int dst_fd, + int expected_type, + int expected_host_fd, + const fd_entry_t *src_snap, + int linux_flags, + DIR *dir) { - fd_entry_t snap; - if (fd_snapshot_and_close(guest_fd, &snap)) - fd_cleanup_entry(guest_fd, &snap); -} + int preserved_flags = + src_snap->linux_flags & + (LINUX_O_ACCMODE | LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | + LINUX_O_DIRECT | LINUX_O_LARGEFILE); + int final_flags = preserved_flags | linux_flags; -static void copy_fd_alias_metadata(int src_fd, int dst_fd, int linux_flags) -{ - int preserved_flags = fd_table[src_fd].linux_flags & - (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | - LINUX_O_DIRECT | LINUX_O_LARGEFILE); - fd_table[dst_fd].linux_flags = preserved_flags | linux_flags; - fd_table[dst_fd].seals = fd_table[src_fd].seals; - memcpy(fd_table[dst_fd].proc_path, fd_table[src_fd].proc_path, - sizeof(fd_table[dst_fd].proc_path)); + bool installed = false; + pthread_mutex_lock(&fd_lock); + if (fd_table[dst_fd].type == expected_type && + fd_table[dst_fd].host_fd == expected_host_fd) { + fd_table[dst_fd].linux_flags = final_flags; + fd_table[dst_fd].seals = src_snap->seals; + memcpy(fd_table[dst_fd].proc_path, src_snap->proc_path, + sizeof(fd_table[dst_fd].proc_path)); + if (dir) + fd_table[dst_fd].dir = dir; + bool readable_urandom = + expected_type == FD_URANDOM && + (final_flags & LINUX_O_ACCMODE) != LINUX_O_WRONLY; + shim_globals_mark_urandom_fd(dst_fd, readable_urandom); + installed = true; + } + pthread_mutex_unlock(&fd_lock); + return installed; } /* Duplicate a guest fd into either the next free slot >= min_guest_fd or a @@ -457,28 +537,44 @@ static int duplicate_guest_fd(int src_fd, bool fixed_slot, int linux_flags) { - if (RANGE_CHECK(src_fd, 0, FD_TABLE_SIZE)) { - int t = fd_table[src_fd].type; - if (t == FD_FUSE_DEV || t == FD_FUSE_FILE || t == FD_FUSE_DIR) - return fuse_dup_fd(src_fd, min_guest_fd, fixed_guest_fd, fixed_slot, - linux_flags); - } - - 
host_fd_ref_t host_ref; - if (host_fd_ref_open(src_fd, &host_ref) < 0) { + /* Snapshot the source entry and dup its host fd in a single fd_lock + * critical section so the type, host fd, and metadata captured here + * cannot drift apart under a racing close + reopen. + */ + fd_entry_t src_snap; + int new_host_fd = fd_snapshot_and_dup(src_fd, &src_snap); + if (new_host_fd < 0 && src_snap.type == FD_CLOSED) { errno = EBADF; return -1; } - - int new_type = dup_fd_type(src_fd); - int new_host_fd = dup(host_ref.fd); - host_fd_ref_close(&host_ref); + if (src_snap.type == FD_FUSE_DEV || src_snap.type == FD_FUSE_FILE || + src_snap.type == FD_FUSE_DIR) { + if (new_host_fd >= 0) + close_keep_errno(new_host_fd); + return fuse_dup_fd(src_fd, min_guest_fd, fixed_guest_fd, fixed_slot, + linux_flags); + } + /* eventfd dup must share the underlying counter and pipe state across + * the source and destination fds (Linux contract). Pass src_snap's + * host_fd through so eventfd_dup_fd can verify the source fd still + * refers to the same live eventfd between the snapshot here and the + * bind there. + */ + if (src_snap.type == FD_EVENTFD) { + if (new_host_fd >= 0) + close_keep_errno(new_host_fd); + return eventfd_dup_fd(src_fd, src_snap.host_fd, min_guest_fd, + fixed_guest_fd, fixed_slot, linux_flags); + } if (new_host_fd < 0) return -1; - int guest_fd = - fixed_slot ? fd_alloc_at_relaxed(fixed_guest_fd, new_type, new_host_fd) - : fd_alloc_from_relaxed(min_guest_fd, new_type, new_host_fd); + int new_type = (src_snap.type == FD_STDIO) ? FD_REGULAR : src_snap.type; + void (*cleanup)(int) = fd_cleanup_for_type(new_type); + int guest_fd = fixed_slot ? 
fd_alloc_at_relaxed(fixed_guest_fd, new_type, + new_host_fd, cleanup) + : fd_alloc_from_relaxed(min_guest_fd, new_type, + new_host_fd, cleanup); if (guest_fd < 0) { if (fixed_slot) errno = EBADF; @@ -486,14 +582,31 @@ static int duplicate_guest_fd(int src_fd, return -1; } - copy_fd_alias_metadata(src_fd, guest_fd, linux_flags); - if (clone_dir_stream_if_needed(src_fd, guest_fd, new_host_fd) < 0) { + /* Clone the DIR stream outside fd_lock (dup + fdopendir would block + * other fd ops), then install everything atomically under fd_lock + * with a tuple verification so a sibling close + reopen on the same + * guest_fd cannot make this install land on an unrelated slot. + */ + bool dir_clone_failed = false; + DIR *dir = clone_dir_stream(&src_snap, new_host_fd, &dir_clone_failed); + if (dir_clone_failed) { int saved_errno = errno; discard_allocated_fd(guest_fd); errno = saved_errno; return -1; } + if (!install_fd_alias_metadata_atomic(guest_fd, new_type, new_host_fd, + &src_snap, linux_flags, dir)) { + /* Slot was reallocated by a sibling while metadata install was + * pending; the sibling's close path already cleaned up new_host_fd + * via fd_cleanup_entry, so the only resource this side still + * owns is the cloned DIR stream. 
+ */ + if (dir) + closedir(dir); + } + return guest_fd; } @@ -600,7 +713,7 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) return linux_errno(); int linux_fl = mac_to_linux_status_flags(mac_fl); if (snap.type == FD_REGULAR || snap.type == FD_DIR || - snap.type == FD_PATH) + snap.type == FD_PATH || snap.type == FD_URANDOM) linux_fl = (linux_fl & ~O_ACCMODE) | (snap.linux_flags & 3); linux_fl |= snap.linux_flags & (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | diff --git a/src/syscall/fuse.c b/src/syscall/fuse.c index ae248e1..157191a 100644 --- a/src/syscall/fuse.c +++ b/src/syscall/fuse.c @@ -1281,6 +1281,9 @@ void fuse_init(void) memset(fuse_file_bindings, 0, sizeof(fuse_file_bindings)); fuse_next_mount_id = 100; pthread_mutex_unlock(&fuse_lock); + fd_register_cleanup(FD_FUSE_DEV, fuse_fd_cleanup); + fd_register_cleanup(FD_FUSE_FILE, fuse_fd_cleanup); + fd_register_cleanup(FD_FUSE_DIR, fuse_fd_cleanup); } int fuse_proc_open(int linux_flags) @@ -2540,9 +2543,15 @@ int fuse_dup_fd(int src_fd, return -1; } - int guest_fd = fixed_slot - ? fd_alloc_at_relaxed(fixed_guest_fd, snap.type, -1) - : fd_alloc_from_relaxed(min_guest_fd, snap.type, -1); + /* Install cleanup atomically with the type. Without this, a racing + * close between fd_alloc_*_relaxed publishing the slot and the later + * fd_table[guest_fd].cleanup assignment would skip fuse_fd_cleanup + * and leak the session or file ref. + */ + int guest_fd = fixed_slot ? 
fd_alloc_at_relaxed(fixed_guest_fd, snap.type, + -1, fuse_fd_cleanup) + : fd_alloc_from_relaxed(min_guest_fd, snap.type, + -1, fuse_fd_cleanup); if (guest_fd < 0) { if (fixed_slot) errno = EBADF; @@ -2588,7 +2597,6 @@ int fuse_dup_fd(int src_fd, (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE); fd_table[guest_fd].linux_flags = preserved_flags | linux_flags; - fd_table[guest_fd].cleanup = fuse_fd_cleanup; pthread_mutex_unlock(&fuse_lock); return guest_fd; } diff --git a/src/syscall/inotify.c b/src/syscall/inotify.c index 7513e5c..d9b54dd 100644 --- a/src/syscall/inotify.c +++ b/src/syscall/inotify.c @@ -111,6 +111,7 @@ void inotify_init(void) { for (int i = 0; i < INOTIFY_MAX; i++) inotify_state[i].guest_fd = -1; + fd_register_cleanup(FD_INOTIFY, inotify_close); } static int inotify_find(int guest_fd) diff --git a/src/syscall/internal.h b/src/syscall/internal.h index 2760ce9..8534e6a 100644 --- a/src/syscall/internal.h +++ b/src/syscall/internal.h @@ -59,32 +59,84 @@ void fdtable_init(void); */ int fd_alloc(int type, int host_fd, void (*cleanup)(int)); -/* Allocate the lowest available FD >= minfd. Returns -1 if none available. */ -int fd_alloc_from(int minfd, int type, int host_fd); +/* Allocate the lowest available FD >= minfd. Returns -1 if none available. + * cleanup is set atomically under fd_lock (pass NULL for plain fds). + */ +int fd_alloc_from(int minfd, int type, int host_fd, void (*cleanup)(int)); /* Allocate the lowest available FD >= minfd with a single-thread fast path. * Falls back to fd_alloc_from() when multiple guest threads are active. */ -int fd_alloc_from_relaxed(int minfd, int type, int host_fd); +int fd_alloc_from_relaxed(int minfd, + int type, + int host_fd, + void (*cleanup)(int)); -/* Allocate a specific FD slot. Returns -1 if out of range. */ -int fd_alloc_at(int fd, int type, int host_fd); +/* Allocate a specific FD slot. + * Returns -1 if out of range. 
+ * cleanup is set atomically under fd_lock (pass NULL for plain fds). + */ +int fd_alloc_at(int fd, int type, int host_fd, void (*cleanup)(int)); /* Allocate a specific FD slot with a single-thread fast path. * Falls back to fd_alloc_at() when replacement/cleanup must stay serialized. */ -int fd_alloc_at_relaxed(int fd, int type, int host_fd); +int fd_alloc_at_relaxed(int fd, int type, int host_fd, void (*cleanup)(int)); /* Look up a guest FD. Returns host FD or -1 if invalid. * Unsafe for concurrent use; see fd_snapshot/fd_to_host_dup. */ int fd_to_host(int guest_fd); -/* Snapshot an fd entry under fd_lock. Thread-safe alternative to - * direct fd_table[] access. Returns true on success, false if closed. +/* Snapshot an fd entry under fd_lock. Thread-safe alternative to direct + * fd_table[] access. + * Returns true on success, false if closed. */ bool fd_snapshot(int guest_fd, fd_entry_t *out); +/* Snapshot an fd entry AND dup its host fd in a single fd_lock critical + * section. Eliminates the TOCTOU window between reading the type/metadata + * and duplicating the host fd in the dup(2) path. Returns the dup'd host + * fd (owned by the caller) on success, -1 on failure. On success the + * snapshot in *out is consistent with the dup'd host fd. + */ +int fd_snapshot_and_dup(int guest_fd, fd_entry_t *out); + +/* Read just the fd type under fd_lock. Returns FD_CLOSED for out-of-range or + * closed slots. Cheaper than fd_snapshot when only the type is needed for + * dispatch (sys_read/sys_readv/sys_writev fast paths). + */ +int fd_get_type(int guest_fd); + +/* Republish the EL1 urandom read fast-path bit for this fd from the current + * fd_table type and access mode. Only readable /dev/urandom descriptors are + * eligible for the bitmap. + */ +void fd_refresh_urandom_bitmap(int fd); + +/* Type -> cleanup registry. 
Modules that own a synthetic fd type register + * their cleanup at init time; dup and fork-restore paths look up the + * cleanup from the type so the binding stays consistent without each path + * re-deriving the dispatch table. + */ +void fd_register_cleanup(int type, void (*cleanup)(int)); +void (*fd_cleanup_for_type(int type))(int); + +/* True for fd types whose host backing (kqueue for timerfd/inotify, pipe + * halves for eventfd/signalfd/netlink/pidfd, epoll instance) cannot be + * meaningfully inherited across fork IPC: macOS SCM_RIGHTS rejects kqueue + * fds, and the per-class side-table state (eventfd counter, signalfd mask, + * pidfd target, epoll set, ...) is not serialized. The child must recreate + * such fds via the appropriate syscall, so the parent filters them from the + * SCM_RIGHTS payload and the receiver drops any that still arrive. + */ +static inline bool fd_type_is_synthetic(int type) +{ + return type == FD_EVENTFD || type == FD_SIGNALFD || type == FD_TIMERFD || + type == FD_INOTIFY || type == FD_NETLINK || type == FD_PIDFD || + type == FD_EPOLL; +} + /* Look up a guest FD and return a dup'd host fd owned by the caller. * Thread-safe: dup is performed under fd_lock. Returns -1 on failure. * Caller MUST close() the returned fd when done. diff --git a/src/syscall/io.c b/src/syscall/io.c index ee183dd..ef04d56 100644 --- a/src/syscall/io.c +++ b/src/syscall/io.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include "utils.h" #include "core/rosetta.h" +#include "core/shim-globals.h" #include "hvutil.h" #include "runtime/procemu.h" #include "runtime/thread.h" @@ -43,6 +45,7 @@ #define SYSCALL_IOV_MAX 1024 #define SYSCALL_IOV_STACK_MAX 64 +#define URANDOM_CACHE_SIZE 4096 /* Linux terminal struct types. 
*/ @@ -60,6 +63,27 @@ typedef struct { uint8_t c_cc[19]; } linux_termios_t; +/* Per-fd lock embedded in the cache so a urandom read on fd A does not + * serialize behind a concurrent urandom read on fd B. The previous design + * used a single global mutex covering the whole cache array, which made + * the per-fd cache pointless under any sibling-vCPU urandom traffic. + * The lock array is initialized at startup by io_init(). + */ +typedef struct { + pthread_mutex_t lock; + uint8_t buf[URANDOM_CACHE_SIZE]; + size_t off; + size_t len; +} urandom_cache_t; + +static urandom_cache_t urandom_cache[FD_TABLE_SIZE]; + +void io_init(void) +{ + for (int i = 0; i < FD_TABLE_SIZE; i++) + pthread_mutex_init(&urandom_cache[i].lock, NULL); +} + _Static_assert(sizeof(linux_termios_t) == 36, "aarch64 Linux TCGETS struct termios must be 36 bytes"); @@ -123,6 +147,136 @@ static int64_t io_return_zero(host_fd_ref_t *host_ref) return 0; } +void urandom_fd_reset_cache(int guest_fd) +{ + if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE)) + return; + + /* Preserve the embedded lock; reset only the entropy fields. memset of + * the whole struct would clobber the mutex state. 
+ */ + urandom_cache_t *c = &urandom_cache[guest_fd]; + pthread_mutex_lock(&c->lock); + memset(c->buf, 0, sizeof(c->buf)); + c->off = 0; + c->len = 0; + pthread_mutex_unlock(&c->lock); +} + +void urandom_fd_cleanup(int guest_fd) +{ + if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE)) + return; + + urandom_fd_reset_cache(guest_fd); +} + +static int64_t urandom_check_readable(int guest_fd) +{ + fd_entry_t snap; + if (!fd_snapshot(guest_fd, &snap) || snap.type != FD_URANDOM) + return -LINUX_EBADF; + if ((snap.linux_flags & 3) == LINUX_O_WRONLY) + return -LINUX_EBADF; + return 0; +} + +static int64_t urandom_fill_iov(int guest_fd, + const struct iovec *iov, + int iovcnt) +{ + int64_t err = urandom_check_readable(guest_fd); + if (err < 0) + return err; + + size_t total = 0; + for (int i = 0; i < iovcnt; i++) { + if (iov[i].iov_len > (size_t) SSIZE_MAX - total) + return -LINUX_EINVAL; + total += iov[i].iov_len; + } + if (total == 0) + return 0; + + urandom_cache_t *c = &urandom_cache[guest_fd]; + pthread_mutex_lock(&c->lock); + size_t done = 0; + for (int i = 0; i < iovcnt && done < total; i++) { + uint8_t *dst = iov[i].iov_base; + size_t iov_done = 0; + size_t iov_len = iov[i].iov_len; + if (iov_len > total - done) + iov_len = total - done; + while (iov_done < iov_len) { + if (c->off == c->len) { + arc4random_buf(c->buf, sizeof(c->buf)); + c->off = 0; + c->len = sizeof(c->buf); + } + size_t chunk = c->len - c->off; + if (chunk > iov_len - iov_done) + chunk = iov_len - iov_done; + memcpy(dst + iov_done, c->buf + c->off, chunk); + c->off += chunk; + iov_done += chunk; + done += chunk; + } + } + pthread_mutex_unlock(&c->lock); + return (int64_t) done; +} + +static int64_t validate_iov_total(guest_t *g, uint64_t iov_gva, int iovcnt) +{ + if (iovcnt <= 0 || iovcnt > SYSCALL_IOV_MAX) + return -LINUX_EINVAL; + + size_t total = 0; + for (int i = 0; i < iovcnt; i++) { + linux_iovec_t giov; + if (guest_read_small(g, iov_gva + (uint64_t) i * sizeof(giov), &giov, + sizeof(giov)) < 0) + 
return -LINUX_EFAULT; + if (giov.iov_len > (uint64_t) SSIZE_MAX - total) + return -LINUX_EINVAL; + total += (size_t) giov.iov_len; + } + return 0; +} + +static int64_t urandom_read(guest_t *g, + int guest_fd, + uint64_t buf_gva, + uint64_t count) +{ + if (count > SSIZE_MAX) + count = SSIZE_MAX; + if (count == 0) { + struct iovec empty = {0}; + return urandom_fill_iov(guest_fd, &empty, 1); + } + + uint64_t avail = 0; + void *dst = guest_ptr_bound(g, buf_gva, &avail, MEM_PERM_W, count); + if (!dst) + return -LINUX_EFAULT; + if (count > avail) + count = avail; + + struct iovec iov = {.iov_base = dst, .iov_len = (size_t) count}; + int64_t rc = urandom_fill_iov(guest_fd, &iov, 1); + + /* This slow path runs when the shim's identity-class fast path + * could not serve the read: either the request was larger than + * the shim's inline limit, or the ring was empty. Refill the + * shim's entropy ring before returning so a subsequent + * read(/dev/urandom) from the same vCPU sees a populated ring + * and stays on the fast path. 
+ */ + shim_globals_refill_urandom_ring(g); + return rc; +} + static bool rosetta_ioctl_target_fd(guest_t *g, int host_fd) { if (!g->is_rosetta) @@ -689,12 +843,11 @@ static int64_t io_write_result(ssize_t ret) int64_t sys_write(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) { - if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) { - if (fd_table[fd].type == FD_FUSE_DEV) - return fuse_dev_write(g, fd, buf_gva, count); - if (fd_table[fd].type == FD_EVENTFD) - return eventfd_write(fd, g, buf_gva, count); - } + int type = fd_get_type(fd); + if (type == FD_FUSE_DEV) + return fuse_dev_write(g, fd, buf_gva, count); + if (type == FD_EVENTFD) + return eventfd_write(fd, g, buf_gva, count); host_fd_ref_t host_ref; int64_t err = host_fd_ref_open_checked(fd, &host_ref, true); @@ -741,21 +894,28 @@ int64_t sys_write(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) int64_t sys_read(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) { - if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) { - if (fd_table[fd].type == FD_FUSE_DEV) - return fuse_dev_read(fd, g, buf_gva, count); - if (fd_table[fd].type == FD_FUSE_FILE) - return fuse_read_fd(g, fd, buf_gva, count); - if (fd_table[fd].type == FD_EVENTFD) - return eventfd_read(fd, g, buf_gva, count); - if (fd_table[fd].type == FD_SIGNALFD) - return signalfd_read(fd, g, buf_gva, count); - if (fd_table[fd].type == FD_TIMERFD) - return timerfd_read(fd, g, buf_gva, count); - if (fd_table[fd].type == FD_INOTIFY) - return inotify_read(fd, g, buf_gva, count); - if (fd_table[fd].type == FD_NETLINK) - return netlink_read(fd, g, buf_gva, count); + /* Read the type once under fd_lock so a concurrent close/reopen cannot + * make different dispatch checks disagree. Each handler still + * re-validates internally and returns EBADF if its slot changed. 
+ */ + int type = fd_get_type(fd); + switch (type) { + case FD_FUSE_DEV: + return fuse_dev_read(fd, g, buf_gva, count); + case FD_FUSE_FILE: + return fuse_read_fd(g, fd, buf_gva, count); + case FD_EVENTFD: + return eventfd_read(fd, g, buf_gva, count); + case FD_SIGNALFD: + return signalfd_read(fd, g, buf_gva, count); + case FD_TIMERFD: + return timerfd_read(fd, g, buf_gva, count); + case FD_INOTIFY: + return inotify_read(fd, g, buf_gva, count); + case FD_NETLINK: + return netlink_read(fd, g, buf_gva, count); + case FD_URANDOM: + return urandom_read(g, fd, buf_gva, count); } host_fd_ref_t host_ref; @@ -914,11 +1074,23 @@ static int64_t build_host_iov(guest_t *g, free(guest_iov); return -LINUX_EFAULT; } - /* Cap to contiguous permitted bytes */ + /* Cap to contiguous permitted bytes. When the guest iov entry + * spans a non-contiguous boundary (different mapping or + * permission), zero every subsequent host iov length so the + * host readv/writev returns a POSIX-compliant short I/O rather + * than silently packing the truncated tail of buffer i into + * buffer i+1 -- which corrupts the guest's data layout. + */ uint64_t len = guest_iov[i].iov_len; - if (len > avail) - len = avail; host_iov[i].iov_base = base; + if (len > avail) { + host_iov[i].iov_len = avail; + for (int j = i + 1; j < iovcnt; j++) { + host_iov[j].iov_base = NULL; + host_iov[j].iov_len = 0; + } + break; + } host_iov[i].iov_len = len; } if (guest_iov != stack_giov) @@ -981,29 +1153,55 @@ int64_t sys_readv(guest_t *g, int fd, uint64_t iov_gva, int iovcnt) int64_t err = single_guest_iov(g, iov_gva, &giov); if (err < 0) return err; + if (fd_get_type(fd) == FD_URANDOM && + giov.iov_len > (uint64_t) SSIZE_MAX) { + err = urandom_check_readable(fd); + if (err < 0) + return err; + return -LINUX_EINVAL; + } return sys_read(g, fd, giov.iov_base, giov.iov_len); } /* Special FD types need their custom read handlers because glibc may use * readv() instead of read() for the same logical operation. 
Delegate - * to the first iov entry's buffer. Use the first iov's length (not - * the sum of all iovs) because the data goes into giov[0].iov_base - * which is only giov[0].iov_len bytes long. + * scalar special fds to the first iov entry's buffer. Use the first iov's + * length (not the sum of all iovs) because the data goes into + * giov[0].iov_base which is only giov[0].iov_len bytes long. */ - if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) { - int type = fd_table[fd].type; - if (type == FD_EVENTFD || type == FD_SIGNALFD || type == FD_TIMERFD || - type == FD_INOTIFY) { - if (iovcnt <= 0) - return -LINUX_EINVAL; - /* Use guest_read for the iov array since guest_ptr alone is unsafe - * if the array spans a 2MiB block boundary. - */ - linux_iovec_t giov; - if (guest_read_small(g, iov_gva, &giov, sizeof(giov)) < 0) - return -LINUX_EFAULT; - return sys_read(g, fd, giov.iov_base, giov.iov_len); - } + int type = fd_get_type(fd); + if (type == FD_URANDOM) { + int64_t err = urandom_check_readable(fd); + if (err < 0) + return err; + err = validate_iov_total(g, iov_gva, iovcnt); + if (err < 0) + return err; + host_iov_buf_t host_iov; + err = host_iov_prepare(g, iov_gva, iovcnt, MEM_PERM_W, &host_iov); + if (err < 0) + return err; + int64_t ret = urandom_fill_iov(fd, host_iov.iov, iovcnt); + host_iov_free(&host_iov); + /* Mirror sys_read's slow-path refill so a readv consumer that + * drains the shim ring leaves it ready for the next call, + * instead of forcing every subsequent EL1 fast-path attempt + * back through HVC until some other path triggers a refill. + */ + shim_globals_refill_urandom_ring(g); + return ret; + } + if (type == FD_EVENTFD || type == FD_SIGNALFD || type == FD_TIMERFD || + type == FD_INOTIFY) { + if (iovcnt <= 0) + return -LINUX_EINVAL; + /* Use guest_read for the iov array since guest_ptr alone is unsafe + * if the array spans a 2MiB block boundary. 
+ */ + linux_iovec_t giov; + if (guest_read_small(g, iov_gva, &giov, sizeof(giov)) < 0) + return -LINUX_EFAULT; + return sys_read(g, fd, giov.iov_base, giov.iov_len); } host_fd_ref_t host_ref; @@ -1051,7 +1249,7 @@ int64_t sys_writev(guest_t *g, int fd, uint64_t iov_gva, int iovcnt) * sum of all iovs) because the data is at giov.iov_base which is only * giov.iov_len bytes. eventfd expects exactly 8 bytes. */ - if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE) && fd_table[fd].type == FD_EVENTFD) { + if (fd_get_type(fd) == FD_EVENTFD) { if (iovcnt <= 0) return -LINUX_EINVAL; linux_iovec_t giov; diff --git a/src/syscall/io.h b/src/syscall/io.h index 05a3321..dde34a2 100644 --- a/src/syscall/io.h +++ b/src/syscall/io.h @@ -22,6 +22,13 @@ /* read/write and their positional variants. */ int64_t sys_write(guest_t *g, int fd, uint64_t buf_gva, uint64_t count); int64_t sys_read(guest_t *g, int fd, uint64_t buf_gva, uint64_t count); +void urandom_fd_cleanup(int guest_fd); +void urandom_fd_reset_cache(int guest_fd); +/* Initialize the per-fd urandom cache locks. Must run before any guest + * thread enters sys_read or sys_readv on /dev/urandom. Called from + * syscall_init alongside the other subsystem init hooks. + */ +void io_init(void); int64_t sys_pread64(guest_t *g, int fd, uint64_t buf_gva, diff --git a/src/syscall/mem.c b/src/syscall/mem.c index 13a0157..bcdd48d 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -2458,6 +2458,16 @@ int64_t sys_mremap(guest_t *g, if (old_size > 0 && old_size > g->guest_size - old_off) return -LINUX_EFAULT; + /* Reject mremap whose source range touches VM infrastructure (page + * tables, shim code, shim data). Without this guard a guest can move + * the shim_data block out from under the EL1 stack or the shim- + * globals identity cache, since the move path issues raw memmove, + * memset, region removal and PTE invalidation. Matches the parallel + * guards in sys_mmap MAP_FIXED, sys_munmap and sys_mprotect. 
+ */ + if (guest_range_hits_infra(g, old_off, old_off + old_size)) + return -LINUX_EINVAL; + /* Verify the whole source range is covered by one tracked VMA. mremap() * must not copy holes or unrelated adjacent mappings. */ @@ -2500,6 +2510,14 @@ int64_t sys_mremap(guest_t *g, if (new_off > g->guest_size || new_size > g->guest_size - new_off) return -LINUX_ENOMEM; + /* Same infrastructure protection as the source range: the move + * tail removes any existing dest region and rewrites PTEs, which + * would corrupt page tables / shim text / shim data if the dest + * lands inside infra. + */ + if (guest_range_hits_infra(g, new_off, new_off + new_size)) + return -LINUX_EINVAL; + /* Linux rejects MREMAP_FIXED when old and new ranges overlap */ uint64_t old_end = old_off + old_size, new_end = new_off + new_size; if (old_off < new_end && new_off < old_end) @@ -2706,6 +2724,14 @@ int64_t sys_mremap(guest_t *g, if (new_size > old_size) { uint64_t grow_off = old_off + old_size, grow_len = new_size - old_size; + /* Reject growing into infrastructure (page tables, shim text, + * shim data). The source-range infra guard above only covers + * [old_off, old_off+old_size); the grown tail can still spill + * into infra without it. + */ + if (guest_range_hits_infra(g, grow_off, grow_off + grow_len)) + return -LINUX_EINVAL; + /* Check if the space after the old region is free (overflow-safe) */ if (grow_off <= g->guest_size && grow_len <= g->guest_size - grow_off) { bool can_grow = true; @@ -2974,6 +3000,16 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice) if (off > g->guest_size || length > g->guest_size - off) return -LINUX_ENOMEM; + /* Defensive guard against destructive advice on infrastructure + * ranges (page tables, shim text, shim data). MADV_DONTNEED would + * zero shim data via raw host_base+off arithmetic; MADV_FREE on a + * future flag change could do the same. 
Today the destructive + * advice paths happen to skip non-anonymous regions, but a future + * regression should not silently reopen the hole. + */ + if (guest_range_hits_infra(g, off, off + length)) + return -LINUX_EINVAL; + switch (advice) { case LINUX_MADV_DONTNEED: { /* MADV_DONTNEED: zero anon pages so next access sees zero-fill, diff --git a/src/syscall/net-msg.c b/src/syscall/net-msg.c index ecc9f71..96221ff 100644 --- a/src/syscall/net-msg.c +++ b/src/syscall/net-msg.c @@ -98,7 +98,7 @@ static void recvmsg_close_host_rights(const void *data_src, size_t data_len) int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags) { - if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE) && fd_table[fd].type == FD_NETLINK) + if (fd_get_type(fd) == FD_NETLINK) return netlink_sendmsg(fd, g, msg_gva, linux_flags); host_fd_ref_t host_ref; @@ -339,7 +339,7 @@ int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags) int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags) { - if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE) && fd_table[fd].type == FD_NETLINK) + if (fd_get_type(fd) == FD_NETLINK) return netlink_recvmsg(fd, g, msg_gva, flags); host_fd_ref_t host_ref; diff --git a/src/syscall/net.c b/src/syscall/net.c index b80ca18..05b0c76 100644 --- a/src/syscall/net.c +++ b/src/syscall/net.c @@ -215,7 +215,7 @@ int64_t sys_socketpair(guest_t *g, int64_t sys_bind(guest_t *g, int fd, uint64_t addr_gva, uint32_t addrlen) { /* Netlink sockets use synthetic fd; dispatch to netlink handler */ - if (RANGE_CHECK(fd, 0, FD_TABLE_SIZE) && fd_table[fd].type == FD_NETLINK) + if (fd_get_type(fd) == FD_NETLINK) return netlink_bind(fd, g, addr_gva, addrlen); host_fd_ref_t host_ref; @@ -469,7 +469,7 @@ int64_t sys_connect(guest_t *g, int fd, uint64_t addr_gva, uint32_t addrlen) return linux_errno(); } - if (fd_alloc_at(fd, FD_SOCKET, pair[0]) < 0) { + if (fd_alloc_at(fd, FD_SOCKET, pair[0], NULL) < 0) { close(pair[0]); close(pair[1]); host_fd_ref_close(&host_ref); diff 
--git a/src/syscall/netlink.c b/src/syscall/netlink.c index a1b555e..32c3ec3 100644 --- a/src/syscall/netlink.c +++ b/src/syscall/netlink.c @@ -396,6 +396,7 @@ static int nl_build_getaddr(netlink_state_t *ns) void netlink_init(void) { memset(nl_state, 0, sizeof(nl_state)); + fd_register_cleanup(FD_NETLINK, netlink_close); } int64_t netlink_socket(int protocol, int type) diff --git a/src/syscall/proc-pidfd.c b/src/syscall/proc-pidfd.c index 62480f3..635eb88 100644 --- a/src/syscall/proc-pidfd.c +++ b/src/syscall/proc-pidfd.c @@ -50,6 +50,13 @@ static pidfd_entry_t *pidfd_find_guest_fd_entry(int guest_fd) return NULL; } +static void pidfd_cleanup(int guest_fd); + +void pidfd_init(void) +{ + fd_register_cleanup(FD_PIDFD, pidfd_cleanup); +} + static void pidfd_cleanup(int guest_fd) { pthread_mutex_lock(&pidfd_lock); diff --git a/src/syscall/proc-pidfd.h b/src/syscall/proc-pidfd.h index 8d02df4..79e55e5 100644 --- a/src/syscall/proc-pidfd.h +++ b/src/syscall/proc-pidfd.h @@ -10,6 +10,7 @@ #include "core/guest.h" +void pidfd_init(void); int pidfd_create(guest_t *g, int64_t target_pid); void proc_pidfd_notify_exit(int64_t exited_pid); int64_t proc_pidfd_lookup_pid(int guest_fd); diff --git a/src/syscall/proc.c b/src/syscall/proc.c index 73bc39c..33cde52 100644 --- a/src/syscall/proc.c +++ b/src/syscall/proc.c @@ -33,6 +33,7 @@ #include "hvutil.h" #include "utils.h" +#include "core/shim-globals.h" #include "core/vdso.h" #include "runtime/futex.h" @@ -1157,6 +1158,17 @@ int vcpu_run_loop(hv_vcpu_t vcpu, /* Check guest ITIMER_REAL expiry (queues SIGALRM if due) */ signal_check_timer(); + /* Recompute the shim-globals attention flag now that + * signal_check_timer has had a chance to drain pending + * work. If nothing is pending and no itimer is armed, drop + * the flag back to zero so the identity fast path + * re-engages for the next getpid loop. 
Without this clear, + * the attention flag set by signal_queue (e.g., on a + * subprocess's SIGCHLD) would stick forever and + * permanently disable the fast path. + */ + shim_globals_recompute_attention(g); + /* Diagnostic: log signal state after exec/sigreturn * to help debug signal delivery issues. */ diff --git a/src/syscall/signal.c b/src/syscall/signal.c index 2156638..6f6f4e1 100644 --- a/src/syscall/signal.c +++ b/src/syscall/signal.c @@ -28,6 +28,7 @@ #include "hvutil.h" +#include "core/shim-globals.h" #include "core/vdso.h" #include "runtime/thread.h" @@ -255,14 +256,76 @@ static inline bool *thread_saved_valid_ptr(void) /* Public API. */ +/* Singleton guest pointer used by attention-flag setters in this file. + * elfuse runs one VM per process so a single global is correct. The + * setter (signal_set_shim_globals_guest) asserts NULL-or-same to catch + * a lifecycle bug in any future multi-VM design. + * + * Atomic because attention_raise runs on every signal queue from any + * thread without holding sig_lock, while signal_init clears it across + * the execve reset window. ARM64 aligned 64-bit pointer writes are + * single-copy atomic, but plain reads/writes have no ordering, so a + * concurrent attention_raise could observe a stale value or fail to + * see a fresh registration. The release-acquire pair seals the window. + */ +static _Atomic(guest_t *) attention_guest; + void signal_init(void) { memset(&sig_state, 0, sizeof(sig_state)); + /* Clear the attention singleton on every init pass. Bootstrap and + * the fork-child receive path both call this before + * signal_set_shim_globals_guest publishes the live g; the reset + * keeps the setter's NULL-or-same assertion from latching onto a + * stale parent pointer in the child process. Release-store so a + * sibling thread that ACQUIRE-loads the slot after init observes + * NULL and falls back to thread_interrupt_all instead of a stale + * parent pointer. 
+ */ + atomic_store_explicit(&attention_guest, NULL, memory_order_release); /* Altstack is now per-thread (in thread_entry_t), initialized to * SS_DISABLE by thread_register_main() and thread_alloc(). */ } +void signal_set_shim_globals_guest(guest_t *g) +{ + guest_t *cur = atomic_load_explicit(&attention_guest, memory_order_acquire); + if (g != NULL && cur != NULL && cur != g) { + log_error( + "signal: shim-globals guest already registered to %p, " + "refusing to re-register with %p", + (void *) cur, (void *) g); + return; + } + atomic_store_explicit(&attention_guest, g, memory_order_release); +} + +/* Raise the shim-globals attention flag if the singleton has been + * registered; otherwise fall back to a bare vCPU interrupt. Both paths + * end up running thread_interrupt_all (shim_globals_raise_attention + * issues it internally), so callers only need this single helper. + */ +static inline void attention_raise(void) +{ + guest_t *g = atomic_load_explicit(&attention_guest, memory_order_acquire); + if (g) + shim_globals_raise_attention(g); + else + thread_interrupt_all(); +} + +/* Predicate matches the deliverability gate used by signal_queue and + * signal_queue_info: SIGKILL/SIGSTOP are uncatchable and must always + * interrupt; other signals only interrupt when at least one active + * thread does not block them. + */ +static inline bool signal_should_interrupt(int signum) +{ + return sig_uncatchable(signum) || + thread_signal_deliverable(sig_bit(signum)); +} + void signal_reset_for_exec(void) { thread_entry_t *t = current_thread; @@ -319,13 +382,14 @@ void signal_queue(int signum) */ signalfd_notify(signum); - /* Only force vCPUs out of hv_vcpu_run() if the signal is actually - * deliverable to at least one thread. SIGKILL/SIGSTOP cannot be - * blocked and always need interruption. 
For other signals, check - * per-thread blocked masks to avoid spurious context switches -- - * Go, JVM, and Node.js mask signals in worker threads, causing - * thousands of unnecessary ~1000ns VM exit+re-entry cycles per - * second if signal emulation interrupts unconditionally. + /* Only force vCPUs out of hv_vcpu_run(), and only force the shim's + * identity fast path off, if the signal is actually deliverable to + * at least one thread. SIGKILL/SIGSTOP cannot be blocked and always + * need interruption. For other signals, check per-thread blocked masks + * to avoid spurious context switches -- Go, JVM, and Node.js mask + * signals in worker threads, causing thousands of unnecessary ~1000ns + * VM exit+re-entry cycles per second if signal emulation interrupts + * unconditionally. * * Race: if a thread concurrently unblocks this signal via * rt_sigprocmask, the pending signal could be missed here. @@ -333,8 +397,8 @@ void signal_queue(int signum) * signals after unblocking and interrupting the current thread * if delivery became possible. */ - if (sig_uncatchable(signum) || thread_signal_deliverable(sig_bit(signum))) - thread_interrupt_all(); + if (signal_should_interrupt(signum)) + attention_raise(); } void signal_queue_rt(int signum, @@ -373,8 +437,11 @@ void signal_queue_info(int signum, memory_order_release); pthread_mutex_unlock(&sig_lock); signalfd_notify(signum); - if (thread_signal_deliverable(sig_bit(signum))) - thread_interrupt_all(); + /* Same shim-globals attention raise as signal_queue: force the fast + * path off only when the queued signal can reach signal_deliver. 
+ */ + if (signal_should_interrupt(signum)) + attention_raise(); } void signal_set_fault_info(int si_code, uint64_t addr, uint64_t esr) @@ -407,6 +474,30 @@ int signal_pending(void) return result; } +bool signal_attention_needed(void) +{ + /* Cheap atomic load on the sig-pending hint first; if a signal is + * queued and deliverable to at least one active thread, the shim should + * drop to the slow path even before we touch the itimer state. A pending + * signal blocked by every active thread is not useful slow-path work and + * should not keep identity syscalls out of the fast path indefinitely. + */ + uint64_t hint = + atomic_load_explicit(&sig_pending_hint, memory_order_acquire); + if (hint != 0 && thread_signal_deliverable(hint)) + return true; + /* Active guest itimers: even if no signal is queued YET, the + * timer can fire at any moment, and signal_check_timer needs an + * HVC #5 epilogue to notice it. Keep attention raised while any + * timer is armed. + */ + if (__atomic_load_n(&guest_itimer.active, __ATOMIC_ACQUIRE) || + __atomic_load_n(&guest_itimer_virt.active, __ATOMIC_ACQUIRE) || + __atomic_load_n(&guest_itimer_prof.active, __ATOMIC_ACQUIRE)) + return true; + return false; +} + bool signal_pending_interruption(bool *restart_out) { pthread_mutex_lock(&sig_lock); @@ -752,15 +843,32 @@ void signal_set_itimer(const struct timeval *value, pthread_mutex_unlock(&sig_lock); return; } - if (value->tv_sec == 0 && value->tv_usec == 0) { + bool arm = (value->tv_sec != 0 || value->tv_usec != 0); + if (!arm) { /* Disarm */ __atomic_store_n(&guest_itimer.active, 0, __ATOMIC_RELEASE); } else { - __atomic_store_n(&guest_itimer.active, 1, __ATOMIC_RELEASE); + /* Publish expiry and interval BEFORE the release-store of active. 
+ * signal_check_timer and signal_attention_needed ACQUIRE-load + * active without holding sig_lock; if active is published before + * its associated fields, a consumer can observe active=1 with + * stale expiry/interval and decide an early or late SIGALRM. + * Matches the field order in signal_set_itimer_virt. + */ guest_itimer.expiry = timeval_add(&now, value); guest_itimer.interval = interval ? *interval : (struct timeval) {0, 0}; + __atomic_store_n(&guest_itimer.active, 1, __ATOMIC_RELEASE); } pthread_mutex_unlock(&sig_lock); + + /* Arming any timer requires the shim's identity fast path to drop + * to the slow path so signal_check_timer can see the expiry. The + * disarm case is handled by signal_attention_needed returning + * false at the next HVC epilogue recompute -- no explicit clear + * here. + */ + if (arm) + attention_raise(); } void signal_get_itimer(struct timeval *value, struct timeval *interval) @@ -850,8 +958,9 @@ void signal_set_itimer_virt(int which, else *old_value = (struct timeval) {0, 0}; } + bool arm = value && (value->tv_sec != 0 || value->tv_usec != 0); if (value) { - if (value->tv_sec == 0 && value->tv_usec == 0) { + if (!arm) { __atomic_store_n(&timer->active, 0, __ATOMIC_RELEASE); } else { timer->expiry = timeval_add(&now, value); @@ -860,6 +969,9 @@ void signal_set_itimer_virt(int which, } } pthread_mutex_unlock(&sig_lock); + + if (arm) + attention_raise(); } void signal_get_itimer_virt(int which, @@ -1447,7 +1559,7 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code) * glibc leaves sa_restorer uninitialized (garbage); musl sets it to * __restore_rt. Match the kernel: always use the vDSO trampoline. 
*/ - hv_vcpu_set_reg(vcpu, HV_REG_X30, VDSO_BASE + VDSO_OFF_TEXT); + hv_vcpu_set_reg(vcpu, HV_REG_X30, VDSO_BASE + VDSO_OFF_SIGRET); if (act->sa_flags & LINUX_SA_SIGINFO) { /* X1 = pointer to siginfo, X2 = pointer to ucontext */ diff --git a/src/syscall/signal.h b/src/syscall/signal.h index 99602b0..f5792f5 100644 --- a/src/syscall/signal.h +++ b/src/syscall/signal.h @@ -244,6 +244,28 @@ void signal_set_fault_info(int si_code, uint64_t addr, uint64_t esr); int signal_pending(void); bool signal_pending_interruption(bool *restart_out); +/* True if anything that would normally be drained by signal_check_timer is + * currently live: an unblocked pending signal, OR any of the three guest + * itimers is armed. The shim's identity fast path consults this (indirectly + * via shim_globals attention flag) to decide whether to skip the HVC #5 + * round-trip. Whenever this returns true, the shim must take the slow path so + * the epilogue's signal_check_timer + queue drain runs. + */ +bool signal_attention_needed(void); + +/* Register the shim-globals guest pointer used by the attention setters in + * signal_queue / setitimer / proc_set_exit_group. Called from bootstrap and + * fork-child after guest_init. Logs an error and keeps the existing pointer + * if a different non-NULL g is already registered; elfuse runs one VM per + * process and the singleton catches lifecycle bugs (multiple concurrent VMs + * in one process would violate this invariant). + * + * Passing NULL clears the registration; signal.c's attention setters then + * fall back to a bare thread_interrupt_all(), matching the + * pre-registration behavior. + */ +void signal_set_shim_globals_guest(guest_t *g); + /* Deliver the highest-priority pending unblocked signal to the guest. * Builds an rt_sigframe on the guest stack and redirects vCPU to handler.
* Returns: 1 if signal was delivered, 0 if nothing pending, @@ -289,9 +311,9 @@ const signal_state_t *signal_get_state(void); void signal_set_state(const signal_state_t *state); /* Snapshot or consume pending signals for signalfd. - * signal_peek_signalfd() snapshots up to max matching entries without - * consuming them. signal_take_signalfd_exact() then consumes those exact - * entries, preserving any matching signals that arrived later. + * signal_peek_signalfd() snapshots up to max matching entries without consuming + * them. signal_take_signalfd_exact() then consumes those exact entries, + * preserving any matching signals that arrived later. */ size_t signal_peek_signalfd(uint64_t mask, signal_rt_info_t *out, size_t max); size_t signal_take_signalfd_exact(const signal_rt_info_t *expected, size_t max); diff --git a/src/syscall/sys.c b/src/syscall/sys.c index 9166850..1284090 100644 --- a/src/syscall/sys.c +++ b/src/syscall/sys.c @@ -178,12 +178,28 @@ int64_t sys_uname(guest_t *g, uint64_t buf_gva) return 0; } +/* Linux getrandom(2) flags. arc4random_buf is always non-blocking and always + * seeded, so GRND_NONBLOCK / GRND_RANDOM / GRND_INSECURE all collapse to the + * same behavior here. Unknown flag bits must still return EINVAL per kernel + * behavior (drivers/char/random.c rejects flags & ~SUPPORTED_FLAGS) so callers + * do not silently fossilize wrong assumptions about the elfuse implementation.
+ */ +#define LINUX_GRND_NONBLOCK 0x0001 +#define LINUX_GRND_RANDOM 0x0002 +#define LINUX_GRND_INSECURE 0x0004 +#define LINUX_GRND_SUPPORTED_MASK \ + (LINUX_GRND_NONBLOCK | LINUX_GRND_RANDOM | LINUX_GRND_INSECURE) + int64_t sys_getrandom(guest_t *g, uint64_t buf_gva, uint64_t buflen, unsigned int flags) { - (void) flags; + if (flags & ~LINUX_GRND_SUPPORTED_MASK) + return -LINUX_EINVAL; + if ((flags & (LINUX_GRND_RANDOM | LINUX_GRND_INSECURE)) == + (LINUX_GRND_RANDOM | LINUX_GRND_INSECURE)) + return -LINUX_EINVAL; if (buflen == 0) return 0; if (buf_gva > UINT64_MAX - buflen) diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index 68cad6d..be97787 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -56,11 +56,14 @@ #include "syscall/poll.h" #include "syscall/path.h" #include "syscall/proc.h" +#include "syscall/proc-pidfd.h" #include "syscall/signal.h" #include "syscall/sys.h" #include "syscall/sysvipc.h" #include "syscall/time.h" +#include "core/shim-globals.h" + /* Generated from src/syscall/dispatch.tbl into $(BUILD_DIR). */ #include "dispatch.h" @@ -84,6 +87,11 @@ void syscall_init(void) { fdtable_init(); signal_init(); + /* Mirror signal_init's attention_guest reset for the fd/urandom + * bitmap singleton in shim-globals. Defends against a stale + * parent-process pointer surviving across posix_spawn re-init. + */ + shim_globals_reset_singleton(); /* Initialize special FD subsystems (eventfd, signalfd, timerfd, inotify). * Must happen before any guest code runs so that concurrent CLONE_THREAD @@ -95,6 +103,9 @@ void syscall_init(void) inotify_init(); netlink_init(); fuse_init(); + pidfd_init(); + io_init(); + fd_register_cleanup(FD_URANDOM, urandom_fd_cleanup); wakeup_pipe_init(); } @@ -163,6 +174,35 @@ typedef int64_t (*syscall_handler_t)(guest_t *g, #define SC_STUB(name, val) SC_FORWARD(name, (val)) +/* Bracket setuid/setgid family invocations so concurrent shim-fast-path + * readers cannot observe stale credentials. 
The host-side proc_sys_* + * mutators flip the _Atomic credential slots inside proc-identity.c; + * the shim cache must reflect that under the same atomic window. + * + * Sequence: OR ATTN_BIT_CRED -> mutator -> on success publish_creds -> + * AND ~ATTN_BIT_CRED. The OR-only update preserves whatever + * ATTN_BIT_SIGTIMER state the HVC #5 epilogue's recompute may have + * set or cleared in parallel; AND-only clear at the end leaves the + * SIGTIMER lane alone. Earlier revisions wrote the full word, which + * let a sibling's recompute drop the flag to zero mid-publish and + * reopened the torn-cred race the bracket was meant to close. + * + * Implemented as a statement-expression macro so the SC_FORWARD body + * stays a single expression and the mutator runs after the attention + * raise as part of normal C sequencing. + */ +#define CRED_BRACKETED(g_, mutator_) \ + __extension__({ \ + guest_t *_g = (g_); \ + shim_globals_attn_or(_g, ATTN_BIT_CRED); \ + int64_t _rc = (mutator_); \ + if (_rc == 0) \ + shim_globals_publish_creds(_g, proc_get_uid(), proc_get_euid(),\ + proc_get_gid(), proc_get_egid()); \ + shim_globals_attn_and(_g, ~ATTN_BIT_CRED); \ + _rc; \ + }) + /* sc_xxx forwarding wrappers: thin adapters that unpack the syscall ABI * argument tuple (x0..x5) into a sys_xxx() call. 
*/ @@ -494,12 +534,12 @@ SC_FORWARD(sc_getuid, (int64_t) proc_get_uid()) SC_FORWARD(sc_geteuid, (int64_t) proc_get_euid()) SC_FORWARD(sc_getgid, (int64_t) proc_get_gid()) SC_FORWARD(sc_getegid, (int64_t) proc_get_egid()) -SC_FORWARD(sc_setuid, proc_sys_setuid((uint32_t) x0)) -SC_FORWARD(sc_setgid, proc_sys_setgid((uint32_t) x0)) -SC_FORWARD(sc_setreuid, proc_sys_setreuid((uint32_t) x0, (uint32_t) x1)) -SC_FORWARD(sc_setregid, proc_sys_setregid((uint32_t) x0, (uint32_t) x1)) -SC_FORWARD(sc_setresuid, proc_sys_setresuid((uint32_t) x0, (uint32_t) x1, (uint32_t) x2)) -SC_FORWARD(sc_setresgid, proc_sys_setresgid((uint32_t) x0, (uint32_t) x1, (uint32_t) x2)) +SC_FORWARD(sc_setuid, CRED_BRACKETED(g, proc_sys_setuid((uint32_t) x0))) +SC_FORWARD(sc_setgid, CRED_BRACKETED(g, proc_sys_setgid((uint32_t) x0))) +SC_FORWARD(sc_setreuid, CRED_BRACKETED(g, proc_sys_setreuid((uint32_t) x0, (uint32_t) x1))) +SC_FORWARD(sc_setregid, CRED_BRACKETED(g, proc_sys_setregid((uint32_t) x0, (uint32_t) x1))) +SC_FORWARD(sc_setresuid, CRED_BRACKETED(g, proc_sys_setresuid((uint32_t) x0, (uint32_t) x1, (uint32_t) x2))) +SC_FORWARD(sc_setresgid, CRED_BRACKETED(g, proc_sys_setresgid((uint32_t) x0, (uint32_t) x1, (uint32_t) x2))) SC_FORWARD(sc_setpgid, proc_sys_setpgid((int64_t) x0, (int64_t) x1)) SC_STUB(sc_fadvise64, 0) SC_STUB(sc_sched_yield, (sched_yield(), 0)) diff --git a/src/syscall/time.c b/src/syscall/time.c index 8a76c4b..f584990 100644 --- a/src/syscall/time.c +++ b/src/syscall/time.c @@ -15,6 +15,7 @@ #include "utils.h" +#include "core/vdso.h" #include "runtime/thread.h" /* current_thread, guest_tid */ #include "syscall/abi.h" #include "syscall/internal.h" @@ -57,10 +58,9 @@ _Static_assert(sizeof(struct timespec) == sizeof(linux_timespec_t), _Static_assert(sizeof(struct timeval) == sizeof(linux_timeval_t), "host and guest timeval must match on LP64"); -static bool linux_timespec_valid(const linux_timespec_t *ts, - bool allow_negative_sec) +static bool linux_timespec_valid(const 
linux_timespec_t *ts) { - if (!allow_negative_sec && ts->tv_sec < 0) + if (ts->tv_sec < 0) return false; return ts->tv_nsec >= 0 && ts->tv_nsec < NSEC_PER_SEC; } @@ -243,16 +243,83 @@ int64_t sys_clock_getres(guest_t *g, int clockid, uint64_t tp_gva) int64_t sys_clock_gettime(guest_t *g, int clockid, uint64_t tp_gva) { - struct timespec ts; int mac_clockid = translate_clockid(clockid); if (mac_clockid < 0) return -LINUX_EINVAL; + + /* If this trap came from the __kernel_clock_gettime vDSO svc_fallback, + * the trampoline parked the guest's CNTVCT_EL0 read in X9 before + * issuing SVC, and ELR_EL1 holds the address immediately after that + * SVC. Pair X9 with both the MONOTONIC and REALTIME wall_clocks and + * seed the vvar so subsequent calls hit the fast path for either + * clockid. Skip the seed for any other trap (raw + * syscall(SYS_clock_gettime, ...) from guest code, etc.): X9 is + * then arbitrary guest state, and seeding from it would poison the + * anchor and break every later fast-path call. + * + * Skip the gate entirely once the anchor is published: vdso_seed_anchor + * is a one-shot CAS that can never fire again, so the HVF reads of + * ELR_EL1 and X9 below would be pure waste on every subsequent trap. + * Both clockid 0 (REALTIME) and clockid 1 (MONOTONIC) take the vDSO + * fast path, so either may be the first caller; either way both + * anchor pairs are seeded from a single set of host clock_gettime + * calls. + * + * Order matters: read X9 first, then sample both host wall clocks + * back-to-back, then write to guest and seed. Sampling host clocks + * before checking X9 would bake a permanent positive bias (~50-200 ns) + * into the anchor because every host call ages the X9 timestamp by + * the seeding gate's HVF round-trip. The back-to-back wall-clock + * reads minimize MONO/REAL skew within the anchor. 
+ */ + bool seed_eligible = (clockid == 0 /* CLOCK_REALTIME */ || + clockid == 1 /* CLOCK_MONOTONIC */) && + current_thread && !vdso_anchor_is_seeded(g); + + uint64_t guest_cntvct = 0; + if (seed_eligible) { + uint64_t elr = 0; + if (hv_vcpu_get_sys_reg(current_thread->vcpu, HV_SYS_REG_ELR_EL1, + &elr) != HV_SUCCESS || + elr != vdso_clock_gettime_svc_pc() + 4 || + hv_vcpu_get_reg(current_thread->vcpu, HV_REG_X9, &guest_cntvct) != + HV_SUCCESS || + guest_cntvct == 0) { + /* Trap came from a path other than the vDSO trampoline; X9 is + * arbitrary, fall through to the non-seeding path. + */ + seed_eligible = false; + } + } + + struct timespec ts; if (clock_gettime(mac_clockid, &ts) < 0) return linux_errno(); + /* For the seeding path, sample the OTHER clockid back-to-back so both + * anchor pairs reflect roughly the same host moment. If the second + * clock_gettime fails (unreachable on macOS but defensive), skip + * seeding rather than fail the user's request: the user already has + * the value they asked for. + */ + struct timespec ts_other; + bool can_seed = false; + if (seed_eligible) { + int other_mac = (clockid == 1) ? CLOCK_REALTIME : CLOCK_MONOTONIC; + if (clock_gettime(other_mac, &ts_other) == 0) + can_seed = true; + } + if (guest_write_small(g, tp_gva, &ts, sizeof(ts)) < 0) return -LINUX_EFAULT; + if (can_seed) { + const struct timespec *ts_mono = (clockid == 1) ? &ts : &ts_other; + const struct timespec *ts_real = (clockid == 0) ? 
&ts : &ts_other; + vdso_seed_anchor(g, guest_cntvct, ts_mono->tv_sec, ts_mono->tv_nsec, + ts_real->tv_sec, ts_real->tv_nsec); + } + return 0; } @@ -268,7 +335,7 @@ int64_t sys_nanosleep(guest_t *g, uint64_t req_gva, uint64_t rem_gva) if (guest_read_small(g, req_gva, &lreq, sizeof(lreq)) < 0) return -LINUX_EFAULT; - if (!linux_timespec_valid(&lreq, false)) + if (!linux_timespec_valid(&lreq)) return -LINUX_EINVAL; return interruptible_sleep_ns(g, linux_timespec_to_ns_sat(&lreq), rem_gva, @@ -287,7 +354,14 @@ int64_t sys_clock_nanosleep(guest_t *g, if (flags & ~TIMER_ABSTIME) return -LINUX_EINVAL; - if (!linux_timespec_valid(&lreq, (flags & TIMER_ABSTIME) != 0)) + /* Linux's clock_nanosleep syscall validates the timespec via + * timespec64_valid() (kernel/time/posix-timers.c) before deciding + * whether the absolute deadline has expired. Negative tv_sec is + * rejected with EINVAL even when TIMER_ABSTIME is set, not silently + * treated as 'already expired'. Reject negative tv_sec unconditionally + * so both relative and absolute callers match the kernel contract. + */ + if (!linux_timespec_valid(&lreq)) return -LINUX_EINVAL; int mac_clockid = translate_clockid(clockid); @@ -297,9 +371,6 @@ int64_t sys_clock_nanosleep(guest_t *g, int64_t remaining_ns; if (flags & TIMER_ABSTIME) { - if (lreq.tv_sec < 0) - return 0; - struct timespec now; if (clock_gettime(mac_clockid, &now) < 0) return linux_errno(); @@ -340,9 +411,15 @@ int64_t sys_setitimer(guest_t *g, int which, uint64_t new_gva, uint64_t old_gva) if (new_gva) { if (guest_read_small(g, new_gva, &lnew, sizeof(lnew)) < 0) return -LINUX_EFAULT; - /* Linux rejects tv_usec outside [0, 999999] for value and interval. */ + /* Linux rejects tv_usec outside [0, 999999] AND negative tv_sec for + * both value and interval. Accepting a negative tv_sec would cast + * through (long) below and arm an expired timer instead of returning + * EINVAL, diverging from the kernel contract.
+ */ if (!RANGE_CHECK(lnew.it_value.tv_usec, 0, 1000000) || - !RANGE_CHECK(lnew.it_interval.tv_usec, 0, 1000000)) + !RANGE_CHECK(lnew.it_interval.tv_usec, 0, 1000000) || + (int64_t) lnew.it_value.tv_sec < 0 || + (int64_t) lnew.it_interval.tv_sec < 0) return -LINUX_EINVAL; has_new = true; } diff --git a/tests/bench-futex-pingpong.c b/tests/bench-futex-pingpong.c new file mode 100644 index 0000000..707e898 --- /dev/null +++ b/tests/bench-futex-pingpong.c @@ -0,0 +1,110 @@ +/* Futex ping-pong microbenchmark + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Two threads handing off via FUTEX_WAIT and FUTEX_WAKE on private futexes. + * Measures the round-trip cost of the core wait/wake hot path. Reports total + * elapsed time in milliseconds for the configured handoff count. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "raw-syscall.h" + +#define HANDOFFS 20000 + +static volatile int turn_a; +static volatile int turn_b; +static volatile int b_done; + +static int child_stack_storage[16384] __attribute__((aligned(16))); + +static int child_fn(void *arg) +{ + (void) arg; + for (int i = 0; i < HANDOFFS; i++) { + while (__atomic_load_n(&turn_b, __ATOMIC_ACQUIRE) == 0) + raw_futex_wait((int *) &turn_b, 0); + __atomic_store_n(&turn_b, 0, __ATOMIC_RELEASE); + __atomic_store_n(&turn_a, 1, __ATOMIC_RELEASE); + raw_futex_wake((int *) &turn_a, 1); + } + __atomic_store_n(&b_done, 1, __ATOMIC_RELEASE); + raw_futex_wake((int *) &b_done, 1); + raw_exit(0); + return 0; +} + +int main(void) +{ + struct timeval start, end; + + /* Allocate a child stack via the local array (already 16-aligned). */ + void *stack_top = + (char *) child_stack_storage + sizeof(child_stack_storage); + + int ctid = 0; + long flags = 0x00010f00 | 0x00200000; /* CLONE_VM|FS|FILES|SIGHAND|THREAD| + * SYSVSEM|CHILD_CLEARTID + * matched at raw level. + */ + /* Use the conventional pthread-like flag mask. 
*/ + flags = 0x3D0F00; /* CLONE_VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS off | + * PARENT_SETTID off | CHILD_CLEARTID|CHILD_SETTID. + */ + /* Simpler: just CLONE_VM|CLONE_THREAD|CLONE_SIGHAND|CLONE_FS|CLONE_FILES. + */ + flags = 0x00000100 | 0x00010000 | 0x00000800 | 0x00000200 | + 0x00000400; /* VM|THREAD|SIGHAND|FS|FILES */ + + /* aarch64 clone ABI: x0=flags, x1=child_stack, x2=parent_tid, + * x3=tls, x4=child_tid. The child returns at the same site. + */ + long rc = raw_clone((unsigned long) flags, stack_top, NULL, 0, &ctid); + if (rc == 0) { + child_fn(NULL); + /* unreachable */ + return 0; + } + if (rc < 0) { + fprintf(stderr, "clone failed: %ld\n", rc); + return 1; + } + + gettimeofday(&start, NULL); + + /* Kick off the round-trip. */ + __atomic_store_n(&turn_b, 1, __ATOMIC_RELEASE); + raw_futex_wake((int *) &turn_b, 1); + + for (int i = 0; i < HANDOFFS; i++) { + while (__atomic_load_n(&turn_a, __ATOMIC_ACQUIRE) == 0) + raw_futex_wait((int *) &turn_a, 0); + __atomic_store_n(&turn_a, 0, __ATOMIC_RELEASE); + if (i + 1 < HANDOFFS) { + __atomic_store_n(&turn_b, 1, __ATOMIC_RELEASE); + raw_futex_wake((int *) &turn_b, 1); + } + } + + /* Wait for child to finish so timing covers full handoff count. */ + while (__atomic_load_n(&b_done, __ATOMIC_ACQUIRE) == 0) + raw_futex_wait((int *) &b_done, 0); + + gettimeofday(&end, NULL); + + long elapsed_us = + (end.tv_sec - start.tv_sec) * 1000000L + (end.tv_usec - start.tv_usec); + /* Print elapsed time in milliseconds (3 decimal places). 
*/ + long ms_int = elapsed_us / 1000; + long ms_frac = elapsed_us % 1000; + printf("elapsed_ms %ld.%03ld\n", ms_int, ms_frac); + return 0; +} diff --git a/tests/manifest.txt b/tests/manifest.txt index ff9631b..19b1b27 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -45,10 +45,18 @@ test-file-ops test-sysinfo test-io-opt test-syscall-smoke +test-vdso +test-shim-identity +test-shim-identity-attention +test-shim-verbose-trace +test-shim-data-el1 +test-shim-urandom-smp +test-shim-urandom-toctou test-poll # diff=skip [section] I/O subsystem tests test-eventfd +test-eventfd-dup test-signalfd test-signalfd-hardening test-epoll @@ -83,8 +91,9 @@ test-clone3 # diff=skip test-fork-exec $TESTDIR/echo-test test-fork-lowbase -[section] COW fork isolation tests +[section] CoW fork isolation tests test-cow-fork +test-fork-synthetic-fd [section] O_CLOEXEC tests test-cloexec @@ -102,6 +111,8 @@ test-lowbase-mem-300000 [section] mremap tests test-mremap +test-mremap-infra +test-shim-cred-race [section] msync MAP_SHARED tests test-msync diff --git a/tests/test-cow-fork.c b/tests/test-cow-fork.c index 8770420..f7cc0c7 100644 --- a/tests/test-cow-fork.c +++ b/tests/test-cow-fork.c @@ -1,4 +1,4 @@ -/* COW fork memory isolation tests +/* CoW fork memory isolation tests * * Copyright 2026 elfuse contributors * Copyright 2025 Moritz Angermann, zw3rk pte. ltd. 
@@ -166,11 +166,11 @@ static void test_mmap_isolation(void) munmap(region, 4096); } -/* Test 4: Large region COW (verify no corruption) */ +/* Test 4: Large region CoW (verify no corruption) */ static void test_large_cow(void) { - TEST("fork: 1MiB COW integrity"); + TEST("fork: 1MiB CoW integrity"); int pipefd[2]; if (pipe(pipefd) != 0) { @@ -229,7 +229,7 @@ static void test_large_cow(void) int status; waitpid(pid, &status, 0); - EXPECT_TRUE(parent_ok && child_ok, "1MiB COW integrity failed"); + EXPECT_TRUE(parent_ok && child_ok, "1MiB CoW integrity failed"); munmap(buf, sz); } @@ -302,7 +302,7 @@ static void test_brk_isolation(void) int main(void) { - printf("test-cow-fork: COW fork memory isolation tests\n"); + printf("test-cow-fork: CoW fork memory isolation tests\n"); test_stack_isolation(); test_heap_isolation(); diff --git a/tests/test-eventfd-dup.c b/tests/test-eventfd-dup.c new file mode 100644 index 0000000..484c2d7 --- /dev/null +++ b/tests/test-eventfd-dup.c @@ -0,0 +1,65 @@ +/* test-eventfd-dup.c -- dup of eventfd shares state (Linux contract) + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Linux dup of an eventfd produces a second descriptor that points at the + * same kernel object; reads and writes on either fd see the same counter. + * elfuse used to give each dup'd guest_fd a fresh side-table slot, so + * dup'd eventfds diverged, breaking programs that signal across the + * pair. 
This test pins the contract by: + * - duping an eventfd initialised with counter=7, reading via the dup, + * verifying the dup observes the source's initial value + * - writing via the source, reading via the dup, verifying state shares + * - closing one end of the alias and continuing to operate on the other + */ + +#include +#include +#include +#include +#include +#include + +static int failures = 0; + +#define EXPECT(cond, msg) \ + do { \ + if (!(cond)) { \ + fprintf(stderr, "FAIL: %s\n", msg); \ + failures++; \ + } \ + } while (0) + +/* Runs the three contract checks from the file header in order. Each + * failed check prints a FAIL line on stderr via EXPECT; the summary + * goes to stdout and the exit status is 1 if anything failed. + */ +int main(void) +{ + int a = eventfd(7, EFD_CLOEXEC); + EXPECT(a >= 0, "eventfd(7) returned valid fd"); + int b = dup(a); + EXPECT(b >= 0, "dup(a) returned valid fd"); + if (a < 0 || b < 0) { + /* Every check below needs both descriptors; stop here so the + * setup failure is not buried under a cascade of follow-on + * FAIL lines from operations on an invalid fd. + */ + if (a >= 0) + close(a); + printf("test-eventfd-dup: %d FAIL\n", failures); + return 1; + } + + uint64_t v = 0; + EXPECT(read(b, &v, 8) == 8, "read 8 bytes from dup'd fd"); + EXPECT(v == 7, "dup'd fd observes source initial counter (7)"); + + uint64_t n = 42; + EXPECT(write(a, &n, 8) == 8, "write 42 to source fd"); + EXPECT(read(b, &v, 8) == 8, "read counter from dup'd fd"); + EXPECT(v == 42, "dup'd fd observes source write (42)"); + + close(a); + n = 99; + EXPECT(write(b, &n, 8) == 8, "write 99 to alias after closing source"); + EXPECT(read(b, &v, 8) == 8, "read after partial close"); + EXPECT(v == 99, "alias still functional after partial close"); + struct pollfd pfd = {.fd = b, .events = POLLIN}; + EXPECT(poll(&pfd, 1, 0) == 0, "alias is not readable after drain"); + close(b); + + if (failures) { + printf("test-eventfd-dup: %d FAIL\n", failures); + return 1; + } + puts("test-eventfd-dup: PASS"); + return 0; +} diff --git a/tests/test-fork-synthetic-fd.c b/tests/test-fork-synthetic-fd.c new file mode 100644 index 0000000..1e89a46 --- /dev/null +++ b/tests/test-fork-synthetic-fd.c @@ -0,0 +1,218 @@ +/* test-fork-synthetic-fd.c -- fork inheritance contract for synthetic fds + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The fork-IPC handoff does NOT serialize per-class side tables for + * 
eventfd/signalfd/timerfd/inotify/netlink/pidfd. Restoring the + * inherited host fd without that state leaves a half-functional slot, + * so fork-state.c explicitly drops these in the child. This test pins + * that contract: + * - urandom IS inherited (no per-class state to lose; cache is fresh + * in the child and arc4random_buf works) + * - eventfd / signalfd / timerfd / inotify are NOT inherited; the + * child sees EBADF and can recreate the fd at the same slot + * - the inherited host fd does not leak in the child + * + * Once a subsystem grows a serialize/restore path, the corresponding + * EBADF expectation here flips to a positive inheritance check. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int failures = 0; + +#define EXPECT(cond, msg) \ + do { \ + if (!(cond)) { \ + fprintf(stderr, "FAIL: %s\n", msg); \ + failures++; \ + } \ + } while (0) + +static int run_child(int (*fn)(int), int fd) +{ + pid_t pid = fork(); + if (pid < 0) + return -1; + if (pid == 0) + _exit(fn(fd)); + int status = 0; + if (waitpid(pid, &status, 0) < 0) + return -1; + return WIFEXITED(status) ? WEXITSTATUS(status) : -1; +} + +static int child_urandom_read(int fd) +{ + unsigned char b[8]; + if (read(fd, b, sizeof(b)) != (ssize_t) sizeof(b)) + return 1; + int seen_nonzero = 0; + for (size_t i = 0; i < sizeof(b); i++) + if (b[i] != 0) + seen_nonzero = 1; + return seen_nonzero ? 
0 : 2; +} + +static int child_ebadf_read(int fd) +{ + char buf[8] = {0}; + errno = 0; + ssize_t n = read(fd, buf, sizeof(buf)); + if (n != -1) + return 1; + if (errno != EBADF) + return 2; + return 0; +} + +static int child_ebadf_reusable_at_same_fd(int fd) +{ + int rc = child_ebadf_read(fd); + if (rc != 0) + return rc; + int again = open("/dev/null", O_RDONLY | O_CLOEXEC); + if (again < 0) + return 3; + if (again != fd) { + close(again); + return 4; + } + close(again); + return 0; +} + +static int child_eventfd_recreate(int fd) +{ + /* The inherited eventfd slot should be FD_CLOSED in the child; we + * should be able to create a fresh eventfd that works normally. + */ + char buf[8]; + errno = 0; + if (read(fd, buf, sizeof(buf)) != -1 || errno != EBADF) + return 1; + close(fd); /* harmless on a closed slot */ + int e = eventfd(0, EFD_CLOEXEC); + if (e < 0) + return 2; + uint64_t one = 1; + if (write(e, &one, sizeof(one)) != (ssize_t) sizeof(one)) { + close(e); + return 3; + } + uint64_t got = 0; + if (read(e, &got, sizeof(got)) != (ssize_t) sizeof(got) || got != 1) { + close(e); + return 4; + } + close(e); + return 0; +} + +static void test_urandom_inherited(void) +{ + int fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC); + EXPECT(fd >= 0, "open /dev/urandom"); + if (fd < 0) + return; + int rc = run_child(child_urandom_read, fd); + EXPECT(rc == 0, "child can read inherited /dev/urandom"); + close(fd); +} + +static void test_synthetic_dropped(const char *label, int (*opener)(void)) +{ + int fd = opener(); + EXPECT(fd >= 0, label); + if (fd < 0) + return; + int rc = run_child(child_ebadf_read, fd); + char msg[80]; + snprintf(msg, sizeof(msg), "child sees EBADF on inherited %s", label); + EXPECT(rc == 0, msg); + close(fd); +} + +static void test_eventfd_recreate(void) +{ + int fd = eventfd(0, EFD_CLOEXEC); + EXPECT(fd >= 0, "open eventfd"); + if (fd < 0) + return; + int rc = run_child(child_eventfd_recreate, fd); + EXPECT(rc == 0, "child can recreate eventfd after 
drop"); + close(fd); +} + +static void test_low_synthetic_dropped(void) +{ + int saved_stdin = dup(STDIN_FILENO); + EXPECT(saved_stdin >= 0, "save stdin"); + if (saved_stdin < 0) + return; + + EXPECT(close(STDIN_FILENO) == 0, "close stdin"); + int fd = eventfd(0, EFD_CLOEXEC); + EXPECT(fd == STDIN_FILENO, "eventfd reuses fd 0"); + if (fd == STDIN_FILENO) { + int rc = run_child(child_ebadf_reusable_at_same_fd, fd); + EXPECT(rc == 0, "child sees EBADF on low inherited eventfd"); + close(fd); + } else if (fd >= 0) { + close(fd); + } + + EXPECT(dup2(saved_stdin, STDIN_FILENO) == STDIN_FILENO, "restore stdin"); + close(saved_stdin); +} + +static int open_eventfd(void) +{ + return eventfd(0, EFD_CLOEXEC); +} +static int open_timerfd(void) +{ + return timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC); +} +static int open_signalfd(void) +{ + sigset_t s; + sigemptyset(&s); + sigaddset(&s, SIGUSR1); + return signalfd(-1, &s, SFD_CLOEXEC); +} +static int open_inotify(void) +{ + return inotify_init1(IN_CLOEXEC); +} + +int main(void) +{ + printf("test-fork-synthetic-fd: synthetic fd fork inheritance contract\n"); + test_urandom_inherited(); + test_synthetic_dropped("eventfd", open_eventfd); + test_synthetic_dropped("timerfd", open_timerfd); + test_synthetic_dropped("signalfd", open_signalfd); + test_synthetic_dropped("inotify", open_inotify); + test_eventfd_recreate(); + test_low_synthetic_dropped(); + if (failures) { + printf("test-fork-synthetic-fd: %d FAIL\n", failures); + return 1; + } + puts("test-fork-synthetic-fd: PASS"); + return 0; +} diff --git a/tests/test-large-io-boundary.c b/tests/test-large-io-boundary.c index 28b76e7..c50adf8 100644 --- a/tests/test-large-io-boundary.c +++ b/tests/test-large-io-boundary.c @@ -182,12 +182,88 @@ static void test_large_read_from_split_block(void) EXPECT_TRUE(ok, "read returned short count or corrupted data"); } +static void test_urandom_read_crosses_boundary(void) +{ + TEST("/dev/urandom partial read at mapping boundary"); + + size_t 
page = (size_t) sysconf(_SC_PAGESIZE); + unsigned char *map = mmap(NULL, page * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (map == MAP_FAILED) { + FAIL("mmap failed"); + return; + } + if (munmap(map + page, page) != 0) { + munmap(map, page); + FAIL("munmap guard failed"); + return; + } + + memset(map, 0, page); + + int fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC); + if (fd < 0) { + munmap(map, page); + FAIL("open failed"); + return; + } + + ssize_t ret = read(fd, map, page * 2); + close(fd); + + bool any_nonzero = false; + for (size_t i = 0; i < page; i++) { + if (map[i] != 0) { + any_nonzero = true; + break; + } + } + + munmap(map, page); + EXPECT_TRUE(ret == (ssize_t) page && any_nonzero, + "urandom read did not preserve partial boundary result"); +} + +static void test_urandom_small_read_crosses_boundary(void) +{ + TEST("/dev/urandom small read at mapping boundary"); + + size_t page = (size_t) sysconf(_SC_PAGESIZE); + unsigned char *map = mmap(NULL, page * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (map == MAP_FAILED) { + FAIL("mmap failed"); + return; + } + if (munmap(map + page, page) != 0) { + munmap(map, page); + FAIL("munmap guard failed"); + return; + } + + int fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC); + if (fd < 0) { + munmap(map, page); + FAIL("open failed"); + return; + } + + ssize_t ret = read(fd, map + page - 1, 2); + close(fd); + + munmap(map, page); + EXPECT_TRUE(ret == 1, + "urandom small boundary read did not fall back safely"); +} + int main(void) { printf("large I/O boundary tests\n\n"); test_large_write(); test_large_read_from_split_block(); + test_urandom_read_crosses_boundary(); + test_urandom_small_read_crosses_boundary(); SUMMARY("test-large-io-boundary"); return fails ? 
1 : 0; diff --git a/tests/test-matrix.sh b/tests/test-matrix.sh index e6a6140..ad6921b 100755 --- a/tests/test-matrix.sh +++ b/tests/test-matrix.sh @@ -494,7 +494,7 @@ run_unit_tests() printf "\nNegative tests\n" test_check "$runner" "test-negative" "0 failed" "$bindir/test-negative" - printf "\nCOW fork isolation\n" + printf "\nCoW fork isolation\n" test_check "$runner" "test-cow-fork" "PASS" "$bindir/test-cow-fork" printf "\nGuard page / mmap edge cases\n" diff --git a/tests/test-mremap-infra.c b/tests/test-mremap-infra.c new file mode 100644 index 0000000..a06a65f --- /dev/null +++ b/tests/test-mremap-infra.c @@ -0,0 +1,152 @@ +/* test-mremap-infra.c -- mremap/madvise must reject ranges hitting infra + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The infrastructure reserve at the top of guest IPA holds page tables, + * the EL1 shim's code, and the shim's data block. None of these are + * legal targets for guest memory management. sys_mmap MAP_FIXED, + * sys_munmap and sys_mprotect already enforce this via + * guest_range_hits_infra; sys_mremap and sys_madvise did not, leaving a + * spoofing/corruption vector for code that knows the infra GVA. + * + * This test exercises the four guarded variants: + * 1. mremap source range hits infra + * 2. mremap MREMAP_FIXED destination hits infra + * 3. mremap grow-in-place tail spills into infra + * 4. madvise(MADV_DONTNEED) on an infra range + * + * All four must fail with EINVAL. The infra base is read at runtime + * from /proc/self/maps so the test stays portable across the 36-bit + * and 40-bit IPA configurations. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#ifndef MREMAP_MAYMOVE +#define MREMAP_MAYMOVE 1 +#endif +#ifndef MREMAP_FIXED +#define MREMAP_FIXED 2 +#endif + +#define PAGE_SIZE 4096 + +static int failures = 0; + +#define EXPECT(cond, msg) \ + do { \ + if (!(cond)) { \ + fprintf(stderr, "FAIL: %s\n", msg); \ + failures++; \ + } \ + } while (0) + +/* Parse the lower bound of the named region from /proc/self/maps. + * /proc/self/maps lines look like: + * ffffffc00000-ffffffc00400 r-xp 00000000 00:00 0 [shim] + * Returns 0 if not found. + */ +static uint64_t find_region_base(const char *name) +{ + FILE *fp = fopen("/proc/self/maps", "r"); + if (!fp) + return 0; + char line[512]; + uint64_t base = 0; + while (fgets(line, sizeof(line), fp)) { + if (strstr(line, name)) { + unsigned long long lo = 0; + if (sscanf(line, "%llx-", &lo) == 1) { + base = lo; + break; + } + } + } + fclose(fp); + return base; +} + +int main(void) +{ + printf("test-mremap-infra: mremap/madvise reject infra-range targets\n"); + + /* Locate [shim-data]; if absent, [shim] is also acceptable as the + * infra reserve covers both. The test only needs ANY infra GVA. + */ + uint64_t infra = find_region_base("[shim-data]"); + if (!infra) + infra = find_region_base("[shim]"); + if (!infra) { + fprintf(stderr, + "FAIL: could not locate infra region in /proc/self/maps\n"); + return 1; + } + printf("infra base = 0x%llx\n", (unsigned long long) infra); + + /* Allocate a scratch mapping to use as the source for mremap variants. */ + void *src = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (src == MAP_FAILED) { + fprintf(stderr, "FAIL: scratch mmap: %s\n", strerror(errno)); + return 1; + } + + /* Case 1: mremap source range hits infra. The source must be a + * legal VMA for mremap to consider it, but pointing the call + * directly at the infra base is enough to make the kernel try. 
+ */ + errno = 0; + void *r = mremap((void *) (uintptr_t) infra, PAGE_SIZE, PAGE_SIZE, + MREMAP_MAYMOVE); + EXPECT(r == MAP_FAILED && errno == EINVAL, + "mremap source==infra rejected with EINVAL"); + + /* Case 2: MREMAP_FIXED destination hits infra. */ + errno = 0; + r = mremap(src, PAGE_SIZE, PAGE_SIZE, MREMAP_MAYMOVE | MREMAP_FIXED, + (void *) (uintptr_t) infra); + EXPECT(r == MAP_FAILED && errno == EINVAL, + "mremap MREMAP_FIXED dest==infra rejected with EINVAL"); + + /* Case 3: grow-in-place tail spills into infra. Map a one-page + * region immediately below the infra base. This assumes nothing + * else sits in that hole. NOTE(review): under Linux semantics + * MAP_FIXED replaces any mapping already at that address rather + * than failing, so an occupied hole would be overwritten, not + * skipped; the SKIP branch below is only reached when mmap itself + * fails. Confirm the hole is guaranteed free, or probe it first. + */ + void *base = (void *) (uintptr_t) (infra - PAGE_SIZE); + void *p = mmap(base, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (p != MAP_FAILED) { + errno = 0; + r = mremap(p, PAGE_SIZE, 2 * PAGE_SIZE, 0); + EXPECT(r == MAP_FAILED && errno == EINVAL, + "mremap grow-in-place into infra rejected with EINVAL"); + munmap(p, PAGE_SIZE); + } else { + printf( + "SKIP grow-in-place: cannot place sentinel mapping (already " + "taken)\n"); + } + + /* Case 4: madvise(MADV_DONTNEED) on an infra range. */ + errno = 0; + int rc = madvise((void *) (uintptr_t) infra, PAGE_SIZE, MADV_DONTNEED); + EXPECT(rc == -1 && errno == EINVAL, + "madvise(MADV_DONTNEED) on infra rejected with EINVAL"); + + munmap(src, PAGE_SIZE); + + if (failures) { + fprintf(stderr, "FAIL: %d check(s) failed\n", failures); + return 1; + } + printf("OK\n"); + return 0; +} diff --git a/tests/test-shim-cred-race.c b/tests/test-shim-cred-race.c new file mode 100644 index 0000000..1c747d3 --- /dev/null +++ b/tests/test-shim-cred-race.c @@ -0,0 +1,106 @@ +/* test-shim-cred-race.c -- shim identity cache stays consistent under + * concurrent setuid traffic. 
+ * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * elfuse's permission model rejects setuid/setresuid to any value not + * already in {real, effective, saved}, which means a guest binary + * cannot legally toggle between two distinct uids without privileged + * setup. This test therefore exercises the no-op-publish path: the + * mutator calls setresuid(uid, uid, uid) in a tight loop while the + * reader spins on geteuid via the shim's identity fast path. + * + * What it pins: + * + * - cred_publish_after runs without corrupting the cache: every + * reader observation must equal the initial euid. + * - The publish path is wired into the SC_FORWARD setuid family + * (a regression that bypasses publish would still pass because + * values don't change, but a regression that crashes during the + * atomic store would surface as a SIGSEGV or hang). + * + * What it does NOT pin (deferred to Slice B's attention bracket): + * + * - True cred-tearing during a multi-field publish. Demonstrating + * that requires a setuid path that mutates {uid, euid, gid, + * egid} as a coherent group; elfuse's permission model does not + * support such a state transition from the guest side. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "raw-syscall.h" + +#ifndef __NR_geteuid +#define __NR_geteuid 175 +#endif +#ifndef __NR_setresuid +#define __NR_setresuid 147 +#endif + +static atomic_int stop; +static atomic_long observed_other; +/* Set once the reader sees any value other than expected_euid. Kept + * separate from observed_other so that no observed value, -1 included, + * can be mistaken for "nothing observed". + */ +static atomic_int saw_mismatch; +static long expected_euid; + +/* Reader thread: issue raw geteuid in a loop until `stop` is set and + * record the most recent value that differs from expected_euid. + * Always returns NULL; results are read from the globals after join. + */ +static void *reader(void *arg) +{ + (void) arg; + while (!atomic_load_explicit(&stop, memory_order_relaxed)) { + long v = raw_syscall0(__NR_geteuid); + if (v != expected_euid) { + atomic_store_explicit(&observed_other, v, memory_order_relaxed); + atomic_store_explicit(&saw_mismatch, 1, memory_order_relaxed); + } + } + return NULL; +} + +/* Mutator: 50000 setresuid(euid, euid, euid) calls racing the reader. + * Exits 0 with an OK line when every call succeeded and the reader + * never saw a different euid, 1 with a FAIL line on stderr otherwise. + */ +int main(void) +{ + expected_euid = raw_syscall0(__NR_geteuid); + if (expected_euid < 0) { + fprintf(stderr, "FAIL: initial geteuid returned %ld\n", expected_euid); + return 1; + } + atomic_store(&observed_other, -1); + atomic_store(&saw_mismatch, 0); + atomic_store(&stop, 0); + + pthread_t tid; + /* pthread_create reports failure through its return value. */ + int err = pthread_create(&tid, NULL, reader, NULL); + if (err != 0) { + fprintf(stderr, "FAIL: pthread_create returned %d\n", err); + return 1; + } + + /* 50_000 no-op setresuid calls. Each triggers cred_publish_after + * on the elfuse side, racing the reader thread. + */ + for (int i = 0; i < 50000; i++) { + long r = raw_syscall3(__NR_setresuid, (long) expected_euid, + (long) expected_euid, (long) expected_euid); + if (r != 0) { + fprintf(stderr, "FAIL setresuid(%ld,%ld,%ld) iter %d: errno %ld\n", + expected_euid, expected_euid, expected_euid, i, -r); + atomic_store(&stop, 1); + pthread_join(tid, NULL); + return 1; + } + } + atomic_store(&stop, 1); + pthread_join(tid, NULL); + + /* The join above orders the reader's relaxed stores before these + * loads, so both globals are final here. + */ + long bad = atomic_load(&observed_other); + if (atomic_load(&saw_mismatch)) { + fprintf(stderr, "FAIL: reader observed euid %ld (expected %ld)\n", bad, + expected_euid); + return 1; + } + + printf("OK (50000 no-op publishes, no torn read)\n"); + return 0; +} diff --git a/tests/test-shim-data-el1.c b/tests/test-shim-data-el1.c new file mode 100644 index 0000000..b3d31c5 --- /dev/null +++ b/tests/test-shim-data-el1.c @@ -0,0 +1,224 @@ +/* test-shim-data-el1.c -- guest EL0 cannot read or write shim_data. 
+ * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The shim_data block holds the identity cache, attention flag, + * urandom bitmap, ring, and lock. Mapping it with AP[2:1]=00 + * (privileged-only) prevents a guest from spoofing its own identity + * by storing directly to the cache GVA, or from observing the bytes + * the urandom fast path will hand out next. + * + * This test: + * 1. Parses /proc/self/maps to find [shim-data]. + * 2. Verifies the permission string is "---p" (PROT_NONE). + * 3. Installs SIGSEGV handler + sigsetjmp; loads the first byte + * from the [shim-data] base; expects SIGSEGV. + * 4. Same with a store; expects SIGSEGV. + * 5. Verifies the identity and urandom fast paths still work + * AFTER the EL0 access attempts (no shim corruption). + * 6. execve's self with argv[1]='post-exec' and reruns the perms + * and fault checks against the new image. Catches the + * regression where the execve mapping path forgets to apply + * EL1-only and silently downgrades shim_data to plain RW. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static sigjmp_buf segv_jmp; + +static void on_sigsegv(int sig) +{ + (void) sig; + siglongjmp(segv_jmp, 1); +} + +static uint64_t find_shim_data_base(void) +{ + FILE *fp = fopen("/proc/self/maps", "r"); + if (!fp) + return 0; + char line[512]; + uint64_t base = 0; + while (fgets(line, sizeof(line), fp)) { + if (strstr(line, "[shim-data]")) { + unsigned long long lo = 0; + char perms[8] = {0}; + if (sscanf(line, "%llx-%*llx %7s", &lo, perms) == 2) { + printf("[shim-data] base=0x%llx perms=%s\n", lo, perms); + if (strcmp(perms, "---p") != 0) { + fprintf(stderr, + "FAIL: [shim-data] perms %s, expected ---p\n", + perms); + fclose(fp); + return 0; + } + base = lo; + } + break; + } + } + fclose(fp); + return base; +} + +static int probe_read(uint64_t addr) +{ + if (sigsetjmp(segv_jmp, 1) != 0) + return -1; /* SIGSEGV caught */ + volatile uint8_t *p = (volatile uint8_t *) (uintptr_t) addr; + volatile uint8_t v = *p; + (void) v; + return 0; +} + +static int probe_write(uint64_t addr) +{ + if (sigsetjmp(segv_jmp, 1) != 0) + return -1; + volatile uint8_t *p = (volatile uint8_t *) (uintptr_t) addr; + *p = 0xA5; + return 0; +} + +/* Phase 2 (post-execve): only the perm-string and fault checks. The + * identity and urandom sanity is already exercised in phase 1; here + * the goal is to catch a regression where execve maps shim_data with + * plain RW instead of RW_EL1_ONLY. 
+ */ +static int run_post_exec_checks(void) +{ + uint64_t base = find_shim_data_base(); + if (!base) { + fprintf(stderr, "FAIL post-exec: shim-data missing or wrong perms\n"); + return 1; + } + struct sigaction sa = {0}; + sa.sa_handler = on_sigsegv; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_NODEFER; + sigaction(SIGSEGV, &sa, NULL); + sigaction(SIGBUS, &sa, NULL); + if (probe_read(base) != -1) { + fprintf(stderr, + "FAIL post-exec: read at 0x%llx did not fault " + "(execve mapped shim_data RW instead of RW_EL1_ONLY)\n", + (unsigned long long) base); + return 1; + } + if (probe_write(base) != -1) { + fprintf(stderr, "FAIL post-exec: write at 0x%llx did not fault\n", + (unsigned long long) base); + return 1; + } + printf("OK post-exec [shim-data] still EL1-only\n"); + printf("OK\n"); + return 0; +} + +int main(int argc, char **argv) +{ + if (argc > 1 && strcmp(argv[1], "post-exec") == 0) + return run_post_exec_checks(); + + uint64_t base = find_shim_data_base(); + if (!base) + return 1; + + struct sigaction sa = {0}; + sa.sa_handler = on_sigsegv; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_NODEFER; + sigaction(SIGSEGV, &sa, NULL); + sigaction(SIGBUS, &sa, NULL); + + /* Read of [shim-data] must fault. */ + if (probe_read(base) != -1) { + fprintf(stderr, "FAIL: read at 0x%llx did not fault\n", + (unsigned long long) base); + return 1; + } + printf("OK read fault at base\n"); + + /* Read further into the region (urandom ring area, offset 0x100). */ + if (probe_read(base + 0x100) != -1) { + fprintf(stderr, "FAIL: read at 0x%llx did not fault\n", + (unsigned long long) (base + 0x100)); + return 1; + } + printf("OK read fault at base+0x100\n"); + + /* Store attempt must fault too. */ + if (probe_write(base) != -1) { + fprintf(stderr, "FAIL: write at 0x%llx did not fault\n", + (unsigned long long) base); + return 1; + } + printf("OK write fault at base\n"); + + /* After the fault attempts, identity fast path must still work. 
*/ + register long x0 __asm__("x0"); + register long x8 __asm__("x8") = 172; /* getpid */ + __asm__ volatile("svc #0" : "=r"(x0) : "r"(x8) : "memory", "cc"); + /* Copy the SVC result out of the hard register immediately. GCC + * only guarantees a register variable's contents at the asm + * statement itself; the getpid() and fprintf calls below may + * clobber x0, which would make the comparison vacuous. + */ + long raw_pid = x0; + pid_t libc_pid = getpid(); + if (raw_pid != (long) libc_pid) { + fprintf(stderr, + "FAIL: identity fast path broken after faults: " + "raw getpid=%ld libc=%d\n", + raw_pid, (int) libc_pid); + return 1; + } + printf("OK identity fast path still works (pid=%ld)\n", raw_pid); + + /* Urandom fast path too. */ + int fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) { + perror("open /dev/urandom"); + return 1; + } + char b; + if (read(fd, &b, 1) != 1) { + fprintf(stderr, "FAIL: urandom read broken after faults\n"); + close(fd); + return 1; + } + printf("OK urandom fast path still works\n"); + + /* The host syscall handlers must also refuse to act on a + * guest-supplied [shim-data] GVA. Without this defense, a guest + * could spoof the identity cache via read(fd, shim_data_gva, n) + * instead of a direct EL0 store. The host's gva_translate_perm + * rejects EL1-only descriptors before any host_base+offset write + * fires; the syscall returns EFAULT. + */ + errno = 0; + ssize_t rc = read(fd, (void *) (uintptr_t) base, 1); + if (rc != -1 || errno != EFAULT) { + fprintf(stderr, + "FAIL: read(fd, [shim-data], 1) = %zd errno=%d " + "(expected -1/EFAULT, attacker could have spoofed cache)\n", + rc, errno); + close(fd); + return 1; + } + printf("OK host-side spoofing attempt via read returned EFAULT\n"); + close(fd); + + /* Phase 2: re-exec self with argv[1]='post-exec' so the post-execve + * shim_data mapping path is exercised. If exec.c forgets to use + * RW_EL1_ONLY, the child process's [shim-data] perms come back as + * 'rw-p' and the probe_read in run_post_exec_checks succeeds (no + * SIGSEGV), failing the regression. The original child reaches + * argc=1 above; this path only runs once. 
+ */ + char *exec_argv[] = {argv[0], "post-exec", NULL}; + execv("/proc/self/exe", exec_argv); + perror("execv"); + return 1; +} diff --git a/tests/test-shim-identity-attention.c b/tests/test-shim-identity-attention.c new file mode 100644 index 0000000..d123304 --- /dev/null +++ b/tests/test-shim-identity-attention.c @@ -0,0 +1,136 @@ +/* test-shim-identity-attention.c -- SIGALRM survives fast paths. + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Slice A of the identity fast-path optimization routes getpid (and the + * other five identity syscalls) through the EL1 shim without HVC #5. + * That skips the post-HVC signal_check_timer epilogue in vcpu_run_loop, + * which is what normally notices a fired guest ITIMER_REAL and queues + * SIGALRM. Without Slice B's attention flag, a vCPU stuck in a tight + * getpid loop would never re-enter EL1 and SIGALRM would arrive late + * (worst case: not until the per-iteration vCPU alarm timeout fires, + * potentially hundreds of milliseconds). + * + * This test arms an ITIMER_REAL for 100 ms, then spins for ~1 second + * OR until SIGALRM fires. It covers both getpid via the raw SVC and a + * seeded CLOCK_REALTIME vDSO loop, because both fast paths otherwise + * bypass the HVC epilogue that runs signal_check_timer(). 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "raw-syscall.h" + +#ifndef __NR_getpid +#define __NR_getpid 172 +#endif + +static volatile sig_atomic_t alarm_fired; +static struct timespec alarm_ts; + +static void on_sigalrm(int sig) +{ + (void) sig; + clock_gettime(CLOCK_MONOTONIC, (struct timespec *) &alarm_ts); + alarm_fired = 1; +} + +static long ns_diff(const struct timespec *a, const struct timespec *b) +{ + return (long) ((a->tv_sec - b->tv_sec) * 1000000000LL + + (a->tv_nsec - b->tv_nsec)); +} + +static int run_alarm_spin(const char *name, int use_realtime_vdso) +{ + alarm_fired = 0; + alarm_ts = (struct timespec) {0}; + struct timespec t_arm; + + if (use_realtime_vdso) { + struct timespec seed; + clock_gettime(CLOCK_REALTIME, &seed); + } + + clock_gettime(CLOCK_MONOTONIC, &t_arm); + + struct itimerval iv = {0}; + iv.it_value.tv_sec = 0; + iv.it_value.tv_usec = 100 * 1000; /* 100 ms */ + if (setitimer(ITIMER_REAL, &iv, NULL) < 0) { + fprintf(stderr, "FAIL %s setitimer: %s\n", name, strerror(errno)); + return 1; + } + + /* With attention raised on setitimer arm, fast paths fall back to + * HVC and signal_check_timer eventually notices the 100 ms expiry. + * Bound the spin to 1 s so a broken attention path manifests as + * test failure rather than a hang. 
+ */ + long iterations = 0; + while (!alarm_fired) { + if (use_realtime_vdso) { + struct timespec now_rt; + clock_gettime(CLOCK_REALTIME, &now_rt); + } else { + (void) raw_syscall0(__NR_getpid); + } + iterations++; + if ((iterations & 0xFFFF) == 0) { + struct timespec now; + clock_gettime(CLOCK_MONOTONIC, &now); + if (ns_diff(&now, &t_arm) > 1000000000L) + break; + } + } + + if (!alarm_fired) { + fprintf(stderr, + "FAIL %s: SIGALRM did not fire within 1 s (iterations=%ld)\n", + name, iterations); + return 1; + } + + long delivered_ns = ns_diff((struct timespec *) &alarm_ts, &t_arm); + /* The 100 ms timer should deliver within ~150 ms in practice; + * grant 300 ms to absorb host scheduling jitter under load. + */ + if (delivered_ns > 300 * 1000 * 1000L) { + fprintf(stderr, "FAIL %s: SIGALRM delivered after %ld ns (>300 ms)\n", + name, delivered_ns); + return 1; + } + + printf("OK %s: SIGALRM after %ld ns (iterations=%ld)\n", name, delivered_ns, + iterations); + return 0; +} + +int main(void) +{ + struct sigaction sa = {0}; + sa.sa_handler = on_sigalrm; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + if (sigaction(SIGALRM, &sa, NULL) < 0) { + fprintf(stderr, "FAIL sigaction: %s\n", strerror(errno)); + return 1; + } + + if (run_alarm_spin("getpid", 0) != 0) + return 1; + if (run_alarm_spin("clock_realtime_vdso", 1) != 0) + return 1; + return 0; +} diff --git a/tests/test-shim-identity.c b/tests/test-shim-identity.c new file mode 100644 index 0000000..db529df --- /dev/null +++ b/tests/test-shim-identity.c @@ -0,0 +1,170 @@ +/* test-shim-identity.c -- verify identity syscalls do not trust vDSO memory + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * A guest can legally unmap or replace its vDSO page. getpid/getppid/getuid/ + * geteuid/getgid/getegid must still be sourced from host-side process state, + * not from guest-remappable vDSO contents. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "raw-syscall.h" + +#ifndef __NR_getpid +#define __NR_getpid 172 +#endif +#ifndef __NR_getppid +#define __NR_getppid 173 +#endif +#ifndef __NR_getuid +#define __NR_getuid 174 +#endif +#ifndef __NR_geteuid +#define __NR_geteuid 175 +#endif +#ifndef __NR_getgid +#define __NR_getgid 176 +#endif +#ifndef __NR_getegid +#define __NR_getegid 177 +#endif +#ifndef __NR_mmap +#define __NR_mmap 222 +#endif + +#define VDSO_BASE ((void *) (uintptr_t) 0x0000F000UL) +#define VDSO_SIZE 0x1000UL + +static int failures = 0; + +#define EXPECT_EQ_LONG(a_expr, b_expr, label) \ + do { \ + long _a = (long) (a_expr); \ + long _b = (long) (b_expr); \ + if (_a != _b) { \ + fprintf(stderr, "FAIL %s: %ld != %ld\n", label, _a, _b); \ + failures++; \ + } \ + } while (0) + +static long parse_status_field(const char *key) +{ + FILE *fp = fopen("/proc/self/status", "r"); + if (!fp) + return -1; + char line[256]; + long value = -1; + size_t klen = strlen(key); + while (fgets(line, sizeof(line), fp)) { + if (strncmp(line, key, klen) == 0 && line[klen] == ':') { + value = strtol(line + klen + 1, NULL, 10); + break; + } + } + fclose(fp); + return value; +} + +static void check_self(const char *phase) +{ + long pid = raw_syscall0(__NR_getpid); + long ppid = raw_syscall0(__NR_getppid); + long uid = raw_syscall0(__NR_getuid); + long euid = raw_syscall0(__NR_geteuid); + long gid = raw_syscall0(__NR_getgid); + long egid = raw_syscall0(__NR_getegid); + + EXPECT_EQ_LONG(pid, parse_status_field("Pid"), "getpid vs /proc"); + EXPECT_EQ_LONG(ppid, parse_status_field("PPid"), "getppid vs /proc"); + + /* Repeated calls must be stable. 
*/ + EXPECT_EQ_LONG(pid, raw_syscall0(__NR_getpid), "getpid repeat"); + EXPECT_EQ_LONG(uid, raw_syscall0(__NR_getuid), "getuid repeat"); + EXPECT_EQ_LONG(euid, raw_syscall0(__NR_geteuid), "geteuid repeat"); + EXPECT_EQ_LONG(gid, raw_syscall0(__NR_getgid), "getgid repeat"); + EXPECT_EQ_LONG(egid, raw_syscall0(__NR_getegid), "getegid repeat"); + + printf("%s: pid=%ld ppid=%ld uid=%ld euid=%ld gid=%ld egid=%ld\n", phase, + pid, ppid, uid, euid, gid, egid); +} + +static void remap_vdso_page(void) +{ + long p = raw_syscall6(__NR_mmap, (long) VDSO_BASE, VDSO_SIZE, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (p < 0) { + fprintf(stderr, "FAIL mmap(MAP_FIXED vDSO): %s\n", strerror((int) -p)); + failures++; + return; + } + if ((void *) (uintptr_t) p != VDSO_BASE) { + fprintf(stderr, "FAIL mmap(MAP_FIXED vDSO): got %p\n", + (void *) (uintptr_t) p); + failures++; + return; + } + + memset(VDSO_BASE, 0xA5, VDSO_SIZE); +} + +static void check_fork_child(void) +{ + long parent_pid = raw_syscall0(__NR_getpid); + pid_t kid = fork(); + if (kid < 0) { + fprintf(stderr, "FAIL fork: %s\n", strerror(errno)); + failures++; + return; + } + if (kid == 0) { + long child_pid = raw_syscall0(__NR_getpid); + long child_ppid = raw_syscall0(__NR_getppid); + if (child_pid == parent_pid) { + fprintf(stderr, + "FAIL fork-child: getpid==parent_pid (stale vvar)\n"); + _exit(2); + } + if (child_ppid != parent_pid) { + fprintf(stderr, "FAIL fork-child: getppid=%ld parent_pid=%ld\n", + child_ppid, parent_pid); + _exit(3); + } + _exit(0); + } + int wstatus = 0; + if (waitpid(kid, &wstatus, 0) != kid) { + fprintf(stderr, "FAIL fork-child waitpid: %s\n", strerror(errno)); + failures++; + return; + } + if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 0) { + fprintf(stderr, "FAIL fork-child exit status %d\n", wstatus); + failures++; + } +} + +int main(void) +{ + printf("test-shim-identity: identity syscalls ignore remapped vDSO\n"); + check_self("before-remap"); + 
remap_vdso_page(); + check_self("after-remap"); + check_fork_child(); + if (failures) { + fprintf(stderr, "FAIL: %d check(s) failed\n", failures); + return 1; + } + printf("OK\n"); + return 0; +} diff --git a/tests/test-shim-urandom-smp.c b/tests/test-shim-urandom-smp.c new file mode 100644 index 0000000..f8d657c --- /dev/null +++ b/tests/test-shim-urandom-smp.c @@ -0,0 +1,149 @@ +/* test-shim-urandom-smp.c -- multi-thread urandom-read stress. + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The shim's urandom-read fast path advances a shared ring head via + * LDXR/STXR. Concurrent vCPUs reading /dev/urandom must: + * 1. Never see a torn or zero-filled byte (host always refills + * with arc4random_buf output). + * 2. Never observe the same byte sequence as a sibling thread + * (each thread's atomic head-advance reserves its own slice + * of the ring). + * 3. Keep the head from overflowing or underflowing the ring. + * + * Each thread reads N 1-byte samples and records them in a private + * histogram. After the run we check: + * - Total bytes consumed across all threads equals N * threads. + * - No thread's per-byte distribution is degenerate (all-zero or + * all-one buckets indicate the fast path served stale memory). + * - The sums across threads differ from each other (a hard test + * that the threads are actually getting independent bytes). + * + * The test runs only under elfuse, where the urandom fast path is + * live; on native Linux the read() goes straight to the kernel and + * the same invariants hold trivially. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define NTHREADS 4 +#define NSAMPLES 16384 + +typedef struct { + int fd; + int tid_index; + int histogram[256]; + uint64_t sum; +} worker_arg_t; + +static void *worker(void *arg) +{ + worker_arg_t *w = arg; + char b; + for (int i = 0; i < NSAMPLES; i++) { + ssize_t r = read(w->fd, &b, 1); + if (r != 1) { + fprintf(stderr, + "FAIL thread %d iter %d: read returned %zd " + "(errno=%d)\n", + w->tid_index, i, r, errno); + return (void *) (uintptr_t) 1; + } + unsigned char ub = (unsigned char) b; + w->histogram[ub]++; + w->sum += ub; + } + return NULL; +} + +int main(void) +{ + /* One shared fd: every thread shares the same FD_URANDOM slot, + * so the shim's fast path is exercised on the same bitmap bit + * by all threads simultaneously. + */ + int fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) { + perror("open /dev/urandom"); + return 1; + } + + worker_arg_t workers[NTHREADS]; + pthread_t threads[NTHREADS]; + for (int i = 0; i < NTHREADS; i++) { + workers[i] = (worker_arg_t) {.fd = fd, .tid_index = i}; + /* pthread_create returns the error number; it never sets errno. */ + int err = pthread_create(&threads[i], NULL, worker, &workers[i]); + if (err != 0) { + fprintf(stderr, "FAIL pthread_create %d: %s\n", i, strerror(err)); + return 1; + } + } + + int failures = 0; + for (int i = 0; i < NTHREADS; i++) { + void *rc = NULL; + pthread_join(threads[i], &rc); + if (rc != NULL) { + failures++; + continue; + } + /* Per-thread distribution sanity: each bucket should be + * roughly NSAMPLES / 256 = 64 with stddev about 8. Flag any + * thread whose distribution is wildly off. 
+ */ + int min = NSAMPLES, max = 0, zeros = 0; + for (int b = 0; b < 256; b++) { + int c = workers[i].histogram[b]; + if (c < min) + min = c; + if (c > max) + max = c; + if (c == 0) + zeros++; + } + printf("thread %d: sum=%llu min=%d max=%d zero-buckets=%d\n", i, + (unsigned long long) workers[i].sum, min, max, zeros); + if (max == NSAMPLES) { + fprintf(stderr, "FAIL thread %d: all bytes identical\n", i); + failures++; + } + if (zeros > 32) { + fprintf(stderr, "FAIL thread %d: %d unused buckets (degenerate)\n", + i, zeros); + failures++; + } + } + close(fd); + + /* Threads must have seen different total sums. Equal sums imply + * they consumed identical byte sequences, which means the shim's + * head-advance lost the race or served stale ring data. + */ + for (int i = 0; i < NTHREADS; i++) { + for (int j = i + 1; j < NTHREADS; j++) { + if (workers[i].sum == workers[j].sum) { + fprintf(stderr, + "FAIL threads %d and %d have identical sum=%llu\n", i, + j, (unsigned long long) workers[i].sum); + failures++; + } + } + } + + if (failures) { + fprintf(stderr, "FAIL: %d issue(s)\n", failures); + return 1; + } + printf("OK: %d threads x %d 1B reads each, ring stayed consistent\n", + NTHREADS, NSAMPLES); + return 0; +} diff --git a/tests/test-shim-urandom-toctou.c b/tests/test-shim-urandom-toctou.c new file mode 100644 index 0000000..1cb0ce9 --- /dev/null +++ b/tests/test-shim-urandom-toctou.c @@ -0,0 +1,124 @@ +/* test-shim-urandom-toctou.c -- urandom EL1 fault recovery survives + * concurrent mprotect(PROT_NONE) of the read buffer. + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The urandom-read shim fast path probes the guest buffer (AT s1e0w) and + * then performs an EL1 strb into it. A sibling vCPU can mprotect or + * munmap the buffer between the probe and the store, faulting the EL1 + * write. Without handle_el1_data_abort_recover, that fault routes to + * BAD_VEC and halts the VM. 
+ * + * This test runs a tight loop of read(/dev/urandom, buf, 1) while a + * sibling thread continuously flips the buffer between PROT_READ|WRITE + * and PROT_NONE via mprotect. Expected behavior: + * - read returns 1 (success) when the buffer is RW + * - read returns -1 with errno=EFAULT when the buffer is PROT_NONE + * - elfuse never halts + * + * If the recovery handler is missing or wrong, the VM crashes mid-run + * and the test process never returns; the make-check timeout catches + * that as a failure. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PAGE_SIZE 4096 +#define ITERATIONS 20000 + +static atomic_int stop; +static atomic_int flips; +static atomic_int efault_count; +static atomic_int success_count; +static void *shared_buf; + +static void *protect_flipper(void *arg) +{ + (void) arg; + int prot = PROT_READ | PROT_WRITE; + while (!atomic_load_explicit(&stop, memory_order_relaxed)) { + prot ^= (PROT_READ | PROT_WRITE); + if (mprotect(shared_buf, PAGE_SIZE, prot) != 0) { + fprintf(stderr, "mprotect failed: %s\n", strerror(errno)); + return (void *) (uintptr_t) 1; + } + atomic_fetch_add(&flips, 1); + } + /* Leave the buffer accessible at exit. */ + mprotect(shared_buf, PAGE_SIZE, PROT_READ | PROT_WRITE); + return NULL; +} + +int main(void) +{ + shared_buf = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (shared_buf == MAP_FAILED) { + perror("mmap"); + return 1; + } + int fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) { + perror("open /dev/urandom"); + return 1; + } + + atomic_store(&stop, 0); + pthread_t flipper; + if ((errno = pthread_create(&flipper, NULL, protect_flipper, NULL)) != 0) { + perror("pthread_create"); /* errno holds the returned error code */ + return 1; + } + + /* Reader: each iteration calls read(); accepts either success or + * EFAULT. Any other result (or a crash, which manifests as the + * VM halting before we reach the join) is a failure. 
+ */ + for (int i = 0; i < ITERATIONS; i++) { + errno = 0; + /* Target shared_buf: it is the page protect_flipper toggles. */ + ssize_t r = read(fd, shared_buf, 1); + if (r == 1) { + atomic_fetch_add(&success_count, 1); + } else if (r == -1 && errno == EFAULT) { + atomic_fetch_add(&efault_count, 1); + } else { + fprintf(stderr, "FAIL iter %d: unexpected read rc=%zd errno=%d\n", + i, r, errno); + atomic_store(&stop, 1); + pthread_join(flipper, NULL); + return 1; + } + } + + atomic_store(&stop, 1); + pthread_join(flipper, NULL); + close(fd); + munmap(shared_buf, PAGE_SIZE); + + int s = atomic_load(&success_count); + int e = atomic_load(&efault_count); + int f = atomic_load(&flips); + printf("iters=%d success=%d efault=%d mprotect_flips=%d\n", ITERATIONS, s, + e, f); + if (s + e != ITERATIONS) { + fprintf(stderr, "FAIL: success+efault != iterations\n"); + return 1; + } + if (e == 0) + printf( + "WARN: no EFAULT observed; race window may be too short on " + "this host. VM did not crash, which is the primary check.\n"); + printf("OK\n"); + return 0; +} diff --git a/tests/test-shim-verbose-trace.c b/tests/test-shim-verbose-trace.c new file mode 100644 index 0000000..372facd --- /dev/null +++ b/tests/test-shim-verbose-trace.c @@ -0,0 +1,50 @@ +/* test-shim-verbose-trace.c -- fixture for verbose tracing of shim fast paths + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "raw-syscall.h" + +#ifndef __NR_openat +#define __NR_openat 56 +#endif +#ifndef __NR_close +#define __NR_close 57 +#endif +#ifndef __NR_read +#define __NR_read 63 +#endif +#ifndef __NR_getpid +#define __NR_getpid 172 +#endif + +#define AT_FDCWD -100 + +int main(void) +{ + long pid = raw_syscall0(__NR_getpid); + if (pid <= 0) { + fprintf(stderr, "getpid failed: %ld\n", pid); + return 1; + } + + long fd = raw_syscall4(__NR_openat, AT_FDCWD, (long) "/dev/urandom", 0, 0); + if (fd < 0) { + fprintf(stderr, "openat /dev/urandom failed: %ld\n", fd); + return 1; + } + + unsigned char byte = 0; + long n = 
raw_syscall3(__NR_read, fd, (long) &byte, 1); + long close_rc = raw_syscall1(__NR_close, fd); + if (n != 1 || close_rc < 0) { + fprintf(stderr, "read/close failed: n=%ld close=%ld\n", n, close_rc); + return 1; + } + + printf("pid=%ld byte=%u\n", pid, (unsigned) byte); + return 0; +} diff --git a/tests/test-syscall-smoke.c b/tests/test-syscall-smoke.c index 809998f..a59925f 100644 --- a/tests/test-syscall-smoke.c +++ b/tests/test-syscall-smoke.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -57,6 +58,10 @@ #define SYS_sigaltstack 132 #endif +#ifndef O_PATH +#define O_PATH 010000000 +#endif + #ifndef SYS_set_tid_address #define SYS_set_tid_address 96 #endif @@ -623,6 +628,262 @@ static void test_sysv_semaphore_ops(void) } } +static void test_urandom_byte_reads(void) +{ + TEST("/dev/urandom byte reads"); + int fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC); + if (fd < 0) { + FAIL("open"); + return; + } + + unsigned char bytes[32]; + for (size_t i = 0; i < sizeof(bytes); i++) { + ssize_t n = read(fd, &bytes[i], 1); + if (n != 1) { + close(fd); + FAIL("read"); + return; + } + } + close(fd); + + bool all_same = true; + for (size_t i = 1; i < sizeof(bytes); i++) { + if (bytes[i] != bytes[0]) { + all_same = false; + break; + } + } + if (all_same) { + FAIL("entropy stream did not vary"); + return; + } + PASS(); +} + +static void test_urandom_open_flags(void) +{ + TEST("/dev/urandom open flags"); + + errno = 0; + int dirfd = open("/dev/urandom", O_RDONLY | O_DIRECTORY); + if (dirfd >= 0) { + close(dirfd); + FAIL("O_DIRECTORY open succeeded"); + return; + } + if (errno != ENOTDIR) { + FAIL("O_DIRECTORY errno"); + return; + } + + int pathfd = open("/dev/urandom", O_PATH | O_CLOEXEC); + if (pathfd < 0) { + FAIL("O_PATH open"); + return; + } + unsigned char b = 0; + errno = 0; + ssize_t n = read(pathfd, &b, 1); + int saved_errno = errno; + close(pathfd); + if (n != -1 || saved_errno != EBADF) { + FAIL("O_PATH read"); + return; + } + + int wfd = 
open("/dev/urandom", O_WRONLY | O_CLOEXEC); + if (wfd < 0) { + FAIL("O_WRONLY open"); + return; + } + int fl = fcntl(wfd, F_GETFL); + errno = 0; + n = read(wfd, &b, 1); + saved_errno = errno; + close(wfd); + if (fl < 0 || (fl & O_ACCMODE) != O_WRONLY) { + FAIL("O_WRONLY F_GETFL"); + return; + } + if (n != -1 || saved_errno != EBADF) { + FAIL("O_WRONLY read"); + return; + } + + wfd = open("/dev/urandom", O_WRONLY | O_CLOEXEC); + if (wfd < 0) { + FAIL("O_WRONLY open dup"); + return; + } + int dupfd = dup(wfd); + close(wfd); + if (dupfd < 0) { + FAIL("O_WRONLY dup"); + return; + } + errno = 0; + n = read(dupfd, &b, 1); + saved_errno = errno; + close(dupfd); + if (n != -1 || saved_errno != EBADF) { + FAIL("O_WRONLY dup read"); + return; + } + + wfd = open("/dev/urandom", O_WRONLY | O_CLOEXEC); + if (wfd < 0) { + FAIL("O_WRONLY open readv"); + return; + } + struct iovec wv[2] = {{&b, 1}, {&b, 1}}; + errno = 0; + n = readv(wfd, wv, 2); + saved_errno = errno; + close(wfd); + if (n != -1 || saved_errno != EBADF) { + FAIL("O_WRONLY readv"); + return; + } + + wfd = open("/dev/urandom", O_WRONLY | O_CLOEXEC); + if (wfd < 0) { + FAIL("O_WRONLY open oversized readv"); + return; + } + struct iovec huge_wv[2] = {{&b, SSIZE_MAX}, {&b, 1}}; + errno = 0; + n = readv(wfd, huge_wv, 2); + saved_errno = errno; + close(wfd); + if (n != -1 || saved_errno != EBADF) { + FAIL("O_WRONLY oversized readv"); + return; + } + + wfd = open("/dev/urandom", O_WRONLY | O_CLOEXEC); + if (wfd < 0) { + FAIL("O_WRONLY open oversized single readv"); + return; + } + struct iovec huge_one_wv = {&b, (size_t) SSIZE_MAX + 1}; + errno = 0; + n = readv(wfd, &huge_one_wv, 1); + saved_errno = errno; + close(wfd); + if (n != -1 || saved_errno != EBADF) { + FAIL("O_WRONLY oversized single readv"); + return; + } + + int rfd = open("/dev/urandom", O_RDONLY | O_CLOEXEC); + if (rfd < 0) { + FAIL("O_RDONLY open readv"); + return; + } + unsigned char rb[2] = {0}; + struct iovec rv[2] = {{&rb[0], 1}, {&rb[1], 1}}; + n = 
readv(rfd, rv, 2); + if (n != 2) { + close(rfd); + FAIL("O_RDONLY readv"); + return; + } + + struct iovec huge[2] = {{&b, SSIZE_MAX}, {&b, 1}}; + errno = 0; + n = readv(rfd, huge, 2); + saved_errno = errno; + if (n != -1 || saved_errno != EINVAL) { + close(rfd); + FAIL("oversized readv"); + return; + } + + struct iovec huge_one = {&b, (size_t) SSIZE_MAX + 1}; + errno = 0; + n = readv(rfd, &huge_one, 1); + saved_errno = errno; + if (n != -1 || saved_errno != EINVAL) { + close(rfd); + FAIL("oversized single readv"); + return; + } + + pid_t pid = fork(); + if (pid < 0) { + close(rfd); + FAIL("fork inherited urandom"); + return; + } + if (pid == 0) { + unsigned char child_b = 0; + _exit(read(rfd, &child_b, 1) == 1 ? 0 : 1); + } + int status = 0; + waitpid(pid, &status, 0); + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + close(rfd); + FAIL("inherited urandom read"); + return; + } + + int p[2]; + if (pipe(p) != 0) { + close(rfd); + FAIL("urandom fork pipe"); + return; + } + unsigned char seed = 0; + if (read(rfd, &seed, 1) != 1) { + close(rfd); + close(p[0]); + close(p[1]); + FAIL("prime urandom cache before fork"); + return; + } + pid = fork(); + if (pid < 0) { + close(rfd); + close(p[0]); + close(p[1]); + FAIL("fork urandom cache isolation"); + return; + } + if (pid == 0) { + close(p[0]); + unsigned char child_buf[64]; + ssize_t got = read(rfd, child_buf, sizeof(child_buf)); + ssize_t put = got == (ssize_t) sizeof(child_buf) + ? write(p[1], child_buf, sizeof(child_buf)) + : -1; + close(p[1]); + _exit(put == (ssize_t) sizeof(child_buf) ? 
0 : 1); + } + close(p[1]); + unsigned char parent_buf[64]; + unsigned char child_buf[64]; + ssize_t parent_n = read(rfd, parent_buf, sizeof(parent_buf)); + ssize_t child_n = read(p[0], child_buf, sizeof(child_buf)); + close(p[0]); + status = 0; + waitpid(pid, &status, 0); + close(rfd); + if (parent_n != (ssize_t) sizeof(parent_buf) || + child_n != (ssize_t) sizeof(child_buf) || !WIFEXITED(status) || + WEXITSTATUS(status) != 0) { + FAIL("urandom fork cache isolation read"); + return; + } + if (memcmp(parent_buf, child_buf, sizeof(parent_buf)) == 0) { + FAIL("urandom fork duplicated cached bytes"); + return; + } + + PASS(); +} + int main(int argc, char **argv) { printf("test-syscall-smoke: direct syscall smoke coverage\n\n"); @@ -642,6 +903,8 @@ int main(int argc, char **argv) test_memory_stubs(); test_accept4(); test_sysv_semaphore_ops(); + test_urandom_byte_reads(); + test_urandom_open_flags(); SUMMARY("test-syscall-smoke"); return fails > 0 ? 1 : 0; diff --git a/tests/test-vdso.c b/tests/test-vdso.c new file mode 100644 index 0000000..4d32d44 --- /dev/null +++ b/tests/test-vdso.c @@ -0,0 +1,269 @@ +/* test-vdso.c -- vDSO ELF correctness and symbol-resolution probe + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Confirms the synthetic vDSO emitted by src/core/vdso.c: + * 1. is published via AT_SYSINFO_EHDR + * 2. parses as a valid ELF shared object + * 3. exports the four __kernel_* symbols at addresses inside the page + * 4. carries GNU symbol versioning naming LINUX_2.6.39 so glibc/musl + * dl_vdso_vsym() can resolve unversioned lookups + * 5. trampolines actually execute (call __kernel_clock_gettime and + * compare the result against a direct SVC clock_gettime) + * + * Static binary so the standard test driver runs it under elfuse with + * no sysroot. 
The probe walks the vDSO's dynamic linker structure + * itself rather than relying on dlsym (which is unavailable in static + * builds anyway), so a regression in the elf layout fails this test + * regardless of which libc would later consume it. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int failures = 0; + +#define FAIL(msg) \ + do { \ + fprintf(stderr, "FAIL: %s\n", msg); \ + failures++; \ + } while (0) + +#define EXPECT(cond, msg) \ + do { \ + if (!(cond)) \ + FAIL(msg); \ + } while (0) + +/* SysV ELF hash, matches the implementation in src/core/vdso.c. */ +static uint32_t elf_hash(const char *name) +{ + uint32_t h = 0, g; + while (*name) { + h = (h << 4) + (unsigned char) *name++; + g = h & 0xf0000000U; + if (g) + h ^= g >> 24; + h &= ~g; + } + return h; +} + +static const Elf64_Sym *lookup_sym(const Elf64_Ehdr *ehdr, + const Elf64_Sym *symtab, + const char *strtab, + const uint32_t *hash, + const char *name) +{ + uint32_t nbucket = hash[0]; + uint32_t nchain = hash[1]; + const uint32_t *bucket = &hash[2]; + const uint32_t *chain = &bucket[nbucket]; + uint32_t h = elf_hash(name) % nbucket; + for (uint32_t i = bucket[h]; i && i < nchain; i = chain[i]) { + if (strcmp(&strtab[symtab[i].st_name], name) == 0) + return &symtab[i]; + } + (void) ehdr; + return NULL; +} + +typedef struct { + const Elf64_Sym *symtab; + const char *strtab; + const uint32_t *hash; + const uint16_t *versym; + const Elf64_Verdef *verdef; + size_t strsz; + int verdef_count; +} vdso_t; + +static int parse_vdso(const Elf64_Ehdr *ehdr, vdso_t *v) +{ + memset(v, 0, sizeof(*v)); + const Elf64_Phdr *phdr = + (const Elf64_Phdr *) ((const uint8_t *) ehdr + ehdr->e_phoff); + const Elf64_Dyn *dyn = NULL; + for (int i = 0; i < ehdr->e_phnum; i++) { + if (phdr[i].p_type == PT_DYNAMIC) { + dyn = + (const Elf64_Dyn *) ((const uint8_t *) ehdr + phdr[i].p_offset); + break; + } + } + if (!dyn) + return -1; + for (; dyn->d_tag != DT_NULL; dyn++) 
{ + const uint8_t *p = (const uint8_t *) ehdr + dyn->d_un.d_ptr; + switch (dyn->d_tag) { + case DT_SYMTAB: + v->symtab = (const Elf64_Sym *) p; + break; + case DT_STRTAB: + v->strtab = (const char *) p; + break; + case DT_STRSZ: + v->strsz = (size_t) dyn->d_un.d_val; + break; + case DT_HASH: + v->hash = (const uint32_t *) p; + break; + case DT_VERSYM: + v->versym = (const uint16_t *) p; + break; + case DT_VERDEF: + v->verdef = (const Elf64_Verdef *) p; + break; + case DT_VERDEFNUM: + v->verdef_count = (int) dyn->d_un.d_val; + break; + default: + break; + } + } + return (v->symtab && v->strtab && v->hash) ? 0 : -1; +} + +static const char *verdef_name_for_ndx(const vdso_t *v, uint16_t ndx) +{ + const Elf64_Verdef *vd = v->verdef; + for (int i = 0; i < v->verdef_count && vd; i++) { + if (vd->vd_ndx == ndx) { + const Elf64_Verdaux *aux = + (const Elf64_Verdaux *) ((const uint8_t *) vd + vd->vd_aux); + return &v->strtab[aux->vda_name]; + } + if (!vd->vd_next) + break; + vd = (const Elf64_Verdef *) ((const uint8_t *) vd + vd->vd_next); + } + return NULL; +} + +typedef int (*clock_gettime_fn)(clockid_t, struct timespec *); + +static void test_vdso(void) +{ + unsigned long base = getauxval(AT_SYSINFO_EHDR); + EXPECT(base != 0, "AT_SYSINFO_EHDR is zero"); + if (!base) + return; + printf("AT_SYSINFO_EHDR = 0x%lx\n", base); + + const Elf64_Ehdr *ehdr = (const Elf64_Ehdr *) base; + EXPECT(memcmp(ehdr->e_ident, + "\x7f" + "ELF", + 4) == 0, + "vDSO ELF magic"); + EXPECT(ehdr->e_machine == EM_AARCH64, "vDSO e_machine"); + EXPECT(ehdr->e_type == ET_DYN, "vDSO e_type"); + + vdso_t v; + EXPECT(parse_vdso(ehdr, &v) == 0, "vDSO dynamic section parse"); + if (!v.symtab || !v.strtab || !v.hash) + return; + + /* All four __kernel_* symbols must resolve and land in the vDSO page. 
*/ + static const char *names[] = { + "__kernel_rt_sigreturn", "__kernel_clock_getres", + "__kernel_clock_gettime", "__kernel_gettimeofday"}; + const Elf64_Sym *syms[4] = {0}; + for (int i = 0; i < 4; i++) { + syms[i] = lookup_sym(ehdr, v.symtab, v.strtab, v.hash, names[i]); + char buf[64]; + snprintf(buf, sizeof(buf), "lookup %s", names[i]); + EXPECT(syms[i] != NULL, buf); + if (!syms[i]) + continue; + uint64_t addr = base + syms[i]->st_value; + snprintf(buf, sizeof(buf), "%s address in vDSO page", names[i]); + EXPECT(addr >= base && addr < base + 0x1000, buf); + } + + /* Symbol versioning: every defined symbol must point at LINUX_2.6.39. */ + EXPECT(v.versym != NULL, "vDSO DT_VERSYM present"); + EXPECT(v.verdef != NULL, "vDSO DT_VERDEF present"); + if (v.versym && v.verdef) { + for (int i = 0; i < 4; i++) { + if (!syms[i]) + continue; + uint32_t sym_idx = (uint32_t) (syms[i] - v.symtab); + uint16_t ndx = v.versym[sym_idx]; + const char *ver = verdef_name_for_ndx(&v, ndx); + char buf[80]; + snprintf(buf, sizeof(buf), "%s versioned LINUX_2.6.39", names[i]); + EXPECT(ver && strcmp(ver, "LINUX_2.6.39") == 0, buf); + } + } + + /* Direct call into the vDSO trampoline. Must agree with SVC for both + * CLOCK_MONOTONIC and CLOCK_REALTIME. The trampoline interpolates each + * clockid from a shared CNTVCT anchor pair; the seed runs on first + * call so the second clockid here always exercises the post-seed + * fast path. + */ + const Elf64_Sym *cg = + lookup_sym(ehdr, v.symtab, v.strtab, v.hash, "__kernel_clock_gettime"); + if (cg) { + clock_gettime_fn fn = + (clock_gettime_fn) (uintptr_t) (base + cg->st_value); + struct { + clockid_t id; + const char *label; + int64_t tolerance_ns; + } cases[] = { + /* CLOCK_MONOTONIC: tight tolerance, anchor-derived value + * cannot drift relative to the SVC reference beyond the gap + * between calls. 
+ */ + {CLOCK_MONOTONIC, "MONOTONIC", 10000000}, + /* CLOCK_REALTIME: tolerance loose enough to absorb host + * scheduling jitter between the two clock_gettime calls. + */ + {CLOCK_REALTIME, "REALTIME", 10000000}, + }; + for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { + struct timespec via_vdso = {0}, via_svc = {0}; + int r1 = fn(cases[i].id, &via_vdso); + int r2 = (int) syscall(SYS_clock_gettime, cases[i].id, &via_svc); + char buf[80]; + snprintf(buf, sizeof(buf), "vDSO clock_gettime(%s) returned 0", + cases[i].label); + EXPECT(r1 == 0, buf); + snprintf(buf, sizeof(buf), "SVC clock_gettime(%s) returned 0", + cases[i].label); + EXPECT(r2 == 0, buf); + int64_t delta_ns = + ((int64_t) via_svc.tv_sec - via_vdso.tv_sec) * 1000000000LL + + (via_svc.tv_nsec - via_vdso.tv_nsec); + if (delta_ns < 0) + delta_ns = -delta_ns; + snprintf(buf, sizeof(buf), "vDSO and SVC clock_gettime(%s) agree", + cases[i].label); + EXPECT(delta_ns < cases[i].tolerance_ns, buf); + printf("vDSO/SVC clock_gettime(%s) delta = %" PRId64 " ns\n", + cases[i].label, delta_ns); + } + } +} + +int main(void) +{ + printf("test-vdso: vDSO ELF + symbol-versioning probe\n"); + test_vdso(); + if (failures) { + printf("test-vdso: %d FAIL\n", failures); + return 1; + } + puts("test-vdso: PASS"); + return 0; +}