diff --git a/src/core/bootstrap.c b/src/core/bootstrap.c index 23625f0..be9df20 100644 --- a/src/core/bootstrap.c +++ b/src/core/bootstrap.c @@ -709,6 +709,7 @@ int guest_bootstrap_create_vcpu(guest_t *g, shim_globals_publish_pid(g, proc_get_pid(), proc_get_ppid()); shim_globals_publish_creds(g, proc_get_uid(), proc_get_euid(), proc_get_gid(), proc_get_egid()); + proc_publish_pgsid_snapshot(g); /* Pre-fill the entropy ring so the first read(/dev/urandom) from the guest * is served by the shim fast path with no cold-start HVC for refill. */ diff --git a/src/core/shim-globals.c b/src/core/shim-globals.c index eaf0bf9..51300dd 100644 --- a/src/core/shim-globals.c +++ b/src/core/shim-globals.c @@ -9,7 +9,9 @@ * src/core/shim.S. */ +#include #include +#include #include #include #include @@ -60,6 +62,35 @@ _Static_assert(SHIM_URANDOM_OFF_RING_LOCK == 0x10C0, "shim.S urandom fast path hard-codes RING_LOCK off 0x10C0"); _Static_assert(FD_TABLE_SIZE == 1024, "shim.S urandom fast path hard-codes FD_TABLE_SIZE 1024"); +_Static_assert(SHIM_URANDOM_INLINE_LIMIT == 256, + "shim.S urandom/getrandom fast path hard-codes 256-byte cap"); + +/* shim.S COUNTER_INC macro hardcodes (SHIM_COUNTERS_OFF & 0xFFF) and the + * 0x1, lsl #12 carry. Keep the literal in sync so a layout shift fails + * the build rather than silently routing increments to the wrong slot. + */ +_Static_assert(SHIM_COUNTERS_OFF == 0x10C8, + "shim.S COUNTER_INC hard-codes SHIM_COUNTERS_OFF=0x10C8"); +/* shim.S splits SHIM_COUNTERS_OFF into a shifted-add carry (0x1000) plus + * an imm12 load/store offset (0xC8 + slot byte). Pin the split so any + * future layout shift fails the build instead of silently routing + * increments to the wrong slot. 
+ */ +_Static_assert((SHIM_COUNTERS_OFF & 0xFFF) == 0xC8, + "shim.S SHIM_COUNTERS_OFF_LO12 hard-coded to 0xC8"); +_Static_assert((SHIM_COUNTERS_OFF & ~0xFFF) == 0x1000, + "shim.S SHIM_COUNTERS_OFF_HI hard-coded to 0x1000"); +_Static_assert(SHIM_IDENTITY_OFF_PGID == 0x1148, + "shim.S getpgid fast path hard-codes PGID off 0x1148"); +_Static_assert(SHIM_IDENTITY_OFF_SID == 0x1150, + "shim.S getsid fast path hard-codes SID off 0x1150"); +_Static_assert(SHIM_GLOBALS_SIZE >= SHIM_IDENTITY_OFF_SID + 8, + "SHIM_GLOBALS_SIZE must cover the PGID/SID slots"); +_Static_assert(SHIM_GLOBALS_SIZE <= BLOCK_2MIB, + "SHIM_GLOBALS_SIZE must fit inside the 2 MiB shim_data block"); +_Static_assert(SHIM_COUNTERS_OFF + SHIM_COUNTERS_N * 8 <= + SHIM_IDENTITY_OFF_PGID, + "counter array must not overlap the PGID slot"); static uint8_t *cache_base(const guest_t *g) { @@ -114,6 +145,13 @@ void shim_globals_publish_creds(guest_t *g, store_u64(page, SHIM_IDENTITY_OFF_EGID, egid); } +void shim_globals_publish_pgsid(guest_t *g, int64_t pgid, int64_t sid) +{ + uint8_t *page = cache_base(g); + store_u64(page, SHIM_IDENTITY_OFF_PGID, (uint64_t) pgid); + store_u64(page, SHIM_IDENTITY_OFF_SID, (uint64_t) sid); +} + uint64_t shim_globals_gva(const guest_t *g) { return g->shim_data_base; @@ -242,9 +280,18 @@ void shim_globals_rebuild_urandom_bitmap(void) } /* arc4random_buf is documented as deadlock-free and re-entrant. Used - * by both the initial fill at bootstrap and by the slow-path refill - * that runs from sys_read when the shim's fast path falls through due - * to an empty ring. + * by the initial fill at bootstrap and by the slow-path refill that + * runs from sys_read/sys_getrandom when the shim's fast path falls + * through due to an empty ring. + * + * Entropy is generated OUTSIDE the ring_lock: arc4random_buf can take + * microseconds, and any sibling vCPU that hits the fast path while the + * lock is held spins (yield) until release. 
Generate up to a full ring + * into a stack scratch buffer, then take the lock only to re-read + * head/fill and copy the publishable prefix into the ring. The recheck + * after lock acquire matters: a concurrent fast path may have advanced + * head while entropy was being generated, raising the publishable + * count beyond the pre-lock estimate. */ void shim_globals_refill_urandom_ring(guest_t *g) { @@ -254,13 +301,31 @@ void shim_globals_refill_urandom_ring(guest_t *g) uint32_t *lock_p = (uint32_t *) (base + SHIM_URANDOM_OFF_RING_LOCK); uint8_t *ring = base + SHIM_URANDOM_OFF_RING; + /* Pre-lock estimate: skip the arc4random_buf + lock when the ring + * is already full. Both cursors are read RELAXED so a torn snapshot + * (head_pre observed past a producer step but tail_pre observed + * before it) can make tail_pre - head_pre wrap to a huge unsigned + * value. A loose ">= RING_SIZE" check would treat that garbage as + * "already full" and skip a genuinely-needed refill. Only the exact + * == RING_SIZE value is a safe full-detection; any other (valid or + * torn) reading falls through to the lock-held recheck below. 
+ */ + uint32_t head_pre = __atomic_load_n(head_p, __ATOMIC_RELAXED); + uint32_t tail_pre = __atomic_load_n(tail_p, __ATOMIC_RELAXED); + uint32_t fill_pre = tail_pre - head_pre; + if (fill_pre == SHIM_URANDOM_RING_SIZE) + return; + + uint8_t scratch[SHIM_URANDOM_RING_SIZE]; + arc4random_buf(scratch, sizeof(scratch)); + urandom_ring_lock(lock_p); uint32_t head = __atomic_load_n(head_p, __ATOMIC_ACQUIRE); uint32_t tail = __atomic_load_n(tail_p, __ATOMIC_RELAXED); uint32_t fill = tail - head; if (fill >= SHIM_URANDOM_RING_SIZE) - goto out; /* already full */ + goto out; /* concurrent refill caught up */ uint32_t to_fill = SHIM_URANDOM_RING_SIZE - fill; /* Producer writes from ring[tail & (SIZE-1)] forward, wrapping @@ -270,9 +335,9 @@ void shim_globals_refill_urandom_ring(guest_t *g) uint32_t first = SHIM_URANDOM_RING_SIZE - pos; if (first > to_fill) first = to_fill; - arc4random_buf(ring + pos, first); + memcpy(ring + pos, scratch, first); if (to_fill > first) - arc4random_buf(ring, to_fill - first); + memcpy(ring, scratch + first, to_fill - first); /* Release-store the new tail so any fast-path consumer that loads * tail with an acquiring read sees the bytes already in the ring. 
@@ -359,3 +424,61 @@ void shim_globals_set_trace_enabled(guest_t *g, bool enabled) else shim_globals_attn_and(g, ~ATTN_BIT_TRACE); } + +static const char *const counter_names[SHIM_COUNTERS_N] = { + [SHIM_COUNTER_ATTN_BAIL] = "ATTN_BAIL", + [SHIM_COUNTER_URANDOM_FD_OOR] = "URANDOM_FD_OOR", + [SHIM_COUNTER_URANDOM_FD_BMISS] = "URANDOM_FD_BMISS", + [SHIM_COUNTER_URANDOM_LEN_ZERO] = "URANDOM_LEN_ZERO", + [SHIM_COUNTER_URANDOM_LEN_OVER] = "URANDOM_LEN_OVER", + [SHIM_COUNTER_URANDOM_RING_LOW] = "URANDOM_RING_LOW", + [SHIM_COUNTER_URANDOM_RING_WRAP] = "URANDOM_RING_WRAP", + [SHIM_COUNTER_URANDOM_PROBE_FAIL] = "URANDOM_PROBE_FAIL", + [SHIM_COUNTER_IDENTITY_HIT] = "IDENTITY_HIT", + [SHIM_COUNTER_URANDOM_HIT] = "URANDOM_HIT", + [SHIM_COUNTER_GETRANDOM_HIT] = "GETRANDOM_HIT", + [SHIM_COUNTER_PGSID_HIT] = "PGSID_HIT", + /* Slots 12..15 (SHIM_COUNTERS_N == 16) are intentionally unnamed; + * the dump prints "(reserved)" so they appear in the output when + * non-zero, which would flag an out-of-band increment. Bind a name + * here when a future EL1 service claims one of these slots. + */ +}; + +uint64_t shim_globals_counter_get(const guest_t *g, unsigned slot) +{ + if (slot >= SHIM_COUNTERS_N) + return 0; + const uint8_t *page = (const uint8_t *) g->host_base + g->shim_data_base; + const uint64_t *slot_p = + (const uint64_t *) (page + SHIM_COUNTERS_OFF) + slot; + return __atomic_load_n(slot_p, __ATOMIC_RELAXED); +} + +void shim_globals_counters_dump(const guest_t *g) +{ + fprintf(stderr, "shim-stats (pid=%lld)\n", (long long) proc_get_pid()); + for (unsigned i = 0; i < SHIM_COUNTERS_N; i++) { + const char *name = counter_names[i]; + uint64_t v = shim_globals_counter_get(g, i); + if (!name && v == 0) + continue; + fprintf(stderr, " %-20s %llu\n", name ? 
name : "(reserved)", + (unsigned long long) v); + } +} + +static pthread_once_t stats_once = PTHREAD_ONCE_INIT; +static bool stats_enabled_cache; + +static void stats_resolve(void) +{ + const char *v = getenv("ELFUSE_SHIM_STATS"); + stats_enabled_cache = v && v[0] && strcmp(v, "0") != 0; +} + +bool shim_globals_stats_enabled(void) +{ + pthread_once(&stats_once, stats_resolve); + return stats_enabled_cache; +} diff --git a/src/core/shim-globals.h b/src/core/shim-globals.h index 8e1a389..6aaee91 100644 --- a/src/core/shim-globals.h +++ b/src/core/shim-globals.h @@ -128,7 +128,63 @@ #define SHIM_URANDOM_RING_SIZE 4096 #define SHIM_URANDOM_OFF_RING_LOCK 0x10C0 -#define SHIM_GLOBALS_SIZE 0x10C4 +/* Upper bound on the per-call byte count served by the shim's + * urandom/getrandom fast paths. The probe coverage assumes the buffer + * spans at most two host pages so a first+last byte AT probe suffices; + * 256 fits comfortably within both 4 KiB and 16 KiB page sizes. The + * shim itself hardcodes the literal; a static_assert in shim-globals.c + * pins the C macro to the assembly. Ring wraps are handled inline by + * splitting the byte copy at the 4 KiB boundary, so this cap is bounded + * only by probe coverage and per-call ring-fill cost (256 keeps the + * 4 KiB ring serviceable for 16 sequential reads before host refill). + */ +#define SHIM_URANDOM_INLINE_LIMIT 256 + +/* Fast-path hit / miss counters. + * + * 16 uint64 slots placed after the urandom ring lock. The shim's + * identity_class_fast and urandom_read_fast paths bump the relevant + * slot on every entry and at every bail point so the host can attribute + * fast-path activity instead of guessing. Counters are non-atomic plain + * load-add-store -- under multi-vCPU concurrent bails a small fraction + * of increments race and are lost, which is acceptable for diagnostic + * ratios. 
Slots 0..7 cover the eight bail reasons the shim distinguishes + * (sticky attention, fd out of range, fd not in urandom bitmap, len zero, + * len over inline cap, ring fill below request, ring wrap, EL0 buffer + * probe failure). Slots 8..11 record fast-path hits so bail rates can be + * computed against a hit denominator. Slots 12..15 are reserved. + * + * The shim hardcodes the byte offset of each slot; the static_asserts + * in shim-globals.c keep the C-side macros and the assembly in sync. + */ +#define SHIM_COUNTERS_OFF 0x10C8 +#define SHIM_COUNTERS_N 16 + +#define SHIM_COUNTER_ATTN_BAIL 0 +#define SHIM_COUNTER_URANDOM_FD_OOR 1 +#define SHIM_COUNTER_URANDOM_FD_BMISS 2 +#define SHIM_COUNTER_URANDOM_LEN_ZERO 3 +#define SHIM_COUNTER_URANDOM_LEN_OVER 4 +#define SHIM_COUNTER_URANDOM_RING_LOW 5 +#define SHIM_COUNTER_URANDOM_RING_WRAP 6 +#define SHIM_COUNTER_URANDOM_PROBE_FAIL 7 +#define SHIM_COUNTER_IDENTITY_HIT 8 +#define SHIM_COUNTER_URANDOM_HIT 9 +#define SHIM_COUNTER_GETRANDOM_HIT 10 +#define SHIM_COUNTER_PGSID_HIT 11 + +/* Extended identity slots: pgid and sid. + * + * getpgid(0) and getsid(0) are pure cache reads when the argument is + * zero; the shim serves them out of these slots whenever X0 == 0 and + * the syscall number matches. The host re-publishes after setpgid / + * setsid / exec / fork so the slots match guest_pgid / guest_sid in + * proc-identity.c. + */ +#define SHIM_IDENTITY_OFF_PGID 0x1148 +#define SHIM_IDENTITY_OFF_SID 0x1150 + +#define SHIM_GLOBALS_SIZE 0x1158 /* Initialize the cache region to all-zero. Called once per process at * the same time the shim_data block is set up (initial bootstrap and @@ -158,6 +214,21 @@ void shim_globals_publish_creds(guest_t *g, uint32_t gid, uint32_t egid); +/* Publish pgid + sid so the shim's getpgid(0) / getsid(0) inline service + * sees the current session/process-group state. Call from process init, + * fork-child receive, exec, setsid, and setpgid. 
Slot writes are + * independent 64-bit atomic release stores. + * + * No attention bit guards this publish: setpgid / setsid are infrequent + * and the model accepts a brief window in which a concurrent + * getpgid(0) / getsid(0) on a sibling vCPU observes the pre-publish + * value (consistent with Linux's lockless session lookups). Session + * mutators and cache-initialization callers publish through proc-identity + * while holding session_lock, so successful setpgid / setsid calls cannot + * overwrite the cache out of order. + */ +void shim_globals_publish_pgsid(guest_t *g, int64_t pgid, int64_t sid); + /* GVA of the cache base. Equal to g->shim_data_base. Exposed so the * TPIDR_EL1 setup site and tests can reference one source of truth. */ @@ -306,3 +377,20 @@ void shim_globals_rebuild_urandom_bitmap(void); * forced through the host SVC. */ void shim_globals_refill_urandom_ring(guest_t *g); + +/* Counter access for diagnostics. shim_globals_counter_get returns the + * cumulative slot value (lossy under multi-vCPU bail contention; see the + * comment block on SHIM_COUNTERS_OFF). slot must be in [0, SHIM_COUNTERS_N). + * shim_globals_counters_dump writes a one-line-per-slot summary to stderr + * with the SHIM_COUNTER_* names and current values; intended for use at + * process exit when ELFUSE_SHIM_STATS is set. + */ +uint64_t shim_globals_counter_get(const guest_t *g, unsigned slot); +void shim_globals_counters_dump(const guest_t *g); + +/* ELFUSE_SHIM_STATS env-var gate (idempotent / cached). When enabled the + * exit path dumps the counter table to stderr so a single bench run + * attributes every fast-path bail without rebuilds. Mirrors the + * ELFUSE_STARTUP_TRACE pattern in core/startup-trace.h. + */ +bool shim_globals_stats_enabled(void); diff --git a/src/core/shim.S b/src/core/shim.S index a2613c3..9e3c6d0 100644 --- a/src/core/shim.S +++ b/src/core/shim.S @@ -125,6 +125,57 @@ .endr .endm +/* Counter byte offsets within shim_data. 
Mirror SHIM_COUNTER_* indices in + * src/core/shim-globals.h; the static_asserts in shim-globals.c keep both + * sides locked together. Byte offset = SHIM_COUNTERS_OFF + 8 * slot_index. + * + * SHIM_COUNTERS_OFF is 0x10C8, beyond AArch64's 12-bit add immediate range. + * Split it explicitly so COUNTER_INC can fold the carry into one shifted add + * and the low half plus the slot offset into the load/store immediate. The + * shim-globals.c static_assert pins both halves. + */ +.equ SHIM_COUNTERS_OFF_HI, 0x1000 +.equ SHIM_COUNTERS_OFF_LO12, 0xC8 + +.equ CB_ATTN_BAIL, 0 +.equ CB_URANDOM_FD_OOR, 8 +.equ CB_URANDOM_FD_BMISS, 16 +.equ CB_URANDOM_LEN_ZERO, 24 +.equ CB_URANDOM_LEN_OVER, 32 +.equ CB_URANDOM_RING_LOW, 40 +.equ CB_URANDOM_PROBE_FAIL, 56 +.equ CB_IDENTITY_HIT, 64 +.equ CB_URANDOM_HIT, 72 +.equ CB_GETRANDOM_HIT, 80 +.equ CB_PGSID_HIT, 88 +/* Byte offset 48 (slot 6, SHIM_COUNTER_URANDOM_RING_WRAP) is omitted: wrap + * is handled inline by urandom_copy_loop, so no assembly path bumps it. + * The C-side macro keeps the index for ABI stability. + */ + +/* COUNTER_INC: bump diagnostic counter at byte offset + * (SHIM_COUNTERS_OFF_LO12 + \byte_off) from TPIDR_EL1 by 1. + * + * One shifted add carries the SHIM_COUNTERS_OFF_HI (#0x1000) high half; + * the ldr/str fold the LO12 + slot-byte offset into the immediate (well + * within the imm12*8 = 32760 byte range). + * + * Non-atomic ldr/add/str. Multi-vCPU concurrent bails may race and lose + * a small fraction of counts; acceptable for diagnostic ratios. Each + * use expands to 5 instructions: mrs + add + ldr + add + str. + * + * Both scratch registers are clobbered. svc_restore_eret's + * RESTORE_GPRS_KEEP_X0 reloads X1..X30 from the saved frame, so x29/x30 + * are safe choices on any fast path. 
+ */ +.macro COUNTER_INC byte_off, tmp_addr, tmp_val + mrs \tmp_addr, tpidr_el1 + add \tmp_addr, \tmp_addr, #SHIM_COUNTERS_OFF_HI + ldr \tmp_val, [\tmp_addr, #(SHIM_COUNTERS_OFF_LO12 + \byte_off)] + add \tmp_val, \tmp_val, #1 + str \tmp_val, [\tmp_addr, #(SHIM_COUNTERS_OFF_LO12 + \byte_off)] +.endm + /* BAD_VEC: vector-table entry that reports an unexpected exception. * Each table slot is 128 bytes; the leading .align 7 places this entry at the * next 128-byte boundary. @@ -298,30 +349,63 @@ svc_handler: b.lo identity_class_fast /* 172..178 -> identity / gettid */ cmp x10, #63 /* SYS_read? */ b.eq urandom_read_fast + cmp x10, #155 /* SYS_getpgid? */ + b.eq getpgid_fast + cmp x10, #156 /* SYS_getsid? */ + b.eq getsid_fast + cmp x10, #278 /* SYS_getrandom? */ + b.eq getrandom_fast b handle_svc_0 identity_class_fast: mrs x12, tpidr_el1 /* shim-globals base */ ldar w13, [x12] /* attention flag, acquire */ - cbnz w13, handle_svc_0 /* slow-path required */ + cbnz w13, attn_bail /* slow-path required */ cmp x11, #6 /* bias == 6 ==> gettid (178) */ b.eq gettid_fast add x12, x12, #8 /* skip attention -> identity[0] */ ldr x0, [x12, x11, lsl #3] /* identity[bias] for 172..177 */ + COUNTER_INC CB_IDENTITY_HIT, x29, x30 b svc_restore_eret gettid_fast: mrs x0, contextidr_el1 /* per-vCPU tid */ + COUNTER_INC CB_IDENTITY_HIT, x29, x30 + b svc_restore_eret + +/* getpgid_fast / getsid_fast: serve getpgid(0) and getsid(0) from + * shim-globals slots. Any non-zero pid argument or set attention bit + * falls through to the host so per-pid lookups and post-setpgid/setsid + * publish ordering remain authoritative. + */ +getpgid_fast: + ldr x14, [sp, #0] /* saved X0 = pid arg */ + cbnz x14, handle_svc_0 /* pid != 0: not a pure cache read */ + mrs x12, tpidr_el1 + ldar w13, [x12] + cbnz w13, attn_bail + ldr x0, [x12, #0x1148] /* SHIM_IDENTITY_OFF_PGID */ + COUNTER_INC CB_PGSID_HIT, x29, x30 b svc_restore_eret - /* Urandom-read fast path (Slice D / P3). 
Serves - * read(urandom_fd, buf, len) with len in [1, 64] by popping - * len bytes from the shim-globals entropy ring (TPIDR_EL1 base + - * 0xC0) into the guest-supplied buffer (X1), advancing the ring - * head atomically. If the requested fd is not FD_URANDOM, or - * the ring is low, or the read would cross a ring-wrap boundary, - * falls through to handle_svc_0 so the host serves the read and - * refills the ring. +getsid_fast: + ldr x14, [sp, #0] + cbnz x14, handle_svc_0 + mrs x12, tpidr_el1 + ldar w13, [x12] + cbnz w13, attn_bail + ldr x0, [x12, #0x1150] /* SHIM_IDENTITY_OFF_SID */ + COUNTER_INC CB_PGSID_HIT, x29, x30 + b svc_restore_eret + + /* Urandom-read fast path. Serves read(urandom_fd, buf, len) with + * len in [1, SHIM_URANDOM_INLINE_LIMIT (256)] by popping len bytes + * from the shim-globals entropy ring (TPIDR_EL1 base + 0xC0) into + * the guest-supplied buffer (X1), advancing the ring head + * atomically. The 4 KiB ring boundary is handled inline via a + * split-copy with a one-shot wrap flag. If the requested fd is not + * FD_URANDOM or the ring is below the requested fill, falls through + * to handle_svc_0 so the host serves the read and refills the ring. * * Layout offsets (match core/shim-globals.h SHIM_URANDOM_OFF_*): * 0x0038 URANDOM_FD_BITMAP 1024 bits = 128 bytes @@ -333,15 +417,15 @@ gettid_fast: urandom_read_fast: mrs x12, tpidr_el1 ldar w13, [x12] /* attention flag */ - cbnz w13, handle_svc_0 + cbnz w13, attn_bail ldr x14, [sp, #0] /* saved X0 = fd */ cmp x14, #1024 /* FD_TABLE_SIZE */ - b.hs handle_svc_0 + b.hs urandom_fd_oor_bail ldr x15, [sp, #16] /* saved X2 = len */ - cbz x15, handle_svc_0 /* host handles len == 0 */ - cmp x15, #64 /* URANDOM_INLINE_LIMIT */ - b.hi handle_svc_0 + cbz x15, urandom_len_zero_bail /* host handles len == 0 */ + cmp x15, #256 /* SHIM_URANDOM_INLINE_LIMIT */ + b.hi urandom_len_over_bail /* Bitmap test: word = fd >> 6, bit = fd & 63. 
*/ add x16, x12, #0x38 /* SHIM_URANDOM_OFF_BITMAP */ @@ -349,7 +433,7 @@ urandom_read_fast: ldr x17, [x16, x17, lsl #3] and x18, x14, #63 lsr x17, x17, x18 - tbz w17, #0, handle_svc_0 + tbz w17, #0, urandom_fd_bmiss_bail ldr x20, [sp, #8] /* saved X1 = buf */ /* Probe the guest buffer for stage-1 EL0-write translations before @@ -360,25 +444,40 @@ urandom_read_fast: * entry). The DYNAMIC case where a sibling vCPU munmaps the buffer * in the window between probe and strb is caught later by the * EL1 data abort vector routing into handle_el1_data_abort_recover - * (which rolls back the ring head, releases the lock, and returns - * -EFAULT). Without that recovery the EL1 strb would fault into - * BAD_VEC and halt the VM. + * (which discards the reserved entropy, releases the lock, and + * returns -EFAULT to EL0; the ring head is not rolled back -- the + * already-published bytes are simply skipped on the next read). + * Without that recovery the EL1 strb would fault into BAD_VEC and + * halt the VM. * - * len is in [1, 64]. Probing the first and last byte covers every page - * the inline copy can touch on Linux/AArch64, whose base page size is - * much larger than the inline limit. + * len is in [1, SHIM_URANDOM_INLINE_LIMIT=256]. Probing the first + * and last byte covers every page the inline copy can touch: even + * at the smaller 4 KiB host page size a 256-byte buffer straddles + * at most one page boundary, so probe(buf) + probe(buf+len-1) hits + * both pages. The second probe is skipped when buf and buf+len-1 + * fall in the same 4 KiB page -- the dominant case for small + * crypto/SSH-handshake reads. Detected via xor + mask. 
*/ + sub x16, x15, #1 /* len - 1 */ + adds x17, x20, x16 /* last_byte = buf + len - 1 */ + b.cs urandom_probe_fail_bail /* overflow */ at s1e0w, x20 isb mrs x16, par_el1 - tbnz x16, #0, urandom_slow_no_clrex - sub x16, x15, #1 - adds x16, x20, x16 - b.cs urandom_slow_no_clrex - at s1e0w, x16 + tbnz x16, #0, urandom_probe_fail_bail + /* Same 4 KiB page? If (buf ^ last_byte) & ~0xFFF == 0, skip the + * second probe. eor + tst against the page mask is two scalar ops + * and one fused branch; cheaper than the full AT/ISB/MRS sequence + * (~5-15 ns). + */ + eor x18, x20, x17 + tst x18, #~0xFFF + b.eq 7f + at s1e0w, x17 isb mrs x16, par_el1 - tbnz x16, #0, urandom_slow_no_clrex + tbnz x16, #0, urandom_probe_fail_bail +7: /* Serialize host refill against the shim's reserve-then-copy window. * Lock word lives after the 4096-byte ring at offset 0x10C0. @@ -387,13 +486,17 @@ urandom_read_fast: add x19, x19, #0xC0 /* &ring_lock */ mov w18, #1 urandom_lock_spin: - ldaxr w17, [x19] + /* LSE swpal: atomic exchange. w17 receives the previous lock value; + * w18 (1) is stored unconditionally. If the previous was 0, we + * acquired. If it was 1, a sibling holds it; yield and retry. + * Apple Silicon implements ARMv8.1 LSE atomics, so swpal is one + * instruction (vs the prior ldaxr/stxr exclusive sequence). Release + * on unlock stays as stlr wzr, [x19]. 
+ */ + swpal w18, w17, [x19] cbnz w17, urandom_lock_busy - stxr w17, w18, [x19] - cbnz w17, urandom_lock_spin b urandom_locked urandom_lock_busy: - clrex yield b urandom_lock_spin @@ -404,12 +507,12 @@ urandom_locked: ldar w24, [x22] /* tail (host release-store) */ sub w25, w24, w23 /* fill = tail - head */ cmp w25, w15 - b.lo urandom_clrex_slow /* ring too low */ + b.lo urandom_ring_low_bail /* ring too low */ and w26, w23, #(4096 - 1) /* pos = head & (RING_SIZE - 1) */ - add w27, w26, w15 - cmp w27, #4096 - b.hi urandom_clrex_slow /* would wrap: let slow path serve */ - add w27, w23, w15 /* new head = head + len */ + add w27, w23, w15 /* new head = head + len (wraps via mask + * at the next read; the copy below + * splits at the 4 KiB boundary). + */ stxr w28, w27, [x21] cbnz w28, 0b @@ -427,7 +530,7 @@ urandom_locked: mrs x30, spsr_el1 stp x29, x30, [sp, #-16]! - /* Copy bytes from ring[pos] to buf. len is in [1, 64]. + /* Copy bytes from ring[pos] to buf. len is in [1, 256]. * w26 holds pos in [0, 4096); writing to w26 above zero-extends * into x26, so a plain reg add (no extension) is correct. */ @@ -436,7 +539,10 @@ urandom_locked: cmp x15, #1 b.ne urandom_copy_loop - /* Common case: 1-byte read. Single byte transfer. */ + /* Common case: 1-byte read. Single byte transfer. 1-byte reads + * never wrap (a single byte at pos is always within the ring), + * so the split-copy logic in urandom_copy_loop is skipped. + */ .globl urandom_strb_1byte_start .globl urandom_strb_1byte_end urandom_strb_1byte_start: @@ -446,29 +552,147 @@ urandom_strb_1byte_end: add sp, sp, #16 /* pop ELR/SPSR recovery slot */ mov x0, #1 stlr wzr, [x19] /* release ring_lock */ + /* x10 still holds the syscall nr loaded by the svc_handler + * dispatcher; none of the urandom/getrandom fast-path body writes + * x10, so it remains 63 for read and 278 for getrandom. Use it to + * pick the matching hit counter without burning a scratch register. 
+ * RESTORE_GPRS_KEEP_X0 reloads x10 from the saved frame. + */ + cmp x10, #63 + b.ne 1f + COUNTER_INC CB_URANDOM_HIT, x29, x30 + b svc_restore_eret +1: COUNTER_INC CB_GETRANDOM_HIT, x29, x30 b svc_restore_eret urandom_copy_loop: - /* Byte-wise copy for len in [2, 64]. Unrolling would help but - * the slow path is the realistic target for large reads. The - * loop runs at most 64 times; total cost is dwarfed by the EL0 - * entry/exit transitions. + /* Bulk + tail copy for len in [2, 256] with split-at-4-KiB wrap + * handling. + * + * Bulk pass moves 16 bytes per ldp/stp on Apple Silicon; tail + * peels the remaining 0..15 bytes via tbz on the bits of the + * residue. The wrap split runs the same code twice with rebased + * dst/src/limit. Unaligned ldp/stp on normal memory is supported + * by M-series with minimal penalty (one extra cycle when crossing + * a 64-byte cache line); the small penalty is dwarfed by the win + * from collapsing the byte loop's ~5 inst/byte into ldp/stp at + * 16 bytes/cycle peak. + * + * handle_el1_data_abort_recover covers a single PC range + * (urandom_strb_loop_start..urandom_strb_loop_end) that spans both + * the bulk loop and the tail copies, so any sibling-vCPU munmap + * mid-copy still routes to the EFAULT recovery path. + */ + mov w17, #4096 + sub w17, w17, w26 /* w17 = 4096 - pos */ + cmp w15, w17 + csel w17, w15, w17, ls /* w17 = first segment length */ + /* Wrap-done flag: 0 on the first segment, set to 1 before re- + * entering for the second segment. With len <= 256 and ring size + * 4096, exactly one wrap is possible, so the flag guarantees the + * loop runs at most twice (vs comparing x17 to x15, which would + * infinitely re-wrap once the segment counter is rebased). + */ + mov w14, #0 +urandom_copy_segment: + /* Inputs: x16 = src cursor, x20 = dst cursor, x17 = segment bytes. + * ldp/stp lack a register-offset form, so we use post-incremented + * addressing. 
x16 and x20 advance through the segment; the wrap + * rebase below recomputes x16 (ring base) and lets x20 keep its + * post-increment position (exactly where segment 2 must start + * writing). */ - mov x29, #0 + cmp x17, #16 + b.lo urandom_copy_tail_entry /* < 16 bytes -> tail only */ + and x28, x17, #~15 /* bulk byte count = w17 & ~15 */ + add x28, x16, x28 /* bulk_end = src + bulk_count */ .globl urandom_strb_loop_start .globl urandom_strb_loop_end urandom_strb_loop_start: -1: ldrb w0, [x16, x29] - strb w0, [x20, x29] - add x29, x29, #1 - cmp x29, x15 - b.ne 1b +1: ldp x9, x11, [x16], #16 + stp x9, x11, [x20], #16 + cmp x16, x28 + b.lo 1b +urandom_copy_tail_entry: + and x27, x17, #15 /* tail = segment_len % 16 */ + cbz x27, urandom_copy_segment_done + tbz w27, #3, 2f /* 8-byte chunk */ + ldr x9, [x16], #8 + str x9, [x20], #8 +2: tbz w27, #2, 3f /* 4-byte chunk */ + ldr w9, [x16], #4 + str w9, [x20], #4 +3: tbz w27, #1, 4f /* 2-byte chunk */ + ldrh w9, [x16], #2 + strh w9, [x20], #2 +4: tbz w27, #0, urandom_copy_segment_done + ldrb w9, [x16], #1 + strb w9, [x20], #1 urandom_strb_loop_end: +urandom_copy_segment_done: + cbnz w14, urandom_loop_done /* second segment already ran */ + cmp x17, x15 /* first_copy_len == total? */ + b.eq urandom_loop_done /* yes: no wrap needed */ + /* Wrap rebase: src goes back to ring base; dst is already at + * (original buf + first_len) thanks to post-increment, so no + * rebase needed there. Segment limit becomes the remaining count. + */ + mov w14, #1 + add x16, x12, #0xC0 /* ring base = tpidr_el1 + 0xC0 */ + sub x17, x15, x17 /* loop limit = remaining */ + b urandom_copy_segment +urandom_loop_done: add sp, sp, #16 /* pop ELR/SPSR recovery slot */ mov x0, x15 stlr wzr, [x19] /* release ring_lock */ + cmp x10, #63 + b.ne 1f + COUNTER_INC CB_URANDOM_HIT, x29, x30 + b svc_restore_eret +1: COUNTER_INC CB_GETRANDOM_HIT, x29, x30 b svc_restore_eret +/* Named bail labels: branch targets for every fast-path exit that gives + * up. 
Each increments the matching diagnostic counter and routes into + * the right slow-path predecessor: probe failures jump into the no- + * clrex tail (no exclusive monitor was opened); ring_low jumps into + * the clrex tail (the LDXR opened a monitor and the lock is held). + * Ring wrap is no longer a bail reason now that urandom_copy_loop + * splits the byte copy at the 4 KiB boundary inline. + * + * attn_bail is shared by identity/pgsid/urandom/getrandom because the + * ATTN check predates any path-specific state setup; routing every + * attention-set fast path through the same CB_ATTN_BAIL counter and + * the same handle_svc_0 branch keeps the bail cluster compact. + */ +attn_bail: + COUNTER_INC CB_ATTN_BAIL, x29, x30 + b handle_svc_0 +urandom_fd_oor_bail: + COUNTER_INC CB_URANDOM_FD_OOR, x29, x30 + b handle_svc_0 +urandom_fd_bmiss_bail: + COUNTER_INC CB_URANDOM_FD_BMISS, x29, x30 + b handle_svc_0 +urandom_len_zero_bail: + COUNTER_INC CB_URANDOM_LEN_ZERO, x29, x30 + b handle_svc_0 +urandom_len_over_bail: + COUNTER_INC CB_URANDOM_LEN_OVER, x29, x30 + b handle_svc_0 +urandom_probe_fail_bail: + COUNTER_INC CB_URANDOM_PROBE_FAIL, x29, x30 + b urandom_slow_no_clrex +urandom_ring_low_bail: + COUNTER_INC CB_URANDOM_RING_LOW, x29, x30 + b urandom_clrex_slow + /* SHIM_COUNTER_URANDOM_RING_WRAP (C-side index 6) has no assembly + * binding now that urandom_copy_loop splits the byte copy at the + * 4 KiB ring boundary inline. The slot stays in the C enum for + * ABI stability; a non-zero dump reading flags a regression that + * re-introduced a wrap bail. + */ + urandom_clrex_slow: /* LDXR opened an exclusive monitor that the slow path will not * release on its own. 
CLREX drops the monitor so subsequent @@ -487,6 +711,107 @@ urandom_slow_no_clrex: */ b handle_svc_0 +/* getrandom_fast: serve getrandom(buf, len, flags) with len in + * [1, SHIM_URANDOM_INLINE_LIMIT (256)] and flags in {0, GRND_NONBLOCK + * (0x1)} by reusing the same urandom ring as the /dev/urandom fast + * path. Shares the copy + ring epilogue at urandom_strb_1byte_start + * and urandom_copy_loop (including the 4 KiB wrap split); the success + * path discriminates GETRANDOM_HIT vs URANDOM_HIT off the syscall + * number left in x10 by the dispatcher. Any other flag bit set + * (GRND_RANDOM, GRND_INSECURE, conflicting combinations, or any + * future kernel flag) falls through to sys_getrandom so the host + * preserves the full Linux contract. + */ +getrandom_fast: + mrs x12, tpidr_el1 + ldar w13, [x12] + cbnz w13, attn_bail + + ldr x15, [sp, #8] /* saved X1 = len */ + cbz x15, getrandom_len_zero_bail + cmp x15, #256 /* SHIM_URANDOM_INLINE_LIMIT */ + b.hi getrandom_len_over_bail + + ldr x16, [sp, #16] /* saved X2 = flags */ + /* Accept flags == 0 and flags == GRND_NONBLOCK (0x1); both behave + * identically against our arc4random-backed ring (always non- + * blocking, always seeded). Any other bit set -- GRND_RANDOM (0x2), + * GRND_INSECURE (0x4), the invalid combination of the two, or any + * future kernel flag -- falls through to sys_getrandom so the host + * preserves the full Linux contract (EINVAL on conflict, etc.). + */ + cmp x16, #1 + b.hi handle_svc_0 + + ldr x20, [sp, #0] /* saved X0 = buf */ + /* See urandom_read_fast for the probe rationale; same shape, with + * the single-page skip path so single-page reads pay only one AT. 
+ */ + sub x16, x15, #1 + adds x17, x20, x16 + b.cs getrandom_probe_fail_bail + at s1e0w, x20 + isb + mrs x16, par_el1 + tbnz x16, #0, getrandom_probe_fail_bail + eor x18, x20, x17 + tst x18, #~0xFFF + b.eq 7f + at s1e0w, x17 + isb + mrs x16, par_el1 + tbnz x16, #0, getrandom_probe_fail_bail +7: + + add x19, x12, #0x1, lsl #12 + add x19, x19, #0xC0 /* &ring_lock */ + mov w18, #1 +getrandom_lock_spin: + swpal w18, w17, [x19] /* see urandom_lock_spin notes */ + cbnz w17, getrandom_lock_busy + b getrandom_locked +getrandom_lock_busy: + yield + b getrandom_lock_spin + +getrandom_locked: + add x21, x12, #0xB8 + add x22, x12, #0xBC +0: ldxr w23, [x21] + ldar w24, [x22] + sub w25, w24, w23 + cmp w25, w15 + b.lo getrandom_ring_low_bail + and w26, w23, #(4096 - 1) + add w27, w23, w15 /* new head = head + len; wrap handled + * inline by the shared copy epilogue. + */ + stxr w28, w27, [x21] + cbnz w28, 0b + + mrs x29, elr_el1 + mrs x30, spsr_el1 + stp x29, x30, [sp, #-16]! + + add x16, x12, #0xC0 + add x16, x16, x26 + cmp x15, #1 + b.eq urandom_strb_1byte_start + b urandom_copy_loop + +getrandom_len_zero_bail: + COUNTER_INC CB_URANDOM_LEN_ZERO, x29, x30 + b handle_svc_0 +getrandom_len_over_bail: + COUNTER_INC CB_URANDOM_LEN_OVER, x29, x30 + b handle_svc_0 +getrandom_probe_fail_bail: + COUNTER_INC CB_URANDOM_PROBE_FAIL, x29, x30 + b urandom_slow_no_clrex +getrandom_ring_low_bail: + COUNTER_INC CB_URANDOM_RING_LOW, x29, x30 + b urandom_clrex_slow + not_svc: /* EC=0x18: MSR/MRS / system instruction trap. */ cmp x10, #0x18 diff --git a/src/main.c b/src/main.c index 30eb23d..4b1963b 100644 --- a/src/main.c +++ b/src/main.c @@ -30,6 +30,7 @@ #include "core/bootstrap.h" #include "core/guest.h" #include "core/rosetta.h" +#include "core/shim-globals.h" #include "core/sysroot.h" #include "runtime/forkipc.h" @@ -506,6 +507,14 @@ int main(int argc, char **argv) /* Tear down debugger state before freeing guest/vCPU resources. 
*/ gdb_stub_shutdown(); + + /* Diagnostic counter dump runs before guest_destroy so the shim_data + * mapping is still valid. ELFUSE_SHIM_STATS is the gate; an unset + * variable produces no output. + */ + if (shim_globals_stats_enabled()) + shim_globals_counters_dump(&g); + cleanup_main_resources(&g, guest_initialized, &sysroot_mount, have_host_cwd ? host_cwd : NULL, guest_argv, guest_argc, elf_path, sysroot_path); diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index 3f8c4a5..a3bb2a5 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -359,6 +359,13 @@ int fork_child_main(int ipc_fd, shim_globals_set_trace_enabled(&g, verbose); shim_globals_publish_pid(&g, hdr.child_pid, hdr.parent_pid); shim_globals_publish_creds(&g, hdr.uid, hdr.euid, hdr.gid, hdr.egid); + /* proc_set_session above committed hdr.pgid/sid into proc-identity; + * mirror into the shim cache so the child's getpgid(0)/getsid(0) + * fast paths see the inherited session state from the first syscall. + * Publish via proc-identity to keep parity with the syscall-time + * session_lock ordering even though no sibling vCPU exists at this point. + */ + proc_publish_pgsid_snapshot(&g); /* Fresh entropy for the child. 
Linux's vDSO getrandom epoch-bumps * across fork; here we just re-fill the ring from arc4random_buf * which seeds from the host kernel's RNG, so parent and child do diff --git a/src/syscall/exec.c b/src/syscall/exec.c index 6d8ca2e..3b83053 100644 --- a/src/syscall/exec.c +++ b/src/syscall/exec.c @@ -88,6 +88,7 @@ static void exec_republish_shim_globals_or_die(hv_vcpu_t vcpu, shim_globals_publish_pid(g, proc_get_pid(), proc_get_ppid()); shim_globals_publish_creds(g, proc_get_uid(), proc_get_euid(), proc_get_gid(), proc_get_egid()); + proc_publish_pgsid_snapshot(g); shim_globals_rebuild_urandom_bitmap(); shim_globals_refill_urandom_ring(g); shim_globals_recompute_attention(g); diff --git a/src/syscall/proc-identity.c b/src/syscall/proc-identity.c index 0c8f05f..5a157cd 100644 --- a/src/syscall/proc-identity.c +++ b/src/syscall/proc-identity.c @@ -9,6 +9,7 @@ #include #include "syscall/abi.h" +#include "core/shim-globals.h" #include "syscall/proc-identity.h" #include "syscall/proc.h" @@ -210,6 +211,18 @@ int64_t proc_get_pgid(void) return guest_pgid; } +static void proc_publish_pgsid_locked(guest_t *g) +{ + shim_globals_publish_pgsid(g, guest_pgid, guest_sid); +} + +void proc_publish_pgsid_snapshot(guest_t *g) +{ + pthread_mutex_lock(&session_lock); + proc_publish_pgsid_locked(g); + pthread_mutex_unlock(&session_lock); +} + int64_t proc_get_fg_pgrp(void) { return guest_fg_pgrp; @@ -232,7 +245,7 @@ void proc_set_ctty(int has_ctty) guest_has_ctty = has_ctty; } -int64_t proc_sys_setsid(void) +int64_t proc_sys_setsid(guest_t *g) { int64_t pid = guest_pid; @@ -246,11 +259,12 @@ int64_t proc_sys_setsid(void) guest_pgid = pid; guest_fg_pgrp = pid; guest_has_ctty = 0; + proc_publish_pgsid_locked(g); pthread_mutex_unlock(&session_lock); return pid; } -int64_t proc_sys_setpgid(int64_t pid, int64_t pgid) +int64_t proc_sys_setpgid(guest_t *g, int64_t pid, int64_t pgid) { int64_t self = guest_pid; @@ -268,6 +282,7 @@ int64_t proc_sys_setpgid(int64_t pid, int64_t pgid) } 
guest_pgid = pgid; + proc_publish_pgsid_locked(g); pthread_mutex_unlock(&session_lock); return 0; } diff --git a/src/syscall/proc.h b/src/syscall/proc.h index c3f282e..b6c6f52 100644 --- a/src/syscall/proc.h +++ b/src/syscall/proc.h @@ -126,22 +126,31 @@ const void *proc_get_auxv(size_t *len_out); /* Set guest identity (called from fork_child_main). */ void proc_set_identity(int64_t pid, int64_t ppid); -/* Session / process-group state. - * Accessors are lock-free (_Atomic); writers are single-threaded - * (called from startup, fork child init, or setsid/setpgid). +/* Session / process-group state. Accessors are lock-free (_Atomic); syscall + * writers serialize with session_lock. */ int64_t proc_get_sid(void); int64_t proc_get_pgid(void); int64_t proc_get_fg_pgrp(void); +/* Publish the current pgid/sid pair into the shim cache while holding + * session_lock. Use this at cache initialization points so an external + * snapshot cannot overwrite a newer setpgid/setsid publish. + */ +void proc_publish_pgsid_snapshot(guest_t *g); + /* Restore session/pgid from fork IPC. */ void proc_set_session(int64_t sid, int64_t pgid); -/* setsid: create new session. Returns SID or -LINUX_EPERM. */ -int64_t proc_sys_setsid(void); +/* setsid: create new session and publish pgid/sid cache under session_lock. + * Returns SID or -LINUX_EPERM. + */ +int64_t proc_sys_setsid(guest_t *g); -/* setpgid: set process group. Returns 0 or negative errno. */ -int64_t proc_sys_setpgid(int64_t pid, int64_t pgid); +/* setpgid: set process group and publish pgid/sid cache under session_lock. + * Returns 0 or negative errno. + */ +int64_t proc_sys_setpgid(guest_t *g, int64_t pid, int64_t pgid); /* getsid: query session ID. Returns SID or -LINUX_ESRCH. 
*/ int64_t proc_sys_getsid(int64_t pid); diff --git a/src/syscall/sys.c b/src/syscall/sys.c index 1284090..8a82c36 100644 --- a/src/syscall/sys.c +++ b/src/syscall/sys.c @@ -23,6 +23,7 @@ #include "utils.h" +#include "core/shim-globals.h" #include "syscall/abi.h" #include "syscall/internal.h" #include "syscall/proc.h" @@ -221,6 +222,7 @@ int64_t sys_getrandom(guest_t *g, offset += chunk; } + shim_globals_refill_urandom_ring(g); return (int64_t) buflen; } diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index be97787..6ca1705 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -369,7 +369,7 @@ SC_FORWARD(sc_exit, SC_EXIT_SENTINEL | ((int) x0 & 0xFF)) SC_FORWARD(sc_getpid, proc_get_pid()) SC_FORWARD(sc_getppid, proc_get_ppid()) SC_FORWARD(sc_getpgid, ((int) x0 == 0 || (int) x0 == (int) proc_get_pid()) ? proc_get_pgid() : -LINUX_ESRCH) -SC_FORWARD(sc_setsid, proc_sys_setsid()) +SC_FORWARD(sc_setsid, proc_sys_setsid(g)) SC_FORWARD(sc_getsid, proc_sys_getsid((int64_t) x0)) SC_FORWARD(sc_gettid, current_thread ? 
current_thread->guest_tid : proc_get_pid()) @@ -540,7 +540,7 @@ SC_FORWARD(sc_setreuid, CRED_BRACKETED(g, proc_sys_setreuid((uint32_t) x0, (uin SC_FORWARD(sc_setregid, CRED_BRACKETED(g, proc_sys_setregid((uint32_t) x0, (uint32_t) x1))) SC_FORWARD(sc_setresuid, CRED_BRACKETED(g, proc_sys_setresuid((uint32_t) x0, (uint32_t) x1, (uint32_t) x2))) SC_FORWARD(sc_setresgid, CRED_BRACKETED(g, proc_sys_setresgid((uint32_t) x0, (uint32_t) x1, (uint32_t) x2))) -SC_FORWARD(sc_setpgid, proc_sys_setpgid((int64_t) x0, (int64_t) x1)) +SC_FORWARD(sc_setpgid, proc_sys_setpgid(g, (int64_t) x0, (int64_t) x1)) SC_STUB(sc_fadvise64, 0) SC_STUB(sc_sched_yield, (sched_yield(), 0)) SC_STUB(sc_mlock, 0) diff --git a/tests/manifest.txt b/tests/manifest.txt index 19b1b27..475f68a 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -52,6 +52,7 @@ test-shim-verbose-trace test-shim-data-el1 test-shim-urandom-smp test-shim-urandom-toctou +test-shim-urandom-wrap test-poll # diff=skip [section] I/O subsystem tests diff --git a/tests/test-shim-urandom-wrap.c b/tests/test-shim-urandom-wrap.c new file mode 100644 index 0000000..715b5b8 --- /dev/null +++ b/tests/test-shim-urandom-wrap.c @@ -0,0 +1,87 @@ +/* test-shim-urandom-wrap.c -- regression for wrapped shim urandom copies. + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The EL1 getrandom fast path copies out of a 4096-byte ring. When a read + * starts at ring[4095], the copy splits into a one-byte tail segment plus a + * wrapped second segment. A missing post-increment on the first segment's + * byte store used to make the second segment overwrite byte 0 of the caller + * buffer and leave the final requested byte untouched while still returning + * success. 
+ */
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#define RING_SIZE 4096              /* bytes in the shim urandom ring */
+#define INLINE_LIMIT 256            /* largest len the fast path serves */
+#define SLOW_LEN (INLINE_LIMIT + 1) /* one byte past the fast-path cap */
+#define ITERATIONS 8                /* wrapped reads attempted */
+#define SENTINEL 0xA5               /* pre-fill pattern for the buffer */
+
+/* Issue getrandom(buf, len, 0) through the raw syscall and require that all
+ * len bytes are returned. Returns 0 on a full read; otherwise reports the
+ * result on stderr and returns -1.
+ */
+static int getrandom_exact(void *buf, size_t len)
+{
+    /* errno is only written when the call fails. Clear it first so a short
+     * read does not report a stale value left behind by an earlier call.
+     */
+    errno = 0;
+    long r = syscall(SYS_getrandom, buf, len, 0);
+    if (r != (long) len) {
+        fprintf(stderr, "getrandom(%zu) returned %ld errno=%d\n", len, r,
+                errno);
+        return -1;
+    }
+    return 0;
+}
+
+/* Request SLOW_LEN bytes. That is one more than INLINE_LIMIT, so the call is
+ * too large for the fast path and is serviced by the host, which the test
+ * relies on to top the ring back up. Returns 0 on success, -1 on failure.
+ */
+static int refill_ring(void)
+{
+    unsigned char scratch[SLOW_LEN];
+    return getrandom_exact(scratch, sizeof(scratch));
+}
+
+/* Consume the ring one byte at a time until the modelled read offset *pos
+ * equals target. Every successful one-byte call advances *pos by one, modulo
+ * RING_SIZE. Returns 0 once the target is reached, -1 as soon as a call
+ * fails.
+ */
+static int advance_fast_bytes(unsigned *pos, unsigned target)
+{
+    unsigned char b;
+
+    while (*pos != target) {
+        if (getrandom_exact(&b, 1) != 0)
+            return -1;
+        *pos = (*pos + 1) & (RING_SIZE - 1);
+    }
+    return 0;
+}
+
+/* Each iteration parks the modelled ring offset on the last slot
+ * (RING_SIZE - 1) and then issues an INLINE_LIMIT-byte read, which has to
+ * wrap. The buffer is pre-filled with SENTINEL, so a correct copy replaces
+ * the final byte with random data. A random byte can equal SENTINEL by
+ * chance, so the test fails only when the final byte is untouched in every
+ * iteration. Exits 0 on success and 1 on any failure.
+ */
+int main(void)
+{
+    /* Model of the ring read offset. The test cannot observe the real
+     * offset; it presumes the offset is 0 when main starts.
+     */
+    unsigned pos = 0;
+    int untouched = 0;
+
+    for (int i = 0; i < ITERATIONS; i++) {
+        if (refill_ring() != 0)
+            return 1;
+        if (advance_fast_bytes(&pos, RING_SIZE - 1) != 0)
+            return 1;
+        /* Top the ring up again so the wrapped read has enough bytes. */
+        if (refill_ring() != 0)
+            return 1;
+
+        unsigned char buf[INLINE_LIMIT];
+        memset(buf, SENTINEL, sizeof(buf));
+        if (getrandom_exact(buf, sizeof(buf)) != 0)
+            return 1;
+        pos = (pos + INLINE_LIMIT) & (RING_SIZE - 1);
+
+        if (buf[sizeof(buf) - 1] == SENTINEL)
+            untouched++;
+    }
+
+    if (untouched == ITERATIONS) {
+        fprintf(stderr,
+                "FAIL: wrapped getrandom left the final byte untouched\n");
+        return 1;
+    }
+
+    printf("OK: wrapped getrandom wrote through the caller buffer\n");
+    return 0;
+}