diff --git a/src/core/bootstrap.c b/src/core/bootstrap.c
index 23625f0..be9df20 100644
--- a/src/core/bootstrap.c
+++ b/src/core/bootstrap.c
@@ -709,6 +709,7 @@ int guest_bootstrap_create_vcpu(guest_t *g,
     shim_globals_publish_pid(g, proc_get_pid(), proc_get_ppid());
     shim_globals_publish_creds(g, proc_get_uid(), proc_get_euid(),
                                proc_get_gid(), proc_get_egid());
+    proc_publish_pgsid_snapshot(g);
     /* Pre-fill the entropy ring so the first read(/dev/urandom) from the guest
      * is served by the shim fast path with no cold-start HVC for refill.
      */
diff --git a/src/core/shim-globals.c b/src/core/shim-globals.c
index eaf0bf9..51300dd 100644
--- a/src/core/shim-globals.c
+++ b/src/core/shim-globals.c
@@ -9,7 +9,9 @@
  * src/core/shim.S.
  */
 
+#include <pthread.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sched.h>
@@ -60,6 +62,35 @@ _Static_assert(SHIM_URANDOM_OFF_RING_LOCK == 0x10C0,
                "shim.S urandom fast path hard-codes RING_LOCK off 0x10C0");
 _Static_assert(FD_TABLE_SIZE == 1024,
                "shim.S urandom fast path hard-codes FD_TABLE_SIZE 1024");
+_Static_assert(SHIM_URANDOM_INLINE_LIMIT == 256,
+               "shim.S urandom/getrandom fast path hard-codes 256-byte cap");
+
+/* shim.S COUNTER_INC macro hardcodes (SHIM_COUNTERS_OFF & 0xFFF) and the
+ * 0x1, lsl #12 carry. Keep the literal in sync so a layout shift fails
+ * the build rather than silently routing increments to the wrong slot.
+ */
+_Static_assert(SHIM_COUNTERS_OFF == 0x10C8,
+               "shim.S COUNTER_INC hard-codes SHIM_COUNTERS_OFF=0x10C8");
+/* shim.S splits SHIM_COUNTERS_OFF into a shifted-add carry (0x1000) plus
+ * an imm12 load/store offset (0xC8 + slot byte). Pin the split so any
+ * future layout shift fails the build instead of silently routing
+ * increments to the wrong slot.
+ */
+_Static_assert((SHIM_COUNTERS_OFF & 0xFFF) == 0xC8,
+               "shim.S SHIM_COUNTERS_OFF_LO12 hard-coded to 0xC8");
+_Static_assert((SHIM_COUNTERS_OFF & ~0xFFF) == 0x1000,
+               "shim.S SHIM_COUNTERS_OFF_HI hard-coded to 0x1000");
+_Static_assert(SHIM_IDENTITY_OFF_PGID == 0x1148,
+               "shim.S getpgid fast path hard-codes PGID off 0x1148");
+_Static_assert(SHIM_IDENTITY_OFF_SID == 0x1150,
+               "shim.S getsid fast path hard-codes SID off 0x1150");
+_Static_assert(SHIM_GLOBALS_SIZE >= SHIM_IDENTITY_OFF_SID + 8,
+               "SHIM_GLOBALS_SIZE must cover the PGID/SID slots");
+_Static_assert(SHIM_GLOBALS_SIZE <= BLOCK_2MIB,
+               "SHIM_GLOBALS_SIZE must fit inside the 2 MiB shim_data block");
+_Static_assert(SHIM_COUNTERS_OFF + SHIM_COUNTERS_N * 8 <=
+                   SHIM_IDENTITY_OFF_PGID,
+               "counter array must not overlap the PGID slot");
 
 static uint8_t *cache_base(const guest_t *g)
 {
@@ -114,6 +145,13 @@ void shim_globals_publish_creds(guest_t *g,
     store_u64(page, SHIM_IDENTITY_OFF_EGID, egid);
 }
 
+void shim_globals_publish_pgsid(guest_t *g, int64_t pgid, int64_t sid)
+{
+    uint8_t *page = cache_base(g);
+    store_u64(page, SHIM_IDENTITY_OFF_PGID, (uint64_t) pgid);
+    store_u64(page, SHIM_IDENTITY_OFF_SID, (uint64_t) sid);
+}
+
 uint64_t shim_globals_gva(const guest_t *g)
 {
     return g->shim_data_base;
@@ -242,9 +280,18 @@ void shim_globals_rebuild_urandom_bitmap(void)
 }
 
 /* arc4random_buf is documented as deadlock-free and re-entrant. Used
- * by both the initial fill at bootstrap and by the slow-path refill
- * that runs from sys_read when the shim's fast path falls through due
- * to an empty ring.
+ * by the initial fill at bootstrap and by the slow-path refill that
+ * runs from sys_read/sys_getrandom when the shim's fast path falls
+ * through due to an empty ring.
+ *
+ * Entropy is generated OUTSIDE the ring_lock: arc4random_buf can take
+ * microseconds, and any sibling vCPU that hits the fast path while the
+ * lock is held spins (yield) until release. Generate a full ring's worth
+ * into a stack scratch buffer, then take the lock only to re-read
+ * head/tail and copy the publishable prefix into the ring. The recheck
+ * after lock acquire matters: a concurrent fast path may have advanced
+ * head while entropy was being generated, raising the publishable
+ * count beyond the pre-lock estimate.
  */
 void shim_globals_refill_urandom_ring(guest_t *g)
 {
@@ -254,13 +301,31 @@ void shim_globals_refill_urandom_ring(guest_t *g)
     uint32_t *lock_p = (uint32_t *) (base + SHIM_URANDOM_OFF_RING_LOCK);
     uint8_t *ring = base + SHIM_URANDOM_OFF_RING;
 
+    /* Pre-lock estimate: skip the arc4random_buf + lock when the ring
+     * is already full. Both cursors are read RELAXED so a torn snapshot
+     * (head_pre observed past a producer step but tail_pre observed
+     * before it) can make tail_pre - head_pre wrap to a huge unsigned
+     * value. A loose ">= RING_SIZE" check would treat that garbage as
+     * "already full" and skip a genuinely-needed refill. Only the exact
+     * == RING_SIZE value is a safe full-detection; any other (valid or
+     * torn) reading falls through to the lock-held recheck below.
+     */
+    uint32_t head_pre = __atomic_load_n(head_p, __ATOMIC_RELAXED);
+    uint32_t tail_pre = __atomic_load_n(tail_p, __ATOMIC_RELAXED);
+    uint32_t fill_pre = tail_pre - head_pre;
+    if (fill_pre == SHIM_URANDOM_RING_SIZE)
+        return;
+
+    uint8_t scratch[SHIM_URANDOM_RING_SIZE];
+    arc4random_buf(scratch, sizeof(scratch));
+
     urandom_ring_lock(lock_p);
 
     uint32_t head = __atomic_load_n(head_p, __ATOMIC_ACQUIRE);
     uint32_t tail = __atomic_load_n(tail_p, __ATOMIC_RELAXED);
     uint32_t fill = tail - head;
     if (fill >= SHIM_URANDOM_RING_SIZE)
-        goto out; /* already full */
+        goto out; /* concurrent refill caught up */
     uint32_t to_fill = SHIM_URANDOM_RING_SIZE - fill;
 
     /* Producer writes from ring[tail & (SIZE-1)] forward, wrapping
@@ -270,9 +335,9 @@ void shim_globals_refill_urandom_ring(guest_t *g)
     uint32_t first = SHIM_URANDOM_RING_SIZE - pos;
     if (first > to_fill)
         first = to_fill;
-    arc4random_buf(ring + pos, first);
+    memcpy(ring + pos, scratch, first);
     if (to_fill > first)
-        arc4random_buf(ring, to_fill - first);
+        memcpy(ring, scratch + first, to_fill - first);
 
     /* Release-store the new tail so any fast-path consumer that loads
      * tail with an acquiring read sees the bytes already in the ring.
@@ -359,3 +424,61 @@ void shim_globals_set_trace_enabled(guest_t *g, bool enabled)
     else
         shim_globals_attn_and(g, ~ATTN_BIT_TRACE);
 }
+
+static const char *const counter_names[SHIM_COUNTERS_N] = {
+    [SHIM_COUNTER_ATTN_BAIL] = "ATTN_BAIL",
+    [SHIM_COUNTER_URANDOM_FD_OOR] = "URANDOM_FD_OOR",
+    [SHIM_COUNTER_URANDOM_FD_BMISS] = "URANDOM_FD_BMISS",
+    [SHIM_COUNTER_URANDOM_LEN_ZERO] = "URANDOM_LEN_ZERO",
+    [SHIM_COUNTER_URANDOM_LEN_OVER] = "URANDOM_LEN_OVER",
+    [SHIM_COUNTER_URANDOM_RING_LOW] = "URANDOM_RING_LOW",
+    [SHIM_COUNTER_URANDOM_RING_WRAP] = "URANDOM_RING_WRAP",
+    [SHIM_COUNTER_URANDOM_PROBE_FAIL] = "URANDOM_PROBE_FAIL",
+    [SHIM_COUNTER_IDENTITY_HIT] = "IDENTITY_HIT",
+    [SHIM_COUNTER_URANDOM_HIT] = "URANDOM_HIT",
+    [SHIM_COUNTER_GETRANDOM_HIT] = "GETRANDOM_HIT",
+    [SHIM_COUNTER_PGSID_HIT] = "PGSID_HIT",
+    /* Slots 12..15 (SHIM_COUNTERS_N == 16) are intentionally unnamed;
+     * the dump prints "(reserved)" so they appear in the output when
+     * non-zero, which would flag an out-of-band increment. Bind a name
+     * here when a future EL1 service claims one of these slots.
+     */
+};
+
+uint64_t shim_globals_counter_get(const guest_t *g, unsigned slot)
+{
+    if (slot >= SHIM_COUNTERS_N)
+        return 0;
+    const uint8_t *page = (const uint8_t *) g->host_base + g->shim_data_base;
+    const uint64_t *slot_p =
+        (const uint64_t *) (page + SHIM_COUNTERS_OFF) + slot;
+    return __atomic_load_n(slot_p, __ATOMIC_RELAXED);
+}
+
+void shim_globals_counters_dump(const guest_t *g)
+{
+    fprintf(stderr, "shim-stats (pid=%lld)\n", (long long) proc_get_pid());
+    for (unsigned i = 0; i < SHIM_COUNTERS_N; i++) {
+        const char *name = counter_names[i];
+        uint64_t v = shim_globals_counter_get(g, i);
+        if (!name && v == 0)
+            continue;
+        fprintf(stderr, "  %-20s %llu\n", name ? name : "(reserved)",
+                (unsigned long long) v);
+    }
+}
+
+static pthread_once_t stats_once = PTHREAD_ONCE_INIT;
+static bool stats_enabled_cache;
+
+static void stats_resolve(void)
+{
+    const char *v = getenv("ELFUSE_SHIM_STATS");
+    stats_enabled_cache = v && v[0] && strcmp(v, "0") != 0;
+}
+
+bool shim_globals_stats_enabled(void)
+{
+    pthread_once(&stats_once, stats_resolve);
+    return stats_enabled_cache;
+}
diff --git a/src/core/shim-globals.h b/src/core/shim-globals.h
index 8e1a389..6aaee91 100644
--- a/src/core/shim-globals.h
+++ b/src/core/shim-globals.h
@@ -128,7 +128,63 @@
 #define SHIM_URANDOM_RING_SIZE 4096
 #define SHIM_URANDOM_OFF_RING_LOCK 0x10C0
 
-#define SHIM_GLOBALS_SIZE 0x10C4
+/* Upper bound on the per-call byte count served by the shim's
+ * urandom/getrandom fast paths. The probe coverage assumes the buffer
+ * spans at most two host pages so a first+last byte AT probe suffices;
+ * 256 fits comfortably within both 4 KiB and 16 KiB page sizes. The
+ * shim itself hardcodes the literal; a static_assert in shim-globals.c
+ * pins the C macro to the assembly. Ring wraps are handled inline by
+ * splitting the byte copy at the 4 KiB boundary, so this cap is bounded
+ * only by probe coverage and per-call ring-fill cost (256 keeps the
+ * 4 KiB ring serviceable for 16 sequential reads before host refill).
+ */
+#define SHIM_URANDOM_INLINE_LIMIT 256
+
+/* Fast-path hit / miss counters.
+ *
+ * 16 uint64 slots placed after the urandom ring lock. The shim's fast
+ * paths bump the relevant slot on every hit and at every counted bail
+ * point so the host can attribute fast-path activity instead of
+ * guessing. Counters are non-atomic plain load-add-store -- under
+ * multi-vCPU concurrent bails a small fraction of increments race and
+ * are lost, which is acceptable for diagnostic ratios. Slots 0..7 name
+ * the bail reasons (sticky attention, fd out of range, fd not in urandom
+ * bitmap, len zero, len over inline cap, ring fill below request, ring
+ * wrap -- retired, no shim path bumps it since the copy splits at the
+ * ring boundary -- and EL0 buffer probe failure). Slots 8..11 record
+ * fast-path hits for a hit denominator. Slots 12..15 are reserved.
+ *
+ * The shim hardcodes the byte offset of each slot; the static_asserts
+ * in shim-globals.c keep the C-side macros and the assembly in sync.
+ */
+#define SHIM_COUNTERS_OFF 0x10C8
+#define SHIM_COUNTERS_N 16
+
+#define SHIM_COUNTER_ATTN_BAIL 0
+#define SHIM_COUNTER_URANDOM_FD_OOR 1
+#define SHIM_COUNTER_URANDOM_FD_BMISS 2
+#define SHIM_COUNTER_URANDOM_LEN_ZERO 3
+#define SHIM_COUNTER_URANDOM_LEN_OVER 4
+#define SHIM_COUNTER_URANDOM_RING_LOW 5
+#define SHIM_COUNTER_URANDOM_RING_WRAP 6
+#define SHIM_COUNTER_URANDOM_PROBE_FAIL 7
+#define SHIM_COUNTER_IDENTITY_HIT 8
+#define SHIM_COUNTER_URANDOM_HIT 9
+#define SHIM_COUNTER_GETRANDOM_HIT 10
+#define SHIM_COUNTER_PGSID_HIT 11
+
+/* Extended identity slots: pgid and sid.
+ *
+ * getpgid(0) and getsid(0) are pure cache reads when the argument is
+ * zero; the shim serves them out of these slots whenever X0 == 0, the
+ * attention flag is clear, and the syscall number matches. The host
+ * re-publishes after setpgid / setsid / exec / fork so the slots match
+ * guest_pgid / guest_sid in proc-identity.c.
+ */
+#define SHIM_IDENTITY_OFF_PGID 0x1148
+#define SHIM_IDENTITY_OFF_SID 0x1150
+
+#define SHIM_GLOBALS_SIZE 0x1158
 
 /* Initialize the cache region to all-zero. Called once per process at
  * the same time the shim_data block is set up (initial bootstrap and
@@ -158,6 +214,21 @@ void shim_globals_publish_creds(guest_t *g,
                                 uint32_t gid,
                                 uint32_t egid);
 
+/* Publish pgid + sid so the shim's getpgid(0) / getsid(0) inline service
+ * sees the current session/process-group state. Call from process init,
+ * fork-child receive, exec, setsid, and setpgid. Slot writes are
+ * independent 64-bit atomic release stores.
+ *
+ * No attention bit guards this publish: setpgid / setsid are infrequent
+ * and the model accepts a brief window in which a concurrent
+ * getpgid(0) / getsid(0) on a sibling vCPU observes the pre-publish
+ * value (consistent with Linux's lockless session lookups). Session
+ * mutators and cache-initialization callers publish through proc-identity
+ * while holding session_lock, so successful setpgid / setsid calls cannot
+ * overwrite the cache out of order.
+ */
+void shim_globals_publish_pgsid(guest_t *g, int64_t pgid, int64_t sid);
+
 /* GVA of the cache base. Equal to g->shim_data_base. Exposed so the
  * TPIDR_EL1 setup site and tests can reference one source of truth.
  */
@@ -306,3 +377,20 @@ void shim_globals_rebuild_urandom_bitmap(void);
  * forced through the host SVC.
  */
 void shim_globals_refill_urandom_ring(guest_t *g);
+
+/* Counter access for diagnostics. shim_globals_counter_get returns the
+ * cumulative slot value (lossy under multi-vCPU bail contention; see the
+ * comment block on SHIM_COUNTERS_OFF); a slot >= SHIM_COUNTERS_N reads as 0.
+ * shim_globals_counters_dump writes one line per slot to stderr with the
+ * SHIM_COUNTER_* name and current value, skipping unnamed slots that are
+ * process exit when ELFUSE_SHIM_STATS is set.
+ */
+uint64_t shim_globals_counter_get(const guest_t *g, unsigned slot);
+void shim_globals_counters_dump(const guest_t *g);
+
+/* ELFUSE_SHIM_STATS env-var gate (idempotent / cached). When enabled the
+ * exit path dumps the counter table to stderr so a single bench run
+ * attributes every fast-path bail without rebuilds. Mirrors the
+ * ELFUSE_STARTUP_TRACE pattern in core/startup-trace.h.
+ */
+bool shim_globals_stats_enabled(void);
diff --git a/src/core/shim.S b/src/core/shim.S
index a2613c3..9e3c6d0 100644
--- a/src/core/shim.S
+++ b/src/core/shim.S
@@ -125,6 +125,57 @@
     .endr
 .endm
 
+/* Counter byte offsets within shim_data. Mirror SHIM_COUNTER_* indices in
+ * src/core/shim-globals.h; the static_asserts in shim-globals.c keep both
+ * sides locked together. Byte offset = SHIM_COUNTERS_OFF + 8 * slot_index.
+ *
+ * SHIM_COUNTERS_OFF is 0x10C8, beyond AArch64's 12-bit add immediate range.
+ * Split it explicitly so COUNTER_INC can fold the carry into one shifted add
+ * and the low half plus the slot offset into the load/store immediate. The
+ * shim-globals.c static_assert pins both halves.
+ */
+.equ SHIM_COUNTERS_OFF_HI,   0x1000
+.equ SHIM_COUNTERS_OFF_LO12, 0xC8
+
+.equ CB_ATTN_BAIL,           0
+.equ CB_URANDOM_FD_OOR,      8
+.equ CB_URANDOM_FD_BMISS,    16
+.equ CB_URANDOM_LEN_ZERO,    24
+.equ CB_URANDOM_LEN_OVER,    32
+.equ CB_URANDOM_RING_LOW,    40
+.equ CB_URANDOM_PROBE_FAIL,  56
+.equ CB_IDENTITY_HIT,        64
+.equ CB_URANDOM_HIT,         72
+.equ CB_GETRANDOM_HIT,       80
+.equ CB_PGSID_HIT,           88
+/* Byte offset 48 (slot 6, SHIM_COUNTER_URANDOM_RING_WRAP) deliberately
+ * omitted: wrap is handled inline by urandom_copy_loop, so no assembly
+ * path bumps it. The C-side macro keeps the index for ABI stability.
+ */
+
+/* COUNTER_INC: add 1 to the diagnostic counter located at byte offset
+ * (SHIM_COUNTERS_OFF_HI + SHIM_COUNTERS_OFF_LO12 + \byte_off) from TPIDR_EL1.
+ *
+ * One shifted add carries the SHIM_COUNTERS_OFF_HI (#0x1000) high half;
+ * the ldr/str fold the LO12 + slot-byte offset into the immediate (well
+ * within the imm12*8 = 32760 byte range).
+ *
+ * Non-atomic ldr/add/str. Multi-vCPU concurrent bails may race and lose
+ * a small fraction of counts; acceptable for diagnostic ratios. Each
+ * use expands to 5 instructions: mrs + add + ldr + add + str.
+ *
+ * Both scratch registers are clobbered. svc_restore_eret's
+ * RESTORE_GPRS_KEEP_X0 reloads X1..X30 from the saved frame, so x29/x30
+ * are safe choices on any fast path.
+ */
+.macro COUNTER_INC byte_off, tmp_addr, tmp_val
+    mrs \tmp_addr, tpidr_el1
+    add \tmp_addr, \tmp_addr, #SHIM_COUNTERS_OFF_HI
+    ldr \tmp_val, [\tmp_addr, #(SHIM_COUNTERS_OFF_LO12 + \byte_off)]
+    add \tmp_val, \tmp_val, #1
+    str \tmp_val, [\tmp_addr, #(SHIM_COUNTERS_OFF_LO12 + \byte_off)]
+.endm
+
 /* BAD_VEC: vector-table entry that reports an unexpected exception.
  * Each table slot is 128 bytes; the leading .align 7 places this entry at the
  * next 128-byte boundary.
@@ -298,30 +349,63 @@ svc_handler:
     b.lo identity_class_fast          /* 172..178 -> identity / gettid */
     cmp x10, #63                       /* SYS_read? */
     b.eq urandom_read_fast
+    cmp x10, #155                      /* SYS_getpgid? */
+    b.eq getpgid_fast
+    cmp x10, #156                      /* SYS_getsid? */
+    b.eq getsid_fast
+    cmp x10, #278                      /* SYS_getrandom? */
+    b.eq getrandom_fast
     b handle_svc_0
 
 identity_class_fast:
     mrs x12, tpidr_el1               /* shim-globals base */
     ldar w13, [x12]                  /* attention flag, acquire */
-    cbnz w13, handle_svc_0           /* slow-path required */
+    cbnz w13, attn_bail              /* slow-path required */
     cmp x11, #6                       /* bias == 6 ==> gettid (178) */
     b.eq gettid_fast
     add x12, x12, #8                 /* skip attention -> identity[0] */
     ldr x0, [x12, x11, lsl #3]       /* identity[bias] for 172..177 */
+    COUNTER_INC CB_IDENTITY_HIT, x29, x30
     b svc_restore_eret
 
 gettid_fast:
     mrs x0, contextidr_el1            /* per-vCPU tid */
+    COUNTER_INC CB_IDENTITY_HIT, x29, x30
+    b svc_restore_eret
+
+/* getpgid_fast / getsid_fast: serve getpgid(0) and getsid(0) from
+ * shim-globals slots. Any non-zero pid argument or set attention bit
+ * falls through to the host so per-pid lookups and post-setpgid/setsid
+ * publish ordering remain authoritative.
+ */
+getpgid_fast:
+    ldr x14, [sp, #0]                /* saved X0 = pid arg */
+    cbnz x14, handle_svc_0           /* pid != 0: not a pure cache read */
+    mrs x12, tpidr_el1
+    ldar w13, [x12]
+    cbnz w13, attn_bail
+    ldr x0, [x12, #0x1148]            /* SHIM_IDENTITY_OFF_PGID */
+    COUNTER_INC CB_PGSID_HIT, x29, x30
     b svc_restore_eret
 
-    /* Urandom-read fast path (Slice D / P3). Serves
-     * read(urandom_fd, buf, len) with len in [1, 64] by popping
-     * len bytes from the shim-globals entropy ring (TPIDR_EL1 base +
-     * 0xC0) into the guest-supplied buffer (X1), advancing the ring
-     * head atomically. If the requested fd is not FD_URANDOM, or
-     * the ring is low, or the read would cross a ring-wrap boundary,
-     * falls through to handle_svc_0 so the host serves the read and
-     * refills the ring.
+getsid_fast:
+    ldr x14, [sp, #0]
+    cbnz x14, handle_svc_0
+    mrs x12, tpidr_el1
+    ldar w13, [x12]
+    cbnz w13, attn_bail
+    ldr x0, [x12, #0x1150]            /* SHIM_IDENTITY_OFF_SID */
+    COUNTER_INC CB_PGSID_HIT, x29, x30
+    b svc_restore_eret
+
+    /* Urandom-read fast path. Serves read(urandom_fd, buf, len) with
+     * len in [1, SHIM_URANDOM_INLINE_LIMIT (256)] by popping len bytes
+     * from the shim-globals entropy ring (TPIDR_EL1 base + 0xC0) into
+     * the guest-supplied buffer (X1), advancing the ring head
+     * atomically. The 4 KiB ring boundary is handled inline via a
+     * split-copy with a one-shot wrap flag. If the requested fd is not
+     * FD_URANDOM or the ring is below the requested fill, falls through
+     * to handle_svc_0 so the host serves the read and refills the ring.
      *
      * Layout offsets (match core/shim-globals.h SHIM_URANDOM_OFF_*):
      *   0x0038  URANDOM_FD_BITMAP   1024 bits = 128 bytes
@@ -333,15 +417,15 @@ gettid_fast:
 urandom_read_fast:
     mrs x12, tpidr_el1
     ldar w13, [x12]                  /* attention flag */
-    cbnz w13, handle_svc_0
+    cbnz w13, attn_bail
 
     ldr x14, [sp, #0]                /* saved X0 = fd */
     cmp x14, #1024                    /* FD_TABLE_SIZE */
-    b.hs handle_svc_0
+    b.hs urandom_fd_oor_bail
     ldr x15, [sp, #16]               /* saved X2 = len */
-    cbz x15, handle_svc_0            /* host handles len == 0 */
-    cmp x15, #64                      /* URANDOM_INLINE_LIMIT */
-    b.hi handle_svc_0
+    cbz x15, urandom_len_zero_bail   /* host handles len == 0 */
+    cmp x15, #256                     /* SHIM_URANDOM_INLINE_LIMIT */
+    b.hi urandom_len_over_bail
 
     /* Bitmap test: word = fd >> 6, bit = fd & 63. */
     add x16, x12, #0x38              /* SHIM_URANDOM_OFF_BITMAP */
@@ -349,7 +433,7 @@ urandom_read_fast:
     ldr x17, [x16, x17, lsl #3]
     and x18, x14, #63
     lsr x17, x17, x18
-    tbz w17, #0, handle_svc_0
+    tbz w17, #0, urandom_fd_bmiss_bail
 
     ldr x20, [sp, #8]                /* saved X1 = buf */
     /* Probe the guest buffer for stage-1 EL0-write translations before
@@ -360,25 +444,40 @@ urandom_read_fast:
      * entry). The DYNAMIC case where a sibling vCPU munmaps the buffer
      * in the window between probe and strb is caught later by the
      * EL1 data abort vector routing into handle_el1_data_abort_recover
-     * (which rolls back the ring head, releases the lock, and returns
-     * -EFAULT). Without that recovery the EL1 strb would fault into
-     * BAD_VEC and halt the VM.
+     * (which discards the reserved entropy, releases the lock, and
+     * returns -EFAULT to EL0; the ring head is not rolled back -- the
+     * already-published bytes are simply skipped on the next read).
+     * Without that recovery the EL1 strb would fault into BAD_VEC and
+     * halt the VM.
      *
-     * len is in [1, 64]. Probing the first and last byte covers every page
-     * the inline copy can touch on Linux/AArch64, whose base page size is
-     * much larger than the inline limit.
+     * len is in [1, SHIM_URANDOM_INLINE_LIMIT=256]. Probing the first
+     * and last byte covers every page the inline copy can touch: even
+     * at the smaller 4 KiB host page size a 256-byte buffer straddles
+     * at most one page boundary, so probe(buf) + probe(buf+len-1) hits
+     * both pages. The second probe is skipped when buf and buf+len-1
+     * fall in the same 4 KiB page -- the dominant case for small
+     * crypto/SSH-handshake reads. Detected via xor + mask.
      */
+    sub x16, x15, #1                 /* len - 1 */
+    adds x17, x20, x16               /* last_byte = buf + len - 1 */
+    b.cs urandom_probe_fail_bail     /* overflow */
     at s1e0w, x20
     isb
     mrs x16, par_el1
-    tbnz x16, #0, urandom_slow_no_clrex
-    sub x16, x15, #1
-    adds x16, x20, x16
-    b.cs urandom_slow_no_clrex
-    at s1e0w, x16
+    tbnz x16, #0, urandom_probe_fail_bail
+    /* Same 4 KiB page? If (buf ^ last_byte) & ~0xFFF == 0, skip the
+     * second probe. eor + tst against the page mask is two scalar ops
+     * and one fused branch; cheaper than the full AT/ISB/MRS sequence
+     * (~5-15 ns).
+     */
+    eor x18, x20, x17
+    tst x18, #~0xFFF
+    b.eq 7f
+    at s1e0w, x17
     isb
     mrs x16, par_el1
-    tbnz x16, #0, urandom_slow_no_clrex
+    tbnz x16, #0, urandom_probe_fail_bail
+7:
 
     /* Serialize host refill against the shim's reserve-then-copy window.
      * Lock word lives after the 4096-byte ring at offset 0x10C0.
@@ -387,13 +486,17 @@ urandom_read_fast:
     add x19, x19, #0xC0              /* &ring_lock */
     mov w18, #1
 urandom_lock_spin:
-    ldaxr w17, [x19]
+    /* LSE swpal: atomic exchange. w17 receives the previous lock value;
+     * w18 (1) is stored unconditionally. If the previous was 0, we
+     * acquired. If it was 1, a sibling holds it; yield and retry.
+     * Apple Silicon implements ARMv8.1 LSE atomics, so swpal is one
+     * instruction (vs the prior ldaxr/stxr exclusive sequence). Release
+     * on unlock stays as stlr wzr, [x19].
+     */
+    swpal w18, w17, [x19]
     cbnz w17, urandom_lock_busy
-    stxr w17, w18, [x19]
-    cbnz w17, urandom_lock_spin
     b urandom_locked
 urandom_lock_busy:
-    clrex
     yield
     b urandom_lock_spin
 
@@ -404,12 +507,12 @@ urandom_locked:
     ldar w24, [x22]                  /* tail (host release-store) */
     sub  w25, w24, w23                /* fill = tail - head */
     cmp  w25, w15
-    b.lo urandom_clrex_slow           /* ring too low */
+    b.lo urandom_ring_low_bail        /* ring too low */
     and  w26, w23, #(4096 - 1)        /* pos = head & (RING_SIZE - 1) */
-    add  w27, w26, w15
-    cmp  w27, #4096
-    b.hi urandom_clrex_slow           /* would wrap: let slow path serve */
-    add  w27, w23, w15                /* new head = head + len */
+    add  w27, w23, w15                /* new head = head + len (wraps via mask
+                                       * at the next read; the copy below
+                                       * splits at the 4 KiB boundary).
+                                       */
     stxr w28, w27, [x21]
     cbnz w28, 0b
 
@@ -427,7 +530,7 @@ urandom_locked:
     mrs x30, spsr_el1
     stp x29, x30, [sp, #-16]!
 
-    /* Copy bytes from ring[pos] to buf. len is in [1, 64].
+    /* Copy bytes from ring[pos] to buf. len is in [1, 256].
      * w26 holds pos in [0, 4096); writing to w26 above zero-extends
      * into x26, so a plain reg add (no extension) is correct.
      */
@@ -436,7 +539,10 @@ urandom_locked:
     cmp  x15, #1
     b.ne urandom_copy_loop
 
-    /* Common case: 1-byte read. Single byte transfer. */
+    /* Common case: 1-byte read. Single byte transfer. 1-byte reads
+     * never wrap (a single byte at pos is always within the ring),
+     * so the split-copy logic in urandom_copy_loop is skipped.
+     */
 .globl urandom_strb_1byte_start
 .globl urandom_strb_1byte_end
 urandom_strb_1byte_start:
@@ -446,29 +552,147 @@ urandom_strb_1byte_end:
     add  sp, sp, #16                 /* pop ELR/SPSR recovery slot */
     mov  x0, #1
     stlr wzr, [x19]                  /* release ring_lock */
+    /* x10 still holds the syscall nr loaded by the svc_handler
+     * dispatcher; none of the urandom/getrandom fast-path body writes
+     * x10, so it remains 63 for read and 278 for getrandom. Use it to
+     * pick the matching hit counter without burning a scratch register.
+     * RESTORE_GPRS_KEEP_X0 reloads x10 from the saved frame.
+     */
+    cmp  x10, #63
+    b.ne 1f
+    COUNTER_INC CB_URANDOM_HIT, x29, x30
+    b svc_restore_eret
+1:  COUNTER_INC CB_GETRANDOM_HIT, x29, x30
     b svc_restore_eret
 
 urandom_copy_loop:
-    /* Byte-wise copy for len in [2, 64]. Unrolling would help but
-     * the slow path is the realistic target for large reads. The
-     * loop runs at most 64 times; total cost is dwarfed by the EL0
-     * entry/exit transitions.
+    /* Bulk + tail copy for len in [2, 256] with split-at-4-KiB wrap
+     * handling.
+     *
+     * Bulk pass moves 16 bytes per ldp/stp on Apple Silicon; tail
+     * peels the remaining 0..15 bytes via tbz on the bits of the
+     * residue. The wrap split runs the same code twice with rebased
+     * dst/src/limit. Unaligned ldp/stp on normal memory is supported
+     * by M-series with minimal penalty (one extra cycle when crossing
+     * a 64-byte cache line); the small penalty is dwarfed by the win
+     * from collapsing the byte loop's ~5 inst/byte into ldp/stp at
+     * 16 bytes/cycle peak.
+     *
+     * handle_el1_data_abort_recover covers a single PC range
+     * (urandom_strb_loop_start..urandom_strb_loop_end) that spans both
+     * the bulk loop and the tail copies, so any sibling-vCPU munmap
+     * mid-copy still routes to the EFAULT recovery path.
+     */
+    mov  w17, #4096
+    sub  w17, w17, w26                /* w17 = 4096 - pos */
+    cmp  w15, w17
+    csel w17, w15, w17, ls            /* w17 = first segment length */
+    /* Wrap-done flag: 0 on the first segment, set to 1 before re-
+     * entering for the second segment. With len <= 256 and ring size
+     * 4096, at most one wrap is possible, so the flag guarantees the
+     * loop runs at most twice (vs comparing x17 to x15, which would
+     * infinitely re-wrap once the segment counter is rebased).
+     */
+    mov  w14, #0
+urandom_copy_segment:
+    /* Inputs: x16 = src cursor, x20 = dst cursor, x17 = segment bytes.
+     * ldp/stp lack a register-offset form, so we use post-incremented
+     * addressing. x16 and x20 advance through the segment; the wrap
+     * rebase below recomputes x16 (ring base) and lets x20 keep its
+     * post-increment position (exactly where segment 2 must start
+     * writing).
      */
-    mov  x29, #0
+    cmp  x17, #16
+    b.lo urandom_copy_tail_entry      /* < 16 bytes -> tail only */
+    and  x28, x17, #~15               /* bulk byte count = w17 & ~15 */
+    add  x28, x16, x28                /* bulk_end = src + bulk_count */
 .globl urandom_strb_loop_start
 .globl urandom_strb_loop_end
 urandom_strb_loop_start:
-1:  ldrb w0, [x16, x29]
-    strb w0, [x20, x29]
-    add  x29, x29, #1
-    cmp  x29, x15
-    b.ne 1b
+1:  ldp  x9, x11, [x16], #16
+    stp  x9, x11, [x20], #16
+    cmp  x16, x28
+    b.lo 1b
+urandom_copy_tail_entry:
+    and  x27, x17, #15                /* tail = segment_len % 16 */
+    cbz  x27, urandom_copy_segment_done
+    tbz  w27, #3, 2f                  /* 8-byte chunk */
+    ldr  x9, [x16], #8
+    str  x9, [x20], #8
+2:  tbz  w27, #2, 3f                  /* 4-byte chunk */
+    ldr  w9, [x16], #4
+    str  w9, [x20], #4
+3:  tbz  w27, #1, 4f                  /* 2-byte chunk */
+    ldrh w9, [x16], #2
+    strh w9, [x20], #2
+4:  tbz  w27, #0, urandom_copy_segment_done
+    ldrb w9, [x16], #1
+    strb w9, [x20], #1
 urandom_strb_loop_end:
+urandom_copy_segment_done:
+    cbnz w14, urandom_loop_done       /* second segment already ran */
+    cmp  x17, x15                     /* first_copy_len == total? */
+    b.eq urandom_loop_done            /* yes: no wrap needed */
+    /* Wrap rebase: src goes back to ring base; dst is already at
+     * (original buf + first_len) thanks to post-increment, so no
+     * rebase needed there. Segment limit becomes the remaining count.
+     */
+    mov  w14, #1
+    add  x16, x12, #0xC0              /* ring base = tpidr_el1 + 0xC0 */
+    sub  x17, x15, x17                /* loop limit = remaining */
+    b    urandom_copy_segment
+urandom_loop_done:
     add  sp, sp, #16                 /* pop ELR/SPSR recovery slot */
     mov  x0, x15
     stlr wzr, [x19]                  /* release ring_lock */
+    cmp  x10, #63
+    b.ne 1f
+    COUNTER_INC CB_URANDOM_HIT, x29, x30
+    b svc_restore_eret
+1:  COUNTER_INC CB_GETRANDOM_HIT, x29, x30
     b svc_restore_eret
 
+/* Named bail labels: branch targets for every fast-path exit that gives
+ * up. Each increments the matching diagnostic counter and routes into
+ * the right slow-path predecessor: probe failures jump into the no-
+ * clrex tail (no exclusive monitor was opened); ring_low jumps into
+ * the clrex tail (the LDXR opened a monitor and the lock is held).
+ * Ring wrap is no longer a bail reason now that urandom_copy_loop
+ * splits the byte copy at the 4 KiB boundary inline.
+ *
+ * attn_bail is shared by identity/pgsid/urandom/getrandom because the
+ * ATTN check predates any path-specific state setup; routing every
+ * attention-set fast path through the same CB_ATTN_BAIL counter and
+ * the same handle_svc_0 branch keeps the bail cluster compact.
+ */
+attn_bail:
+    COUNTER_INC CB_ATTN_BAIL, x29, x30
+    b handle_svc_0
+urandom_fd_oor_bail:
+    COUNTER_INC CB_URANDOM_FD_OOR, x29, x30
+    b handle_svc_0
+urandom_fd_bmiss_bail:
+    COUNTER_INC CB_URANDOM_FD_BMISS, x29, x30
+    b handle_svc_0
+urandom_len_zero_bail:
+    COUNTER_INC CB_URANDOM_LEN_ZERO, x29, x30
+    b handle_svc_0
+urandom_len_over_bail:
+    COUNTER_INC CB_URANDOM_LEN_OVER, x29, x30
+    b handle_svc_0
+urandom_probe_fail_bail:
+    COUNTER_INC CB_URANDOM_PROBE_FAIL, x29, x30
+    b urandom_slow_no_clrex
+urandom_ring_low_bail:
+    COUNTER_INC CB_URANDOM_RING_LOW, x29, x30
+    b urandom_clrex_slow
+    /* SHIM_COUNTER_URANDOM_RING_WRAP (C-side index 6) has no assembly
+     * binding now that urandom_copy_loop splits the byte copy at the
+     * 4 KiB ring boundary inline. The slot stays in the C header for
+     * ABI stability; a non-zero dump reading flags a regression that
+     * re-introduced a wrap bail.
+     */
+
 urandom_clrex_slow:
     /* LDXR opened an exclusive monitor that the slow path will not
      * release on its own. CLREX drops the monitor so subsequent
@@ -487,6 +711,107 @@ urandom_slow_no_clrex:
      */
     b handle_svc_0
 
+/* getrandom_fast: serve getrandom(buf, len, flags) with len in
+ * [1, SHIM_URANDOM_INLINE_LIMIT (256)] and flags in {0, GRND_NONBLOCK
+ * (0x1)} by reusing the same urandom ring as the /dev/urandom fast
+ * path. Shares the copy + ring epilogue at urandom_strb_1byte_start
+ * and urandom_copy_loop (including the 4 KiB wrap split); the success
+ * path discriminates GETRANDOM_HIT vs URANDOM_HIT off the syscall
+ * number left in x10 by the dispatcher. Any other flag bit set goes
+ * to the slow path via handle_svc_0 (no counter is bumped for that
+ * exit) so the host preserves the full Linux contract. The len, probe
+ * and ring_low bails at the end reuse the CB_URANDOM_* counter slots.
+ */
+getrandom_fast:
+    mrs x12, tpidr_el1               /* x12 = base held in TPIDR_EL1 */
+    ldar w13, [x12]                  /* word at offset 0: ATTN check */
+    cbnz w13, attn_bail
+
+    ldr x15, [sp, #8]                /* saved X1 = len */
+    cbz x15, getrandom_len_zero_bail
+    cmp x15, #256                     /* SHIM_URANDOM_INLINE_LIMIT */
+    b.hi getrandom_len_over_bail
+
+    ldr x16, [sp, #16]               /* saved X2 = flags */
+    /* Accept flags == 0 and flags == GRND_NONBLOCK (0x1); both behave
+     * identically against our arc4random-backed ring (always non-
+     * blocking, always seeded). Any other bit set -- GRND_RANDOM (0x2),
+     * GRND_INSECURE (0x4), the invalid combination of the two, or any
+     * future kernel flag -- falls through to sys_getrandom so the host
+     * preserves the full Linux contract (EINVAL on conflict, etc.).
+     */
+    cmp x16, #1
+    b.hi handle_svc_0                /* flags > 1: uncounted bail */
+
+    ldr x20, [sp, #0]                /* saved X0 = buf */
+    /* See urandom_read_fast for the probe rationale; same shape, with
+     * the single-page skip path so single-page reads pay only one AT.
+     */
+    sub x16, x15, #1                 /* len - 1 */
+    adds x17, x20, x16               /* x17 = buf + len - 1 (last byte) */
+    b.cs getrandom_probe_fail_bail   /* carry: address wrapped past 2^64 */
+    at s1e0w, x20                    /* EL0 write probe of the first byte */
+    isb
+    mrs x16, par_el1
+    tbnz x16, #0, getrandom_probe_fail_bail /* PAR_EL1.F set: faulted */
+    eor x18, x20, x17
+    tst x18, #~0xFFF
+    b.eq 7f                          /* first/last byte share a 4 KiB page */
+    at s1e0w, x17                    /* EL0 write probe of the last byte */
+    isb
+    mrs x16, par_el1
+    tbnz x16, #0, getrandom_probe_fail_bail
+7:
+
+    add x19, x12, #0x1, lsl #12
+    add x19, x19, #0xC0              /* &ring_lock */
+    mov w18, #1
+getrandom_lock_spin:
+    swpal w18, w17, [x19]            /* see urandom_lock_spin notes */
+    cbnz w17, getrandom_lock_busy    /* old value nonzero: already held */
+    b getrandom_locked
+getrandom_lock_busy:
+    yield
+    b getrandom_lock_spin
+
+getrandom_locked:
+    add x21, x12, #0xB8              /* &head (advanced by the STXR below) */
+    add x22, x12, #0xBC              /* &second cursor; presumably tail */
+0:  ldxr w23, [x21]
+    ldar w24, [x22]
+    sub  w25, w24, w23               /* w25 = [x22] - head */
+    cmp  w25, w15
+    b.lo getrandom_ring_low_bail     /* w25 < len: ring too low */
+    and  w26, w23, #(4096 - 1)       /* head offset within the 4 KiB ring */
+    add  w27, w23, w15                /* new head = head + len; wrap handled
+                                       * inline by the shared copy epilogue.
+                                       */
+    stxr w28, w27, [x21]
+    cbnz w28, 0b                     /* exclusive store failed: retry */
+
+    mrs x29, elr_el1
+    mrs x30, spsr_el1
+    stp x29, x30, [sp, #-16]!        /* push ELR_EL1 / SPSR_EL1 */
+
+    add  x16, x12, #0xC0
+    add  x16, x16, x26               /* x16 = base + 0xC0 + ring offset */
+    cmp  x15, #1
+    b.eq urandom_strb_1byte_start    /* len == 1 entry into shared code */
+    b urandom_copy_loop
+
+getrandom_len_zero_bail:             /* mirrors urandom_len_zero_bail */
+    COUNTER_INC CB_URANDOM_LEN_ZERO, x29, x30
+    b handle_svc_0
+getrandom_len_over_bail:             /* mirrors urandom_len_over_bail */
+    COUNTER_INC CB_URANDOM_LEN_OVER, x29, x30
+    b handle_svc_0
+getrandom_probe_fail_bail:           /* mirrors urandom_probe_fail_bail */
+    COUNTER_INC CB_URANDOM_PROBE_FAIL, x29, x30
+    b urandom_slow_no_clrex
+getrandom_ring_low_bail:             /* mirrors urandom_ring_low_bail */
+    COUNTER_INC CB_URANDOM_RING_LOW, x29, x30
+    b urandom_clrex_slow
+
 not_svc:
     /* EC=0x18: MSR/MRS / system instruction trap. */
     cmp x10, #0x18
diff --git a/src/main.c b/src/main.c
index 30eb23d..4b1963b 100644
--- a/src/main.c
+++ b/src/main.c
@@ -30,6 +30,7 @@
 #include "core/bootstrap.h"
 #include "core/guest.h"
 #include "core/rosetta.h"
+#include "core/shim-globals.h"
 #include "core/sysroot.h"
 
 #include "runtime/forkipc.h"
@@ -506,6 +507,14 @@ int main(int argc, char **argv)
 
     /* Tear down debugger state before freeing guest/vCPU resources. */
     gdb_stub_shutdown();
+
+    /* Diagnostic counter dump runs before guest_destroy so the shim_data
+     * mapping is still valid; it is skipped when the guest never came up.
+     * ELFUSE_SHIM_STATS is the gate; an unset variable produces no output.
+     */
+    if (guest_initialized && shim_globals_stats_enabled())
+        shim_globals_counters_dump(&g);
+
     cleanup_main_resources(&g, guest_initialized, &sysroot_mount,
                            have_host_cwd ? host_cwd : NULL, guest_argv,
                            guest_argc, elf_path, sysroot_path);
diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c
index 3f8c4a5..a3bb2a5 100644
--- a/src/runtime/forkipc.c
+++ b/src/runtime/forkipc.c
@@ -359,6 +359,13 @@ int fork_child_main(int ipc_fd,
     shim_globals_set_trace_enabled(&g, verbose);
     shim_globals_publish_pid(&g, hdr.child_pid, hdr.parent_pid);
     shim_globals_publish_creds(&g, hdr.uid, hdr.euid, hdr.gid, hdr.egid);
+    /* proc_set_session above committed hdr.pgid/sid into proc-identity;
+     * mirror into the shim cache so the child's getpgid(0)/getsid(0)
+     * fast paths see the inherited session state from the first syscall.
+     * Publish via proc-identity to keep parity with the syscall-time
+     * session_lock ordering even though no sibling vCPU exists at this point.
+     */
+    proc_publish_pgsid_snapshot(&g);
     /* Fresh entropy for the child. Linux's vDSO getrandom epoch-bumps
      * across fork; here we just re-fill the ring from arc4random_buf
      * which seeds from the host kernel's RNG, so parent and child do
diff --git a/src/syscall/exec.c b/src/syscall/exec.c
index 6d8ca2e..3b83053 100644
--- a/src/syscall/exec.c
+++ b/src/syscall/exec.c
@@ -88,6 +88,7 @@ static void exec_republish_shim_globals_or_die(hv_vcpu_t vcpu,
     shim_globals_publish_pid(g, proc_get_pid(), proc_get_ppid());
     shim_globals_publish_creds(g, proc_get_uid(), proc_get_euid(),
                                proc_get_gid(), proc_get_egid());
+    proc_publish_pgsid_snapshot(g);
     shim_globals_rebuild_urandom_bitmap();
     shim_globals_refill_urandom_ring(g);
     shim_globals_recompute_attention(g);
diff --git a/src/syscall/proc-identity.c b/src/syscall/proc-identity.c
index 0c8f05f..5a157cd 100644
--- a/src/syscall/proc-identity.c
+++ b/src/syscall/proc-identity.c
@@ -9,6 +9,7 @@
 #include <pthread.h>
 
 #include "syscall/abi.h"
+#include "core/shim-globals.h"
 #include "syscall/proc-identity.h"
 #include "syscall/proc.h"
 
@@ -210,6 +211,18 @@ int64_t proc_get_pgid(void)
     return guest_pgid;
 }
 
+static void proc_publish_pgsid_locked(guest_t *g) /* needs session_lock */
+{
+    shim_globals_publish_pgsid(g, guest_pgid, guest_sid);
+}
+
+void proc_publish_pgsid_snapshot(guest_t *g)
+{
+    pthread_mutex_lock(&session_lock); /* serialize with setsid/setpgid */
+    proc_publish_pgsid_locked(g);
+    pthread_mutex_unlock(&session_lock);
+}
+
 int64_t proc_get_fg_pgrp(void)
 {
     return guest_fg_pgrp;
@@ -232,7 +245,7 @@ void proc_set_ctty(int has_ctty)
     guest_has_ctty = has_ctty;
 }
 
-int64_t proc_sys_setsid(void)
+int64_t proc_sys_setsid(guest_t *g)
 {
     int64_t pid = guest_pid;
 
@@ -246,11 +259,12 @@ int64_t proc_sys_setsid(void)
     guest_pgid = pid;
     guest_fg_pgrp = pid;
     guest_has_ctty = 0;
+    proc_publish_pgsid_locked(g);
     pthread_mutex_unlock(&session_lock);
     return pid;
 }
 
-int64_t proc_sys_setpgid(int64_t pid, int64_t pgid)
+int64_t proc_sys_setpgid(guest_t *g, int64_t pid, int64_t pgid)
 {
     int64_t self = guest_pid;
 
@@ -268,6 +282,7 @@ int64_t proc_sys_setpgid(int64_t pid, int64_t pgid)
     }
 
     guest_pgid = pgid;
+    proc_publish_pgsid_locked(g);
     pthread_mutex_unlock(&session_lock);
     return 0;
 }
diff --git a/src/syscall/proc.h b/src/syscall/proc.h
index c3f282e..b6c6f52 100644
--- a/src/syscall/proc.h
+++ b/src/syscall/proc.h
@@ -126,22 +126,31 @@ const void *proc_get_auxv(size_t *len_out);
 /* Set guest identity (called from fork_child_main). */
 void proc_set_identity(int64_t pid, int64_t ppid);
 
-/* Session / process-group state.
- * Accessors are lock-free (_Atomic); writers are single-threaded
- * (called from startup, fork child init, or setsid/setpgid).
+/* Session / process-group state. Accessors are lock-free (_Atomic); syscall
+ * writers serialize with session_lock.
  */
 int64_t proc_get_sid(void);
 int64_t proc_get_pgid(void);
 int64_t proc_get_fg_pgrp(void);
 
+/* Publish the current pgid/sid pair into the shim cache while holding
+ * session_lock. Use this at cache initialization points so an external
+ * snapshot cannot overwrite a newer setpgid/setsid publish.
+ */
+void proc_publish_pgsid_snapshot(guest_t *g);
+
 /* Restore session/pgid from fork IPC. */
 void proc_set_session(int64_t sid, int64_t pgid);
 
-/* setsid: create new session. Returns SID or -LINUX_EPERM. */
-int64_t proc_sys_setsid(void);
+/* setsid: create new session and publish pgid/sid cache under session_lock.
+ * Returns SID or -LINUX_EPERM.
+ */
+int64_t proc_sys_setsid(guest_t *g);
 
-/* setpgid: set process group. Returns 0 or negative errno. */
-int64_t proc_sys_setpgid(int64_t pid, int64_t pgid);
+/* setpgid: set process group and publish pgid/sid cache under session_lock.
+ * Returns 0 or negative errno.
+ */
+int64_t proc_sys_setpgid(guest_t *g, int64_t pid, int64_t pgid);
 
 /* getsid: query session ID. Returns SID or -LINUX_ESRCH. */
 int64_t proc_sys_getsid(int64_t pid);
diff --git a/src/syscall/sys.c b/src/syscall/sys.c
index 1284090..8a82c36 100644
--- a/src/syscall/sys.c
+++ b/src/syscall/sys.c
@@ -23,6 +23,7 @@
 
 #include "utils.h"
 
+#include "core/shim-globals.h"
 #include "syscall/abi.h"
 #include "syscall/internal.h"
 #include "syscall/proc.h"
@@ -221,6 +222,7 @@ int64_t sys_getrandom(guest_t *g,
         offset += chunk;
     }
 
+    shim_globals_refill_urandom_ring(g);
     return (int64_t) buflen;
 }
 
diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c
index be97787..6ca1705 100644
--- a/src/syscall/syscall.c
+++ b/src/syscall/syscall.c
@@ -369,7 +369,7 @@ SC_FORWARD(sc_exit,    SC_EXIT_SENTINEL | ((int) x0 & 0xFF))
 SC_FORWARD(sc_getpid,  proc_get_pid())
 SC_FORWARD(sc_getppid, proc_get_ppid())
 SC_FORWARD(sc_getpgid, ((int) x0 == 0 || (int) x0 == (int) proc_get_pid()) ? proc_get_pgid() : -LINUX_ESRCH)
-SC_FORWARD(sc_setsid,  proc_sys_setsid())
+SC_FORWARD(sc_setsid,  proc_sys_setsid(g))
 SC_FORWARD(sc_getsid,  proc_sys_getsid((int64_t) x0))
 SC_FORWARD(sc_gettid,  current_thread ? current_thread->guest_tid : proc_get_pid())
 
@@ -540,7 +540,7 @@ SC_FORWARD(sc_setreuid,  CRED_BRACKETED(g, proc_sys_setreuid((uint32_t) x0, (uin
 SC_FORWARD(sc_setregid,  CRED_BRACKETED(g, proc_sys_setregid((uint32_t) x0, (uint32_t) x1)))
 SC_FORWARD(sc_setresuid, CRED_BRACKETED(g, proc_sys_setresuid((uint32_t) x0, (uint32_t) x1, (uint32_t) x2)))
 SC_FORWARD(sc_setresgid, CRED_BRACKETED(g, proc_sys_setresgid((uint32_t) x0, (uint32_t) x1, (uint32_t) x2)))
-SC_FORWARD(sc_setpgid, proc_sys_setpgid((int64_t) x0, (int64_t) x1))
+SC_FORWARD(sc_setpgid, proc_sys_setpgid(g, (int64_t) x0, (int64_t) x1))
 SC_STUB(sc_fadvise64,           0)
 SC_STUB(sc_sched_yield,         (sched_yield(), 0))
 SC_STUB(sc_mlock,               0)
diff --git a/tests/manifest.txt b/tests/manifest.txt
index 19b1b27..475f68a 100644
--- a/tests/manifest.txt
+++ b/tests/manifest.txt
@@ -52,6 +52,7 @@ test-shim-verbose-trace
 test-shim-data-el1
 test-shim-urandom-smp
 test-shim-urandom-toctou
+test-shim-urandom-wrap
 test-poll                      # diff=skip
 
 [section] I/O subsystem tests
diff --git a/tests/test-shim-urandom-wrap.c b/tests/test-shim-urandom-wrap.c
new file mode 100644
index 0000000..715b5b8
--- /dev/null
+++ b/tests/test-shim-urandom-wrap.c
@@ -0,0 +1,87 @@
+/* test-shim-urandom-wrap.c -- regression for wrapped shim urandom copies.
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The EL1 getrandom fast path copies out of a 4096-byte ring. When a read
+ * starts at ring[4095], the copy splits into a one-byte tail segment plus a
+ * wrapped second segment. A missing post-increment on the first segment's
+ * byte store used to make the second segment overwrite byte 0 of the caller
+ * buffer and leave the final requested byte untouched while still returning
+ * success.
+ */
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#define RING_SIZE 4096
+#define INLINE_LIMIT 256
+#define SLOW_LEN (INLINE_LIMIT + 1)
+#define ITERATIONS 8
+#define SENTINEL 0xA5
+
+/* Read exactly len bytes with getrandom(flags=0); 0 on success, else -1. */
+static int getrandom_exact(void *buf, size_t len)
+{
+    errno = 0; /* so a short read does not report a stale errno */
+    long r = syscall(SYS_getrandom, buf, len, 0);
+    if (r == (long) len)
+        return 0;
+    fprintf(stderr, "getrandom(%zu) returned %ld errno=%d\n", len, r, errno);
+    return -1;
+}
+
+/* A request above INLINE_LIMIT goes to the host path, which refills. */
+static int refill_ring(void)
+{
+    return getrandom_exact((unsigned char[SLOW_LEN]){0}, SLOW_LEN);
+}
+
+/* Draw single bytes until *pos (mod RING_SIZE) equals target; 0 or -1. */
+static int advance_fast_bytes(unsigned *pos, unsigned target)
+{
+    unsigned char sink;
+
+    for (; *pos != target; *pos = (*pos + 1) % RING_SIZE) {
+        if (getrandom_exact(&sink, 1) != 0)
+            return -1;
+    }
+    return 0;
+}
+
+int main(void)
+{
+    unsigned head = 0; /* tracked ring offset (assumed 0 at start) */
+    int stale = 0;     /* rounds whose last byte still held SENTINEL */
+
+    for (int round = 0; round < ITERATIONS; round++) {
+        unsigned char out[INLINE_LIMIT];
+
+        /* Park the tracked head on the ring's last byte, then refill. */
+        if (refill_ring() != 0 ||
+            advance_fast_bytes(&head, RING_SIZE - 1) != 0 ||
+            refill_ring() != 0)
+            return 1;
+
+        memset(out, SENTINEL, sizeof(out));
+        if (getrandom_exact(out, sizeof(out)) != 0)
+            return 1;
+        head = (head + INLINE_LIMIT) % RING_SIZE;
+        stale += (out[INLINE_LIMIT - 1] == SENTINEL);
+    }
+
+    /* A random byte equals SENTINEL 1 time in 256, so only fail when
+     * every round left the final byte untouched.
+     */
+    if (stale < ITERATIONS) {
+        printf("OK: wrapped getrandom wrote through the caller buffer\n");
+        return 0;
+    }
+    fprintf(stderr,
+            "FAIL: wrapped getrandom left the final byte untouched\n");
+    return 1;
+}