diff --git a/Makefile b/Makefile
index 2f69e94..2cafed0 100644
--- a/Makefile
+++ b/Makefile
@@ -131,6 +131,16 @@ $(BUILD_DIR)/test-multi-vcpu: $(BUILD_DIR)/test-multi-vcpu.o | $(BUILD_DIR)
 $(BUILD_DIR)/test-rwx: $(BUILD_DIR)/test-rwx.o | $(BUILD_DIR)
 	$(call link-and-sign,$@,$<)
 
+## Build the TLBI RVAE1IS operand encoder unit test (native macOS binary).
+# Pure C, so it is linked with plain $(CC) rather than link-and-sign (no HVF
+# entitlement needed). Pins the architectural bit-layout of
+# tlbi_rvae1is_operand so a regression that drops TG=01 (which the Apple
+# Silicon integration tests would silently tolerate) fails `make check`.
+$(BUILD_DIR)/test-tlbi-encoder-host: $(BUILD_DIR)/test-tlbi-encoder-host.o \
+		| $(BUILD_DIR)
+	@echo "  LD      $@"
+	$(Q)$(CC) $(CFLAGS) -o $@ $^
+
 ## Build the proctitle argv-tail regression test (native macOS binary)
 # Links against the project-built proctitle.o so the exact in-tree code is
 # exercised; no HVF entitlement is needed because the test only manipulates
@@ -167,6 +177,12 @@ $(BUILD_DIR)/test-shim-cred-race: tests/test-shim-cred-race.c | $(BUILD_DIR)
 	@echo "  CROSS   $< (with -lpthread)"
 	$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread
 
+# test-mprotect-mt (static cross build, linked with -lpthread) stresses
+# multi-vCPU mprotect under concurrent readers to surface stale-TLB bugs.
+$(BUILD_DIR)/test-mprotect-mt: tests/test-mprotect-mt.c | $(BUILD_DIR)
+	@echo "  CROSS   $< (with -lpthread)"
+	$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread
+
 # test-shim-urandom-smp spawns N pthreads racing on a shared FD_URANDOM
 # slot to exercise the shim's LDXR/STXR head-advance under contention.
 $(BUILD_DIR)/test-shim-urandom-smp: tests/test-shim-urandom-smp.c | $(BUILD_DIR)
diff --git a/mk/config.mk b/mk/config.mk
index 232da91..7270e28 100644
--- a/mk/config.mk
+++ b/mk/config.mk
@@ -15,7 +15,8 @@ ifeq ($(origin GUEST_TEST_BINARIES), undefined)
 endif
 
 # Exclude native macOS test files from cross-compilation
-NATIVE_TESTS := tests/test-multi-vcpu.c tests/test-rwx.c
+NATIVE_TESTS := tests/test-multi-vcpu.c tests/test-rwx.c \
+                tests/test-tlbi-encoder-host.c
 SPECIAL_TEST_SRCS := tests/test-lowbase-mem.c
 SPECIAL_TEST_BINS := $(BUILD_DIR)/test-lowbase-mem-200000 $(BUILD_DIR)/test-lowbase-mem-300000
 
diff --git a/mk/tests.mk b/mk/tests.mk
index 03947be..fd412ff 100644
--- a/mk/tests.mk
+++ b/mk/tests.mk
@@ -35,8 +35,11 @@ define RUN_OPTIONAL_SKIP77
 endef
 
 ## Run the unit test suite plus busybox applet validation
-check: $(ELFUSE_BIN) $(TEST_DEPS) check-syscall-coverage
+check: $(ELFUSE_BIN) $(TEST_DEPS) check-syscall-coverage \
+		$(BUILD_DIR)/test-tlbi-encoder-host
 	@bash tests/driver.sh -e $(ELFUSE_BIN) -d $(TEST_DIR) -v
+	@printf "\n$(BLUE)━━━ TLBI RVAE1IS encoder unit test ━━━$(RESET)\n"
+	@$(BUILD_DIR)/test-tlbi-encoder-host
 	@printf "\n$(BLUE)━━━ proctitle argv-tail regression ━━━$(RESET)\n"
 	@$(MAKE) --no-print-directory test-proctitle-host
 	@printf "\n$(BLUE)━━━ proctitle low-stack regression ━━━$(RESET)\n"
diff --git a/src/core/guest.c b/src/core/guest.c
index fa2a8a6..bcbc3c2 100644
--- a/src/core/guest.c
+++ b/src/core/guest.c
@@ -35,6 +35,7 @@
 #include <string.h>
 #include <pthread.h>
 #include <sys/mman.h>
+#include <sys/sysctl.h>
 #include <unistd.h>
 
 #include "core/guest.h"
@@ -48,6 +49,13 @@
  */
 _Thread_local tlbi_request_t cpu_tlbi_req;
 
+/* FEAT_TLBIRANGE host capability flag. Set once at bootstrap by
+ * tlbi_range_probe_run and treated as read-only thereafter. Apple Silicon
+ * M1+ implements ARMv8.5-A which mandates FEAT_TLBIRANGE; the probe stays
+ * conservative and defaults to false until the flag is explicitly set so
+ * future ports to non-Apple aarch64 hosts inherit the safe fallback. */
+bool g_tlbi_range_supported = false;
+
 static void guest_region_clear(guest_t *g);
 
 /* Page table descriptor bits. */
@@ -202,10 +210,51 @@ static uint64_t *pt_at(const guest_t *g, uint64_t gpa)
 
 /* Public API */
 
+/* FEAT_TLBIRANGE probe -- runs exactly once via pthread_once. ARMv8.4
+ * introduced TLBI RVAE1IS for single-shot range invalidation and made it
+ * mandatory from then on. macOS does not surface a sysctl entry for
+ * FEAT_TLBIRANGE directly, so use FEAT_LSE2 as a proxy -- both became
+ * mandatory in ARMv8.4 and Apple ships them together across the entire
+ * M-series. A future non-Apple aarch64 host or an older ARM PE without
+ * FEAT_TLBIRANGE would otherwise trap the shim's `tlbi rvae1is, x9` to
+ * BAD_VEC; the proxy probe keeps the accumulator on the per-page VAE1IS /
+ * VMALLE1IS path in that case.
+ *
+ * Width-tolerant read: macOS currently exposes the boolean as a 4-byte int,
+ * but a future kernel could widen it to uint64_t. Read into a 64-bit slot
+ * and accept any non-zero answer for any length sysctl actually returned.
+ *
+ * ELFUSE_DISABLE_TLBI_RANGE=1 forces the broadcast fallback so the
+ * VAE1IS-only / VMALLE1IS path stays exercisable in CI on Apple Silicon --
+ * otherwise the fallback is unreachable on any host where the sysctl probe
+ * succeeds.
+ *
+ * pthread_once gates the probe so a re-bootstrap path (sys_execve, fork
+ * IPC restore) cannot race a live vCPU reading the flag. The first
+ * guest_init wins and the result is immutable for the process lifetime. */
+static pthread_once_t tlbi_range_probe_once = PTHREAD_ONCE_INIT;
+
+static void tlbi_range_probe_run(void)
+{
+    /* Zeroed 64-bit slot: tolerates a 4-byte or an 8-byte sysctl value. */
+    uint64_t feat_raw = 0;
+    size_t feat_len = sizeof(feat_raw);
+    const char *opt_out = getenv("ELFUSE_DISABLE_TLBI_RANGE");
+    bool forced_off = opt_out && opt_out[0] != '\0' && opt_out[0] != '0';
+    /* The override wins (sysctl is not even queried); otherwise the
+     * FEAT_LSE2 proxy must be readable and non-zero. */
+    g_tlbi_range_supported =
+        !forced_off &&
+        sysctlbyname("hw.optional.arm.FEAT_LSE2", &feat_raw, &feat_len, NULL,
+                     0) == 0 &&
+        feat_raw != 0;
+}
+
 int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
 {
     uint64_t t0;
 
+    pthread_once(&tlbi_range_probe_once, tlbi_range_probe_run);
+
     memset(g, 0, sizeof(*g));
     g->shm_fd = -1;
     g->ipa_base = GUEST_IPA_BASE;
@@ -929,6 +978,10 @@ int guest_map_va_range(guest_t *g,
         return -1;
 
     uint64_t cur_gpa = gpa_start;
+    uint64_t changed_lo = UINT64_MAX, changed_hi = 0;
+    bool bcast = tlbi_request_is_broadcast();
+    if (perms & MEM_PERM_X)
+        tlbi_request_mark_icache();
     for (uint64_t va = va_start; va < va_end;
          va += BLOCK_2MIB, cur_gpa += BLOCK_2MIB) {
         unsigned l0_idx = (unsigned) (va / (512ULL * BLOCK_1GIB));
@@ -976,12 +1029,22 @@ int guest_map_va_range(guest_t *g,
             continue;
         }
         l2[l2_idx] = make_block_desc(cur_gpa, perms);
+        if (!bcast) {
+            if (va < changed_lo)
+                changed_lo = va;
+            if (va + BLOCK_2MIB > changed_hi)
+                changed_hi = va + BLOCK_2MIB;
+        }
     }
 
     /* The new entries are visible to the host immediately; the shim flushes
-     * the matching TLBs on syscall return via the per-vCPU accumulator.
+     * the matching TLBs on syscall return via the per-vCPU accumulator. Skip
+     * the request when every block was already mapped (no negative TLB
+     * entries can apply since the prior install already invalidated them),
+     * or when the accumulator already promised a broadcast.
      */
-    tlbi_request_range(va_start, va_end);
+    if (!bcast && changed_hi > changed_lo)
+        tlbi_request_range(changed_lo, changed_hi);
     guest_pt_gen_bump(g);
     return 0;
 }
@@ -2438,6 +2501,13 @@ int guest_extend_page_tables(guest_t *g,
             (unsigned long long) start, (unsigned long long) end);
         return -1;
     }
+    /* Defensive: end is bounded by guest_size above, so the ALIGN_2MIB_UP
+     * below cannot wrap on any reachable input. The explicit guard documents
+     * the contract and matches the wrap guards in guest_invalidate_ptes /
+     * guest_update_perms; keeps the three sites in sync if a future caller
+     * lifts the guest_size cap. */
+    if (end > UINT64_MAX - (BLOCK_2MIB - 1))
+        return -1;
 
     uint64_t base = g->ipa_base;
 
@@ -2445,8 +2515,18 @@ int guest_extend_page_tables(guest_t *g,
     uint64_t l0_gpa_off = g->ttbr0 - base;
     uint64_t *l0 = pt_at(g, l0_gpa_off);
 
-    /* Walk 2MiB blocks in [start, end) */
+    /* Walk 2MiB blocks in [start, end). Track the smallest sub-range whose
+     * L2 entry actually transitioned from unmapped to mapped; blocks that
+     * were already valid get no new descriptor and need no TLBI
+     * (false-positive elimination mirrors guest_update_perms). Once the
+     * accumulator is already TLBI_BROADCAST, the bookkeeping is wasted
+     * work.
+     */
+    if (perms & MEM_PERM_X)
+        tlbi_request_mark_icache();
     uint64_t addr_start = ALIGN_2MIB_DOWN(start), addr_end = ALIGN_2MIB_UP(end);
+    uint64_t changed_lo = UINT64_MAX, changed_hi = 0;
+    bool bcast = tlbi_request_is_broadcast();
 
     for (uint64_t addr = addr_start; addr < addr_end; addr += BLOCK_2MIB) {
         uint64_t ipa = base + addr;
@@ -2492,18 +2572,33 @@ int guest_extend_page_tables(guest_t *g,
 
         unsigned l2_idx = (unsigned) ((ipa % BLOCK_1GIB) / BLOCK_2MIB);
 
-        /* Only map if not already mapped */
-        if (!(l2[l2_idx] & PT_BLOCK)) {
-            l2[l2_idx] = make_block_desc(ipa, perms);
+        /* Only map if not already mapped. A negative TLB entry from a prior
+         * translation fault is possible only for VAs that were unmapped at
+         * the time of the fault, so the TLBI is only needed for blocks
+         * actually installed by this call.
+         */
+        /* At L2 a valid descriptor is either a 2 MiB block (bits[1:0] = 01)
+         * or a table descriptor pointing to an L3 page table (bits[1:0] = 11).
+         * Both indicate the slot is already mapped at some granule, so the
+         * extend has nothing to install; skip without flushing. The previous
+         * `& PT_BLOCK` test relied on PT_BLOCK == PT_VALID == bit 0 to cover
+         * both cases by coincidence -- write it as an explicit PT_VALID test
+         * so the intent survives a future descriptor-bit renumbering.
+         */
+        if (l2[l2_idx] & PT_VALID)
+            continue;
+        l2[l2_idx] = make_block_desc(ipa, perms);
+        if (!bcast) {
+            if (addr < changed_lo)
+                changed_lo = addr;
+            if (addr + BLOCK_2MIB > changed_hi)
+                changed_hi = addr + BLOCK_2MIB;
         }
     }
 
-    /* Use the page-aligned bounds the loop actually covered. Extend grows
-     * the mapped range; existing VAs may carry negative TLB entries from
-     * prior translation faults at this address, so a flush is still needed.
-     * Large extends will exceed the selective cap and become broadcast.
-     */
-    tlbi_request_range(addr_start + base, addr_end + base);
+    /* Large extends will exceed the selective cap and become broadcast. */
+    if (!bcast && changed_hi > changed_lo)
+        tlbi_request_range(base + changed_lo, base + changed_hi);
     guest_pt_gen_bump(g);
     return 0;
 }
@@ -2632,8 +2727,23 @@ static uint64_t *find_l2_entry(guest_t *g, uint64_t va)
 }
 
 /* Split a 2MiB L2 block descriptor into 512 x 4KiB L3 page descriptors.
- * The caller provides the L2 entry via find_l2_entry.
- * Extracts the output IPA from the existing descriptor.
+ * The caller provides the L2 entry via find_l2_entry. Extracts the output
+ * IPA from the existing descriptor.
+ *
+ * No TLBI is issued by the split itself. The block-to-table transition
+ * preserves the output address, permissions, and attributes of every page
+ * in the 2 MiB range, so any cached translation from the old block
+ * descriptor remains semantically correct. Per ARM ARM (FEAT_BBM Level 2),
+ * a CPU that implements level-2 break-before-make support allows
+ * block <-> table changes that preserve the resulting translation in all
+ * other respects without a BBM sequence. Apple Silicon implements
+ * FEAT_BBM Level 2 across M1+; the split-heavy stress paths in tests/
+ * (test-stress mprotect cycling, test-shim-urandom-toctou rapid flips,
+ * test-mprotect-mt R<->RW toggling, plus dynamic-linker RELRO setup)
+ * run cleanly. A future PE without FEAT_BBM Level 2 would need either
+ * a real BBM sequence here (invalidate, TLBI, write table) or an
+ * unconditional broadcast TLBI on every split; revisit if that ever
+ * surfaces a TLB conflict abort.
  */
 static int split_l2_block(guest_t *g, uint64_t *l2_entry)
 {
@@ -2681,9 +2791,17 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end)
 {
     uint64_t base = g->ipa_base;
 
-    /* Page-align the range */
+    /* Page-align the range. The ALIGN_UP step on end could wrap to 0 for
+     * inputs within PAGE_SIZE-1 of UINT64_MAX, silently turning the
+     * invalidation into a no-op against a 0-length loop. Reject the
+     * pathological input rather than allow a stale mapping to survive.
+     */
+    if (end > UINT64_MAX - (PAGE_SIZE - 1))
+        return -1;
     start = start & ~(PAGE_SIZE - 1);
     end = (end + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+    if (end <= start)
+        return 0;
 
     for (uint64_t addr = start; addr < end;) {
         uint64_t *l2_entry = find_l2_entry(g, addr);
@@ -2723,20 +2841,36 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end)
                 return -1;
         }
 
-        /* L3 table: invalidate individual 4KiB page descriptors */
+        /* L3 table: invalidate individual 4KiB page descriptors. Track the
+         * smallest sub-range whose descriptor actually transitioned from
+         * mapped to invalid; a page that was already 0 needs no TLBI
+         * (false-positive elimination mirrors the guest_update_perms path).
+         * Skip the per-page bookkeeping once a broadcast is already pending.
+         */
         uint64_t l3_ipa = *l2_entry & 0xFFFFFFFFF000ULL;
         uint64_t *l3 = pt_at(g, l3_ipa - base);
 
         uint64_t page_start = (addr > block_start) ? addr : block_start;
         uint64_t page_end = (end < block_end) ? end : block_end;
+        uint64_t changed_lo = UINT64_MAX, changed_hi = 0;
+        bool bcast = tlbi_request_is_broadcast();
 
         for (uint64_t pa = page_start; pa < page_end; pa += PAGE_SIZE) {
             unsigned l3_idx =
                 (unsigned) (((base + pa) % BLOCK_2MIB) / PAGE_SIZE);
-            l3[l3_idx] = 0; /* Invalid descriptor */
+            if (l3[l3_idx] != 0) {
+                l3[l3_idx] = 0; /* Invalid descriptor */
+                if (!bcast) {
+                    if (pa < changed_lo)
+                        changed_lo = pa;
+                    if (pa + PAGE_SIZE > changed_hi)
+                        changed_hi = pa + PAGE_SIZE;
+                }
+            }
         }
 
-        tlbi_request_range(base + page_start, base + page_end);
+        if (!bcast && changed_hi > changed_lo)
+            tlbi_request_range(base + changed_lo, base + changed_hi);
         addr = page_end;
     }
 
@@ -2748,9 +2882,23 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms)
 {
     uint64_t base = g->ipa_base;
 
-    /* Page-align the range */
+    /* Page-align the range. The ALIGN_UP on end could wrap to 0 for inputs
+     * within PAGE_SIZE-1 of UINT64_MAX, silently degrading the call to a
+     * no-op against a 0-length loop. Reject the pathological input rather
+     * than leave stale perms in place.
+     */
+    if (end > UINT64_MAX - (PAGE_SIZE - 1))
+        return -1;
     start = start & ~(PAGE_SIZE - 1);
     end = (end + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+    if (end <= start)
+        return 0;
+
+    /* New perms include exec: the shim must IC IALLU on syscall return so a
+     * VA that previously held NX content fetches the new instructions. The
+     * inverse (removing exec) leaves no new code visible. */
+    if (perms & MEM_PERM_X)
+        tlbi_request_mark_icache();
 
     /* Aliasing-proof invariant: TTBR1 maps the kbuf RW + UXN + PXN. The same
      * physical pages will be dual-mapped at KBUF_USER_VA under TTBR0 by the
@@ -2825,10 +2973,15 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms)
         /* Update pages within this 2MiB block that fall in [start, end). Track
          * the smallest sub-range that actually changed so the TLBI request only
          * covers descriptors whose value changed (false-positive elimination).
+         * Once the accumulator has already promoted to TLBI_BROADCAST, the
+         * bounding-box bookkeeping is wasted work -- the broadcast invalidates
+         * everything regardless -- so the loop skips the compares in that
+         * mode while still writing every changed descriptor.
          */
         uint64_t page_start = (addr > block_start) ? addr : block_start;
         uint64_t page_end = (end < block_end) ? end : block_end;
         uint64_t changed_lo = UINT64_MAX, changed_hi = 0;
+        bool bcast = tlbi_request_is_broadcast();
 
         for (uint64_t pa = page_start; pa < page_end; pa += PAGE_SIZE) {
             unsigned l3_idx =
@@ -2858,14 +3011,16 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms)
             uint64_t new_desc = make_page_desc(page_ipa, perms);
             if (l3[l3_idx] != new_desc) {
                 l3[l3_idx] = new_desc;
-                if (pa < changed_lo)
-                    changed_lo = pa;
-                if (pa + PAGE_SIZE > changed_hi)
-                    changed_hi = pa + PAGE_SIZE;
+                if (!bcast) {
+                    if (pa < changed_lo)
+                        changed_lo = pa;
+                    if (pa + PAGE_SIZE > changed_hi)
+                        changed_hi = pa + PAGE_SIZE;
+                }
             }
         }
 
-        if (changed_hi > changed_lo)
+        if (!bcast && changed_hi > changed_lo)
             tlbi_request_range(base + changed_lo, base + changed_hi);
         addr = page_end;
     }
@@ -2908,13 +3063,19 @@ int guest_install_va_pages(guest_t *g,
 
     uint64_t base = g->ipa_base;
     uint64_t end = va + length;
+    uint64_t changed_lo = UINT64_MAX, changed_hi = 0;
+    bool bcast = tlbi_request_is_broadcast();
+    if (perms & MEM_PERM_X)
+        tlbi_request_mark_icache();
 
     /* Walk one 4 KiB page at a time. find_l2_entry locates the L2 slot for
      * each VA; split_l2_block converts an L2 block descriptor into a table
      * lazily so individual L3 entries can be written. The L3 entry is then
      * unconditionally overwritten with the requested gpa + perms, so a prior
      * invalidation (or a fresh split inheriting the wrong block address)
-     * cannot leave behind a stale or zero descriptor.
+     * cannot leave behind a stale or zero descriptor. Pages whose descriptor
+     * is already identical are no-ops for TLBI purposes; skip them. Skip the
+     * per-page bookkeeping once a broadcast is already pending.
      */
     for (uint64_t v = va, p = gpa; v < end; v += PAGE_SIZE, p += PAGE_SIZE) {
         uint64_t *l2_entry = find_l2_entry(g, v);
@@ -2931,10 +3092,20 @@ int guest_install_va_pages(guest_t *g,
         if (!l3)
             return -1;
         unsigned l3_idx = (unsigned) (((base + v) % BLOCK_2MIB) / PAGE_SIZE);
-        l3[l3_idx] = make_page_desc(base + p, perms);
+        uint64_t new_desc = make_page_desc(base + p, perms);
+        if (l3[l3_idx] != new_desc) {
+            l3[l3_idx] = new_desc;
+            if (!bcast) {
+                if (v < changed_lo)
+                    changed_lo = v;
+                if (v + PAGE_SIZE > changed_hi)
+                    changed_hi = v + PAGE_SIZE;
+            }
+        }
     }
 
-    tlbi_request_range(va, end);
+    if (!bcast && changed_hi > changed_lo)
+        tlbi_request_range(changed_lo, changed_hi);
     guest_pt_gen_bump(g);
     return 0;
 }
diff --git a/src/core/guest.h b/src/core/guest.h
index 11d05bf..9e40ae3 100644
--- a/src/core/guest.h
+++ b/src/core/guest.h
@@ -245,21 +245,86 @@ typedef enum {
     TLBI_NONE = 0,
     TLBI_BROADCAST = 1,
     TLBI_RANGE = 2,
+    TLBI_RANGE_LARGE = 3, /* FEAT_TLBIRANGE single-shot TLBI RVAE1IS for
+                           * ranges that exceed TLBI_SELECTIVE_MAX_PAGES but
+                           * stay within TLBI_RVAE_MAX_PAGES; encoded as
+                           * X8 = 4 on the wire. */
 } tlbi_kind_t;
 
-/* Cap selective TLBI at this many 4 KiB pages. Beyond this, fall back to
- * TLBI_BROADCAST: each TLBI VAE1IS broadcasts to all cores, so for large
- * ranges the per-instruction issue cost outweighs the benefit of preserving
- * unrelated TLB entries. 16 pages == 64 KiB covers RELRO and other typical
- * mprotect / munmap targets.
+/* Cap selective per-page TLBI VAE1IS at this many 4 KiB pages. Beyond this,
+ * use TLBI RVAE1IS if FEAT_TLBIRANGE is available, else fall back to
+ * TLBI_BROADCAST: per-instruction issue cost outweighs the benefit once the
+ * range is large. 16 pages == 64 KiB covers RELRO and other typical mprotect
+ * / munmap targets.
  */
 #define TLBI_SELECTIVE_MAX_PAGES 16
 
+/* Cap single-shot TLBI RVAE1IS at this many 4 KiB pages. With SCALE=0 the
+ * RVAE1IS operand encoding covers (NUM+1)*2 pages with NUM in [0..31], so a
+ * single instruction reaches 64 pages == 256 KiB. Beyond that the host would
+ * need SCALE=1 (NUM*64 step), which over-invalidates for the typical
+ * dynamic-linker RELRO / glibc-bring-up storm sizes seen in practice; stay
+ * at SCALE=0 for now and broadcast above 64 pages.
+ */
+#define TLBI_RVAE_MAX_PAGES 64
+
+/* TLBI RVAE1IS operand bit-field constants. Per ARM ARM DDI 0487J.a D8.7.6
+ * the operand layout is:
+ *   bits [36:0]   BaseADDR  (VA[48:12] for 4 KiB granule, DS=0)
+ *   bits [38:37]  TTL       (0 = any level)
+ *   bits [43:39]  NUM
+ *   bits [45:44]  SCALE
+ *   bits [47:46]  TG        (00 = RESERVED, 01 = 4 KiB, 10 = 16 KiB,
+ *                            11 = 64 KiB)
+ *   bits [63:48]  ASID
+ * elfuse only ever issues 4 KiB-granule TLBIs (TCR_EL1.TG0 = 4 KiB), so
+ * TG is hard-pinned to 01 and the corresponding bit is named here. */
+#define RVAE_OPERAND_BADDR_MASK ((1ULL << 37) - 1)
+#define RVAE_OPERAND_NUM_SHIFT 39
+#define RVAE_OPERAND_TG_4KB (1ULL << 46)
+
+/* Pure encoder: build the TLBI RVAE1IS Xt operand from a 4 KiB-aligned VA
+ * and a page count in the SCALE=0 range (1..TLBI_RVAE_MAX_PAGES). Lives in
+ * the header as `static inline` so tlbi_request_emit_to_vcpu and any
+ * future caller (host-side unit tests included) compile to the same
+ * expression. NUM = ceil(pages / 2) - 1 over-invalidates odd page counts
+ * by exactly one page, which is a perf-only side effect (the extra
+ * invalidation evicts a neighbour TLB entry that the guest's next access
+ * reloads). pages < 2 is clamped to 2 because SCALE=0 NUM=0 means 2
+ * pages -- the encoder cannot represent a single page through RVAE1IS;
+ * single-page callers go through the per-page VAE1IS path instead, but
+ * the clamp keeps the encoder total in any pathological input. */
+static inline uint64_t tlbi_rvae1is_operand(uint64_t start_va, uint16_t pages)
+{
+    /* SCALE=0 counts page pairs: round up, floor at 1 pair, cap at 32. */
+    uint64_t pairs = (pages < 2) ? 1 : ((uint64_t) pages + 1) / 2;
+    if (pairs > 32)
+        pairs = 32;
+    uint64_t operand = (start_va >> 12) & RVAE_OPERAND_BADDR_MASK;
+    operand |= (pairs - 1) << RVAE_OPERAND_NUM_SHIFT; /* NUM = pairs - 1 */
+    return operand | RVAE_OPERAND_TG_4KB;
+}
+
+/* Runtime feature flag: TRUE when the host PE implements FEAT_TLBIRANGE
+ * (ARMv8.4+, present on every Apple Silicon M1+). Probed once at bootstrap.
+ * Read-only after startup so callers do not need an atomic load. */
+extern bool g_tlbi_range_supported;
+
 typedef struct {
-    uint8_t kind;   /* tlbi_kind_t */
-    uint16_t pages; /* Page count when kind == TLBI_RANGE (1..MAX) */
-    uint64_t start; /* Page-aligned VA when kind == TLBI_RANGE */
+    uint8_t kind;         /* tlbi_kind_t */
+    uint8_t icache_flush; /* 1 = the change introduced executable content
+                           *     visible to EL0, so the shim must IC IALLU
+                           *     after the TLBI sequence. 0 = data-only
+                           *     change, skip the I-cache invalidation. */
+    uint16_t pages;       /* Page count for TLBI_RANGE / TLBI_RANGE_LARGE */
+    uint64_t start;       /* Page-aligned VA for the same two kinds */
 } tlbi_request_t;
+/* Layout contract: 16 bytes (1+1+2+4 padding+8). Documents the padding and
+ * pins the TLS slot size so future field additions surface as a build break
+ * rather than silently growing the per-vCPU footprint. */
+_Static_assert(sizeof(tlbi_request_t) == 16,
+               "tlbi_request_t must stay 16 bytes; update tlbi_request_clear "
+               "and the syscall epilogue if the layout changes");
 
 /* Multi-region IPA mapping.
  *
@@ -486,6 +551,7 @@ extern _Thread_local tlbi_request_t cpu_tlbi_req;
 static inline void tlbi_request_clear(void)
 {
     cpu_tlbi_req.kind = TLBI_NONE;
+    cpu_tlbi_req.icache_flush = 0;
     cpu_tlbi_req.pages = 0;
     cpu_tlbi_req.start = 0;
 }
@@ -495,6 +561,72 @@ static inline void tlbi_request_broadcast(void)
     cpu_tlbi_req.kind = TLBI_BROADCAST;
 }
 
+/* Report whether this thread's accumulator has already been promoted to
+ * TLBI_BROADCAST. PT mutation helpers sample this so they can skip their
+ * changed_lo / changed_hi bounding-box tracking and the trailing
+ * tlbi_request_range call once a full flush is pending;
+ * tlbi_request_range short-circuits in that state as well. */
+static inline bool tlbi_request_is_broadcast(void)
+{
+    const tlbi_kind_t pending = (tlbi_kind_t) cpu_tlbi_req.kind;
+    return pending == TLBI_BROADCAST;
+}
+
+/* Record that the current syscall's PT mutation introduced executable
+ * content visible to EL0 (a new X mapping, or an mprotect that added
+ * MEM_PERM_X to a previously-NX page). tlbi_request_emit_to_vcpu forwards
+ * the flag in X11 alongside a pending TLBI so the shim can IC IALLU after
+ * the TLBI sequence; data-only changes (mprotect RW<->R, munmap of data,
+ * etc.) leave it cleared so the I-cache invalidation is skipped.
+ */
+static inline void tlbi_request_mark_icache(void)
+{
+    cpu_tlbi_req.icache_flush = 1;
+}
+
+/* Hand the pending TLBI request to the shim and reset the accumulator.
+ * The request kind is translated to the X8 wire code (0 = none, 1 =
+ * broadcast, 3 = per-page range, 4 = single-shot RVAE1IS range), with the
+ * operands in X9/X10 and the I-cache hint in X11. Both the syscall HVC #5
+ * epilogue and the HVC #11 EL0-fault handler call this, so every TLBI the
+ * host requests on behalf of the guest uses one encoding. X11 is written
+ * only when a TLBI is pending; X9/X10 only for the kinds that use them.
+ */
+static inline void tlbi_request_emit_to_vcpu(hv_vcpu_t vcpu)
+{
+    const tlbi_kind_t kind = (tlbi_kind_t) cpu_tlbi_req.kind;
+    const uint64_t ic_hint = cpu_tlbi_req.icache_flush ? 1 : 0;
+
+    if (kind == TLBI_BROADCAST) {
+        hv_vcpu_set_reg(vcpu, HV_REG_X8, 1);
+        hv_vcpu_set_reg(vcpu, HV_REG_X11, ic_hint);
+    } else if (kind == TLBI_RANGE) {
+        /* Per-page VAE1IS loop: X9 = first page VA, X10 = page count. */
+        hv_vcpu_set_reg(vcpu, HV_REG_X8, 3);
+        hv_vcpu_set_reg(vcpu, HV_REG_X9, cpu_tlbi_req.start);
+        hv_vcpu_set_reg(vcpu, HV_REG_X10, cpu_tlbi_req.pages);
+        hv_vcpu_set_reg(vcpu, HV_REG_X11, ic_hint);
+    } else if (kind == TLBI_RANGE_LARGE) {
+        /* Single-shot TLBI RVAE1IS for ranges in (16..64] pages; X9 holds
+         * the operand pre-encoded by tlbi_rvae1is_operand (SCALE=0, TG=01,
+         * ASID=0). ASID stays 0 because the shim runs single-ASID
+         * (TCR_EL1.A1=0, TTBR0 ASID=0; rosetta does not allocate a
+         * separate ASID). A future non-zero ASID needs an ASID field in
+         * both the encoder signature and the tlbi_request_t accumulator. */
+        const uint64_t rvae_operand =
+            tlbi_rvae1is_operand(cpu_tlbi_req.start, cpu_tlbi_req.pages);
+        hv_vcpu_set_reg(vcpu, HV_REG_X8, 4);
+        hv_vcpu_set_reg(vcpu, HV_REG_X9, rvae_operand);
+        hv_vcpu_set_reg(vcpu, HV_REG_X11, ic_hint);
+    } else {
+        /* TLBI_NONE (or an unknown kind): nothing to flush, so only X8 is
+         * written and a set icache_flush flag is not forwarded. */
+        hv_vcpu_set_reg(vcpu, HV_REG_X8, 0);
+    }
+
+    tlbi_request_clear();
+}
+
 static inline void tlbi_request_range(uint64_t start, uint64_t end)
 {
     if (cpu_tlbi_req.kind == TLBI_BROADCAST)
@@ -513,26 +645,34 @@ static inline void tlbi_request_range(uint64_t start, uint64_t end)
     uint64_t s = start & ~mask;
     uint64_t e = (end + mask) & ~mask;
     uint64_t n = (e - s) >> 12;
-    if (n > TLBI_SELECTIVE_MAX_PAGES) {
+    /* Two thresholds. (a) <= TLBI_SELECTIVE_MAX_PAGES uses the per-page
+     * VAE1IS loop, which preserves the most TLB entries. (b) <=
+     * TLBI_RVAE_MAX_PAGES uses a single TLBI RVAE1IS via FEAT_TLBIRANGE,
+     * which still preserves unrelated TLB entries but costs only one
+     * instruction issue. Above TLBI_RVAE_MAX_PAGES or when the feature is
+     * absent, broadcast (TLBI VMALLE1IS). */
+    uint64_t large_cap =
+        g_tlbi_range_supported ? TLBI_RVAE_MAX_PAGES : TLBI_SELECTIVE_MAX_PAGES;
+    if (n > large_cap) {
         tlbi_request_broadcast();
         return;
     }
     if (cpu_tlbi_req.kind == TLBI_NONE) {
-        cpu_tlbi_req.kind = TLBI_RANGE;
+        cpu_tlbi_req.kind =
+            (n > TLBI_SELECTIVE_MAX_PAGES) ? TLBI_RANGE_LARGE : TLBI_RANGE;
         cpu_tlbi_req.start = s;
         cpu_tlbi_req.pages = (uint16_t) n;
         return;
     }
-    /* TLBI_RANGE: coalesce by union. Disjoint ranges still produce a single
-     * bounding interval; if it stays within the cap, the per-page TLBI loop
-     * still wins over a full flush by preserving the rest of the TLB.
+    /* Coalesce by union. Disjoint ranges still produce a single bounding
+     * interval; if it stays within the active cap, the range TLBI still
+     * wins over a full flush by preserving unrelated TLB entries.
      */
     uint64_t es = cpu_tlbi_req.start;
     uint64_t pe = (uint64_t) cpu_tlbi_req.pages * 4096ULL;
-    /* The accumulator only ever holds page counts <= TLBI_SELECTIVE_MAX_PAGES
-     * (see the cap check above), so es + pe never overflows on real callers,
-     * but be explicit.
-     */
+    /* The accumulator only ever holds page counts <= large_cap (enforced by
+     * the cap check above), so es + pe never overflows on real callers, but
+     * be explicit. */
     if (es > UINT64_MAX - pe) {
         tlbi_request_broadcast();
         return;
@@ -541,12 +681,17 @@ static inline void tlbi_request_range(uint64_t start, uint64_t end)
     uint64_t us = s < es ? s : es;
     uint64_t ue = e > ee ? e : ee;
     uint64_t un = (ue - us) >> 12;
-    if (un > TLBI_SELECTIVE_MAX_PAGES) {
+    if (un > large_cap) {
         tlbi_request_broadcast();
         return;
     }
     cpu_tlbi_req.start = us;
     cpu_tlbi_req.pages = (uint16_t) un;
+    /* Promote kind if the coalesced range now exceeds the per-page cap. The
+     * inverse direction (LARGE -> RANGE) is impossible because un >= pe / 4096
+     * after coalescing. */
+    if (un > TLBI_SELECTIVE_MAX_PAGES)
+        cpu_tlbi_req.kind = TLBI_RANGE_LARGE;
 }
 
 /* Convert a guest offset (0-based) to an IPA/VA (ipa_base + offset) */
diff --git a/src/core/shim.S b/src/core/shim.S
index a2613c3..47ee5ed 100644
--- a/src/core/shim.S
+++ b/src/core/shim.S
@@ -22,11 +22,32 @@
  *                            2 = execve replaced register state (full flush
  *                                + drop frame + ERET without GPR restore)
  *                            3 = selective: TLBI VAE1IS over X10 pages
- *                                starting at page-aligned VA in X9)
+ *                                starting at page-aligned VA in X9
+ *                            4 = single-shot TLBI RVAE1IS (FEAT_TLBIRANGE);
+ *                                X9 carries the pre-encoded RVAE1IS operand
+ *                                (baddr | TTL<<37 | NUM<<39 | SCALE<<44 |
+ *                                TG=01<<46 | ASID<<48; SCALE/TTL/ASID=0 today)
+ *                          X11 carries the I-cache hint for X8 in {1, 3, 4}:
+ *                          1 = IC IALLU after the TLBI sequence (new
+ *                          executable content visible to EL0), 0 = skip the
+ *                          I-cache invalidation (data-only PT change). The
+ *                          X8 == 2 exec_drop_frame path always flushes
+ *                          regardless because execve loads new code. The
+ *                          shim restores X11 from the saved frame before
+ *                          ERET so the EL0 caller never observes this hint.)
  *   #7  MRS trap          (host reads reg from ESR ISS; returns value in x0)
  *   #9  W^X toggle        (x0=FAR, x1=type: 0=exec->RX, 1=write->RW)
  *   #10 BRK from EL0      (SIGTRAP delivery / ptrace-stop; GPRs in frame)
- *   #11 EL0 fault         (SIGSEGV/SIGILL delivery; GPRs in frame)
+ *   #11 EL0 fault         (SIGSEGV/SIGILL delivery; GPRs in frame.
+ *                          On return the host sets X8 to the same TLBI
+ *                          wire code as SVC #5 above (0/1/3/4 only;
+ *                          X8 == 2 exec_drop_frame is rejected and falls
+ *                          through to the conservative full flush). The
+ *                          lazy MAP_NORESERVE materialize path uses this
+ *                          to invalidate any negative TLB entry the EL0
+ *                          retry would otherwise re-fault on. Eret tail
+ *                          preserves X0/X1/X2/X30 so signal_deliver's
+ *                          register writes survive.)
  *   #12 System instr trap (cache maintenance logging: DC CVAU, IC IVAU, etc.)
  *
  * macOS as uses ';' as a comment character on AArch64, NOT as a statement
@@ -118,6 +139,35 @@
     add sp, sp, #256
 .endm
 
+/* RESTORE_GPRS_KEEP_SIGFRAME: load X3-X29 from the saved frame and pop it,
+ * leaving X0/X1/X2/X30 untouched. signal_deliver writes the signum, siginfo
+ * pointer, ucontext pointer, and sa_restorer address into those live regs
+ * via hv_vcpu_set_reg before returning from HVC #11; the standard
+ * RESTORE_GPRS_KEEP_X0 tail would clobber X1/X2/X30 with their pre-fault
+ * EL0 values. Used by the HVC #11 post-handler so the lazy-materialize
+ * path can run TLBI ops (which clobber X11/X12/X13 as scratch) while
+ * still preserving signal_deliver's register writes on the SIGSEGV /
+ * SIGILL delivery path. The host is responsible for setting X8 = 0 if no
+ * TLBI is needed; X8-X13 are reloaded from the frame here regardless.
+ */
+.macro RESTORE_GPRS_KEEP_SIGFRAME
+    ldr x3, [sp, #24]            /* X0-X2 are deliberately not reloaded */
+    ldp x4, x5, [sp, #32]
+    ldp x6, x7, [sp, #48]
+    ldp x8, x9, [sp, #64]        /* overwrites the post-HVC X8/X9 values */
+    ldp x10, x11, [sp, #80]      /* likewise X10/X11 */
+    ldp x12, x13, [sp, #96]      /* likewise the X12/X13 scratch regs */
+    ldp x14, x15, [sp, #112]
+    ldp x16, x17, [sp, #128]
+    ldp x18, x19, [sp, #144]
+    ldp x20, x21, [sp, #160]
+    ldp x22, x23, [sp, #176]
+    ldp x24, x25, [sp, #192]
+    ldp x26, x27, [sp, #208]
+    ldp x28, x29, [sp, #224]     /* X30 is deliberately not reloaded */
+    add sp, sp, #256             /* pop the 256-byte frame */
+.endm
+
 /* ZERO_GPRS: clear X0-X30 (used before the EL1->EL0 transition). */
 .macro ZERO_GPRS
     .irp r, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
@@ -743,19 +793,104 @@ handle_data_abort:
  * EC=0x00 (undefined) or other -> SIGILL.
  */
 handle_el0_fault:
-    /* Restore all GPRs from stack frame (host needs EL0 state) */
-    RESTORE_GPRS
+    /* Load EL0 GPRs from the saved frame WITHOUT popping it. The frame
+     * must stay live across HVC #11 so the post-HVC dispatch below can
+     * restore X3-X29 via the frameless RESTORE_GPRS_KEEP_SIGFRAME tail
+     * after the host has overwritten X0/X1/X2/X30 via hv_vcpu_set_reg
+     * (signal_deliver) and the inline TLBI handlers have clobbered
+     * X11/X12/X13 as scratch.
+     */
+    LOAD_GPRS
 
-    /* Forward to host for SIGSEGV delivery.
-     * Host reads ESR_EL1 (fault syndrome), FAR_EL1 (faulting address), and
-     * ELR_EL1 (PC at fault) to decode the fault and build the signal frame.
+    /* Forward to host. Host paths:
+     *   (1) signal delivery -> writes X0=signum, X1=siginfo*, X2=ucontext*,
+     *       X30=sa_restorer via hv_vcpu_set_reg; host leaves X8=0.
+     *   (2) lazy MAP_NORESERVE materialize -> populates cpu_tlbi_req via
+     *       guest_extend_page_tables / guest_update_perms / friends, then
+     *       calls tlbi_request_emit_to_vcpu(vcpu) which writes
+     *       X8/X9/X10/X11 per the same wire protocol the SVC #5 epilogue
+     *       uses. Single-shot RVAE1IS via X8=4 is included.
+     *
+     * The dispatch below mirrors handle_svc_0's TLBI dispatch. exec_drop
+     * (X8=2) is rejected on this path (the host never sets it here); fall
+     * through to the conservative full flush rather than silently skip.
      */
     hvc #11
 
-    /* Host has set up signal frame (if SIGSEGV handler registered) or
-     * flagged for termination (default SIGSEGV = core dump). ERET to
-     * new PC (handler address or unchanged if terminating).
+    cbz x8, .Lel0_fault_eret_only
+    cmp x8, #1
+    b.eq .Lel0_fault_tlbi_full
+    cmp x8, #3
+    b.eq .Lel0_fault_tlbi_sel
+    cmp x8, #4
+    b.eq .Lel0_fault_tlbi_rvae
+    /* Unknown X8: conservative broadcast + I-cache flush. */
+    mov x11, #1
+    b .Lel0_fault_tlbi_full
+
+.Lel0_fault_tlbi_full:
+    /* Broadcast TLB + conditional I-cache flush. X11=0 skips IC IALLU. */
+    tlbi vmalle1is
+    dsb ish
+    cbz x11, .Lel0_fault_full_no_ic
+    ic iallu
+    dsb ish
+.Lel0_fault_full_no_ic:
+    isb
+    b .Lel0_fault_eret_restore
+
+.Lel0_fault_tlbi_sel:
+    /* Selective per-page TLBI VAE1IS loop. X9 = page-aligned VA,
+     * X10 = page count (1..TLBI_SELECTIVE_MAX_PAGES). X11 carries the
+     * I-cache hint on entry; save into X13 before the loop clobbers X11.
+     * ubfx (not plain lsr) pins the VA to the 44-bit [43:0] field so a
+     * future LPA2 / TTL / tagged-address change cannot leak high bits
+     * into the operand's TTL [47:44] or ASID [63:48] fields. */
+    cbz x10, .Lel0_fault_eret_only
+    mov x13, x11
+    ubfx x11, x9, #12, #44
+    mov x12, x10
+4:  tlbi vae1is, x11
+    add x11, x11, #1
+    subs x12, x12, #1
+    b.ne 4b
+    dsb ish
+    cbz x13, .Lel0_fault_sel_no_ic
+    ic iallu
+    dsb ish
+.Lel0_fault_sel_no_ic:
+    isb
+    b .Lel0_fault_eret_restore
+
+.Lel0_fault_tlbi_rvae:
+    /* Single-shot TLBI RVAE1IS (FEAT_TLBIRANGE). X9 carries the pre-encoded
+     * operand (baddr | NUM<<39 | TG=01<<46); X11 the I-cache hint. */
+    tlbi rvae1is, x9
+    dsb ish
+    cbz x11, .Lel0_fault_rvae_no_ic
+    ic iallu
+    dsb ish
+.Lel0_fault_rvae_no_ic:
+    isb
+    /* fall through */
+
+.Lel0_fault_eret_restore:
+    /* TLBI clobbered X11/X12/X13 (and possibly X9/X10 in the selective
+     * path). Reload X3-X29 from the saved frame so the EL0 retry sees the
+     * same scratch state as pre-fault; skip X0/X1/X2/X30 in case
+     * signal delivery set them. On lazy materialization and conservative
+     * unknown-X8 fallback paths, the skipped registers still match the frame.
      */
+    RESTORE_GPRS_KEEP_SIGFRAME
+    eret
+
+.Lel0_fault_eret_only:
+    /* X8 == 0: no TLBI requested. signal-delivery and no-delivery paths
+     * land here after the host wrote X8 as the post-HVC protocol value.
+     * Reload X3-X29 so EL0 sees the pre-fault scratch state, while keeping
+     * X0/X1/X2/X30 live for a materialized signal handler.
+     */
+    RESTORE_GPRS_KEEP_SIGFRAME
     eret
 
 /* Shared exit paths for exception handlers
@@ -768,7 +903,11 @@ tlbi_restore_eret:
      * raise further exceptions, so FAR_EL1 is preserved.
      */
     mrs x0, far_el1
-    lsr x0, x0, #12          /* TLBI VAE1IS operand: VA[55:12] */
+    /* TLBI VAE1IS operand: VA[55:12] held in bits [43:0]. ubfx pins the
+     * 44-bit width so future LPA2 / TTL / tagged-address support cannot
+     * leak VA bits into the TTL [47:44] or ASID [63:48] operand fields.
+     */
+    ubfx x0, x0, #12, #44
     tlbi vae1is, x0
     dsb ish
     ic iallu
@@ -856,31 +995,39 @@ handle_svc_0:
     b.eq tlbi_full
     cmp x8, #3
     b.eq tlbi_selective
+    cmp x8, #4
+    b.eq tlbi_range_large
     cmp x8, #2
     b.eq exec_drop_frame
-    /* Unknown X8: be conservative, broadcast and continue. */
+    /* Unknown X8: be conservative, broadcast, flush I-cache, and continue. */
+    mov x11, #1
 
 tlbi_full:
-    /* Broadcast TLB + I-cache flush. Used for page-table edits whose
-     * affected range exceeds the selective cap, or any time the host could
-     * not bound the change.
+    /* Broadcast TLB + (conditional) I-cache flush. Used for page-table edits
+     * whose affected range exceeds the selective cap, or any time the host
+     * could not bound the change. X11 carries the I-cache hint: non-zero
+     * means the host introduced executable content visible to EL0 (new X
+     * mapping, NX->X mprotect, lazy materialize from a region that may
+     * include exec), so the shim must IC IALLU; zero means a data-only
+     * PT change and the I-cache invalidation is skipped.
      */
     tlbi vmalle1is
     dsb ish
+    cbz x11, .Ltlbi_full_skip_ic
     ic iallu
     dsb ish
+.Ltlbi_full_skip_ic:
     isb
-    b 1f
+    b svc_restore_eret
 
 tlbi_selective:
     /* Selective TLBI VAE1IS loop.
      *   x9  = page-aligned VA of the first page to invalidate
      *   x10 = page count (1..TLBI_SELECTIVE_MAX_PAGES, see core/guest.h)
+     *   x11 = I-cache hint (see tlbi_full above)
      * TLBI VAE1IS takes a Xt operand of (VA[55:12] | (ASID << 48)). The
      * guest runs single-ASID at EL0, so just shift the VA right by 12.
-     * Issue all TLBI ops, then a single DSB ISH + IC IALLU + DSB + ISB
-     * matches broadcast semantics (preserves I-cache invalidation behavior
-     * for callers like file-backed mmap of executable pages).
+     * Issue all TLBI ops, then DSB ISH + (conditional) IC IALLU + DSB + ISB.
      *
      * Defensive: if x10 == 0, skip the loop. The per-vCPU host-side
      * accumulator (cpu_tlbi_req in core/guest.h) never sets pages == 0
@@ -888,17 +1035,43 @@ tlbi_selective:
      * write ever produced the pair X8=3, X10=0, the subs x12, x12, #1
      * below would underflow to 0xFFFFFFFFFFFFFFFF and the b.ne would loop
      * ~2^64 iterations, hanging this vCPU. Cheap guard.
+     *
+     * x11 is the I-cache hint on entry but the per-page TLBI operand is
+     * also computed from x9 -- save the hint into x13 before clobbering.
      */
     cbz x10, 1f
-    lsr x11, x9, #12          /* x11 = VA >> 12 (current page operand) */
+    mov x13, x11              /* x13 = saved I-cache hint */
+    /* TLBI VAE1IS operand: VA[55:12] held in bits [43:0]. ubfx pins the
+     * 44-bit width so future LPA2 / TTL / tagged-address support cannot
+     * leak VA bits into the TTL [47:44] or ASID [63:48] operand fields. */
+    ubfx x11, x9, #12, #44    /* x11 = VA[55:12] (current page operand) */
     mov x12, x10              /* x12 = remaining page counter */
 3:  tlbi vae1is, x11
     add x11, x11, #1          /* next page (operand is in 4 KiB units) */
     subs x12, x12, #1
     b.ne 3b
     dsb ish
+    cbz x13, .Ltlbi_sel_skip_ic
+    ic iallu
+    dsb ish
+.Ltlbi_sel_skip_ic:
+    isb
+    b svc_restore_eret
+
+tlbi_range_large:
+    /* Single-shot TLBI RVAE1IS (FEAT_TLBIRANGE, ARMv8.4+). The host has
+     * encoded the full operand in X9: baddr (VA >> 12), TTL=0, NUM in bits
+     * [43:39], SCALE=0, TG=01 (4 KiB granule) in bits [47:46], ASID=0. One
+     * instruction covers up to 64 pages, avoiding the broadcast TLBI
+     * VMALLE1IS that the prior selective cap forced for 17..64-page
+     * ranges. X11 carries the I-cache hint as in tlbi_full / tlbi_selective.
+     */
+    tlbi rvae1is, x9
+    dsb ish
+    cbz x11, .Ltlbi_rvae_skip_ic
     ic iallu
     dsb ish
+.Ltlbi_rvae_skip_ic:
     isb
     b svc_restore_eret
 
diff --git a/src/syscall/proc.c b/src/syscall/proc.c
index 33cde52..647bab8 100644
--- a/src/syscall/proc.c
+++ b/src/syscall/proc.c
@@ -1585,70 +1585,25 @@ int vcpu_run_loop(hv_vcpu_t vcpu,
 
                     uint32_t fault_ec = (uint32_t) ((esr >> 26) & 0x3F);
 
-                    int signum, si_code;
-                    uint64_t si_addr;
-
-                    if (fault_ec == 0x20 || fault_ec == 0x24) {
-                        /* Instruction or data abort -> check lazy page
-                         * materialization before delivering SIGSEGV.
-                         */
-                        uint32_t fsc = (uint32_t) (esr & 0x3F);
-                        uint32_t fsc_type = (fsc >> 2) & 0xF;
-
-                        /* Translation faults have xFSC[5:2] == 0x1; the low
-                         * bits select the translation table level. These may
-                         * come from a MAP_NORESERVE region with deferred page
-                         * table creation, so try to materialize the page
-                         * before declaring SIGSEGV.
-                         */
-                        if (fsc_type == 0x01) {
-                            uint64_t fault_off = far_addr - g->ipa_base;
-                            pthread_mutex_lock(&mmap_lock);
-                            int mat = guest_materialize_lazy(g, fault_off);
-                            pthread_mutex_unlock(&mmap_lock);
-                            if (mat == 0) {
-                                /* Page materialized; TLBI and retry the
-                                 * faulting instruction. Set X8=1 to request
-                                 * TLBI from the shim before ERET.
-                                 */
-                                hv_vcpu_set_reg(vcpu, HV_REG_X8, 1);
-                                break;
-                            }
-                        }
-
-                        signum = LINUX_SIGSEGV;
-                        /* Permission faults have xFSC[5:2] == 0x3. Address
-                         * size, translation, and access-flag faults remain
-                         * mapping errors for Linux-visible SIGSEGV delivery.
-                         */
-                        si_code = (fsc_type == 0x03) ? LINUX_SEGV_ACCERR
-                                                     : LINUX_SEGV_MAPERR;
-                        si_addr = far_addr;
-
-                        if (verbose) {
-                            const char *fault_type =
-                                (fault_ec == 0x20) ? "inst" : "data";
-                            const char *code_name =
-                                (si_code == LINUX_SEGV_MAPERR) ? "MAPERR"
-                                                               : "ACCERR";
-                            log_debug(
-                                "%s: EL0 %s fault at 0x%llx "
-                                "PC=0x%llx (ESR=0x%llx FSC=0x%x) "
-                                "-> SIGSEGV/%s",
-                                prefix, fault_type,
-                                (unsigned long long) far_addr,
-                                (unsigned long long) elr_addr,
-                                (unsigned long long) esr, fsc, code_name);
-                        }
-                    } else {
-                        /* EC=0x00 (undefined instruction) or other unrecognized
-                         * EC from EL0 -> SIGILL. Use ELR_EL1 as si_addr because
-                         * FAR_EL1 is UNKNOWN for non-abort exceptions.
-                         */
-                        signum = LINUX_SIGILL;
-                        si_code = LINUX_ILL_ILLOPC;
-                        si_addr = elr_addr;
-
+                    /* Non-abort EC -> SIGILL. Branch out early so the
+                     * abort / SIGSEGV path below stays at the case-body
+                     * indent rather than nested inside an else branch.
+                     * FAR_EL1 is UNKNOWN for non-abort exceptions, so use
+                     * ELR_EL1 for si_addr.
+                     *
+                     * Only EC 0x20 (instruction abort from a lower EL) and
+                     * EC 0x24 (data abort from a lower EL) are intentionally
+                     * routed to the SIGSEGV path that follows. Every other
+                     * forwarded EC -- 0x00 (undefined instruction), 0x18
+                     * (system instruction trap), 0x32/0x33 (software
+                     * step), 0x3C (BRK), and any unrecognized class --
+                     * lands here as SIGILL. If a future change adds a new
+                     * lower-EL abort class (e.g. 0x21 / 0x25 for higher
+                     * exception levels) that should map to SIGSEGV, the
+                     * test below needs explicit widening; do NOT relax
+                     * the check casually.
+                     */
+                    if (fault_ec != 0x20 && fault_ec != 0x24) {
                         if (verbose)
                             log_debug(
                                 "%s: EL0 undefined insn at "
@@ -1656,18 +1611,91 @@ int vcpu_run_loop(hv_vcpu_t vcpu,
                                 "-> SIGILL/ILL_ILLOPC",
                                 prefix, (unsigned long long) elr_addr,
                                 (unsigned long long) esr, fault_ec);
+                        signal_set_fault_info(LINUX_ILL_ILLOPC, elr_addr, esr);
+                        signal_queue(LINUX_SIGILL);
+                        int sig_ret = signal_deliver(vcpu, g, &exit_code);
+                        /* HVC #11 consumes X8 as the post-fault TLBI opcode.
+                         * signal_deliver() may leave it unchanged when no
+                         * handler is materialized, or set the syscall-path
+                         * frame-drop marker when one is. Neither is a TLBI
+                         * request here; lazy materialization emits its own
+                         * request and exits before this path.
+                         */
+                        hv_vcpu_set_reg(vcpu, HV_REG_X8, 0);
+                        if (verbose)
+                            log_debug("%s: signal %d deliver returned %d",
+                                      prefix, LINUX_SIGILL, sig_ret);
+                        if (sig_ret < 0)
+                            running = false; /* SIG_DFL core => terminate. */
+                        break;
+                    }
+
+                    /* Instruction or data abort. Try lazy page materialization
+                     * before declaring SIGSEGV: translation faults
+                     * (xFSC[5:2] == 0x1) may come from a MAP_NORESERVE region
+                     * with deferred page-table creation.
+                     */
+                    uint32_t fsc = (uint32_t) (esr & 0x3F);
+                    uint32_t fsc_type = (fsc >> 2) & 0xF;
+                    if (fsc_type == 0x01) {
+                        uint64_t fault_off = far_addr - g->ipa_base;
+                        pthread_mutex_lock(&mmap_lock);
+                        int mat = guest_materialize_lazy(g, fault_off);
+                        pthread_mutex_unlock(&mmap_lock);
+                        if (mat == 0) {
+                            /* Page materialized; the helpers inside
+                             * guest_materialize_lazy populated the
+                             * per-vCPU TLBI accumulator with the range
+                             * just installed (plus the I-cache hint if
+                             * the region's prot includes PROT_EXEC).
+                             * Drain it through the shared emit helper
+                             * so the shim's post-HVC-11 dispatch
+                             * (handle_el0_fault) actually issues the
+                             * TLBI before ERET. Without this, a PE that
+                             * caches translation-fault (negative)
+                             * entries would re-fault on the retry,
+                             * looping until the entry self-evicts. */
+                            tlbi_request_emit_to_vcpu(vcpu);
+                            break;
+                        }
                     }
 
-                    signal_set_fault_info(si_code, si_addr, esr);
-                    signal_queue(signum);
+                    /* Real SIGSEGV. Permission faults (xFSC[5:2] == 0x3) map
+                     * to SEGV_ACCERR; address size, translation, and
+                     * access-flag faults map to SEGV_MAPERR for Linux.
+                     */
+                    int si_code = (fsc_type == 0x03) ? LINUX_SEGV_ACCERR
+                                                     : LINUX_SEGV_MAPERR;
+                    if (verbose) {
+                        const char *fault_type =
+                            (fault_ec == 0x20) ? "inst" : "data";
+                        const char *code_name = (si_code == LINUX_SEGV_MAPERR)
+                                                    ? "MAPERR"
+                                                    : "ACCERR";
+                        log_debug(
+                            "%s: EL0 %s fault at 0x%llx "
+                            "PC=0x%llx (ESR=0x%llx FSC=0x%x) "
+                            "-> SIGSEGV/%s",
+                            prefix, fault_type, (unsigned long long) far_addr,
+                            (unsigned long long) elr_addr,
+                            (unsigned long long) esr, fsc, code_name);
+                    }
+                    signal_set_fault_info(si_code, far_addr, esr);
+                    signal_queue(LINUX_SIGSEGV);
                     int sig_ret = signal_deliver(vcpu, g, &exit_code);
+                    /* HVC #11 consumes X8 as the post-fault TLBI opcode.
+                     * signal_deliver() may leave it unchanged when no
+                     * handler is materialized, or set the syscall-path
+                     * frame-drop marker when one is. Neither is a TLBI
+                     * request here; lazy materialization emits its own
+                     * request and exits before this path.
+                     */
+                    hv_vcpu_set_reg(vcpu, HV_REG_X8, 0);
                     if (verbose)
                         log_debug("%s: signal %d deliver returned %d", prefix,
-                                  signum, sig_ret);
-                    if (sig_ret < 0) {
-                        /* Core dispositions terminate without a core file. */
-                        running = false;
-                    }
+                                  LINUX_SIGSEGV, sig_ret);
+                    if (sig_ret < 0)
+                        running = false; /* SIG_DFL core => terminate. */
                     break;
                 }
 
diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c
index be97787..3b67e2d 100644
--- a/src/syscall/syscall.c
+++ b/src/syscall/syscall.c
@@ -1883,33 +1883,17 @@ int syscall_dispatch(hv_vcpu_t vcpu, guest_t *g, int *exit_code, bool verbose)
         hv_vcpu_set_reg(vcpu, HV_REG_X0, (uint64_t) result);
 
         /* Signal the shim to flush TLB if this vCPU modified page tables.
-         * Protocol after HVC #5 (X8 carries the request):
-         *   0 -> skip
-         *   1 -> broadcast TLBI VMALLE1IS
-         *   2 -> reserved for execve (set by sys_execve, never reached here)
-         *   3 -> selective TLBI VAE1IS over X10 pages starting at X9
-         * Must explicitly write X8 because the shim reads its post-HVC value;
-         * the pre-syscall X8 is the syscall number (always non-zero) and would
-         * spuriously TLBI on every return.
+         * Protocol after HVC #5 lives in tlbi_request_emit_to_vcpu (see
+         * src/core/guest.h); the helper also handles the HVC #11 EL0-fault
+         * lazy-materialize path so both call sites use the same wire codes.
+         * Must call the emit helper because the shim reads X8 unconditionally
+         * on return; the pre-syscall X8 is the syscall number (always
+         * non-zero) and would spuriously TLBI on every return.
          *
          * cpu_tlbi_req is a per-vCPU TLS slot, so this read needs no lock and
          * cannot be drained or torn by another vCPU's epilogue.
          */
-        switch ((tlbi_kind_t) cpu_tlbi_req.kind) {
-        case TLBI_BROADCAST:
-            hv_vcpu_set_reg(vcpu, HV_REG_X8, 1);
-            break;
-        case TLBI_RANGE:
-            hv_vcpu_set_reg(vcpu, HV_REG_X8, 3);
-            hv_vcpu_set_reg(vcpu, HV_REG_X9, cpu_tlbi_req.start);
-            hv_vcpu_set_reg(vcpu, HV_REG_X10, cpu_tlbi_req.pages);
-            break;
-        case TLBI_NONE:
-        default:
-            hv_vcpu_set_reg(vcpu, HV_REG_X8, 0);
-            break;
-        }
-        tlbi_request_clear();
+        tlbi_request_emit_to_vcpu(vcpu);
     }
 
     return should_exit;
diff --git a/tests/manifest.txt b/tests/manifest.txt
index 19b1b27..b819975 100644
--- a/tests/manifest.txt
+++ b/tests/manifest.txt
@@ -79,6 +79,7 @@ test-simd-clone                # diff=skip
 
 [section] Stress tests
 test-stress                    # diff=skip
+test-mprotect-mt               # diff=skip
 
 [section] Negative / error-path tests
 test-negative                  # diff=skip
diff --git a/tests/test-mprotect-mt.c b/tests/test-mprotect-mt.c
new file mode 100644
index 0000000..ae23ed7
--- /dev/null
+++ b/tests/test-mprotect-mt.c
@@ -0,0 +1,651 @@
+/* Multi-vCPU concurrent mprotect stress
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Two scenarios run back-to-back to surface stale-TLB / mprotect-TLBI
+ * regressions across vCPUs:
+ *
+ *   1. No-op false-positive stress. A toggler thread repeatedly mprotects a
+ *      shared page to its existing perms (RW -> RW). Four reader threads do
+ *      direct EL0 writes to the page in a tight loop. Validates that the
+ *      false-positive elimination in guest_update_perms /
+ *      guest_invalidate_ptes does not lose write visibility when the
+ *      requested perms already match the live PTE.
+ *
+ *   2. R <-> RW alternation via syscall write path. A toggler flips perms
+ *      while reader threads call read(/dev/urandom, page, n). The kernel
+ *      page-walks before touching the buffer, so any stale-TLB-induced
+ *      anomaly surfaces as an unexpected return value (anything other than
+ *      n or -EFAULT). The VM crashing mid-run -- the failure mode the
+ *      bounded-retry hardening item in TODO.md is gated on -- is also
+ *      caught here because the test driver wraps every run in a timeout.
+ *
+ * The test does not try to PROVE the cross-vCPU race window absent. A
+ * passing run is evidence the bounded-retry hardening lacks a concrete
+ * reproducer today; a hard crash or accounting mismatch would supply one.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "test-harness.h"
+
+int passes = 0, fails = 0;
+
+#define PAGE_SIZE 4096
+#define READER_THREADS 4
+#define NOOP_ITERS 50000
+#define ALT_READS 5000
+#define ALT_TOGGLE_ITERS 5000
+
+static atomic_int g_running;
+static atomic_uint_least64_t g_writes;
+static atomic_uint_least64_t g_mismatches;
+static atomic_uint_least64_t g_success;
+static atomic_uint_least64_t g_efault;
+static atomic_uint_least64_t g_other;
+
+struct noop_ctx { /* per-thread arguments for noop_reader */
+    volatile uint32_t *page; /* shared page, indexed as uint32_t slots */
+    uint32_t tag; /* this thread's slot index; also bits [31:16] of values */
+    int iters; /* upper bound on write/read-back iterations */
+};
+
+static void *noop_reader(void *arg) /* writes its slot, then reads it back */
+{
+    struct noop_ctx *ctx = arg;
+    for (int i = 0; i < ctx->iters && atomic_load(&g_running); i++) {
+        uint32_t v = (ctx->tag << 16) | (uint32_t) (i & 0xFFFF);
+        ctx->page[ctx->tag] = v;
+        atomic_fetch_add_explicit(&g_writes, 1, memory_order_relaxed);
+        uint32_t back = ctx->page[ctx->tag];
+        if (back != v) {
+            /* Each thread stores only to page[tag] (values carry the tag in
+             * bits [31:16]), so reading back anything else is a coherence bug.
+             */
+            atomic_fetch_add_explicit(&g_mismatches, 1, memory_order_relaxed);
+        }
+    }
+    return NULL; /* result unused; outcome is in g_writes / g_mismatches */
+}
+
+static void *noop_toggler(void *arg) /* pthread entry; arg: the shared page */
+{
+    volatile uint32_t *page = arg;
+    while (atomic_load(&g_running)) { /* re-apply RW until told to stop */
+        if (mprotect((void *) page, PAGE_SIZE, PROT_READ | PROT_WRITE) != 0) {
+            atomic_fetch_add_explicit(&g_mismatches, 1, memory_order_relaxed);
+            return NULL; /* failure is reported through g_mismatches */
+        }
+    }
+    return NULL;
+}
+
+static void test_noop_mprotect_stress(void)
+{
+    TEST("no-op mprotect false-positive stress");
+    /* One private anonymous RW page, shared by every thread spawned below. */
+    void *p = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (p == MAP_FAILED) {
+        FAIL("mmap");
+        return;
+    }
+    memset(p, 0, PAGE_SIZE);
+    /* Reset the shared counters and raise the run flag before spawning. */
+    atomic_store(&g_writes, 0);
+    atomic_store(&g_mismatches, 0);
+    atomic_store(&g_running, 1);
+    /* Reader i owns slot i; on a spawn failure, stop and join the started. */
+    pthread_t readers[READER_THREADS];
+    struct noop_ctx ctxs[READER_THREADS];
+    for (int i = 0; i < READER_THREADS; i++) {
+        ctxs[i].page = p;
+        ctxs[i].tag = (uint32_t) i;
+        ctxs[i].iters = NOOP_ITERS;
+        if (pthread_create(&readers[i], NULL, noop_reader, &ctxs[i]) != 0) {
+            atomic_store(&g_running, 0);
+            for (int j = 0; j < i; j++)
+                pthread_join(readers[j], NULL);
+            munmap(p, PAGE_SIZE);
+            FAIL("pthread_create reader");
+            return;
+        }
+    }
+    /* The toggler is created after the readers; same unwind on failure. */
+    pthread_t toggler;
+    if (pthread_create(&toggler, NULL, noop_toggler, p) != 0) {
+        atomic_store(&g_running, 0);
+        for (int i = 0; i < READER_THREADS; i++)
+            pthread_join(readers[i], NULL);
+        munmap(p, PAGE_SIZE);
+        FAIL("pthread_create toggler");
+        return;
+    }
+    /* Readers end after NOOP_ITERS; the toggler only stops on the flag. */
+    for (int i = 0; i < READER_THREADS; i++)
+        pthread_join(readers[i], NULL);
+    atomic_store(&g_running, 0);
+    pthread_join(toggler, NULL);
+    /* Snapshot the counters only after every thread has been joined. */
+    uint64_t writes = atomic_load(&g_writes);
+    uint64_t mismatches = atomic_load(&g_mismatches);
+    munmap(p, PAGE_SIZE);
+    /* Any mismatch (or mprotect failure), or zero writes, fails the case. */
+    if (mismatches != 0 || writes == 0) {
+        char msg[96];
+        snprintf(msg, sizeof(msg), "writes=%llu mismatches=%llu",
+                 (unsigned long long) writes, (unsigned long long) mismatches);
+        FAIL(msg);
+        return;
+    }
+    PASS();
+}
+
+struct alt_ctx { /* per-thread arguments for alt_reader */
+    void *page; /* destination buffer handed to read() */
+    int fd; /* descriptor the reader pulls bytes from */
+    int iters; /* upper bound on read() attempts */
+};
+
+static void *alt_reader(void *arg) /* pthread entry; arg: struct alt_ctx * */
+{
+    struct alt_ctx *ctx = arg;
+    char *p = ctx->page;
+    for (int i = 0; i < ctx->iters && atomic_load(&g_running); i++) {
+        errno = 0;
+        ssize_t r = read(ctx->fd, p, 64);
+        if (r == 64) { /* complete 64-byte read */
+            atomic_fetch_add_explicit(&g_success, 1, memory_order_relaxed);
+        } else if (r < 0 && errno == EFAULT) { /* the other expected result */
+            atomic_fetch_add_explicit(&g_efault, 1, memory_order_relaxed);
+        } else { /* short read or another errno: counted as an anomaly */
+            atomic_fetch_add_explicit(&g_other, 1, memory_order_relaxed);
+        }
+    }
+    return NULL;
+}
+
+static void *alt_toggler(void *arg) /* pthread entry; arg: the shared page */
+{
+    void *page = arg;
+    int local_iters = ALT_TOGGLE_ITERS; /* one iteration = R, then RW */
+    while (atomic_load(&g_running) && local_iters-- > 0) {
+        if (mprotect(page, PAGE_SIZE, PROT_READ) != 0) { /* R: no write */
+            atomic_fetch_add_explicit(&g_other, 1, memory_order_relaxed);
+            return NULL; /* failure is reported through g_other */
+        }
+        if (mprotect(page, PAGE_SIZE, PROT_READ | PROT_WRITE) != 0) { /* RW */
+            atomic_fetch_add_explicit(&g_other, 1, memory_order_relaxed);
+            return NULL;
+        }
+    }
+    return NULL; /* every completed iteration leaves the page RW */
+}
+
+/* One toggler flips a single page R<->RW while READER_THREADS threads read()
+ * from /dev/urandom into it. The test fails if g_other is non-zero (a read
+ * returned something other than 64 bytes or EFAULT, or the toggler's
+ * mprotect failed) or if the tallies do not sum to
+ * READER_THREADS * ALT_READS. */
+static void test_alternating_mprotect_stress(void)
+{
+    TEST("R<->RW mprotect stress (syscall reader)");
+
+    int rnd = open("/dev/urandom", O_RDONLY);
+    if (rnd < 0) {
+        FAIL("open /dev/urandom");
+        return;
+    }
+    void *page = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
+                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (page == MAP_FAILED) {
+        close(rnd);
+        FAIL("mmap");
+        return;
+    }
+    memset(page, 0, PAGE_SIZE);
+
+    /* Reset the shared tallies, then raise the run flag last. */
+    atomic_store(&g_success, 0);
+    atomic_store(&g_efault, 0);
+    atomic_store(&g_other, 0);
+    atomic_store(&g_running, 1);
+
+    pthread_t readers[READER_THREADS];
+    struct alt_ctx ctxs[READER_THREADS];
+    pthread_t toggler;
+    const char *spawn_err = NULL;
+    int started = 0;
+    while (started < READER_THREADS) {
+        struct alt_ctx *c = &ctxs[started];
+        c->page = page;
+        c->fd = rnd;
+        c->iters = ALT_READS;
+        if (pthread_create(&readers[started], NULL, alt_reader, c) != 0) {
+            spawn_err = "pthread_create reader";
+            break;
+        }
+        started++;
+    }
+    if (!spawn_err && pthread_create(&toggler, NULL, alt_toggler, page) != 0)
+        spawn_err = "pthread_create toggler";
+    if (spawn_err) {
+        /* Stop and reap whatever was started before reporting. */
+        atomic_store(&g_running, 0);
+        for (int j = 0; j < started; j++)
+            pthread_join(readers[j], NULL);
+        munmap(page, PAGE_SIZE);
+        close(rnd);
+        FAIL(spawn_err);
+        return;
+    }
+
+    /* Readers run to completion; only then is the toggler told to stop. */
+    for (int j = 0; j < READER_THREADS; j++)
+        pthread_join(readers[j], NULL);
+    atomic_store(&g_running, 0);
+    pthread_join(toggler, NULL);
+
+    uint64_t ok_cnt = atomic_load(&g_success);
+    uint64_t efault_cnt = atomic_load(&g_efault);
+    uint64_t other_cnt = atomic_load(&g_other);
+    uint64_t seen = ok_cnt + efault_cnt + other_cnt;
+    uint64_t want = (uint64_t) READER_THREADS * (uint64_t) ALT_READS;
+
+    /* Put the page back to RW so teardown starts from a known state. */
+    mprotect(page, PAGE_SIZE, PROT_READ | PROT_WRITE);
+    munmap(page, PAGE_SIZE);
+    close(rnd);
+
+    char msg[128];
+    if (other_cnt != 0) {
+        snprintf(msg, sizeof(msg),
+                 "unexpected read returns: ok=%llu efault=%llu other=%llu",
+                 (unsigned long long) ok_cnt, (unsigned long long) efault_cnt,
+                 (unsigned long long) other_cnt);
+        FAIL(msg);
+    } else if (seen != want) {
+        snprintf(msg, sizeof(msg),
+                 "missing iterations: total=%llu expected=%llu",
+                 (unsigned long long) seen, (unsigned long long) want);
+        FAIL(msg);
+    } else {
+        printf("ok=%llu efault=%llu ... ", (unsigned long long) ok_cnt,
+               (unsigned long long) efault_cnt);
+        PASS();
+    }
+}
+
+/* Run 20 R<->RW mprotect cycles over an npages-long mapping whose pages
+ * were seeded with their own index in byte 0. Returns false on the first
+ * mprotect failure or byte mismatch, true when every cycle completes. */
+static bool rvae_sweep_cycles(uint8_t *base, int npages, size_t len)
+{
+    for (int cycle = 0; cycle < 20; cycle++) {
+        /* Read-only phase: every page must still hold its seed. */
+        if (mprotect(base, len, PROT_READ) != 0)
+            return false;
+        for (int pg = 0; pg < npages; pg++)
+            if (base[(size_t) pg * PAGE_SIZE] != (uint8_t) pg)
+                return false;
+        /* Writable phase: verify the seed, then stamp seed ^ cycle. */
+        if (mprotect(base, len, PROT_READ | PROT_WRITE) != 0)
+            return false;
+        for (int pg = 0; pg < npages; pg++) {
+            uint8_t *slot = &base[(size_t) pg * PAGE_SIZE];
+            if (*slot != (uint8_t) pg)
+                return false;
+            *slot = (uint8_t) (pg ^ cycle);
+        }
+        /* Read the stamp back and restore the seed for the next cycle. */
+        for (int pg = 0; pg < npages; pg++) {
+            uint8_t *slot = &base[(size_t) pg * PAGE_SIZE];
+            if (*slot != (uint8_t) (pg ^ cycle))
+                return false;
+            *slot = (uint8_t) pg;
+        }
+    }
+    return true;
+}
+
+/* Single-threaded sweep across region sizes chosen to land in each of the
+ * three TLBI accumulator branches: <=TLBI_SELECTIVE_MAX_PAGES (per-page
+ * VAE1IS), 17..64 pages (FEAT_TLBIRANGE RVAE1IS single shot) and >64 pages
+ * (broadcast VMALLE1IS). Each size is mprotect-cycled R<->RW with a full
+ * readback; a stale TLB entry or a wrong RVAE1IS NUM/SCALE encoding is
+ * expected to show up as a data mismatch or a SIGSEGV during readback. */
+static void test_rvae_boundary_sweep(void)
+{
+    /* 2 pages maps to the smallest RVAE1IS encoding (NUM=0) should it ever
+     * reach the TLBI_RANGE_LARGE path through coalescing; the selective
+     * threshold keeps it off that path today, but the case pins the
+     * encoding contract. The other sizes sit on either side of the
+     * selective / RVAE1IS / broadcast boundaries. */
+    static const int sizes[] = {2, 16, 17, 32, 63, 64, 65, 128};
+    const size_t count = sizeof(sizes) / sizeof(sizes[0]);
+    for (size_t idx = 0; idx < count; idx++) {
+        const int npages = sizes[idx];
+        char label[64];
+        snprintf(label, sizeof(label), "RVAE1IS boundary sweep (%d pages)",
+                 npages);
+        TEST(label);
+
+        const size_t len = (size_t) npages * PAGE_SIZE;
+        uint8_t *map = mmap(NULL, len, PROT_READ | PROT_WRITE,
+                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+        if (map == MAP_FAILED) {
+            FAIL("mmap");
+            continue;
+        }
+        /* Seed byte 0 of every page with its index before cycling. */
+        for (int pg = 0; pg < npages; pg++)
+            map[(size_t) pg * PAGE_SIZE] = (uint8_t) pg;
+
+        const bool ok = rvae_sweep_cycles(map, npages, len);
+        munmap(map, len);
+        if (!ok)
+            FAIL("readback or mprotect failed");
+        else
+            PASS();
+    }
+}
+
+/* Multi-vCPU variant of the alternating R<->RW test on a multi-page region
+ * (main drives 17, 32 and 64 pages) so the toggler hits the TLBI_RANGE_LARGE
+ * path (RVAE1IS) instead of the single-page selective TLBI. Inner-shareable
+ * RVAE1IS must invalidate the sibling vCPU TLBs; if it doesn't, the readers
+ * see stale entries and the test reports a bad read result or a VM crash. */
+struct rvae_toggler_arg {
+    void *page;  /* start of the region rvae_mt_toggler passes to mprotect */
+    size_t size; /* length in bytes of that region */
+};
+
+/* Toggler thread: flip the whole region between PROT_READ and
+ * PROT_READ|PROT_WRITE, at most ALT_TOGGLE_ITERS round trips, stopping
+ * early once g_running clears. arg points at a struct rvae_toggler_arg.
+ * Always returns NULL. */
+static void *rvae_mt_toggler(void *arg)
+{
+    const struct rvae_toggler_arg *rgn = arg;
+    int remaining = ALT_TOGGLE_ITERS;
+    while (atomic_load(&g_running) && remaining-- > 0) {
+        /* Either mprotect failing is tallied in g_other and ends the thread. */
+        if (mprotect(rgn->page, rgn->size, PROT_READ) != 0 ||
+            mprotect(rgn->page, rgn->size, PROT_READ | PROT_WRITE) != 0) {
+            atomic_fetch_add_explicit(&g_other, 1, memory_order_relaxed);
+            break;
+        }
+    }
+    return NULL;
+}
+
+/* Multi-vCPU R<->RW stress over an npages-long region.
+ *
+ * One toggler thread (rvae_mt_toggler) mprotects the whole region while
+ * READER_THREADS threads (alt_reader) read() 64 bytes from /dev/urandom
+ * into it. Reader targets are spread evenly from the first page to the
+ * last, so the tail of the invalidated range is observed as well as its
+ * head. The test fails when g_other is non-zero (a read returned something
+ * other than 64 bytes or EFAULT, or the toggler's mprotect failed) or when
+ * the tallies do not add up to READER_THREADS * ALT_READS.
+ *
+ * npages: region length in pages; the label prints ceil(npages/2) - 1 as
+ * the NUM value expected for that length. */
+static void test_rvae_multi_vcpu_stress(int npages)
+{
+    char label[64];
+    snprintf(label, sizeof(label), "RVAE1IS multi-vCPU %d-page stress (NUM=%d)",
+             npages, ((npages + 1) / 2) - 1);
+    TEST(label);
+
+    int fd = open("/dev/urandom", O_RDONLY);
+    if (fd < 0) {
+        FAIL("open /dev/urandom");
+        return;
+    }
+    size_t sz = (size_t) npages * PAGE_SIZE;
+    void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (p == MAP_FAILED) {
+        close(fd);
+        FAIL("mmap");
+        return;
+    }
+    memset(p, 0, sz);
+
+    atomic_store(&g_success, 0);
+    atomic_store(&g_efault, 0);
+    atomic_store(&g_other, 0);
+    atomic_store(&g_running, 1);
+
+    /* Number of equal steps between the first and last reader's page. A
+     * lone reader has no steps; use 1 so the division below stays defined
+     * and that reader lands on page 0. */
+    const size_t gaps =
+        (READER_THREADS) > 1 ? (size_t) (READER_THREADS) - 1 : 1;
+
+    pthread_t readers[READER_THREADS];
+    struct alt_ctx ctxs[READER_THREADS];
+    for (int i = 0; i < READER_THREADS; i++) {
+        /* Reader 0 targets page 0, the last reader the final page, and the
+         * rest are spaced evenly in between. */
+        size_t pg = (size_t) i * (size_t) (npages - 1) / gaps;
+        ctxs[i].page = (char *) p + pg * PAGE_SIZE;
+        ctxs[i].fd = fd;
+        ctxs[i].iters = ALT_READS;
+        if (pthread_create(&readers[i], NULL, alt_reader, &ctxs[i]) != 0) {
+            atomic_store(&g_running, 0);
+            for (int j = 0; j < i; j++)
+                pthread_join(readers[j], NULL);
+            munmap(p, sz);
+            close(fd);
+            FAIL("pthread_create reader");
+            return;
+        }
+    }
+
+    pthread_t toggler;
+    struct rvae_toggler_arg targ = {p, sz};
+    if (pthread_create(&toggler, NULL, rvae_mt_toggler, &targ) != 0) {
+        atomic_store(&g_running, 0);
+        for (int i = 0; i < READER_THREADS; i++)
+            pthread_join(readers[i], NULL);
+        munmap(p, sz);
+        close(fd);
+        FAIL("pthread_create toggler");
+        return;
+    }
+
+    /* Readers run to completion; only then is the toggler told to stop. */
+    for (int i = 0; i < READER_THREADS; i++)
+        pthread_join(readers[i], NULL);
+    atomic_store(&g_running, 0);
+    pthread_join(toggler, NULL);
+
+    uint64_t s = atomic_load(&g_success);
+    uint64_t e = atomic_load(&g_efault);
+    uint64_t o = atomic_load(&g_other);
+    uint64_t total = s + e + o;
+    uint64_t expected = (uint64_t) READER_THREADS * (uint64_t) ALT_READS;
+
+    /* Restore RW before unmapping so teardown starts from a known state. */
+    mprotect(p, sz, PROT_READ | PROT_WRITE);
+    munmap(p, sz);
+    close(fd);
+
+    if (o != 0) {
+        char msg[128];
+        snprintf(msg, sizeof(msg),
+                 "unexpected read returns: ok=%llu efault=%llu other=%llu",
+                 (unsigned long long) s, (unsigned long long) e,
+                 (unsigned long long) o);
+        FAIL(msg);
+        return;
+    }
+    if (total != expected) {
+        char msg[128];
+        snprintf(msg, sizeof(msg),
+                 "missing iterations: total=%llu expected=%llu",
+                 (unsigned long long) total, (unsigned long long) expected);
+        FAIL(msg);
+        return;
+    }
+    printf("ok=%llu efault=%llu ... ", (unsigned long long) s,
+           (unsigned long long) e);
+    PASS();
+}
+
+/* 32-page mprotect cycle placed so that it straddles a 2 MiB-aligned
+ * address, 16 pages on either side. The intent is to force
+ * guest_split_block on both 2 MiB blocks the window crosses, exercising
+ * the split-then-tlbi-range-large path deterministically; the ordinary
+ * boundary sweep only reaches it by chance, depending on mmap placement. */
+static void test_rvae_2mib_straddle(void)
+{
+    TEST("RVAE1IS 2 MiB block-straddle cycle");
+
+    enum { HALF = 16, WINDOW = 2 * HALF, CYCLES = 20 };
+    const size_t blk = 2 * 1024 * 1024;
+    /* The boundary picked below lies less than 2 MiB + 16 pages above the
+     * mapping base, so 2 MiB + 32 pages would already be enough; the
+     * 8 MiB + 64 pages reserved here leaves ample slack. */
+    const size_t span = 4 * blk + 64 * PAGE_SIZE;
+    uint8_t *map = mmap(NULL, span, PROT_READ | PROT_WRITE,
+                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (map == MAP_FAILED) {
+        FAIL("mmap");
+        return;
+    }
+
+    /* Round the base up to a 2 MiB multiple; when fewer than 16 pages fit
+     * below that address, step to the next multiple instead. */
+    const uintptr_t lo = (uintptr_t) map;
+    uintptr_t edge = (lo + blk - 1) & ~(uintptr_t) (blk - 1);
+    if (edge - lo < HALF * PAGE_SIZE)
+        edge += blk;
+    if (edge + HALF * PAGE_SIZE > lo + span) {
+        munmap(map, span);
+        FAIL("boundary not addressable inside region");
+        return;
+    }
+    uint8_t *win = (uint8_t *) (edge - HALF * PAGE_SIZE);
+    const size_t win_len = WINDOW * PAGE_SIZE;
+
+    /* Seed byte 0 of every page in the window with its index. */
+    for (size_t pg = 0; pg < WINDOW; pg++)
+        win[pg * PAGE_SIZE] = (uint8_t) pg;
+
+    bool ok = true;
+    for (int cycle = 0; ok && cycle < CYCLES; cycle++) {
+        size_t pg = 0;
+        ok = false; /* stays false unless the whole cycle completes */
+
+        /* Read-only phase: every page must still hold its seed. */
+        if (mprotect(win, win_len, PROT_READ) != 0)
+            break;
+        while (pg < WINDOW && win[pg * PAGE_SIZE] == (uint8_t) pg)
+            pg++;
+        if (pg != WINDOW)
+            break;
+
+        /* Writable phase: verify the seed, then stamp seed ^ cycle. */
+        if (mprotect(win, win_len, PROT_READ | PROT_WRITE) != 0)
+            break;
+        for (pg = 0; pg < WINDOW; pg++) {
+            uint8_t *slot = &win[pg * PAGE_SIZE];
+            if (*slot != (uint8_t) pg)
+                break;
+            *slot = (uint8_t) ((unsigned) pg ^ (unsigned) cycle);
+        }
+        if (pg != WINDOW)
+            break;
+
+        /* Read the stamp back and restore the seed for the next cycle. */
+        for (pg = 0; pg < WINDOW; pg++) {
+            uint8_t *slot = &win[pg * PAGE_SIZE];
+            if (*slot != (uint8_t) ((unsigned) pg ^ (unsigned) cycle))
+                break;
+            *slot = (uint8_t) pg;
+        }
+        ok = (pg == WINDOW);
+    }
+    munmap(map, span);
+    if (!ok)
+        FAIL("straddle readback or mprotect failed");
+    else
+        PASS();
+}
+
+/* R<->RX cycle on a 32-page region. While the region is RW, every page
+ * receives a `mov w0, #imm; ret` stub whose imm is unique to the cycle;
+ * the region is then mprotected to RX and each stub is called. The stub
+ * must return the imm just written. If the X11 I-cache hint were dropped
+ * from the TLBI_RANGE_LARGE path, a call would run instructions cached
+ * from an earlier cycle and return the wrong imm.
+ *
+ * 32 pages exceeds TLBI_SELECTIVE_MAX_PAGES = 16, so the RVAE1IS path is
+ * the one exercised; with PROT_EXEC the helper marks icache_flush=1 and
+ * the shim's tlbi_range_large branch runs IC IALLU. No explicit cache
+ * maintenance is done here on purpose: the mprotect path is what is
+ * under test. */
+static void test_rvae_icache_stress(void)
+{
+    TEST("RVAE1IS R<->RX I-cache hint coverage");
+
+    typedef uint32_t (*stub_fn)(void);
+    enum { NPAGES = 32, CYCLES = 16 };
+    const uint32_t ret_insn = 0xD65F03C0u; /* RET (x30) */
+    const size_t len = (size_t) NPAGES * PAGE_SIZE;
+    uint8_t *code = mmap(NULL, len, PROT_READ | PROT_WRITE,
+                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (code == MAP_FAILED) {
+        FAIL("mmap");
+        return;
+    }
+
+    bool ok = true;
+    int cycle = 0;
+    while (ok && cycle < CYCLES) {
+        /* imm differs every cycle (cycle + 1, masked to 12 bits) so a stale
+         * fetch shows up as a value mismatch. MOVZ w0 carries its 16-bit
+         * literal in bits [20:5]: 0x52800000 | (imm << 5). */
+        const uint32_t imm = (uint32_t) (cycle + 1) & 0xFFFu;
+        const uint32_t mov_insn = 0x52800000u | (imm << 5);
+
+        for (size_t pg = 0; pg < NPAGES; pg++) {
+            uint32_t *slot = (uint32_t *) (code + pg * PAGE_SIZE);
+            slot[0] = mov_insn;
+            slot[1] = ret_insn;
+        }
+
+        if (mprotect(code, len, PROT_READ | PROT_EXEC) != 0) {
+            ok = false;
+            break;
+        }
+
+        /* Call each stub; stop at the first one returning a different imm. */
+        for (size_t pg = 0; ok && pg < NPAGES; pg++) {
+            stub_fn stub = (stub_fn) (code + pg * PAGE_SIZE);
+            ok = (stub() == imm);
+        }
+
+        /* Back to RW for the next cycle's writes, even after a mismatch. */
+        if (mprotect(code, len, PROT_READ | PROT_WRITE) != 0)
+            ok = false;
+        cycle++;
+    }
+    munmap(code, len);
+
+    if (!ok)
+        FAIL("I-cache content mismatch or mprotect failure");
+    else
+        PASS();
+}
+
+/* Entry point: run every test in a fixed order, print the summary, and
+ * exit with status 1 if any test failed, 0 otherwise. */
+int main(void)
+{
+    printf("test-mprotect-mt: multi-vCPU mprotect stress\n");
+
+    static void (*const plain_tests[])(void) = {
+        test_noop_mprotect_stress,
+        test_alternating_mprotect_stress,
+        test_rvae_boundary_sweep,
+        test_rvae_2mib_straddle,
+        test_rvae_icache_stress,
+    };
+    for (size_t t = 0; t < sizeof(plain_tests) / sizeof(plain_tests[0]); t++)
+        plain_tests[t]();
+
+    /* Drive the RVAE1IS NUM encoding across its boundaries under contention:
+     * 17 pages -> NUM=8, 32 -> NUM=15 (mid), 64 -> NUM=31 (max). */
+    static const int contended_pages[] = {17, 32, 64};
+    for (size_t t = 0;
+         t < sizeof(contended_pages) / sizeof(contended_pages[0]); t++)
+        test_rvae_multi_vcpu_stress(contended_pages[t]);
+
+    SUMMARY("test-mprotect-mt");
+    if (fails > 0)
+        return 1;
+    return 0;
+}
diff --git a/tests/test-tlbi-encoder-host.c b/tests/test-tlbi-encoder-host.c
new file mode 100644
index 0000000..13e1377
--- /dev/null
+++ b/tests/test-tlbi-encoder-host.c
@@ -0,0 +1,146 @@
+/* Native-host unit test for the TLBI RVAE1IS operand encoder.
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The integration tests in tests/test-mprotect-mt.c exercise the operand
+ * end-to-end inside a VM, but they still pass on M4 even if the encoder
+ * drops the TG=01 bit (the M-series PE silently falls back to
+ * TCR_EL1.TGn). This host-side test decodes the operand field by field and
+ * asserts every field matches the ARM ARM DDI 0487J.a D8.7.6 layout, so a
+ * future regression in the encoder surfaces as a build / CI failure
+ * regardless of the running PE's tolerance for reserved encodings.
+ *
+ * Native macOS binary; no HVF entitlement needed (the encoder is pure C).
+ * Symbols pulled from core/guest.h that the encoder does not actually
+ * reference still need to link, so a stub cpu_tlbi_req / g_tlbi_range_*
+ * definition lives below.
+ */
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "core/guest.h"
+
+/* Stubs for the extern symbols guest.h declares. The encoder under test
+ * does not read them, but the linker needs definitions. */
+_Thread_local tlbi_request_t cpu_tlbi_req;
+bool g_tlbi_range_supported;
+
+/* Running tallies of individual checks; bumped by check_field() and by the
+ * direct TG-bit checks in main(), and reported in main()'s summary line. */
+static int passes;
+static int fails;
+
+/* Compare one decoded field against its expected value.
+ *
+ * label:  text identifying the field in the failure message
+ * got:    value decoded from the operand
+ * expect: value the field should have
+ *
+ * Bumps passes on a match; otherwise bumps fails and prints both values
+ * to stderr. */
+static void check_field(const char *label, uint64_t got, uint64_t expect)
+{
+    if (got != expect) {
+        fprintf(stderr, "FAIL %s: got 0x%llx, expected 0x%llx\n", label,
+                (unsigned long long) got, (unsigned long long) expect);
+        fails++;
+        return;
+    }
+    passes++;
+}
+
+/* Encode (start_va, pages) with tlbi_rvae1is_operand and check every field
+ * of the result, following the ARM ARM D8.7.6 layout:
+ *
+ *   [36:0]  BaseADDR  must equal start_va >> 12, masked to 37 bits
+ *   [38:37] TTL       must be 0
+ *   [43:39] NUM       must equal expect_num (ceil(pages/2) - 1 at SCALE=0)
+ *   [45:44] SCALE     must be 0
+ *   [47:46] TG        must be 01 (4 KiB granule)
+ *   [63:48] ASID      must be 0
+ *
+ * Each field is reported through check_field, so one call contributes six
+ * pass/fail tallies. */
+static void verify_operand(uint64_t start_va,
+                           uint16_t pages,
+                           uint64_t expect_num)
+{
+    const uint64_t op = tlbi_rvae1is_operand(start_va, pages);
+    const uint64_t base_mask = (1ULL << 37) - 1;
+    const unsigned long long va = (unsigned long long) start_va;
+    const unsigned npg = (unsigned) pages;
+    char label[64];
+
+    snprintf(label, sizeof(label), "BaseADDR (start=0x%llx)", va);
+    check_field(label, op & base_mask, (start_va >> 12) & base_mask);
+
+    snprintf(label, sizeof(label), "TTL (start=0x%llx)", va);
+    check_field(label, (op >> 37) & 0x3, 0);
+
+    snprintf(label, sizeof(label), "NUM (pages=%u)", npg);
+    check_field(label, (op >> 39) & 0x1F, expect_num);
+
+    snprintf(label, sizeof(label), "SCALE (pages=%u)", npg);
+    check_field(label, (op >> 44) & 0x3, 0);
+
+    snprintf(label, sizeof(label), "TG (start=0x%llx)", va);
+    check_field(label, (op >> 46) & 0x3, 1); /* 4 KiB granule */
+
+    snprintf(label, sizeof(label), "ASID (start=0x%llx)", va);
+    check_field(label, (op >> 48) & 0xFFFF, 0);
+}
+
+/* Entry point: run verify_operand over a fixed table of (VA, page count,
+ * expected NUM) cases, then inspect TG bits 46 and 47 directly. Prints a
+ * summary line and exits with status 1 if any check failed, 0 otherwise. */
+int main(void)
+{
+    printf("test-tlbi-encoder-host: RVAE1IS operand bit-field verification\n");
+
+    static const struct {
+        uint64_t va;
+        uint16_t pages;
+        uint64_t num;
+    } cases[] = {
+        /* SCALE=0 NUM table: NUM = ceil(pages/2) - 1. */
+        {0x10000000ULL, 2, 0},   /* covers 2 */
+        {0x10000000ULL, 3, 1},   /* covers 4, over-invalidates by 1 */
+        {0x10000000ULL, 16, 7},  /* covers 16, exact */
+        {0x10000000ULL, 17, 8},  /* covers 18 */
+        {0x10000000ULL, 32, 15}, /* covers 32 */
+        {0x10000000ULL, 63, 31}, /* covers 64 */
+        {0x10000000ULL, 64, 31}, /* covers 64 */
+
+        /* Boundary VAs. 4 KiB-aligned, low-VA, MMAP_BASE (8 GiB), high-VA
+         * just below the 48-bit BaseADDR truncation point. */
+        {0x00000000ULL, 32, 15},         /* zero base */
+        {0x200000000ULL, 32, 15},        /* MMAP_BASE */
+        {0x800000000000ULL, 32, 15},     /* Rosetta image */
+        {0x0000FFFFF0000000ULL, 32, 15}, /* KBUF_USER_VA */
+
+        /* Pathological inputs the clamp must catch. */
+        {0x10000000ULL, 0, 0}, /* clamped to 2 -> NUM 0 */
+        {0x10000000ULL, 1, 0}, /* clamped to 2 -> NUM 0 (callers never reach) */
+        {0x10000000ULL, UINT16_MAX, 31}, /* NUM saturates at 31 */
+    };
+    for (size_t c = 0; c < sizeof(cases) / sizeof(cases[0]); c++)
+        verify_operand(cases[c].va, cases[c].pages, cases[c].num);
+
+    /* The TG field is the architectural lynchpin: if the encoder ever drops
+     * it, the integration tests on Apple Silicon would still pass. Inspect
+     * bits 46 and 47 directly so a regression to TG=00 fails here at once. */
+    const uint64_t op = tlbi_rvae1is_operand(0x10000000ULL, 32);
+    const bool bit46 = (op & (1ULL << 46)) != 0;
+    const bool bit47 = (op & (1ULL << 47)) != 0;
+    if (!bit46) {
+        fails++;
+        fprintf(stderr, "FAIL TG bit 46 must be set (4 KiB granule, TG=01)\n");
+    } else {
+        passes++;
+    }
+    if (!bit47) {
+        passes++;
+    } else {
+        fails++;
+        fprintf(stderr,
+                "FAIL TG bit 47 must be clear (TG=01 has bit 47 = 0)\n");
+    }
+
+    printf("\ntest-tlbi-encoder-host: %d passed, %d failed%s\n", passes, fails,
+           fails == 0 ? " - PASS" : " - FAIL");
+    if (fails > 0)
+        return 1;
+    return 0;
+}