diff --git a/Makefile b/Makefile index 2f69e94..2cafed0 100644 --- a/Makefile +++ b/Makefile @@ -131,6 +131,16 @@ $(BUILD_DIR)/test-multi-vcpu: $(BUILD_DIR)/test-multi-vcpu.o | $(BUILD_DIR) $(BUILD_DIR)/test-rwx: $(BUILD_DIR)/test-rwx.o | $(BUILD_DIR) $(call link-and-sign,$@,$<) +## Build the TLBI RVAE1IS operand encoder unit test (native macOS binary). +# Pure C; no HVF entitlement needed. Verifies the architectural bit-layout +# of tlbi_rvae1is_operand so a future regression that drops TG=01 (which +# the Apple Silicon integration tests would silently tolerate) fails CI +# immediately. +$(BUILD_DIR)/test-tlbi-encoder-host: $(BUILD_DIR)/test-tlbi-encoder-host.o \ + | $(BUILD_DIR) + @echo " LD $@" + $(Q)$(CC) $(CFLAGS) -o $@ $^ + ## Build the proctitle argv-tail regression test (native macOS binary) # Links against the project-built proctitle.o so the exact in-tree code is # exercised; no HVF entitlement is needed because the test only manipulates @@ -167,6 +177,12 @@ $(BUILD_DIR)/test-shim-cred-race: tests/test-shim-cred-race.c | $(BUILD_DIR) @echo " CROSS $< (with -lpthread)" $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread +# test-mprotect-mt stresses multi-vCPU mprotect under concurrent reader +# threads to surface stale-TLB regressions. +$(BUILD_DIR)/test-mprotect-mt: tests/test-mprotect-mt.c | $(BUILD_DIR) + @echo " CROSS $< (with -lpthread)" + $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread + # test-shim-urandom-smp spawns N pthreads racing on a shared FD_URANDOM # slot to exercise the shim's LDXR/STXR head-advance under contention. 
$(BUILD_DIR)/test-shim-urandom-smp: tests/test-shim-urandom-smp.c | $(BUILD_DIR) diff --git a/mk/config.mk b/mk/config.mk index 232da91..7270e28 100644 --- a/mk/config.mk +++ b/mk/config.mk @@ -15,7 +15,8 @@ ifeq ($(origin GUEST_TEST_BINARIES), undefined) endif # Exclude native macOS test files from cross-compilation -NATIVE_TESTS := tests/test-multi-vcpu.c tests/test-rwx.c +NATIVE_TESTS := tests/test-multi-vcpu.c tests/test-rwx.c \ + tests/test-tlbi-encoder-host.c SPECIAL_TEST_SRCS := tests/test-lowbase-mem.c SPECIAL_TEST_BINS := $(BUILD_DIR)/test-lowbase-mem-200000 $(BUILD_DIR)/test-lowbase-mem-300000 diff --git a/mk/tests.mk b/mk/tests.mk index 03947be..fd412ff 100644 --- a/mk/tests.mk +++ b/mk/tests.mk @@ -35,8 +35,11 @@ define RUN_OPTIONAL_SKIP77 endef ## Run the unit test suite plus busybox applet validation -check: $(ELFUSE_BIN) $(TEST_DEPS) check-syscall-coverage +check: $(ELFUSE_BIN) $(TEST_DEPS) check-syscall-coverage \ + $(BUILD_DIR)/test-tlbi-encoder-host @bash tests/driver.sh -e $(ELFUSE_BIN) -d $(TEST_DIR) -v + @printf "\n$(BLUE)━━━ TLBI RVAE1IS encoder unit test ━━━$(RESET)\n" + @$(BUILD_DIR)/test-tlbi-encoder-host @printf "\n$(BLUE)━━━ proctitle argv-tail regression ━━━$(RESET)\n" @$(MAKE) --no-print-directory test-proctitle-host @printf "\n$(BLUE)━━━ proctitle low-stack regression ━━━$(RESET)\n" diff --git a/src/core/guest.c b/src/core/guest.c index fa2a8a6..bcbc3c2 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "core/guest.h" @@ -48,6 +49,13 @@ */ _Thread_local tlbi_request_t cpu_tlbi_req; +/* FEAT_TLBIRANGE host capability flag. Set once at bootstrap by + * guest_probe_tlbi_range and treated as read-only thereafter. Apple Silicon + * M1+ implements ARMv8.5-A which mandates FEAT_TLBIRANGE; the probe stays + * conservative and defaults to false until the flag is explicitly set so + * future ports to non-Apple aarch64 hosts inherit the safe fallback. 
*/ +bool g_tlbi_range_supported = false; + static void guest_region_clear(guest_t *g); /* Page table descriptor bits. */ @@ -202,10 +210,51 @@ static uint64_t *pt_at(const guest_t *g, uint64_t gpa) /* Public API */ +/* FEAT_TLBIRANGE probe -- runs exactly once via pthread_once. ARMv8.4 + * introduced TLBI RVAE1IS for single-shot range invalidation and + * made it mandatory. macOS does not surface a sysctl entry for + * FEAT_TLBIRANGE directly, so use FEAT_LSE2 as a proxy -- both became + * mandatory in ARMv8.4 and Apple ships them together across the entire + * M-series. A future non-Apple aarch64 host or an older ARM PE without + * FEAT_TLBIRANGE would otherwise trap the shim's `tlbi rvae1is, x9` to + * BAD_VEC; the proxy probe keeps the accumulator on the per-page VAE1IS / + * VMALLE1IS path in that case. + * + * Width-tolerant read: macOS currently exposes the boolean as a 4-byte int, + * but a future kernel could widen it to uint64_t. Read into a 64-bit slot + * and accept any non-zero answer for any length sysctl actually returned. + * + * ELFUSE_DISABLE_TLBI_RANGE=1 forces the broadcast fallback so the + * VAE1IS-only / VMALLE1IS path stays exercisable in CI on Apple Silicon -- + * otherwise the fallback is unreachable on any host where the sysctl probe + * succeeds. + * + * pthread_once gates the probe so a re-bootstrap path (sys_execve, fork + * IPC restore) cannot race a live vCPU reading the flag. The first + * guest_init wins and the result is immutable for the process lifetime. 
*/ +static pthread_once_t tlbi_range_probe_once = PTHREAD_ONCE_INIT; + +static void tlbi_range_probe_run(void) +{ + const char *disable_env = getenv("ELFUSE_DISABLE_TLBI_RANGE"); + if (disable_env && disable_env[0] && disable_env[0] != '0') { + g_tlbi_range_supported = false; + return; + } + uint64_t lse2_raw = 0; + size_t lse2_len = sizeof(lse2_raw); + g_tlbi_range_supported = + (sysctlbyname("hw.optional.arm.FEAT_LSE2", &lse2_raw, &lse2_len, NULL, + 0) == 0) && + lse2_raw != 0; +} + int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) { uint64_t t0; + pthread_once(&tlbi_range_probe_once, tlbi_range_probe_run); + memset(g, 0, sizeof(*g)); g->shm_fd = -1; g->ipa_base = GUEST_IPA_BASE; @@ -929,6 +978,10 @@ int guest_map_va_range(guest_t *g, return -1; uint64_t cur_gpa = gpa_start; + uint64_t changed_lo = UINT64_MAX, changed_hi = 0; + bool bcast = tlbi_request_is_broadcast(); + if (perms & MEM_PERM_X) + tlbi_request_mark_icache(); for (uint64_t va = va_start; va < va_end; va += BLOCK_2MIB, cur_gpa += BLOCK_2MIB) { unsigned l0_idx = (unsigned) (va / (512ULL * BLOCK_1GIB)); @@ -976,12 +1029,22 @@ int guest_map_va_range(guest_t *g, continue; } l2[l2_idx] = make_block_desc(cur_gpa, perms); + if (!bcast) { + if (va < changed_lo) + changed_lo = va; + if (va + BLOCK_2MIB > changed_hi) + changed_hi = va + BLOCK_2MIB; + } } /* The new entries are visible to the host immediately; the shim flushes - * the matching TLBs on syscall return via the per-vCPU accumulator. + * the matching TLBs on syscall return via the per-vCPU accumulator. Skip + * the request when every block was already mapped (no negative TLB + * entries can apply since the prior install already invalidated them), + * or when the accumulator already promised a broadcast. 
*/ - tlbi_request_range(va_start, va_end); + if (!bcast && changed_hi > changed_lo) + tlbi_request_range(changed_lo, changed_hi); guest_pt_gen_bump(g); return 0; } @@ -2438,6 +2501,13 @@ int guest_extend_page_tables(guest_t *g, (unsigned long long) start, (unsigned long long) end); return -1; } + /* Defensive: end is bounded by guest_size above, so the ALIGN_2MIB_UP + * below cannot wrap on any reachable input. The explicit guard documents + * the contract and matches the wrap guards in guest_invalidate_ptes / + * guest_update_perms; keeps the three sites in sync if a future caller + * lifts the guest_size cap. */ + if (end > UINT64_MAX - (BLOCK_2MIB - 1)) + return -1; uint64_t base = g->ipa_base; @@ -2445,8 +2515,18 @@ int guest_extend_page_tables(guest_t *g, uint64_t l0_gpa_off = g->ttbr0 - base; uint64_t *l0 = pt_at(g, l0_gpa_off); - /* Walk 2MiB blocks in [start, end) */ + /* Walk 2MiB blocks in [start, end). Track the smallest sub-range whose + * L2 entry actually transitioned from unmapped to mapped; blocks that + * were already valid get no new descriptor and need no TLBI + * (false-positive elimination mirrors guest_update_perms). Once the + * accumulator is already TLBI_BROADCAST, the bookkeeping is wasted + * work. + */ + if (perms & MEM_PERM_X) + tlbi_request_mark_icache(); uint64_t addr_start = ALIGN_2MIB_DOWN(start), addr_end = ALIGN_2MIB_UP(end); + uint64_t changed_lo = UINT64_MAX, changed_hi = 0; + bool bcast = tlbi_request_is_broadcast(); for (uint64_t addr = addr_start; addr < addr_end; addr += BLOCK_2MIB) { uint64_t ipa = base + addr; @@ -2492,18 +2572,33 @@ int guest_extend_page_tables(guest_t *g, unsigned l2_idx = (unsigned) ((ipa % BLOCK_1GIB) / BLOCK_2MIB); - /* Only map if not already mapped */ - if (!(l2[l2_idx] & PT_BLOCK)) { - l2[l2_idx] = make_block_desc(ipa, perms); + /* Only map if not already mapped. 
A negative TLB entry from a prior + * translation fault is possible only for VAs that were unmapped at + * the time of the fault, so the TLBI is only needed for blocks + * actually installed by this call. + */ + /* At L2 a valid descriptor is either a 2 MiB block (bits[1:0] = 01) + * or a table descriptor pointing to an L3 page table (bits[1:0] = 11). + * Both indicate the slot is already mapped at some granule, so the + * extend has nothing to install; skip without flushing. The previous + * `& PT_BLOCK` test relied on PT_BLOCK == PT_VALID == bit 0 to cover + * both cases by coincidence -- write it as an explicit PT_VALID test + * so the intent survives a future descriptor-bit renumbering. + */ + if (l2[l2_idx] & PT_VALID) + continue; + l2[l2_idx] = make_block_desc(ipa, perms); + if (!bcast) { + if (addr < changed_lo) + changed_lo = addr; + if (addr + BLOCK_2MIB > changed_hi) + changed_hi = addr + BLOCK_2MIB; } } - /* Use the page-aligned bounds the loop actually covered. Extend grows - * the mapped range; existing VAs may carry negative TLB entries from - * prior translation faults at this address, so a flush is still needed. - * Large extends will exceed the selective cap and become broadcast. - */ - tlbi_request_range(addr_start + base, addr_end + base); + /* Large extends will exceed the selective cap and become broadcast. */ + if (!bcast && changed_hi > changed_lo) + tlbi_request_range(base + changed_lo, base + changed_hi); guest_pt_gen_bump(g); return 0; } @@ -2632,8 +2727,23 @@ static uint64_t *find_l2_entry(guest_t *g, uint64_t va) } /* Split a 2MiB L2 block descriptor into 512 x 4KiB L3 page descriptors. - * The caller provides the L2 entry via find_l2_entry. - * Extracts the output IPA from the existing descriptor. + * The caller provides the L2 entry via find_l2_entry. Extracts the output + * IPA from the existing descriptor. + * + * No TLBI is issued by the split itself. 
The block-to-table transition + * preserves the output address, permissions, and attributes of every page + * in the 2 MiB range, so any cached translation from the old block + * descriptor remains semantically correct. Per ARM ARM (FEAT_BBM Level 2), + * a CPU that implements level-2 break-before-make support allows + * block <-> table changes that preserve the resulting translation in all + * other respects without a BBM sequence. Apple Silicon implements + * FEAT_BBM Level 2 across M1+; the split-heavy stress paths in tests/ + * (test-stress mprotect cycling, test-shim-urandom-toctou rapid flips, + * test-mprotect-mt R<->RW toggling, plus dynamic-linker RELRO setup) + * run cleanly. A future PE without FEAT_BBM Level 2 would need either + * a real BBM sequence here (invalidate, TLBI, write table) or an + * unconditional broadcast TLBI on every split; revisit if that ever + * surfaces a TLB conflict abort. */ static int split_l2_block(guest_t *g, uint64_t *l2_entry) { @@ -2681,9 +2791,17 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end) { uint64_t base = g->ipa_base; - /* Page-align the range */ + /* Page-align the range. The ALIGN_UP step on end could wrap to 0 for + * inputs within PAGE_SIZE-1 of UINT64_MAX, silently turning the + * invalidation into a no-op against a 0-length loop. Reject the + * pathological input rather than allow a stale mapping to survive. + */ + if (end > UINT64_MAX - (PAGE_SIZE - 1)) + return -1; start = start & ~(PAGE_SIZE - 1); end = (end + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); + if (end <= start) + return 0; for (uint64_t addr = start; addr < end;) { uint64_t *l2_entry = find_l2_entry(g, addr); @@ -2723,20 +2841,36 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end) return -1; } - /* L3 table: invalidate individual 4KiB page descriptors */ + /* L3 table: invalidate individual 4KiB page descriptors. 
Track the + * smallest sub-range whose descriptor actually transitioned from + * mapped to invalid; a page that was already 0 needs no TLBI + * (false-positive elimination mirrors the guest_update_perms path). + * Skip the per-page bookkeeping once a broadcast is already pending. + */ uint64_t l3_ipa = *l2_entry & 0xFFFFFFFFF000ULL; uint64_t *l3 = pt_at(g, l3_ipa - base); uint64_t page_start = (addr > block_start) ? addr : block_start; uint64_t page_end = (end < block_end) ? end : block_end; + uint64_t changed_lo = UINT64_MAX, changed_hi = 0; + bool bcast = tlbi_request_is_broadcast(); for (uint64_t pa = page_start; pa < page_end; pa += PAGE_SIZE) { unsigned l3_idx = (unsigned) (((base + pa) % BLOCK_2MIB) / PAGE_SIZE); - l3[l3_idx] = 0; /* Invalid descriptor */ + if (l3[l3_idx] != 0) { + l3[l3_idx] = 0; /* Invalid descriptor */ + if (!bcast) { + if (pa < changed_lo) + changed_lo = pa; + if (pa + PAGE_SIZE > changed_hi) + changed_hi = pa + PAGE_SIZE; + } + } } - tlbi_request_range(base + page_start, base + page_end); + if (!bcast && changed_hi > changed_lo) + tlbi_request_range(base + changed_lo, base + changed_hi); addr = page_end; } @@ -2748,9 +2882,23 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms) { uint64_t base = g->ipa_base; - /* Page-align the range */ + /* Page-align the range. The ALIGN_UP on end could wrap to 0 for inputs + * within PAGE_SIZE-1 of UINT64_MAX, silently degrading the call to a + * no-op against a 0-length loop. Reject the pathological input rather + * than leave stale perms in place. + */ + if (end > UINT64_MAX - (PAGE_SIZE - 1)) + return -1; start = start & ~(PAGE_SIZE - 1); end = (end + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); + if (end <= start) + return 0; + + /* New perms include exec: the shim must IC IALLU on syscall return so a + * VA that previously held NX content fetches the new instructions. The + * inverse (removing exec) leaves no new code visible. 
*/ + if (perms & MEM_PERM_X) + tlbi_request_mark_icache(); /* Aliasing-proof invariant: TTBR1 maps the kbuf RW + UXN + PXN. The same * physical pages will be dual-mapped at KBUF_USER_VA under TTBR0 by the @@ -2825,10 +2973,15 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms) /* Update pages within this 2MiB block that fall in [start, end). Track * the smallest sub-range that actually changed so the TLBI request only * covers descriptors whose value changed (false-positive elimination). + * Once the accumulator has already promoted to TLBI_BROADCAST, the + * bounding-box bookkeeping is wasted work -- the broadcast invalidates + * everything regardless -- so the loop skips the compares in that + * mode while still writing every changed descriptor. */ uint64_t page_start = (addr > block_start) ? addr : block_start; uint64_t page_end = (end < block_end) ? end : block_end; uint64_t changed_lo = UINT64_MAX, changed_hi = 0; + bool bcast = tlbi_request_is_broadcast(); for (uint64_t pa = page_start; pa < page_end; pa += PAGE_SIZE) { unsigned l3_idx = @@ -2858,14 +3011,16 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms) uint64_t new_desc = make_page_desc(page_ipa, perms); if (l3[l3_idx] != new_desc) { l3[l3_idx] = new_desc; - if (pa < changed_lo) - changed_lo = pa; - if (pa + PAGE_SIZE > changed_hi) - changed_hi = pa + PAGE_SIZE; + if (!bcast) { + if (pa < changed_lo) + changed_lo = pa; + if (pa + PAGE_SIZE > changed_hi) + changed_hi = pa + PAGE_SIZE; + } } } - if (changed_hi > changed_lo) + if (!bcast && changed_hi > changed_lo) tlbi_request_range(base + changed_lo, base + changed_hi); addr = page_end; } @@ -2908,13 +3063,19 @@ int guest_install_va_pages(guest_t *g, uint64_t base = g->ipa_base; uint64_t end = va + length; + uint64_t changed_lo = UINT64_MAX, changed_hi = 0; + bool bcast = tlbi_request_is_broadcast(); + if (perms & MEM_PERM_X) + tlbi_request_mark_icache(); /* Walk one 4 KiB page at a time. 
find_l2_entry locates the L2 slot for * each VA; split_l2_block converts an L2 block descriptor into a table * lazily so individual L3 entries can be written. The L3 entry is then * unconditionally overwritten with the requested gpa + perms, so a prior * invalidation (or a fresh split inheriting the wrong block address) - * cannot leave behind a stale or zero descriptor. + * cannot leave behind a stale or zero descriptor. Pages whose descriptor + * is already identical are no-ops for TLBI purposes; skip them. Skip the + * per-page bookkeeping once a broadcast is already pending. */ for (uint64_t v = va, p = gpa; v < end; v += PAGE_SIZE, p += PAGE_SIZE) { uint64_t *l2_entry = find_l2_entry(g, v); @@ -2931,10 +3092,20 @@ int guest_install_va_pages(guest_t *g, if (!l3) return -1; unsigned l3_idx = (unsigned) (((base + v) % BLOCK_2MIB) / PAGE_SIZE); - l3[l3_idx] = make_page_desc(base + p, perms); + uint64_t new_desc = make_page_desc(base + p, perms); + if (l3[l3_idx] != new_desc) { + l3[l3_idx] = new_desc; + if (!bcast) { + if (v < changed_lo) + changed_lo = v; + if (v + PAGE_SIZE > changed_hi) + changed_hi = v + PAGE_SIZE; + } + } } - tlbi_request_range(va, end); + if (!bcast && changed_hi > changed_lo) + tlbi_request_range(changed_lo, changed_hi); guest_pt_gen_bump(g); return 0; } diff --git a/src/core/guest.h b/src/core/guest.h index 11d05bf..9e40ae3 100644 --- a/src/core/guest.h +++ b/src/core/guest.h @@ -245,21 +245,86 @@ typedef enum { TLBI_NONE = 0, TLBI_BROADCAST = 1, TLBI_RANGE = 2, + TLBI_RANGE_LARGE = 3, /* FEAT_TLBIRANGE single-shot TLBI RVAE1IS for + * ranges that exceed TLBI_SELECTIVE_MAX_PAGES but + * stay within TLBI_RVAE_MAX_PAGES; encoded as + * X8 = 4 on the wire. */ } tlbi_kind_t; -/* Cap selective TLBI at this many 4 KiB pages. Beyond this, fall back to - * TLBI_BROADCAST: each TLBI VAE1IS broadcasts to all cores, so for large - * ranges the per-instruction issue cost outweighs the benefit of preserving - * unrelated TLB entries. 
16 pages == 64 KiB covers RELRO and other typical - * mprotect / munmap targets. +/* Cap selective per-page TLBI VAE1IS at this many 4 KiB pages. Beyond this, + * use TLBI RVAE1IS if FEAT_TLBIRANGE is available, else fall back to + * TLBI_BROADCAST: per-instruction issue cost outweighs the benefit once the + * range is large. 16 pages == 64 KiB covers RELRO and other typical mprotect + * / munmap targets. */ #define TLBI_SELECTIVE_MAX_PAGES 16 +/* Cap single-shot TLBI RVAE1IS at this many 4 KiB pages. With SCALE=0 the + * RVAE1IS operand encoding covers (NUM+1)*2 pages with NUM in [0..31], so a + * single instruction reaches 64 pages == 256 KiB. Beyond that the host would + * need SCALE=1 (NUM*64 step), which over-invalidates for the typical + * dynamic-linker RELRO / glibc-bring-up storm sizes seen in practice; stay + * at SCALE=0 for now and broadcast above 64 pages. + */ +#define TLBI_RVAE_MAX_PAGES 64 + +/* TLBI RVAE1IS operand bit-field constants. Per ARM ARM DDI 0487J.a D8.7.6 + * the operand layout is: + * bits [36:0] BaseADDR (VA[48:12] for 4 KiB granule, DS=0) + * bits [38:37] TTL (0 = any level) + * bits [43:39] NUM + * bits [45:44] SCALE + * bits [47:46] TG (00 = RESERVED, 01 = 4 KiB, 10 = 16 KiB, + * 11 = 64 KiB) + * bits [63:48] ASID + * elfuse only ever issues 4 KiB-granule TLBIs (TCR_EL1.TG0 = 4 KiB), so + * TG is hard-pinned to 01 and the corresponding bit is named here. */ +#define RVAE_OPERAND_BADDR_MASK ((1ULL << 37) - 1) +#define RVAE_OPERAND_NUM_SHIFT 39 +#define RVAE_OPERAND_TG_4KB (1ULL << 46) + +/* Pure encoder: build the TLBI RVAE1IS Xt operand from a 4 KiB-aligned VA + * and a page count in the SCALE=0 range (1..TLBI_RVAE_MAX_PAGES). Lives in + * the header as `static inline` so tlbi_request_emit_to_vcpu and any + * future caller (host-side unit tests included) compile to the same + * expression. 
NUM = ceil(pages / 2) - 1 over-invalidates odd page counts + * by exactly one page, which is a perf-only side effect (the extra + * invalidation evicts a neighbour TLB entry that the guest's next access + * reloads). pages < 2 is clamped to 2 because SCALE=0 NUM=0 means 2 + * pages (single-page callers use per-page VAE1IS; the clamp only keeps + * the encoder total). NUM saturates at 31, so a count above + * TLBI_RVAE_MAX_PAGES would under-invalidate; tlbi_request_range caps it. */ +static inline uint64_t tlbi_rvae1is_operand(uint64_t start_va, uint16_t pages) +{ + if (pages < 2) + pages = 2; + uint64_t baddr = (start_va >> 12) & RVAE_OPERAND_BADDR_MASK; + uint64_t num = ((pages + 1) / 2) - 1; + if (num > 31) + num = 31; + return baddr | (num << RVAE_OPERAND_NUM_SHIFT) | RVAE_OPERAND_TG_4KB; +} + +/* Runtime feature flag: TRUE when the host PE implements FEAT_TLBIRANGE + * (ARMv8.4+, present on every Apple Silicon M1+). Probed once at bootstrap. + * Read-only after startup so callers do not need an atomic load. */ +extern bool g_tlbi_range_supported; + typedef struct { - uint8_t kind; /* tlbi_kind_t */ - uint16_t pages; /* Page count when kind == TLBI_RANGE (1..MAX) */ - uint64_t start; /* Page-aligned VA when kind == TLBI_RANGE */ + uint8_t kind; /* tlbi_kind_t */ + uint8_t icache_flush; /* 1 = the change introduced executable content + * visible to EL0, so the shim must IC IALLU + * after the TLBI sequence. 0 = data-only + * change, skip the I-cache invalidation. */ + uint16_t pages; /* Page count for TLBI_RANGE or TLBI_RANGE_LARGE */ + uint64_t start; /* Page-aligned start VA for the same two kinds */ } tlbi_request_t; +/* Layout contract: 16 bytes (1+1+2+4 padding+8). Documents the padding and + * pins the TLS slot size so future field additions surface as a build break + * rather than silently growing the per-vCPU footprint. 
*/ +_Static_assert(sizeof(tlbi_request_t) == 16, + "tlbi_request_t must stay 16 bytes; update tlbi_request_clear " + "and the syscall epilogue if the layout changes"); /* Multi-region IPA mapping. * @@ -486,6 +551,7 @@ extern _Thread_local tlbi_request_t cpu_tlbi_req; static inline void tlbi_request_clear(void) { cpu_tlbi_req.kind = TLBI_NONE; + cpu_tlbi_req.icache_flush = 0; cpu_tlbi_req.pages = 0; cpu_tlbi_req.start = 0; } @@ -495,6 +561,72 @@ static inline void tlbi_request_broadcast(void) cpu_tlbi_req.kind = TLBI_BROADCAST; } +/* True if the accumulator is already at TLBI_BROADCAST. PT mutation helpers + * use this to skip the per-page bounding-box bookkeeping (changed_lo / + * changed_hi tracking and the final tlbi_request_range call) once a broadcast + * is already promised; the inline tlbi_request_range itself short-circuits + * for the same reason but the call-site loops still pay for the compares. + */ +static inline bool tlbi_request_is_broadcast(void) +{ + return cpu_tlbi_req.kind == TLBI_BROADCAST; +} + +/* Mark that the current syscall's PT mutation introduced executable content + * visible to EL0 (a new X mapping, or an mprotect that added MEM_PERM_X to + * a previously-NX page). The shim consults this via X11 on syscall return + * to decide whether IC IALLU is needed after the TLBI sequence. Data-only + * page-table changes (mprotect RW<->R, munmap of data, etc.) leave this + * cleared so the I-cache invalidation is skipped. + */ +static inline void tlbi_request_mark_icache(void) +{ + cpu_tlbi_req.icache_flush = 1; +} + +/* Encode the pending TLBI request into the vCPU's X8/X9/X10/X11 registers + * for the shim's post-HVC dispatch and clear the per-vCPU accumulator. + * Both the syscall HVC #5 epilogue and the HVC #11 EL0-fault handler use + * this so the same X8 wire codes (and X11 I-cache hint) drive every TLBI + * the host issues on behalf of the guest. Keeping the helper inline lets + * the call sites compile to the same switch in both files. 
+ */ +static inline void tlbi_request_emit_to_vcpu(hv_vcpu_t vcpu) +{ + switch ((tlbi_kind_t) cpu_tlbi_req.kind) { + case TLBI_BROADCAST: + hv_vcpu_set_reg(vcpu, HV_REG_X8, 1); + hv_vcpu_set_reg(vcpu, HV_REG_X11, cpu_tlbi_req.icache_flush ? 1 : 0); + break; + case TLBI_RANGE: + hv_vcpu_set_reg(vcpu, HV_REG_X8, 3); + hv_vcpu_set_reg(vcpu, HV_REG_X9, cpu_tlbi_req.start); + hv_vcpu_set_reg(vcpu, HV_REG_X10, cpu_tlbi_req.pages); + hv_vcpu_set_reg(vcpu, HV_REG_X11, cpu_tlbi_req.icache_flush ? 1 : 0); + break; + case TLBI_RANGE_LARGE: { + /* Single-shot TLBI RVAE1IS for ranges in (16..64] pages. The + * operand format and the SCALE=0 / TG=01 / ASID=0 assumptions are + * documented at tlbi_rvae1is_operand above. ASID stays 0 because + * the shim runs single-ASID (TCR_EL1.A1=0, TTBR0 ASID=0; rosetta + * does not allocate a separate ASID). If a future change + * introduces non-zero ASIDs, the helper signature and the + * tlbi_request_t accumulator both need an ASID field. */ + uint64_t operand = + tlbi_rvae1is_operand(cpu_tlbi_req.start, cpu_tlbi_req.pages); + hv_vcpu_set_reg(vcpu, HV_REG_X8, 4); + hv_vcpu_set_reg(vcpu, HV_REG_X9, operand); + hv_vcpu_set_reg(vcpu, HV_REG_X11, cpu_tlbi_req.icache_flush ? 1 : 0); + break; + } + case TLBI_NONE: + default: + hv_vcpu_set_reg(vcpu, HV_REG_X8, 0); + break; + } + tlbi_request_clear(); +} + static inline void tlbi_request_range(uint64_t start, uint64_t end) { if (cpu_tlbi_req.kind == TLBI_BROADCAST) @@ -513,26 +645,34 @@ static inline void tlbi_request_range(uint64_t start, uint64_t end) uint64_t s = start & ~mask; uint64_t e = (end + mask) & ~mask; uint64_t n = (e - s) >> 12; - if (n > TLBI_SELECTIVE_MAX_PAGES) { + /* Two thresholds. (a) <= TLBI_SELECTIVE_MAX_PAGES uses the per-page + * VAE1IS loop, which preserves the most TLB entries. (b) <= + * TLBI_RVAE_MAX_PAGES uses a single TLBI RVAE1IS via FEAT_TLBIRANGE, + * which still preserves unrelated TLB entries but costs only one + * instruction issue. 
Above TLBI_RVAE_MAX_PAGES or when the feature is + * absent, broadcast (TLBI VMALLE1IS). */ + uint64_t large_cap = + g_tlbi_range_supported ? TLBI_RVAE_MAX_PAGES : TLBI_SELECTIVE_MAX_PAGES; + if (n > large_cap) { tlbi_request_broadcast(); return; } if (cpu_tlbi_req.kind == TLBI_NONE) { - cpu_tlbi_req.kind = TLBI_RANGE; + cpu_tlbi_req.kind = + (n > TLBI_SELECTIVE_MAX_PAGES) ? TLBI_RANGE_LARGE : TLBI_RANGE; cpu_tlbi_req.start = s; cpu_tlbi_req.pages = (uint16_t) n; return; } - /* TLBI_RANGE: coalesce by union. Disjoint ranges still produce a single - * bounding interval; if it stays within the cap, the per-page TLBI loop - * still wins over a full flush by preserving the rest of the TLB. + /* Coalesce by union. Disjoint ranges still produce a single bounding + * interval; if it stays within the active cap, the range TLBI still + * wins over a full flush by preserving unrelated TLB entries. */ uint64_t es = cpu_tlbi_req.start; uint64_t pe = (uint64_t) cpu_tlbi_req.pages * 4096ULL; - /* The accumulator only ever holds page counts <= TLBI_SELECTIVE_MAX_PAGES - * (see the cap check above), so es + pe never overflows on real callers, - * but be explicit. - */ + /* The accumulator only ever holds page counts <= large_cap (enforced by + * the cap check above), so es + pe never overflows on real callers, but + * be explicit. */ if (es > UINT64_MAX - pe) { tlbi_request_broadcast(); return; @@ -541,12 +681,17 @@ static inline void tlbi_request_range(uint64_t start, uint64_t end) uint64_t us = s < es ? s : es; uint64_t ue = e > ee ? e : ee; uint64_t un = (ue - us) >> 12; - if (un > TLBI_SELECTIVE_MAX_PAGES) { + if (un > large_cap) { tlbi_request_broadcast(); return; } cpu_tlbi_req.start = us; cpu_tlbi_req.pages = (uint16_t) un; + /* Promote kind if the coalesced range now exceeds the per-page cap. The + * inverse direction (LARGE -> RANGE) is impossible because un >= pe / 4096 + * after coalescing. 
*/ + if (un > TLBI_SELECTIVE_MAX_PAGES) + cpu_tlbi_req.kind = TLBI_RANGE_LARGE; } /* Convert a guest offset (0-based) to an IPA/VA (ipa_base + offset) */ diff --git a/src/core/shim.S b/src/core/shim.S index a2613c3..47ee5ed 100644 --- a/src/core/shim.S +++ b/src/core/shim.S @@ -22,11 +22,32 @@ * 2 = execve replaced register state (full flush * + drop frame + ERET without GPR restore) * 3 = selective: TLBI VAE1IS over X10 pages - * starting at page-aligned VA in X9) + * starting at page-aligned VA in X9 + * 4 = single-shot TLBI RVAE1IS (FEAT_TLBIRANGE); + * X9 carries the pre-encoded RVAE1IS operand + * (baddr | TTL<<37 | NUM<<39 | SCALE<<44 | + * TG<<46 | ASID<<48; TG=01, SCALE/TTL/ASID=0) + * X11 carries the I-cache hint for X8 in {1, 3, 4}: + * 1 = IC IALLU after the TLBI sequence (new + * executable content visible to EL0), 0 = skip the + * I-cache invalidation (data-only PT change). The + * X8 == 2 exec_drop_frame path always flushes + * regardless because execve loads new code. The + * shim restores X11 from the saved frame before + * ERET so the EL0 caller never observes this hint.) * #7 MRS trap (host reads reg from ESR ISS; returns value in x0) * #9 W^X toggle (x0=FAR, x1=type: 0=exec->RX, 1=write->RW) * #10 BRK from EL0 (SIGTRAP delivery / ptrace-stop; GPRs in frame) - * #11 EL0 fault (SIGSEGV/SIGILL delivery; GPRs in frame) + * #11 EL0 fault (SIGSEGV/SIGILL delivery; GPRs in frame. + * On return the host sets X8 to the same TLBI + * wire code as SVC #5 above (0/1/3/4 only; + * X8 == 2 exec_drop_frame is rejected and falls + * through to the conservative full flush). The + * lazy MAP_NORESERVE materialize path uses this + * to invalidate any negative TLB entry the EL0 + * retry would otherwise re-fault on. Eret tail + * preserves X0/X1/X2/X30 so signal_deliver's + * register writes survive.) * #12 System instr trap (cache maintenance logging: DC CVAU, IC IVAU, etc.) 
* * macOS as uses ';' as a comment character on AArch64, NOT as a statement @@ -118,6 +139,35 @@ add sp, sp, #256 .endm +/* RESTORE_GPRS_KEEP_SIGFRAME: load X3-X29 from the saved frame and pop it, + * leaving X0/X1/X2/X30 untouched. signal_deliver writes the signum, siginfo + * pointer, ucontext pointer, and sa_restorer address into those live regs + * via hv_vcpu_set_reg before returning from HVC #11; the standard + * RESTORE_GPRS_KEEP_X0 tail would clobber X1/X2/X30 with their pre-fault + * EL0 values. Used by the HVC #11 post-handler so the lazy-materialize + * path can run TLBI ops (which clobber X11/X12/X13 as scratch) while + * still preserving signal_deliver's register writes on the SIGSEGV / + * SIGILL delivery path. The caller is responsible for setting X8 = 0 + * if no TLBI is needed; X8 is loaded from the frame here regardless. + */ +.macro RESTORE_GPRS_KEEP_SIGFRAME + ldr x3, [sp, #24] + ldp x4, x5, [sp, #32] + ldp x6, x7, [sp, #48] + ldp x8, x9, [sp, #64] + ldp x10, x11, [sp, #80] + ldp x12, x13, [sp, #96] + ldp x14, x15, [sp, #112] + ldp x16, x17, [sp, #128] + ldp x18, x19, [sp, #144] + ldp x20, x21, [sp, #160] + ldp x22, x23, [sp, #176] + ldp x24, x25, [sp, #192] + ldp x26, x27, [sp, #208] + ldp x28, x29, [sp, #224] + add sp, sp, #256 +.endm + /* ZERO_GPRS: clear X0-X30 (used before the EL1->EL0 transition). */ .macro ZERO_GPRS .irp r, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 @@ -743,19 +793,104 @@ handle_data_abort: * EC=0x00 (undefined) or other -> SIGILL. */ handle_el0_fault: - /* Restore all GPRs from stack frame (host needs EL0 state) */ - RESTORE_GPRS + /* Load EL0 GPRs from the saved frame WITHOUT popping it. 
The frame + * must stay live across HVC #11 so the post-HVC dispatch below can + * restore X3-X29 via the frameless RESTORE_GPRS_KEEP_SIGFRAME tail + * after the host has overwritten X0/X1/X2/X30 via hv_vcpu_set_reg + * (signal_deliver) and the inline TLBI handlers have clobbered + * X11/X12/X13 as scratch. + */ + LOAD_GPRS - /* Forward to host for SIGSEGV delivery. - * Host reads ESR_EL1 (fault syndrome), FAR_EL1 (faulting address), and - * ELR_EL1 (PC at fault) to decode the fault and build the signal frame. + /* Forward to host. Host paths: + * (1) signal delivery -> writes X0=signum, X1=siginfo*, X2=ucontext*, + * X30=sa_restorer via hv_vcpu_set_reg; host leaves X8=0. + * (2) lazy MAP_NORESERVE materialize -> populates cpu_tlbi_req via + * guest_extend_page_tables / guest_update_perms / friends, then + * calls tlbi_request_emit_to_vcpu(vcpu) which writes + * X8/X9/X10/X11 per the same wire protocol the SVC #5 epilogue + * uses. Single-shot RVAE1IS via X8=4 is included. + * + * The dispatch below mirrors handle_svc_0's TLBI dispatch. exec_drop + * (X8=2) is rejected on this path (the host never sets it here); fall + * through to the conservative full flush rather than silently skip. */ hvc #11 - /* Host has set up signal frame (if SIGSEGV handler registered) or - * flagged for termination (default SIGSEGV = core dump). ERET to - * new PC (handler address or unchanged if terminating). + cbz x8, .Lel0_fault_eret_only + cmp x8, #1 + b.eq .Lel0_fault_tlbi_full + cmp x8, #3 + b.eq .Lel0_fault_tlbi_sel + cmp x8, #4 + b.eq .Lel0_fault_tlbi_rvae + /* Unknown X8: conservative broadcast + I-cache flush. */ + mov x11, #1 + b .Lel0_fault_tlbi_full + +.Lel0_fault_tlbi_full: + /* Broadcast TLB + conditional I-cache flush. X11=0 skips IC IALLU. */ + tlbi vmalle1is + dsb ish + cbz x11, .Lel0_fault_full_no_ic + ic iallu + dsb ish +.Lel0_fault_full_no_ic: + isb + b .Lel0_fault_eret_restore + +.Lel0_fault_tlbi_sel: + /* Selective per-page TLBI VAE1IS loop. 
X9 = page-aligned VA, + * X10 = page count (1..TLBI_SELECTIVE_MAX_PAGES). X11 carries the + * I-cache hint on entry; save into X13 before the loop clobbers X11. + * ubfx (not plain lsr) pins the VA to the 44-bit [43:0] field so a + * future LPA2 / TTL / tagged-address change cannot leak high bits + * into the operand's TTL [47:44] or ASID [63:48] fields. */ + cbz x10, .Lel0_fault_eret_only + mov x13, x11 + ubfx x11, x9, #12, #44 + mov x12, x10 +4: tlbi vae1is, x11 + add x11, x11, #1 + subs x12, x12, #1 + b.ne 4b + dsb ish + cbz x13, .Lel0_fault_sel_no_ic + ic iallu + dsb ish +.Lel0_fault_sel_no_ic: + isb + b .Lel0_fault_eret_restore + +.Lel0_fault_tlbi_rvae: + /* Single-shot TLBI RVAE1IS (FEAT_TLBIRANGE). X9 carries the pre-encoded + * operand (baddr | NUM<<39 | TG=01<<46); X11 the I-cache hint. */ + tlbi rvae1is, x9 + dsb ish + cbz x11, .Lel0_fault_rvae_no_ic + ic iallu + dsb ish +.Lel0_fault_rvae_no_ic: + isb + /* fall through */ + +.Lel0_fault_eret_restore: + /* TLBI clobbered X11/X12/X13 (and possibly X9/X10 in the selective + * path). Reload X3-X29 from the saved frame so the EL0 retry sees the + * same scratch state as pre-fault; skip X0/X1/X2/X30 in case + * signal delivery set them. On lazy materialization and conservative + * unknown-X8 fallback paths, the skipped registers still match the frame. */ + RESTORE_GPRS_KEEP_SIGFRAME + eret + +.Lel0_fault_eret_only: + /* X8 == 0: no TLBI requested. signal-delivery and no-delivery paths + * land here after the host wrote X8 as the post-HVC protocol value. + * Reload X3-X29 so EL0 sees the pre-fault scratch state, while keeping + * X0/X1/X2/X30 live for a materialized signal handler. + */ + RESTORE_GPRS_KEEP_SIGFRAME eret /* Shared exit paths for exception handlers @@ -768,7 +903,11 @@ tlbi_restore_eret: * raise further exceptions, so FAR_EL1 is preserved. */ mrs x0, far_el1 - lsr x0, x0, #12 /* TLBI VAE1IS operand: VA[55:12] */ + /* TLBI VAE1IS operand: VA[55:12] held in bits [43:0]. 
ubfx pins the + * 44-bit width so future LPA2 / TTL / tagged-address support cannot + * leak VA bits into the TTL [47:44] or ASID [63:48] operand fields. + */ + ubfx x0, x0, #12, #44 tlbi vae1is, x0 dsb ish ic iallu @@ -856,31 +995,39 @@ handle_svc_0: b.eq tlbi_full cmp x8, #3 b.eq tlbi_selective + cmp x8, #4 + b.eq tlbi_range_large cmp x8, #2 b.eq exec_drop_frame - /* Unknown X8: be conservative, broadcast and continue. */ + /* Unknown X8: be conservative, broadcast, flush I-cache, and continue. */ + mov x11, #1 tlbi_full: - /* Broadcast TLB + I-cache flush. Used for page-table edits whose - * affected range exceeds the selective cap, or any time the host could - * not bound the change. + /* Broadcast TLB + (conditional) I-cache flush. Used for page-table edits + * whose affected range exceeds the selective cap, or any time the host + * could not bound the change. X11 carries the I-cache hint: non-zero + * means the host introduced executable content visible to EL0 (new X + * mapping, NX->X mprotect, lazy materialize from a region that may + * include exec), so the shim must IC IALLU; zero means a data-only + * PT change and the I-cache invalidation is skipped. */ tlbi vmalle1is dsb ish + cbz x11, .Ltlbi_full_skip_ic ic iallu dsb ish +.Ltlbi_full_skip_ic: isb - b 1f + b svc_restore_eret tlbi_selective: /* Selective TLBI VAE1IS loop. * x9 = page-aligned VA of the first page to invalidate * x10 = page count (1..TLBI_SELECTIVE_MAX_PAGES, see core/guest.h) + * x11 = I-cache hint (see tlbi_full above) * TLBI VAE1IS takes a Xt operand of (VA[55:12] | (ASID << 48)). The * guest runs single-ASID at EL0, so just shift the VA right by 12. - * Issue all TLBI ops, then a single DSB ISH + IC IALLU + DSB + ISB - * matches broadcast semantics (preserves I-cache invalidation behavior - * for callers like file-backed mmap of executable pages). + * Issue all TLBI ops, then DSB ISH + (conditional) IC IALLU + DSB + ISB. * * Defensive: if x10 == 0, skip the loop. 
The per-vCPU host-side * accumulator (cpu_tlbi_req in core/guest.h) never sets pages == 0 @@ -888,17 +1035,43 @@ tlbi_selective: * write ever produced the pair X8=3, X10=0, the subs x12, x12, #1 * below would underflow to 0xFFFFFFFFFFFFFFFF and the b.ne would loop * ~2^64 iterations, hanging this vCPU. Cheap guard. + * + * x11 is the I-cache hint on entry but the per-page TLBI operand is + * also computed from x9 -- save the hint into x13 before clobbering. */ cbz x10, 1f - lsr x11, x9, #12 /* x11 = VA >> 12 (current page operand) */ + mov x13, x11 /* x13 = saved I-cache hint */ + /* TLBI VAE1IS operand: VA[55:12] held in bits [43:0]. ubfx pins the + * 44-bit width so future LPA2 / TTL / tagged-address support cannot + * leak VA bits into the TTL [47:44] or ASID [63:48] operand fields. */ + ubfx x11, x9, #12, #44 /* x11 = VA[55:12] (current page operand) */ mov x12, x10 /* x12 = remaining page counter */ 3: tlbi vae1is, x11 add x11, x11, #1 /* next page (operand is in 4 KiB units) */ subs x12, x12, #1 b.ne 3b dsb ish + cbz x13, .Ltlbi_sel_skip_ic + ic iallu + dsb ish +.Ltlbi_sel_skip_ic: + isb + b svc_restore_eret + +tlbi_range_large: + /* Single-shot TLBI RVAE1IS (FEAT_TLBIRANGE, ARMv8.4+). The host has + * encoded the full operand in X9: baddr (VA >> 12), TTL=0, NUM in bits + * [43:39], SCALE=0, TG=01 (4 KiB granule) in bits [47:46], ASID=0. One instruction covers up to 64 pages, + * avoiding the broadcast TLBI VMALLE1IS that the prior selective cap + * forced for 17..64-page ranges. X11 carries the I-cache hint as in + * tlbi_full / tlbi_selective.
+ */ + tlbi rvae1is, x9 + dsb ish + cbz x11, .Ltlbi_rvae_skip_ic ic iallu dsb ish +.Ltlbi_rvae_skip_ic: isb b svc_restore_eret diff --git a/src/syscall/proc.c b/src/syscall/proc.c index 33cde52..647bab8 100644 --- a/src/syscall/proc.c +++ b/src/syscall/proc.c @@ -1585,70 +1585,25 @@ int vcpu_run_loop(hv_vcpu_t vcpu, uint32_t fault_ec = (uint32_t) ((esr >> 26) & 0x3F); - int signum, si_code; - uint64_t si_addr; - - if (fault_ec == 0x20 || fault_ec == 0x24) { - /* Instruction or data abort -> check lazy page - * materialization before delivering SIGSEGV. - */ - uint32_t fsc = (uint32_t) (esr & 0x3F); - uint32_t fsc_type = (fsc >> 2) & 0xF; - - /* Translation faults have xFSC[5:2] == 0x1; the low - * bits select the translation table level. These may - * come from a MAP_NORESERVE region with deferred page - * table creation, so try to materialize the page - * before declaring SIGSEGV. - */ - if (fsc_type == 0x01) { - uint64_t fault_off = far_addr - g->ipa_base; - pthread_mutex_lock(&mmap_lock); - int mat = guest_materialize_lazy(g, fault_off); - pthread_mutex_unlock(&mmap_lock); - if (mat == 0) { - /* Page materialized; TLBI and retry the - * faulting instruction. Set X8=1 to request - * TLBI from the shim before ERET. - */ - hv_vcpu_set_reg(vcpu, HV_REG_X8, 1); - break; - } - } - - signum = LINUX_SIGSEGV; - /* Permission faults have xFSC[5:2] == 0x3. Address - * size, translation, and access-flag faults remain - * mapping errors for Linux-visible SIGSEGV delivery. - */ - si_code = (fsc_type == 0x03) ? LINUX_SEGV_ACCERR - : LINUX_SEGV_MAPERR; - si_addr = far_addr; - - if (verbose) { - const char *fault_type = - (fault_ec == 0x20) ? "inst" : "data"; - const char *code_name = - (si_code == LINUX_SEGV_MAPERR) ? 
"MAPERR" - : "ACCERR"; - log_debug( - "%s: EL0 %s fault at 0x%llx " - "PC=0x%llx (ESR=0x%llx FSC=0x%x) " - "-> SIGSEGV/%s", - prefix, fault_type, - (unsigned long long) far_addr, - (unsigned long long) elr_addr, - (unsigned long long) esr, fsc, code_name); - } - } else { - /* EC=0x00 (undefined instruction) or other unrecognized - * EC from EL0 -> SIGILL. Use ELR_EL1 as si_addr because - * FAR_EL1 is UNKNOWN for non-abort exceptions. - */ - signum = LINUX_SIGILL; - si_code = LINUX_ILL_ILLOPC; - si_addr = elr_addr; - + /* Non-abort EC -> SIGILL. Branch out early so the + * abort / SIGSEGV path below stays at the case-body + * indent rather than nested inside an else branch. + * FAR_EL1 is UNKNOWN for non-abort exceptions, so use + * ELR_EL1 for si_addr. + * + * Only EC 0x20 (instruction abort from a lower EL) and + * EC 0x24 (data abort from a lower EL) are intentionally + * routed to the SIGSEGV path that follows. Every other + * forwarded EC -- 0x00 (undefined instruction), 0x18 + * (system instruction trap), 0x32/0x33 (software + * step), 0x3C (BRK), and any unrecognized class -- + * lands here as SIGILL. If a future change adds a new + * lower-EL abort class (e.g. 0x21 / 0x25 for higher + * exception levels) that should map to SIGSEGV, the + * test below needs explicit widening; do NOT relax + * the check casually. + */ + if (fault_ec != 0x20 && fault_ec != 0x24) { if (verbose) log_debug( "%s: EL0 undefined insn at " @@ -1656,18 +1611,91 @@ int vcpu_run_loop(hv_vcpu_t vcpu, "-> SIGILL/ILL_ILLOPC", prefix, (unsigned long long) elr_addr, (unsigned long long) esr, fault_ec); + signal_set_fault_info(LINUX_ILL_ILLOPC, elr_addr, esr); + signal_queue(LINUX_SIGILL); + int sig_ret = signal_deliver(vcpu, g, &exit_code); + /* HVC #11 consumes X8 as the post-fault TLBI opcode. + * signal_deliver() may leave it unchanged when no + * handler is materialized, or set the syscall-path + * frame-drop marker when one is. 
Neither is a TLBI + * request here; lazy materialization emits its own + * request and exits before this path. + */ + hv_vcpu_set_reg(vcpu, HV_REG_X8, 0); + if (verbose) + log_debug("%s: signal %d deliver returned %d", + prefix, LINUX_SIGILL, sig_ret); + if (sig_ret < 0) + running = false; /* SIG_DFL core => terminate. */ + break; + } + + /* Instruction or data abort. Try lazy page materialization + * before declaring SIGSEGV: translation faults + * (xFSC[5:2] == 0x1) may come from a MAP_NORESERVE region + * with deferred page-table creation. + */ + uint32_t fsc = (uint32_t) (esr & 0x3F); + uint32_t fsc_type = (fsc >> 2) & 0xF; + if (fsc_type == 0x01) { + uint64_t fault_off = far_addr - g->ipa_base; + pthread_mutex_lock(&mmap_lock); + int mat = guest_materialize_lazy(g, fault_off); + pthread_mutex_unlock(&mmap_lock); + if (mat == 0) { + /* Page materialized; the helpers inside + * guest_materialize_lazy populated the + * per-vCPU TLBI accumulator with the range + * just installed (plus the I-cache hint if + * the region's prot includes PROT_EXEC). + * Drain it through the shared emit helper + * so the shim's post-HVC-11 dispatch + * (handle_el0_fault) actually issues the + * TLBI before ERET. Without this, a PE that + * caches translation-fault (negative) + * entries would re-fault on the retry, + * looping until the entry self-evicts. */ + tlbi_request_emit_to_vcpu(vcpu); + break; + } } - signal_set_fault_info(si_code, si_addr, esr); - signal_queue(signum); + /* Real SIGSEGV. Permission faults (xFSC[5:2] == 0x3) map + * to SEGV_ACCERR; address size, translation, and + * access-flag faults map to SEGV_MAPERR for Linux. + */ + int si_code = (fsc_type == 0x03) ? LINUX_SEGV_ACCERR + : LINUX_SEGV_MAPERR; + if (verbose) { + const char *fault_type = + (fault_ec == 0x20) ? "inst" : "data"; + const char *code_name = (si_code == LINUX_SEGV_MAPERR) + ? 
"MAPERR" + : "ACCERR"; + log_debug( + "%s: EL0 %s fault at 0x%llx " + "PC=0x%llx (ESR=0x%llx FSC=0x%x) " + "-> SIGSEGV/%s", + prefix, fault_type, (unsigned long long) far_addr, + (unsigned long long) elr_addr, + (unsigned long long) esr, fsc, code_name); + } + signal_set_fault_info(si_code, far_addr, esr); + signal_queue(LINUX_SIGSEGV); int sig_ret = signal_deliver(vcpu, g, &exit_code); + /* HVC #11 consumes X8 as the post-fault TLBI opcode. + * signal_deliver() may leave it unchanged when no + * handler is materialized, or set the syscall-path + * frame-drop marker when one is. Neither is a TLBI + * request here; lazy materialization emits its own + * request and exits before this path. + */ + hv_vcpu_set_reg(vcpu, HV_REG_X8, 0); if (verbose) log_debug("%s: signal %d deliver returned %d", prefix, - signum, sig_ret); - if (sig_ret < 0) { - /* Core dispositions terminate without a core file. */ - running = false; - } + LINUX_SIGSEGV, sig_ret); + if (sig_ret < 0) + running = false; /* SIG_DFL core => terminate. */ break; } diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index be97787..3b67e2d 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -1883,33 +1883,17 @@ int syscall_dispatch(hv_vcpu_t vcpu, guest_t *g, int *exit_code, bool verbose) hv_vcpu_set_reg(vcpu, HV_REG_X0, (uint64_t) result); /* Signal the shim to flush TLB if this vCPU modified page tables. - * Protocol after HVC #5 (X8 carries the request): - * 0 -> skip - * 1 -> broadcast TLBI VMALLE1IS - * 2 -> reserved for execve (set by sys_execve, never reached here) - * 3 -> selective TLBI VAE1IS over X10 pages starting at X9 - * Must explicitly write X8 because the shim reads its post-HVC value; - * the pre-syscall X8 is the syscall number (always non-zero) and would - * spuriously TLBI on every return. 
+ * Protocol after HVC #5 lives in tlbi_request_emit_to_vcpu (see + * src/core/guest.h); the helper also handles the HVC #11 EL0-fault + * lazy-materialize path so both call sites use the same wire codes. + * Must call the emit helper because the shim reads X8 unconditionally + * on return; the pre-syscall X8 is the syscall number (always + * non-zero) and would spuriously TLBI on every return. * * cpu_tlbi_req is a per-vCPU TLS slot, so this read needs no lock and * cannot be drained or torn by another vCPU's epilogue. */ - switch ((tlbi_kind_t) cpu_tlbi_req.kind) { - case TLBI_BROADCAST: - hv_vcpu_set_reg(vcpu, HV_REG_X8, 1); - break; - case TLBI_RANGE: - hv_vcpu_set_reg(vcpu, HV_REG_X8, 3); - hv_vcpu_set_reg(vcpu, HV_REG_X9, cpu_tlbi_req.start); - hv_vcpu_set_reg(vcpu, HV_REG_X10, cpu_tlbi_req.pages); - break; - case TLBI_NONE: - default: - hv_vcpu_set_reg(vcpu, HV_REG_X8, 0); - break; - } - tlbi_request_clear(); + tlbi_request_emit_to_vcpu(vcpu); } return should_exit; diff --git a/tests/manifest.txt b/tests/manifest.txt index 19b1b27..b819975 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -79,6 +79,7 @@ test-simd-clone # diff=skip [section] Stress tests test-stress # diff=skip +test-mprotect-mt # diff=skip [section] Negative / error-path tests test-negative # diff=skip diff --git a/tests/test-mprotect-mt.c b/tests/test-mprotect-mt.c new file mode 100644 index 0000000..ae23ed7 --- /dev/null +++ b/tests/test-mprotect-mt.c @@ -0,0 +1,651 @@ +/* Multi-vCPU concurrent mprotect stress + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Two scenarios run back-to-back to surface stale-TLB / mprotect-TLBI + * regressions across vCPUs: + * + * 1. No-op false-positive stress. A toggler thread repeatedly mprotects a + * shared page to its existing perms (RW -> RW). Four reader threads do + * direct EL0 writes to the page in a tight loop. 
Validates that the + * false-positive elimination in guest_update_perms / + * guest_invalidate_ptes does not lose write visibility when the + * requested perms already match the live PTE. + * + * 2. R <-> RW alternation via syscall write path. A toggler flips perms + * while reader threads call read(/dev/urandom, page, n). The kernel + * page-walks before touching the buffer, so any stale-TLB-induced + * anomaly surfaces as an unexpected return value (anything other than + * n or -EFAULT). The VM crashing mid-run -- the failure mode the + * bounded-retry hardening item in TODO.md is gated on -- is also + * caught here because the test driver wraps every run in a timeout. + * + * The test does not try to PROVE the cross-vCPU race window absent. A + * passing run is evidence the bounded-retry hardening lacks a concrete + * reproducer today; a hard crash or accounting mismatch would supply one. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test-harness.h" + +int passes = 0, fails = 0; + +#define PAGE_SIZE 4096 +#define READER_THREADS 4 +#define NOOP_ITERS 50000 +#define ALT_READS 5000 +#define ALT_TOGGLE_ITERS 5000 + +static atomic_int g_running; +static atomic_uint_least64_t g_writes; +static atomic_uint_least64_t g_mismatches; +static atomic_uint_least64_t g_success; +static atomic_uint_least64_t g_efault; +static atomic_uint_least64_t g_other; + +struct noop_ctx { + volatile uint32_t *page; + uint32_t tag; + int iters; +}; + +static void *noop_reader(void *arg) +{ + struct noop_ctx *ctx = arg; + for (int i = 0; i < ctx->iters && atomic_load(&g_running); i++) { + uint32_t v = (ctx->tag << 16) | (uint32_t) (i & 0xFFFF); + ctx->page[ctx->tag] = v; + atomic_fetch_add_explicit(&g_writes, 1, memory_order_relaxed); + uint32_t back = ctx->page[ctx->tag]; + if (back != v) { + /* Another thread targets a different slot, so any value other + * than what this thread just wrote is a coherence 
bug. + */ + atomic_fetch_add_explicit(&g_mismatches, 1, memory_order_relaxed); + } + } + return NULL; +} + +static void *noop_toggler(void *arg) +{ + volatile uint32_t *page = arg; + while (atomic_load(&g_running)) { + if (mprotect((void *) page, PAGE_SIZE, PROT_READ | PROT_WRITE) != 0) { + atomic_fetch_add_explicit(&g_mismatches, 1, memory_order_relaxed); + return NULL; + } + } + return NULL; +} + +static void test_noop_mprotect_stress(void) +{ + TEST("no-op mprotect false-positive stress"); + + void *p = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap"); + return; + } + memset(p, 0, PAGE_SIZE); + + atomic_store(&g_writes, 0); + atomic_store(&g_mismatches, 0); + atomic_store(&g_running, 1); + + pthread_t readers[READER_THREADS]; + struct noop_ctx ctxs[READER_THREADS]; + for (int i = 0; i < READER_THREADS; i++) { + ctxs[i].page = p; + ctxs[i].tag = (uint32_t) i; + ctxs[i].iters = NOOP_ITERS; + if (pthread_create(&readers[i], NULL, noop_reader, &ctxs[i]) != 0) { + atomic_store(&g_running, 0); + for (int j = 0; j < i; j++) + pthread_join(readers[j], NULL); + munmap(p, PAGE_SIZE); + FAIL("pthread_create reader"); + return; + } + } + + pthread_t toggler; + if (pthread_create(&toggler, NULL, noop_toggler, p) != 0) { + atomic_store(&g_running, 0); + for (int i = 0; i < READER_THREADS; i++) + pthread_join(readers[i], NULL); + munmap(p, PAGE_SIZE); + FAIL("pthread_create toggler"); + return; + } + + for (int i = 0; i < READER_THREADS; i++) + pthread_join(readers[i], NULL); + atomic_store(&g_running, 0); + pthread_join(toggler, NULL); + + uint64_t writes = atomic_load(&g_writes); + uint64_t mismatches = atomic_load(&g_mismatches); + munmap(p, PAGE_SIZE); + + if (mismatches != 0 || writes == 0) { + char msg[96]; + snprintf(msg, sizeof(msg), "writes=%llu mismatches=%llu", + (unsigned long long) writes, (unsigned long long) mismatches); + FAIL(msg); + return; + } + PASS(); +} + +struct alt_ctx { + void 
*page; + int fd; + int iters; +}; + +static void *alt_reader(void *arg) +{ + struct alt_ctx *ctx = arg; + char *p = ctx->page; + for (int i = 0; i < ctx->iters && atomic_load(&g_running); i++) { + errno = 0; + ssize_t r = read(ctx->fd, p, 64); + if (r == 64) { + atomic_fetch_add_explicit(&g_success, 1, memory_order_relaxed); + } else if (r < 0 && errno == EFAULT) { + atomic_fetch_add_explicit(&g_efault, 1, memory_order_relaxed); + } else { + atomic_fetch_add_explicit(&g_other, 1, memory_order_relaxed); + } + } + return NULL; +} + +static void *alt_toggler(void *arg) +{ + void *page = arg; + int local_iters = ALT_TOGGLE_ITERS; + while (atomic_load(&g_running) && local_iters-- > 0) { + if (mprotect(page, PAGE_SIZE, PROT_READ) != 0) { + atomic_fetch_add_explicit(&g_other, 1, memory_order_relaxed); + return NULL; + } + if (mprotect(page, PAGE_SIZE, PROT_READ | PROT_WRITE) != 0) { + atomic_fetch_add_explicit(&g_other, 1, memory_order_relaxed); + return NULL; + } + } + return NULL; +} + +static void test_alternating_mprotect_stress(void) +{ + TEST("R<->RW mprotect stress (syscall reader)"); + + int fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) { + FAIL("open /dev/urandom"); + return; + } + void *p = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + close(fd); + FAIL("mmap"); + return; + } + memset(p, 0, PAGE_SIZE); + + atomic_store(&g_success, 0); + atomic_store(&g_efault, 0); + atomic_store(&g_other, 0); + atomic_store(&g_running, 1); + + pthread_t readers[READER_THREADS]; + struct alt_ctx ctxs[READER_THREADS]; + for (int i = 0; i < READER_THREADS; i++) { + ctxs[i].page = p; + ctxs[i].fd = fd; + ctxs[i].iters = ALT_READS; + if (pthread_create(&readers[i], NULL, alt_reader, &ctxs[i]) != 0) { + atomic_store(&g_running, 0); + for (int j = 0; j < i; j++) + pthread_join(readers[j], NULL); + munmap(p, PAGE_SIZE); + close(fd); + FAIL("pthread_create reader"); + return; + } + } + + pthread_t toggler; + if 
(pthread_create(&toggler, NULL, alt_toggler, p) != 0) { + atomic_store(&g_running, 0); + for (int i = 0; i < READER_THREADS; i++) + pthread_join(readers[i], NULL); + munmap(p, PAGE_SIZE); + close(fd); + FAIL("pthread_create toggler"); + return; + } + + for (int i = 0; i < READER_THREADS; i++) + pthread_join(readers[i], NULL); + atomic_store(&g_running, 0); + pthread_join(toggler, NULL); + + uint64_t s = atomic_load(&g_success); + uint64_t e = atomic_load(&g_efault); + uint64_t o = atomic_load(&g_other); + uint64_t total = s + e + o; + uint64_t expected = (uint64_t) READER_THREADS * (uint64_t) ALT_READS; + + /* Always restore RW before unmap so the cleanup is clean. */ + mprotect(p, PAGE_SIZE, PROT_READ | PROT_WRITE); + munmap(p, PAGE_SIZE); + close(fd); + + if (o != 0) { + char msg[128]; + snprintf(msg, sizeof(msg), + "unexpected read returns: ok=%llu efault=%llu other=%llu", + (unsigned long long) s, (unsigned long long) e, + (unsigned long long) o); + FAIL(msg); + return; + } + if (total != expected) { + char msg[128]; + snprintf(msg, sizeof(msg), + "missing iterations: total=%llu expected=%llu", + (unsigned long long) total, (unsigned long long) expected); + FAIL(msg); + return; + } + printf("ok=%llu efault=%llu ... ", (unsigned long long) s, + (unsigned long long) e); + PASS(); +} + +/* Single-threaded sweep across page counts that exercise the three TLBI + * accumulator branches: <=TLBI_SELECTIVE_MAX_PAGES (per-page VAE1IS), + * 17..64 pages (FEAT_TLBIRANGE RVAE1IS single shot), >64 pages (broadcast + * VMALLE1IS). Each size is mprotect-cycled R<->RW with full readback. A + * stale TLB or wrong RVAE1IS NUM/SCALE encoding would surface as a data + * mismatch or a SIGSEGV during the readback phase. */ +static void test_rvae_boundary_sweep(void) +{ + /* 2 hits the smallest RVAE1IS encoding (NUM=0) if it ever reaches the + * TLBI_RANGE_LARGE path via coalescing; today the selective threshold + * gates it off, but the test pins the encoding contract. 
The remaining + * sizes straddle the selective / RVAE1IS / broadcast accumulator + * boundaries. */ + static const int sizes[] = {2, 16, 17, 32, 63, 64, 65, 128}; + static const int n_sizes = (int) (sizeof(sizes) / sizeof(sizes[0])); + for (int k = 0; k < n_sizes; k++) { + int npages = sizes[k]; + char label[64]; + snprintf(label, sizeof(label), "RVAE1IS boundary sweep (%d pages)", + npages); + TEST(label); + + size_t sz = (size_t) npages * PAGE_SIZE; + uint8_t *p = mmap(NULL, sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap"); + continue; + } + for (int i = 0; i < npages; i++) + p[(size_t) i * PAGE_SIZE] = (uint8_t) i; + + bool ok = true; + for (int cycle = 0; cycle < 20 && ok; cycle++) { + if (mprotect(p, sz, PROT_READ) != 0) { + ok = false; + break; + } + for (int i = 0; i < npages; i++) + if (p[(size_t) i * PAGE_SIZE] != (uint8_t) i) { + ok = false; + break; + } + if (!ok) + break; + if (mprotect(p, sz, PROT_READ | PROT_WRITE) != 0) { + ok = false; + break; + } + for (int i = 0; i < npages; i++) { + if (p[(size_t) i * PAGE_SIZE] != (uint8_t) i) { + ok = false; + break; + } + p[(size_t) i * PAGE_SIZE] = (uint8_t) (i ^ cycle); + } + if (!ok) + break; + for (int i = 0; i < npages; i++) { + if (p[(size_t) i * PAGE_SIZE] != (uint8_t) (i ^ cycle)) { + ok = false; + break; + } + p[(size_t) i * PAGE_SIZE] = (uint8_t) i; + } + } + munmap(p, sz); + if (ok) + PASS(); + else + FAIL("readback or mprotect failed"); + } +} + +/* Multi-vCPU variant of the alternating R<->RW test but on a 32-page region + * so the toggler hits the TLBI_RANGE_LARGE path (RVAE1IS) instead of the + * single-page selective TLBI. Inner-shareable RVAE1IS must invalidate the + * sibling vCPU TLBs; if it doesn't, the reader threads see stale TLB entries + * and the test surfaces an unexpected read return code or a VM crash. 
*/ +struct rvae_toggler_arg { + void *page; + size_t size; +}; + +static void *rvae_mt_toggler(void *arg) +{ + struct rvae_toggler_arg *a = arg; + int local_iters = ALT_TOGGLE_ITERS; + while (atomic_load(&g_running) && local_iters-- > 0) { + if (mprotect(a->page, a->size, PROT_READ) != 0) { + atomic_fetch_add_explicit(&g_other, 1, memory_order_relaxed); + return NULL; + } + if (mprotect(a->page, a->size, PROT_READ | PROT_WRITE) != 0) { + atomic_fetch_add_explicit(&g_other, 1, memory_order_relaxed); + return NULL; + } + } + return NULL; +} + +static void test_rvae_multi_vcpu_stress(int npages) +{ + char label[64]; + snprintf(label, sizeof(label), "RVAE1IS multi-vCPU %d-page stress (NUM=%d)", + npages, ((npages + 1) / 2) - 1); + TEST(label); + + int fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) { + FAIL("open /dev/urandom"); + return; + } + size_t sz = (size_t) npages * PAGE_SIZE; + void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + close(fd); + FAIL("mmap"); + return; + } + memset(p, 0, sz); + + atomic_store(&g_success, 0); + atomic_store(&g_efault, 0); + atomic_store(&g_other, 0); + atomic_store(&g_running, 1); + + pthread_t readers[READER_THREADS]; + struct alt_ctx ctxs[READER_THREADS]; + for (int i = 0; i < READER_THREADS; i++) { + ctxs[i].page = p; + ctxs[i].fd = fd; + ctxs[i].iters = ALT_READS; + if (pthread_create(&readers[i], NULL, alt_reader, &ctxs[i]) != 0) { + atomic_store(&g_running, 0); + for (int j = 0; j < i; j++) + pthread_join(readers[j], NULL); + munmap(p, sz); + close(fd); + FAIL("pthread_create reader"); + return; + } + } + + pthread_t toggler; + struct rvae_toggler_arg targ = {p, sz}; + if (pthread_create(&toggler, NULL, rvae_mt_toggler, &targ) != 0) { + atomic_store(&g_running, 0); + for (int i = 0; i < READER_THREADS; i++) + pthread_join(readers[i], NULL); + munmap(p, sz); + close(fd); + FAIL("pthread_create toggler"); + return; + } + + for (int i = 0; i < READER_THREADS; i++) 
+ pthread_join(readers[i], NULL); + atomic_store(&g_running, 0); + pthread_join(toggler, NULL); + + uint64_t s = atomic_load(&g_success); + uint64_t e = atomic_load(&g_efault); + uint64_t o = atomic_load(&g_other); + uint64_t total = s + e + o; + uint64_t expected = (uint64_t) READER_THREADS * (uint64_t) ALT_READS; + + mprotect(p, sz, PROT_READ | PROT_WRITE); + munmap(p, sz); + close(fd); + + if (o != 0) { + char msg[128]; + snprintf(msg, sizeof(msg), + "unexpected read returns: ok=%llu efault=%llu other=%llu", + (unsigned long long) s, (unsigned long long) e, + (unsigned long long) o); + FAIL(msg); + return; + } + if (total != expected) { + char msg[128]; + snprintf(msg, sizeof(msg), + "missing iterations: total=%llu expected=%llu", + (unsigned long long) total, (unsigned long long) expected); + FAIL(msg); + return; + } + printf("ok=%llu efault=%llu ... ", (unsigned long long) s, + (unsigned long long) e); + PASS(); +} + +/* 32-page mprotect cycle that deterministically straddles a 2 MiB guest + * block boundary. The boundary forces guest_split_block on both blocks + * the range crosses (16 pages each side), exercising the split-then- + * tlbi-range-large code path that the ordinary boundary sweep only hits + * by chance depending on gap-finder placement. */ +static void test_rvae_2mib_straddle(void) +{ + TEST("RVAE1IS 2 MiB block-straddle cycle"); + + /* Allocate enough headroom to guarantee a 2 MiB boundary with at least + * 16 pages on each side, regardless of where mmap places the region. + * Worst case: mmap returns a 2 MiB-aligned base, so the first usable + * boundary is mmap_base + 2 MiB; we need 16 pages below that boundary + * (i.e. inside the first 2 MiB) and 16 pages above (inside the second + * 2 MiB). 4 MiB + slack would cover it; the 8 MiB + 64-page allocation below leaves extra headroom.
*/ + const size_t mib_2 = 2 * 1024 * 1024; + size_t alloc_sz = 4 * mib_2 + 64 * PAGE_SIZE; + uint8_t *region = mmap(NULL, alloc_sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (region == MAP_FAILED) { + FAIL("mmap"); + return; + } + /* Pick the first 2 MiB boundary AT LEAST 16 pages above the start so + * the 32-page protect window straddles it (16 pages below + 16 above). + * If the natural rounded-up boundary is too close to base, jump to the + * next one -- the allocation is sized to keep that within range. */ + uintptr_t base = (uintptr_t) region; + uintptr_t boundary = (base + mib_2 - 1) & ~(uintptr_t) (mib_2 - 1); + if (boundary - base < 16 * PAGE_SIZE) + boundary += mib_2; + if (boundary + 16 * PAGE_SIZE > base + alloc_sz) { + munmap(region, alloc_sz); + FAIL("boundary not addressable inside region"); + return; + } + uint8_t *p = (uint8_t *) (boundary - 16 * PAGE_SIZE); + size_t sz = 32 * PAGE_SIZE; + + for (size_t i = 0; i < 32; i++) + p[i * PAGE_SIZE] = (uint8_t) i; + + bool ok = true; + for (int cycle = 0; cycle < 20 && ok; cycle++) { + if (mprotect(p, sz, PROT_READ) != 0) { + ok = false; + break; + } + for (size_t i = 0; i < 32; i++) + if (p[i * PAGE_SIZE] != (uint8_t) i) { + ok = false; + break; + } + if (!ok) + break; + if (mprotect(p, sz, PROT_READ | PROT_WRITE) != 0) { + ok = false; + break; + } + for (size_t i = 0; i < 32; i++) { + if (p[i * PAGE_SIZE] != (uint8_t) i) { + ok = false; + break; + } + p[i * PAGE_SIZE] = (uint8_t) ((unsigned) i ^ (unsigned) cycle); + } + if (!ok) + break; + for (size_t i = 0; i < 32; i++) { + if (p[i * PAGE_SIZE] != + (uint8_t) ((unsigned) i ^ (unsigned) cycle)) { + ok = false; + break; + } + p[i * PAGE_SIZE] = (uint8_t) i; + } + } + munmap(region, alloc_sz); + if (ok) + PASS(); + else + FAIL("straddle readback or mprotect failed"); +} + +/* R<->RX cycle on a 32-page region. 
Each cycle writes a unique + * `mov w0, #imm; ret` epilogue to every page while RW, then mprotects to + * RX and calls the page. The expected return value is the imm just written. + * If the X11 I-cache hint were dropped from the TLBI_RANGE_LARGE path, the + * call would execute stale instructions cached from a prior cycle and the + * returned imm would mismatch. + * + * The RVAE1IS path is exercised because the 32-page range exceeds + * TLBI_SELECTIVE_MAX_PAGES = 16; combined with PROT_EXEC the helper marks + * icache_flush=1 and the shim's tlbi_range_large branch runs IC IALLU. */ +static void test_rvae_icache_stress(void) +{ + TEST("RVAE1IS R<->RX I-cache hint coverage"); + + enum { NPAGES = 32 }; + size_t sz = (size_t) NPAGES * PAGE_SIZE; + uint32_t *p = mmap(NULL, sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap"); + return; + } + /* ok latches the first failure; it ends the cycle loop and selects PASS or FAIL at the end. */ + bool ok = true; + for (int cycle = 0; cycle < 16 && ok; cycle++) { + /* Distinct imm per cycle so a stale I-cache fetch surfaces as a + * value mismatch. The loop runs 16 cycles, so imm takes the values + * 1..16; the 0xFFF mask is only an upper-bound guard. The mov-imm + * encoding takes a 16-bit literal at bits [20:5], so these small + * values always fit. */ + uint32_t imm = (uint32_t) (cycle + 1) & 0xFFFu; + /* mov w0, #imm = 0x52800000 | (imm << 5) */ + uint32_t mov = 0x52800000u | (imm << 5); + /* ret = 0xD65F03C0 (RET X30) */ + uint32_t ret = 0xD65F03C0u; + /* Plant the same two-instruction function at offset 0 of every page. */ + for (size_t i = 0; i < NPAGES; i++) { + uint32_t *pg = (uint32_t *) ((uint8_t *) p + i * PAGE_SIZE); + pg[0] = mov; + pg[1] = ret; + } + /* Switch the whole range to read+execute. The test issues no cache maintenance of its own, so the calls below depend on this transition. */ + if (mprotect(p, sz, PROT_READ | PROT_EXEC) != 0) { + ok = false; + break; + } + + /* Call each page; verify the return value matches the imm we just + * wrote. A mismatch indicates the I-cache held a stale instruction + * from a prior cycle (i.e. the RVAE1IS path skipped IC IALLU). 
*/ + for (size_t i = 0; i < NPAGES; i++) { + uint32_t (*fn)(void) = + (uint32_t (*)(void))((uint8_t *) p + i * PAGE_SIZE); + uint32_t got = fn(); + if (got != imm) { + ok = false; + break; + } + } + /* Return to read+write so the next cycle can rewrite the code. This also runs after a mismatch, because the break above only leaves the call loop. */ + if (mprotect(p, sz, PROT_READ | PROT_WRITE) != 0) { + ok = false; + break; + } + } + munmap(p, sz); + + if (ok) + PASS(); + else + FAIL("I-cache content mismatch or mprotect failure"); +} +/* Run every stress test in order, then invoke SUMMARY and exit with status 1 if fails is greater than zero. */ +int main(void) +{ + printf("test-mprotect-mt: multi-vCPU mprotect stress\n"); + + test_noop_mprotect_stress(); + test_alternating_mprotect_stress(); + test_rvae_boundary_sweep(); + test_rvae_2mib_straddle(); + test_rvae_icache_stress(); + /* Drive the RVAE1IS NUM encoding across its boundaries under contention: + * 17 pages -> NUM=8, 32 -> NUM=15 (mid), 64 -> NUM=31 (max), following + * NUM = ceil(pages/2) - 1. */ + test_rvae_multi_vcpu_stress(17); + test_rvae_multi_vcpu_stress(32); + test_rvae_multi_vcpu_stress(64); + + SUMMARY("test-mprotect-mt"); + return fails > 0 ? 1 : 0; +} diff --git a/tests/test-tlbi-encoder-host.c b/tests/test-tlbi-encoder-host.c new file mode 100644 index 0000000..13e1377 --- /dev/null +++ b/tests/test-tlbi-encoder-host.c @@ -0,0 +1,146 @@ +/* Native-host unit test for the TLBI RVAE1IS operand encoder. + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * The integration tests in tests/test-mprotect-mt.c exercise the operand + * end-to-end inside a VM, but they happily pass on M4 even when the + * encoder dropped the TG=01 bit (the M-series PE silently falls back to + * TCR_EL1.TGn). This host-side test decodes the operand bit-by-bit and + * asserts every field matches the ARM ARM DDI 0487J.a D8.7.6 layout, so a + * future regression in the encoder surfaces as a build / CI failure + * regardless of the running PE's tolerance for reserved encodings. + * + * Native macOS binary; no HVF entitlement needed (the encoder is pure C). 
+ * Symbols pulled from core/guest.h that the encoder does not actually + * reference still need to link, so a stub cpu_tlbi_req / g_tlbi_range_* + * definition lives below. + */ + +#include +#include +#include +#include + +#include "core/guest.h" + +/* Stubs for the extern symbols guest.h declares. The encoder under test + * does not read them, but the linker needs definitions. */ +_Thread_local tlbi_request_t cpu_tlbi_req; +bool g_tlbi_range_supported; +/* Running tallies of individual checks; main prints both and derives the exit status from fails. */ +static int passes; +static int fails; +/* Compare one decoded field against its expected value. A match bumps passes; a mismatch bumps fails and prints the label with both values to stderr. */ +static void check_field(const char *label, uint64_t got, uint64_t expect) +{ + if (got == expect) { + passes++; + } else { + fails++; + fprintf(stderr, "FAIL %s: got 0x%llx, expected 0x%llx\n", label, + (unsigned long long) got, (unsigned long long) expect); + } +} + +/* Decompose the operand per ARM ARM D8.7.6 and compare each field against + * the expected value. baseADDR is VA>>12 masked to 37 bits; TG must be 01 + * (4 KiB); SCALE must be 0; TTL must be 0; ASID must be 0. NUM derives + * from the page count via the ceil(pages/2) - 1 SCALE=0 encoding. + * + * Bit positions as decoded below: BaseADDR [36:0], TTL [38:37], + * NUM [43:39], SCALE [45:44], TG [47:46], ASID [63:48]. + * + * start_va: virtual address passed to the encoder + * pages: page count passed to the encoder + * expect_num: value the decoded NUM field must equal 
*/ +static void verify_operand(uint64_t start_va, + uint16_t pages, + uint64_t expect_num) +{ + uint64_t op = tlbi_rvae1is_operand(start_va, pages); + /* Slice the 64-bit operand into its fields, lowest bits first. */ + uint64_t baddr = op & ((1ULL << 37) - 1); + uint64_t ttl = (op >> 37) & 0x3; + uint64_t num = (op >> 39) & 0x1F; + uint64_t scale = (op >> 44) & 0x3; + uint64_t tg = (op >> 46) & 0x3; + uint64_t asid = (op >> 48) & 0xFFFF; + /* One scratch buffer is reused for every label; each label names the field and echoes either start_va or pages. */ + char label[64]; + snprintf(label, sizeof(label), "BaseADDR (start=0x%llx)", + (unsigned long long) start_va); + check_field(label, baddr, (start_va >> 12) & ((1ULL << 37) - 1)); + + snprintf(label, sizeof(label), "TTL (start=0x%llx)", + (unsigned long long) start_va); + check_field(label, ttl, 0); + + snprintf(label, sizeof(label), "NUM (pages=%u)", (unsigned) pages); + check_field(label, num, expect_num); + + snprintf(label, sizeof(label), "SCALE (pages=%u)", (unsigned) pages); + check_field(label, scale, 0); + + snprintf(label, sizeof(label), "TG (start=0x%llx)", + (unsigned long long) start_va); + check_field(label, tg, 1); /* 4 KiB granule */ + + snprintf(label, sizeof(label), "ASID (start=0x%llx)", + (unsigned long long) start_va); + check_field(label, asid, 0); +} +/* Feed the encoder a table of page counts, a set of boundary addresses and out-of-range page counts, then print the tally and exit with status 1 if any check failed. */ +int main(void) +{ + printf("test-tlbi-encoder-host: RVAE1IS operand bit-field verification\n"); + + /* SCALE=0 NUM table: NUM = ceil(pages/2) - 1. + * pages 2 -> NUM 0 (covers 2) + * pages 3 -> NUM 1 (covers 4, over-invalidates by 1) + * pages 16 -> NUM 7 (covers 16, exact) + * pages 17 -> NUM 8 (covers 18) + * pages 32 -> NUM 15 (covers 32) + * pages 63 -> NUM 31 (covers 64) + * pages 64 -> NUM 31 (covers 64) + */ + verify_operand(0x10000000ULL, 2, 0); + verify_operand(0x10000000ULL, 3, 1); + verify_operand(0x10000000ULL, 16, 7); + verify_operand(0x10000000ULL, 17, 8); + verify_operand(0x10000000ULL, 32, 15); + verify_operand(0x10000000ULL, 63, 31); + verify_operand(0x10000000ULL, 64, 31); + + /* Boundary VAs. 4 KiB-aligned, low-VA, MMAP_BASE (8 GiB), high-VA + * just below the 48-bit BaseADDR truncation point. 
*/ + verify_operand(0x00000000ULL, 32, 15); /* zero base */ + verify_operand(0x200000000ULL, 32, 15); /* MMAP_BASE */ + verify_operand(0x800000000000ULL, 32, 15); /* Rosetta image */ + verify_operand(0x0000FFFFF0000000ULL, 32, 15); /* KBUF_USER_VA */ + + /* Pathological inputs the clamp must catch: + * pages = 0 -> clamped to 2 -> NUM 0 + * pages = 1 -> clamped to 2 -> NUM 0 (callers never reach here) + * pages = UINT16_MAX -> NUM clamped to 31 (saturating) + */ + verify_operand(0x10000000ULL, 0, 0); + verify_operand(0x10000000ULL, 1, 0); + verify_operand(0x10000000ULL, UINT16_MAX, 31); + + /* TG bit is the architectural lynchpin -- if the encoder ever drops + * it the integration tests on Apple Silicon would still pass. Pin a + * direct bit-46 inspection so a regression to TG=00 fails this test + * immediately. Bit 47 is required to be clear as well, so TG=11 + * cannot pass as TG=01. */ + uint64_t op = tlbi_rvae1is_operand(0x10000000ULL, 32); + if (op & (1ULL << 46)) { + passes++; + } else { + fails++; + fprintf(stderr, "FAIL TG bit 46 must be set (4 KiB granule, TG=01)\n"); + } + if (op & (1ULL << 47)) { + fails++; + fprintf(stderr, + "FAIL TG bit 47 must be clear (TG=01 has bit 47 = 0)\n"); + } else { + passes++; + } + /* Final tally line; the suffix and the exit status both depend only on fails. */ + printf("\ntest-tlbi-encoder-host: %d passed, %d failed%s\n", passes, fails, + fails == 0 ? " - PASS" : " - FAIL"); + return fails > 0 ? 1 : 0; +}