Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ SRCS := \
core/elf.c \
core/stack.c \
core/vdso.c \
core/shim-globals.c \
core/bootstrap.c \
core/rosetta.c \
core/sysroot.c \
Expand Down Expand Up @@ -160,6 +161,24 @@ $(BUILD_DIR)/test-pthread: tests/test-pthread.c | $(BUILD_DIR)
@echo " CROSS $< (with -lpthread)"
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread

# test-shim-cred-race spawns a pthread reader while the main thread
# toggles setresuid; the reader spins on the identity fast path.
$(BUILD_DIR)/test-shim-cred-race: tests/test-shim-cred-race.c | $(BUILD_DIR)
@echo " CROSS $< (with -lpthread)"
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread

# test-shim-urandom-smp spawns N pthreads racing on a shared FD_URANDOM
# slot to exercise the shim's LDXR/STXR head-advance under contention.
$(BUILD_DIR)/test-shim-urandom-smp: tests/test-shim-urandom-smp.c | $(BUILD_DIR)
@echo " CROSS $< (with -lpthread)"
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread

# test-shim-urandom-toctou races mprotect(PROT_NONE) against urandom
# reads to exercise the EL1 data abort recovery path. Needs pthreads.
$(BUILD_DIR)/test-shim-urandom-toctou: tests/test-shim-urandom-toctou.c | $(BUILD_DIR)
@echo " CROSS $< (with -lpthread)"
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread

# test-fuse-basic runs a guest daemon thread and consumer in one process
$(BUILD_DIR)/test-fuse-basic: tests/test-fuse-basic.c | $(BUILD_DIR)
@echo " CROSS $< (with -lpthread)"
Expand Down
148 changes: 124 additions & 24 deletions src/core/bootstrap.c

Large diffs are not rendered by default.

50 changes: 44 additions & 6 deletions src/core/elf.c
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,16 @@ int elf_map_segments(const elf_info_t *info,
const char *path,
void *guest_base,
uint64_t guest_size,
uint64_t load_base)
uint64_t load_base,
uint64_t infra_lo,
uint64_t infra_hi)
{
/* The infra reserve is the half-open range [infra_lo, infra_hi); the
* overlap checks below are half-open interval intersection tests against
* it. When infra_lo >= infra_hi the range is empty and the caller has
* opted out (early bring-up before guest_t is wired up); the host-side
* writes that follow still get the existing guest_size bound check.
*/
bool infra_active = infra_lo < infra_hi;
FILE *f = fopen(path, "rb");
if (!f) {
perror(path);
Expand Down Expand Up @@ -264,6 +272,17 @@ int elf_map_segments(const elf_info_t *info,
fclose(f);
return -1;
}
if (infra_active && phdr_dest < infra_hi &&
phdr_dest + ph_total > infra_lo) {
log_error(
"%s: program headers at 0x%llx overlap infra reserve "
"[0x%llx, 0x%llx)",
path, (unsigned long long) phdr_dest, (unsigned long long) infra_lo,
(unsigned long long) infra_hi);
free(ph_buf);
fclose(f);
return -1;
}
memcpy((uint8_t *) guest_base + phdr_dest, ph_buf, ph_total);

/* Copy PT_LOAD contents after AT_PHDR is in place; ET_DYN segments are
Expand Down Expand Up @@ -308,15 +327,34 @@ int elf_map_segments(const elf_info_t *info,
return -1;
}

/* Zero the full page-aligned segment extent, not only p_memsz.
* Linux guarantees zero-filled tail bytes in the last mapped page,
* and some dynamic linkers allocate from that page tail before they
* request more memory. Leaving stale bytes there leaks state across
* execve and corrupts the new image.
/* The host memset zeros PAGE_ALIGN_UP(memsz) bytes, not just memsz,
* so the infra-overlap check has to use the same rounded extent.
* Without the rounding here, a segment that ends just below
* infra_lo passes the check and still spills up to PAGE_SIZE-1
* bytes of zero into the infra reserve via the page tail.
*/
uint64_t zero_len = PAGE_ALIGN_UP(memsz);
if (gpa + zero_len > guest_size)
zero_len = guest_size - gpa;
if (infra_active && gpa < infra_hi && gpa + zero_len > infra_lo) {
log_error(
"%s: segment at 0x%llx+0x%llx (zero-extent 0x%llx) overlaps "
"infra reserve [0x%llx, 0x%llx)",
path, (unsigned long long) gpa, (unsigned long long) memsz,
(unsigned long long) zero_len, (unsigned long long) infra_lo,
(unsigned long long) infra_hi);
free(ph_buf);
fclose(f);
return -1;
}

/* Zero the full page-aligned segment extent (zero_len computed above
* with guest_size and infra_reserve checks). Linux guarantees
* zero-filled tail bytes in the last mapped page, and some dynamic
* linkers allocate from that page tail before they request more
* memory. Leaving stale bytes there leaks state across execve and
* corrupts the new image.
*/
memset((uint8_t *) guest_base + gpa, 0, zero_len);

/* Overlay initialized bytes after zeroing so BSS and page tail remain
Expand Down
9 changes: 8 additions & 1 deletion src/core/elf.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,20 @@ int elf_load(const char *path, elf_info_t *info);
* Also copies program headers into guest memory for AT_PHDR.
* load_base is added to all virtual addresses (0 for ET_EXEC at link addr,
* non-zero for ET_DYN loaded at a chosen base).
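* infra_lo and infra_hi delimit the runtime infra reserve (page-table pool,
* shim text, shim_data, vDSO) as the half-open range [infra_lo, infra_hi).
* The program-header copy and every PT_LOAD segment (measured over its
* page-rounded zero-fill extent, not just p_memsz) are rejected if their
* destination intersects that range: those writes go through host_base
* directly and would otherwise bypass the EL1-only page-table protection on
* shim_data. An empty range (infra_lo >= infra_hi) disables the check; pass
* 0,0 only when the guest_t is not yet built.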
* Returns 0 on success, -1 on failure.
*/
int elf_map_segments(const elf_info_t *info,
const char *path,
void *guest_base,
uint64_t guest_size,
uint64_t load_base);
uint64_t load_base,
uint64_t infra_lo,
uint64_t infra_hi);

/* Resolve a PT_INTERP path against a sysroot directory.
* Tries three strategies:
Expand Down
69 changes: 64 additions & 5 deletions src/core/guest.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include <unistd.h>

#include "core/guest.h"
#include "core/startup-trace.h"
#include "debug/log.h"
#include "utils.h"
#include "runtime/thread.h" /* thread_destroy_all_vcpus */
Expand All @@ -60,6 +61,7 @@ static void guest_region_clear(guest_t *g);
#define PT_UXN (1ULL << 54) /* Unprivileged Execute Never */
#define PT_PXN (1ULL << 53) /* Privileged Execute Never */
#define PT_AP_RW_EL0 (1ULL << 6) /* AP[2:1]=01: RW at EL1, RW at EL0 */
#define PT_AP_RW_EL1 (0ULL << 6) /* AP[2:1]=00: RW at EL1, no access EL0 */
#define PT_AP_RO (3ULL << 6) /* AP[2:1]=11: RO at EL1, RO at EL0 */

/* PAGE_SIZE / ALIGN_2MB_* live in utils.h; BLOCK_2MIB lives in core/guest.h. */
Expand Down Expand Up @@ -202,6 +204,8 @@ static uint64_t *pt_at(const guest_t *g, uint64_t gpa)

int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
{
uint64_t t0;

memset(g, 0, sizeof(*g));
g->shm_fd = -1;
g->ipa_base = GUEST_IPA_BASE;
Expand Down Expand Up @@ -257,6 +261,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
* seconds max wait) to handle this gracefully.
*/
hv_return_t ret = HV_ERROR;
t0 = startup_trace_now_ns();
for (int attempt = 0; attempt < 30; attempt++) {
hv_vm_config_t config = hv_vm_config_create();
hv_vm_config_set_ipa_size(config, vm_ipa);
Expand All @@ -266,6 +271,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
break;
usleep(500000); /* 500ms between attempts */
}
startup_trace_step("hv_vm_create", t0);
if (ret != HV_SUCCESS) {
log_error("guest: hv_vm_create failed: %d (ipa_bits=%u)", (int) ret,
vm_ipa);
Expand Down Expand Up @@ -307,8 +313,10 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
* physical memory. Do NOT memset because that would touch every
* page and defeat demand paging.
*/
t0 = startup_trace_now_ns();
g->host_base = mmap(NULL, try_size, PROT_READ | PROT_WRITE,
MAP_ANON | MAP_PRIVATE, -1, 0);
startup_trace_step("primary_mmap", t0);
if (g->host_base == MAP_FAILED) {
perror("guest: mmap");
g->host_base = NULL;
Expand All @@ -320,6 +328,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
* path instead of SCM_RIGHTS fd passing.
*/
char tmppath[] = "/tmp/elfuse-XXXXXX";
t0 = startup_trace_now_ns();
int sfd = mkstemp(tmppath);
if (sfd >= 0) {
unlink(tmppath); /* Unlink immediately; fd keeps file alive */
Expand All @@ -335,9 +344,12 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
close(sfd);
}
}
startup_trace_step("cow_shm_upgrade", t0);

t0 = startup_trace_now_ns();
ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, try_size,
HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC);
startup_trace_step("hv_vm_map", t0);
if (ret == HV_SUCCESS) {
mapped_size = try_size;
mapped = true;
Expand Down Expand Up @@ -380,6 +392,8 @@ int guest_init_from_shm(guest_t *g,
uint64_t size,
uint32_t ipa_bits)
{
uint64_t t0;

memset(g, 0, sizeof(*g));
g->shm_fd = -1; /* Child does not own the shm */
g->ipa_base = GUEST_IPA_BASE;
Expand All @@ -403,8 +417,10 @@ int guest_init_from_shm(guest_t *g,
* the parent's frozen snapshot; writes are private to this process.
* macOS CoW is page-granular: only modified pages are duplicated.
*/
t0 = startup_trace_now_ns();
g->host_base =
mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, shm_fd, 0);
startup_trace_step("shm_mmap", t0);
if (g->host_base == MAP_FAILED) {
perror("guest: mmap shm");
g->host_base = NULL;
Expand All @@ -417,6 +433,7 @@ int guest_init_from_shm(guest_t *g,

/* Create HVF VM with the same IPA width as the parent */
hv_return_t ret = HV_ERROR;
t0 = startup_trace_now_ns();
for (int attempt = 0; attempt < 30; attempt++) {
hv_vm_config_t config = hv_vm_config_create();
hv_vm_config_set_ipa_size(config, ipa_bits);
Expand All @@ -426,15 +443,18 @@ int guest_init_from_shm(guest_t *g,
break;
usleep(500000);
}
startup_trace_step("hv_vm_create_shm", t0);
if (ret != HV_SUCCESS) {
log_error("guest: hv_vm_create (shm) failed: %d", (int) ret);
munmap(g->host_base, size);
g->host_base = NULL;
return -1;
}

t0 = startup_trace_now_ns();
ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, size,
HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC);
startup_trace_step("hv_vm_map_shm", t0);
if (ret != HV_SUCCESS) {
log_error("guest: hv_vm_map (shm) failed: %d", (int) ret);
hv_vm_destroy();
Expand Down Expand Up @@ -1106,6 +1126,16 @@ static int gva_translate_perm(const guest_t *g,
return -1;

int perms = desc_to_perms(l3[l3_idx]);
/* EL1-only pages (shim_data) are inaccessible to guest EL0 in the
* page tables; the host accessors that act on a guest-supplied GVA
* must refuse them too, otherwise a guest could pass a shim_data
* GVA as a syscall buffer and have the host write into the identity
* cache or entropy ring on its behalf. The host's own publishers
* use direct host_base+shim_data_base arithmetic and bypass this
* walker entirely.
*/
if (perms & MEM_PERM_EL1_ONLY)
return -1;
if ((perms & required_perms) != required_perms)
return -1;

Expand Down Expand Up @@ -1136,6 +1166,12 @@ static int gva_translate_perm(const guest_t *g,

/* L2 block descriptor: 2MiB granularity. */
int perms = desc_to_perms(l2[l2_idx]);
/* See the L3 page-descriptor branch above: EL1-only blocks are
* inaccessible to host-on-behalf-of-guest accesses for the same
* reason. shim_data is mapped as a 2MiB EL1-only block at boot.
*/
if (perms & MEM_PERM_EL1_ONLY)
return -1;
if ((perms & required_perms) != required_perms)
return -1;

Expand Down Expand Up @@ -2079,10 +2115,20 @@ static uint64_t make_block_desc(uint64_t gpa, int perms)
}

/* Write permissions via AP bits:
* AP[2:1]=00 -> RW for EL1 only (no EL0 access)
* AP[2:1]=01 -> RW for EL1 and EL0
* AP[2:1]=11 -> RO for EL1 and EL0
* MEM_PERM_EL1_ONLY drops EL0 access entirely; used for shim_data
* so the guest cannot directly read or store to the cache, ring,
* bitmap, or attention flag.
*/
if (perms & MEM_PERM_W) {
if (perms & MEM_PERM_EL1_ONLY) {
desc |= PT_AP_RW_EL1;
/* EL1-only data is never executable at either exception level:
* UXN | PXN are already set above when MEM_PERM_X is unset, but set
* them again here defensively.
*/
desc |= PT_UXN | PT_PXN;
} else if (perms & MEM_PERM_W) {
desc |= PT_AP_RW_EL0;
} else {
desc |= PT_AP_RO;
Expand Down Expand Up @@ -2513,22 +2559,35 @@ static uint64_t make_page_desc(uint64_t pa, int perms)
if (!(perms & MEM_PERM_X))
desc |= PT_UXN | PT_PXN;

if (perms & MEM_PERM_W)
if (perms & MEM_PERM_EL1_ONLY) {
desc |= PT_AP_RW_EL1;
desc |= PT_UXN | PT_PXN; /* EL1-only data never executes */
} else if (perms & MEM_PERM_W) {
desc |= PT_AP_RW_EL0;
else
} else {
desc |= PT_AP_RO;
}

return desc;
}

/* Extract MEM_PERM_* flags from a page table descriptor (block or page). */
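/* Extract MEM_PERM_* flags from a page table descriptor (block or page).
* The AP[2:1] field encodes the EL1/EL0 access matrix; map 00 to
* MEM_PERM_RW | MEM_PERM_EL1_ONLY so callers see the privileged-only
* shim_data slots correctly instead of treating them as read-only.
* AP[2:1]=10 (read-only at EL1, no EL0 access) is never emitted by
* make_block_desc/make_page_desc; if one were present it would decode
* here as plain MEM_PERM_R, without MEM_PERM_EL1_ONLY.
*/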
static int desc_to_perms(uint64_t desc)
{
int perms = MEM_PERM_R;
if (!(desc & PT_UXN))
perms |= MEM_PERM_X;
if ((desc & (3ULL << 6)) == PT_AP_RW_EL0)
uint64_t ap = desc & (3ULL << 6);
if (ap == PT_AP_RW_EL0) {
perms |= MEM_PERM_W;
} else if (ap == PT_AP_RW_EL1) {
perms |= MEM_PERM_W | MEM_PERM_EL1_ONLY;
}
/* PT_AP_RO (11) stays MEM_PERM_R only. */
return perms;
}

Expand Down
24 changes: 16 additions & 8 deletions src/core/guest.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,20 +127,28 @@
#define MEM_PERM_R (1 << 0)
#define MEM_PERM_W (1 << 1)
#define MEM_PERM_X (1 << 2)
/* AP[2:1]=00: privileged-only (no EL0 read/write). Combine with MEM_PERM_R/W.
* Used for shim_data so the guest cannot directly read or store to the identity
* cache, urandom bitmap, ring, or attention flag. The EL1 shim still has full
* RW. EL0 reads/writes fault to the EL0-fault path (SIGSEGV in the guest),
* matching what Linux does for kernel-only pages exposed in /proc/self/maps.
*/
#define MEM_PERM_EL1_ONLY (1 << 3)
#define MEM_PERM_RX (MEM_PERM_R | MEM_PERM_X)
#define MEM_PERM_RW (MEM_PERM_R | MEM_PERM_W)
#define MEM_PERM_RW_EL1_ONLY (MEM_PERM_R | MEM_PERM_W | MEM_PERM_EL1_ONLY)

/* A contiguous region of guest memory to be mapped in page tables.
*
* Default mode (va_base == 0): identity-mapped, VA == GPA. Used by every
* boot region (shim, vDSO, brk, stack) and every aarch64 ELF segment.
* Default mode (va_base == 0): identity-mapped, VA == GPA. Used by every boot
* region (shim, vDSO, brk, stack) and every aarch64 ELF segment.
*
* Rosetta segments use va_base != 0 to install a non-identity mapping:
* the rosetta ELF is statically linked at 0x800000000000 (128 TiB) but its
* bytes live in the primary buffer at a low GPA. Page-table entries are
* indexed by va_base + (offset within region) and emit a block descriptor
* whose output address is gpa_start + (offset within region). This is the
* only place in elfuse where guest VA diverges from guest GPA.
* Rosetta segments use va_base != 0 to install a non-identity mapping: the
* rosetta ELF is statically linked at 0x800000000000 (128 TiB) but its bytes
* live in the primary buffer at a low GPA. Page-table entries are indexed by
* va_base + (offset within region) and emit a block descriptor whose output
* address is gpa_start + (offset within region). This is the only place in
* elfuse where guest VA diverges from guest GPA.
*/
typedef struct {
uint64_t gpa_start; /* Output GPA / IPA (2MiB aligned) */
Expand Down
Loading
Loading