Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,25 @@ $(BUILD_DIR)/test-lowbase-mem-300000: tests/test-lowbase-mem.c | $(BUILD_DIR)
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -no-pie \
-Wl,-Ttext-segment=0x300000 -o $@ $<

# bench-hot-guard-glibc is the dynamic-glibc twin of bench-hot-guard.
# Built only when the cross-glibc toolchain ships its own sysroot
# (so a host without that toolchain can still run the rest of the
# suite). Linked without -static so glibc resolves time / urandom
# syscalls through the vDSO trampoline -- which is exactly what the
# guardrail script verifies against the 50 ns / 200 ns ceilings.
ifneq ($(wildcard $(LINUX_TOOLCHAIN)/aarch64-unknown-linux-gnu/sysroot/.),)
# -DGUARD_USE_LIBC_CG switches the bench's clock_gettime case from a
# direct vDSO trampoline call to the libc wrapper, so the dynamic-glibc
# build measures glibc's actual routing decision. A regression in the
# NT_GNU_ABI_TAG note or LINUX_2.6.39 versioning would push this
# measurement from ~7 ns up to SVC time (~2000 ns) and fail the
# guardrail.
$(BUILD_DIR)/bench-hot-guard-glibc: tests/bench-hot-guard.c | $(BUILD_DIR)
@echo " CROSS $< (dynamic glibc)"
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -DGUARD_USE_LIBC_CG=1 -O2 \
-o $@ $<
endif

endif

include mk/tests.mk
Expand Down
22 changes: 22 additions & 0 deletions mk/tests.mk
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,28 @@ check: $(ELFUSE_BIN) $(TEST_DEPS) check-syscall-coverage
@$(MAKE) --no-print-directory test-timeout-disable
@printf "\n$(BLUE)━━━ rosetta CLI gating ━━━$(RESET)\n"
@$(MAKE) --no-print-directory test-rosetta-cli
@printf "\n$(BLUE)━━━ hot-syscall guardrail ━━━$(RESET)\n"
@$(MAKE) --no-print-directory test-bench-guardrail

## Hot-syscall performance guardrail: ensure getpid, libc clock_gettime,
## and 1-byte /dev/urandom reads stay under their TODO ns/op ceilings.
## Builds the dynamic-glibc variant opportunistically; the script skips
## that arm when the cross-toolchain sysroot is missing.
BENCH_GUARDRAIL_DEPS := $(ELFUSE_BIN)
BENCH_GUARDRAIL_REQUIRE_STATIC := 0
ifndef GUEST_TEST_BINARIES
BENCH_GUARDRAIL_DEPS += $(BUILD_DIR)/bench-hot-guard
BENCH_GUARDRAIL_REQUIRE_STATIC := 1
ifneq ($(wildcard $(LINUX_TOOLCHAIN)/aarch64-unknown-linux-gnu/sysroot/.),)
BENCH_GUARDRAIL_DEPS += $(BUILD_DIR)/bench-hot-guard-glibc
endif
endif
test-bench-guardrail: $(BENCH_GUARDRAIL_DEPS)
@ELFUSE="$(ELFUSE_BIN)" \
BENCH_GUARDRAIL_DIR="$(TEST_DIR)" \
BENCH_GUARDRAIL_REQUIRE_STATIC="$(BENCH_GUARDRAIL_REQUIRE_STATIC)" \
LINUX_TOOLCHAIN="$(LINUX_TOOLCHAIN)" \
bash tests/test-bench-guardrail.sh

test-sysroot-rename: $(ELFUSE_BIN) $(BUILD_DIR)/test-sysroot-rename
@set -e; \
Expand Down
1 change: 1 addition & 0 deletions src/core/elf.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#define PT_LOAD 1
#define PT_DYNAMIC 2
#define PT_INTERP 3
#define PT_NOTE 4

/* Program header flags */
#define PF_X 1
Expand Down
1,032 changes: 855 additions & 177 deletions src/core/vdso.c

Large diffs are not rendered by default.

51 changes: 40 additions & 11 deletions src/core/vdso.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,23 @@
*/
uint64_t vdso_build(guest_t *g);

/* If the vvar anchor has not been seeded yet, install the supplied cntvct as
* the guest-frame anchor paired with the given monotonic and realtime
* wall_clock values. Idempotent: subsequent calls with initialized==1 are
* no-ops. Used by sys_clock_gettime to upgrade the first
* __kernel_clock_gettime SVC fallback into a permanent vvar fast path that
* serves both CLOCK_MONOTONIC and CLOCK_REALTIME.
/* Publish a new vvar anchor under the seqlock. Handles both the initial
* seed (seq 0 -> 1 -> 2) and refresh (seq 2K -> 2K+1 -> 2K+2) atomically
* through one CAS-then-release-store sequence. Concurrent publishers
* either lose the CAS or observe an odd seq and bail without blocking;
* trampoline readers detect mid-write tearing via their own LDAR
* snapshot/recheck. Callers (sys_clock_gettime / sys_gettimeofday) only
* need to invoke this when an SVC trap from the vDSO trampoline carries a
* trustworthy guest CNTVCT in X9.
*
* Overflow invariant: this function, the trampoline math, and
* vdso_realtime_drift_exceeded all depend on VDSO_ANCHOR_AGE_SHIFT == 22
* capping the per-call CNTVCT delta below 2^22. That bound keeps
* (delta * 699050666) below 2^52 (no uint64 overflow) and keeps
* anchor_nsec + delta_ns below 2e9 (so the trampoline's sub-1e9 carry
* collapses to a single SUBS + CSEL + CINC instead of a UDIV). The
* host-side drift check must apply the same formula and the same cap;
* any divergence lets the trampoline interpolate from a stale anchor.
*/
void vdso_seed_anchor(guest_t *g,
uint64_t guest_cntvct,
Expand All @@ -56,12 +67,13 @@ void vdso_seed_anchor(guest_t *g,
* + 4, so callers compare ELR_EL1 against that.
*/
uint64_t vdso_clock_gettime_svc_pc(void);
uint64_t vdso_gettimeofday_svc_pc(void);

/* Returns true once the vvar anchor has been published (initialized==1) and
* the fast path can never be reseeded. Lets the post-SVC handler in
* sys_clock_gettime skip the ELR_EL1 + X9 HVF reads it otherwise needs for
* the seeding gate, since the second-call onward gate is moot once seeded.
* Uses acquire ordering paired with vdso_seed_anchor's release store.
/* Returns true when the seqlock counter is at a stable (nonzero, even)
* generation, i.e. an anchor has been published and no publish is in
* flight. Uses acquire ordering paired with vdso_seed_anchor's release
* store of the next even generation. Callers use this to gate the age /
* drift checks that decide whether to publish a refresh.
*/
bool vdso_anchor_is_seeded(guest_t *g);

Expand All @@ -72,3 +84,20 @@ bool vdso_anchor_is_seeded(guest_t *g);
*/
void vdso_attention_or(guest_t *g, uint32_t bits);
void vdso_attention_and(guest_t *g, uint32_t mask);

/* True iff the anchor is currently stable AND (current_cntvct -
* anchor_cntvct) has exceeded the trampoline's age cap. The host uses
* this with a freshly-sampled CNTVCT to decide whether to publish a
* refresh through vdso_seed_anchor.
*/
bool vdso_anchor_age_exceeded(guest_t *g, uint64_t current_cntvct);

/* True iff the anchor is seeded AND the wall-clock value predicted from
* the anchor + CNTVCT delta differs from the supplied freshly-sampled
* REALTIME (real_sec, real_nsec) by more than VDSO_ANCHOR_MAX_DRIFT_NS.
* Catches macOS NTP steps that shift wall time without bumping CNTVCT.
*/
bool vdso_realtime_drift_exceeded(guest_t *g,
uint64_t current_cntvct,
int64_t real_sec,
int64_t real_nsec);
153 changes: 107 additions & 46 deletions src/syscall/time.c
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,27 @@ typedef struct {

/* Time/timer syscall handlers. */

#define LINUX_COARSE_CLOCK_RES_NS 1000000

/* Fixed clock_getres() answers for a known set of Linux clock ids.
 *
 * @clockid: Linux-side clock id as passed by the guest.
 * @ts:      out-parameter; overwritten only when the id is recognised.
 *
 * Returns true and stores {0 s, 1 ns} for REALTIME, MONOTONIC,
 * MONOTONIC_RAW and BOOTTIME, or {0 s, LINUX_COARSE_CLOCK_RES_NS} for the
 * two *_COARSE clocks. Returns false and leaves *ts untouched for any other
 * id, so the caller can pick its own fallback.
 */
static bool linux_clock_getres_fixed(int clockid, linux_timespec_t *ts)
{
    long res_ns;

    if (clockid == 0 /* CLOCK_REALTIME */ ||
        clockid == 1 /* CLOCK_MONOTONIC */ ||
        clockid == 4 /* CLOCK_MONOTONIC_RAW */ ||
        clockid == 7 /* CLOCK_BOOTTIME */) {
        res_ns = 1;
    } else if (clockid == 5 /* CLOCK_REALTIME_COARSE */ ||
               clockid == 6 /* CLOCK_MONOTONIC_COARSE */) {
        res_ns = LINUX_COARSE_CLOCK_RES_NS;
    } else {
        /* Not covered by the fixed table. */
        return false;
    }

    /* Whole-struct assignment, so every member is given a defined value. */
    *ts = (linux_timespec_t) {.tv_sec = 0, .tv_nsec = res_ns};
    return true;
}

int64_t sys_clock_getres(guest_t *g, int clockid, uint64_t tp_gva)
{
int mac_clockid = translate_clockid(clockid);
Expand All @@ -231,9 +252,16 @@ int64_t sys_clock_getres(guest_t *g, int clockid, uint64_t tp_gva)
if (!tp_gva)
return 0;

struct timespec ts;
if (clock_getres(mac_clockid, &ts) < 0)
return linux_errno();
linux_timespec_t ts;
if (!linux_clock_getres_fixed(clockid, &ts)) {
struct timespec host_ts;
if (clock_getres(mac_clockid, &host_ts) < 0)
return linux_errno();
ts = (linux_timespec_t) {
.tv_sec = host_ts.tv_sec,
.tv_nsec = host_ts.tv_nsec,
};
}

if (guest_write_small(g, tp_gva, &ts, sizeof(ts)) < 0)
return -LINUX_EFAULT;
Expand All @@ -247,64 +275,46 @@ int64_t sys_clock_gettime(guest_t *g, int clockid, uint64_t tp_gva)
if (mac_clockid < 0)
return -LINUX_EINVAL;

/* If this trap came from the __kernel_clock_gettime vDSO svc_fallback,
* the trampoline parked the guest's CNTVCT_EL0 read in X9 before
* issuing SVC, and ELR_EL1 holds the address immediately after that
* SVC. Pair X9 with both the MONOTONIC and REALTIME wall_clocks and
* seed the vvar so subsequent calls hit the fast path for either
* clockid. Skip the seed for any other trap (raw
* syscall(SYS_clock_gettime, ...) from guest code, etc.): X9 is
* then arbitrary guest state, and seeding from it would poison the
* anchor and break every later fast-path call.
*
* Skip the gate entirely once the anchor is published: vdso_seed_anchor
* is a one-shot CAS that can never fire again, so the HVF reads of
* ELR_EL1 and X9 below would be pure waste on every subsequent trap.
* Both clockid 0 (REALTIME) and clockid 1 (MONOTONIC) take the vDSO
* fast path, so either may be the first caller; either way both
* anchor pairs are seeded from a single set of host clock_gettime
* calls.
/* When the trap came from the __kernel_clock_gettime vDSO
* svc_fallback, the trampoline parked the guest's CNTVCT_EL0 read in
* X9 before SVC, and ELR_EL1 holds SVC_PC + 4. Use X9 to seed (or
* refresh) the vvar anchor so subsequent calls hit the fast path.
* Reject any other trap: X9 would then be arbitrary guest state and
* seeding from it would poison the anchor.
*
* Order matters: read X9 first, then sample both host wall clocks
* back-to-back, then write to guest and seed. Sampling host clocks
* before checking X9 would bake a permanent positive bias (~50-200 ns)
* into the anchor because every host call ages the X9 timestamp by
* the seeding gate's HVF round-trip. The back-to-back wall-clock
* reads minimize MONO/REAL skew within the anchor.
* Order matters: read X9 first, then sample host wall clocks
* back-to-back, then write the guest result and seed. Sampling host
* clocks before checking X9 would bake a permanent positive bias
* into the anchor from the HVF round-trip in the seeding gate.
*/
bool seed_eligible = (clockid == 0 /* CLOCK_REALTIME */ ||
clockid == 1 /* CLOCK_MONOTONIC */) &&
current_thread && !vdso_anchor_is_seeded(g);
bool from_trampoline = (clockid == 0 /* CLOCK_REALTIME */ ||
clockid == 1 /* CLOCK_MONOTONIC */) &&
current_thread;

uint64_t guest_cntvct = 0;
if (seed_eligible) {
if (from_trampoline) {
uint64_t elr = 0;
if (hv_vcpu_get_sys_reg(current_thread->vcpu, HV_SYS_REG_ELR_EL1,
&elr) != HV_SUCCESS ||
elr != vdso_clock_gettime_svc_pc() + 4 ||
hv_vcpu_get_reg(current_thread->vcpu, HV_REG_X9, &guest_cntvct) !=
HV_SUCCESS ||
guest_cntvct == 0) {
/* Trap came from a path other than the vDSO trampoline; X9 is
* arbitrary, fall through to the non-seeding path.
*/
seed_eligible = false;
}
guest_cntvct == 0)
from_trampoline = false;
}

struct timespec ts;
if (clock_gettime(mac_clockid, &ts) < 0)
return linux_errno();

/* For the seeding path, sample the OTHER clockid back-to-back so both
* anchor pairs reflect roughly the same host moment. If the second
* clock_gettime fails (unreachable on macOS but defensive), skip
* seeding rather than fail the user's request: the user already has
* the value they asked for.
/* Sample the OTHER clockid back-to-back so both anchor pairs reflect
* roughly the same host moment. If the second clock_gettime fails
* (defensive; unreachable on macOS), skip seeding rather than fail
* the user's request.
*/
struct timespec ts_other;
bool can_seed = false;
if (seed_eligible) {
if (from_trampoline) {
int other_mac = (clockid == 1) ? CLOCK_REALTIME : CLOCK_MONOTONIC;
if (clock_gettime(other_mac, &ts_other) == 0)
can_seed = true;
Expand All @@ -316,8 +326,17 @@ int64_t sys_clock_gettime(guest_t *g, int clockid, uint64_t tp_gva)
if (can_seed) {
const struct timespec *ts_mono = (clockid == 1) ? &ts : &ts_other;
const struct timespec *ts_real = (clockid == 0) ? &ts : &ts_other;
vdso_seed_anchor(g, guest_cntvct, ts_mono->tv_sec, ts_mono->tv_nsec,
ts_real->tv_sec, ts_real->tv_nsec);

/* Publish when the vvar is unseeded, has aged out, or has
* drifted relative to the freshly-sampled REALTIME (catches
* macOS NTP steps).
*/
if (!vdso_anchor_is_seeded(g) ||
vdso_anchor_age_exceeded(g, guest_cntvct) ||
vdso_realtime_drift_exceeded(g, guest_cntvct, ts_real->tv_sec,
ts_real->tv_nsec))
vdso_seed_anchor(g, guest_cntvct, ts_mono->tv_sec, ts_mono->tv_nsec,
ts_real->tv_sec, ts_real->tv_nsec);
}

return 0;
Expand Down Expand Up @@ -391,13 +410,55 @@ int64_t sys_clock_nanosleep(guest_t *g,

int64_t sys_gettimeofday(guest_t *g, uint64_t tv_gva, uint64_t tz_gva)
{
(void) tz_gva; /* timezone is obsolete */
bool from_trampoline = current_thread;
uint64_t guest_cntvct = 0;
if (from_trampoline) {
uint64_t elr = 0;
if (hv_vcpu_get_sys_reg(current_thread->vcpu, HV_SYS_REG_ELR_EL1,
&elr) != HV_SUCCESS ||
elr != vdso_gettimeofday_svc_pc() + 4 ||
hv_vcpu_get_reg(current_thread->vcpu, HV_REG_X9, &guest_cntvct) !=
HV_SUCCESS ||
guest_cntvct == 0)
from_trampoline = false;
}

struct timeval tv;
if (gettimeofday(&tv, NULL) < 0)
return linux_errno();

if (tv_gva && guest_write_small(g, tv_gva, &tv, sizeof(tv)) < 0)
struct timespec ts_mono;
struct timespec ts_real;
bool can_seed = false;
if (from_trampoline && clock_gettime(CLOCK_MONOTONIC, &ts_mono) == 0 &&
clock_gettime(CLOCK_REALTIME, &ts_real) == 0)
can_seed = true;

linux_timeval_t ltv = {
.tv_sec = tv.tv_sec,
.tv_usec = tv.tv_usec,
};
if (tv_gva && guest_write_small(g, tv_gva, &ltv, sizeof(ltv)) < 0)
return -LINUX_EFAULT;

/* tz is obsolete on Linux but the kernel still zeroes a non-null
* pointer (struct timezone has two int32 fields, 8 bytes total).
* Matching the vDSO fast path's `str xzr, [tz]` here keeps SVC and
* fast-path callers observationally identical.
*/
if (tz_gva) {
const uint64_t tz_zero = 0;
if (guest_write_small(g, tz_gva, &tz_zero, sizeof(tz_zero)) < 0)
return -LINUX_EFAULT;
}

if (can_seed && (!vdso_anchor_is_seeded(g) ||
vdso_anchor_age_exceeded(g, guest_cntvct) ||
vdso_realtime_drift_exceeded(
g, guest_cntvct, ts_real.tv_sec, ts_real.tv_nsec)))
vdso_seed_anchor(g, guest_cntvct, ts_mono.tv_sec, ts_mono.tv_nsec,
ts_real.tv_sec, ts_real.tv_nsec);

return 0;
}

Expand Down
Loading
Loading