Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/core/bootstrap.c
Original file line number Diff line number Diff line change
Expand Up @@ -709,6 +709,7 @@ int guest_bootstrap_create_vcpu(guest_t *g,
shim_globals_publish_pid(g, proc_get_pid(), proc_get_ppid());
shim_globals_publish_creds(g, proc_get_uid(), proc_get_euid(),
proc_get_gid(), proc_get_egid());
proc_publish_pgsid_snapshot(g);
/* Pre-fill the entropy ring so the first read(/dev/urandom) from the guest
* is served by the shim fast path with no cold-start HVC for refill.
*/
Expand Down
135 changes: 129 additions & 6 deletions src/core/shim-globals.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
* src/core/shim.S.
*/

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sched.h>
Expand Down Expand Up @@ -60,6 +62,35 @@ _Static_assert(SHIM_URANDOM_OFF_RING_LOCK == 0x10C0,
"shim.S urandom fast path hard-codes RING_LOCK off 0x10C0");
_Static_assert(FD_TABLE_SIZE == 1024,
"shim.S urandom fast path hard-codes FD_TABLE_SIZE 1024");
/* The urandom/getrandom fast path in shim.S carries its own copy of the
* per-call byte cap; pin the C macro to that literal.
*/
_Static_assert(SHIM_URANDOM_INLINE_LIMIT == 256,
"shim.S urandom/getrandom fast path hard-codes 256-byte cap");

/* shim.S COUNTER_INC macro hardcodes (SHIM_COUNTERS_OFF & 0xFFF) and the
* 0x1, lsl #12 carry. Keep the literal in sync so a layout shift fails
* the build rather than silently routing increments to the wrong slot.
*/
_Static_assert(SHIM_COUNTERS_OFF == 0x10C8,
"shim.S COUNTER_INC hard-codes SHIM_COUNTERS_OFF=0x10C8");
/* shim.S splits SHIM_COUNTERS_OFF into a shifted-add carry (0x1000) plus
* an imm12 load/store offset (0xC8 + slot byte). Pin the split so any
* future layout shift fails the build instead of silently routing
* increments to the wrong slot.
*/
_Static_assert((SHIM_COUNTERS_OFF & 0xFFF) == 0xC8,
"shim.S SHIM_COUNTERS_OFF_LO12 hard-coded to 0xC8");
_Static_assert((SHIM_COUNTERS_OFF & ~0xFFF) == 0x1000,
"shim.S SHIM_COUNTERS_OFF_HI hard-coded to 0x1000");
/* The getpgid / getsid fast paths in shim.S hard-code the byte offsets of
* the PGID and SID slots; pin both so moving either slot breaks the build.
*/
_Static_assert(SHIM_IDENTITY_OFF_PGID == 0x1148,
"shim.S getpgid fast path hard-codes PGID off 0x1148");
_Static_assert(SHIM_IDENTITY_OFF_SID == 0x1150,
"shim.S getsid fast path hard-codes SID off 0x1150");
/* Region bounds: the SID slot is the last 8 bytes that must be covered by
* SHIM_GLOBALS_SIZE, and the whole region has to stay within BLOCK_2MIB.
*/
_Static_assert(SHIM_GLOBALS_SIZE >= SHIM_IDENTITY_OFF_SID + 8,
"SHIM_GLOBALS_SIZE must cover the PGID/SID slots");
_Static_assert(SHIM_GLOBALS_SIZE <= BLOCK_2MIB,
"SHIM_GLOBALS_SIZE must fit inside the 2 MiB shim_data block");
/* SHIM_COUNTERS_N slots of 8 bytes each must end at or before the PGID
* slot so counter increments can never land on identity data.
*/
_Static_assert(SHIM_COUNTERS_OFF + SHIM_COUNTERS_N * 8 <=
SHIM_IDENTITY_OFF_PGID,
"counter array must not overlap the PGID slot");

static uint8_t *cache_base(const guest_t *g)
{
Expand Down Expand Up @@ -114,6 +145,13 @@ void shim_globals_publish_creds(guest_t *g,
store_u64(page, SHIM_IDENTITY_OFF_EGID, egid);
}

/* Publish pgid and sid into the page returned by cache_base(g).
 *
 * Each signed value is converted to uint64_t and handed to store_u64 for
 * its own slot: SHIM_IDENTITY_OFF_PGID first, then SHIM_IDENTITY_OFF_SID.
 */
void shim_globals_publish_pgsid(guest_t *g, int64_t pgid, int64_t sid)
{
    uint8_t *const globals = cache_base(g);
    const uint64_t pgid_bits = (uint64_t) pgid;
    const uint64_t sid_bits = (uint64_t) sid;

    store_u64(globals, SHIM_IDENTITY_OFF_PGID, pgid_bits);
    store_u64(globals, SHIM_IDENTITY_OFF_SID, sid_bits);
}

uint64_t shim_globals_gva(const guest_t *g)
{
return g->shim_data_base;
Expand Down Expand Up @@ -242,9 +280,18 @@ void shim_globals_rebuild_urandom_bitmap(void)
}

/* arc4random_buf is documented as deadlock-free and re-entrant. Used
* by both the initial fill at bootstrap and by the slow-path refill
* that runs from sys_read when the shim's fast path falls through due
* to an empty ring.
* by the initial fill at bootstrap and by the slow-path refill that
* runs from sys_read/sys_getrandom when the shim's fast path falls
* through due to an empty ring.
*
* Entropy is generated OUTSIDE the ring_lock: arc4random_buf can take
* microseconds, and any sibling vCPU that hits the fast path while the
* lock is held spins (yield) until release. Generate up to a full ring
* into a stack scratch buffer, then take the lock only to re-read
* head/fill and copy the publishable prefix into the ring. The recheck
* after lock acquire matters: a concurrent fast path may have advanced
* head while entropy was being generated, raising the publishable
* count beyond the pre-lock estimate.
*/
void shim_globals_refill_urandom_ring(guest_t *g)
{
Expand All @@ -254,13 +301,31 @@ void shim_globals_refill_urandom_ring(guest_t *g)
uint32_t *lock_p = (uint32_t *) (base + SHIM_URANDOM_OFF_RING_LOCK);
uint8_t *ring = base + SHIM_URANDOM_OFF_RING;

/* Pre-lock estimate: skip the arc4random_buf + lock when the ring
* is already full. Both cursors are read RELAXED so a torn snapshot
* (head_pre observed past a producer step but tail_pre observed
* before it) can make tail_pre - head_pre wrap to a huge unsigned
* value. A loose ">= RING_SIZE" check would treat that garbage as
* "already full" and skip a genuinely-needed refill. Only the exact
* == RING_SIZE value is a safe full-detection; any other (valid or
* torn) reading falls through to the lock-held recheck below.
*/
uint32_t head_pre = __atomic_load_n(head_p, __ATOMIC_RELAXED);
uint32_t tail_pre = __atomic_load_n(tail_p, __ATOMIC_RELAXED);
uint32_t fill_pre = tail_pre - head_pre;
if (fill_pre == SHIM_URANDOM_RING_SIZE)
return;

uint8_t scratch[SHIM_URANDOM_RING_SIZE];
arc4random_buf(scratch, sizeof(scratch));

urandom_ring_lock(lock_p);

uint32_t head = __atomic_load_n(head_p, __ATOMIC_ACQUIRE);
uint32_t tail = __atomic_load_n(tail_p, __ATOMIC_RELAXED);
uint32_t fill = tail - head;
if (fill >= SHIM_URANDOM_RING_SIZE)
goto out; /* already full */
goto out; /* concurrent refill caught up */
uint32_t to_fill = SHIM_URANDOM_RING_SIZE - fill;

/* Producer writes from ring[tail & (SIZE-1)] forward, wrapping
Expand All @@ -270,9 +335,9 @@ void shim_globals_refill_urandom_ring(guest_t *g)
uint32_t first = SHIM_URANDOM_RING_SIZE - pos;
if (first > to_fill)
first = to_fill;
arc4random_buf(ring + pos, first);
memcpy(ring + pos, scratch, first);
if (to_fill > first)
arc4random_buf(ring, to_fill - first);
memcpy(ring, scratch + first, to_fill - first);

/* Release-store the new tail so any fast-path consumer that loads
* tail with an acquiring read sees the bytes already in the ring.
Expand Down Expand Up @@ -359,3 +424,61 @@ void shim_globals_set_trace_enabled(guest_t *g, bool enabled)
else
shim_globals_attn_and(g, ~ATTN_BIT_TRACE);
}

/* Display names for the shim counter slots, indexed by SHIM_COUNTER_* slot
* number. Slots without an initializer are NULL; shim_globals_counters_dump
* labels those "(reserved)" and lists them only when their value is
* non-zero.
*/
static const char *const counter_names[SHIM_COUNTERS_N] = {
[SHIM_COUNTER_ATTN_BAIL] = "ATTN_BAIL",
[SHIM_COUNTER_URANDOM_FD_OOR] = "URANDOM_FD_OOR",
[SHIM_COUNTER_URANDOM_FD_BMISS] = "URANDOM_FD_BMISS",
[SHIM_COUNTER_URANDOM_LEN_ZERO] = "URANDOM_LEN_ZERO",
[SHIM_COUNTER_URANDOM_LEN_OVER] = "URANDOM_LEN_OVER",
[SHIM_COUNTER_URANDOM_RING_LOW] = "URANDOM_RING_LOW",
[SHIM_COUNTER_URANDOM_RING_WRAP] = "URANDOM_RING_WRAP",
[SHIM_COUNTER_URANDOM_PROBE_FAIL] = "URANDOM_PROBE_FAIL",
[SHIM_COUNTER_IDENTITY_HIT] = "IDENTITY_HIT",
[SHIM_COUNTER_URANDOM_HIT] = "URANDOM_HIT",
[SHIM_COUNTER_GETRANDOM_HIT] = "GETRANDOM_HIT",
[SHIM_COUNTER_PGSID_HIT] = "PGSID_HIT",
/* Slots 12..15 (SHIM_COUNTERS_N == 16) are intentionally unnamed;
* the dump prints "(reserved)" so they appear in the output when
* non-zero, which would flag an out-of-band increment. Bind a name
* here when a future EL1 service claims one of these slots.
*/
};

/* Read one shim counter slot.
 *
 * g:    guest whose host_base + shim_data_base locates the globals page.
 * slot: counter index; values at or above SHIM_COUNTERS_N yield 0.
 *
 * Returns the slot's current 64-bit value, loaded with a relaxed atomic
 * read from the counter array at SHIM_COUNTERS_OFF.
 */
uint64_t shim_globals_counter_get(const guest_t *g, unsigned slot)
{
    if (slot < SHIM_COUNTERS_N) {
        const uint8_t *globals =
            (const uint8_t *) g->host_base + g->shim_data_base;
        const uint64_t *counters =
            (const uint64_t *) (globals + SHIM_COUNTERS_OFF);

        return __atomic_load_n(&counters[slot], __ATOMIC_RELAXED);
    }
    return 0;
}

/* Print the shim counter table to stderr.
 *
 * Emits a header line carrying proc_get_pid(), then one line per counter
 * slot. A slot with no entry in counter_names is printed as "(reserved)"
 * and only when its value is non-zero; named slots are always printed.
 */
void shim_globals_counters_dump(const guest_t *g)
{
    fprintf(stderr, "shim-stats (pid=%lld)\n", (long long) proc_get_pid());

    for (unsigned slot = 0; slot < SHIM_COUNTERS_N; slot++) {
        const char *label = counter_names[slot];
        uint64_t count = shim_globals_counter_get(g, slot);

        /* Unnamed slots stay quiet until something increments them. */
        if (label || count != 0) {
            if (!label)
                label = "(reserved)";
            fprintf(stderr, " %-20s %llu\n", label,
                    (unsigned long long) count);
        }
    }
}

/* One-shot guard and cached result for the ELFUSE_SHIM_STATS lookup. */
static pthread_once_t stats_once = PTHREAD_ONCE_INIT;
static bool stats_enabled_cache;

/* Read ELFUSE_SHIM_STATS and record whether stats are enabled: the
 * variable must be set, non-empty, and not exactly the string "0".
 */
static void stats_resolve(void)
{
    const char *env = getenv("ELFUSE_SHIM_STATS");
    bool on = false;

    if (env != NULL && env[0] != '\0')
        on = strcmp(env, "0") != 0;
    stats_enabled_cache = on;
}

/* Return the cached ELFUSE_SHIM_STATS decision, resolving it through
 * pthread_once on the first call so the environment is consulted once.
 */
bool shim_globals_stats_enabled(void)
{
    pthread_once(&stats_once, stats_resolve);
    return stats_enabled_cache;
}
90 changes: 89 additions & 1 deletion src/core/shim-globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,63 @@
#define SHIM_URANDOM_RING_SIZE 4096
#define SHIM_URANDOM_OFF_RING_LOCK 0x10C0

#define SHIM_GLOBALS_SIZE 0x10C4
/* Upper bound on the per-call byte count served by the shim's
* urandom/getrandom fast paths. The probe coverage assumes the buffer
* spans at most two host pages so a first+last byte AT probe suffices;
* 256 fits comfortably within both 4 KiB and 16 KiB page sizes. The
* shim itself hardcodes the literal; a static_assert in shim-globals.c
* pins the C macro to the assembly. Ring wraps are handled inline by
* splitting the byte copy at the 4 KiB boundary, so this cap is bounded
* only by probe coverage and per-call ring-fill cost (256 keeps the
* 4 KiB ring serviceable for 16 sequential reads before host refill).
*/
#define SHIM_URANDOM_INLINE_LIMIT 256

/* Fast-path hit / miss counters.
*
* 16 uint64 slots placed after the urandom ring lock. The shim's
* identity_class_fast and urandom_read_fast paths bump the relevant
* slot on every entry and at every bail point so the host can attribute
* fast-path activity instead of guessing. Counters are non-atomic plain
* load-add-store -- under multi-vCPU concurrent bails a small fraction
* of increments race and are lost, which is acceptable for diagnostic
* ratios. Slots 0..7 cover the eight bail reasons the shim distinguishes
* (sticky attention, fd out of range, fd not in urandom bitmap, len zero,
* len over inline cap, ring fill below request, ring wrap, EL0 buffer
* probe failure). Slots 8..11 record fast-path hits so bail rates can be
* computed against a hit denominator. Slots 12..15 are reserved.
*
* The shim hardcodes the byte offset of each slot; the static_asserts
* in shim-globals.c keep the C-side macros and the assembly in sync.
*/
#define SHIM_COUNTERS_OFF 0x10C8
#define SHIM_COUNTERS_N 16

#define SHIM_COUNTER_ATTN_BAIL 0
#define SHIM_COUNTER_URANDOM_FD_OOR 1
#define SHIM_COUNTER_URANDOM_FD_BMISS 2
#define SHIM_COUNTER_URANDOM_LEN_ZERO 3
#define SHIM_COUNTER_URANDOM_LEN_OVER 4
#define SHIM_COUNTER_URANDOM_RING_LOW 5
#define SHIM_COUNTER_URANDOM_RING_WRAP 6
#define SHIM_COUNTER_URANDOM_PROBE_FAIL 7
#define SHIM_COUNTER_IDENTITY_HIT 8
#define SHIM_COUNTER_URANDOM_HIT 9
#define SHIM_COUNTER_GETRANDOM_HIT 10
#define SHIM_COUNTER_PGSID_HIT 11

/* Extended identity slots: pgid and sid.
*
* getpgid(0) and getsid(0) are pure cache reads when the argument is
* zero; the shim serves them out of these slots whenever X0 == 0 and
* the syscall number matches. The host re-publishes after setpgid /
* setsid / exec / fork so the slots match guest_pgid / guest_sid in
* proc-identity.c.
*/
#define SHIM_IDENTITY_OFF_PGID 0x1148
#define SHIM_IDENTITY_OFF_SID 0x1150

#define SHIM_GLOBALS_SIZE 0x1158

/* Initialize the cache region to all-zero. Called once per process at
* the same time the shim_data block is set up (initial bootstrap and
Expand Down Expand Up @@ -158,6 +214,21 @@ void shim_globals_publish_creds(guest_t *g,
uint32_t gid,
uint32_t egid);

/* Publish pgid + sid so the shim's getpgid(0) / getsid(0) inline service
* sees the current session/process-group state. Call from process init,
* fork-child receive, exec, setsid, and setpgid. Slot writes are
* independent 64-bit atomic release stores.
*
* No attention bit guards this publish: setpgid / setsid are infrequent
* and the model accepts a brief window in which a concurrent
* getpgid(0) / getsid(0) on a sibling vCPU observes the pre-publish
* value (consistent with Linux's lockless session lookups). Session
* mutators and cache-initialization callers publish through proc-identity
* while holding session_lock, so successful setpgid / setsid calls cannot
* overwrite the cache out of order.
*/
void shim_globals_publish_pgsid(guest_t *g, int64_t pgid, int64_t sid);

/* GVA of the cache base. Equal to g->shim_data_base. Exposed so the
* TPIDR_EL1 setup site and tests can reference one source of truth.
*/
Expand Down Expand Up @@ -306,3 +377,20 @@ void shim_globals_rebuild_urandom_bitmap(void);
* forced through the host SVC.
*/
void shim_globals_refill_urandom_ring(guest_t *g);

/* Counter access for diagnostics. shim_globals_counter_get returns the
* cumulative slot value (lossy under multi-vCPU bail contention; see the
* comment block on SHIM_COUNTERS_OFF). slot should be in
* [0, SHIM_COUNTERS_N); an out-of-range slot reads as 0.
* shim_globals_counters_dump writes a header line followed by one line
* per slot to stderr with the SHIM_COUNTER_* names and current values
* (unnamed slots are listed only when non-zero); intended for use at
* process exit when ELFUSE_SHIM_STATS is set.
*/
uint64_t shim_globals_counter_get(const guest_t *g, unsigned slot);
void shim_globals_counters_dump(const guest_t *g);

/* ELFUSE_SHIM_STATS env-var gate (idempotent / cached). When enabled the
* exit path dumps the counter table to stderr so a single bench run
* attributes every fast-path bail without rebuilds. Mirrors the
* ELFUSE_STARTUP_TRACE pattern in core/startup-trace.h.
*/
bool shim_globals_stats_enabled(void);
Loading
Loading